diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml index 730fd08..ac346f4 100644 --- a/.github/workflows/tox-pytest.yml +++ b/.github/workflows/tox-pytest.yml @@ -11,7 +11,7 @@ jobs: id-token: write strategy: matrix: - python-version: ["3.10", "3.11"] + python-version: ["3.12"] fail-fast: false defaults: run: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2aaf16a..7516290 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,6 @@ repos: rev: 24.10.0 hooks: - id: black - language_version: python3.11 - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 diff --git a/environment.yml b/environment.yml index 4a985d5..89eadc8 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: dependencies: # Packages required for setting up the environment - pip>=21,<24 - - python>=3.10,<3.12 + - python>=3.10,<3.13  # exclusive bound: "<=3.12" would reject 3.12.1 and later patch releases - setuptools>=66,<69 # Packages specified in setup.py that need or benefit from binary conda packages @@ -19,11 +19,14 @@ dependencies: # Jupyter packages: - jupyterlab>=3.2,<4 - - nbconvert>=6,<7 # Used to clear notebook outputs in pre-commit hooks + - nbconvert>=7 # Used to clear notebook outputs in pre-commit hooks # These are not normal Python packages available on PyPI - nodejs # Useful for Jupyter and prettier pre-commit hook + - dask>=2024 + - gdal + # Use pip to install the package defined by this repo for development: - pip: - --editable ./[dev,docs,tests,types] diff --git a/notebooks/16-kl-splink-ex21-filer-link.ipynb b/notebooks/16-kl-splink-ex21-filer-link.ipynb new file mode 100644 index 0000000..efef952 --- /dev/null +++ b/notebooks/16-kl-splink-ex21-filer-link.ipynb @@ -0,0 +1,5678 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c535d97b-5dfa-4298-87f5-55c56c4c82ed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": 
"code", + "execution_count": 260, + "id": "e1222c94-36cd-4bae-95fb-089e5411e490", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[autoreload of mozilla_sec_eia.models.sec10k.utils.cloud failed: Traceback (most recent call last):\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 274, in check\n", + " superreload(m, reload, self.old_objects, self.shell)\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 500, in superreload\n", + " update_generic(old_obj, new_obj)\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 397, in update_generic\n", + " update(a, b)\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 335, in update_class\n", + " if (old_obj == new_obj) is True:\n", + " ^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 834, in __eq__\n", + " return dict(self.items()) == dict(other.items())\n", + " ^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 893, in __iter__\n", + " for key in self._mapping:\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 46, in __iter__\n", + " return self._get_built().__iter__()\n", + " ^^^^^^^^^^^^^^^^^\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 57, in _get_built\n", + " raise PydanticUserError(self._error_message, code=self._code)\n", + "pydantic.errors.PydanticUserError: Pydantic models should inherit from 
BaseModel, BaseModel cannot be instantiated directly\n", + "\n", + "For further information visit https://errors.pydantic.dev/2.9/u/base-model-instantiated\n", + "]\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from upath import UPath\n", + "\n", + "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n", + "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df, add_sec_company_id_to_subsidiaries" + ] + }, + { + "cell_type": "markdown", + "id": "16cd6122-4cb9-42aa-8be1-84c997a34e96", + "metadata": {}, + "source": [ + "# Read in Inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# for now try just training on 2023\n", + "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n", + " ]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n", + "raw_sec_df.columns.name = None" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8e7a642d-7718-4101-b851-f1f4ee07180e", + 
"metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_ex21_df = pd.DataFrame()\n", + "for file in ex21_path.iterdir():\n", + " if file.name.split(\".\")[-1] == \"parquet\":\n", + " report_year = file.name[:4]\n", + " # for now just train with 2023\n", + " if report_year != \"2023\":\n", + " continue\n", + " year_quarter_df = pd.read_parquet(ex21_path / file.name)\n", + " year_quarter_df.loc[:, \"report_year\"] = report_year\n", + " year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n", + " raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])" + ] + }, + { + "cell_type": "markdown", + "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39706c77-90db-4f49-8011-47a9777a88b6", + "metadata": {}, + "outputs": [], + "source": [ + "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:233: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " )\n" + ] + } + ], + "source": [ + "ex21_df = prepare_ex21_df(raw_ex21_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "34a86ec8-5b6c-4147-8f94-021fa271174c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_df.record_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "505b0c45-1748-4517-8cac-d2acf2fa9037", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_df.record_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11caf325-8530-430d-a3d2-a54043447021", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# sec_df has filename as unique ID\n", + "sec_df.filename.is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236", + "metadata": {}, + "source": [ + "Note: not removing paragraph layout docs, but maybe should" + ] + }, + { + "cell_type": "markdown", + "id": "6de284e1-2b76-418d-ac5e-9a84bd275c51", + "metadata": {}, + "source": [ + "# Try to just match on cleaned name and location" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "id": "2c9a384d-a9e1-4e4a-829f-e92f1a007c90", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "id": "4bab406d-b1e0-495b-beee-90ae6b0c036b", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = sec_match_df.merge(ex21_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))" + ] + }, + { + 
"cell_type": "code", + "execution_count": 185, + "id": "b8732fda-9f0a-412c-b7ba-8f307ee7b213", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 florida\n", + "1 delaware\n", + "2 missouri\n", + "3 delaware\n", + "4 NaN\n", + " ... \n", + "515 delaware\n", + "516 delaware\n", + "517 delaware\n", + "518 delaware\n", + "519 delaware\n", + "Name: loc_of_incorporation_sec, Length: 520, dtype: object" + ] + }, + "execution_count": 185, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df[\"loc_of_incorporation_sec\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "3427d77c-3c3f-4a05-99db-7f96d3f0f193", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n", + "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n", + "merged_df[\"loc_overlap\"] = merged_df.apply(\n", + " lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n", + ")\n", + "\n", + "# Select the row with the highest word overlap for each CIK and company name\n", + "closest_match = merged_df.loc[merged_df.groupby([\"central_index_key\", \"company_name\"])['loc_overlap'].idxmax()].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "id": "92cc6570-f34c-4782-9bbf-0cdeaf2ce044", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 480\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this should be 0\n", + "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "d0c650d0-303d-43a4-9ae3-35c4fb6d481b", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "central_index_key\n", + "False 480\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# it's okay if there's duplication here, but not ideal\n", + "# multiple subsidiaries can point to the same CIK\n", + "closest_match.central_index_key.duplicated().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "id": "2b3a2c1f-7df4-4515-8727-a339303ebd4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_id_secfilenamephone_numbercentral_index_keycitycompany_name_raw_secdate_of_name_changefilm_numberfiscal_year_endform_typeformer_conformed_nameirs_numberorganization_namesec_actsec_file_numberstandard_industrial_classificationstatestate_of_incorporationstreet_addressstreet_address_2zip_codereport_datereport_year_secloc_of_incorporation_seccompany_namecompany_name_no_legal_seccompany_name_mphone_secrecord_id_ex21idcompany_name_raw_ex21loc_of_incorporation_ex21own_perreport_year_ex21company_name_no_legal_ex21company_name_mphone_ex21loc_tokens_secloc_tokens_ex21loc_overlap
07990edgar/data/910638/0000910638-23-000009.txt80332639000000910638rock hill3d systems corp1993081623738595123110-k3 d systems corp954431352NaN1934 act001-34220services-prepackaged software [7372]scde333 three d systems circleNaN297302023-03-162023delaware3d systems corporation3d systemsT SSTMS150739910638-0000910638-23-0000093d systems corporationdelawareNaN20233d systemsT SSTMS[delaware][delaware]1
17526edgar/data/824142/0000824142-23-000019.txt91858322660000824142tulsaaaon, inc.1992070323675207123110-kaaon inc870448736NaN1934 act000-18953air cond & warm air heating equip & comm & ind...oknv2425 south yukon ave.NaN741072023-02-272023nevadaaaon incorporatedaaonN142821824142-0000824142-23-000019aaon, incoklahomaNaN2023aaonN[nevada][oklahoma]0
\n", + "
" + ], + "text/plain": [ + " record_id_sec filename phone_number central_index_key city company_name_raw_sec date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number standard_industrial_classification state state_of_incorporation street_address street_address_2 zip_code report_date report_year_sec loc_of_incorporation_sec company_name company_name_no_legal_sec company_name_mphone_sec record_id_ex21 id company_name_raw_ex21 loc_of_incorporation_ex21 own_per report_year_ex21 company_name_no_legal_ex21 company_name_mphone_ex21 loc_tokens_sec loc_tokens_ex21 loc_overlap\n", + "0 7990 edgar/data/910638/0000910638-23-000009.txt 8033263900 0000910638 rock hill 3d systems corp 19930816 23738595 1231 10-k 3 d systems corp 954431352 NaN 1934 act 001-34220 services-prepackaged software [7372] sc de 333 three d systems circle NaN 29730 2023-03-16 2023 delaware 3d systems corporation 3d systems T SSTMS 150739 910638-0000910638-23-000009 3d systems corporation delaware NaN 2023 3d systems T SSTMS [delaware] [delaware] 1\n", + "1 7526 edgar/data/824142/0000824142-23-000019.txt 9185832266 0000824142 tulsa aaon, inc. 19920703 23675207 1231 10-k aaon inc 870448736 NaN 1934 act 000-18953 air cond & warm air heating equip & comm & ind... ok nv 2425 south yukon ave. 
NaN 74107 2023-02-27 2023 nevada aaon incorporated aaon N 142821 824142-0000824142-23-000019 aaon, inc oklahoma NaN 2023 aaon N [nevada] [oklahoma] 0" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "closest_match.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 241, + "id": "78dfc42c-3921-444e-8342-d34fc2fd1a7a", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = ex21_df.merge(\n", + " closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n", + " how=\"left\",\n", + " on=[\"company_name\", \"loc_of_incorporation\"],\n", + ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 242, + "id": "1f4bca08-3a65-484d-ac6b-cb7d4584b4e7", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n", + " how=\"left\",\n", + " on=\"company_name\"\n", + " ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 243, + "id": "5462d9bb-23dd-45fb-b5bf-35396caba399", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "subsidiary_cik\n", + "True 191387\n", + "False 480\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 243, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_with_cik.subsidiary_cik.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 244, + "id": "a38c45ad-56f3-49ad-bd62-fb91c4d89940", + "metadata": {}, + "outputs": [], + "source": [ + "# if a subsidiary doesn't have a CIK and has a null location\n", + "# but its name was assigned a CIK (with a different location)\n", + "# then assign that CIK to the subsidiary\n", + "ex21_with_cik[\"subsidiary_cik\"] = 
ex21_with_cik[\"subsidiary_cik\"].where(\n", + " ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n", + " ex21_with_cik[\"company_name_merge_cik\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 245, + "id": "4cca9da1-8371-4b45-b88d-8c2911209707", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "subsidiary_cik\n", + "True 191386\n", + "False 481\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 245, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_with_cik.subsidiary_cik.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "id": "e5b57a88-ffaa-4834-bea4-c5b4779bd551", + "metadata": {}, + "outputs": [], + "source": [ + "archive = GCSArchive()\n", + "md = archive.get_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": 261, + "id": "a33be6e3-056f-4e4a-acd4-9a6dc6f98c90", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)" + ] + }, + { + "cell_type": "code", + "execution_count": 263, + "id": "d0dec8af-d730-4a06-af5e-f390fa228ac8", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"], how=\"left\", left_on=\"filename\", right_index=True).rename(columns={\"cik\": \"parent_cik\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 264, + "id": "228a1d4b-bc19-49eb-b557-4f26d1febbd9", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)" + ] + }, + { + "cell_type": "code", + "execution_count": 265, + "id": "c1b88c44-81d7-4d9d-a2a3-be1b030348bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_ididcompany_name_rawloc_of_incorporationown_perreport_yearcompany_namecompany_name_no_legalcompany_name_mphonesubsidiary_cikcompany_name_merge_cikfilenameparent_ciksec_company_id
1644821644821000045-0000950170-23-030037nicholas data services, incflorida100.02023nicholas data services incorporatednicholas data servicesNXLS TT SRFSSNaNNaNedgar/data/1000045/0000950170-23-030037.txt10000451000045_1
1644811644811000045-0000950170-23-030037nicholas financial, incflorida100.02023nicholas financial incorporatednicholas financialNXLS FNNXL00010000450001000045edgar/data/1000045/0000950170-23-030037.txt10000450001000045
89891000209-0000950170-23-007273medallion bankutahNaN2023medallion bankmedallion bankMTLN BNKNaNNaNedgar/data/1000209/0000950170-23-007273.txt10002091000209_1
88881000209-0000950170-23-007273freshstart venture capital corpnew yorkNaN2023freshstart venture capital corporationfreshstart venture capitalFRXSTRT FNTR KPTLNaNNaNedgar/data/1000209/0000950170-23-007273.txt10002091000209_2
87871000209-0000950170-23-007273medallion capital, incminnesotaNaN2023medallion capital incorporatedmedallion capitalMTLN KPTLNaNNaNedgar/data/1000209/0000950170-23-007273.txt10002091000209_3
.............................................
1619571619579984-0000009984-23-000060barnes molding solutions korea limitedkoreaNaN2023barnes molding solutions korea limitedbarnes molding solutions koreaBRNS MLTNK SLXNS KRNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_99
1619561619569984-0000009984-23-000060barnes molding solutions (jiangsu) co., ltdchinaNaN2023barnes molding solutions company limitedbarnes molding solutionsBRNS MLTNK SLXNSNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_100
1619551619559984-0000009984-23-000060barnes korea ltdkoreaNaN2023barnes korea limitedbarnes koreaBRNS KRNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_101
1619651619659984-0000009984-23-000060gimatic automation india pvt ltdindiaNaN2023gimatic automation india pvt limitedgimatic automation india pvtJMTK ATMXN INT PFTNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_102
1620181620189984-0000009984-23-000060synventive molding solutions ltdabrazilNaN2023synventive molding solutions ltdasynventive molding solutions ltdaSNFNTF MLTNK SLXNS LTTNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_103
\n", + "

191867 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " record_id id company_name_raw loc_of_incorporation own_per report_year company_name company_name_no_legal company_name_mphone subsidiary_cik company_name_merge_cik filename parent_cik sec_company_id\n", + "164482 164482 1000045-0000950170-23-030037 nicholas data services, inc florida 100.0 2023 nicholas data services incorporated nicholas data services NXLS TT SRFSS NaN NaN edgar/data/1000045/0000950170-23-030037.txt 1000045 1000045_1\n", + "164481 164481 1000045-0000950170-23-030037 nicholas financial, inc florida 100.0 2023 nicholas financial incorporated nicholas financial NXLS FNNXL 0001000045 0001000045 edgar/data/1000045/0000950170-23-030037.txt 1000045 0001000045\n", + "89 89 1000209-0000950170-23-007273 medallion bank utah NaN 2023 medallion bank medallion bank MTLN BNK NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_1\n", + "88 88 1000209-0000950170-23-007273 freshstart venture capital corp new york NaN 2023 freshstart venture capital corporation freshstart venture capital FRXSTRT FNTR KPTL NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_2\n", + "87 87 1000209-0000950170-23-007273 medallion capital, inc minnesota NaN 2023 medallion capital incorporated medallion capital MTLN KPTL NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_3\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "161957 161957 9984-0000009984-23-000060 barnes molding solutions korea limited korea NaN 2023 barnes molding solutions korea limited barnes molding solutions korea BRNS MLTNK SLXNS KR NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_99\n", + "161956 161956 9984-0000009984-23-000060 barnes molding solutions (jiangsu) co., ltd china NaN 2023 barnes molding solutions company limited barnes molding solutions BRNS MLTNK SLXNS NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_100\n", + "161955 161955 9984-0000009984-23-000060 barnes korea ltd korea NaN 2023 barnes korea limited barnes korea BRNS KR NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_101\n", + "161965 161965 9984-0000009984-23-000060 gimatic automation india pvt ltd india NaN 2023 gimatic automation india pvt limited gimatic automation india pvt JMTK ATMXN INT PFT NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_102\n", + "162018 162018 9984-0000009984-23-000060 synventive molding solutions ltda brazil NaN 2023 synventive molding solutions ltda synventive molding solutions ltda SNFNTF MLTNK SLXNS LTT NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_103\n", + "\n", + "[191867 rows x 14 columns]" + ] + }, + "execution_count": 265, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_with_cik" + ] + }, + { + "cell_type": "code", + "execution_count": 266, + "id": "192d3cac-b156-4e5c-8148-0cbdc3e8900d", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik.to_parquet(\"ex21_2023.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2", + "metadata": { + "tags": [] + }, + "source": [ + "# Match Ex. 
21 Subsidiaries to a SEC filer" + ] + }, + { + "cell_type": "markdown", + "id": "01d3a5e1-ad17-4266-b2ef-358f246749db", + "metadata": { + "tags": [] + }, + "source": [ + "## Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "4df63893-8a18-4b00-9b16-d036108bd567", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statestate_of_incorporation
1nyde
2nyde
5camd
6gade
7njde
.........
8265nyde
8266txde
8267nyoh
8268txde
8269ctde
\n", + "

5051 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " state state_of_incorporation\n", + "1 ny de\n", + "2 ny de\n", + "5 ca md\n", + "6 ga de\n", + "7 nj de\n", + "... ... ...\n", + "8265 ny de\n", + "8266 tx de\n", + "8267 ny oh\n", + "8268 tx de\n", + "8269 ct de\n", + "\n", + "[5051 rows x 2 columns]" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_df[(sec_df[\"state\"] != sec_df[\"state_of_incorporation\"]) & (~sec_df[\"state_of_incorporation\"].isnull())][[\"state\", \"state_of_incorporation\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "24890018-8efb-445f-ad91-ca316edccbe8", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "83f859df-1764-4e97-addc-0064bdcb31b7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "loc_of_incorporation\n", + "False 6382\n", + "True 749\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_match_df[\"loc_of_incorporation\"].isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "e9d0828f-0ad8-41ea-a449-ddd274a888d0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ex21_match_df = ex21_df.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "ef3f01c7-c21e-4755-ac99-4ea01f359c43", + "metadata": {}, + "source": [ + "Remove clearly \"invalid\" strings and fill nulls" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "4ca07927-185d-4bc6-978a-e8788a8f77b3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "company_name\n", + "rush truck center 120\n", + "encompass health rehabilitation hospital 79\n", + "rush peterbilt truck center 57\n", + "branch 52\n", + "sci funeral services llc iowa limited liability company 33\n", + 
"partnership limited partnership 32\n", + "alderwoods group llc de limited liability company 27\n", + "encompass health rehabilitation hospital of 26\n", + "u haul co of 26\n", + "at and t 25\n", + "corporation 21\n", + "amh portfolio management 20\n", + "rush bus center 20\n", + "limited partnership limited partnership 18\n", + "therapy limited partnership 15\n", + "rush isuzu trucks 15\n", + "colgate palmolive limited 14\n", + "johnson and johnson limited 11\n", + "ecolab limited 11\n", + "rush truck centres 11\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_match_df.company_name.value_counts().head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "8a4839e5-a2e5-4098-826a-4d340cdde638", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ex21_match_df = ex21_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]\n", + "sec_match_df = sec_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "baab7dfc-4efb-4c08-b090-32dd47025e15", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n", + "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:3: SettingWithCopyWarning: 
\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")\n" + ] + } + ], + "source": [ + "# TEMP\n", + "sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n", + "ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "a1a6634e-e554-4a94-8a57-c2755048db22", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df.loc[:, \"loc_list\"] = sec_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")\n", + "ex21_match_df.loc[:, \"loc_list\"] = ex21_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")" + ] + }, + { + "cell_type": "markdown", + "id": "c294372b-159c-4c90-a031-61c34532b965", + "metadata": {}, + "source": [ + "## Exploratory Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "c9dbc620-ed49-4a8e-9d02-6b6f2e0a14cf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from splink.exploratory import completeness_chart, profile_columns\n", + "from splink import DuckDBAPI\n", + "\n", + "db_api = DuckDBAPI()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "422ca098-e4e7-4284-8b04-74e976e36023", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "match_cols = [\"report_year\", \"company_name\", \"loc_of_incorporation\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "232b5718-c1ed-4e63-8384-b4acf33210d3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + 
"\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sometimes this will show up as 100% complete in loc_of_incorporation, not sure why\n", + "completeness_chart([ex21_match_df[match_cols], sec_match_df[match_cols]], db_api=db_api)" + ] + }, + { + "cell_type": "markdown", + "id": "6b6b20bc-cd22-42cc-b24d-8d581a311ca8", + "metadata": {}, + "source": [ + "There is strong skew in the location of incorporation field with around 40-50% of the values being Delaware in both datasets. We therefore want to use `term_frequency_adjustments` in our linkage model." + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "a5c26016-2c59-4335-bd39-8b2e7ea91840", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(ex21_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "2a57f717-140f-434d-8998-983b8bf38ac5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "1f258250-97c1-4f19-b535-cb91ff9e0ea9", + "metadata": { + "tags": [] + }, + "source": [ + "## Blocking\n", + "\n", + "We could perhaps run the subsidiary-to-filer match without blocking, but we probably want a blocking rule.\n", + "\n", + "TODO: Can we block on the nearest 5 report years instead of requiring an exact match on report year?" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "fb6d143b-5201-4b31-849c-97db80781ade", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from splink import block_on\n", + "from splink.blocking_analysis import count_comparisons_from_blocking_rule, n_largest_blocks" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "22766c9f-7371-483f-82b0-015549a84357", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "br = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "60937a9c-dff6-4d68-808f-81b8228fc9f6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'number_of_comparisons_generated_pre_filter_conditions': 531298,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 531298,\n", + " 'filter_conditions_identified': '',\n", + " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", + " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [
+ "# br0 = block_on(\"report_year\", \"report_year\")\n", + "# br1 = \"jaccard(l.company_name, r.company_name) < .1\"\n", + "# br2 = block_on(\"company_name\", \"company_name\")\n", + "\n", + "counts = count_comparisons_from_blocking_rule(\n", + " table_or_tables=[sec_match_df, ex21_match_df],\n", + " blocking_rule=br,\n", + " link_type=\"link_only\",\n", + " unique_id_column_name='record_id',\n", + " db_api=db_api,\n", + ")\n", + "\n", + "counts" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "67717313-2c17-4b6b-b984-8f7bc955c678", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
key_0count_lcount_rblock_count
0AMRK5662535000
1FRST5655531080
2INTR3065919770
\n", + "
" + ], + "text/plain": [ + " key_0 count_l count_r block_count\n", + "0 AMRK 56 625 35000\n", + "1 FRST 56 555 31080\n", + "2 INTR 30 659 19770" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = n_largest_blocks(\n", + " table_or_tables=[sec_match_df, ex21_match_df],\n", + " blocking_rule=br,\n", + " link_type=\"link_only\",\n", + " db_api=db_api,\n", + " n_largest=3\n", + ")\n", + "\n", + "result.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "6fe6fb99-f5fd-4538-a8bc-c9dd41f4ff9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "blocking_rules_for_analysis = [\n", + " br\n", + "]\n", + "\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[sec_match_df, ex21_match_df],\n", + " blocking_rules=blocking_rules_for_analysis,\n", + " db_api=db_api,\n", + " unique_id_column_name='record_id',\n", + " link_type=\"link_only\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b553f3fb-0661-46ab-b43c-f5fcba608a09", + "metadata": {}, + "source": [ + "## Create Model\n", + "\n", + "Should we deduplicate the Ex. 21 data first and then link it to the SEC filers?" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "1f12d114-22fd-4f12-a0be-6a62500e80d5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "from splink import Linker, SettingsCreator" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "e9cf27ac-6f65-4c73-9e11-9445a8977531", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ExactMatch' of \"company_name\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n", + " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "company_name_comparison = cl.ExactMatch(\n", + " \"company_name\",\n", + ")\n", + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", +
"execution_count": 85, + "id": "a0d056b4-b7b5-4f01-ad60-3ffc2bec54eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'LevenshteinAtThresholds' of \"company_name\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n", + " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n", + " - 'Levenshtein distance of company_name <= 1' with SQL rule: levenshtein(\"company_name_l\", \"company_name_r\") <= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "company_name_comparison = cl.LevenshteinAtThresholds(\n", + " \"company_name\",\n", + " distance_threshold_or_thresholds=[1]\n", + ")\n", + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "bf199c98-5239-4a1e-8856-19d74e42b7db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ArrayIntersectAtSizes' of \"company_name_mphone_list\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name_mphone_list is NULL' with SQL rule: \"company_name_mphone_list_l\" IS NULL OR \"company_name_mphone_list_r\" IS NULL\n", + " - 'Array intersection size >= 3' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 3\n", + " - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 2\n", + " - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + 
"company_name_comparison = cl.ArrayIntersectAtSizes(\n", + " \"company_name_mphone_list\",\n", + " size_threshold_or_thresholds=[3,2,1]\n", + ")\n", + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'loc_of_incorporation is NULL' with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n", + " - 'Exact match on loc_of_incorporation' with SQL rule: \"loc_of_incorporation_l\" = \"loc_of_incorporation_r\"\n", + " - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.9\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "# try with Levenshtein too\n", + "location_comparison = cl.JaroWinklerAtThresholds(\n", + " \"loc_of_incorporation\",\n", + " score_threshold_or_thresholds=[0.9]\n", + ")\n", + "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "f3529a5a-7ced-46dd-af22-7bb44ed92aa2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ArrayIntersectAtSizes' of \"loc_list\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'loc_list is NULL' with SQL rule: \"loc_list_l\" IS NULL OR \"loc_list_r\" IS NULL\n", + " - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 2\n", + " - 'Array intersection size >= 1' with SQL rule: 
array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "location_comparison = cl.ArrayIntersectAtSizes(\n", + " \"loc_list\",\n", + " size_threshold_or_thresholds=[2,1]\n", + ")\n", + "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "92c1ad6b-4516-4ab4-90eb-394669c4a02b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", + " unique_id_column_name=\"record_id\",\n", + " comparisons=[\n", + " company_name_comparison,\n", + " location_comparison.configure(term_frequency_adjustments=True)\n", + " ],\n", + " blocking_rules_to_generate_predictions=[\n", + " br\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker([sec_match_df, ex21_match_df], settings, db_api=DuckDBAPI())" + ] + }, + { + "cell_type": "markdown", + "id": "2f293657-b40c-4539-8abd-8524d11c39c0", + "metadata": {}, + "source": [ + "Estimate the probability that two random records match." + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "e9eb59b9-49cc-45b7-8ffa-b8f7e5372608", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d8daffbf12a14f72a247e47fc2fa719a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 8.21e-05.\n", + "This means that amongst all possible pairwise record comparisons, one in 12,184.39 are expected to match.
With 1,368,717,009 total possible comparisons, we expect a total of around 112,333.68 matching pairs\n" + ] + } + ], + "source": [ + "deterministic_rules = [\n", + " block_on(\"company_name_mphone\", \"company_name_mphone\"),\n", + " \"jaccard(r.company_name, l.company_name) >= .95 and l.loc_of_incorporation = r.loc_of_incorporation\",\n", + " \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .95\",\n", + " # \"substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation\"\n", + "]\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "5117653e-e72b-4c13-b923-d1228b39d357", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n", + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - company_name (no m values are trained).\n", + " - loc_of_incorporation (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "8b089a0d-4c91-4b4d-9806-ed83c9bd59b9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"company_name_mphone\" = r.\"company_name_mphone\") AND (l.\"company_name_mphone\" = r.\"company_name_mphone\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - company_name\n", + " - loc_of_incorporation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + "\n", + "Iteration 1: Largest change in params was -0.38 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n", + "Iteration 2: Largest change in params was 0.027 in the m_probability of loc_of_incorporation, level `All other comparisons`\n", + "Iteration 3: Largest change in params was -0.000274 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 4: Largest change in params was -0.00056 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 5: Largest change in params was 0.00112 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 6: Largest change in params was 0.00214 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 7: Largest change in params was 0.00387 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 8: Largest change in params was -0.00648 in the m_probability of company_name, level `Exact 
match on company_name`\n", + "Iteration 9: Largest change in params was 0.00989 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 10: Largest change in params was 0.0137 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 11: Largest change in params was 0.0171 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 12: Largest change in params was -0.0197 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 13: Largest change in params was 0.0209 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 14: Largest change in params was -0.0209 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 15: Largest change in params was -0.0201 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 16: Largest change in params was -0.0187 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 17: Largest change in params was -0.017 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 18: Largest change in params was 0.0153 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 19: Largest change in params was -0.0136 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 20: Largest change in params was -0.0121 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 21: Largest change in params was -0.0107 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 22: Largest change in params was -0.0094 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 23: Largest change in params was 0.00828 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 24: Largest change in 
params was -0.00728 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 25: Largest change in params was -0.00641 in the m_probability of company_name, level `Exact match on company_name`\n", + "\n", + "EM converged after 25 iterations\n", + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"company_name_mphone\", \"company_name_mphone\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "88e058bc-800d-4da4-92aa-6ddb7377b4bf", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "673a4776-1de1-46ce-a411-f7fd1668d54f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.m_u_parameters_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "ebf9e326-38f1-4d78-b302-15867cda1009", + "metadata": {}, + "outputs": [], + "source": [ + "settings = linker.misc.save_model_to_json(\n", + " \"../sec_ex21_model_settings/2023_model.json\", overwrite=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a14055d2-6761-4906-8555-35c92553a0e9", + "metadata": {}, + "source": [ + "Log the model in MLflow." + ] + }, + { + "cell_type": "markdown", + "id": "dfe4feca-e694-4ec6-a5b0-11382c740516", + "metadata": {}, + "source": [ + "## Make predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "72ff6575-68e3-4256-8253-85eb2564501f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Blocking time: 0.20 seconds\n", + "Predict time: 0.12 seconds\n" + ] + } + ], + "source": [ + "df_predictions = linker.inference.predict(threshold_match_probability=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "24e14675-11cf-4c46-a592-7733326113d2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "preds_df = df_predictions.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "d50332a5-a8dc-444b-be92-b9d29f73763e", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = preds_df.merge(sec_df[[\"record_id\", \"company_name_raw\"]], how=\"left\", left_on=\"record_id_l\", right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_sec\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "fddbed17-3d71-4c85-95d5-c3d0fd517f9d", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = preds_df.merge(ex21_df[[\"record_id\",
\"company_name_raw\"]], how=\"left\", left_on=\"record_id_r\", right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_ex21\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "3d733c2a-7004-4ce8-8d3f-25ed1e720c36", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_namebf_company_nameloc_of_incorporation_lloc_of_incorporation_rgamma_loc_of_incorporationtf_loc_of_incorporation_ltf_loc_of_incorporation_rbf_loc_of_incorporationbf_tf_adj_loc_of_incorporationcompany_name_mphone_lcompany_name_mphone_rrecord_id_xcompany_name_secrecord_id_ycompany_name_ex21
06.8166910.991207__splink__input_table_0__splink__input_table_18180159390national instruments corporationnational instruments corporation12.492261e+06delawarerepublic of korea00.3728420.0002340.5510651.0NXNL INSTRMNTSNXNL INSTRMNTS8180national instruments corp159390national instruments (korea) corporation
1766.8166910.991207__splink__input_table_0__splink__input_table_16034107265afternext healthtech acquisition corporationafternext healthtech acquisition corporation12.492261e+06e9cayman islands00.0010690.0153870.5510651.0AFTRNKST HL0TX AKKSXNAFTRNKST HL0TX AKKSXN6034afternext healthtech acquisition corp.107265afternext healthtech acquisition corp
1786.8166910.991207__splink__input_table_0__splink__input_table_16799117610gap incorporatedgap incorporated12.492261e+06delawarepuerto rico00.3728420.0015480.5510651.0KPKP6799gap inc117610gap (puerto rico), inc
1836.8166910.991207__splink__input_table_0__splink__input_table_15811170135rockley photonics holdings limitedrockley photonics holdings limited12.492261e+06e9cayman islands00.0010690.0153870.5510651.0RKL FTNKS HLTNKSRKL FTNKS HLTNKS5811rockley photonics holdings ltd170135rockley photonics holdings limited
1846.8166910.991207__splink__input_table_0__splink__input_table_16799117608gap incorporatedgap incorporated12.492261e+06delawarecalifornia00.3728420.0159780.5510651.0KPKP6799gap inc117608gap (itm) inc
1866.8166910.991207__splink__input_table_0__splink__input_table_16799117605gap incorporatedgap incorporated12.492261e+06delawarecanada00.3728420.0121910.5510651.0KPKP6799gap inc117605gap (canada) inc
4126.8166910.991207__splink__input_table_0__splink__input_table_11524165843aircastle limitedaircastle limited12.492261e+06d0ireland00.0001500.0083150.5510651.0ARKSTLARKSTL1524aircastle ltd165843aircastle (ireland) limited
1896.8166910.991207__splink__input_table_0__splink__input_table_16753115383arthur j gallagher and companyarthur j gallagher and company12.492261e+06illinoisdelaware00.0061150.3728420.5510651.0AR0R J KLKHR ANTAR0R J KLKHR ANT6753arthur j. gallagher & co.115383arthur j. gallagher & co
1936.8166910.991207__splink__input_table_0__splink__input_table_16651110797flowserve corporationflowserve corporation12.492261e+06new yorkmauritius00.0099130.0010750.5510651.0FLSRFFLSRF6651flowserve corp110797flowserve (mauritius) corporation
4066.8166910.991207__splink__input_table_0__splink__input_table_157824844united parcel service incorporatedunited parcel service incorporated12.492261e+06delawareohio00.3728420.0081360.5510651.0UNTT PRSL SRFSUNTT PRSL SRFS578united parcel service inc24844united parcel service, inc
1986.8166910.991207__splink__input_table_0__splink__input_table_15812171905nextracker incorporatednextracker incorporated12.492261e+06delawareunited states delaware00.3728420.0022780.5510651.0NKSTRKRNKSTRKR5812nextracker inc.171905nextracker inc
1996.8166910.991207__splink__input_table_0__splink__input_table_1584351850sculptor acquisition corp isculptor acquisition corp i12.492261e+06e9cayman islands00.0010690.0153870.5510651.0SKLPTR AKKSXN ISKLPTR AKKSXN I5843sculptor acquisition corp i51850sculptor acquisition corp i
1746.8166910.991207__splink__input_table_0__splink__input_table_17095179994cintas corporationcintas corporation12.492261e+06washingtonnevada00.0029960.0146520.5510651.0SNTSSNTS7095cintas corp179994cintas corporation
4056.8166910.991207__splink__input_table_0__splink__input_table_128512641onespan incorporatedonespan incorporated12.492261e+06delawareusa, state of delaware00.3728420.0000110.5510651.0ONSPNONSPN285onespan inc.12641onespan inc
2076.8166910.991207__splink__input_table_0__splink__input_table_1628297173mars acquisition corporationmars acquisition corporation12.492261e+06e9delaware00.0010690.3728420.5510651.0MRS AKKSXNMRS AKKSXN6282mars acquisition corp.97173mars acquisition corp
2126.8166910.991207__splink__input_table_0__splink__input_table_1483497747viatris incorporatedviatris incorporated12.492261e+06delawarephilippines00.3728420.0019270.5510651.0FTRSFTRS4834viatris inc97747viatris, inc
3976.8166910.991207__splink__input_table_0__splink__input_table_1120535911turning point brands incorporatedturning point brands incorporated12.492261e+06delawareontario, canada00.3728420.0008520.5510651.0TRNNK PNT BRNTSTRNNK PNT BRNTS1205turning point brands, inc.35911turning point brands (canada) inc
3966.8166910.991207__splink__input_table_0__splink__input_table_1117135941clearpoint neuro incorporatedclearpoint neuro incorporated12.492261e+06delawarecanada new brunswick00.3728420.0000060.5510651.0KLRPNT NRKLRPNT NR1171clearpoint neuro, inc.35941clearpoint neuro (canada) inc
3936.8166910.991207__splink__input_table_0__splink__input_table_1176551537genpact limitedgenpact limited12.492261e+06d0united kingdom00.0001500.0315210.5510651.0JNPKTJNPKT1765genpact ltd51537genpact (uk) ltd
2236.8166910.991207__splink__input_table_0__splink__input_table_16181106386perimeter solutions saperimeter solutions sa12.492261e+06n4grand of luxembourg00.0000170.0000110.5510651.0PRMTR SLXNS SPRMTR SLXNS S6181perimeter solutions, sa106386perimeter solutions sa
3906.8166910.991207__splink__input_table_0__splink__input_table_194934324ceva incorporatedceva incorporated12.492261e+06delawarecayman islands00.3728420.0153870.5510651.0SFSF949ceva inc34324ceva inc
2266.8166910.991207__splink__input_table_0__splink__input_table_16825123476harte hanks incorporatedharte hanks incorporated12.492261e+06delawareohio00.3728420.0081360.5510651.0HRT HNKSHRT HNKS6825harte hanks inc123476harte hanks, inc
2286.8166910.991207__splink__input_table_0__splink__input_table_12346600jones lang lasalle incorporatedjones lang lasalle incorporated12.492261e+06marylandpuerto rico00.0077860.0015480.5510651.0JNS LNK LSLJNS LNK LSL234jones lang lasalle inc6600jones lang lasalle (puerto rico), inc
2296.8166910.991207__splink__input_table_0__splink__input_table_12346596jones lang lasalle incorporatedjones lang lasalle incorporated12.492261e+06marylandphilippines00.0077860.0019270.5510651.0JNS LNK LSLJNS LNK LSL234jones lang lasalle inc6596jones lang lasalle (philippines), inc
2316.8166910.991207__splink__input_table_0__splink__input_table_1209754939optimizerx corporationoptimizerx corporation12.492261e+06nevadamichigan00.0146520.0071510.5510651.0OPTMSRKSOPTMSRKS2097optimizerx corp54939optimizerx corporation
2016.8166910.991207__splink__input_table_0__splink__input_table_16176166072phoenix motor incorporatedphoenix motor incorporated12.492261e+06delawareus00.3728420.0009080.5510651.0FNKS MTRFNKS MTR6176phoenix motor inc.166072phoenix motor inc
2326.8166910.991207__splink__input_table_0__splink__input_table_1211757288transocean limitedtransocean limited12.492261e+06v8switzerland00.0000330.0064210.5510651.0TRNSSNTRNSSN2117transocean ltd.57288transocean ltd
4216.8166910.991207__splink__input_table_0__splink__input_table_1134840725lazard group limited liability companylazard group limited liability company12.492261e+06delawareus00.3728420.0009080.5510651.0LSRT KRPLSRT KRP1348lazard group llc40725lazard group llc
1696.8166910.991207__splink__input_table_0__splink__input_table_16922189462analog devices incorporatedanalog devices incorporated12.492261e+06massachusettsunited states00.0044660.0121460.5510651.0ANLK TFSSANLK TFSS6922analog devices inc189462analog devices, inc
1156.8166910.991207__splink__input_table_0__splink__input_table_12485167379ameriguard security services incorporatedameriguard security services incorporated12.492261e+06nevadacalifornia00.0146520.0159780.5510651.0AMRKRT SKRT SRFSSAMRKRT SKRT SRFSS2485ameriguard security services, inc.167379ameriguard security services, inc
1166.8166910.991207__splink__input_table_0__splink__input_table_12486167379ameriguard security services incorporatedameriguard security services incorporated12.492261e+06nevadacalifornia00.0146520.0159780.5510651.0AMRKRT SKRT SRFSSAMRKRT SKRT SRFSS2486ameriguard security services, inc.167379ameriguard security services, inc
1206.8166910.991207__splink__input_table_0__splink__input_table_1468395837advantage solutions incorporatedadvantage solutions incorporated12.492261e+06delawarecanada00.3728420.0121910.5510651.0ATFNTJ SLXNSATFNTJ SLXNS4683advantage solutions inc.95837advantage solutions inc
4456.8166910.991207__splink__input_table_0__splink__input_table_1926165871commvault systems incorporatedcommvault systems incorporated12.492261e+06delawareontario, canada00.3728420.0008520.5510651.0KMFLT SSTMSKMFLT SSTMS926commvault systems inc165871commvault systems (canada) inc
1246.8166910.991207__splink__input_table_0__splink__input_table_1414890738firstsun capital bancorpfirstsun capital bancorp12.492261e+06delawarenew mexico00.3728420.0006520.5510651.0FRSTSN KPTL BNKRPFRSTSN KPTL BNKRP4148firstsun capital bancorp90738firstsun capital bancorp
1266.8166910.991207__splink__input_table_0__splink__input_table_1554426048taboola com limitedtaboola com limited12.492261e+06l3israel00.0000610.0030570.5510651.0TBL KMTBL KM5544taboola.com ltd.26048taboola.com ltd
4436.8166910.991207__splink__input_table_0__splink__input_table_1296henry schein incorporatedhenry schein incorporated12.492261e+06delawarepennsylvania00.3728420.0079190.5510651.0HNR SXNHNR SXN2henry schein inc96henry schein (lancaster, pa) inc
1326.8166910.991207__splink__input_table_0__splink__input_table_16668117995tomi environmental solutions incorporatedtomi environmental solutions incorporated12.492261e+06floridanevada00.0146910.0146520.5510651.0TM ENFRNMNTL SLXNSTM ENFRNMNTL SLXNS6668tomi environmental solutions, inc.117995tomi environmental solutions, inc
1366.8166910.991207__splink__input_table_0__splink__input_table_16148107455esab corporationesab corporation12.492261e+06delawareunited states00.3728420.0121460.5510651.0ESBESB6148esab corp107455esab corporation
1376.8166910.991207__splink__input_table_0__splink__input_table_16958104521apache corporationapache corporation12.492261e+06delawarenew jersey00.3728420.0061430.5510651.0APXAPX6958apache corp104521apache corporation
1386.8166910.991207__splink__input_table_0__splink__input_table_17011121758ncr corporationncr corporation12.492261e+06marylandnew zealand00.0077860.0025900.5510651.0NKRNKR7011ncr corp121758ncr (nz) corporation
4236.8166910.991207__splink__input_table_0__splink__input_table_177165059jakks pacific incorporatedjakks pacific incorporated12.492261e+06delawarecanada00.3728420.0121910.5510651.0JKS PSFKJKS PSFK77jakks pacific inc165059jakks pacific (canada), inc
1396.8166910.991207__splink__input_table_0__splink__input_table_14902170051gan limitedgan limited12.492261e+06d0england and wales00.0001500.0035360.5510651.0KNKN4902gan ltd170051gan (uk) limited
1416.8166910.991207__splink__input_table_0__splink__input_table_16613108716cts corporationcts corporation12.492261e+06indianadelaware00.0040600.3728420.5510651.0KTSKTS6613cts corp108716cts corporation
4376.8166910.991207__splink__input_table_0__splink__input_table_173829776garmin limitedgarmin limited12.492261e+06v8thailand00.0000330.0023780.5510651.0KRMNKRMN738garmin ltd29776garmin (thailand) ltd
4356.8166910.991207__splink__input_table_0__splink__input_table_12779849c h robinson worldwide incorporatedc h robinson worldwide incorporated12.492261e+06delawareunited states00.3728420.0121460.5510651.0K H RBNSN WRLTWTK H RBNSN WRLTWT277c. h. robinson worldwide, inc.9849c.h. robinson worldwide, inc
1466.8166910.991207__splink__input_table_0__splink__input_table_16763176423richardson electronics limitedrichardson electronics limited12.492261e+06delawarethailand00.3728420.0023780.5510651.0RXRTSN ELKTRNKSRXRTSN ELKTRNKS6763richardson electronics, ltd.176423richardson electronics (thailand) limited
1496.8166910.991207__splink__input_table_0__splink__input_table_1487598755api group corporationapi group corporation12.492261e+06d8delaware00.0000780.3728420.5510651.0AP KRPAP KRP4875api group corp98755api group corporation
4326.8166910.991207__splink__input_table_0__splink__input_table_12310167475thermon group holdings incorporatedthermon group holdings incorporated12.492261e+06delawaredelaware, united states00.3728420.0021390.5510651.00RMN KRP HLTNKS0RMN KRP HLTNKS2310thermon group holdings, inc.167475thermon group holdings, inc
1566.8166910.991207__splink__input_table_0__splink__input_table_16677118432aon public limited companyaon public limited company12.492261e+06l2ireland00.0001110.0083150.5510651.0ANAN6677aon plc118432aon plc
1586.8166910.991207__splink__input_table_0__splink__input_table_1595580272minority equality opportunities acquisition in...minority equality opportunities acquisition in...12.492261e+06delawaredelaware, united states00.3728420.0021390.5510651.0MNRT EKLT OPRTNTS AKKSXNMNRT EKLT OPRTNTS AKKSXN5955minority equality opportunities acquisition inc.80272minority equality opportunities acquisition inc
\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r record_id_x company_name_sec record_id_y company_name_ex21\n", + "0 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 8180 159390 national instruments corporation national instruments corporation 1 2.492261e+06 delaware republic of korea 0 0.372842 0.000234 0.551065 1.0 NXNL INSTRMNTS NXNL INSTRMNTS 8180 national instruments corp 159390 national instruments (korea) corporation\n", + "176 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6034 107265 afternext healthtech acquisition corporation afternext healthtech acquisition corporation 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 AFTRNKST HL0TX AKKSXN AFTRNKST HL0TX AKKSXN 6034 afternext healthtech acquisition corp. 
107265 afternext healthtech acquisition corp\n", + "178 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117610 gap incorporated gap incorporated 1 2.492261e+06 delaware puerto rico 0 0.372842 0.001548 0.551065 1.0 KP KP 6799 gap inc 117610 gap (puerto rico), inc\n", + "183 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5811 170135 rockley photonics holdings limited rockley photonics holdings limited 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 RKL FTNKS HLTNKS RKL FTNKS HLTNKS 5811 rockley photonics holdings ltd 170135 rockley photonics holdings limited\n", + "184 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117608 gap incorporated gap incorporated 1 2.492261e+06 delaware california 0 0.372842 0.015978 0.551065 1.0 KP KP 6799 gap inc 117608 gap (itm) inc\n", + "186 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117605 gap incorporated gap incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 KP KP 6799 gap inc 117605 gap (canada) inc\n", + "412 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1524 165843 aircastle limited aircastle limited 1 2.492261e+06 d0 ireland 0 0.000150 0.008315 0.551065 1.0 ARKSTL ARKSTL 1524 aircastle ltd 165843 aircastle (ireland) limited\n", + "189 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6753 115383 arthur j gallagher and company arthur j gallagher and company 1 2.492261e+06 illinois delaware 0 0.006115 0.372842 0.551065 1.0 AR0R J KLKHR ANT AR0R J KLKHR ANT 6753 arthur j. gallagher & co. 115383 arthur j. 
gallagher & co\n", + "193 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6651 110797 flowserve corporation flowserve corporation 1 2.492261e+06 new york mauritius 0 0.009913 0.001075 0.551065 1.0 FLSRF FLSRF 6651 flowserve corp 110797 flowserve (mauritius) corporation\n", + "406 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 578 24844 united parcel service incorporated united parcel service incorporated 1 2.492261e+06 delaware ohio 0 0.372842 0.008136 0.551065 1.0 UNTT PRSL SRFS UNTT PRSL SRFS 578 united parcel service inc 24844 united parcel service, inc\n", + "198 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5812 171905 nextracker incorporated nextracker incorporated 1 2.492261e+06 delaware united states delaware 0 0.372842 0.002278 0.551065 1.0 NKSTRKR NKSTRKR 5812 nextracker inc. 171905 nextracker inc\n", + "199 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5843 51850 sculptor acquisition corp i sculptor acquisition corp i 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 SKLPTR AKKSXN I SKLPTR AKKSXN I 5843 sculptor acquisition corp i 51850 sculptor acquisition corp i\n", + "174 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 7095 179994 cintas corporation cintas corporation 1 2.492261e+06 washington nevada 0 0.002996 0.014652 0.551065 1.0 SNTS SNTS 7095 cintas corp 179994 cintas corporation\n", + "405 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 285 12641 onespan incorporated onespan incorporated 1 2.492261e+06 delaware usa, state of delaware 0 0.372842 0.000011 0.551065 1.0 ONSPN ONSPN 285 onespan inc. 12641 onespan inc\n", + "207 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6282 97173 mars acquisition corporation mars acquisition corporation 1 2.492261e+06 e9 delaware 0 0.001069 0.372842 0.551065 1.0 MRS AKKSXN MRS AKKSXN 6282 mars acquisition corp. 
97173 mars acquisition corp\n", + "212 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4834 97747 viatris incorporated viatris incorporated 1 2.492261e+06 delaware philippines 0 0.372842 0.001927 0.551065 1.0 FTRS FTRS 4834 viatris inc 97747 viatris, inc\n", + "397 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1205 35911 turning point brands incorporated turning point brands incorporated 1 2.492261e+06 delaware ontario, canada 0 0.372842 0.000852 0.551065 1.0 TRNNK PNT BRNTS TRNNK PNT BRNTS 1205 turning point brands, inc. 35911 turning point brands (canada) inc\n", + "396 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1171 35941 clearpoint neuro incorporated clearpoint neuro incorporated 1 2.492261e+06 delaware canada new brunswick 0 0.372842 0.000006 0.551065 1.0 KLRPNT NR KLRPNT NR 1171 clearpoint neuro, inc. 35941 clearpoint neuro (canada) inc\n", + "393 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1765 51537 genpact limited genpact limited 1 2.492261e+06 d0 united kingdom 0 0.000150 0.031521 0.551065 1.0 JNPKT JNPKT 1765 genpact ltd 51537 genpact (uk) ltd\n", + "223 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6181 106386 perimeter solutions sa perimeter solutions sa 1 2.492261e+06 n4 grand of luxembourg 0 0.000017 0.000011 0.551065 1.0 PRMTR SLXNS S PRMTR SLXNS S 6181 perimeter solutions, sa 106386 perimeter solutions sa\n", + "390 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 949 34324 ceva incorporated ceva incorporated 1 2.492261e+06 delaware cayman islands 0 0.372842 0.015387 0.551065 1.0 SF SF 949 ceva inc 34324 ceva inc\n", + "226 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6825 123476 harte hanks incorporated harte hanks incorporated 1 2.492261e+06 delaware ohio 0 0.372842 0.008136 0.551065 1.0 HRT HNKS HRT HNKS 6825 harte hanks inc 123476 harte hanks, inc\n", + "228 6.816691 0.991207 __splink__input_table_0 
__splink__input_table_1 234 6600 jones lang lasalle incorporated jones lang lasalle incorporated 1 2.492261e+06 maryland puerto rico 0 0.007786 0.001548 0.551065 1.0 JNS LNK LSL JNS LNK LSL 234 jones lang lasalle inc 6600 jones lang lasalle (puerto rico), inc\n", + "229 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 234 6596 jones lang lasalle incorporated jones lang lasalle incorporated 1 2.492261e+06 maryland philippines 0 0.007786 0.001927 0.551065 1.0 JNS LNK LSL JNS LNK LSL 234 jones lang lasalle inc 6596 jones lang lasalle (philippines), inc\n", + "231 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2097 54939 optimizerx corporation optimizerx corporation 1 2.492261e+06 nevada michigan 0 0.014652 0.007151 0.551065 1.0 OPTMSRKS OPTMSRKS 2097 optimizerx corp 54939 optimizerx corporation\n", + "201 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6176 166072 phoenix motor incorporated phoenix motor incorporated 1 2.492261e+06 delaware us 0 0.372842 0.000908 0.551065 1.0 FNKS MTR FNKS MTR 6176 phoenix motor inc. 166072 phoenix motor inc\n", + "232 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2117 57288 transocean limited transocean limited 1 2.492261e+06 v8 switzerland 0 0.000033 0.006421 0.551065 1.0 TRNSSN TRNSSN 2117 transocean ltd. 
57288 transocean ltd\n", + "421 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1348 40725 lazard group limited liability company lazard group limited liability company 1 2.492261e+06 delaware us 0 0.372842 0.000908 0.551065 1.0 LSRT KRP LSRT KRP 1348 lazard group llc 40725 lazard group llc\n", + "169 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6922 189462 analog devices incorporated analog devices incorporated 1 2.492261e+06 massachusetts united states 0 0.004466 0.012146 0.551065 1.0 ANLK TFSS ANLK TFSS 6922 analog devices inc 189462 analog devices, inc\n", + "115 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2485 167379 ameriguard security services incorporated ameriguard security services incorporated 1 2.492261e+06 nevada california 0 0.014652 0.015978 0.551065 1.0 AMRKRT SKRT SRFSS AMRKRT SKRT SRFSS 2485 ameriguard security services, inc. 167379 ameriguard security services, inc\n", + "116 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2486 167379 ameriguard security services incorporated ameriguard security services incorporated 1 2.492261e+06 nevada california 0 0.014652 0.015978 0.551065 1.0 AMRKRT SKRT SRFSS AMRKRT SKRT SRFSS 2486 ameriguard security services, inc. 167379 ameriguard security services, inc\n", + "120 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4683 95837 advantage solutions incorporated advantage solutions incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 ATFNTJ SLXNS ATFNTJ SLXNS 4683 advantage solutions inc. 
95837 advantage solutions inc\n", + "445 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 926 165871 commvault systems incorporated commvault systems incorporated 1 2.492261e+06 delaware ontario, canada 0 0.372842 0.000852 0.551065 1.0 KMFLT SSTMS KMFLT SSTMS 926 commvault systems inc 165871 commvault systems (canada) inc\n", + "124 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4148 90738 firstsun capital bancorp firstsun capital bancorp 1 2.492261e+06 delaware new mexico 0 0.372842 0.000652 0.551065 1.0 FRSTSN KPTL BNKRP FRSTSN KPTL BNKRP 4148 firstsun capital bancorp 90738 firstsun capital bancorp\n", + "126 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5544 26048 taboola com limited taboola com limited 1 2.492261e+06 l3 israel 0 0.000061 0.003057 0.551065 1.0 TBL KM TBL KM 5544 taboola.com ltd. 26048 taboola.com ltd\n", + "443 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2 96 henry schein incorporated henry schein incorporated 1 2.492261e+06 delaware pennsylvania 0 0.372842 0.007919 0.551065 1.0 HNR SXN HNR SXN 2 henry schein inc 96 henry schein (lancaster, pa) inc\n", + "132 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6668 117995 tomi environmental solutions incorporated tomi environmental solutions incorporated 1 2.492261e+06 florida nevada 0 0.014691 0.014652 0.551065 1.0 TM ENFRNMNTL SLXNS TM ENFRNMNTL SLXNS 6668 tomi environmental solutions, inc. 
117995 tomi environmental solutions, inc\n", + "136 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6148 107455 esab corporation esab corporation 1 2.492261e+06 delaware united states 0 0.372842 0.012146 0.551065 1.0 ESB ESB 6148 esab corp 107455 esab corporation\n", + "137 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6958 104521 apache corporation apache corporation 1 2.492261e+06 delaware new jersey 0 0.372842 0.006143 0.551065 1.0 APX APX 6958 apache corp 104521 apache corporation\n", + "138 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 7011 121758 ncr corporation ncr corporation 1 2.492261e+06 maryland new zealand 0 0.007786 0.002590 0.551065 1.0 NKR NKR 7011 ncr corp 121758 ncr (nz) corporation\n", + "423 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 77 165059 jakks pacific incorporated jakks pacific incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 JKS PSFK JKS PSFK 77 jakks pacific inc 165059 jakks pacific (canada), inc\n", + "139 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4902 170051 gan limited gan limited 1 2.492261e+06 d0 england and wales 0 0.000150 0.003536 0.551065 1.0 KN KN 4902 gan ltd 170051 gan (uk) limited\n", + "141 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6613 108716 cts corporation cts corporation 1 2.492261e+06 indiana delaware 0 0.004060 0.372842 0.551065 1.0 KTS KTS 6613 cts corp 108716 cts corporation\n", + "437 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 738 29776 garmin limited garmin limited 1 2.492261e+06 v8 thailand 0 0.000033 0.002378 0.551065 1.0 KRMN KRMN 738 garmin ltd 29776 garmin (thailand) ltd\n", + "435 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 277 9849 c h robinson worldwide incorporated c h robinson worldwide incorporated 1 2.492261e+06 delaware united states 0 0.372842 0.012146 0.551065 1.0 K H RBNSN WRLTWT K H RBNSN WRLTWT 277 
c. h. robinson worldwide, inc. 9849 c.h. robinson worldwide, inc\n", + "146 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6763 176423 richardson electronics limited richardson electronics limited 1 2.492261e+06 delaware thailand 0 0.372842 0.002378 0.551065 1.0 RXRTSN ELKTRNKS RXRTSN ELKTRNKS 6763 richardson electronics, ltd. 176423 richardson electronics (thailand) limited\n", + "149 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4875 98755 api group corporation api group corporation 1 2.492261e+06 d8 delaware 0 0.000078 0.372842 0.551065 1.0 AP KRP AP KRP 4875 api group corp 98755 api group corporation\n", + "432 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2310 167475 thermon group holdings incorporated thermon group holdings incorporated 1 2.492261e+06 delaware delaware, united states 0 0.372842 0.002139 0.551065 1.0 0RMN KRP HLTNKS 0RMN KRP HLTNKS 2310 thermon group holdings, inc. 167475 thermon group holdings, inc\n", + "156 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6677 118432 aon public limited company aon public limited company 1 2.492261e+06 l2 ireland 0 0.000111 0.008315 0.551065 1.0 AN AN 6677 aon plc 118432 aon plc\n", + "158 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5955 80272 minority equality opportunities acquisition in... minority equality opportunities acquisition in... 1 2.492261e+06 delaware delaware, united states 0 0.372842 0.002139 0.551065 1.0 MNRT EKLT OPRTNTS AKKSXN MNRT EKLT OPRTNTS AKKSXN 5955 minority equality opportunities acquisition inc. 
80272 minority equality opportunities acquisition inc" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.sort_values(by=\"match_probability\").iloc[0:50]" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "59cc74aa-674b-4c89-95d6-181d0f7c162a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_namebf_company_nameloc_of_incorporation_lloc_of_incorporation_rgamma_loc_of_incorporationtf_loc_of_incorporation_ltf_loc_of_incorporation_rbf_loc_of_incorporationbf_tf_adj_loc_of_incorporationcompany_name_mphone_lcompany_name_mphone_r
06.3399090.987805__splink__input_table_0__splink__input_table_18180159390national instruments corporationnational instruments corporation21.774257e+06delawarerepublic of korea00.3728420.0002340.5562301.000000NXNL INSTRMNTSNXNL INSTRMNTS
16.3399090.987805__splink__input_table_0__splink__input_table_17912154757enbridge incorporatedenbridge incorporated21.774257e+06a0alberta00.0000330.0008800.5562301.000000ENBRJENBRJ
26.3399090.987805__splink__input_table_0__splink__input_table_17557140921spectrum pharmaceuticals incorporatedspectrum pharmaceuticals incorporated21.774257e+06delawarecayman islands00.3728420.0153870.5562301.000000SPKTRM FRMSTKLSSPKTRM FRMSTKLS
37.7176390.995272__splink__input_table_0__splink__input_table_18057152329american eagle outfitters incorporatedamerican eagle outfitters incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079AMRKN EKL OTFTRSAMRKN EKL OTFTRS
414.1263620.999944__splink__input_table_0__splink__input_table_1731528974pruco life insurance companypruco life insurance company21.774257e+06arizonaarizona20.0043880.0043882.48746749.368830PRK LF INSRNSPRK LF INSRNS
57.1861560.993180__splink__input_table_0__splink__input_table_17419142779national presto industries incorporatednational presto industries incorporated21.774257e+06wisconsinNone-10.004110NaN1.0000001.000000NXNL PRST INTSTRSNXNL PRST INTSTRS
66.3399090.987805__splink__input_table_0__splink__input_table_17387142016national bankshares incorporatednational bankshares incorporated21.774257e+06virginiacommonwealth virginia00.0062760.0000220.5562301.000000NXNL BNKXRSNXNL BNKXRS
713.6101420.999920__splink__input_table_0__splink__input_table_17387127697national bankshares incorporatednational bankshares incorporated21.774257e+06virginiavirginia20.0062760.0062762.48746734.518756NXNL BNKXRSNXNL BNKXRS
87.7176390.995272__splink__input_table_0__splink__input_table_18258162906thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.5810790RM FXR SSNTFK0RM FXR SSNTFK
912.1018550.999773__splink__input_table_0__splink__input_table_1742860197general motors financial company incorporatedgeneral motors financial company incorporated21.774257e+06texastexas20.0178540.0178542.48746712.134323JNRL MTRS FNNXLJNRL MTRS FNNXL
106.3399090.987805__splink__input_table_0__splink__input_table_18258163501thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawaremexico00.3728420.0112050.5562301.0000000RM FXR SSNTFK0RM FXR SSNTFK
116.3399090.987805__splink__input_table_0__splink__input_table_1549852885apollo strategic growth capital iiapollo strategic growth capital ii21.774257e+06e9cayman islands00.0010690.0153870.5562301.000000APL STRTJK KR0 KPTLAPL STRTJK KR0 KPTL
126.3399090.987805__splink__input_table_0__splink__input_table_18258162892thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawarecanada00.3728420.0121910.5562301.0000000RM FXR SSNTFK0RM FXR SSNTFK
136.3399090.987805__splink__input_table_0__splink__input_table_18258162847thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawarerussia00.3728420.0011080.5562301.0000000RM FXR SSNTFK0RM FXR SSNTFK
146.3399090.987805__splink__input_table_0__splink__input_table_149818301intellinetics incorporatedintellinetics incorporated21.774257e+06nevadaohio00.0146520.0081360.5562301.000000INTLNTKSINTLNTKS
156.3399090.987805__splink__input_table_0__splink__input_table_11533165897high sierra technologies incorporatedhigh sierra technologies incorporated21.774257e+06coloradonevada00.0048170.0146520.5562301.000000H SR TXNLJSH SR TXNLJS
1613.9918580.999939__splink__input_table_0__splink__input_table_1212761213lnpr group incorporatedlnpr group incorporated21.774257e+06coloradocolorado20.0048170.0048172.48746744.974148LNPR KRPLNPR KRP
177.1861560.993180__splink__input_table_0__splink__input_table_1931969norwood financial corporationnorwood financial corporation21.774257e+06pennsylvaniaNone-10.007919NaN1.0000001.000000NRWT FNNXLNRWT FNNXL
186.3399090.987805__splink__input_table_0__splink__input_table_11512257nov incorporatednov incorporated21.774257e+06delawaremauritius00.3728420.0010750.5562301.000000NFNF
196.3399090.987805__splink__input_table_0__splink__input_table_128010975juniper networks incorporatedjuniper networks incorporated21.774257e+06delawarecalifornia, usa00.3728420.0002340.5562301.000000JNPR NTWRKSJNPR NTWRKS
203.2523920.905028__splink__input_table_0__splink__input_table_11399157790logiq incorporatedlogiq3 incorporated12.087284e+05delawarecanada00.3728420.0121910.5562301.000000LJKLJK
217.7176390.995272__splink__input_table_0__splink__input_table_11720166283edgio incorporatededgio incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079EJEJ
226.3399090.987805__splink__input_table_0__splink__input_table_12020184709arem pacific corporationarem pacific corporation21.774257e+06delawarearizona00.3728420.0043880.5562301.000000ARM PSFKARM PSFK
237.1861560.993180__splink__input_table_0__splink__input_table_175626596ensign group incorporatedensign group incorporated21.774257e+06Nonenevada-1NaN0.0146521.0000001.000000ENSKN KRPENSKN KRP
247.1861560.993180__splink__input_table_0__splink__input_table_1110424668cco holdings limited liability companycco holdings limited liability company21.774257e+06Nonedelaware-1NaN0.3728421.0000001.000000KK HLTNKSKK HLTNKS
257.7176390.995272__splink__input_table_0__splink__input_table_132111011pc connection incorporatedpc connection incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079KNKXNKNKXN
266.3399090.987805__splink__input_table_0__splink__input_table_147714483polarityte incorporatedpolarityte incorporated21.774257e+06delawarenevada00.3728420.0146520.5562301.000000PLRTTPLRTT
277.7176390.995272__splink__input_table_0__splink__input_table_181025991atlas air worldwide holdings incorporatedatlas air worldwide holdings incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079ATLS AR WRLTWT HLTNKSATLS AR WRLTWT HLTNKS
286.3399090.987805__splink__input_table_0__splink__input_table_11003166010spi energy co limitedspi energy co limited21.774257e+06e9cayman00.0010690.0003450.5562301.000000SP ENRJSP ENRJ
297.7176390.995272__splink__input_table_0__splink__input_table_11012165926bimi international medical incorporatedbimi international medical incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079BM INTRNXNL MTKLBM INTRNXNL MTKL
307.1861560.993180__splink__input_table_0__splink__input_table_1186851876phreesia incorporatedphreesia incorporated21.774257e+06delawareNone-10.372842NaN1.0000001.000000FRXFRX
316.3399090.987805__splink__input_table_0__splink__input_table_1219878290secureworks corporationsecureworks corporation21.774257e+06delawareunited states00.3728420.0121460.5562301.000000SKRWRKSSKRWRKS
327.7176390.995272__splink__input_table_0__splink__input_table_1227358771ryerson holding corporationryerson holding corporation21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079RYRSN HLTNKRYRSN HLTNK
337.1861560.993180__splink__input_table_0__splink__input_table_12219106comfort systems usa incorporatedcomfort systems usa incorporated21.774257e+06Nonearkansas-1NaN0.0012531.0000001.000000KMFRT SSTMS USKMFRT SSTMS US
3414.3518090.999952__splink__input_table_0__splink__input_table_1478180383winnebago industries incorporatedwinnebago industries incorporated21.774257e+06minnesotaminnesota20.0037540.0037542.48746757.719048WNBK INTSTRSWNBK INTSTRS
356.3399090.987805__splink__input_table_0__splink__input_table_11913166068renewable energy acquisition corporationrenewable energy acquisition corporation21.774257e+06nevadaus00.0146520.0009080.5562301.000000RNWBL ENRJ AKKSXNRNWBL ENRJ AKKSXN
367.1861560.993180__splink__input_table_0__splink__input_table_1257164606riverview bancorp incorporatedriverview bancorp incorporated21.774257e+06washingtonNone-10.002996NaN1.0000001.000000RFRF BNKRPRFRF BNKRP
377.1861560.993180__splink__input_table_0__splink__input_table_1294182945timberland bancorp incorporatedtimberland bancorp incorporated21.774257e+06washingtonNone-10.002996NaN1.0000001.000000TMBRLNT BNKRPTMBRLNT BNKRP
387.7176390.995272__splink__input_table_0__splink__input_table_141518543lkq corporationlkq corporation21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079LKKLKK
397.7176390.995272__splink__input_table_0__splink__input_table_167423252berkshire hills bancorp incorporatedberkshire hills bancorp incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079BRKXR HLS BNKRPBRKXR HLS BNKRP
406.3399090.987805__splink__input_table_0__splink__input_table_11270181001dolby laboratories incorporateddolby laboratories incorporated21.774257e+06delawarecalifornia00.3728420.0159780.5562301.000000TLB LBRTRSTLB LBRTRS
413.2523920.905028__splink__input_table_0__splink__input_table_11321132984tss incorporateddss incorporated12.087284e+05delawarenew york00.3728420.0099130.5562301.000000TSTS
427.7176390.995272__splink__input_table_0__splink__input_table_1148246045anywhere real estate incorporatedanywhere real estate incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079ANHR RL ESTTANHR RL ESTT
436.3399090.987805__splink__input_table_0__splink__input_table_1149447625kbr incorporatedkbr incorporated21.774257e+06delawareunited states00.3728420.0121460.5562301.000000KBRKBR
447.7176390.995272__splink__input_table_0__splink__input_table_11972166348reshape lifesciences incorporatedreshape lifesciences incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079RXP LFSSNSSRXP LFSSNSS
4512.3870180.999813__splink__input_table_0__splink__input_table_11457172081imperalis holding corporationimperalis holding corporation21.774257e+06nevadanevada20.0146520.0146522.48746714.786255IMPRLS HLTNKIMPRLS HLTNK
4612.3870180.999813__splink__input_table_0__splink__input_table_12037172091bitnile metaverse incorporatedbitnile metaverse incorporated21.774257e+06nevadanevada20.0146520.0146522.48746714.786255BTNL MTFRSBTNL MTFRS
477.7176390.995272__splink__input_table_0__splink__input_table_1105835808qvc incorporatedqvc incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079KFKKFK
489.6928770.998793__splink__input_table_0__splink__input_table_1170547703irhythm technologies incorporatedirhythm technologies incorporated21.774257e+06delawareus delaware10.3728420.0003235.6832681.000000IRH0M TXNLJSIRH0M TXNLJS
497.1861560.993180__splink__input_table_0__splink__input_table_133813985essex property trust incorporatedessex property trust incorporated21.774257e+06marylandNone-10.007786NaN1.0000001.000000ESKS PRPRT TRSTESKS PRPRT TRST
\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r\n", + "0 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8180 159390 national instruments corporation national instruments corporation 2 1.774257e+06 delaware republic of korea 0 0.372842 0.000234 0.556230 1.000000 NXNL INSTRMNTS NXNL INSTRMNTS\n", + "1 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7912 154757 enbridge incorporated enbridge incorporated 2 1.774257e+06 a0 alberta 0 0.000033 0.000880 0.556230 1.000000 ENBRJ ENBRJ\n", + "2 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7557 140921 spectrum pharmaceuticals incorporated spectrum pharmaceuticals incorporated 2 1.774257e+06 delaware cayman islands 0 0.372842 0.015387 0.556230 1.000000 SPKTRM FRMSTKLS SPKTRM FRMSTKLS\n", + "3 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 8057 152329 american eagle outfitters incorporated american eagle outfitters incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 AMRKN EKL OTFTRS AMRKN EKL OTFTRS\n", + "4 14.126362 0.999944 __splink__input_table_0 __splink__input_table_1 7315 28974 pruco life insurance company pruco life insurance company 2 1.774257e+06 arizona arizona 2 0.004388 0.004388 2.487467 49.368830 PRK LF INSRNS PRK LF INSRNS\n", + "5 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 7419 142779 national presto industries incorporated national presto industries incorporated 2 1.774257e+06 wisconsin None -1 0.004110 NaN 1.000000 1.000000 NXNL PRST INTSTRS NXNL PRST INTSTRS\n", + "6 6.339909 0.987805 __splink__input_table_0 
__splink__input_table_1 7387 142016 national bankshares incorporated national bankshares incorporated 2 1.774257e+06 virginia commonwealth virginia 0 0.006276 0.000022 0.556230 1.000000 NXNL BNKXRS NXNL BNKXRS\n", + "7 13.610142 0.999920 __splink__input_table_0 __splink__input_table_1 7387 127697 national bankshares incorporated national bankshares incorporated 2 1.774257e+06 virginia virginia 2 0.006276 0.006276 2.487467 34.518756 NXNL BNKXRS NXNL BNKXRS\n", + "8 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 8258 162906 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "9 12.101855 0.999773 __splink__input_table_0 __splink__input_table_1 7428 60197 general motors financial company incorporated general motors financial company incorporated 2 1.774257e+06 texas texas 2 0.017854 0.017854 2.487467 12.134323 JNRL MTRS FNNXL JNRL MTRS FNNXL\n", + "10 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 163501 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware mexico 0 0.372842 0.011205 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "11 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 5498 52885 apollo strategic growth capital ii apollo strategic growth capital ii 2 1.774257e+06 e9 cayman islands 0 0.001069 0.015387 0.556230 1.000000 APL STRTJK KR0 KPTL APL STRTJK KR0 KPTL \n", + "12 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 162892 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware canada 0 0.372842 0.012191 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "13 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 162847 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware russia 0 
0.372842 0.001108 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "14 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 498 18301 intellinetics incorporated intellinetics incorporated 2 1.774257e+06 nevada ohio 0 0.014652 0.008136 0.556230 1.000000 INTLNTKS INTLNTKS\n", + "15 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1533 165897 high sierra technologies incorporated high sierra technologies incorporated 2 1.774257e+06 colorado nevada 0 0.004817 0.014652 0.556230 1.000000 H SR TXNLJS H SR TXNLJS\n", + "16 13.991858 0.999939 __splink__input_table_0 __splink__input_table_1 2127 61213 lnpr group incorporated lnpr group incorporated 2 1.774257e+06 colorado colorado 2 0.004817 0.004817 2.487467 44.974148 LNPR KRP LNPR KRP\n", + "17 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 93 1969 norwood financial corporation norwood financial corporation 2 1.774257e+06 pennsylvania None -1 0.007919 NaN 1.000000 1.000000 NRWT FNNXL NRWT FNNXL\n", + "18 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 151 2257 nov incorporated nov incorporated 2 1.774257e+06 delaware mauritius 0 0.372842 0.001075 0.556230 1.000000 NF NF\n", + "19 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 280 10975 juniper networks incorporated juniper networks incorporated 2 1.774257e+06 delaware california, usa 0 0.372842 0.000234 0.556230 1.000000 JNPR NTWRKS JNPR NTWRKS\n", + "20 3.252392 0.905028 __splink__input_table_0 __splink__input_table_1 1399 157790 logiq incorporated logiq3 incorporated 1 2.087284e+05 delaware canada 0 0.372842 0.012191 0.556230 1.000000 LJK LJK\n", + "21 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1720 166283 edgio incorporated edgio incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 EJ EJ\n", + "22 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 2020 184709 arem pacific corporation arem pacific corporation 2 
1.774257e+06 delaware arizona 0 0.372842 0.004388 0.556230 1.000000 ARM PSFK ARM PSFK\n", + "23 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 756 26596 ensign group incorporated ensign group incorporated 2 1.774257e+06 None nevada -1 NaN 0.014652 1.000000 1.000000 ENSKN KRP ENSKN KRP\n", + "24 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 1104 24668 cco holdings limited liability company cco holdings limited liability company 2 1.774257e+06 None delaware -1 NaN 0.372842 1.000000 1.000000 KK HLTNKS KK HLTNKS\n", + "25 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 321 11011 pc connection incorporated pc connection incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 KNKXN KNKXN\n", + "26 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 477 14483 polarityte incorporated polarityte incorporated 2 1.774257e+06 delaware nevada 0 0.372842 0.014652 0.556230 1.000000 PLRTT PLRTT\n", + "27 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 810 25991 atlas air worldwide holdings incorporated atlas air worldwide holdings incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 ATLS AR WRLTWT HLTNKS ATLS AR WRLTWT HLTNKS\n", + "28 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1003 166010 spi energy co limited spi energy co limited 2 1.774257e+06 e9 cayman 0 0.001069 0.000345 0.556230 1.000000 SP ENRJ SP ENRJ\n", + "29 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1012 165926 bimi international medical incorporated bimi international medical incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 BM INTRNXNL MTKL BM INTRNXNL MTKL\n", + "30 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 1868 51876 phreesia incorporated phreesia incorporated 2 1.774257e+06 delaware None -1 0.372842 NaN 1.000000 1.000000 FRX FRX\n", + "31 6.339909 0.987805 
__splink__input_table_0 __splink__input_table_1 2198 78290 secureworks corporation secureworks corporation 2 1.774257e+06 delaware united states 0 0.372842 0.012146 0.556230 1.000000 SKRWRKS SKRWRKS\n", + "32 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 2273 58771 ryerson holding corporation ryerson holding corporation 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 RYRSN HLTNK RYRSN HLTNK\n", + "33 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 221 9106 comfort systems usa incorporated comfort systems usa incorporated 2 1.774257e+06 None arkansas -1 NaN 0.001253 1.000000 1.000000 KMFRT SSTMS US KMFRT SSTMS US\n", + "34 14.351809 0.999952 __splink__input_table_0 __splink__input_table_1 478 180383 winnebago industries incorporated winnebago industries incorporated 2 1.774257e+06 minnesota minnesota 2 0.003754 0.003754 2.487467 57.719048 WNBK INTSTRS WNBK INTSTRS\n", + "35 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1913 166068 renewable energy acquisition corporation renewable energy acquisition corporation 2 1.774257e+06 nevada us 0 0.014652 0.000908 0.556230 1.000000 RNWBL ENRJ AKKSXN RNWBL ENRJ AKKSXN\n", + "36 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 257 164606 riverview bancorp incorporated riverview bancorp incorporated 2 1.774257e+06 washington None -1 0.002996 NaN 1.000000 1.000000 RFRF BNKRP RFRF BNKRP\n", + "37 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 294 182945 timberland bancorp incorporated timberland bancorp incorporated 2 1.774257e+06 washington None -1 0.002996 NaN 1.000000 1.000000 TMBRLNT BNKRP TMBRLNT BNKRP\n", + "38 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 415 18543 lkq corporation lkq corporation 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 LKK LKK\n", + "39 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 674 23252 berkshire hills bancorp 
incorporated berkshire hills bancorp incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 BRKXR HLS BNKRP BRKXR HLS BNKRP\n", + "40 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1270 181001 dolby laboratories incorporated dolby laboratories incorporated 2 1.774257e+06 delaware california 0 0.372842 0.015978 0.556230 1.000000 TLB LBRTRS TLB LBRTRS\n", + "41 3.252392 0.905028 __splink__input_table_0 __splink__input_table_1 1321 132984 tss incorporated dss incorporated 1 2.087284e+05 delaware new york 0 0.372842 0.009913 0.556230 1.000000 TS TS\n", + "42 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1482 46045 anywhere real estate incorporated anywhere real estate incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 ANHR RL ESTT ANHR RL ESTT\n", + "43 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1494 47625 kbr incorporated kbr incorporated 2 1.774257e+06 delaware united states 0 0.372842 0.012146 0.556230 1.000000 KBR KBR\n", + "44 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1972 166348 reshape lifesciences incorporated reshape lifesciences incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 RXP LFSSNSS RXP LFSSNSS\n", + "45 12.387018 0.999813 __splink__input_table_0 __splink__input_table_1 1457 172081 imperalis holding corporation imperalis holding corporation 2 1.774257e+06 nevada nevada 2 0.014652 0.014652 2.487467 14.786255 IMPRLS HLTNK IMPRLS HLTNK\n", + "46 12.387018 0.999813 __splink__input_table_0 __splink__input_table_1 2037 172091 bitnile metaverse incorporated bitnile metaverse incorporated 2 1.774257e+06 nevada nevada 2 0.014652 0.014652 2.487467 14.786255 BTNL MTFRS BTNL MTFRS\n", + "47 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1058 35808 qvc incorporated qvc incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 KFK KFK\n", + 
"48 9.692877 0.998793 __splink__input_table_0 __splink__input_table_1 1705 47703 irhythm technologies incorporated irhythm technologies incorporated 2 1.774257e+06 delaware us delaware 1 0.372842 0.000323 5.683268 1.000000 IRH0M TXNLJS IRH0M TXNLJS\n", + "49 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 338 13985 essex property trust incorporated essex property trust incorporated 2 1.774257e+06 maryland None -1 0.007786 NaN 1.000000 1.000000 ESKS PRPRT TRST ESKS PRPRT TRST" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df[preds_df.match_probability > .9]" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "8e658c36-7b6f-480f-9d74-37af9510ebe2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_probabilitycompany_name_lcompany_name_rloc_list_lloc_list_rcompany_name_mphone_lcompany_name_mphone_r
4650.914612conns incorporatedinvenco incorporated[delaware][delaware]KNSINFNK
4660.914612vishay intertechnology incorporatedvishay precision foil, incorporated[delaware][delaware]FX INTRTXNLJFX PRSXN FL
4670.980607vishay precision group, incorporatedvishay precision foil, incorporated[delaware][delaware]FX PRSXN KRPFX PRSXN FL
4700.975104jones lang lasalle incorporatedjones lang lasalle limited[maryland][hong, kong]JNS LNK LSLJNS LNK LSL
4710.951657nrg energy, incorporatednrg energy, incorporated[delaware][delaware]NRK ENRJNRK ENRJ
4720.914612firstenergy corporationfirstenergy ventures corporation[ohio][ohio]FRSTNRJFRSTNRJ FNTRS
4780.914612hudson pacific properties, incorporatedhudson pacific services, incorporated[maryland][maryland]HTSN PSFK PRPRTSHTSN PSFK SRFSS
4790.980607hudson pacific properties, incorporatedhudson pacific properties, limited partnership[maryland][maryland]HTSN PSFK PRPRTSHTSN PSFK PRPRTS
4810.914612digital ally, incorporateddigital ally international, incorporated[nevada][nevada]TJTL ALTJTL AL INTRNXNL
4890.976947cco holdings limited liability companyrhfw holdings, limited liability companyNaN[delaware]KK HLTNKSRHF HLTNKS
4930.975104intuitive surgical incorporatedintuitive surgical limited[delaware][united, kingdom]INTTF SRJKLINTTF SRJKL
4940.975104jones lang lasalle incorporatedjones lang lasalle limited[maryland][england]JNS LNK LSLJNS LNK LSL
5000.975104becton dickinson and companybecton, dickinson and company, limited[new, jersey][ireland]BKTN TKNSN ANTBKTN TKNSN ANT
5010.975104united parcel service incorporatedunited guaranty services, incorporated[delaware][north, carolina]UNTT PRSL SRFSUNTT KRNT SRFSS
5090.914612estee lauder companies incorporatedestee lauder international, incorporated[delaware][delaware]EST LTR KMPNSEST LTR INTRNXNL
5100.914612maxcyte, incorporatedcues, incorporated[delaware][delaware]MKSSTKS
5150.980607zimmer biomet holdings, incorporatedzimmer biomet spine, incorporated[delaware][delaware]SMR BMT HLTNKSSMR BMT SPN
5180.914612nordicus partners corporationnordco enterprises, incorporated[delaware][wilmington, delaware]NRTKS PRTNRSNRTK ENTRPRSS
5190.975104valero energy corp/txvalero energy incorporated[delaware][canada]FLR ENRJ TKSFLR ENRJ
5270.914612nrg energy, incorporatednrg energy holdings incorporated[delaware][delaware]NRK ENRJNRK ENRJ HLTNKS
5280.914612everi holdings incorporatededi holdings, incorporated[delaware][delaware]EFR HLTNKSET HLTNKS
5350.914612estee lauder companies incorporatedestee lauder incorporated[delaware][delaware]EST LTR KMPNSEST LTR
5480.975104universal logistics holdings, incorporateduniversal logistics corporation[michigan][florida]UNFRSL LJSTKS HLTNKSUNFRSL LJSTKS
5510.975104alliant energy corporationallergan gi corporation[wisconsin][delaware]ALNT ENRJALRKN J
5550.975104smartmetric, incorporatedsmartpetro incorporated[nevada][philippines]SMRTMTRKSMRTPTR
5660.914612republic services, incorporatedrepublic conduit, incorporated[delaware][delaware]RPBLK SRFSSRPBLK KNTT
5710.975104freedom holdings, incorporatedfreedom designs, incorporated[maryland][california]FRTM HLTNKSFRTM TSKNS
5730.938457ares real estate income trust incorporatedares real estate income trust incorporated[maryland][delaware]ARS RL ESTT INKM TRSTARS RL ESTT INKM TRST
5740.975104bank of new york mellon corporationbank of new york mellon sa/nv[delaware][belgium]BNK OF N YRK MLNBNK OF N YRK MLN SNF
5760.914612southern companysouthern wood piedmont company[delaware][delaware]S0RNS0RN WT PTMNT
5820.914612ameresco, incorporatedameripath, incorporated[delaware][delaware]AMRSKAMRP0
5840.914612trevena incorporatedanr, incorporated[delaware][delaware]TRFNANR
5900.975104bank of new york mellon corporationbank of new york mellon[delaware][new, york]BNK OF N YRK MLNBNK OF N YRK MLN
5910.938457xerox holdings corporationxerox holdings corporation[connecticut][new, york]SRKS HLTNKSSRKS HLTNKS
5940.975104jones lang lasalle incorporatedjones lang lasalle ip, incorporated[maryland][delaware]JNS LNK LSLJNS LNK LSL IP
5950.914612iron mountain incorporatediron mountain global holdings, incorporated[delaware][delaware]IRN MNTNIRN MNTN KLBL HLTNKS
5970.980607extreme networks incorporatedextreme networks ihc, incorporated[delaware][delaware]EKSTRM NTWRKSEKSTRM NTWRKS IK
5990.976947q2 holdings, incorporatedvr holdings, incorporatedNaN[colorado]K HLTNKSFR HLTNKS
6000.980607extreme networks incorporatedextreme networks, incorporated[delaware][delaware]EKSTRM NTWRKSEKSTRM NTWRKS
6040.914612cutera incorporatedvrec, incorporated[delaware][delaware]KTRFRK
6050.975104assured guaranty limitedassured guaranty services limited[d0][england]ASRT KRNTASRT KRNT SRFSS
6060.976947virtra, incorporatedviator, incorporated[nevada]NaNFRTRFTR
6180.975104sculptor capital management, incorporatedsculptor capital management hong kong limited[delaware][hong, kong]SKLPTR KPTL MNJMNTSKLPTR KPTL MNJMNT HNK KNK
6250.975104enstar group limitedenstar limited[d0][bermuda]ENSTR KRPENSTR
6260.975104sellas life sciences group, incorporatedsellas life sciences group limited[delaware][bermuda]SLS LF SSNSS KRPSLS LF SSNSS KRP
6270.975104intuitive surgical incorporatedintuitive surgical canada incorporated[delaware][canada]INTTF SRJKLINTTF SRJKL KNT
6300.951657forestar group incorporatedforestar group incorporated[delaware][delaware]FRSTR KRPFRSTR KRP
6370.914612dcp midstream, limited partnershipdcp midstream operating, limited partnership[delaware][delaware]TKP MTSTRMTKP MTSTRM OPRTNK
6390.951657equitable holdings, incorporatedequitable holdings, incorporated[delaware][delaware]EKTBL HLTNKSEKTBL HLTNKS
6430.914612energy transfer limited partnershipenergy transfer partners, limited liability co...[delaware][delaware]ENRJ TRNSFRENRJ TRNSFR PRTNRS
\n", + "
" + ], + "text/plain": [ + " match_probability company_name_l company_name_r loc_list_l loc_list_r company_name_mphone_l company_name_mphone_r\n", + "465 0.914612 conns incorporated invenco incorporated [delaware] [delaware] KNS INFNK\n", + "466 0.914612 vishay intertechnology incorporated vishay precision foil, incorporated [delaware] [delaware] FX INTRTXNLJ FX PRSXN FL\n", + "467 0.980607 vishay precision group, incorporated vishay precision foil, incorporated [delaware] [delaware] FX PRSXN KRP FX PRSXN FL\n", + "470 0.975104 jones lang lasalle incorporated jones lang lasalle limited [maryland] [hong, kong] JNS LNK LSL JNS LNK LSL\n", + "471 0.951657 nrg energy, incorporated nrg energy, incorporated [delaware] [delaware] NRK ENRJ NRK ENRJ\n", + "472 0.914612 firstenergy corporation firstenergy ventures corporation [ohio] [ohio] FRSTNRJ FRSTNRJ FNTRS\n", + "478 0.914612 hudson pacific properties, incorporated hudson pacific services, incorporated [maryland] [maryland] HTSN PSFK PRPRTS HTSN PSFK SRFSS\n", + "479 0.980607 hudson pacific properties, incorporated hudson pacific properties, limited partnership [maryland] [maryland] HTSN PSFK PRPRTS HTSN PSFK PRPRTS\n", + "481 0.914612 digital ally, incorporated digital ally international, incorporated [nevada] [nevada] TJTL AL TJTL AL INTRNXNL\n", + "489 0.976947 cco holdings limited liability company rhfw holdings, limited liability company NaN [delaware] KK HLTNKS RHF HLTNKS\n", + "493 0.975104 intuitive surgical incorporated intuitive surgical limited [delaware] [united, kingdom] INTTF SRJKL INTTF SRJKL\n", + "494 0.975104 jones lang lasalle incorporated jones lang lasalle limited [maryland] [england] JNS LNK LSL JNS LNK LSL\n", + "500 0.975104 becton dickinson and company becton, dickinson and company, limited [new, jersey] [ireland] BKTN TKNSN ANT BKTN TKNSN ANT\n", + "501 0.975104 united parcel service incorporated united guaranty services, incorporated [delaware] [north, carolina] UNTT PRSL SRFS UNTT KRNT 
SRFSS\n", + "509 0.914612 estee lauder companies incorporated estee lauder international, incorporated [delaware] [delaware] EST LTR KMPNS EST LTR INTRNXNL\n", + "510 0.914612 maxcyte, incorporated cues, incorporated [delaware] [delaware] MKSST KS\n", + "515 0.980607 zimmer biomet holdings, incorporated zimmer biomet spine, incorporated [delaware] [delaware] SMR BMT HLTNKS SMR BMT SPN\n", + "518 0.914612 nordicus partners corporation nordco enterprises, incorporated [delaware] [wilmington, delaware] NRTKS PRTNRS NRTK ENTRPRSS\n", + "519 0.975104 valero energy corp/tx valero energy incorporated [delaware] [canada] FLR ENRJ TKS FLR ENRJ\n", + "527 0.914612 nrg energy, incorporated nrg energy holdings incorporated [delaware] [delaware] NRK ENRJ NRK ENRJ HLTNKS\n", + "528 0.914612 everi holdings incorporated edi holdings, incorporated [delaware] [delaware] EFR HLTNKS ET HLTNKS\n", + "535 0.914612 estee lauder companies incorporated estee lauder incorporated [delaware] [delaware] EST LTR KMPNS EST LTR\n", + "548 0.975104 universal logistics holdings, incorporated universal logistics corporation [michigan] [florida] UNFRSL LJSTKS HLTNKS UNFRSL LJSTKS\n", + "551 0.975104 alliant energy corporation allergan gi corporation [wisconsin] [delaware] ALNT ENRJ ALRKN J\n", + "555 0.975104 smartmetric, incorporated smartpetro incorporated [nevada] [philippines] SMRTMTRK SMRTPTR\n", + "566 0.914612 republic services, incorporated republic conduit, incorporated [delaware] [delaware] RPBLK SRFSS RPBLK KNTT\n", + "571 0.975104 freedom holdings, incorporated freedom designs, incorporated [maryland] [california] FRTM HLTNKS FRTM TSKNS\n", + "573 0.938457 ares real estate income trust incorporated ares real estate income trust incorporated [maryland] [delaware] ARS RL ESTT INKM TRST ARS RL ESTT INKM TRST\n", + "574 0.975104 bank of new york mellon corporation bank of new york mellon sa/nv [delaware] [belgium] BNK OF N YRK MLN BNK OF N YRK MLN SNF\n", + "576 0.914612 southern company 
southern wood piedmont company [delaware] [delaware] S0RN S0RN WT PTMNT\n", + "582 0.914612 ameresco, incorporated ameripath, incorporated [delaware] [delaware] AMRSK AMRP0\n", + "584 0.914612 trevena incorporated anr, incorporated [delaware] [delaware] TRFN ANR\n", + "590 0.975104 bank of new york mellon corporation bank of new york mellon [delaware] [new, york] BNK OF N YRK MLN BNK OF N YRK MLN\n", + "591 0.938457 xerox holdings corporation xerox holdings corporation [connecticut] [new, york] SRKS HLTNKS SRKS HLTNKS\n", + "594 0.975104 jones lang lasalle incorporated jones lang lasalle ip, incorporated [maryland] [delaware] JNS LNK LSL JNS LNK LSL IP\n", + "595 0.914612 iron mountain incorporated iron mountain global holdings, incorporated [delaware] [delaware] IRN MNTN IRN MNTN KLBL HLTNKS\n", + "597 0.980607 extreme networks incorporated extreme networks ihc, incorporated [delaware] [delaware] EKSTRM NTWRKS EKSTRM NTWRKS IK\n", + "599 0.976947 q2 holdings, incorporated vr holdings, incorporated NaN [colorado] K HLTNKS FR HLTNKS\n", + "600 0.980607 extreme networks incorporated extreme networks, incorporated [delaware] [delaware] EKSTRM NTWRKS EKSTRM NTWRKS\n", + "604 0.914612 cutera incorporated vrec, incorporated [delaware] [delaware] KTR FRK\n", + "605 0.975104 assured guaranty limited assured guaranty services limited [d0] [england] ASRT KRNT ASRT KRNT SRFSS\n", + "606 0.976947 virtra, incorporated viator, incorporated [nevada] NaN FRTR FTR\n", + "618 0.975104 sculptor capital management, incorporated sculptor capital management hong kong limited [delaware] [hong, kong] SKLPTR KPTL MNJMNT SKLPTR KPTL MNJMNT HNK KNK\n", + "625 0.975104 enstar group limited enstar limited [d0] [bermuda] ENSTR KRP ENSTR\n", + "626 0.975104 sellas life sciences group, incorporated sellas life sciences group limited [delaware] [bermuda] SLS LF SSNSS KRP SLS LF SSNSS KRP\n", + "627 0.975104 intuitive surgical incorporated intuitive surgical canada incorporated [delaware] [canada] 
INTTF SRJKL INTTF SRJKL KNT\n", + "630 0.951657 forestar group incorporated forestar group incorporated [delaware] [delaware] FRSTR KRP FRSTR KRP\n", + "637 0.914612 dcp midstream, limited partnership dcp midstream operating, limited partnership [delaware] [delaware] TKP MTSTRM TKP MTSTRM OPRTNK\n", + "639 0.951657 equitable holdings, incorporated equitable holdings, incorporated [delaware] [delaware] EKTBL HLTNKS EKTBL HLTNKS\n", + "643 0.914612 energy transfer limited partnership energy transfer partners, limited liability co... [delaware] [delaware] ENRJ TRNSFR ENRJ TRNSFR PRTNRS" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df[preds_df.match_probability >= .9][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_list_l\", \"loc_list_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb2122d8-ff0a-4117-a91c-17a0523dcfcb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/17-kl-paragraph-layout-metrics.ipynb b/notebooks/17-kl-paragraph-layout-metrics.ipynb new file mode 100644 index 0000000..f7c3a8d --- /dev/null +++ b/notebooks/17-kl-paragraph-layout-metrics.ipynb @@ -0,0 +1,687 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "748b07d1-61ac-43b8-bff9-9f660626da1b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "id": "bb513a3e-31f7-49da-895b-e3ed4f52efd4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "\n", + "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "29c9b2e0-7f2f-4ab7-9972-f1ed30ff196a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "archive = GCSArchive()\n", + "md = archive.get_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1608bf1e-d6cf-4e3a-8f69-0e62744d0dfd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cikcompany_nameform_typedate_filedexhibit_21_versionyear_quarter
filename
edgar/data/17206/0000017206-94-000007.txt17206CAPITAL HOLDING CORP10-K/A1993-12-22None1993q4
edgar/data/29082/0000950131-94-000021.txt29082DISNEY WALT CO10-K1993-12-22211993q4
edgar/data/32377/0000032377-94-000001.txt32377ELIZABETHTOWN GAS CO10-K1993-12-13211993q4
edgar/data/353944/0000353944-94-000005.txt353944INTERNATIONAL GAME TECHNOLOGY10-K1993-12-23211993q4
edgar/data/60512/0000060512-94-000006.txt60512LOUISIANA LAND & EXPLORATION CO10-K/A1993-10-07None1993q4
.....................
edgar/data/932021/0001493152-23-046428.txt932021GLOBAL TECHNOLOGIES LTD10-K2023-12-2921.12023q4
edgar/data/933974/0001558370-23-019262.txt933974Azenta, Inc.10-K2023-11-2121.02023q4
edgar/data/935419/0001628280-23-041580.txt935419RCI HOSPITALITY HOLDINGS, INC.10-K2023-12-1421.12023q4
edgar/data/936395/0000936395-23-000044.txt936395CIENA CORP10-K2023-12-1521.12023q4
edgar/data/936528/0000936528-23-000207.txt936528WAFD INC10-K2023-11-17None2023q4
\n", + "

290379 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " cik \\\n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt 17206 \n", + "edgar/data/29082/0000950131-94-000021.txt 29082 \n", + "edgar/data/32377/0000032377-94-000001.txt 32377 \n", + "edgar/data/353944/0000353944-94-000005.txt 353944 \n", + "edgar/data/60512/0000060512-94-000006.txt 60512 \n", + "... ... \n", + "edgar/data/932021/0001493152-23-046428.txt 932021 \n", + "edgar/data/933974/0001558370-23-019262.txt 933974 \n", + "edgar/data/935419/0001628280-23-041580.txt 935419 \n", + "edgar/data/936395/0000936395-23-000044.txt 936395 \n", + "edgar/data/936528/0000936528-23-000207.txt 936528 \n", + "\n", + " company_name \\\n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt CAPITAL HOLDING CORP \n", + "edgar/data/29082/0000950131-94-000021.txt DISNEY WALT CO \n", + "edgar/data/32377/0000032377-94-000001.txt ELIZABETHTOWN GAS CO \n", + "edgar/data/353944/0000353944-94-000005.txt INTERNATIONAL GAME TECHNOLOGY \n", + "edgar/data/60512/0000060512-94-000006.txt LOUISIANA LAND & EXPLORATION CO \n", + "... ... \n", + "edgar/data/932021/0001493152-23-046428.txt GLOBAL TECHNOLOGIES LTD \n", + "edgar/data/933974/0001558370-23-019262.txt Azenta, Inc. \n", + "edgar/data/935419/0001628280-23-041580.txt RCI HOSPITALITY HOLDINGS, INC. \n", + "edgar/data/936395/0000936395-23-000044.txt CIENA CORP \n", + "edgar/data/936528/0000936528-23-000207.txt WAFD INC \n", + "\n", + " form_type date_filed \\\n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt 10-K/A 1993-12-22 \n", + "edgar/data/29082/0000950131-94-000021.txt 10-K 1993-12-22 \n", + "edgar/data/32377/0000032377-94-000001.txt 10-K 1993-12-13 \n", + "edgar/data/353944/0000353944-94-000005.txt 10-K 1993-12-23 \n", + "edgar/data/60512/0000060512-94-000006.txt 10-K/A 1993-10-07 \n", + "... ... ... 
\n", + "edgar/data/932021/0001493152-23-046428.txt 10-K 2023-12-29 \n", + "edgar/data/933974/0001558370-23-019262.txt 10-K 2023-11-21 \n", + "edgar/data/935419/0001628280-23-041580.txt 10-K 2023-12-14 \n", + "edgar/data/936395/0000936395-23-000044.txt 10-K 2023-12-15 \n", + "edgar/data/936528/0000936528-23-000207.txt 10-K 2023-11-17 \n", + "\n", + " exhibit_21_version year_quarter \n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt None 1993q4 \n", + "edgar/data/29082/0000950131-94-000021.txt 21 1993q4 \n", + "edgar/data/32377/0000032377-94-000001.txt 21 1993q4 \n", + "edgar/data/353944/0000353944-94-000005.txt 21 1993q4 \n", + "edgar/data/60512/0000060512-94-000006.txt None 1993q4 \n", + "... ... ... \n", + "edgar/data/932021/0001493152-23-046428.txt 21.1 2023q4 \n", + "edgar/data/933974/0001558370-23-019262.txt 21.0 2023q4 \n", + "edgar/data/935419/0001628280-23-041580.txt 21.1 2023q4 \n", + "edgar/data/936395/0000936395-23-000044.txt 21.1 2023q4 \n", + "edgar/data/936528/0000936528-23-000207.txt None 2023q4 \n", + "\n", + "[290379 rows x 6 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bb94754e-3765-43f2-a5e1-8b55a4021da4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame()\n", + "dir_name = Path(\"paragraph_layout_md\")\n", + "for filename in os.listdir(dir_name):\n", + " if filename.split(\".\")[-1] != \"parquet\":\n", + " continue\n", + " yq_df = pd.read_parquet(dir_name / filename)\n", + " df = pd.concat([df, yq_df])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "52828dfa-a951-4bc5-88a1-f8c2dca2628b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paragraph
1011174-0001193125-10-030674False
1010612-0000950123-10-019499False
1003410-0001193125-10-046549True
1011308-0000921895-10-000357True
1009672-0000950123-10-018301True
......
898293-0000950144-04-010550False
894490-0001193125-04-212822False
930803-0000950136-04-004585False
893430-0001193125-04-212647False
920354-0000950135-04-005647True
\n", + "

98712 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " paragraph\n", + "1011174-0001193125-10-030674 False\n", + "1010612-0000950123-10-019499 False\n", + "1003410-0001193125-10-046549 True\n", + "1011308-0000921895-10-000357 True\n", + "1009672-0000950123-10-018301 True\n", + "... ...\n", + "898293-0000950144-04-010550 False\n", + "894490-0001193125-04-212822 False\n", + "930803-0000950136-04-004585 False\n", + "893430-0001193125-04-212647 False\n", + "920354-0000950135-04-005647 True\n", + "\n", + "[98712 rows x 1 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "94b2ecbc-1e08-4b3a-835f-a10327f88298", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.loc[:, \"full_filename\"] = \"edgar/data/\" + df.index.str.replace('-', '/', n=1) + \".txt\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b9c56e81-3e98-44bf-8c70-256ce1d58d80", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "md[\"date_filed\"] = md[\"date_filed\"].astype(\"datetime64[ns]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d60efebc-72ff-41e8-b765-8edcadbe185e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paragraphfull_filename
1011174-0001193125-10-030674Falseedgar/data/1011174/0001193125-10-030674.txt
1010612-0000950123-10-019499Falseedgar/data/1010612/0000950123-10-019499.txt
\n", + "
" + ], + "text/plain": [ + " paragraph \\\n", + "1011174-0001193125-10-030674 False \n", + "1010612-0000950123-10-019499 False \n", + "\n", + " full_filename \n", + "1011174-0001193125-10-030674 edgar/data/1011174/0001193125-10-030674.txt \n", + "1010612-0000950123-10-019499 edgar/data/1010612/0000950123-10-019499.txt " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0f6d512f-b07a-4204-b3cf-69e08848ef2d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.27785882162249775" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what percentage of files are paragraph layout?\n", + "md_merged = md.reset_index().merge(df, left_on=\"filename\", right_on=\"full_filename\", how=\"left\", validate=\"1:1\")\n", + "md_merged = md_merged.dropna(subset=\"paragraph\")\n", + "len(md_merged[md_merged.paragraph])/len(md_merged)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "67e63df0-ca52-4eef-b6aa-a1715f1ab081", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecikcompany_nameform_typedate_filedexhibit_21_versionyear_quarterparagraphfull_filename
6edgar/data/100240/0000950144-94-000787.txt100240TURNER BROADCASTING SYSTEM INC10-K1994-03-31211994q1Falseedgar/data/100240/0000950144-94-000787.txt
11edgar/data/100885/0000100885-94-000006.txt100885UNION PACIFIC CORP10-K1994-03-29211994q1Falseedgar/data/100885/0000100885-94-000006.txt
\n", + "
" + ], + "text/plain": [ + " filename cik \\\n", + "6 edgar/data/100240/0000950144-94-000787.txt 100240 \n", + "11 edgar/data/100885/0000100885-94-000006.txt 100885 \n", + "\n", + " company_name form_type date_filed exhibit_21_version \\\n", + "6 TURNER BROADCASTING SYSTEM INC 10-K 1994-03-31 21 \n", + "11 UNION PACIFIC CORP 10-K 1994-03-29 21 \n", + "\n", + " year_quarter paragraph full_filename \n", + "6 1994q1 False edgar/data/100240/0000950144-94-000787.txt \n", + "11 1994q1 False edgar/data/100885/0000100885-94-000006.txt " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_merged.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1e11faef-853b-48f2-9eb0-af7f8715cd41", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.10292571287189956" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what percentage of CIKs are only covered by paragraph layout docs\n", + "# get the set of unique CIKs in md_merged\n", + "all_ciks = set(md_merged.cik)\n", + "# remove the paragraph layout docs\n", + "no_paragraph_ciks = set(md_merged[md_merged[\"paragraph\"] == False].cik)\n", + "# get the set of CIKs that are in the full set but not the paragraph removed set\n", + "only_paragraph_ciks = all_ciks - no_paragraph_ciks\n", + "# divide that number by the total number of CIKs\n", + "len(only_paragraph_ciks)/len(all_ciks)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6062d722-b1c7-4589-975e-7fe8cef65a40", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1664" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(only_paragraph_ciks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b1f6ab8-e3be-48c2-9ecb-346425af3777", + "metadata": {}, + 
"outputs": [], + "source": [ + "# what percentage of CIK and year-quarter coverage do we get if we exclude all paragraph filings" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb new file mode 100644 index 0000000..81f3513 --- /dev/null +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -0,0 +1,3799 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9029518c-ea19-4055-a938-36a5ea1804d8", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1107fe42-197c-4fea-9c48-06d08699af0b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix\n", + "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n", + "from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks\n", + "import splink.comparison_library as cl\n", + "import splink.comparison_level_library as cll\n", + "from splink.exploratory import completeness_chart, profile_columns\n", + "\n", + "from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n", + " BLOCKING_RULES,\n", + " MATCH_COLS,\n", + " SHARED_COLS,\n", + " address_comparison,\n", + " city_comparison,\n", + " company_name_comparison,\n", + " deterministic_blocking_rules,\n", + " 
state_comparison\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9b8224d4-7596-45b7-bfb5-028f29a96f3d", + "metadata": {}, + "source": [ + "# Inputs" + ] + }, + { + "cell_type": "markdown", + "id": "fb6b3f3f-8c30-4810-90dd-75cfbeecc4e0", + "metadata": {}, + "source": [ + "### EIA" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb", + "metadata": {}, + "outputs": [], + "source": [ + "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "755ab2a3-a32b-4ac1-81a5-0fb3a85dcdb3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20821" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(eia_df)" + ] + }, + { + "cell_type": "markdown", + "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec", + "metadata": {}, + "source": [ + "### SEC 10K Basic Info" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3f5f9e6c-0725-48e1-920f-3d516b4388a6", + "metadata": {}, + "outputs": [], + "source": [ + "sec_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/core_sec_10k__parents_and_subsidiaries\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "63d97f0d-df22-4c27-b3e7-1035166b4011", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "61026" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sec_df)" + ] + }, + { + "cell_type": "markdown", + "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db", + "metadata": {}, + "source": [ + "# Preprocess SEC and EIA\n", + "\n", + "Does it make more sense to do a direct match on company name after\n", + "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7d2d103a-2bbd-4974-b770-44626bdc5111", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_df[sec_df.files_10k][SHARED_COLS]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", + "metadata": {}, + "outputs": [], + "source": [ + "eia_match_df = eia_df[SHARED_COLS]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e754b2ef-5a0d-4582-8694-047528dfd339", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_match_df.record_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "38ad3504-2cde-455f-8896-6a435677541c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia_match_df.record_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "856c14d8-3250-4650-a2db-3808b4718f19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note that sec_company_id isn't unique here because we are keeping each unique company name and address pair\n", + "sec_df.sec_company_id.is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "b18fef7e-c316-4c90-b2bc-04706401135e", + "metadata": {}, + "source": [ + "There should probably be no duplicate records, but if there are any, keep the most recent version of each record."
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "842fa02e-5202-445c-b728-72bce42e740d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 20821\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia_match_df.duplicated(subset=MATCH_COLS).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b53e6244-f0ca-4256-bc09-9c3264675389", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 61026\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_match_df.duplicated(subset=MATCH_COLS).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")\n", + "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")" + ] + }, + { + "cell_type": "markdown", + "id": "46d967d4-3722-437d-b2f0-37cbac17624f", + "metadata": {}, + "source": [ + "# Link SEC and EIA" + ] + }, + { + "cell_type": "markdown", + "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f", + "metadata": {}, + "source": [ + "## Exploratory Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05", + "metadata": {}, + "outputs": [], + "source": [ + "db_api = DuckDBAPI()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "4bab1568-6a55-427c-9a78-e44db8b0584d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_chart(sec_match_df, db_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "6b9479e3-e836-4407-a2b6-926c185065a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_chart(eia_match_df, db_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(sec_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(eia_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "69f5fc54-f479-495c-86fc-48accda883d0", + "metadata": {}, + "source": [ + "## Blocking" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'number_of_comparisons_generated_pre_filter_conditions': 487944,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 487944,\n", + " 'filter_conditions_identified': '',\n", + " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", + " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# useful for experimenting with a new blocking rule\n", + "counts = count_comparisons_from_blocking_rule(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rule=BLOCKING_RULES[0],\n", + " link_type=\"link_only\",\n", + " unique_id_column_name='record_id',\n", + " db_api=db_api,\n", + ")\n", + "\n", + "counts" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
key_0count_lcount_rblock_count
0INTR4457633820
1AMRK8513832338
2FRST8163629376
\n", + "
" + ], + "text/plain": [ + " key_0 count_l count_r block_count\n", + "0 INTR 445 76 33820\n", + "1 AMRK 851 38 32338\n", + "2 FRST 816 36 29376" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = n_largest_blocks(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rule=BLOCKING_RULES[0],\n", + " link_type=\"link_only\",\n", + " db_api=db_api,\n", + " n_largest=3\n", + ")\n", + "\n", + "result.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "4e1a9844-5d98-4cac-a083-eef134f083ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rules=BLOCKING_RULES,\n", + " db_api=db_api,\n", + " unique_id_column_name='record_id',\n", + " link_type=\"link_only\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "377b0017-e46f-4d06-8cb5-af2b7725fc0e", + "metadata": {}, + "source": [ + "## Create Model" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'NameComparison' of \"company_name_no_legal\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name_no_legal is NULL' with SQL rule: \"company_name_no_legal_l\" IS NULL OR \"company_name_no_legal_r\" IS NULL\n", + " - 'Exact match on company_name_no_legal' with SQL rule: \"company_name_no_legal_l\" = \"company_name_no_legal_r\"\n", + " - 'Jaro-Winkler distance of company_name_no_legal >= 0.95' with SQL rule: jaro_winkler_similarity(\"company_name_no_legal_l\", \"company_name_no_legal_r\") >= 0.95\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "4298a288-c306-4d75-9d72-e5b8f87774ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'LevenshteinAtThresholds' of \"street_address\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'street_address is NULL' with SQL rule: \"street_address_l\" IS NULL OR \"street_address_r\" IS NULL\n", + " - 'Exact match on street_address' 
with SQL rule: \"street_address_l\" = \"street_address_r\"\n", + " - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "afdd5872-bc29-406f-bd0a-d5f4436f6794", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ExactMatch' of \"state\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'state is NULL' with SQL rule: \"state_l\" IS NULL OR \"state_r\" IS NULL\n", + " - 'Exact match on state' with SQL rule: \"state_l\" = \"state_r\"\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(state_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "90596d17-edb4-4ed1-9306-ea6c33ad00c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'NameComparison' of \"city\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'city is NULL' with SQL rule: \"city_l\" IS NULL OR \"city_r\" IS NULL\n", + " - 'Exact match on city' with SQL rule: \"city_l\" = \"city_r\"\n", + " - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity(\"city_l\", \"city_r\") >= 0.9\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407", + "metadata": {}, + "outputs": [], + "source": [ + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", 
+ " unique_id_column_name=\"record_id\",\n", + " comparisons=[\n", + " company_name_comparison,\n", + " address_comparison,\n", + " state_comparison,\n", + " city_comparison\n", + " ],\n", + " blocking_rules_to_generate_predictions=BLOCKING_RULES,\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "36cae876-783d-4bff-89df-9d30cc5e60d6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 2.37e-06.\n", + "This means that amongst all possible pairwise record comparisons, one in 421,176.28 are expected to match. With 1,270,622,346 total possible comparisons, we expect a total of around 3,016.84 matching pairs\n" + ] + } + ], + "source": [ + "linker.training.estimate_probability_two_random_records_match(deterministic_blocking_rules, recall=0.95)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c4bcd9c2605a413aab003a2484a4a006", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b15bb7a15e37447ba1366278db3ab2bd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - company_name_no_legal (no m values are trained).\n", + " - street_address (no m values are trained).\n", + " - state (no m values are trained).\n", + " - city (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e8)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"company_name\" = r.\"company_name\") AND (l.\"company_name\" = r.\"company_name\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - company_name_no_legal\n", + " - street_address\n", + " - state\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + "\n", + "WARNING:\n", + "Level Jaro-Winkler distance of company_name_no_legal >= 0.95 on comparison company_name_no_legal not observed in dataset, unable to train m value\n", + "\n", + "WARNING:\n", + "Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value\n", + "\n", + "Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`\n", + "Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match\n", + "Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match\n", + "Iteration 4: Largest change in params was 0.000535 in probability_two_random_records_match\n", + "Iteration 5: Largest change in params was 
1.09e-07 in probability_two_random_records_match\n", + "\n", + "EM converged after 5 iterations\n", + "m probability not trained for company_name_no_legal - Jaro-Winkler distance of company_name_no_legal >= 0.95 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n", + "m probability not trained for company_name_no_legal - All other comparisons (comparison vector value: 0). This usually means the comparison level was never observed in the training data.\n", + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - company_name_no_legal (some m values are not trained).\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"company_name\", \"company_name\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "9581aa18-3352-429a-86c4-6078bcf13a55", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"street_address\" = r.\"street_address\") AND (l.\"street_address\" = r.\"street_address\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - company_name_no_legal\n", + " - state\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - street_address\n", + "\n", + "Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`\n", + "Iteration 2: Largest change in params was 0.476 in probability_two_random_records_match\n", + "Iteration 3: Largest change in params was 0.0397 in probability_two_random_records_match\n", + 
"Iteration 4: Largest change in params was 0.0442 in the m_probability of city, level `All other comparisons`\n", + "Iteration 5: Largest change in params was 0.0194 in probability_two_random_records_match\n", + "Iteration 6: Largest change in params was 0.00729 in probability_two_random_records_match\n", + "Iteration 7: Largest change in params was 0.00274 in probability_two_random_records_match\n", + "Iteration 8: Largest change in params was 0.00104 in probability_two_random_records_match\n", + "Iteration 9: Largest change in params was 0.000398 in probability_two_random_records_match\n", + "Iteration 10: Largest change in params was 0.000153 in probability_two_random_records_match\n", + "Iteration 11: Largest change in params was 5.88e-05 in probability_two_random_records_match\n", + "\n", + "EM converged after 11 iterations\n", + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"street_address\", \"street_address\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "61298aa2-dbd4-4f2a-9c25-5f831d226d13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.m_u_parameters_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 420, + "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91", + "metadata": {}, + "outputs": [], + "source": [ + "# you could save the model weights like this\n", + "settings = linker.misc.save_model_to_json(\n", + " \"model_unsupervised_0.json\", overwrite=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b", + "metadata": {}, + "source": [ + "## Make Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Blocking time: 0.16 seconds\n", + "Predict time: 0.26 seconds\n" + ] + } + ], + "source": [ + "df_predictions = linker.inference.predict()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = df_predictions.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_key
295287-22.9679751.218850e-07__splink__input_table_0__splink__input_table_15623019078union pacificunion electric00.0000490.0000980.9860461.0000001416 dodge stmc 140000.0000490.0000490.8816581.000000nemo00.0064550.0101180.1990121.000000omahast louis00.0034480.0027640.2967141.000000UNN PSFKUNN ELKTRK0
384509-22.9679751.218850e-07__splink__input_table_0__splink__input_table_15648419138united states lime and mineralsunited water conservation00.0000370.0000240.9860461.0000005429 lbj fwy1701 north lombard st00.0000240.0000120.8816581.000000txca00.0798410.1579600.1990121.000000dallasoxnard00.0138550.0002570.2967141.000000UNTT STTS LM ANT MNRLSUNTT WTR KNSRFXN0
384504-22.9679751.218850e-07__splink__input_table_0__splink__input_table_15643619138united rentalsunited water conservation00.0000240.0000240.9860461.000000100 first stamford pl1701 north lombard st00.0001220.0000120.8816581.000000ctca00.0208760.1579600.1990121.000000stamfordoxnard00.0039500.0002570.2967141.000000UNTT RNTLSUNTT WTR KNSRFXN0
384503-22.9679751.218850e-07__splink__input_table_0__splink__input_table_15642419138united parcel serviceunited water conservation00.0000240.0000240.9860461.00000055 glenlake pkwy ne1701 north lombard st00.0000120.0000120.8816581.000000gaca00.0186260.1579600.1990121.000000atlantaoxnard00.0084620.0002570.2967141.000000UNTT PRSL SRFSUNTT WTR KNSRFXN0
384502-22.9679751.218850e-07__splink__input_table_0__splink__input_table_15631219138united bancorp /oh/united water conservation00.0000240.0000240.9860461.000000201 south fourth st1701 north lombard st00.0000120.0000120.8816581.000000ohca00.0169910.1579600.1990121.000000martins ferryoxnard00.0000240.0002570.2967141.000000UNTT BNKRPUNTT WTR KNSRFXN0
..................................................................................................................
16381527.5196061.000000e+00__splink__input_table_0__splink__input_table_13981613109northwestern public servicenorthwestern public service20.0000730.000073415263.1332690.01661633 third st se33 third st se20.0000370.0000379605.7816940.311992sdsd10.0019300.00193015.44555927.217182huronhuron20.0000730.000073102.01412391.382644NR0WSTRN PBLK SRFSNR0WSTRN PBLK SRFS0
24159327.5265141.000000e+00__splink__input_table_0__splink__input_table_1246508047green mountain powergreen mountain power20.0000370.000037415263.1332690.033231163 acorn ln163 acorn ln20.0000370.0000379605.7816940.311992vtvt10.0015370.00153715.44555934.184780colchestercolchester20.0001830.000183102.01412336.553058KRN MNTN PWRKRN MNTN PWR0
16548727.7573381.000000e+00__splink__input_table_0__splink__input_table_15884219906wausau paper millswausau paper mills20.0000240.000024415263.1332690.049847one clarks isone clarks is20.0000240.0000249605.7816940.467987wiwi10.0088400.00884015.4455595.943112wausauwausau20.0000610.000061102.014123109.659173WS PPR MLSWS PPR MLS0
34041427.8843651.000000e+00__splink__input_table_0__splink__input_table_15156717450st joseph light and powerst joseph light and power20.0000240.000024415263.1332690.049847520 francis st520 francis st20.0000240.0000249605.7816940.467987momo10.0101180.01011815.4455595.192099st josephst joseph20.0000490.000049102.014123137.073967ST JSF LT ANT PWRST JSF LT ANT PWR0
27476029.2110121.000000e+00__splink__input_table_0__splink__input_table_1205886741fibermarkfibermark20.0000370.000037415263.1332690.033231161 wellington rd161 wellington rd20.0000240.0000249605.7816940.467987vtvt10.0015370.00153715.44555934.184780brattleborobrattleboro20.0000860.000086102.01412378.327981FBRMRKFBRMRK0
\n", + "

590575 rows × 37 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", + "295287 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56230 19078 union pacific union electric 0 0.000049 0.000098 0.986046 1.000000 1416 dodge st mc 1400 0 0.000049 0.000049 0.881658 1.000000 ne mo 0 0.006455 0.010118 0.199012 1.000000 omaha st louis 0 0.003448 0.002764 0.296714 1.000000 UNN PSFK UNN ELKTRK 0\n", + "384509 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56484 19138 united states lime and minerals united water conservation 0 0.000037 0.000024 0.986046 1.000000 5429 lbj fwy 1701 north lombard st 0 0.000024 0.000012 0.881658 1.000000 tx ca 0 0.079841 0.157960 0.199012 1.000000 dallas oxnard 0 0.013855 0.000257 0.296714 1.000000 UNTT STTS LM ANT MNRLS UNTT WTR KNSRFXN 0\n", + "384504 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56436 19138 united rentals united water conservation 0 0.000024 0.000024 0.986046 1.000000 100 first stamford pl 1701 north lombard st 0 0.000122 0.000012 0.881658 1.000000 ct ca 0 0.020876 0.157960 0.199012 1.000000 stamford oxnard 0 0.003950 0.000257 0.296714 1.000000 UNTT RNTLS UNTT WTR KNSRFXN 0\n", + "384503 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56424 19138 united parcel service united water conservation 0 0.000024 0.000024 0.986046 1.000000 55 glenlake pkwy ne 1701 north lombard 
st 0 0.000012 0.000012 0.881658 1.000000 ga ca 0 0.018626 0.157960 0.199012 1.000000 atlanta oxnard 0 0.008462 0.000257 0.296714 1.000000 UNTT PRSL SRFS UNTT WTR KNSRFXN 0\n", + "384502 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56312 19138 united bancorp /oh/ united water conservation 0 0.000024 0.000024 0.986046 1.000000 201 south fourth st 1701 north lombard st 0 0.000012 0.000012 0.881658 1.000000 oh ca 0 0.016991 0.157960 0.199012 1.000000 martins ferry oxnard 0 0.000024 0.000257 0.296714 1.000000 UNTT BNKRP UNTT WTR KNSRFXN 0\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "163815 27.519606 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n", + "241593 27.526514 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0\n", + "165487 27.757338 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0\n", + "340414 27.884365 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph 
light and power st joseph light and power 2 0.000024 0.000024 415263.133269 0.049847 520 francis st 520 francis st 2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n", + "274760 29.211012 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0\n", + "\n", + "[590575 rows x 37 columns]" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.sort_values(by=\"match_probability\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "c0b292c8-26ed-407a-866e-75851577d567", + "metadata": {}, + "outputs": [], + "source": [ + "# join on utility_id_eia and CIK\n", + "preds_validation_df = preds_df.merge(sec_df[[\"record_id\", \"sec_company_id\", \"central_index_key\", \"company_name_raw\"]],\n", + " how=\"left\",\n", + " left_on=\"record_id_l\",\n", + " right_on=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0", + "metadata": {}, + "outputs": [], + "source": [ + "preds_validation_df = preds_validation_df.merge(eia_df[[\"record_id\", \"utility_id_eia\"]],\n", + " how=\"left\",\n", + " left_on=\"record_id_r\",\n", + " right_on=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "5103190c-3775-427f-a8f2-cc8a8f79892b", + "metadata": {}, + "outputs": [], + "source": [ + "preds_validation_df = preds_validation_df.sort_values(\n", + " by=[\"sec_company_id\", \"utility_id_eia\", \"match_probability\"], ascending=False\n", + ").drop_duplicates(subset=[\"sec_company_id\", 
\"utility_id_eia\"], keep=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_keyrecord_id_xsec_company_idcentral_index_keycompany_name_rawrecord_id_yutility_id_eia
2187973.8245780.934072__splink__input_table_0__splink__input_table_1146926293craneentergy nuclear power marketing00.0000120.0000120.9860461.0100 first stamford pl100 first stamford pl20.0001220.0001229605.7816940.093597ctct10.0208760.02087615.4455592.516547stamfordstamford20.0039500.003950102.0141231.697510KRNENTRJ NKLR PWR MRKTNK11469200019440130001944013crane co629355243
2200364.6199870.960922__splink__input_table_0__splink__input_table_1177525535dte electric securitization funding idte sustainable generation00.0000120.0000120.9860461.0one energy plzone energy plz20.0003300.0003309605.7816940.034666mimi10.0151470.01514715.4455593.468423detroitdetroit20.0011620.001162102.0141235.771535TT ELKTRK SKRTSXN FNTNK ITT SSTNBL JNRXN11775200018760680001876068dte electric securitization funding i llc553564331
3581524.6199870.960922__splink__input_table_0__splink__input_table_1177525522dte electric securitization funding idte electric00.0000120.0000370.9860461.0one energy plzone energy plz20.0003300.0003309605.7816940.034666mimi10.0151470.01514715.4455593.468423detroitdetroit20.0011620.001162102.0141235.771535TT ELKTRK SKRTSXN FNTNK ITT ELKTRK01775200018760680001876068dte electric securitization funding i llc55225109
\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "218797 3.824578 0.934072 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986046 1.0 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 9605.781694 0.093597 ct ct 1 0.020876 0.020876 15.445559 2.516547 stamford stamford 2 0.003950 0.003950 102.014123 1.697510 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n", + "220036 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i dte sustainable generation 0 0.000012 0.000012 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 0001876068 dte electric securitization funding i llc 5535 64331\n", + "358152 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK 
SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_validation_df[preds_validation_df.match_probability > .9].head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "07fbec17-cef2-4b9c-a005-1623c65c5e20", + "metadata": {}, + "source": [ + "Figure out what to do about this validation CSV; maybe it should be part of package data? It's not a very big sample size and it's imperfect, so the metrics gained from it should be taken with a grain of salt." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "11190456-12a9-49df-b863-7a6f674e39eb", + "metadata": {}, + "outputs": [], + "source": [ + "validation_df = pd.read_csv(\"sec_eia_validation_set.csv\", dtype={\"central_index_key\": str})" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81", + "metadata": {}, + "outputs": [], + "source": [ + "validation_df[\"central_index_key\"] = validation_df[\"central_index_key\"].str.zfill(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = validation_df.merge(\n", + " preds_validation_df[[\"record_id_l\", \"record_id_r\", \"central_index_key\", \"utility_id_eia\", \"match_probability\", \"gamma_company_name_no_legal\"]].drop_duplicates(keep=\"first\"),\n", + " how=\"left\",\n", + " on=[\"central_index_key\", \"utility_id_eia\"],\n", + " indicator=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "4d45f339-7a5b-466a-81f5-c71e425a77df", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df[\"predicted_match\"] = merged_df[\"_merge\"].map({\"both\": 1, \"left_only\": 0})" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": 
"182732ea-39c5-4fb4-9429-5f7aed781ef5", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df[\"predicted_match\"] = merged_df[\"predicted_match\"].where(\n", + " (merged_df.match_probability > .95),\n", + " 0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
central_index_keyutility_id_eiasec_company_nameeia_company_namematchrecord_id_lrecord_id_rmatch_probabilitygamma_company_name_no_legal_mergepredicted_match
00000003153195alabama power coNaN11701.0478.01.0000002.0both1.0
1000186894158702fluence energy, inc.Fluence021792.06889.00.0165290.0both0.0
200000410917140georgia power coNaN123416.07653.00.9999972.0both1.0
300000221984062columbus southern power co /oh/Columbus Southern Power Co113310.04281.00.9999821.0both1.0
400013261605416duke energy corpNaN117793.05564.00.9272932.0both0.0
5000003037154905duke energy carolinas, llcDuke Energy Carolinas LLC117790.05558.00.9999872.0both1.0
6000086944657140berkshire realty co inc /deBerkshire Wind Power Cooperative Corp07449.01712.00.0019120.0both0.0
7000009212218195southern cosouthern co services inc050964.017068.00.0072160.0both0.0
8000009212217650southern coSouthern Power Co050963.017089.00.0342320.0both0.0
9000007548814328pacific gas & electric coNaN141598.013933.00.9999482.0both1.0
1000010312966526firstenergy corpFirstEnergy021579.06776.00.9999982.0both1.0
11000103129654776firstenergy corpFirstEnergy Nuclear Generation Corp021579.06780.00.9865420.0both1.0
1200010312966458firstenergy corpFirst Energy Services021579.06763.00.0854660.0both0.0
13000103129632208firstenergy corpFirst Energy Corp1NaNNaNNaNNaNleft_only0.0
14000010012224211tucson electric power coNaN155725.018901.01.0000002.0both1.0
15000009627118454tampa electric coNaN153604.018180.00.9910592.0both1.0
1600007159575248dominion energy, incNaN117484.05386.00.9999852.0both1.0
17000101387159883nrg energy, incNRG Energy Gas & Wind Holdings Inc040084.013240.00.3001650.0both0.0
18000101387113377nrg energy incNRG Energy Inc140084.013243.00.9998132.0both1.0
19000078881613994oglethorpe power corpNaN140576.013515.01.0000002.0both1.0
2000000186753266central maine power coNaN110876.03424.01.0000002.0both1.0
21000103220861296sempra energySempra Generation149303.016270.00.5590710.0both0.0
220000004904488american electric power co incAmerican Electric Power Inc12927.0793.00.9960762.0both1.0
2300007159575248dominion energy, incDominion Energy Inc.117484.05386.00.9999852.0both1.0
\n", + "
" + ], + "text/plain": [ + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", + "0 0000003153 195 alabama power co NaN 1 1701.0 478.0 1.000000 2.0 both 1.0\n", + "1 0001868941 58702 fluence energy, inc. Fluence 0 21792.0 6889.0 0.016529 0.0 both 0.0\n", + "2 0000041091 7140 georgia power co NaN 1 23416.0 7653.0 0.999997 2.0 both 1.0\n", + "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999982 1.0 both 1.0\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927293 2.0 both 0.0\n", + "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 17790.0 5558.0 0.999987 2.0 both 1.0\n", + "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 7449.0 1712.0 0.001912 0.0 both 0.0\n", + "7 0000092122 18195 southern co southern co services inc 0 50964.0 17068.0 0.007216 0.0 both 0.0\n", + "8 0000092122 17650 southern co Southern Power Co 0 50963.0 17089.0 0.034232 0.0 both 0.0\n", + "9 0000075488 14328 pacific gas & electric co NaN 1 41598.0 13933.0 0.999948 2.0 both 1.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n", + "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085466 0.0 both 0.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", + "14 0000100122 24211 tucson electric power co NaN 1 55725.0 18901.0 1.000000 2.0 both 1.0\n", + "15 0000096271 18454 tampa electric co NaN 1 53604.0 18180.0 0.991059 2.0 both 1.0\n", + "16 0000715957 5248 dominion energy, inc NaN 1 17484.0 5386.0 0.999985 2.0 both 1.0\n", + "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 40084.0 13240.0 0.300165 
0.0 both 0.0\n", + "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999813 2.0 both 1.0\n", + "19 0000788816 13994 oglethorpe power corp NaN 1 40576.0 13515.0 1.000000 2.0 both 1.0\n", + "20 0000018675 3266 central maine power co NaN 1 10876.0 3424.0 1.000000 2.0 both 1.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0\n", + "22 0000004904 488 american electric power co inc American Electric Power Inc 1 2927.0 793.0 0.996076 2.0 both 1.0\n", + "23 0000715957 5248 dominion energy, inc Dominion Energy Inc. 1 17484.0 5386.0 0.999985 2.0 both 1.0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.head(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea", + "metadata": {}, + "outputs": [], + "source": [ + "precision = precision_score(merged_df['match'], merged_df['predicted_match'])\n", + "recall = recall_score(merged_df['match'], merged_df['predicted_match'])\n", + "accuracy = accuracy_score(merged_df['match'], merged_df['predicted_match'])\n", + "# roc_auc = roc_auc_score(merged_df['match'], merged_df['match_probability'])\n", + "\n", + "# Confusion matrix\n", + "conf_matrix = confusion_matrix(merged_df['match'], merged_df['predicted_match'])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precision, recall, accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "08932be5-b90c-440d-9efb-156cb4d63c93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Predicted NegativePredicted Positive
Negative62
Positive313
\n", + "
" + ], + "text/plain": [ + " Predicted Negative Predicted Positive\n", + "Negative 6 2\n", + "Positive 3 13" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(\n", + " conf_matrix,\n", + " index=[\"Negative\", \"Positive\"],\n", + " columns=[\"Predicted Negative\", \"Predicted Positive\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "025c80e9-5055-4eaa-a873-38b910cd7f94", + "metadata": {}, + "outputs": [], + "source": [ + "incorrect_df = merged_df[merged_df.match != merged_df.predicted_match]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
central_index_keyutility_id_eiasec_company_nameeia_company_namematchrecord_id_lrecord_id_rmatch_probabilitygamma_company_name_no_legal_mergepredicted_match
400013261605416duke energy corpNaN117793.05564.00.9272932.0both0.0
1000010312966526firstenergy corpFirstEnergy021579.06776.00.9999982.0both1.0
11000103129654776firstenergy corpFirstEnergy Nuclear Generation Corp021579.06780.00.9865420.0both1.0
13000103129632208firstenergy corpFirst Energy Corp1NaNNaNNaNNaNleft_only0.0
21000103220861296sempra energySempra Generation149303.016270.00.5590710.0both0.0
\n", + "
" + ], + "text/plain": [ + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927293 2.0 both 0.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "incorrect_df" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "c425a676-aa6e-4d8f-b814-931da392c2ff", + "metadata": {}, + "outputs": [], + "source": [ + "recs_to_view = []\n", + "for idx, rec in incorrect_df.iterrows():\n", + " full_rec = preds_validation_df[\n", + " (preds_validation_df.record_id_l == rec.record_id_l) & \n", + " (preds_validation_df.record_id_r == rec.record_id_r)\n", + " ].squeeze()\n", + " if full_rec.empty:\n", + " continue\n", + " recs_to_view.append(full_rec.to_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "ff55f2cb-7ce1-4697-99e7-bf22918f7ed1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.waterfall_chart(recs_to_view, filter_nulls=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a2ba43b6-a664-462a-823f-e3f08585bb51", + "metadata": {}, + "source": [ + "# Save good predictions\n", + "Make the predictions one to one. First, keep the highest probability EIA utility ID for each SEC company. Then, keep the highest probability SEC company for each EIA utility" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "92172e2f-39ba-49e3-8312-98597256ca4f", + "metadata": {}, + "outputs": [], + "source": [ + "one_to_one_preds = preds_validation_df[preds_validation_df.match_probability >= .95].sort_values(\n", + " by=\"match_probability\", ascending=False\n", + ").drop_duplicates(\n", + " subset=\"sec_company_id\", keep=\"first\"\n", + ").drop_duplicates(\n", + " subset=\"utility_id_eia\", keep=\"first\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "07ca81ae-1b26-4cd3-ade6-75381028028a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "534" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(one_to_one_preds)" + ] + }, + { + "cell_type": "markdown", + "id": "c3db3175-7cf3-497c-8f22-e68a6c9c6af2", + "metadata": {}, + "source": [ + "# Add `utility_id_eia` onto the SEC table to create output table" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "361b3e30-e823-4137-9062-6a00eae537fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_keyrecord_id_xsec_company_idcentral_index_keycompany_name_rawrecord_id_yutility_id_eia
27476029.2110121.000000__splink__input_table_0__splink__input_table_1205886741fibermarkfibermark20.0000370.000037415263.1332690.033231161 wellington rd161 wellington rd20.0000240.0000249605.7816940.467987vtvt10.0015370.00153715.44555934.184780brattleborobrattleboro20.0000860.000086102.01412378.327981FBRMRKFBRMRK02058800008875910000887591fibermark inc67416309
34041427.8843651.000000__splink__input_table_0__splink__input_table_15156717450st joseph light and powerst joseph light and power20.0000240.000024415263.1332690.049847520 francis st520 francis st20.0000240.0000249605.7816940.467987momo10.0101180.01011815.4455595.192099st josephst joseph20.0000490.000049102.014123137.073967ST JSF LT ANT PWRST JSF LT ANT PWR05156700000862510000086251st joseph light & power co1745017881
16548727.7573381.000000__splink__input_table_0__splink__input_table_15884219906wausau paper millswausau paper mills20.0000240.000024415263.1332690.049847one clarks isone clarks is20.0000240.0000249605.7816940.467987wiwi10.0088400.00884015.4455595.943112wausauwausau20.0000610.000061102.014123109.659173WS PPR MLSWS PPR MLS05884200001050760000105076wausau paper mills co1990620190
24159327.5265141.000000__splink__input_table_0__splink__input_table_1246508047green mountain powergreen mountain power20.0000370.000037415263.1332690.033231163 acorn ln163 acorn ln20.0000370.0000379605.7816940.311992vtvt10.0015370.00153715.44555934.184780colchestercolchester20.0001830.000183102.01412336.553058KRN MNTN PWRKRN MNTN PWR02465000000437040000043704green mountain power corp80477601
16381527.5196061.000000__splink__input_table_0__splink__input_table_13981613109northwestern public servicenorthwestern public service20.0000730.000073415263.1332690.01661633 third st se33 third st se20.0000370.0000379605.7816940.311992sdsd10.0019300.00193015.44555927.217182huronhuron20.0000730.000073102.01412391.382644NR0WSTRN PBLK SRFSNR0WSTRN PBLK SRFS03981600000730880000073088northwestern public service co1310913809
....................................................................................................................................
14834.3371210.952856__splink__input_table_0__splink__input_table_15800417611vistacarestirling energy systems solar three00.0000240.0000370.9860461.0000004800 n scottsdale rd4800 n scottsdale rd20.0001100.0001109605.7816940.103997azaz10.0128720.01287215.4455594.081277scottsdalescottsdale20.0049890.004989102.0141231.343862FSTKRSTRLNK ENRJ SSTMS SLR 0R15800400007870300000787030vistacare, inc.1761156168
2184534.2721570.950792__splink__input_table_0__splink__input_table_1191747605enovisgenon sabine delaware00.0000120.0000120.9860461.0000002711 centerville rd2711 centerville rd20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640ENFSJNN SBN TLWR11917400014208000001420800enovis corp760556922
10554.2721570.950792__splink__input_table_0__splink__input_table_1165016368aisystemsshannon wind00.0000240.0000240.9860461.0000002711 centerville rd2711 centerville rd20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640ASSTMSXNN WNT1165000013287690001328769aisystems, inc.1636858872
72164.2721570.950792__splink__input_table_0__splink__input_table_13240314089lease investment flight trustpasadena statutory trust00.0000120.0000120.9860461.0000001100 north market st1100 north market st20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640LS INFSTMNT FLT TRSTPSTN STTTR TRST13240300011583890001158389lease investment flight trust1408961235
61134.2721570.950792__splink__input_table_0__splink__input_table_1162616195airplanes us trustse solar trust v c00.0000120.0000120.9860461.0000001100 north market st1100 north market st20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640ARPLNS US TRSTS SLR TRST F K1162600010045400001004540airplanes us trust1619556900
\n", + "

534 rows × 43 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "274760 29.211012 1.000000 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0 20588 0000887591 0000887591 fibermark inc 6741 6309\n", + "340414 27.884365 1.000000 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 415263.133269 0.049847 520 francis st 520 francis st 2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0 51567 0000086251 0000086251 st joseph light & power co 17450 17881\n", + "165487 27.757338 1.000000 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0 58842 0000105076 
0000105076 wausau paper mills co 19906 20190\n", + "241593 27.526514 1.000000 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0 24650 0000043704 0000043704 green mountain power corp 8047 7601\n", + "163815 27.519606 1.000000 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0 39816 0000073088 0000073088 northwestern public service co 13109 13809\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "1483 4.337121 0.952856 __splink__input_table_0 __splink__input_table_1 58004 17611 vistacare stirling energy systems solar three 0 0.000024 0.000037 0.986046 1.000000 4800 n scottsdale rd 4800 n scottsdale rd 2 0.000110 0.000110 9605.781694 0.103997 az az 1 0.012872 0.012872 15.445559 4.081277 scottsdale scottsdale 2 0.004989 0.004989 102.014123 1.343862 FSTKR STRLNK ENRJ SSTMS SLR 0R 1 58004 0000787030 0000787030 vistacare, inc. 
17611 56168\n", + "218453 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 19174 7605 enovis genon sabine delaware 0 0.000012 0.000012 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ENFS JNN SBN TLWR 1 19174 0001420800 0001420800 enovis corp 7605 56922\n", + "1055 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1650 16368 aisystems shannon wind 0 0.000024 0.000024 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ASSTMS XNN WNT 1 1650 0001328769 0001328769 aisystems, inc. 16368 58872\n", + "7216 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 32403 14089 lease investment flight trust pasadena statutory trust 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 LS INFSTMNT FLT TRST PSTN STTTR TRST 1 32403 0001158389 0001158389 lease investment flight trust 14089 61235\n", + "6113 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1626 16195 airplanes us trust se solar trust v c 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ARPLNS US TRST S SLR TRST F K 1 1626 0001004540 0001004540 airplanes us trust 16195 56900\n", + "\n", + "[534 rows x 43 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_to_one_preds" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + 
"id": "1d3e41bd-f92a-4f77-a0a7-0bd24f7ea70c", + "metadata": {}, + "outputs": [], + "source": [ + "out_df = sec_df.merge(\n", + " one_to_one_preds[[\"sec_company_id\", \"utility_id_eia\"]],\n", + " how=\"left\",\n", + " on=\"sec_company_id\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "cce2b383-48b3-4efd-977a-0c734b0e3ec2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "utility_id_eia\n", + "True 59895\n", + "False 1131\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_df.utility_id_eia.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cf0be2e-b1ef-4eb1-a07a-28e977c40252", + "metadata": {}, + "outputs": [], + "source": [ + "len(one_to_one_preds" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb new file mode 100644 index 0000000..061a227 --- /dev/null +++ b/notebooks/20-kl-validate-sec-output-table.ipynb @@ -0,0 +1,1456 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d383d1dd-6cdc-45ea-a371-105046c009e2", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3c58ad67-151d-4054-a972-a1e7ee12949f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from upath import UPath" + ] + }, + { + "cell_type": 
"markdown", + "id": "511b2c77-ebd2-43b0-8e45-1d1c76fb321d", + "metadata": {}, + "source": [ + "### EIA" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4907820f-2552-4a3b-866a-30c3181af91b", + "metadata": {}, + "outputs": [], + "source": [ + "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "5f488f86-4b34-4a94-985f-588f991ba86b", + "metadata": {}, + "source": [ + "### Ex. 21" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c1795acc-8005-4b6d-be4d-27c722b634f1", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/transformed_ex21_subsidiary_table\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "291ce873-4971-4e03-985a-65dbdd8b0850", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sec_company_idcompany_name_rawlocation_of_incown_perfilenamereport_datereport_yearcompany_namecompany_name_no_legalcompany_name_mphoneparent_company_cik
00000000020_colormax limited_united kingdomcolormax limitedunited kingdomNaNedgar/data/20/0000893220-06-000650.txt2006-03-232006colormax limitedcolormaxKLRMKS0000000020
10000000020_gundlach equipment corporation_dela...gundlach equipment corporationdelawareNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010gundlach equipment corporationgundlach equipmentKNTLX EKPMNT0000000020
20000000020_jeffrey rader ab_swedenjeffrey rader abswedenNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010jeffrey rader abjeffrey rader abJFR RTR AB0000000020
30000000020_jeffrey rader canada company_canadajeffrey rader canada companycanadaNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010jeffrey rader canada companyjeffrey rader canadaJFR RTR KNT0000000020
40000000020_jeffrey rader corporation_delawarejeffrey rader corporationdelawareNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010jeffrey rader corporationjeffrey raderJFR RTR0000000020
....................................
10559820001967649_vestis supply chain limited liabili...vestis (supply chain), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis supply chain limited liability companyvestis supply chainFSTS SPL XN0001967649
10559830001967649_vestis syracuse limited liability c...vestis (syracuse), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis syracuse limited liability companyvestis syracuseFSTS SRKS0001967649
10559840001967649_vestis texas limited liability comp...vestis (texas), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis texas limited liability companyvestis texasFSTS TKSS0001967649
10559850001967649_vestis west adams limited liability...vestis (west adams), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis west adams limited liability companyvestis west adamsFSTS WST ATMS0001967649
10559860001978811_gouverneur savings and loan associa...gouverneur savings and loan associationnew york100.0edgar/data/1978811/0001558370-23-020009.txt2023-12-262023gouverneur savings and loan associationgouverneur savings and loanKFRNR SFNKS ANT LN0001978811
\n", + "

1055987 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " sec_company_id \\\n", + "0 0000000020_colormax limited_united kingdom \n", + "1 0000000020_gundlach equipment corporation_dela... \n", + "2 0000000020_jeffrey rader ab_sweden \n", + "3 0000000020_jeffrey rader canada company_canada \n", + "4 0000000020_jeffrey rader corporation_delaware \n", + "... ... \n", + "1055982 0001967649_vestis supply chain limited liabili... \n", + "1055983 0001967649_vestis syracuse limited liability c... \n", + "1055984 0001967649_vestis texas limited liability comp... \n", + "1055985 0001967649_vestis west adams limited liability... \n", + "1055986 0001978811_gouverneur savings and loan associa... \n", + "\n", + " company_name_raw location_of_inc own_per \\\n", + "0 colormax limited united kingdom NaN \n", + "1 gundlach equipment corporation delaware NaN \n", + "2 jeffrey rader ab sweden NaN \n", + "3 jeffrey rader canada company canada NaN \n", + "4 jeffrey rader corporation delaware NaN \n", + "... ... ... ... \n", + "1055982 vestis (supply chain), llc delaware NaN \n", + "1055983 vestis (syracuse), llc delaware NaN \n", + "1055984 vestis (texas), llc delaware NaN \n", + "1055985 vestis (west adams), llc delaware NaN \n", + "1055986 gouverneur savings and loan association new york 100.0 \n", + "\n", + " filename report_date report_year \\\n", + "0 edgar/data/20/0000893220-06-000650.txt 2006-03-23 2006 \n", + "1 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "2 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "3 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "4 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "... ... ... ... 
\n", + "1055982 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055983 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055984 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055985 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055986 edgar/data/1978811/0001558370-23-020009.txt 2023-12-26 2023 \n", + "\n", + " company_name \\\n", + "0 colormax limited \n", + "1 gundlach equipment corporation \n", + "2 jeffrey rader ab \n", + "3 jeffrey rader canada company \n", + "4 jeffrey rader corporation \n", + "... ... \n", + "1055982 vestis supply chain limited liability company \n", + "1055983 vestis syracuse limited liability company \n", + "1055984 vestis texas limited liability company \n", + "1055985 vestis west adams limited liability company \n", + "1055986 gouverneur savings and loan association \n", + "\n", + " company_name_no_legal company_name_mphone parent_company_cik \n", + "0 colormax KLRMKS 0000000020 \n", + "1 gundlach equipment KNTLX EKPMNT 0000000020 \n", + "2 jeffrey rader ab JFR RTR AB 0000000020 \n", + "3 jeffrey rader canada JFR RTR KNT 0000000020 \n", + "4 jeffrey rader JFR RTR 0000000020 \n", + "... ... ... ... 
\n", + "1055982 vestis supply chain FSTS SPL XN 0001967649 \n", + "1055983 vestis syracuse FSTS SRKS 0001967649 \n", + "1055984 vestis texas FSTS TKSS 0001967649 \n", + "1055985 vestis west adams FSTS WST ATMS 0001967649 \n", + "1055986 gouverneur savings and loan KFRNR SFNKS ANT LN 0001978811 \n", + "\n", + "[1055987 rows x 11 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "304d929b-ce6c-4508-b511-475f287a6b37", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = ex21_df.merge(\n", + " eia_df.drop_duplicates(subset=\"company_name\")[[\"company_name\", \"utility_id_eia\"]], how=\"left\", on=\"company_name\", suffixes=(\"_ex21\", \"_eia\")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d315f8d5-7166-4161-bc4e-79c45ed3ad59", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1055987, 20821)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(ex21_df), len(eia_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3aae6d2c-a941-478e-8178-84cf1321e0b3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "utility_id_eia\n", + "True 1050887\n", + "False 5100\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.utility_id_eia.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6aba0ae8-a8ee-47ef-8eb9-a0ef9f283b51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1675" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(merged_df.utility_id_eia.unique())" + ] + }, + { + "cell_type": "markdown", + "id": 
"8d178634-b494-4769-93e3-c0213e4a0326", + "metadata": {}, + "source": [ + "### Read in SEC output table" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "25e8183d-3248-440c-aa4e-e7ee7db4c487", + "metadata": {}, + "outputs": [], + "source": [ + "# review outputs from Dagster\n", + "sec_out_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3881bfbd-cdc3-4f9c-92af-9e74d7758e51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sec_company_idfilenamebusiness_phonecentral_index_keycitycompany_namedate_of_name_changefilm_numberfiscal_year_endform_type...street_1street_2zipreport_datereport_yearlocation_of_inccompany_name_cleanparent_company_cikown_perfiles_10k
00000001800edgar/data/1800/0001628280-23-004026.txt22466761000000001800abbott parkabbott laboratoriesNone23642562123110-k...100 abbott park roadNone60064-35002023-02-172023illinoisabbott laboratoriesNoneNoneTrue
10000001800_3a nutrition (vietnam) company limi...edgar/data/1800/0001628280-23-004026.txtNoneNoneNone3a nutrition (vietnam) company limitedNoneNoneNoneNone...NoneNoneNone2023-02-172023viet nam3a nutrition vietnam company limited0000001800NoneFalse
20000001800_abbott (jiaxing) nutrition co., ltd...edgar/data/1800/0001628280-23-004026.txtNoneNoneNoneabbott (jiaxing) nutrition co., ltdNoneNoneNoneNone...NoneNoneNone2023-02-172023chinaabbott jiaxing nutrition co limited0000001800NoneFalse
30000001800_abbott (shanghai) diagnostics sales...edgar/data/1800/0001628280-23-004026.txtNoneNoneNoneabbott (shanghai) diagnostics sales co., ltdNoneNoneNoneNone...NoneNoneNone2023-02-172023chinaabbott shanghai diagnostics sales co limited0000001800NoneFalse
40000001800_abbott (uk) finance limited_united ...edgar/data/1800/0001628280-23-004026.txtNoneNoneNoneabbott (uk) finance limitedNoneNoneNoneNone...NoneNoneNone2023-02-172023united kingdomabbott uk finance limited0000001800NoneFalse
..................................................................
1713580001951118edgar/data/1951118/0001853620-23-000117.txt(248) 991-67000001951118farmington hillsmercedes-benz auto receivables trust 2022-1None23764946123110-k...35555 w. twelve mile rd.suite 100483312023-03-272023delawaremercedes benz auto receivables trust 2022 1NoneNoneTrue
1713590001951752edgar/data/1951752/0001951752-23-000016.txt31359434950001951752dearbornford credit auto owner trust 2022-dNone23751556123110-k...c/o ford motor co , whq ste 801-c1one american road481262023-03-222023Noneford credit auto owner trust 2022 dNoneNoneTrue
1713600001954336edgar/data/1477336/0001954336-23-000024.txt313-656-55000001954336wilmingtonally auto receivables trust 2022-3None23759320123110-k...1209 orange streetNone198012023-03-242023delawareally auto receivables trust 2022 3NoneNoneTrue
1713610001954436edgar/data/1954436/0000929638-23-001050.txt(214) 572-82760001954436irvingexeter automobile receivables trust 2022-6None23784761123110-k...2101 w. john carpenter freewayNone750632023-03-312023delawareexeter automobile receivables trust 2022 6NoneNoneTrue
1713620001955010edgar/data/1955010/0001140361-23-012122.txt212-326-15000001955010new yorkoha senior private lending fund (u) llcNone23740150123110-k...one vanderbilt, 16th floorNone100172023-03-172023delawareoha senior private lending fund u limited liab...NoneNoneTrue
\n", + "

171363 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " sec_company_id \\\n", + "0 0000001800 \n", + "1 0000001800_3a nutrition (vietnam) company limi... \n", + "2 0000001800_abbott (jiaxing) nutrition co., ltd... \n", + "3 0000001800_abbott (shanghai) diagnostics sales... \n", + "4 0000001800_abbott (uk) finance limited_united ... \n", + "... ... \n", + "171358 0001951118 \n", + "171359 0001951752 \n", + "171360 0001954336 \n", + "171361 0001954436 \n", + "171362 0001955010 \n", + "\n", + " filename business_phone \\\n", + "0 edgar/data/1800/0001628280-23-004026.txt 2246676100 \n", + "1 edgar/data/1800/0001628280-23-004026.txt None \n", + "2 edgar/data/1800/0001628280-23-004026.txt None \n", + "3 edgar/data/1800/0001628280-23-004026.txt None \n", + "4 edgar/data/1800/0001628280-23-004026.txt None \n", + "... ... ... \n", + "171358 edgar/data/1951118/0001853620-23-000117.txt (248) 991-6700 \n", + "171359 edgar/data/1951752/0001951752-23-000016.txt 3135943495 \n", + "171360 edgar/data/1477336/0001954336-23-000024.txt 313-656-5500 \n", + "171361 edgar/data/1954436/0000929638-23-001050.txt (214) 572-8276 \n", + "171362 edgar/data/1955010/0001140361-23-012122.txt 212-326-1500 \n", + "\n", + " central_index_key city \\\n", + "0 0000001800 abbott park \n", + "1 None None \n", + "2 None None \n", + "3 None None \n", + "4 None None \n", + "... ... ... \n", + "171358 0001951118 farmington hills \n", + "171359 0001951752 dearborn \n", + "171360 0001954336 wilmington \n", + "171361 0001954436 irving \n", + "171362 0001955010 new york \n", + "\n", + " company_name date_of_name_change \\\n", + "0 abbott laboratories None \n", + "1 3a nutrition (vietnam) company limited None \n", + "2 abbott (jiaxing) nutrition co., ltd None \n", + "3 abbott (shanghai) diagnostics sales co., ltd None \n", + "4 abbott (uk) finance limited None \n", + "... ... ... 
\n", + "171358 mercedes-benz auto receivables trust 2022-1 None \n", + "171359 ford credit auto owner trust 2022-d None \n", + "171360 ally auto receivables trust 2022-3 None \n", + "171361 exeter automobile receivables trust 2022-6 None \n", + "171362 oha senior private lending fund (u) llc None \n", + "\n", + " film_number fiscal_year_end form_type ... \\\n", + "0 23642562 1231 10-k ... \n", + "1 None None None ... \n", + "2 None None None ... \n", + "3 None None None ... \n", + "4 None None None ... \n", + "... ... ... ... ... \n", + "171358 23764946 1231 10-k ... \n", + "171359 23751556 1231 10-k ... \n", + "171360 23759320 1231 10-k ... \n", + "171361 23784761 1231 10-k ... \n", + "171362 23740150 1231 10-k ... \n", + "\n", + " street_1 street_2 zip \\\n", + "0 100 abbott park road None 60064-3500 \n", + "1 None None None \n", + "2 None None None \n", + "3 None None None \n", + "4 None None None \n", + "... ... ... ... \n", + "171358 35555 w. twelve mile rd. suite 100 48331 \n", + "171359 c/o ford motor co , whq ste 801-c1 one american road 48126 \n", + "171360 1209 orange street None 19801 \n", + "171361 2101 w. john carpenter freeway None 75063 \n", + "171362 one vanderbilt, 16th floor None 10017 \n", + "\n", + " report_date report_year location_of_inc \\\n", + "0 2023-02-17 2023 illinois \n", + "1 2023-02-17 2023 viet nam \n", + "2 2023-02-17 2023 china \n", + "3 2023-02-17 2023 china \n", + "4 2023-02-17 2023 united kingdom \n", + "... ... ... ... 
\n", + "171358 2023-03-27 2023 delaware \n", + "171359 2023-03-22 2023 None \n", + "171360 2023-03-24 2023 delaware \n", + "171361 2023-03-31 2023 delaware \n", + "171362 2023-03-17 2023 delaware \n", + "\n", + " company_name_clean parent_company_cik \\\n", + "0 abbott laboratories None \n", + "1 3a nutrition vietnam company limited 0000001800 \n", + "2 abbott jiaxing nutrition co limited 0000001800 \n", + "3 abbott shanghai diagnostics sales co limited 0000001800 \n", + "4 abbott uk finance limited 0000001800 \n", + "... ... ... \n", + "171358 mercedes benz auto receivables trust 2022 1 None \n", + "171359 ford credit auto owner trust 2022 d None \n", + "171360 ally auto receivables trust 2022 3 None \n", + "171361 exeter automobile receivables trust 2022 6 None \n", + "171362 oha senior private lending fund u limited liab... None \n", + "\n", + " own_per files_10k \n", + "0 None True \n", + "1 None False \n", + "2 None False \n", + "3 None False \n", + "4 None False \n", + "... ... ... 
\n", + "171358 None True \n", + "171359 None True \n", + "171360 None True \n", + "171361 None True \n", + "171362 None True \n", + "\n", + "[171363 rows x 27 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df" + ] + }, + { + "cell_type": "markdown", + "id": "3447dcdb-4506-4de0-9201-9711ff9259ee", + "metadata": {}, + "source": [ + "### There is a combination of SEC 10K filers and subsidiary companies:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "0d654dfc-2fb2-41d3-9ff8-6fe70732a04a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "files_10k\n", + "False 165824\n", + "True 5539\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df.files_10k.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "6797b5b7-be91-430a-a30c-cc26c62aa7b1", + "metadata": {}, + "source": [ + "### `sec_company_id` and `central_index_key` should be unique:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "67e0e789-feb0-4866-ba82-8346c62c1bef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df.sec_company_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "053d65c9-dbdd-4622-a4ee-badc7db2a88d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df.central_index_key.dropna().is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "b7e05e03-fa05-4655-a085-c66afcfba442", + "metadata": {}, + "source": [ + "### Location of incorporation should be clean and standardized for filers and subsidiaries."
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "cb33b703-be24-4ddc-a9f2-148850c3f4af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "delaware 3076\n", + "nevada 300\n", + "maryland 299\n", + "cayman islands 135\n", + "north carolina 92\n", + "new york 74\n", + "florida 74\n", + "pennsylvania 71\n", + "california 57\n", + "texas 56\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "cb6fc7b5-b9c0-46ae-991c-cae41f86e8f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "bahamas 1\n", + "germany 1\n", + "hong kong 1\n", + "china 1\n", + "virgin islands, u.s. 1\n", + "quebec, canada 1\n", + "new brunswick, canada 1\n", + "new hampshire 1\n", + "netherlands antilles 1\n", + "malaysia 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "5373ced2-75e9-4229-b927-3ad4b8d33e39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "delaware 67546\n", + "united kingdom 4979\n", + "cayman islands 3000\n", + "texas 2881\n", + "netherlands 2615\n", + "california 2566\n", + "germany 2381\n", + "china 2305\n", + "florida 2130\n", + "australia 1938\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": 
"3ceb1aa2-c622-4a97-9293-281325637f09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "ontario, can 1\n", + "british col, can 1\n", + "hong kong china china 1\n", + "zhongshan, china 1\n", + "jacksonville, florida 1\n", + "toronto, ontario, canada 1\n", + "limassol, cyprus 1\n", + "doncaster, syorkshire, uk 1\n", + "manchester, england 1\n", + "cote 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)" + ] + }, + { + "cell_type": "markdown", + "id": "95d51bdb-c378-45bc-9848-4a2a8895b470", + "metadata": {}, + "source": [ + "### All non SEC 10K filers should have a `parent_company_cik`" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "89cd6bdb-a06c-40ae-8b49-c610e769f9c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "parent_company_cik\n", + "False 165824\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k][\"parent_company_cik\"].isnull().value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "8f4bd494-951f-417f-ba56-fa0202d741a5", + "metadata": {}, + "source": [ + "### When run on all year quarters, all `parent_company_cik` should appear in `central_index_key` column" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "d024bc29-d0b1-45cd-a0a2-c9b66e73e0d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2954" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_parent_company_cik = len(set(sec_out_df.parent_company_cik))\n", + "n_parent_company_cik" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "d70660f2-559e-4ec1-8167-1bfdce45c287", + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "2832" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_overlap = len(set(sec_out_df.parent_company_cik).intersection(set(sec_out_df.central_index_key)))\n", + "n_overlap" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "0eb86d64-5ca0-423a-864c-dbfb00b5b9fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "122" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_parent_company_cik - n_overlap" + ] + }, + { + "cell_type": "markdown", + "id": "60366af2-259a-4a87-a93f-2180d8777c67", + "metadata": {}, + "source": [ + "### There should be filer companies that have a `parent_company_cik` because they were matched to a subsidiary" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "b5c53dab-3be5-48f1-90f6-583acfb452ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "parent_company_cik\n", + "True 5474\n", + "False 65\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.files_10k].parent_company_cik.isnull().value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "5358a4e1-38a7-489d-bf1a-f53de58447ba", + "metadata": {}, + "source": [ + "### There should be no non-filer companies that have a CIK" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "4a19df26-79c3-4aa1-bcbf-916b822346ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "central_index_key\n", + "True 165824\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k].central_index_key.isnull().value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": 
"bde4f03f-e5b0-4895-ade6-ae44b260e78e", + "metadata": {}, + "source": [ + "### There should be no duplicated `company_name`, `location_of_inc`, `parent_company_cik` records" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "ca87709a-daa7-4396-83a4-0f5bb8ec2cd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sec_company_idfilenamebusiness_phonecentral_index_keycitycompany_namedate_of_name_changefilm_numberfiscal_year_endform_type...street_1street_2zipreport_datereport_yearlocation_of_inccompany_name_cleanparent_company_cikown_perfiles_10k
\n", + "

0 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [sec_company_id, filename, business_phone, central_index_key, city, company_name, date_of_name_change, film_number, fiscal_year_end, form_type, former_conformed_name, irs_number, sec_act, sec_file_number, standard_industrial_classification, state, state_of_incorporation, street_1, street_2, zip, report_date, report_year, location_of_inc, company_name_clean, parent_company_cik, own_per, files_10k]\n", + "Index: []\n", + "\n", + "[0 rows x 27 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.duplicated(subset=[\"company_name\", \"location_of_inc\", \"parent_company_cik\"])]" + ] + }, + { + "cell_type": "markdown", + "id": "bca9e395-bd96-4183-b299-46cd589d97d5", + "metadata": {}, + "source": [ + "### There can be companies with the same name, location, and CIK, but different parent companies." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "cc1880f3-a9d3-4f8a-a42b-2f9ff428ca45", + "metadata": {}, + "outputs": [], + "source": [ + "sec_out_df = sec_out_df.fillna({\"central_index_key\": pd.NA})" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "f87257df-00f7-48a8-882a-fb1ea8c27e18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
company_namelocation_of_inccentral_index_keyparent_company_cik
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [company_name, location_of_inc, central_index_key, parent_company_cik]\n", + "Index: []" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.central_index_key.isnull() \n", + " & (sec_out_df.duplicated(\n", + " subset=[\"company_name\", \"location_of_inc\", \"central_index_key\"], keep=False\n", + " ))][[\"company_name\", \"location_of_inc\", \"central_index_key\", \"parent_company_cik\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2169181-dcd8-4b43-b03e-9526f597147d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index b4bc1f2..72536e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,11 +14,12 @@ readme = {file = "README.rst", content-type = "text/x-rst"} authors = [ {name = "Catalyst Cooperative", email = "pudl@catalyst.coop"} ] -requires-python = ">=3.10,<=3.12" +requires-python = ">=3.10,<3.13" dynamic = ["version"] license = {file = "LICENSE.txt"} dependencies = [ "accelerate>=0.21.0,<2.0", # Hugging Face dependency for PyTorch models + "catalystcoop.pudl @ git+https://github.com/catalyst-cooperative/pudl.git", "cloud-sql-python-connector[pg8000]", "dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things "dagster-mlflow", @@ -30,6 +31,7 @@ dependencies = [ "google-cloud-secret-manager>=2,<3", "google-cloud-storage>=2,<3", "hypothesis", + "jellyfish>=1.1", "matplotlib>=3.8,<4", "mlflow>=2.12",
"opencv-python", @@ -44,6 +46,7 @@ dependencies = [ "pydantic-settings>=2", "python-bidi<0.7.0", "pymupdf", # Convert PDF to image + "splink>=4,<5", "sqlalchemy>=2,<3", "timm>0.9,<2", # dependency for Hugging Face computer vision models "torch>=2.2,<3", @@ -61,6 +64,7 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] keywords = [ "template", @@ -92,9 +96,9 @@ dev = [ docs = [ "doc8>=1,<2", # Ensures clean documentation formatting "furo>=2022.4.7", - "sphinx>=6,<8.2", # The default Python documentation engine + "sphinx>=6,<8.1", # The default Python documentation engine "sphinx-autoapi>=2,<4", # Generates documentation from docstrings - "sphinx-issues>=1.2,<6", # Allows references to GitHub issues + "sphinx-issues>=5", # Allows references to GitHub issues ] tests = [ @@ -200,8 +204,8 @@ lint.ignore = [ "EXE002", ] -# Assume Python 3.11 -target-version = "py311" +# Assume Python 3.12 +target-version = "py312" line-length = 88 # Don't automatically concatenate strings -- sometimes we forget a comma! 
@@ -230,6 +234,6 @@ inline-quotes = "double" multiline-quotes = "double" [tool.mypy] -python_version = "3.10" +python_version = "3.12" warn_return_any = true warn_unused_configs = true diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py new file mode 100644 index 0000000..217fb9b --- /dev/null +++ b/src/mozilla_sec_eia/library/record_linkage_utils.py @@ -0,0 +1,170 @@ +"""Utility functions for cleaning strings during modeling preprocessing steps.""" + +import json +from enum import StrEnum +from importlib import resources + +import jellyfish +import pandas as pd + +from pudl.analysis.record_linkage import name_cleaner + +INVALID_NAMES = [ + "llc", + "limited liability company", + "limited", + "ltd", + "iiii", + "inc", + "incorporated", + "partnership", + "i", + "name", + "company", + "&", + "", +] + +company_name_cleaner = name_cleaner.CompanyNameCleaner( + cleaning_rules_list=[ + "remove_word_the_from_the_end", + "remove_word_the_from_the_beginning", + "replace_ampersand_by_AND", + "replace_hyphen_by_space", + "replace_underscore_by_space", + "remove_text_punctuation", + "remove_parentheses", + "remove_brackets", + "remove_curly_brackets", + "enforce_single_space_between_words", + ] +) + +legal_term_remover = name_cleaner.CompanyNameCleaner( + cleaning_rules_list=[], handle_legal_terms=2 +) + + +def clean_company_name( + df: pd.DataFrame, col_name: str = "company_name" +) -> pd.DataFrame: + """Conduct cleaning on a company name column and add column without legal terms. + + Uses the PUDL name cleaner object to do basic cleaning on `col_name` column + such as stripping punctuation, correcting case, normalizing legal + terms etc. The clean column becomes the `col_name` column and the original + `col_name` column is renamed to `{col_name}_raw`. Also adds a column called + `{col_name}_no_legal` which has legal terms stripped from the clean strings. 
+ + Arguments: + df: The dataframe that is to be cleaned. Must contain `col_name` column. + col_name: The name of the column with the company name strings. + + Returns: + pd.DataFrame: The original dataframe with `col_name` now containing + cleaned strings and an additional column with the raw strings + and a column with the legal terms stripped from the company name. + """ + df[col_name] = df[col_name].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA) + df.loc[:, f"{col_name}_clean"] = company_name_cleaner.apply_name_cleaning( + df[[col_name]] + ).str.strip() + df = df[df[f"{col_name}_clean"] != ""] + df = df.rename(columns={col_name: f"{col_name}_raw"}).rename( + columns={f"{col_name}_clean": col_name} + ) + df.loc[:, f"{col_name}_no_legal"] = legal_term_remover.apply_name_cleaning( + df[[col_name]] + ) + return df + + +def handle_invalid_names( + df: pd.DataFrame, col_name: str = "company_name", drop_invalid: bool = True +) -> pd.DataFrame: + """Drop rows that have invalid company names, like just 'llc', or 'partnership'. + + Either drop invalid company name values or fill with the empty string. Invalid + values are contained in `INVALID_NAMES`. + """ + if drop_invalid: + return df[(~df[col_name].isin(INVALID_NAMES))] + df[col_name] = df[col_name].where(~df[col_name].isin(INVALID_NAMES), "") + return df + + +def flatten_companies_across_time( + df: pd.DataFrame, key_cols: list[str], date_col: str = "report_date" +) -> pd.DataFrame: + """Keep only the most recent record for each group of `key_cols`. + + Dataframe must have all of `key_cols` and `date_col`. 
+ """ + df = ( + df.sort_values(by=date_col, ascending=False).groupby(key_cols).first() + ).reset_index() + return df + + +# TODO: this is in PUDL, deduplicate +def get_metaphone_col(col: pd.Series) -> pd.Series: + """Get the metaphones of the strings in a column.""" + return col.apply(jellyfish.metaphone) + + +class HandleNulls(StrEnum): + """Enum for handling null values in company name transform.""" + + DROP = "drop" + FILL_EMPTY_STR = "fill_empty_str" + + +def transform_company_name( + df: pd.DataFrame, + col_name: str = "company_name", + handle_nulls: HandleNulls = HandleNulls.DROP, +) -> pd.DataFrame: + """Apply cleaning, get metaphone col, drop invalid rows.""" + df = clean_company_name(df, col_name=col_name) + if handle_nulls == HandleNulls.DROP: + df = handle_invalid_names(df, col_name, drop_invalid=True) + df = df[~df[col_name].isnull()] + elif handle_nulls == HandleNulls.FILL_EMPTY_STR: + df = handle_invalid_names(df, col_name, drop_invalid=False) + df = df.fillna({col_name: ""}) + df.loc[:, f"{col_name}_mphone"] = get_metaphone_col(df[f"{col_name}_no_legal"]) + + return df + + +def fill_street_address_nulls( + df: pd.DataFrame, + address_col: str = "street_address", + secondary_address_col: str = "street_address_2", +) -> pd.DataFrame: + """Fill null street address with value from secondary address column.""" + df[address_col] = df[address_col].where( + (~df[address_col].isnull()) | (df[secondary_address_col].isnull()), + df[secondary_address_col], + ) + return df + + +def expand_street_name_abbreviations(col: pd.Series) -> pd.Series: + """Standardize street address suffixes, like street to st. + + Expects lower case strings in column. 
+ """ + # remove punctuation from column first + col = col.str.replace(r"[^\w\s]", "", regex=True) + + json_source = ( + resources.files("mozilla_sec_eia.package_data") + / "street_suffix_abbreviations.json" + ) + with json_source.open() as f: + address_expansions = json.load(f) + for standard_abbr, suffix_list in address_expansions.items(): + pattern = r"\b(" + "|".join(suffix_list) + r")\b" + col = col.str.replace(pattern, standard_abbr, regex=True) + return col diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb new file mode 100644 index 0000000..7e2852f --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -0,0 +1,1097 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0da8c588-2d09-464b-945f-168704c0cdac", + "metadata": { + "tags": [] + }, + "source": [ + "# Exhibit 21 extraction\n", + "\n", + "This notebook implements a model built on top of [layoutlmv3](https://huggingface.co/microsoft/layoutlmv3-base/tree/main)\n", + "that extracts structured tables from Exhibit 21 attachments to SEC-10k filings. These documents contain a list of all subsidiary companies owned by a filing\n", + "company." + ] + }, + { + "cell_type": "markdown", + "id": "84aab877-9d59-4ec7-bf4b-c75e216fb1d6", + "metadata": {}, + "source": [ + "## Load upstream assets and configuration\n", + "The following cell can be run interactively to set configuration and load upstream assets. When running the notebook in dagster, this cell will be replaced with assets from the dagster run and dagster run configuration.\n", + "\n", + "### Config\n", + "- `layoutlm_training_run`: If `None` the notebook will finetune layoutlm using `ex21_training_data`.
If `layoutlm_training_run` is the name of an existing run on the mlflow tracking server, the notebook will use the pre-trained model from that run and perform inference on the validation set, logging validation metrics to a child run nested under the mlflow run associated with the pretrained model.\n", + "\n", + "### Upstream assets\n", + "We are using dagster assets to construct training/validation data outside the notebook to allow for easy caching. These datasets are fairly compute intensive to create, so this is useful when iterating on the model using the same data.\n", + "\n", + "NOTE: The notebook will load the most recent version of these assets, so to update the training/validation data you must rerun the dagster assets with desired configuration.\n", + "\n", + "- `ex21_training_data`: Dataset containing labeled data produced in label-studio to train `layoutlm`\n", + "- `ex21_validation_set`: Labeled validation data describing expected inference output on validation filings\n", + "- `ex21_failed_parsing_metadata`: Metadata for any validation filings that couldn't be parsed (usually empty)\n", + "- `ex21_inference_dataset`: Parsed validation filings prepped for inference model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48f185de-95ef-4194-9245-93f8d603d2e6", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "import dagstermill\n", + "\n", + "from mozilla_sec_eia.models.sec10k import defs\n", + "\n", + "context = dagstermill.get_context(op_config={\n", + " \"layoutlm_training_run\": \"layoutlm-labeledv0.2\",\n", + "})\n", + "\n", + "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n", + "\n", + "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n", + "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n", + "ex21_validation_set = defs.load_asset_value(\"ex21_validation_set\")" + ] + }, + { + "cell_type": "markdown", + "id":
"7f299b2b-2358-4526-b023-f29c817316d9", + "metadata": { + "tags": [] + }, + "source": [ + "## Train Layoutlmv3" + ] + }, + { + "cell_type": "markdown", + "id": "32edcce1-ab18-40b6-9da8-ce0ea53c2f72", + "metadata": { + "tags": [] + }, + "source": [ + "### Define training metrics\n", + "The method `compute_metrics` will be used to score the model. It computes precision, recall, f1 score, and accuracy on bounding box labels output by `layoutlm`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from tempfile import TemporaryDirectory\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename\n", + "\n", + "\n", + "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n", + " \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n", + " predictions, labels = p\n", + " predictions = np.argmax(predictions, axis=2)\n", + "\n", + " # Remove ignored index (special tokens)\n", + " true_predictions = [\n", + " [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + " true_labels = [\n", + " [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + "\n", + " results = metric.compute(predictions=true_predictions, references=true_labels)\n", + " if return_entity_level_metrics:\n", + " # Unpack nested dictionaries\n", + " final_results = {}\n", + " for key, value in results.items():\n", + " if isinstance(value, dict):\n", + " for n, v in value.items():\n", + " final_results[f\"{key}_{n}\"] = v\n", + " else:\n", + " final_results[key] = value\n", + " return final_results\n", + " return {\n", + " \"precision\": 
results[\"overall_precision\"],\n", + " \"recall\": results[\"overall_recall\"],\n", + " \"f1\": results[\"overall_f1\"],\n", + " \"accuracy\": results[\"overall_accuracy\"],\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "8160263c-8f69-437c-918b-e56ad007961a", + "metadata": { + "tags": [] + }, + "source": [ + "#### Finetune Model\n", + "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n", + "\n", + "Model training contains several steps implemented below:\n", + "1. Use temporary path to convert filings to PDF's and stash labels\n", + "2. Use PDF's and labels to convert PDF's and labels to NER annotations\n", + "3. Construct huggingface dataset from NER annotations and split into train and test sets\n", + "4. Load pretrained model from huggingface\n", + "5. 
Finetune model on training data and evaluate on test data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "71d205b2-e6ea-4ad0-982c-22e762269119", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import mlflow\n", + "from datasets import (\n", + " Array2D,\n", + " Array3D,\n", + " Dataset,\n", + " Features,\n", + " Sequence,\n", + " Value,\n", + " load_metric,\n", + ")\n", + "from dotenv import load_dotenv\n", + "from transformers import (\n", + " AutoProcessor,\n", + " LayoutLMv3ForTokenClassification,\n", + " Trainer,\n", + " TrainingArguments,\n", + ")\n", + "from transformers.data.data_collator import default_data_collator\n", + "\n", + "from mozilla_sec_eia.library.mlflow import configure_mlflow\n", + "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n", + " BBOX_COLS,\n", + " LABELS,\n", + " get_id_label_conversions,\n", + ")\n", + "\n", + "load_dotenv()\n", + "\n", + "\n", + "configure_mlflow()\n", + "mlflow.set_experiment(\"exhibit21_extraction_test\")\n", + "\n", + "\n", + "def _prepare_dataset(annotations, processor, label2id):\n", + " \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n", + "\n", + " def _convert_ner_tags_to_id(ner_tags, label2id):\n", + " return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n", + "\n", + " images = annotations[\"image\"]\n", + " words = annotations[\"tokens\"]\n", + " boxes = annotations[\"bboxes\"]\n", + " # Map over labels and convert to numeric id for each ner_tag\n", + " ner_tags = [\n", + " _convert_ner_tags_to_id(ner_tags, label2id)\n", + " for ner_tags in annotations[\"ner_tags\"]\n", + " ]\n", + "\n", + " encoding = processor(\n", + " images,\n", + " words,\n", + " boxes=boxes,\n", + " word_labels=ner_tags,\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " )\n", + "\n", + " return encoding\n", + "\n", + "if (run_name := context.op_config[\"layoutlm_training_run\"]) is not None:\n", + " filter_string = f\"attributes.run_name 
= '{run_name}'\"\n", + " run = mlflow.search_runs(filter_string=filter_string, output_format=\"list\")[0]\n", + " training_run_id = run.info.run_id\n", + "else:\n", + " training_run_id = None\n", + "\n", + "# Only finetune if configured to do so\n", + "if training_run_id is None:\n", + " id2label, label2id = get_id_label_conversions(LABELS)\n", + " # Change temp_dir to save training data locally for inspection\n", + " # Cache/prepare training data\n", + " dataset = Dataset.from_list(ex21_training_data)\n", + "\n", + " # Load pretrained model\n", + " model = LayoutLMv3ForTokenClassification.from_pretrained(\n", + " \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n", + " )\n", + " processor = AutoProcessor.from_pretrained(\n", + " \"microsoft/layoutlmv3-base\", apply_ocr=False\n", + " )\n", + "\n", + " # Prepare our train & eval dataset\n", + " column_names = dataset.column_names\n", + " features = Features(\n", + " {\n", + " \"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n", + " \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n", + " \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n", + " \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n", + " \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n", + " }\n", + " )\n", + " dataset = dataset.map(\n", + " lambda annotations: _prepare_dataset(annotations, processor, label2id),\n", + " batched=True,\n", + " remove_columns=column_names,\n", + " features=features,\n", + " )\n", + " dataset.set_format(\"torch\")\n", + " split_dataset = dataset.train_test_split(test_size=0.2)\n", + " train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n", + "\n", + " # Initialize our Trainer\n", + " metric = load_metric(\"seqeval\")\n", + " training_args = TrainingArguments(\n", + " max_steps=1000,\n", + " per_device_train_batch_size=1,\n", + " per_device_eval_batch_size=1,\n", + " learning_rate=1e-5,\n", + " evaluation_strategy=\"steps\",\n", + " 
eval_steps=100,\n", + " load_best_model_at_end=True,\n", + " metric_for_best_model=\"f1\",\n", + " output_dir=\"./layoutlm\",\n", + " )\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " tokenizer=processor,\n", + " data_collator=default_data_collator,\n", + " compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n", + " )\n", + "\n", + " with mlflow.start_run() as training_run:\n", + " # Train inside mlflow run. Mlflow will automatically handle logging training metrcis\n", + " trainer.train()\n", + "\n", + " # Log finetuend model with mlflow\n", + " model = {\"model\": trainer.model, \"tokenizer\": trainer.tokenizer}\n", + " mlflow.transformers.log_model(\n", + " model, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n", + " )\n", + " training_run_id = training_run.info. run_id" + ] + }, + { + "cell_type": "markdown", + "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e", + "metadata": {}, + "source": [ + "## Model inference\n", + "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "42c8e920-d671-40c2-b5db-c43611a33897", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import Pipeline, pipeline\n", + "from transformers.tokenization_utils_base import BatchEncoding\n", + "\n", + "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n", + " get_flattened_mode_predictions,\n", + ")\n", + "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n", + " iob_to_label,\n", + ")\n", + "\n", + "\n", + "def separate_entities_by_row(entity_df):\n", + " \"\"\"Separate entities that span multiple rows and should be distinct.\n", + "\n", + " Sometimes LayoutLM groups multiple entities that span multiple rows\n", + " into one entity. This function makes an attempt to break these out\n", + " into multiple entities, by taking the average distance between rows\n", + " and separating a grouped entity if the distance between y values\n", + " is greater than the third quantile of y value spacing.\n", + " \"\"\"\n", + " threshold = 1.0\n", + " entity_df.loc[:, \"line_group\"] = entity_df.loc[:, \"top_left_y\"].transform(\n", + " lambda y: (y // threshold).astype(int)\n", + " )\n", + " # Get the unique y-values for each line (group) per file\n", + " line_positions = (\n", + " entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n", + " )\n", + " # Calculate the difference between adjacent y-values (i.e., distance between lines)\n", + " line_positions.loc[:, \"y_diff\"] = line_positions.loc[:, \"top_left_y\"].diff()\n", + " # Filter out NaN values and take the mean of the valid distances\n", + " y_diffs = line_positions[\"y_diff\"].dropna()\n", + " avg_y_diff = y_diffs.apply(np.floor).mean()\n", + " # if an I labeled entity is more than avg_y_diff from it's previoius box then make it a B entity\n", + " entity_df.loc[:, \"prev_y\"] = entity_df.loc[:, \"top_left_y\"].shift(1)\n", + " entity_df.loc[:, \"prev_iob\"] 
= entity_df.loc[:, \"iob_pred\"].shift(1)\n", + "\n", + " # If the current prediction is an I label\n", + " # and y distance exceeds the average y difference\n", + " # update to a B label and make it the start of a new entity\n", + " entity_df.loc[:, \"iob_pred\"] = np.where(\n", + " (entity_df[\"iob_pred\"].str[0] == \"I\")\n", + " & ((entity_df[\"top_left_y\"] - entity_df[\"prev_y\"]) >= avg_y_diff),\n", + " \"B\" + entity_df[\"iob_pred\"].str[1:], # Update to 'B'\n", + " entity_df[\"iob_pred\"], # Keep as is\n", + " )\n", + "\n", + " # Drop temporary columns\n", + " return entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n", + "\n", + "class LayoutLMInferencePipeline(Pipeline):\n", + " \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n", + "\n", + " def __init__(self, *args, **kwargs):\n", + " \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n", + " super().__init__(*args, **kwargs)\n", + "\n", + " def _sanitize_parameters(self, **kwargs):\n", + " preprocess_kwargs = {}\n", + " if \"maybe_arg\" in kwargs:\n", + " preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n", + " return preprocess_kwargs, {}, {}\n", + "\n", + " def preprocess(self, doc_dict):\n", + " \"\"\"Encode and tokenize model inputs.\"\"\"\n", + " image = doc_dict[\"image\"]\n", + " words = doc_dict[\"tokens\"]\n", + " boxes = doc_dict[\"bboxes\"]\n", + " encoding = self.tokenizer(\n", + " image,\n", + " words,\n", + " boxes=boxes,\n", + " return_tensors=\"pt\",\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " max_length=512, # this is the maximum max_length\n", + " stride=128,\n", + " return_offsets_mapping=True,\n", + " return_overflowing_tokens=True,\n", + " )\n", + " model_inputs = {}\n", + " model_inputs[\"raw_encoding\"] = encoding.copy()\n", + " model_inputs[\"doc_dict\"] = doc_dict\n", + " model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n", + " model_inputs[\"sample_mapping\"] = 
encoding.pop(\"overflow_to_sample_mapping\")\n", + " # TODO: do we actually need to make these into ints?\n", + " encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n", + " encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n", + " encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n", + " encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n", + " model_inputs[\"encoding\"] = encoding\n", + " return model_inputs\n", + "\n", + " def _forward(self, model_inputs):\n", + " # encoding is passed as a UserDict in the model_inputs dictionary\n", + " # turn it back into a BatchEncoding\n", + " encoding = BatchEncoding(model_inputs[\"encoding\"])\n", + " if torch.cuda.is_available():\n", + " encoding.to(\"cuda\")\n", + " self.model.to(\"cuda\")\n", + " # since we're doing inference, we don't need gradient computation\n", + " with torch.no_grad():\n", + " output = self.model(**encoding)\n", + " return {\n", + " \"logits\": output.logits,\n", + " \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n", + " \"raw_encoding\": model_inputs[\"raw_encoding\"],\n", + " \"doc_dict\": model_inputs[\"doc_dict\"],\n", + " }\n", + "\n", + " def postprocess(self, output_dict):\n", + " \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n", + " output_df = self.extract_table(output_dict)\n", + " output_dict[\"output_df\"] = output_df\n", + " return output_dict\n", + "\n", + " def extract_table(self, output_dict):\n", + " \"\"\"Extract a structured table from a set of inference predictions.\n", + "\n", + " This function essentially works by stacking bounding boxes and predictions\n", + " into a dataframe and going from left to right and top to bottom. Then,\n", + " every time a new subsidiary entity is encountered, it assigns a new group or\n", + " \"row\" to that subsidiary. 
Next, location and ownership percentage words/labeled\n", + " entities in between these subsidiary groups are assigned to a subsidiary row/group.\n", + " Finally, this is all formatted into a dataframe with an ID column from the original\n", + " filename and a basic cleaning function normalizes strings.\n", + " \"\"\"\n", + " # TODO: when model more mature, break this into sub functions to make it\n", + " # clearer what's going on\n", + " predictions = output_dict[\"predictions\"]\n", + " encoding = output_dict[\"raw_encoding\"]\n", + " doc_dict = output_dict[\"doc_dict\"]\n", + "\n", + " token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n", + " predictions_tensor = torch.tensor(predictions)\n", + " mode_predictions = get_flattened_mode_predictions(\n", + " token_boxes_tensor, predictions_tensor\n", + " )\n", + " token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n", + " predicted_labels = [\n", + " self.model.config.id2label[pred] for pred in mode_predictions\n", + " ]\n", + " simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n", + "\n", + " df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n", + " df.loc[:, \"iob_pred\"] = predicted_labels\n", + " df.loc[:, \"pred\"] = simple_preds\n", + " invalid_mask = (\n", + " (df[\"top_left_x\"] == 0)\n", + " & (df[\"top_left_y\"] == 0)\n", + " & (df[\"bottom_right_x\"] == 0)\n", + " & (df[\"bottom_right_y\"] == 0)\n", + " )\n", + " df = df[~invalid_mask]\n", + " # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n", + " # subwords from the same word share the same bounding box coordinates\n", + " # so we merge the original words onto our dataframe on bbox coordinates\n", + " words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n", + " words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n", + " df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n", + " subset=BBOX_COLS + [\"pred\", 
\"word\"]\n", + " )\n", + " df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n", + " # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n", + " # should always have a B entity label. Manually override labels so this is true.\n", + " first_in_group_df = df[\n", + " (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n", + " ]\n", + " first_in_group_df.loc[:, \"iob_pred\"] = (\n", + " \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n", + " )\n", + " df.update(first_in_group_df)\n", + " # filter for just words that were labeled with non \"other\" entities\n", + " entities_df = df[df[\"pred\"] != \"other\"]\n", + " # boxes that have the same group label but are on different rows\n", + " # should be updated to have two different B labels\n", + "\n", + " entities_df = entities_df.groupby(\"pred\").apply(separate_entities_by_row, include_groups=False)\n", + " entities_df = entities_df.reset_index(\"pred\").sort_index()\n", + " # merge B and I entities to form one entity group\n", + " # (i.e. 
\"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n", + " entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n", + " grouped_df = (\n", + " entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n", + " .apply(\" \".join)\n", + " .reset_index()[[\"pred\", \"word\"]]\n", + " )\n", + " # assign a new row every time there's a new subsidiary\n", + " grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n", + " output_df = grouped_df.pivot_table(\n", + " index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n", + " ).reset_index()\n", + " if output_df.empty:\n", + " return output_df\n", + " output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n", + " return output_df" + ] + }, + { + "cell_type": "markdown", + "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f", + "metadata": {}, + "source": [ + "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'training_run_id' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex_21\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex21_validation_helpers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 8\u001b[0m clean_extracted_df,\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# If a model was trained in this notebook, use it. 
Otherwise, use\u001b[39;00m\n\u001b[0;32m---> 12\u001b[0m model_uri \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mruns:/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mtraining_run_id\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 13\u001b[0m model_info \u001b[38;5;241m=\u001b[39m mlflow\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mget_model_info(model_uri)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_data\u001b[39m(dataset):\n", + "\u001b[0;31mNameError\u001b[0m: name 'training_run_id' is not defined" + ] + } + ], + "source": [ + "from PIL import Image\n", + "\n", + "from mozilla_sec_eia.models.sec10k.entities import (\n", + " Ex21CompanyOwnership,\n", + " Sec10kExtractionMetadata,\n", + ")\n", + "from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (\n", + " clean_extracted_df,\n", + ")\n", + "\n", + "# If a model was trained in this notebook, use it. Otherwise, use\n", + "model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n", + "model_info = mlflow.models.get_model_info(model_uri)\n", + "\n", + "def _get_data(dataset):\n", + " yield from dataset\n", + "\n", + "def _fill_known_nulls(df):\n", + " \"\"\"Fill known nulls in location and own per column.\n", + "\n", + " Fill with known values from rows with same subsidiary.\n", + " If an extracted Ex. 
21 table looks like the following:\n", + "\n", + " subsidiary loc own_per\n", + " Company A NaN NaN\n", + " Company A Delaware 50\n", + "\n", + " Then fill in the first row with location and ownership\n", + " percentage from the second row.\n", + " \"\"\"\n", + " if \"own_per\" in df:\n", + " df[\"own_per\"] = df.groupby([\"id\", \"subsidiary\"])[\"own_per\"].transform(\n", + " lambda group: group.ffill()\n", + " )\n", + " if \"loc\" in df:\n", + " df[\"loc\"] = df.groupby([\"id\", \"subsidiary\"])[\"loc\"].transform(\n", + " lambda group: group.ffill()\n", + " )\n", + " return df\n", + "\n", + "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n", + " \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n", + " def load_context(self, context):\n", + " \"\"\"Load pretrained model.\"\"\"\n", + " os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n", + " self.model_components = mlflow.transformers.load_model(\n", + " context.artifacts[\"model_components\"], return_type=\"components\"\n", + " )\n", + "\n", + " def predict(self, context, model_input: pd.DataFrame, params=None):\n", + " \"\"\"Use pretrained model and inference pipeline to perform inference.\"\"\"\n", + " # Convert dataframe to pyarrow Dataset\n", + " model_input[\"image\"] = model_input.apply(\n", + " lambda row: Image.frombytes(\n", + " row[\"mode\"], (row[\"width\"], row[\"height\"]), row[\"image\"]\n", + " ),\n", + " axis=1,\n", + " )\n", + " dataset = Dataset.from_list(model_input.drop([\"mode\", \"width\", \"height\"], axis=1).to_dict(\"records\"))\n", + "\n", + " # TODO: figure out device argument\n", + " pipe = pipeline(\n", + " \"token-classification\",\n", + " model=self.model_components[\"model\"],\n", + " tokenizer=self.model_components[\"tokenizer\"],\n", + " pipeline_class=LayoutLMInferencePipeline,\n", + " device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"),\n", + " )\n", + "\n", + " logits = []\n", + " predictions = []\n", + " 
all_output_df = Ex21CompanyOwnership.example(size=0)\n", + " extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n", + " for output_dict in pipe(_get_data(dataset)):\n", + " logits.append(output_dict[\"logits\"])\n", + " predictions.append(output_dict[\"predictions\"])\n", + " output_df = output_dict[\"output_df\"]\n", + " if not output_df.empty:\n", + " filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n", + " extraction_metadata.loc[filename, [\"success\"]] = True\n", + " all_output_df = pd.concat([all_output_df, output_df])\n", + " all_output_df.columns.name = None\n", + " all_output_df = clean_extracted_df(all_output_df)\n", + " all_output_df = _fill_known_nulls(all_output_df)\n", + " all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]].drop_duplicates()\n", + " all_output_df = all_output_df.reset_index(drop=True)\n", + " outputs_dict = {\n", + " \"all_output_df\": all_output_df,\n", + " \"logits\": logits,\n", + " \"predictions\": predictions,\n", + " }\n", + " return extraction_metadata, outputs_dict\n", + "\n", + "# Save model to local temp dir with artifacts, then reload for evaluation\n", + "with TemporaryDirectory() as tmp_dir:\n", + " mlflow.pyfunc.save_model(\n", + " path=tmp_dir,\n", + " python_model=Ex21Extractor(),\n", + " artifacts={\"model_components\": model_uri},\n", + " )\n", + " ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d", + "metadata": { + "tags": [] + }, + "source": [ + "### Model Evaluation\n", + "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. 
This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately." + ] + }, + { + "cell_type": "markdown", + "id": "1dee550f-7b06-4091-a65e-71c6b23a5bea", + "metadata": { + "tags": [] + }, + "source": [ + "#### Validate model\n", + "Finally, run the full model on the validation set and log metrics to mlflow. The logged metrics/model will appear in a nested run below the training run used for the current version of the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfb56470-8527-424c-a9e5-4135e55fde4d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/10/16 17:11:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument 
is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + 
"/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The 
`device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + 
"/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The 
`device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + 
"/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The 
`device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n", + "/tmp/ipykernel_48762/2514174394.py:29: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " lambda group: group.ffill()\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. 
To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. 
In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_validation_set = pd.concat(\n", + "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", + " padded_compute_set = pd.concat(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. 
def convert_ex21_id_to_filename(
    df: pd.DataFrame, id_col_name: str = "id"
) -> pd.DataFrame:
    """Add a ``filename`` column holding the GCS archive filename of each filing.

    The extracted Ex. 21 tables have an ID that doesn't match the filenames
    in the GCS archive. The filename is built from the ID by replacing the
    *first* ``-`` with ``/``, prefixing ``edgar/data/`` and appending ``.txt``.

    Note that ``df`` is modified in place: the ``filename`` column is written
    onto the frame that was passed in, and that same frame is returned. Use
    the return value as the whole table (``df = convert_ex21_id_to_filename(df)``),
    not as the value of a single column.

    Args:
        df: Table containing the Ex. 21 ID column.
        id_col_name: Name of the column holding the Ex. 21 ID strings.

    Returns:
        The input dataframe, with the ``filename`` column added/overwritten.

    Raises:
        KeyError: If ``id_col_name`` is not a column of ``df``.
    """
    if id_col_name not in df.columns:
        raise KeyError(
            f"Cannot build filenames: column {id_col_name!r} not found in dataframe."
        )
    # regex=False: "-" is a literal separator, never a pattern. n=1 keeps the
    # remaining dashes of the accession number intact.
    archive_path = df[id_col_name].str.replace("-", "/", n=1, regex=False)
    df.loc[:, "filename"] = "edgar/data/" + archive_path + ".txt"
    return df
mozilla_sec_eia.models.sec10k.utils.cloud import cloud_interface_resource + +from ..sec10k.extract import year_quarter_partitions +from . import transform_eia_input, transform_sec_input + +eia_assets = load_assets_from_modules([transform_eia_input]) +sec_assets = load_assets_from_modules([transform_sec_input]) + +eia_input_table_production_job = model_jobs.create_production_model_job( + "eia_input_table_creation", transform_eia_input.production_assets +) +sec_input_table_production_job = model_jobs.create_production_model_job( + "sec_input_table_creation", transform_sec_input.production_assets +) + +# Create year_quarter partitions +completed_partitions = StaticPartitionsDefinition( + [ + year_quarter + for year_quarter in year_quarter_partitions.get_partition_keys() + if year_quarter + not in ["2018q1", "2018q2", "2019q1", "2020q1", "2021q1", "2022q1"] + ] +) + +basic_10k_company_info = AssetSpec( + key=AssetKey("basic_10k_company_info"), partitions_def=completed_partitions +).with_io_manager_key("pandas_parquet_io_manager") + +ex21_company_ownership_info = AssetSpec( + key=AssetKey("ex21_company_ownership_info"), partitions_def=completed_partitions +).with_io_manager_key("pandas_parquet_io_manager") + +sec10k_filing_metadata = AssetSpec( + key=AssetKey("sec10k_filing_metadata"), partitions_def=completed_partitions +).with_io_manager_key("io_manager") + +defs = Definitions( + sec_assets + + eia_assets + + [basic_10k_company_info, ex21_company_ownership_info, sec10k_filing_metadata], + jobs=[eia_input_table_production_job, sec_input_table_production_job], + resources={ + "cloud_interface": cloud_interface_resource, + "mlflow_interface": mlflow_interface_resource, + "pandas_parquet_io_manager": PandasParquetIOManager( + base_path=UPath("gs://sec10k-outputs/v2") + ), + "pickle_gcs_io_manager": PickleUPathIOManager( + base_path=UPath("gs://sec10k-outputs/dagster_storage") + ), + "pyfunc_model_io_manager": MlflowPyfuncModelIOManager( + 
mlflow_interface=mlflow_interface_resource + ), + "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), + } + | mlflow_train_test_io_managers, +) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py new file mode 100644 index 0000000..c8ccfd9 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py @@ -0,0 +1,56 @@ +"""Configuration file for the splink SEC to EIA record linkage model.""" + +import splink.comparison_library as cl +from splink import block_on + +STR_COLS = [ + "company_name", + "street_address", + "street_address_2", + "city", + "state", + "zip_code", +] + +SHARED_COLS = [ + "record_id", + "report_date", + "report_year", + "company_name", + "company_name_no_legal", + "company_name_mphone", + "street_address", + "street_address_2", + "city", + "state", # could use state of incorporation from SEC + "zip_code", + "phone_number", +] + +MATCH_COLS = ["company_name", "state", "city", "street_address"] + +BLOCKING_RULES = [ + "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)", + "l.street_address = r.street_address", + "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city", + # "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2", +] + +company_name_comparison = cl.NameComparison( + "company_name_no_legal", + jaro_winkler_thresholds=[0.95], +) + +address_comparison = cl.LevenshteinAtThresholds( + "street_address", distance_threshold_or_thresholds=[1] +).configure(term_frequency_adjustments=True) + +state_comparison = cl.ExactMatch("state").configure(term_frequency_adjustments=True) +city_comparison = cl.NameComparison("city", jaro_winkler_thresholds=[0.9]) + +# blocking rules for estimating probability two random records match 
EIA_COL_MAP = {
    "utility_name_eia": "company_name",  # TODO: should be linking to owner or operator name?
    "address_2": "street_address_2",
}

# Prefix shared by every PUDL parquet file read in this module.
_PUDL_STABLE_PREFIX = "s3://pudl.catalyst.coop/stable"

# Yearly EIA-861 tables from which utility names are harvested, in read order.
_EIA861_NAME_TABLES = [
    "core_eia861__yearly_utility_data_misc",
    "core_eia861__yearly_operational_data_misc",
    "core_eia861__yearly_demand_side_management_misc",
    "core_eia861__yearly_energy_efficiency",
]


# TODO: make Dagster inputs instead of reading from AWS?
def harvest_eia861_utilities() -> pd.DataFrame:
    """Get the utilities contained in EIA Form 861.

    Starts from the EIA-861 utility association table, attaches a
    ``utility_name_eia`` harvested from the yearly EIA-861 tables listed in
    ``_EIA861_NAME_TABLES``, and then pulls ``street_address``, ``city`` and
    ``state`` from the EIA-861 mergers table for rows whose utility name
    equals a merger's ``new_parent`` in the same ``report_date``.

    TODO: In PUDL we should eventually implement an actual thorough
    harvesting of utilities from all EIA Form 861 tables, but this is
    good enough for now.

    Returns:
        A dataframe with one row per ``report_date`` / ``utility_id_eia``.
    """
    name_cols = ["report_date", "utility_id_eia", "utility_name_eia"]
    raw_eia861_df = pd.read_parquet(
        f"{_PUDL_STABLE_PREFIX}/core_eia861__assn_utility.parquet"
    )
    harvested_df = pd.concat(
        [
            pd.read_parquet(f"{_PUDL_STABLE_PREFIX}/{table}.parquet")[name_cols]
            for table in _EIA861_NAME_TABLES
        ]
    )
    # Keep a single name per utility and report date.
    eia861_df = raw_eia861_df.merge(
        harvested_df, on=["report_date", "utility_id_eia"], how="left"
    ).drop_duplicates(subset=["report_date", "utility_id_eia"])

    mergers_df = pd.read_parquet(
        f"{_PUDL_STABLE_PREFIX}/core_eia861__yearly_mergers.parquet"
    )
    mergers_df = mergers_df[mergers_df["new_parent"].notna()]
    eia861_df = eia861_df.merge(
        mergers_df[
            ["report_date", "new_parent", "merge_address", "merge_city", "merge_state"]
        ],
        how="left",
        left_on=["report_date", "utility_name_eia"],
        right_on=["report_date", "new_parent"],
    )
    eia861_df = eia861_df.rename(
        columns={"merge_address": "street_address", "merge_city": "city"}
    )
    # The merger join can fan out rows; collapse back to one row per utility
    # and report date, taking the first non-null value of each column.
    eia861_df = (
        eia861_df.groupby(["report_date", "utility_id_eia"]).first().reset_index()
    )

    # Prefer the state reported in the mergers table when there is one.
    eia861_df["state"] = eia861_df["state"].where(
        eia861_df["merge_state"].isnull(), eia861_df["merge_state"]
    )
    eia861_df = eia861_df.drop(columns=["new_parent", "merge_state"])
    return eia861_df


# ``outs`` is only accepted by ``multi_asset``. A single-output ``asset`` is
# named with ``name`` and routed with ``io_manager_key``. The asset key must be
# ``core_eia__parents_and_subsidiaries`` because downstream assets load it via
# ``AssetIn("core_eia__parents_and_subsidiaries")``.
# TODO: allow year partitions?
# TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS?
@asset(
    name="core_eia__parents_and_subsidiaries",
    io_manager_key="pandas_parquet_io_manager",
)
def eia_rl_input_table():
    """Create a table of EIA Form 860 and 861 utilities.

    Combines the PUDL yearly utilities output table with the utilities
    harvested from EIA Form 861, renames columns per ``EIA_COL_MAP``,
    normalizes names and addresses, and flattens the result across time on
    the (``company_name``, ``street_address``) pair.

    Returns:
        The cleaned utilities table with a ``record_id`` column created from
        the index of the flattened table.
    """
    raw_eia_df = pd.read_parquet(
        f"{_PUDL_STABLE_PREFIX}/out_eia__yearly_utilities.parquet"
    )
    eia861_df = harvest_eia861_utilities()
    eia_df = (
        pd.concat([raw_eia_df, eia861_df])
        .dropna(subset=["utility_name_eia"])
        .rename(columns=EIA_COL_MAP)
        .assign(
            report_date=lambda df: df["report_date"].astype("datetime64[ns]"),
            report_year=lambda df: df["report_date"].dt.year,
            # keep only the 5-digit part of the zip code
            zip_code=lambda df: df["zip_code"].str[:5],
        )
        .pipe(transform_company_name)
        .pipe(fill_street_address_nulls)
        .pipe(lambda df: df.fillna(np.nan))
        .reset_index(drop=True)
    )
    eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
    eia_df["street_address"] = expand_street_name_abbreviations(
        eia_df["street_address"]
    )
    eia_df = flatten_companies_across_time(
        df=eia_df, key_cols=["company_name", "street_address"]
    ).reset_index(names="record_id")

    return eia_df


production_assets = [eia_rl_input_table]
mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS + +logger = logging.getLogger(f"catalystcoop.{__name__}") + + +EX21_COL_MAP = {"subsidiary": "company_name", "loc": "location_of_inc"} +SEC_COL_MAP = { + "company_conformed_name": "company_name", + "street_1": "street_address", + "street_2": "street_address_2", + "zip": "zip_code", + "business_phone": "phone_number", +} + +INVALID_NAMES = [ + "llc", + "limited liability company", + "limited", + "ltd", + "iiii", + "inc", + "incorporated", + "partnership", + "i", + "name", + "company", + "&", + "", +] + + +def _remove_weird_sec_cols(sec_df) -> pd.DataFrame: + weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"] + for weird_col in weird_cols: + if weird_col not in sec_df: + continue + normal_col = weird_col[1:] + sec_df.loc[:, normal_col] = sec_df[normal_col].where( + sec_df[weird_col].isnull(), sec_df[weird_col] + ) + sec_df = sec_df.drop(columns=[weird_col]) + return sec_df + + +def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame: + """Merge metadata on to get a report year for extracted SEC data. + + Expects filename to be the index of the SEC dataframe. + """ + sec_df = sec_df.merge(md[["filename", "date_filed"]], how="left", on=["filename"]) + sec_df = sec_df.rename(columns={"date_filed": "report_date"}) + sec_df.loc[:, "report_year"] = ( + sec_df["report_date"].astype("datetime64[ns]").dt.year + ) + return sec_df + + +def get_sec_state_code_dict() -> dict[str, str]: + """Create a dictionary mapping state codes to their names. + + Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes + Published by SEC and reports valid state codes + for filers of Form D. Used to standardize the state codes + in the SEC 10K filings. The expanded names of the state codes + are comments in the XML file, so we have to read the XML in as + text and parse it. 
+ """ + # TODO: make a check to see if SEC has published a new version of this table + xml_filepath = ( + resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml" + ) + with Path.open(xml_filepath) as file: + xml_text = file.read() + + pattern = r'.*?' + state_code_dict = { + code.lower(): name.lower() + for code, name in re.findall(pattern, xml_text, re.DOTALL) + } + return state_code_dict + + +def clean_location_of_inc(df) -> pd.DataFrame: + """Clean location of incorporation column in SEC basic 10K or Ex. 21 dataframe. + + Arguments: + df: Ex. 21 or SEC 10K basic info dataframe with location_of_inc + column. + """ + if "state_of_incorporation" in df: + df.loc[:, "location_of_inc"] = df["state_of_incorporation"] + state_code_to_name = get_sec_state_code_dict() + df.loc[:, "location_of_inc"] = ( + df["location_of_inc"] + .replace(state_code_to_name) + .fillna(pd.NA) + .str.strip() + .str.lower() + .replace("", pd.NA) + ) + return df + + +def _add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame: + """Add the CIK of the parent company to Ex. 21 subsidiaries.""" + ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename( + columns={"cik": "parent_company_cik"} + ) + ex21_df.loc[:, "parent_company_cik"] = ( + ex21_df["parent_company_cik"].astype(str).str.zfill(10) + ) + return ex21_df + + +def match_ex21_subsidiaries_to_filer_company( + basic10k_df: pd.DataFrame, ex21_df: pd.DataFrame +) -> pd.DataFrame: + """Match Ex. 21 subsidiaries to filer companies. + + We want to assign CIKs to Ex. 21 subsidiaries if they in turn + file a 10k. To do this, we merge the Ex. 21 subsidiaries to 10k + filers on comapny name. If there are multiple matches with the same + company name we take the company with the most overlap in location of + incorporation and nearest report years. Then we merge the CIK back onto + the Ex. 21 df. + + Returns: + A dataframe of the Ex. 
21 subsidiaries with a column for the + subsidiaries CIK (null if the subsidiary doesn't file). + """ + basic10k_df = basic10k_df.drop_duplicates( + subset=[ + "central_index_key", + "company_name", + "location_of_inc", + "report_year", + ] + ) + merged_df = basic10k_df.merge( + ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21") + ) + # split up the location of incorporation on whitespace, creating a column + # with lists of word tokens + merged_df.loc[:, "loc_tokens_sec"] = ( + merged_df["location_of_inc_sec"].fillna("").str.lower().str.split() + ) + merged_df.loc[:, "loc_tokens_ex21"] = ( + merged_df["location_of_inc_ex21"].fillna("").str.lower().str.split() + ) + # get the number of words overlapping between location of incorporation tokens + merged_df["loc_overlap"] = merged_df.apply( + lambda row: len(set(row["loc_tokens_sec"]) & set(row["loc_tokens_ex21"])), + axis=1, + ) + # get the difference in report years + merged_df["report_year_diff"] = merged_df.apply( + lambda row: abs(int(row["report_year_sec"]) - int(row["report_year_ex21"])), + axis=1, + ) + merged_df = merged_df.sort_values( + by=[ + "company_name", + "location_of_inc_ex21", + "loc_overlap", + "report_year_diff", + ], + ascending=[True, True, False, True], + ) + # Select the row with the highest loc overlap and nearest report years + # for each company name, location, and parent company record + closest_match_df = merged_df.groupby( + ["company_name", "location_of_inc_ex21", "parent_company_cik"], as_index=False + ).first() + ex21_with_cik_df = ex21_df.merge( + closest_match_df[ + [ + "company_name", + "parent_company_cik", + "location_of_inc_ex21", + "central_index_key", + ] + ].rename(columns={"location_of_inc_ex21": "location_of_inc"}), + how="left", + on=["company_name", "location_of_inc", "parent_company_cik"], + ).rename(columns={"central_index_key": "subsidiary_cik"}) + # if a subsidiary doesn't have a CIK and has a null location + # but its company name was assigned a CIK 
def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame:
    """Create an sec_company_id for Ex. 21 subsidiaries.

    This is a unique identifier string for Ex. 21 subsidiaries.
    This ID is necessary for tracking subsidiaries who aren't ultimately
    matched to a 10K filer company.

    The ID is ``<parent_company_cik>_<company_name>_<location_of_inc>``. A
    missing location of incorporation contributes an empty string, so such a
    subsidiary still receives an ID rather than a null (string concatenation
    with a null value yields null).

    Args:
        ex21_df: Ex. 21 table with ``parent_company_cik``, ``company_name``
            and ``location_of_inc`` columns. Modified in place.

    Returns:
        The same dataframe with the ``sec_company_id`` column set.
    """
    ex21_df.loc[:, "sec_company_id"] = (
        ex21_df["parent_company_cik"]
        + "_"
        + ex21_df["company_name"]
        + "_"
        + ex21_df["location_of_inc"].fillna("")
    )
    return ex21_df


@asset(
    ins={
        "ex21_dfs": AssetIn("ex21_company_ownership_info"),
        "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
    },
)
def transformed_ex21_subsidiary_table(
    ex21_dfs: dict[str, pd.DataFrame],
    sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
) -> pd.DataFrame:
    """Transform Ex. 21 table of subsidiaries before combining with basic 10k table.

    Args:
        ex21_dfs: Extracted Ex. 21 tables keyed by partition.
        sec10k_filing_metadata_dfs: Filing metadata tables keyed by partition.

    Returns:
        A single cleaned Ex. 21 table, flattened across time on
        ``sec_company_id``.
    """
    ex21_df = pd.concat(ex21_dfs.values())
    sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())

    # The helper adds the "filename" column and returns the whole frame, so
    # rebind the frame instead of assigning the result to a single column.
    ex21_df = convert_ex21_id_to_filename(ex21_df)
    ex21_df = ex21_df.drop(columns=["id"])
    ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata)
    ex21_df = ex21_df.rename(columns=EX21_COL_MAP)
    ex21_df = clean_location_of_inc(ex21_df)
    ex21_df = transform_company_name(ex21_df)
    ex21_df = _add_parent_company_cik(ex21_df, sec10k_filing_metadata)
    # add an sec_company_id, ultimately this ID become the subsidiary's CIK
    # if the subsidiary is matched to an SEC filer
    ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df)
    ex21_df = flatten_companies_across_time(
        df=ex21_df, key_cols=["sec_company_id"], date_col="report_year"
    )
    ex21_df = ex21_df.fillna(np.nan)

    return ex21_df


def transform_basic10k_table(
    basic_10k_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
) -> pd.DataFrame:
    """Transformations on SEC basic 10K filer table to prepare for record linkage.

    Args:
        basic_10k_df: Long-format basic 10K table with ``filename``, ``key``
            and ``value`` available after ``reset_index()``.
        sec10k_filing_metadata: Filing metadata used to attach a report date.

    Returns:
        One row per unique company name and street address pair.
    """
    # Go from long (filename, key, value) to one row per filing, keeping the
    # first value seen for each key.
    basic_10k_df = basic_10k_df.reset_index().pivot_table(
        values="value", index="filename", columns="key", aggfunc="first"
    )
    basic_10k_df.columns.name = None
    basic_10k_df = (
        basic_10k_df.reset_index()
        .pipe(_remove_weird_sec_cols)
        .pipe(_add_report_year_to_sec, sec10k_filing_metadata)
        .rename(columns=SEC_COL_MAP)
        .pipe(clean_location_of_inc)
        .pipe(transform_company_name)
        .assign(
            zip_code=lambda df: df["zip_code"].str[:5],
            files_10k=True,
            # filers are identified by their CIK
            sec_company_id=lambda df: df["central_index_key"],
        )
        .pipe(fill_street_address_nulls)
    )
    basic_10k_df[STR_COLS] = basic_10k_df[STR_COLS].apply(
        lambda x: x.str.strip().str.lower()
    )
    basic_10k_df["street_address"] = expand_street_name_abbreviations(
        basic_10k_df["street_address"]
    )
    # flatten across time on unique company name and address pair
    basic_10k_df = flatten_companies_across_time(
        df=basic_10k_df, key_cols=["company_name", "street_address"]
    )

    return basic_10k_df


@asset(
    ins={
        "basic_10k_dfs": AssetIn("basic_10k_company_info"),
        "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
    },
)
def core_sec_10k__filers(
    basic_10k_dfs: dict[str, pd.DataFrame],
    sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
) -> pd.DataFrame:
    """Asset for creating a cleaned basic 10k table with EIA utility matched.

    Flatten the table across time to only keep the most recent record
    for each unique company name and address pair. Clean table and link filers
    to EIA utilities.

    Note: the EIA match is not implemented yet (see TODO below), so the
    returned table does not carry a ``utility_id_eia`` column.
    """
    basic_10k_df = pd.concat(basic_10k_dfs.values())
    sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())
    basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata)
    out_df = basic_10k_df.fillna(np.nan).reset_index(names="record_id")
    # match EIA utilities to filers
    # TODO: Here we conduct the match to EIA and add on a column with utility_id_eia
    return out_df


@asset(
    ins={
        "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"),
        "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
        "clean_eia_df": AssetIn("core_eia__parents_and_subsidiaries"),
    },
)
def out_sec_10k__parents_and_subsidiaries(
    sec_10k_filers_matched_df: pd.DataFrame,
    clean_ex21_df: pd.DataFrame,
    clean_eia_df: pd.DataFrame,
) -> pd.DataFrame:
    """Asset for creating an SEC 10K output table.

    Add in Ex. 21 subsidiaries and link them to already present
    filing companies. Create an sec_company_id for subsidiaries
    that aren't linked to a CIK.

    Args:
        sec_10k_filers_matched_df: Cleaned 10K filer table.
        clean_ex21_df: Cleaned Ex. 21 subsidiary table.
        clean_eia_df: Cleaned EIA utility table.

    Returns:
        The filer rows (with parent company info where a filer is also listed
        as a subsidiary) concatenated with the non-filing subsidiaries.
    """
    ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
        basic10k_df=sec_10k_filers_matched_df, ex21_df=clean_ex21_df
    )
    has_cik = ex21_df_with_cik["central_index_key"].notna()
    # Only subsidiaries that were assigned a CIK can contribute parent info;
    # null keys are left out so they can never pair up with each other.
    sec_10k_filers_matched_df = sec_10k_filers_matched_df.merge(
        ex21_df_with_cik.loc[
            has_cik, ["central_index_key", "parent_company_cik", "own_per"]
        ],
        how="left",
        on="central_index_key",
    )
    # get the subsidiary companies that weren't matched to a 10K filing company
    # (copy so the assignment below doesn't write into a slice of the original)
    ex21_non_filing_subs_df = ex21_df_with_cik.loc[~has_cik].copy()
    ex21_non_filing_subs_df["files_10k"] = False
    # the last step is to take the EIA utilities that haven't been matched
    # to a filer company, and merge them by company name onto the Ex. 21 subs
    if "utility_id_eia" in sec_10k_filers_matched_df.columns:
        matched_utility_ids = sec_10k_filers_matched_df["utility_id_eia"].unique()
    else:
        # The filer-to-EIA match hasn't added this column: nothing is matched.
        matched_utility_ids = []
    unmatched_eia_df = clean_eia_df[
        ~clean_eia_df["utility_id_eia"].isin(matched_utility_ids)
    ].drop_duplicates(subset="company_name")
    ex21_non_filing_subs_df = ex21_non_filing_subs_df.merge(
        unmatched_eia_df[["utility_id_eia", "company_name"]],
        how="left",
        on="company_name",
    ).drop_duplicates(subset="sec_company_id")
    # nunique() ignores nulls, so unmatched subsidiaries aren't counted as a match
    n_matched_utilities = ex21_non_filing_subs_df["utility_id_eia"].nunique()
    logger.info(
        f"Ex. 21 subsidiary names matched to an EIA utility name: {n_matched_utilities}"
    )
    out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df])
    return out_df


production_assets = [
    core_sec_10k__filers,
    transformed_ex21_subsidiary_table,
    out_sec_10k__parents_and_subsidiaries,
]
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json new file mode 100644 index 0000000..e305113 --- /dev/null +++ b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json @@ -0,0 +1,203 @@ +{ + "aly": ["alley", "allee", "ally"], + "anx": ["anex", "annex", "annx"], + "arc": ["arcade"], + "ave": ["avenue", "av", "aven", "avenu", "avn", "avnue"], + "byu": ["bayou", "bayoo"], + "bch": ["beach"], + "bnd": ["bend"], + "blf": ["bluff", "bluf"], + "blfs": ["bluffs"], + "btm": ["bottom", "bot", "bottm"], + "blvd": ["boulevard", "boul", "boulv"], + "br": ["branch", "brnch"], + "brg": ["bridge", "brdge"], + "brk": ["brook"], + "brks": ["brooks"], + "bg": ["burg"], + "bgs": ["burgs"], + "byp": ["bypass", "bypa", "bypas", "byps"], + "cp": ["camp", "cmp"], + "cyn": ["canyon", "canyn", "cnyn"], + "cpe": ["cape"], + "cswy": ["causeway", "causwa"], + "ctr": ["center", "cen", "cent", "centr", "centre", "cnter", "cntr"], + "ctrs": ["centers"], + "cir": ["circle", "circ", "circl", "crcl", "crcle"], + "cirs": ["circles"], + "clf": ["cliff"], + "clfs": ["cliffs"], + "clb": ["club"], + "cmn": ["common"], + "cmns": ["commons"], + "cor": ["corner"], + "cors": ["corners"], + "crse": ["course"], + "ct": ["court"], + "cts": ["courts"], + "cv": ["cove"], + "cvs": 
["coves"], + "crk": ["creek"], + "cres": ["crescent", "crsent", "crsnt"], + "crst": ["crest"], + "xing": ["crossing", "crssng"], + "xrd": ["crossroad"], + "xrds": ["crossroads"], + "curv": ["curve"], + "dl": ["dale"], + "dm": ["dam"], + "dv": ["divide", "div", "dvd"], + "dr": ["drive", "driv", "drv"], + "drs": ["drives"], + "est": ["estate"], + "ests": ["estates"], + "expy": ["expressway", "exp", "expr", "express", "expw"], + "ext": ["extension", "extn", "extnsn"], + "exts": ["extensions"], + "fls": ["falls"], + "fry": ["ferry", "frry"], + "fld": ["field"], + "flds": ["fields"], + "flt": ["flat"], + "flts": ["flats"], + "frd": ["ford"], + "frds": ["fords"], + "frst": ["forest", "forests"], + "frg": ["forge", "forg"], + "frgs": ["forges"], + "frk": ["fork"], + "frks": ["forks"], + "ft": ["fort", "frt"], + "fwy": ["freeway", "freewy", "frway", "frwy"], + "gdn": ["garden", "gardn", "grden", "grdn"], + "gdns": ["gardens", "grdns"], + "gtwy": ["gateway", "gatewy", "gatway", "gtway"], + "gln": ["glen"], + "glns": ["glens"], + "grn": ["green"], + "grns": ["greens"], + "grv": ["grove", "grov"], + "grvs": ["groves"], + "hbr": ["harbor", "harb", "harbr", "hrbor"], + "hbrs": ["harbors"], + "hvn": ["haven"], + "hts": ["heights", "ht"], + "hwy": ["highway", "highwy", "hiway", "hiwy", "hway"], + "hl": ["hill"], + "hls": ["hills"], + "holw": ["hollow", "hllw", "hollows", "holws"], + "inlt": ["inlet"], + "is": ["island", "islnd"], + "iss": ["islands", "islnds"], + "isle": ["isles"], + "jct": ["junction", "jction", "jctn", "junctn", "juncton"], + "jcts": ["junctions", "jctns"], + "ky": ["key"], + "kys": ["keys"], + "knl": ["knoll", "knol"], + "knls": ["knolls"], + "lk": ["lake"], + "lks": ["lakes"], + "land": ["land"], + "lndg": ["landing", "lndng"], + "ln": ["lane"], + "lgt": ["light"], + "lgts": ["lights"], + "lf": ["loaf"], + "lck": ["lock"], + "lcks": ["locks"], + "ldg": ["lodge", "ldge", "lodg"], + "loop": ["loops"], + "mall": ["mall"], + "mnr": ["manor"], + "mnrs": 
["manors"], + "mdw": ["meadow"], + "mdws": ["meadows", "mdw", "medows"], + "mews": ["mews"], + "ml": ["mill"], + "mls": ["mills"], + "msn": ["mission", "missn", "mssn"], + "mtwy": ["motorway"], + "mt": ["mount", "mnt"], + "mtn": ["mountain", "mntain", "mntn", "mountin", "mtin"], + "mtns": ["mountains", "mntns"], + "nck": ["neck"], + "orch": ["orchard", "orchrd"], + "oval": ["ovl"], + "opas": ["overpass"], + "park": ["parks"], + "pkwy": ["parkway", "parkwy", "pkway", "pky", "parkways", "pkwys"], + "pass": ["pass"], + "psge": ["passage"], + "path": ["paths"], + "pike": ["pikes"], + "pne": ["pine"], + "pnes": ["pines"], + "pl": ["place"], + "pln": ["plain"], + "plns": ["plains"], + "plz": ["plaza", "plza"], + "pt": ["point"], + "pts": ["points"], + "prt": ["port"], + "prts": ["ports"], + "pr": ["prairie", "prr"], + "radl": ["radial", "rad", "radiel"], + "ramp": ["ramp"], + "rnch": ["ranch", "ranches", "rnchs"], + "rpd": ["rapid"], + "rpds": ["rapids"], + "rst": ["rest"], + "rdg": ["ridge", "rdge"], + "rdgs": ["ridges"], + "riv": ["river", "rvr", "rivr"], + "rd": ["road"], + "rds": ["roads"], + "rte": ["route"], + "row": ["row"], + "rue": ["rue"], + "run": ["run"], + "shl": ["shoal"], + "shls": ["shoals"], + "shr": ["shore", "shoar"], + "shrs": ["shores", "shoars"], + "skwy": ["skyway"], + "spg": ["spring", "spng", "sprng"], + "spgs": ["springs", "spngs", "sprngs"], + "spur": ["spurs"], + "sq": ["square", "sqr", "sqre", "squ"], + "sqs": ["squares", "sqrs"], + "sta": ["station", "statn", "stn"], + "stra": ["stravenue", "strav", "straven", "stravn", "strvn", "strvnue"], + "strm": ["stream", "streme"], + "st": ["street", "strt", "str"], + "sts": ["streets"], + "smt": ["summit", "sumit", "sumitt"], + "ter": ["terrace", "terr"], + "trwy": ["throughway"], + "trce": ["trace", "traces"], + "trak": ["track", "tracks", "trk", "trks"], + "trfy": ["trafficway"], + "trl": ["trail", "trails", "trls"], + "trlr": ["trailer", "trlrs"], + "tunl": ["tunnel", "tunel", "tunls", "tunnels", 
"tunnl"], + "tpke": ["turnpike", "trnpk", "turnpk"], + "upas": ["underpass"], + "un": ["union"], + "uns": ["unions"], + "vly": ["valley", "vally", "vlly"], + "vlys": ["valleys"], + "via": ["viaduct", "vdct", "viadct"], + "vw": ["view"], + "vws": ["views"], + "vlg": ["village", "vill", "villag", "villg", "villiage"], + "vlgs": ["villages"], + "vl": ["ville"], + "vis": ["vista", "vist", "vst", "vsta"], + "walk": ["walks"], + "wall": ["wall"], + "way": ["wy"], + "ways": ["ways"], + "wl": ["well"], + "wls": ["wells"] +} diff --git a/test_environment.yml b/test_environment.yml index 5fa9b2d..f54968d 100644 --- a/test_environment.yml +++ b/test_environment.yml @@ -6,7 +6,7 @@ channels: dependencies: # Packages required for setting up the environment - pip>=21,<24 - - python>=3.10,<3.12 + - python>=3.10,<=3.12 - setuptools>=66,<69 # Packages specified in setup.py that need or benefit from binary conda packages @@ -29,6 +29,10 @@ dependencies: - pytorch>=2.2,<3 - torchvision + # GDAL is a transitive dependency whose binaries must match those installed by the + # pudl-dev conda environment, so we also install it with conda here. + - gdal==3.9.3 # pinned to ensure it matches pudl-dev environment exactly. + # Use pip to install the package defined by this repo for development: - pip: - --editable ./[dev,docs,tests,types] diff --git a/workspace.yaml b/workspace.yaml index 144aada..a208373 100644 --- a/workspace.yaml +++ b/workspace.yaml @@ -1,2 +1,3 @@ load_from: - python_module: mozilla_sec_eia.models.sec10k + - python_module: mozilla_sec_eia.models.sec_eia_record_linkage