diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9919311b..fcbf96f5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,7 @@ repos: rev: 22.3.0 hooks: - id: black - language_version: python3.9 + language_version: python3.10 # Check for PEP8 non-compliance, code complexity, style, errors, etc: - repo: https://github.com/PyCQA/flake8 diff --git a/docs/data-mart/county_commission_election_info.md b/docs/data-mart/county_commission_election_info.md index d6a08ced..d46262c4 100644 --- a/docs/data-mart/county_commission_election_info.md +++ b/docs/data-mart/county_commission_election_info.md @@ -1,19 +1,37 @@ # county_commission_election_info -This table contains all upcoming county commissioner elections from Ballot Ready. +This table contains the upcoming general, primary and run off races for county commissioners. + +Not all counties will have races. ## Column Descriptions -**Unique Key Column(s):** (`election_id`, `county_fips_id`) +**Unique Key Column(s):** (`county_id_fips`) |Subject|Column|Description|Source|Notes| |----|----|----|----|----| |Elections|`county_name`|County name|Census|| ||`county_id_fips`|County FIPS ID|Ballot Ready|| -||`election_id`|A unique identifier for each election in the BallotReady database||| -||`election_name`|A descriptive name for the election according to BallotReady's naming conventions.||| -||`election_day`|The date of the election||| -||`total_n_of_seats`|Total number of people who will be elected to positions for the election.||| -||`total_n_races`|Total number of positions in the election. This will always be less than or equal to `total_n_of_seats` because a position can have multiple seats.||| -||`all_race_names`|All position names up for election.||| -||`frequency`|How often the position is regularly scheduled for election. 
This field describes a position, not an election so the maximum value is selected in the aggregation.||| -||`reference_year`|Refers to a base year that the election frequency can be calculated from, either into the future or the past. This field describes a position, not an election so the maximum value is selected in the aggregation.||| +||`next_general_election_id`|Unique identifier for the next general election for county commissioners in the county||| +||`next_general_election_name`|A descriptive name for the next general election for county commissioners||| +||`next_general_election_day`|The date of the next general election for county commissioners||| +||`next_general_total_n_seats`|Total number of people who will be elected to positions for the election||| +||`next_general_total_n_races`|Total number of positions in the election. This will always be less than or equal to `total_n_seats` because a position can have multiple seats.||| +||`next_general_all_race_names`|All position names up for election||| +||`next_general_frequency`|How often the position is regularly scheduled for election. This field describes a position, not an election so the mode value is selected in the aggregation.||| +||`next_general_reference_year`|Refers to a base year that the election frequency can be calculated from, either into the future or the past. This field describes a position, not an election so the maximum value is selected in the aggregation.||| +||`next_primary_election_id`|Unique identifier for the next primary election for county commissioners in the county||| +||`next_primary_election_name`|A descriptive name for the next primary election for county commissioners||| +||`next_primary_election_day`|The date of the next primary election for county commissioners||| +||`next_primary_total_n_seats`|Total number of people who will be elected to positions for the election||| +||`next_primary_total_n_races`|Total number of positions in the election. 
This will always be less than or equal to `total_n_seats` because a position can have multiple seats.||| +||`next_primary_all_race_names`|All position names up for election||| +||`next_primary_frequency`|How often the position is regularly scheduled for election. This field describes a position, not an election so the mode value is selected in the aggregation.||| +||`next_primary_reference_year`|Refers to a base year that the election frequency can be calculated from, either into the future or the past. This field describes a position, not an election so the maximum value is selected in the aggregation.||| +||`next_run_off_election_id`|Unique identifier for the next run off election for county commissioners in the county||| +||`next_run_off_election_name`|A descriptive name for the next run off election for county commissioners||| +||`next_run_off_election_day`|The date of the next run off election for county commissioners||| +||`next_run_off_total_n_seats`|Total number of people who will be elected to positions for the election||| +||`next_run_off_total_n_races`|Total number of positions in the election. This will always be less than or equal to `total_n_seats` because a position can have multiple seats.||| +||`next_run_off_all_race_names`|All position names up for election||| +||`next_run_off_frequency`|How often the position is regularly scheduled for election. This field describes a position, not an election so the mode value is selected in the aggregation.||| +||`next_run_off_reference_year`|Refers to a base year that the election frequency can be calculated from, either into the future or the past. 
This field describes a position, not an election so the maximum value is selected in the aggregation.||| diff --git a/notebooks/33-bdn-ballot-ready-normalization.ipynb b/notebooks/33-bdn-ballot-ready-normalization.ipynb new file mode 100644 index 00000000..9afb3e06 --- /dev/null +++ b/notebooks/33-bdn-ballot-ready-normalization.ipynb @@ -0,0 +1,3152 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "27b8d0af-63da-4b7e-8c03-ae2fec3d5b1e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/app/.local/lib/python3.10/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.3-CAPI-1.16.1). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/app/.local/lib/python3.10/site-packages/pudl/analysis/spatial.py:7: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. 
If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n", + "2023-09-12 14:02:08 [ INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.\n", + "2023-09-12 14:02:11 [ INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import dbcp\n", + "\n", + "source_uri = \"gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv\"\n", + "raw_dfs = dbcp.extract.ballot_ready.extract(source_uri)\n", + "raw_ballot_ready = raw_dfs[\"raw_ballot_ready\"]\n", + "br_election_data = dbcp.transform.ballot_ready._explode_counties(raw_ballot_ready)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5a67b7aa-9be1-4a9e-9787-02a968d452a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 188074 entries, 1543 to 82775\n", + "Data columns (total 29 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 election_id 188074 non-null Int64 \n", + " 1 election_name 188074 non-null string \n", + " 2 election_day 188074 non-null datetime64[ns]\n", + " 3 race_id 188074 non-null Int64 \n", + " 4 is_primary 188074 non-null boolean \n", + " 5 is_runoff 188074 non-null boolean \n", + " 6 is_unexpired 188074 non-null boolean \n", + " 7 position_id 188074 non-null Int64 \n", + " 8 position_name 188074 non-null string \n", + " 9 sub_area_name 114957 non-null string \n", + " 10 sub_area_value 125790 non-null string \n", + " 11 sub_area_name_secondary 11501 non-null string \n", + " 12 sub_area_value_secondary 12522 non-null string \n", + " 13 raw_state 188074 non-null string \n", + " 14 level 188074 non-null 
string \n", + " 15 tier 188074 non-null Int64 \n", + " 16 is_judicial 188074 non-null boolean \n", + " 17 is_retention 188074 non-null boolean \n", + " 18 number_of_seats 188074 non-null Int64 \n", + " 19 normalized_position_id 188074 non-null Int64 \n", + " 20 normalized_position_name 188074 non-null string \n", + " 21 frequency 188074 non-null string \n", + " 22 reference_year 188074 non-null Int64 \n", + " 23 partisan_type 188060 non-null string \n", + " 24 raw_county 188074 non-null object \n", + " 25 race_created_at 188074 non-null datetime64[ns]\n", + " 26 race_updated_at 188074 non-null datetime64[ns]\n", + " 27 state_id_fips 188074 non-null string \n", + " 28 county_id_fips 188074 non-null object \n", + "dtypes: Int64(7), boolean(5), datetime64[ns](3), object(2), string(12)\n", + "memory usage: 38.9+ MB\n" + ] + } + ], + "source": [ + "br_election_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a8870a44-0b83-44c4-b4be-e85e87e3431c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1543 False\n", + "1545 False\n", + "1546 False\n", + "9958 False\n", + "41760 False\n", + " ... 
\n", + "82771 False\n", + "82772 False\n", + "82773 False\n", + "82774 False\n", + "82775 False\n", + "Name: raw_county, Length: 188074, dtype: bool" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.raw_county.isna()" + ] + }, + { + "cell_type": "markdown", + "id": "8d910585-4b93-4cf0-b428-d8816f1011e7", + "metadata": {}, + "source": [ + "## Elections" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "59627963-428e-4e52-b20f-8469eb72fe5b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 188074 entries, 1543 to 82775\n", + "Data columns (total 29 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 election_id 188074 non-null Int64 \n", + " 1 election_name 188074 non-null string \n", + " 2 election_day 188074 non-null datetime64[ns]\n", + " 3 race_id 188074 non-null Int64 \n", + " 4 is_primary 188074 non-null boolean \n", + " 5 is_runoff 188074 non-null boolean \n", + " 6 is_unexpired 188074 non-null boolean \n", + " 7 position_id 188074 non-null Int64 \n", + " 8 position_name 188074 non-null string \n", + " 9 sub_area_name 114957 non-null string \n", + " 10 sub_area_value 125790 non-null string \n", + " 11 sub_area_name_secondary 11501 non-null string \n", + " 12 sub_area_value_secondary 12522 non-null string \n", + " 13 raw_state 188074 non-null string \n", + " 14 level 188074 non-null string \n", + " 15 tier 188074 non-null Int64 \n", + " 16 is_judicial 188074 non-null boolean \n", + " 17 is_retention 188074 non-null boolean \n", + " 18 number_of_seats 188074 non-null Int64 \n", + " 19 normalized_position_id 188074 non-null Int64 \n", + " 20 normalized_position_name 188074 non-null string \n", + " 21 frequency 188074 non-null string \n", + " 22 reference_year 188074 non-null Int64 \n", + " 23 partisan_type 188060 non-null string \n", + " 24 raw_county 
188074 non-null object \n", + " 25 race_created_at 188074 non-null datetime64[ns]\n", + " 26 race_updated_at 188074 non-null datetime64[ns]\n", + " 27 state_id_fips 188074 non-null string \n", + " 28 county_id_fips 188074 non-null object \n", + "dtypes: Int64(7), boolean(5), datetime64[ns](3), object(2), string(12)\n", + "memory usage: 38.9+ MB\n" + ] + } + ], + "source": [ + "br_election_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b1ea7c03-362d-4777-aa62-415495d04d18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "election_id True\n", + "election_name True\n", + "election_day True\n", + "dtype: bool" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "election_fields = [\n", + " \"election_id\",\n", + " \"election_name\",\n", + " \"election_day\",\n", + "]\n", + "(br_election_data.groupby(\"election_id\")[election_fields].nunique() <= 1).all()" + ] + }, + { + "cell_type": "markdown", + "id": "d85b81e7-58eb-46c0-ba55-4b614a6853e2", + "metadata": {}, + "source": [ + "## Position" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cceda727-eb0b-4d3e-864b-3b02ba58c6a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "reference_year False\n", + "position_id True\n", + "position_name True\n", + "sub_area_name True\n", + "sub_area_value True\n", + "sub_area_name_secondary True\n", + "sub_area_value_secondary True\n", + "raw_state True\n", + "level True\n", + "tier True\n", + "is_judicial True\n", + "is_retention True\n", + "normalized_position_id True\n", + "normalized_position_name True\n", + "frequency False\n", + "partisan_type True\n", + "dtype: bool" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "position_fields = [\n", + " \"reference_year\",\n", + " \"position_id\",\n", + " \"position_name\",\n", + " \"sub_area_name\",\n", + " 
\"sub_area_value\",\n", + " \"sub_area_name_secondary\",\n", + " \"sub_area_value_secondary\",\n", + " \"raw_state\",\n", + " \"level\",\n", + " \"tier\",\n", + " \"is_judicial\",\n", + " \"is_retention\",\n", + " \"normalized_position_id\",\n", + " \"normalized_position_name\",\n", + " \"frequency\",\n", + " \"partisan_type\", \n", + "]\n", + "\n", + "(br_election_data.groupby(\"position_id\")[position_fields].nunique() <= 1).all()" + ] + }, + { + "cell_type": "markdown", + "id": "f9bec5d3-9e2f-4b66-965a-019dfabdcae4", + "metadata": {}, + "source": [ + "### Frequency" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b3600e10-5515-47e1-896b-44ec933d8a96", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 37405\n", + "2 1\n", + "Name: frequency, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freq_group = br_election_data.groupby(\"position_id\")[\"frequency\"]\n", + "freq_group.nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a41c6129-f414-4b31-83b9-704a1b62ed0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "position_id\n", + "156594 [[4], [2]]\n", + "Name: frequency, dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freq_group.unique()[freq_group.nunique() > 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f116b062-ef43-4470-8bd2-2e1da6001d1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
election_idelection_nameelection_dayrace_idis_primaryis_runoffis_unexpiredposition_idposition_namesub_area_namesub_area_valuesub_area_name_secondarysub_area_value_secondaryraw_stateleveltieris_judicialis_retentionnumber_of_seatsnormalized_position_idnormalized_position_namefrequencyreference_yearpartisan_typeraw_countyrace_created_atrace_updated_atstate_id_fipscounty_id_fips
783834317California General Election2024-11-052020782FalseFalseFalse156594San Jose City Mayor<NA><NA><NA><NA>CAcity3FalseFalse11500City Executive//Mayor[4]2024nonpartisanSanta Clara County2023-01-26 22:12:14.5442023-01-26 22:12:14.5440606085
783845367California Primary Election2024-03-052020783TrueFalseFalse156594San Jose City Mayor<NA><NA><NA><NA>CAcity3FalseFalse11500City Executive//Mayor[2]2022nonpartisanSanta Clara County2023-01-26 22:12:14.6462023-01-26 22:12:14.6460606085
\n", + "
" + ], + "text/plain": [ + " election_id election_name election_day race_id is_primary is_runoff is_unexpired position_id position_name sub_area_name sub_area_value sub_area_name_secondary sub_area_value_secondary raw_state level tier is_judicial is_retention number_of_seats normalized_position_id normalized_position_name frequency reference_year partisan_type raw_county race_created_at race_updated_at state_id_fips county_id_fips\n", + "78383 4317 California General Election 2024-11-05 2020782 False False False 156594 San Jose City Mayor CA city 3 False False 1 1500 City Executive//Mayor [4] 2024 nonpartisan Santa Clara County 2023-01-26 22:12:14.544 2023-01-26 22:12:14.544 06 06085\n", + "78384 5367 California Primary Election 2024-03-05 2020783 True False False 156594 San Jose City Mayor CA city 3 False False 1 1500 City Executive//Mayor [2] 2022 nonpartisan Santa Clara County 2023-01-26 22:12:14.646 2023-01-26 22:12:14.646 06 06085" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.query(\"position_id == 156594\")" + ] + }, + { + "cell_type": "markdown", + "id": "e6e960d5-76e3-4aeb-8044-bc5b4bb4ba67", + "metadata": {}, + "source": [ + "Not sure if this one instance of a non-unique frequency is a Ballot Ready issue or expected." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cd073ca8-ed4d-49b1-8690-773ed64b4809", + "metadata": {}, + "outputs": [], + "source": [ + "new_index = br_election_data.position_id.max() + 1\n", + "assert new_index not in br_election_data.position_id\n", + "br_election_data.loc[br_election_data.race_id == 2020783, \"position_id\"] = new_index" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cdd710d3-8557-406f-aac0-e7cc308777db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "reference_year True\n", + "position_id True\n", + "position_name True\n", + "sub_area_name True\n", + "sub_area_value True\n", + "sub_area_name_secondary True\n", + "sub_area_value_secondary True\n", + "raw_state True\n", + "level True\n", + "tier True\n", + "is_judicial True\n", + "is_retention True\n", + "normalized_position_id True\n", + "normalized_position_name True\n", + "frequency True\n", + "partisan_type True\n", + "dtype: bool" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(br_election_data.groupby(\"position_id\")[position_fields].nunique() <= 1).all()" + ] + }, + { + "cell_type": "markdown", + "id": "2bb8cb2b-acf4-4a93-8b3a-499fb90d9fa3", + "metadata": { + "tags": [] + }, + "source": [ + "### reference year" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "61bb412f-7206-4660-8047-b65148267617", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 37407\n", + "Name: reference_year, dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ref_group = br_election_data.groupby(\"position_id\")[\"reference_year\"]\n", + "ref_group.nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f93b088f-a0ae-432f-bbe8-9a2bbd0c96e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Series([], Name: 
reference_year, dtype: object)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ref_group.unique()[ref_group.nunique() > 1]" + ] + }, + { + "cell_type": "markdown", + "id": "03c2f35d-a52a-4a18-94b0-af31c20dde1a", + "metadata": {}, + "source": [ + "OK, the non-unique reference year came from the same problem position (San Jose City Mayor in Santa Clara County); after giving its primary race a new position ID, every position has a single reference year." + ] + }, + { + "cell_type": "markdown", + "id": "ac095fbc-f8c7-4756-8b78-311f05c2a98f", + "metadata": {}, + "source": [ + "## Race" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c5332a63-7b24-40f9-8a64-df606c8f3b8a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "race_id True\n", + "is_primary True\n", + "is_runoff True\n", + "is_unexpired True\n", + "number_of_seats True\n", + "race_created_at True\n", + "race_updated_at True\n", + "raw_state True\n", + "raw_county False\n", + "state_id_fips True\n", + "county_id_fips False\n", + "dtype: bool" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "race_fields = [\n", + " \"race_id\",\n", + " \"is_primary\",\n", + " \"is_runoff\",\n", + " \"is_unexpired\",\n", + " \"number_of_seats\",\n", + " \"race_created_at\",\n", + " \"race_updated_at\",\n", + " \"raw_state\",\n", + " \"raw_county\",\n", + " \"state_id_fips\",\n", + " \"county_id_fips\"\n", + "]\n", + "\n", + "(br_election_data.groupby(\"race_id\")[race_fields].nunique() <= 1).all()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d26fcd3e-9cae-41d5-88bb-e7a90abecee2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
raw_county
race_id
13689735
13689744
136897639
136897939
13689816
......
277800114
27780026
27780036
27780045
27780056
\n", + "

15444 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " raw_county\n", + "race_id \n", + "1368973 5\n", + "1368974 4\n", + "1368976 39\n", + "1368979 39\n", + "1368981 6\n", + "... ...\n", + "2778001 14\n", + "2778002 6\n", + "2778003 6\n", + "2778004 5\n", + "2778005 6\n", + "\n", + "[15444 rows x 1 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.groupby(\"race_id\")[race_fields].nunique()[\"raw_county\"].to_frame().query(\"raw_county > 1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6caa10d4-c400-4d69-ab22-26109e52be26", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "01a985ca-7f60-42ba-b3b7-71a7a4f7660d", + "metadata": {}, + "source": [ + "## Normalize" + ] + }, + { + "cell_type": "markdown", + "id": "6114388c-7179-46a7-8a60-2ae34c6b2510", + "metadata": {}, + "source": [ + "### Elections\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "411c7b67-855e-4181-88e9-4aa034d1cbde", + "metadata": {}, + "outputs": [], + "source": [ + "br_elections = br_election_data.drop_duplicates(subset=election_fields)[election_fields].copy()\n", + "\n", + "assert br_elections.election_id.is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "61237f44-1a90-46bb-bb2e-a31be82d0b5c", + "metadata": {}, + "source": [ + "### Positions" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d76b4925-92d8-4390-bddc-d4e7cf5453c5", + "metadata": {}, + "outputs": [], + "source": [ + "br_positions = br_election_data.drop_duplicates(subset=position_fields)[position_fields].copy()\n", + "\n", + "assert br_positions.position_id.is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "5f87b801-d419-432d-bdb3-a64e0041b2f5", + "metadata": {}, + "source": [ + "### Races" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e61112a0-2ce0-4d52-a7ed-9edda948d40a", + "metadata": {}, + "outputs": [], + 
"source": [ + "race_fields = race_fields + [\"election_id\", \"position_id\"]\n", + "br_races = br_election_data.drop_duplicates(subset=race_fields)[race_fields].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "067b9120-1b4b-44bd-af58-3643109de2ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(br_races) == len(br_election_data)" + ] + }, + { + "cell_type": "markdown", + "id": "6b2ca0dc-1619-4613-9eaf-6090c1b006b4", + "metadata": {}, + "source": [ + "## Positions - Counties\n", + "A county can have multiple positions. A position can be in multiple counties. I think it makes more sense to include county information in a position - county table because positions are a more static entity, whereas races are a position for a given election." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "5f202087-d7bd-4d0e-b3a0-329981f9b8e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "position_id\n", + "270596 1\n", + "279007 1\n", + "279006 1\n", + "279005 1\n", + "279004 1\n", + " ... \n", + "153655 254\n", + "153642 254\n", + "48 254\n", + "153654 254\n", + "46231 254\n", + "Name: county_id_fips, Length: 37407, dtype: int64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.groupby(\"position_id\").county_id_fips.nunique().sort_values()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "f50c2c41-ff06-46e4-b5c9-1c575c997a69", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
election_idelection_nameelection_dayrace_idis_primaryis_runoffis_unexpiredposition_idposition_namesub_area_namesub_area_valuesub_area_name_secondarysub_area_value_secondaryraw_stateleveltieris_judicialis_retentionnumber_of_seatsnormalized_position_idnormalized_position_namefrequencyreference_yearpartisan_typeraw_countyrace_created_atrace_updated_atstate_id_fipscounty_id_fips
99584325Alaska General Election2024-11-051472432FalseFalseFalse2President of the United States<NA><NA><NA><NA>AKfederal1FalseFalse110President of the United States of America[4]2024partisanChugach Census Area2020-01-14 23:13:16.0972020-01-14 23:13:16.0970202063
622114325Alaska General Election2024-11-051978169FalseFalseFalse375211Vice President of the United States<NA><NA><NA><NA>AKfederal1FalseFalse111Vice President of the United States of America[4]2024partisanChugach Census Area2022-08-17 18:23:10.8412022-08-17 18:23:10.8410202063
99584325Alaska General Election2024-11-051472432FalseFalseFalse2President of the United States<NA><NA><NA><NA>AKfederal1FalseFalse110President of the United States of America[4]2024partisanCopper River Census Area2020-01-14 23:13:16.0972020-01-14 23:13:16.0970202066
622114325Alaska General Election2024-11-051978169FalseFalseFalse375211Vice President of the United States<NA><NA><NA><NA>AKfederal1FalseFalse111Vice President of the United States of America[4]2024partisanCopper River Census Area2022-08-17 18:23:10.8412022-08-17 18:23:10.8410202066
2694258Oklahoma General Election2024-11-051371233FalseFalseFalse40President of the United States<NA><NA><NA><NA>OKfederal1FalseFalse110President of the United States of America[4]2024partisanAdair County2020-01-14 23:08:41.6442020-01-14 23:08:41.6444040001
..........................................................................................
784325963Michigan Presidential Primary Election2024-02-272020923TrueFalseFalse26President of the United States<NA><NA><NA><NA>MIfederal1FalseFalse110President of the United States of America[4]2024partisanTuscola County2023-02-08 20:44:47.6682023-02-08 20:44:47.6682626157
784325963Michigan Presidential Primary Election2024-02-272020923TrueFalseFalse26President of the United States<NA><NA><NA><NA>MIfederal1FalseFalse110President of the United States of America[4]2024partisanVan Buren County2023-02-08 20:44:47.6682023-02-08 20:44:47.6682626159
784325963Michigan Presidential Primary Election2024-02-272020923TrueFalseFalse26President of the United States<NA><NA><NA><NA>MIfederal1FalseFalse110President of the United States of America[4]2024partisanWashtenaw County2023-02-08 20:44:47.6682023-02-08 20:44:47.6682626161
784325963Michigan Presidential Primary Election2024-02-272020923TrueFalseFalse26President of the United States<NA><NA><NA><NA>MIfederal1FalseFalse110President of the United States of America[4]2024partisanWayne County2023-02-08 20:44:47.6682023-02-08 20:44:47.6682626163
784325963Michigan Presidential Primary Election2024-02-272020923TrueFalseFalse26President of the United States<NA><NA><NA><NA>MIfederal1FalseFalse110President of the United States of America[4]2024partisanWexford County2023-02-08 20:44:47.6682023-02-08 20:44:47.6682626165
\n", + "

8934 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " election_id election_name election_day race_id is_primary is_runoff is_unexpired position_id position_name sub_area_name sub_area_value sub_area_name_secondary sub_area_value_secondary raw_state level tier is_judicial is_retention number_of_seats normalized_position_id normalized_position_name frequency reference_year partisan_type raw_county race_created_at race_updated_at state_id_fips county_id_fips\n", + "9958 4325 Alaska General Election 2024-11-05 1472432 False False False 2 President of the United States AK federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Chugach Census Area 2020-01-14 23:13:16.097 2020-01-14 23:13:16.097 02 02063\n", + "62211 4325 Alaska General Election 2024-11-05 1978169 False False False 375211 Vice President of the United States AK federal 1 False False 1 11 Vice President of the United States of America [4] 2024 partisan Chugach Census Area 2022-08-17 18:23:10.841 2022-08-17 18:23:10.841 02 02063\n", + "9958 4325 Alaska General Election 2024-11-05 1472432 False False False 2 President of the United States AK federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Copper River Census Area 2020-01-14 23:13:16.097 2020-01-14 23:13:16.097 02 02066\n", + "62211 4325 Alaska General Election 2024-11-05 1978169 False False False 375211 Vice President of the United States AK federal 1 False False 1 11 Vice President of the United States of America [4] 2024 partisan Copper River Census Area 2022-08-17 18:23:10.841 2022-08-17 18:23:10.841 02 02066\n", + "269 4258 Oklahoma General Election 2024-11-05 1371233 False False False 40 President of the United States OK federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Adair County 2020-01-14 23:08:41.644 2020-01-14 23:08:41.644 40 40001\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "78432 5963 Michigan Presidential Primary Election 2024-02-27 2020923 True False False 26 President of the United States MI federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Tuscola County 2023-02-08 20:44:47.668 2023-02-08 20:44:47.668 26 26157\n", + "78432 5963 Michigan Presidential Primary Election 2024-02-27 2020923 True False False 26 President of the United States MI federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Van Buren County 2023-02-08 20:44:47.668 2023-02-08 20:44:47.668 26 26159\n", + "78432 5963 Michigan Presidential Primary Election 2024-02-27 2020923 True False False 26 President of the United States MI federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Washtenaw County 2023-02-08 20:44:47.668 2023-02-08 20:44:47.668 26 26161\n", + "78432 5963 Michigan Presidential Primary Election 2024-02-27 2020923 True False False 26 President of the United States MI federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Wayne County 2023-02-08 20:44:47.668 2023-02-08 20:44:47.668 26 26163\n", + "78432 5963 Michigan Presidential Primary Election 2024-02-27 2020923 True False False 26 President of the United States MI federal 1 False False 1 10 President of the United States of America [4] 2024 partisan Wexford County 2023-02-08 20:44:47.668 2023-02-08 20:44:47.668 26 26165\n", + "\n", + "[8934 rows x 29 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data[br_election_data.normalized_position_name.str.contains(\"United States\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "dc35ad5e-8d86-4516-8f19-4bbda1e44579", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "02 30\n", + "Name: state_id_fips, dtype: Int64" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "br_election_data.query(\"position_id == 2\").state_id_fips.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "cc7c6608-5d1b-4759-8af0-d755b451cd1b", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "58e460fa-e018-4d55-91ec-457e2ea3cd70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
position_idcounty_id_fipsraw_countystate_id_fipsraw_state
154322590602063Chugach Census Area02AK
154522590802063Chugach Census Area02AK
154622590902063Chugach Census Area02AK
9958202063Chugach Census Area02AK
417604625502063Chugach Census Area02AK
..................
8276241600551107Loudoun County51VA
8276720758251690Martinsville city51VA
8276833085351640Galax city51VA
8277341605845045Greenville County45SC
8277533080851530Buena Vista city51VA
\n", + "

90751 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " position_id county_id_fips raw_county state_id_fips raw_state\n", + "1543 225906 02063 Chugach Census Area 02 AK\n", + "1545 225908 02063 Chugach Census Area 02 AK\n", + "1546 225909 02063 Chugach Census Area 02 AK\n", + "9958 2 02063 Chugach Census Area 02 AK\n", + "41760 46255 02063 Chugach Census Area 02 AK\n", + "... ... ... ... ... ...\n", + "82762 416005 51107 Loudoun County 51 VA\n", + "82767 207582 51690 Martinsville city 51 VA\n", + "82768 330853 51640 Galax city 51 VA\n", + "82773 416058 45045 Greenville County 45 SC\n", + "82775 330808 51530 Buena Vista city 51 VA\n", + "\n", + "[90751 rows x 5 columns]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "position_counties_fields = [\"position_id\", \"county_id_fips\", \"raw_county\", \"state_id_fips\", \"raw_state\"]\n", + "\n", + "br_election_data.drop_duplicates(subset=[\"position_id\", \"county_id_fips\"])[position_counties_fields]" + ] + }, + { + "cell_type": "markdown", + "id": "f65f34cb-569e-4fa5-8ebe-f4b3f74f1ca0", + "metadata": {}, + "source": [ + "## Test" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "02f84caf-696b-4f3e-8882-5ca6f15fceae", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-01 15:37:47 [ INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.\n", + "2023-09-01 15:37:49 [ INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import dbcp\n", + "\n", + "source_uri = \"gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv\"\n", + "raw_df = dbcp.extract.ballot_ready.extract(source_uri)\n", + "transformed = dbcp.transform.ballot_ready.transform(raw_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "58c39b22-29a0-401a-9577-eba7d67a023e", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(82776, 29)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_df[\"raw_ballot_ready\"].shape" + ] + }, + { + "cell_type": "markdown", + "id": "4567af15-3170-476b-8c60-9161f8492d3c", + "metadata": {}, + "source": [ + "## construct data mart table" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "323ed662-a6ad-444b-93da-4b227bb9de69", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-01 15:46:39 [ INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.\n", + "2023-09-01 15:46:41 [ INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import dbcp\n", + "\n", + "source_uri = \"gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv\"\n", + "raw_df = dbcp.extract.ballot_ready.extract(source_uri)\n", + "transformed = dbcp.transform.ballot_ready.transform(raw_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "75da1cb1-98f3-4df3-8bdd-b9f8e3642823", + "metadata": {}, + "outputs": [], + "source": [ + "engine = dbcp.helpers.get_sql_engine()\n", + "with engine.connect() as con:\n", + " br_elections = pd.read_sql_table(\"br_elections\", con, schema=\"data_warehouse\")\n", + " br_races = pd.read_sql_table(\"br_races\", con, schema=\"data_warehouse\")\n", + " br_positions = pd.read_sql_table(\"br_positions\", con, schema=\"data_warehouse\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "af9a8174-cee6-43cf-8603-df877f26603c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 67316\n", + "2 4673\n", + "3 3319\n", + "4 2082\n", + "5 1468\n", + " ... 
\n", + "34 2\n", + "37 2\n", + "43 2\n", + "47 2\n", + "51 1\n", + "Name: county_id_fips, Length: 81, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_races.groupby(\"race_id\").county_id_fips.nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "bf44bdb7-6ad1-4b2d-a1b0-0ec08bada03b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39 78\n", + "23 78\n", + "33 67\n", + "21 65\n", + "27 58\n", + " ..\n", + "328 1\n", + "237 1\n", + "155 1\n", + "211 1\n", + "184 1\n", + "Name: race_id, Length: 202, dtype: int64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_races.groupby(\"county_id_fips\").race_id.nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "6e0e024b-dbf4-42e5-aacf-61563379799d", + "metadata": {}, + "outputs": [], + "source": [ + "br_election_data = br_races.merge(br_elections, how=\"left\", on=\"election_id\", validate=\"m:1\")\n", + "br_election_data = br_election_data.merge(br_positions, how=\"left\", on=\"position_id\", validate=\"m:1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "30c04347-467a-4368-bf3d-1de0ad67a226", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 188074 entries, 0 to 188073\n", + "Data columns (total 29 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 race_id 188074 non-null int64 \n", + " 1 raw_county 188074 non-null object \n", + " 2 is_primary 188074 non-null bool \n", + " 3 is_runoff 188074 non-null bool \n", + " 4 is_unexpired 188074 non-null bool \n", + " 5 number_of_seats 188074 non-null int64 \n", + " 6 race_created_at 188074 non-null datetime64[ns]\n", + " 7 race_updated_at 188074 non-null datetime64[ns]\n", + " 8 raw_state 
188074 non-null object \n", + " 9 state_id_fips 188074 non-null object \n", + " 10 county_id_fips 188074 non-null object \n", + " 11 election_id 188074 non-null int64 \n", + " 12 position_id 188074 non-null int64 \n", + " 13 election_name 188074 non-null object \n", + " 14 election_day 188074 non-null datetime64[ns]\n", + " 15 position_name 188074 non-null object \n", + " 16 reference_year 188074 non-null int64 \n", + " 17 sub_area_name 114957 non-null object \n", + " 18 sub_area_value 125790 non-null object \n", + " 19 sub_area_name_secondary 11501 non-null object \n", + " 20 sub_area_value_secondary 12522 non-null object \n", + " 21 level 188074 non-null object \n", + " 22 tier 188074 non-null int64 \n", + " 23 is_judicial 188074 non-null bool \n", + " 24 is_retention 188074 non-null bool \n", + " 25 normalized_position_id 188074 non-null int64 \n", + " 26 normalized_position_name 188074 non-null object \n", + " 27 frequency 188074 non-null object \n", + " 28 partisan_type 188060 non-null object \n", + "dtypes: bool(5), datetime64[ns](3), int64(7), object(14)\n", + "memory usage: 36.8+ MB\n" + ] + } + ], + "source": [ + "br_election_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a6d42ef0-fa8e-44ba-802d-9beb9beb30fb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['br_elections', 'br_positions', 'br_races'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformed.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d2d9fef-ef28-4b98-a93a-3ed6c24d558c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['election_id', 'election_name', 'election_day'], dtype='object')\n", + "\n", + "Index(['reference_year', 'position_id', 'position_name', 'sub_area_name', 'sub_area_value', 'sub_area_name_secondary', 'sub_area_value_secondary', 'raw_state', 'level', 
'tier', 'is_judicial', 'is_retention', 'normalized_position_id', 'normalized_position_name', 'frequency', 'partisan_type'], dtype='object')\n", + "\n", + "Index(['race_id', 'is_primary', 'is_runoff', 'is_unexpired', 'number_of_seats', 'race_created_at', 'race_updated_at', 'raw_state', 'raw_county', 'raw_county', 'state_id_fips', 'county_id_fips', 'election_id', 'position_id'], dtype='object')\n", + "\n" + ] + } + ], + "source": [ + "for df in transformed.values():\n", + " print(df.columns)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d463e5ce-1c9a-4d79-a3d7-ed9fa9ba864b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 188074\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformed[\"br_races\"].duplicated(subset=[\"race_id\", \"raw_county\"]).value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "ecffc286-f29c-4e0f-978b-0d7e7caa8a87", + "metadata": {}, + "source": [ + "## Fips business" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c5a32318-fafb-4512-8134-8f09008958a3", + "metadata": {}, + "outputs": [], + "source": [ + "census_uri = \"gs://dgm-archive/census/tl_2021_us_county.zip\"\n", + "fips = dbcp.extract.fips_tables._extract_census_counties(census_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ac5f998f-273e-4441-9604-574e52b1394c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
STATEFPCOUNTYFPCOUNTYNSGEOIDNAMENAMELSADLSADCLASSFPMTFCCCSAFPCBSAFPMETDIVFPFUNCSTATALANDAWATERINTPTLATINTPTLONgeometry
2956020660280440102066Copper RiverCopper River Census Area05H5G4020NaNNaNNaNS639523355921217429937+62.0344790-143.9221674POLYGON ((-147.74430 61.42628, -147.71096 61.4...
\n", + "
" + ], + "text/plain": [ + " STATEFP COUNTYFP COUNTYNS GEOID NAME NAMELSAD LSAD CLASSFP MTFCC CSAFP CBSAFP METDIVFP FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry\n", + "2956 02 066 02804401 02066 Copper River Copper River Census Area 05 H5 G4020 NaN NaN NaN S 63952335592 1217429937 +62.0344790 -143.9221674 POLYGON ((-147.74430 61.42628, -147.71096 61.4..." + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fips.query(\"GEOID == '02066'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e42d25e4-a691-41a2-a0a2-d661f2e4113e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
race_idis_primaryis_runoffis_unexpirednumber_of_seatsrace_created_atrace_updated_atraw_stateraw_countyraw_countystate_id_fipscounty_id_fipselection_idposition_id
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [race_id, is_primary, is_runoff, is_unexpired, number_of_seats, race_created_at, race_updated_at, raw_state, raw_county, raw_county, state_id_fips, county_id_fips, election_id, position_id]\n", + "Index: []" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_races = transformed[\"br_races\"]\n", + "\n", + "br_races.query(\"county_id_fips == '02261'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e3b3608-50af-498e-a1bf-adaf117eaafb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0, 14)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "valdez = br_races.query(\"county_id_fips == '02261'\").copy()\n", + "valdez.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7604e0c6-27ff-4ea9-a9e7-1bef80e6823e", + "metadata": {}, + "outputs": [], + "source": [ + "br_races = br_races[br_races.county_id_fips != '02261'].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "79991fed-523d-49f7-b03f-3a8bcef4b8f4", + "metadata": {}, + "outputs": [], + "source": [ + "valdez_corrections = [\n", + " {'raw_county': 'Chugach Census Area', 'county_id_fips': '02063'},\n", + " {'raw_county': 'Copper River Census Area', 'county_id_fips': '02066'},\n", + "]\n", + "\n", + "valdez_corrections_dfs = []\n", + "for cor in valdez_corrections:\n", + " corrected_df = valdez.copy()\n", + " for field, value in cor.items():\n", + " corrected_df[field] = value\n", + " valdez_corrections_dfs.append(corrected_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "840ae452-c1ce-449c-8917-0bf371437f60", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
race_idis_primaryis_runoffis_unexpirednumber_of_seatsrace_created_atrace_updated_atraw_stateraw_countyraw_countystate_id_fipscounty_id_fipselection_idposition_id
15431385101FalseFalseFalse12020-01-14 23:09:11.7332021-09-30 19:37:57.406AKValdez-Cordova Census AreaChugach Census Area02020634325225906
15451385103FalseFalseFalse12020-01-14 23:09:11.7332021-09-30 19:37:57.406AKValdez-Cordova Census AreaChugach Census Area02020634325225908
15461385104FalseFalseFalse12020-01-14 23:09:11.7332021-09-30 19:37:57.406AKValdez-Cordova Census AreaChugach Census Area02020634325225909
99581472432FalseFalseFalse12020-01-14 23:13:16.0972020-01-14 23:13:16.097AKValdez-Cordova Census AreaChugach Census Area020206343252
417601675001FalseFalseFalse12020-01-15 22:48:10.7492020-01-15 22:48:10.749AKValdez-Cordova Census AreaChugach Census Area0202063432546255
.............................................
827712778453FalseTrueFalse12023-08-09 23:45:46.2812023-08-09 23:45:46.281TNDavidson CountyNaN47470375002233401
827722778454FalseTrueFalse12023-08-09 23:45:56.4232023-08-09 23:45:56.423TNDavidson CountyNaN47470375002233376
827732778472FalseFalseFalse12023-08-10 17:01:23.4432023-08-10 17:01:23.443SCGreenville CountyNaN45450454693416058
827742778473TrueFalseFalse12023-08-10 17:01:23.4782023-08-10 17:01:23.478SCGreenville CountyNaN45450455392416058
827752778491FalseFalseTrue12023-08-10 20:31:12.8252023-08-10 20:35:25.196VABuena Vista cityNaN51515304333330808
\n", + "

188074 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " race_id is_primary is_runoff is_unexpired number_of_seats race_created_at race_updated_at raw_state raw_county raw_county state_id_fips county_id_fips election_id position_id\n", + "1543 1385101 False False False 1 2020-01-14 23:09:11.733 2021-09-30 19:37:57.406 AK Valdez-Cordova Census Area Chugach Census Area 02 02063 4325 225906\n", + "1545 1385103 False False False 1 2020-01-14 23:09:11.733 2021-09-30 19:37:57.406 AK Valdez-Cordova Census Area Chugach Census Area 02 02063 4325 225908\n", + "1546 1385104 False False False 1 2020-01-14 23:09:11.733 2021-09-30 19:37:57.406 AK Valdez-Cordova Census Area Chugach Census Area 02 02063 4325 225909\n", + "9958 1472432 False False False 1 2020-01-14 23:13:16.097 2020-01-14 23:13:16.097 AK Valdez-Cordova Census Area Chugach Census Area 02 02063 4325 2\n", + "41760 1675001 False False False 1 2020-01-15 22:48:10.749 2020-01-15 22:48:10.749 AK Valdez-Cordova Census Area Chugach Census Area 02 02063 4325 46255\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "82771 2778453 False True False 1 2023-08-09 23:45:46.281 2023-08-09 23:45:46.281 TN Davidson County NaN 47 47037 5002 233401\n", + "82772 2778454 False True False 1 2023-08-09 23:45:56.423 2023-08-09 23:45:56.423 TN Davidson County NaN 47 47037 5002 233376\n", + "82773 2778472 False False False 1 2023-08-10 17:01:23.443 2023-08-10 17:01:23.443 SC Greenville County NaN 45 45045 4693 416058\n", + "82774 2778473 True False False 1 2023-08-10 17:01:23.478 2023-08-10 17:01:23.478 SC Greenville County NaN 45 45045 5392 416058\n", + "82775 2778491 False False True 1 2023-08-10 20:31:12.825 2023-08-10 20:35:25.196 VA Buena Vista city NaN 51 51530 4333 330808\n", + "\n", + "[188074 rows x 14 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat(valdez_corrections_dfs + [br_races])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "96a4d873-c483-4cb5-9243-05e1dbe29248", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
race_idis_primaryis_runoffis_unexpirednumber_of_seatsrace_created_atrace_updated_atraw_stateraw_countystate_id_fipscounty_id_fipselection_idposition_id
01365674FalseFalseFalse32020-01-14 23:08:21.0162021-09-30 19:37:57.406ILMacon County17171154206247556
11365754FalseFalseFalse12020-01-14 23:08:21.0162023-02-02 19:54:36.541ILCook County17170314206226054
21365856FalseFalseFalse12020-01-14 23:08:21.0162023-02-02 19:54:49.430ILCook County17170314206226051
31365861FalseFalseFalse12020-01-14 23:08:21.0162023-02-02 19:53:14.016ILCook County17170314206226035
41365863FalseFalseFalse12020-01-14 23:08:21.0162023-02-02 19:53:39.554ILCook County17170314206226030
..........................................
827712778453FalseTrueFalse12023-08-09 23:45:46.2812023-08-09 23:45:46.281TNDavidson County47470375002233401
827722778454FalseTrueFalse12023-08-09 23:45:56.4232023-08-09 23:45:56.423TNDavidson County47470375002233376
827732778472FalseFalseFalse12023-08-10 17:01:23.4432023-08-10 17:01:23.443SCGreenville County45450454693416058
827742778473TrueFalseFalse12023-08-10 17:01:23.4782023-08-10 17:01:23.478SCGreenville County45450455392416058
827752778491FalseFalseTrue12023-08-10 20:31:12.8252023-08-10 20:35:25.196VABuena Vista city51515304333330808
\n", + "

188034 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " race_id is_primary is_runoff is_unexpired number_of_seats race_created_at race_updated_at raw_state raw_county state_id_fips county_id_fips election_id position_id\n", + "0 1365674 False False False 3 2020-01-14 23:08:21.016 2021-09-30 19:37:57.406 IL Macon County 17 17115 4206 247556\n", + "1 1365754 False False False 1 2020-01-14 23:08:21.016 2023-02-02 19:54:36.541 IL Cook County 17 17031 4206 226054\n", + "2 1365856 False False False 1 2020-01-14 23:08:21.016 2023-02-02 19:54:49.430 IL Cook County 17 17031 4206 226051\n", + "3 1365861 False False False 1 2020-01-14 23:08:21.016 2023-02-02 19:53:14.016 IL Cook County 17 17031 4206 226035\n", + "4 1365863 False False False 1 2020-01-14 23:08:21.016 2023-02-02 19:53:39.554 IL Cook County 17 17031 4206 226030\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "82771 2778453 False True False 1 2023-08-09 23:45:46.281 2023-08-09 23:45:46.281 TN Davidson County 47 47037 5002 233401\n", + "82772 2778454 False True False 1 2023-08-09 23:45:56.423 2023-08-09 23:45:56.423 TN Davidson County 47 47037 5002 233376\n", + "82773 2778472 False False False 1 2023-08-10 17:01:23.443 2023-08-10 17:01:23.443 SC Greenville County 45 45045 4693 416058\n", + "82774 2778473 True False False 1 2023-08-10 17:01:23.478 2023-08-10 17:01:23.478 SC Greenville County 45 45045 5392 416058\n", + "82775 2778491 False False True 1 2023-08-10 20:31:12.825 2023-08-10 20:35:25.196 VA Buena Vista city 51 51530 4333 330808\n", + "\n", + "[188034 rows x 13 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_races" + ] + }, + { + "cell_type": "markdown", + "id": "5723c81c-9d2c-4aaf-a844-dcfa819dd733", + "metadata": {}, + "source": [ + "## Examine data mart table" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "3b7bae7d-8597-47ec-af1a-4a5c08a0ba62", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "2023-09-12 16:59:22 [ INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.\n", + "2023-09-12 16:59:26 [ INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import dbcp\n", + "\n", + "source_uri = \"gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv\"\n", + "raw_dfs = dbcp.extract.ballot_ready.extract(source_uri)\n", + "raw_ballot_ready = raw_dfs[\"raw_ballot_ready\"]\n", + "exploded_br_election_data = dbcp.transform.ballot_ready._explode_counties(raw_ballot_ready)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "432514ae-448b-4b87-ab89-0b53162ede6c", + "metadata": {}, + "outputs": [], + "source": [ + "engine = dbcp.helpers.get_sql_engine()\n", + "with engine.connect() as con:\n", + " br_election_data = pd.read_sql_table(\"br_election_data\", con, schema=\"data_mart\")" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "0a674c35-47b2-46b7-a158-bf9ba8915aea", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 188074 entries, 0 to 188073\n", + "Data columns (total 31 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 state_name 188074 non-null object \n", + " 1 county_name 188074 non-null object \n", + " 2 raw_county 188074 non-null object \n", + " 3 election_id 188074 non-null int64 \n", + " 4 election_name 188074 non-null object \n", + " 5 election_day 188074 non-null datetime64[ns]\n", + " 6 race_id 188074 non-null int64 \n", + " 7 is_primary 188074 non-null bool \n", + " 8 is_runoff 188074 non-null bool \n", + " 9 is_unexpired 188074 non-null bool \n", + " 10 position_id 188074 non-null int64 \n", + " 11 position_name 188074 non-null object \n", + " 12 sub_area_name 114957 non-null object \n", + " 13 
sub_area_value 125790 non-null object \n", + " 14 sub_area_name_secondary 11501 non-null object \n", + " 15 sub_area_value_secondary 12522 non-null object \n", + " 16 raw_state 188074 non-null object \n", + " 17 level 188074 non-null object \n", + " 18 tier 188074 non-null int64 \n", + " 19 is_judicial 188074 non-null bool \n", + " 20 is_retention 188074 non-null bool \n", + " 21 number_of_seats 188074 non-null int64 \n", + " 22 normalized_position_id 188074 non-null int64 \n", + " 23 normalized_position_name 188074 non-null object \n", + " 24 frequency 188074 non-null object \n", + " 25 reference_year 188074 non-null int64 \n", + " 26 partisan_type 188060 non-null object \n", + " 27 race_created_at 188074 non-null datetime64[ns]\n", + " 28 race_updated_at 188074 non-null datetime64[ns]\n", + " 29 state_id_fips 188074 non-null object \n", + " 30 county_id_fips 188074 non-null object \n", + "dtypes: bool(5), datetime64[ns](3), int64(7), object(16)\n", + "memory usage: 38.2+ MB\n" + ] + } + ], + "source": [ + "br_election_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "6e3a1031-449f-413f-9bd7-871649b672cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 188074 entries, 1543 to 82775\n", + "Data columns (total 29 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 election_id 188074 non-null Int64 \n", + " 1 election_name 188074 non-null string \n", + " 2 election_day 188074 non-null datetime64[ns]\n", + " 3 race_id 188074 non-null Int64 \n", + " 4 is_primary 188074 non-null boolean \n", + " 5 is_runoff 188074 non-null boolean \n", + " 6 is_unexpired 188074 non-null boolean \n", + " 7 position_id 188074 non-null Int64 \n", + " 8 position_name 188074 non-null string \n", + " 9 sub_area_name 114957 non-null string \n", + " 10 sub_area_value 125790 non-null string \n", + " 11 sub_area_name_secondary 11501 non-null 
string \n", + " 12 sub_area_value_secondary 12522 non-null string \n", + " 13 raw_state 188074 non-null string \n", + " 14 level 188074 non-null string \n", + " 15 tier 188074 non-null Int64 \n", + " 16 is_judicial 188074 non-null boolean \n", + " 17 is_retention 188074 non-null boolean \n", + " 18 number_of_seats 188074 non-null Int64 \n", + " 19 normalized_position_id 188074 non-null Int64 \n", + " 20 normalized_position_name 188074 non-null string \n", + " 21 frequency 188074 non-null string \n", + " 22 reference_year 188074 non-null Int64 \n", + " 23 partisan_type 188060 non-null string \n", + " 24 raw_county 188074 non-null object \n", + " 25 race_created_at 188074 non-null datetime64[ns]\n", + " 26 race_updated_at 188074 non-null datetime64[ns]\n", + " 27 state_id_fips 188074 non-null string \n", + " 28 county_id_fips 188074 non-null object \n", + "dtypes: Int64(7), boolean(5), datetime64[ns](3), object(2), string(12)\n", + "memory usage: 38.9+ MB\n" + ] + } + ], + "source": [ + "exploded_br_election_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "1d2c6987-f828-4c07-b30c-a28ba0acd90b", + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " county_commission_election_info = pd.read_sql_table(\"county_commission_election_info\", con, schema=\"data_mart\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "ad0c82b7-8dc5-48d8-8022-cabecf85735e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
county_namecounty_id_fipsnext_general_election_idnext_general_election_namenext_general_election_daynext_general_total_n_seatsnext_general_total_n_racesnext_general_all_race_namesnext_general_frequencynext_general_reference_yearnext_primary_election_idnext_primary_election_namenext_primary_election_daynext_primary_total_n_seatsnext_primary_total_n_racesnext_primary_all_race_namesnext_primary_frequencynext_primary_reference_yearnext_run_off_election_idnext_run_off_election_namenext_run_off_election_daynext_run_off_total_n_seatsnext_run_off_total_n_racesnext_run_off_all_race_namesnext_run_off_frequencynext_run_off_reference_year
0Autauga010014261.0Alabama General Election2024-11-055.05.0Autauga County Commission - District 2,Autauga...[4]2024.05303.0Alabama Primary Election2024-03-055.05.0Autauga County Commission - District 2,Autauga...[4]2024.05766.0Alabama Primary Runoff Election2024-04-025.05.0Autauga County Commission - District 1,Autauga...[4]2024.0
1Bibb010074261.0Alabama General Election2024-11-053.03.0Bibb County Commission - District 1,Bibb Count...[4]2024.05303.0Alabama Primary Election2024-03-053.03.0Bibb County Commission - District 1,Bibb Count...[4]2024.05766.0Alabama Primary Runoff Election2024-04-023.03.0Bibb County Commission - District 1,Bibb Count...[4]2024.0
2Blount010094261.0Alabama General Election2024-11-052.02.0Blount County Commission - District 4,Blount C...[4]2024.05303.0Alabama Primary Election2024-03-052.02.0Blount County Commission - District 4,Blount C...[4]2024.05766.0Alabama Primary Runoff Election2024-04-022.02.0Blount County Commission - District 2,Blount C...[4]2024.0
3Butler010134261.0Alabama General Election2024-11-053.03.0Butler County Commission - District 4,Butler C...[4]2024.05303.0Alabama Primary Election2024-03-053.03.0Butler County Commission - District 4,Butler C...[4]2024.05766.0Alabama Primary Runoff Election2024-04-023.03.0Butler County Commission - District 1,Butler C...[4]2024.0
4Chambers010174261.0Alabama General Election2024-11-053.03.0Chambers County Commission - District 1,Chambe...[4]2024.05303.0Alabama Primary Election2024-03-053.03.0Chambers County Commission - District 1,Chambe...[4]2024.05766.0Alabama Primary Runoff Election2024-04-023.03.0Chambers County Commission - District 1,Chambe...[4]2024.0
.................................................................................
2739Sweetwater560374463.0Wyoming General Election2024-11-052.01.0Sweetwater County Commission[4]2024.05294.0Wyoming Primary Election2024-08-202.01.0Sweetwater County Commission[4]2024.0NaNNoneNaTNaNNaNNoneNoneNaN
2740Teton560394463.0Wyoming General Election2024-11-052.01.0Teton County Commission[4]2024.05294.0Wyoming Primary Election2024-08-202.01.0Teton County Commission[4]2024.0NaNNoneNaTNaNNaNNoneNoneNaN
2741Uinta560414463.0Wyoming General Election2024-11-051.01.0Uinta County Commission[4]2024.05294.0Wyoming Primary Election2024-08-201.01.0Uinta County Commission[4]2024.0NaNNoneNaTNaNNaNNoneNoneNaN
2742Washakie560434463.0Wyoming General Election2024-11-051.01.0Washakie County Commission[4]2024.05294.0Wyoming Primary Election2024-08-201.01.0Washakie County Commission[4]2024.0NaNNoneNaTNaNNaNNoneNoneNaN
2743Weston560454463.0Wyoming General Election2024-11-052.01.0Weston County Commission[4]2024.05294.0Wyoming Primary Election2024-08-202.01.0Weston County Commission[4]2024.0NaNNoneNaTNaNNaNNoneNoneNaN
\n", + "

2744 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " county_name county_id_fips next_general_election_id next_general_election_name next_general_election_day next_general_total_n_seats next_general_total_n_races next_general_all_race_names next_general_frequency next_general_reference_year next_primary_election_id next_primary_election_name next_primary_election_day next_primary_total_n_seats next_primary_total_n_races next_primary_all_race_names next_primary_frequency next_primary_reference_year next_run_off_election_id next_run_off_election_name next_run_off_election_day next_run_off_total_n_seats next_run_off_total_n_races next_run_off_all_race_names next_run_off_frequency next_run_off_reference_year\n", + "0 Autauga 01001 4261.0 Alabama General Election 2024-11-05 5.0 5.0 Autauga County Commission - District 2,Autauga... [4] 2024.0 5303.0 Alabama Primary Election 2024-03-05 5.0 5.0 Autauga County Commission - District 2,Autauga... [4] 2024.0 5766.0 Alabama Primary Runoff Election 2024-04-02 5.0 5.0 Autauga County Commission - District 1,Autauga... [4] 2024.0\n", + "1 Bibb 01007 4261.0 Alabama General Election 2024-11-05 3.0 3.0 Bibb County Commission - District 1,Bibb Count... [4] 2024.0 5303.0 Alabama Primary Election 2024-03-05 3.0 3.0 Bibb County Commission - District 1,Bibb Count... [4] 2024.0 5766.0 Alabama Primary Runoff Election 2024-04-02 3.0 3.0 Bibb County Commission - District 1,Bibb Count... [4] 2024.0\n", + "2 Blount 01009 4261.0 Alabama General Election 2024-11-05 2.0 2.0 Blount County Commission - District 4,Blount C... [4] 2024.0 5303.0 Alabama Primary Election 2024-03-05 2.0 2.0 Blount County Commission - District 4,Blount C... [4] 2024.0 5766.0 Alabama Primary Runoff Election 2024-04-02 2.0 2.0 Blount County Commission - District 2,Blount C... [4] 2024.0\n", + "3 Butler 01013 4261.0 Alabama General Election 2024-11-05 3.0 3.0 Butler County Commission - District 4,Butler C... 
[4] 2024.0 5303.0 Alabama Primary Election 2024-03-05 3.0 3.0 Butler County Commission - District 4,Butler C... [4] 2024.0 5766.0 Alabama Primary Runoff Election 2024-04-02 3.0 3.0 Butler County Commission - District 1,Butler C... [4] 2024.0\n", + "4 Chambers 01017 4261.0 Alabama General Election 2024-11-05 3.0 3.0 Chambers County Commission - District 1,Chambe... [4] 2024.0 5303.0 Alabama Primary Election 2024-03-05 3.0 3.0 Chambers County Commission - District 1,Chambe... [4] 2024.0 5766.0 Alabama Primary Runoff Election 2024-04-02 3.0 3.0 Chambers County Commission - District 1,Chambe... [4] 2024.0\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "2739 Sweetwater 56037 4463.0 Wyoming General Election 2024-11-05 2.0 1.0 Sweetwater County Commission [4] 2024.0 5294.0 Wyoming Primary Election 2024-08-20 2.0 1.0 Sweetwater County Commission [4] 2024.0 NaN None NaT NaN NaN None None NaN\n", + "2740 Teton 56039 4463.0 Wyoming General Election 2024-11-05 2.0 1.0 Teton County Commission [4] 2024.0 5294.0 Wyoming Primary Election 2024-08-20 2.0 1.0 Teton County Commission [4] 2024.0 NaN None NaT NaN NaN None None NaN\n", + "2741 Uinta 56041 4463.0 Wyoming General Election 2024-11-05 1.0 1.0 Uinta County Commission [4] 2024.0 5294.0 Wyoming Primary Election 2024-08-20 1.0 1.0 Uinta County Commission [4] 2024.0 NaN None NaT NaN NaN None None NaN\n", + "2742 Washakie 56043 4463.0 Wyoming General Election 2024-11-05 1.0 1.0 Washakie County Commission [4] 2024.0 5294.0 Wyoming Primary Election 2024-08-20 1.0 1.0 Washakie County Commission [4] 2024.0 NaN None NaT NaN NaN None None NaN\n", + "2743 Weston 56045 4463.0 Wyoming General Election 2024-11-05 2.0 1.0 Weston County Commission [4] 2024.0 5294.0 Wyoming Primary Election 2024-08-20 2.0 1.0 Weston County Commission [4] 2024.0 NaN None NaT NaN NaN None None NaN\n", + "\n", + "[2744 rows x 26 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "county_commission_election_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06c02121-bda8-4f8f-9c13-5b25a951c757", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/34-bdn-comissioner-table.ipynb b/notebooks/34-bdn-comissioner-table.ipynb new file mode 100644 index 00000000..bf57a54f --- /dev/null +++ b/notebooks/34-bdn-comissioner-table.ipynb @@ -0,0 +1,3397 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0829e38a-123d-47c5-9994-298be08f3ca6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/app/.local/lib/python3.10/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.3-CAPI-1.16.1). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/app/.local/lib/python3.10/site-packages/pudl/analysis/spatial.py:7: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. 
If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import dbcp" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3a590826-c856-4d1b-86c7-b7681f7de709", + "metadata": {}, + "outputs": [], + "source": [ + "engine = dbcp.helpers.get_sql_engine()\n", + "with engine.connect() as con:\n", + " br_elections = pd.read_sql_table(\"br_elections\", con, schema=\"data_warehouse\")\n", + " br_races = pd.read_sql_table(\"br_races\", con, schema=\"data_warehouse\")\n", + " br_positions = pd.read_sql_table(\"br_positions\", con, schema=\"data_warehouse\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c8b8a81e-f9bd-4b81-97e9-ccd6a250edfa", + "metadata": {}, + "outputs": [], + "source": [ + "commissioner_positions = br_positions.query(\"tier > 2 & is_judicial == False & normalized_position_id in (910,912)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "279355b4-1d01-4b1b-a245-6f1f6fd86ce9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 10192 entries, 168 to 37364\n", + "Data columns (total 15 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 position_id 10192 non-null int64 \n", + " 1 position_name 10192 non-null object\n", + " 2 reference_year 10192 non-null int64 \n", + " 3 sub_area_name 9095 non-null object\n", + " 4 sub_area_value 9629 non-null object\n", + " 5 sub_area_name_secondary 92 non-null object\n", + " 6 sub_area_value_secondary 114 non-null object\n", + " 7 level 10192 non-null object\n", + " 8 tier 10192 non-null int64 \n", + " 9 is_judicial 10192 non-null bool \n", + " 10 is_retention 10192 non-null 
bool \n", + " 11 normalized_position_id 10192 non-null int64 \n", + " 12 normalized_position_name 10192 non-null object\n", + " 13 frequency 10192 non-null object\n", + " 14 partisan_type 10192 non-null object\n", + "dtypes: bool(2), int64(4), object(9)\n", + "memory usage: 1.1+ MB\n" + ] + } + ], + "source": [ + "commissioner_positions.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2f5e89e6-c59a-43aa-b38a-b356d84a3f41", + "metadata": {}, + "outputs": [], + "source": [ + "position_fields = [\n", + " \"position_id\",\n", + " \"position_name\",\n", + " \"reference_year\",\n", + " \"frequency\"\n", + "]\n", + "commissioner_positions = commissioner_positions[position_fields]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "41bfa9e9-feb1-4b1f-b8d4-811a4110ad1e", + "metadata": {}, + "outputs": [], + "source": [ + "engine = dbcp.helpers.get_sql_engine()\n", + "with engine.connect() as con:\n", + " br_election_data = pd.read_sql_table(\"br_election_data\", con, schema=\"data_mart\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "763dd6dc-4c76-4609-a3c8-ccac37c75b66", + "metadata": {}, + "outputs": [], + "source": [ + "commissioner_races = br_election_data.query(\"tier > 2 & is_judicial == False & normalized_position_id in (910,912)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "80097d81-bcdc-4385-b865-58bf65591e15", + "metadata": {}, + "outputs": [], + "source": [ + "pos_grp = commissioner_races.groupby([\"election_id\", \"county_id_fips\"]).nunique().position_id" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a84e8b6b-f52d-4d7a-9e27-27bfe6c10453", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "election_id county_id_fips\n", + "4217 53001 2\n", + " 53003 2\n", + " 53005 2\n", + " 53007 2\n", + " 53009 1\n", + " ..\n", + "6909 40145 1\n", + " 40147 1\n", + " 40149 1\n", + " 40151 1\n", + " 40153 1\n", + "Name: position_id, Length: 
6708, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pos_grp" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2db06bcd-4648-4946-816d-4041ffdbd9bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "election_id county_id_fips\n", + "5194 22047 37\n", + "4718 22047 37\n", + "5194 22007 37\n", + "4706 55025 37\n", + "4718 22017 38\n", + "5292 55073 38\n", + "5194 22057 38\n", + "4718 22057 38\n", + "5194 22073 38\n", + "4706 55073 38\n", + "4718 22073 38\n", + "5194 22017 38\n", + "4407 36001 39\n", + "5194 22009 39\n", + "5225 36001 39\n", + "4718 22009 39\n", + " 22079 40\n", + "5194 22079 40\n", + "4718 22033 40\n", + "5194 22033 40\n", + " 22013 41\n", + " 22059 41\n", + "4718 22059 41\n", + " 22013 41\n", + "5194 22099 45\n", + "4718 22099 45\n", + "5194 22041 45\n", + "4718 22061 45\n", + " 22041 45\n", + "5194 22061 45\n", + " 22045 46\n", + "4718 22127 46\n", + "5194 22127 46\n", + "4718 22045 46\n", + "5194 22021 47\n", + "4718 22021 47\n", + " 22005 51\n", + "5194 22005 51\n", + "4718 22069 64\n", + "5194 22069 64\n", + "Name: position_id, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pos_grp[pos_grp > 1].sort_values().tail(40)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9505312d-12c8-446e-b65f-cbda1899d6a9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
state_namecounty_nameraw_countyelection_idelection_nameelection_dayrace_idis_primaryis_runoffis_unexpiredposition_idposition_namesub_area_namesub_area_valuesub_area_name_secondarysub_area_value_secondaryraw_stateleveltieris_judicialis_retentionnumber_of_seatsnormalized_position_idnormalized_position_namefrequencyreference_yearpartisan_typerace_created_atrace_updated_atstate_id_fipscounty_id_fips
69298New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627002TrueFalseFalse263057Albany County Legislature - District 19District19NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.0203636001
69299New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627008TrueFalseFalse263068Albany County Legislature - District 29District29NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.7813636001
69300New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627040TrueFalseFalse263073Albany County Legislature - District 33District33NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.8843636001
69301New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627041TrueFalseFalse263062Albany County Legislature - District 23District23NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.8543636001
69303New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627061TrueFalseFalse263060Albany County Legislature - District 21District21NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.7813636001
69304New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627064TrueFalseFalse263081Albany County Legislature - District 5District5NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.0443636001
69305New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627066TrueFalseFalse263076Albany County Legislature - District 36District36NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.9663636001
69307New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627082TrueFalseFalse263077Albany County Legislature - District 37District37NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.8753636001
69308New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627091TrueFalseFalse263083Albany County Legislature - District 7District7NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.0323636001
69313New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627123TrueFalseFalse263051Albany County Legislature - District 13District13NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.8903636001
69314New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627124TrueFalseFalse263085Albany County Legislature - District 9District9NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.3823636001
69319New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627156TrueFalseFalse263075Albany County Legislature - District 35District35NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.8823636001
69320New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627162TrueFalseFalse263078Albany County Legislature - District 38District38NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.9593636001
69321New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627164TrueFalseFalse263074Albany County Legislature - District 34District34NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.0153636001
69325New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627261TrueFalseFalse263079Albany County Legislature - District 39District39NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.9513636001
69326New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627266TrueFalseFalse263053Albany County Legislature - District 15District15NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.9273636001
69328New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627305TrueFalseFalse263056Albany County Legislature - District 18District18NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.9423636001
69340New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627385TrueFalseFalse263082Albany County Legislature - District 6District6NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.1603636001
69341New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627386TrueFalseFalse263063Albany County Legislature - District 24District24NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.8953636001
69342New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627390TrueFalseFalse263064Albany County Legislature - District 25District25NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.8423636001
69344New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627405TrueFalseFalse263066Albany County Legislature - District 27District27NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.6583636001
69347New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627455TrueFalseFalse263072Albany County Legislature - District 32District32NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.9483636001
69348New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627459TrueFalseFalse263080Albany County Legislature - District 4District4NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.0013636001
69352New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627513TrueFalseFalse263070Albany County Legislature - District 30District30NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.0983636001
69359New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627559TrueFalseFalse263065Albany County Legislature - District 26District26NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.9113636001
69362New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627578TrueFalseFalse263048Albany County Legislature - District 10District10NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.7533636001
69365New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627618TrueFalseFalse263058Albany County Legislature - District 2District2NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.1163636001
69373New YorkAlbanyAlbany County5225New York Primary Election2023-06-271627670TrueFalseFalse263069Albany County Legislature - District 3District3NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.9223636001
69408New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628063TrueFalseFalse263050Albany County Legislature - District 12District12NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.9483636001
69418New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628210TrueFalseFalse263052Albany County Legislature - District 14District14NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.9673636001
69419New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628220TrueFalseFalse263067Albany County Legislature - District 28District28NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.7123636001
69428New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628319TrueFalseFalse263071Albany County Legislature - District 31District31NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:18.9893636001
69430New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628324TrueFalseFalse263054Albany County Legislature - District 16District16NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.0073636001
69434New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628351TrueFalseFalse263049Albany County Legislature - District 11District11NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.9193636001
69437New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628373TrueFalseFalse263047Albany County Legislature - District 1District1NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:16.8213636001
69472New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628698TrueFalseFalse263055Albany County Legislature - District 17District17NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.1043636001
69475New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628732TrueFalseFalse263059Albany County Legislature - District 20District20NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.1533636001
69478New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628748TrueFalseFalse263084Albany County Legislature - District 8District8NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:19.0213636001
69479New YorkAlbanyAlbany County5225New York Primary Election2023-06-271628752TrueFalseFalse263061Albany County Legislature - District 22District22NoneNoneNYcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:19:50.8942023-04-05 14:30:17.8363636001
\n", + "
" + ], + "text/plain": [ + " state_name county_name raw_county election_id election_name election_day race_id is_primary is_runoff is_unexpired position_id position_name sub_area_name sub_area_value sub_area_name_secondary sub_area_value_secondary raw_state level tier is_judicial is_retention number_of_seats normalized_position_id normalized_position_name frequency reference_year partisan_type race_created_at race_updated_at state_id_fips county_id_fips\n", + "69298 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627002 True False False 263057 Albany County Legislature - District 19 District 19 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.020 36 36001\n", + "69299 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627008 True False False 263068 Albany County Legislature - District 29 District 29 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.781 36 36001\n", + "69300 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627040 True False False 263073 Albany County Legislature - District 33 District 33 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.884 36 36001\n", + "69301 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627041 True False False 263062 Albany County Legislature - District 23 District 23 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.854 36 36001\n", + "69303 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627061 True False False 263060 Albany County Legislature - District 21 District 21 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 
partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.781 36 36001\n", + "69304 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627064 True False False 263081 Albany County Legislature - District 5 District 5 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.044 36 36001\n", + "69305 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627066 True False False 263076 Albany County Legislature - District 36 District 36 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.966 36 36001\n", + "69307 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627082 True False False 263077 Albany County Legislature - District 37 District 37 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.875 36 36001\n", + "69308 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627091 True False False 263083 Albany County Legislature - District 7 District 7 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.032 36 36001\n", + "69313 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627123 True False False 263051 Albany County Legislature - District 13 District 13 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.890 36 36001\n", + "69314 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627124 True False False 263085 Albany County Legislature - District 9 District 9 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.382 36 36001\n", + "69319 New 
York Albany Albany County 5225 New York Primary Election 2023-06-27 1627156 True False False 263075 Albany County Legislature - District 35 District 35 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.882 36 36001\n", + "69320 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627162 True False False 263078 Albany County Legislature - District 38 District 38 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.959 36 36001\n", + "69321 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627164 True False False 263074 Albany County Legislature - District 34 District 34 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.015 36 36001\n", + "69325 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627261 True False False 263079 Albany County Legislature - District 39 District 39 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.951 36 36001\n", + "69326 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627266 True False False 263053 Albany County Legislature - District 15 District 15 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.927 36 36001\n", + "69328 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627305 True False False 263056 Albany County Legislature - District 18 District 18 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.942 36 36001\n", + "69340 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627385 
True False False 263082 Albany County Legislature - District 6 District 6 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.160 36 36001\n", + "69341 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627386 True False False 263063 Albany County Legislature - District 24 District 24 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.895 36 36001\n", + "69342 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627390 True False False 263064 Albany County Legislature - District 25 District 25 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.842 36 36001\n", + "69344 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627405 True False False 263066 Albany County Legislature - District 27 District 27 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.658 36 36001\n", + "69347 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627455 True False False 263072 Albany County Legislature - District 32 District 32 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.948 36 36001\n", + "69348 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627459 True False False 263080 Albany County Legislature - District 4 District 4 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.001 36 36001\n", + "69352 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627513 True False False 263070 Albany County Legislature - District 30 District 30 None 
None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.098 36 36001\n", + "69359 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627559 True False False 263065 Albany County Legislature - District 26 District 26 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.911 36 36001\n", + "69362 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627578 True False False 263048 Albany County Legislature - District 10 District 10 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.753 36 36001\n", + "69365 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627618 True False False 263058 Albany County Legislature - District 2 District 2 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.116 36 36001\n", + "69373 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1627670 True False False 263069 Albany County Legislature - District 3 District 3 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.922 36 36001\n", + "69408 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628063 True False False 263050 Albany County Legislature - District 12 District 12 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.948 36 36001\n", + "69418 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628210 True False False 263052 Albany County Legislature - District 14 District 14 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 
partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.967 36 36001\n", + "69419 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628220 True False False 263067 Albany County Legislature - District 28 District 28 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.712 36 36001\n", + "69428 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628319 True False False 263071 Albany County Legislature - District 31 District 31 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:18.989 36 36001\n", + "69430 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628324 True False False 263054 Albany County Legislature - District 16 District 16 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.007 36 36001\n", + "69434 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628351 True False False 263049 Albany County Legislature - District 11 District 11 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.919 36 36001\n", + "69437 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628373 True False False 263047 Albany County Legislature - District 1 District 1 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:16.821 36 36001\n", + "69472 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628698 True False False 263055 Albany County Legislature - District 17 District 17 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.104 36 36001\n", + "69475 
New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628732 True False False 263059 Albany County Legislature - District 20 District 20 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.153 36 36001\n", + "69478 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628748 True False False 263084 Albany County Legislature - District 8 District 8 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:19.021 36 36001\n", + "69479 New York Albany Albany County 5225 New York Primary Election 2023-06-27 1628752 True False False 263061 Albany County Legislature - District 22 District 22 None None NY county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:19:50.894 2023-04-05 14:30:17.836 36 36001" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commissioner_races.query(\"election_id == 5225 & county_id_fips == '36001'\")" + ] + }, + { + "cell_type": "markdown", + "id": "3b70a98c-bb9f-4073-9d80-76b50cc7a744", + "metadata": {}, + "source": [ + "Looks like there are multiple districts per county which makes sense." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "aef82113-d804-4a6c-b46c-8ff257200300", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
state_namecounty_nameraw_countyelection_idelection_nameelection_dayrace_idis_primaryis_runoffis_unexpiredposition_idposition_namesub_area_namesub_area_valuesub_area_name_secondarysub_area_value_secondaryraw_stateleveltieris_judicialis_retentionnumber_of_seatsnormalized_position_idnormalized_position_namefrequencyreference_yearpartisan_typerace_created_atrace_updated_atstate_id_fipscounty_id_fips
16152LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454013FalseFalseFalse239163Sabine Parish Police Juror - District 7District7NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
16274LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454034FalseFalseFalse239046Grant Parish Police Juror - District 4District4NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
16316LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454051FalseFalseFalse239260Vernon Parish Police Juror - District 8District8NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
16331LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454053FalseFalseFalse238931Bienville Parish Police Juror - District 4District4NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
16361LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454060FalseFalseFalse238928Bienville Parish Police Juror - District 1District1NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
................................................................................................
19504LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454761FalseFalseFalse239294Winn Parish Police Juror - District 6District6NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
19567LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454773FalseFalseFalse238933Bienville Parish Police Juror - District 6District6NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
19611LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454783FalseFalseFalse239137Rapides Parish Police Juror - District FDistrictFNoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
19687LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454804FalseFalseFalse239258Vernon Parish Police Juror - District 6District6NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
19802LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454837FalseFalseFalse239143Red River Parish Police Juror - District 3District3NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992021-09-30 19:37:57.4062222069
\n", + "

64 rows × 31 columns

\n", + "
" + ], + "text/plain": [ + " state_name county_name raw_county election_id election_name election_day race_id is_primary is_runoff is_unexpired position_id position_name sub_area_name sub_area_value sub_area_name_secondary sub_area_value_secondary raw_state level tier is_judicial is_retention number_of_seats normalized_position_id normalized_position_name frequency reference_year partisan_type race_created_at race_updated_at state_id_fips county_id_fips\n", + "16152 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454013 False False False 239163 Sabine Parish Police Juror - District 7 District 7 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "16274 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454034 False False False 239046 Grant Parish Police Juror - District 4 District 4 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "16316 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454051 False False False 239260 Vernon Parish Police Juror - District 8 District 8 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "16331 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454053 False False False 238931 Bienville Parish Police Juror - District 4 District 4 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "16361 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454060 False False False 238928 Bienville Parish Police Juror - District 1 District 1 None None LA 
county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "19504 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454761 False False False 239294 Winn Parish Police Juror - District 6 District 6 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "19567 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454773 False False False 238933 Bienville Parish Police Juror - District 6 District 6 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "19611 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454783 False False False 239137 Rapides Parish Police Juror - District F District F None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "19687 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454804 False False False 239258 Vernon Parish Police Juror - District 6 District 6 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "19802 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454837 False False False 239143 Red River Parish Police Juror - District 3 District 3 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2021-09-30 19:37:57.406 22 22069\n", + "\n", + "[64 rows x 31 columns]" + ] + }, + 
"execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "la_races = commissioner_races.query(\"election_id == 4718 & county_id_fips == '22069'\")\n", + "la_races" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f678512d-a064-434b-aac3-0216469528fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
state_namecounty_nameraw_countyelection_idelection_nameelection_dayrace_idis_primaryis_runoffis_unexpiredposition_idposition_namesub_area_namesub_area_valuesub_area_name_secondarysub_area_value_secondaryraw_stateleveltieris_judicialis_retentionnumber_of_seatsnormalized_position_idnormalized_position_namefrequencyreference_yearpartisan_typerace_created_atrace_updated_atstate_id_fipscounty_id_fips
17290LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454280FalseFalseFalse238799Natchitoches Parish Council - District 3District3NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992023-07-06 15:30:10.1772222069
17336LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454291FalseFalseFalse238801Natchitoches Parish Council - District 5District5NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992023-07-06 15:30:10.3612222069
18234LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454509FalseFalseFalse238797Natchitoches Parish Council - District 1District1NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992023-07-06 15:30:10.1072222069
18280LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454524FalseFalseFalse238800Natchitoches Parish Council - District 4District4NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992023-07-06 15:30:10.7402222069
18353LouisianaNatchitochesNatchitoches Parish4718Louisiana General Election2023-11-181454548FalseFalseFalse238798Natchitoches Parish Council - District 2District2NoneNoneLAcounty3FalseFalse1910County Legislature//Executive Board[4]2023partisan2020-01-14 23:12:39.1992023-07-06 15:30:09.8732222069
\n", + "
" + ], + "text/plain": [ + " state_name county_name raw_county election_id election_name election_day race_id is_primary is_runoff is_unexpired position_id position_name sub_area_name sub_area_value sub_area_name_secondary sub_area_value_secondary raw_state level tier is_judicial is_retention number_of_seats normalized_position_id normalized_position_name frequency reference_year partisan_type race_created_at race_updated_at state_id_fips county_id_fips\n", + "17290 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454280 False False False 238799 Natchitoches Parish Council - District 3 District 3 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2023-07-06 15:30:10.177 22 22069\n", + "17336 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454291 False False False 238801 Natchitoches Parish Council - District 5 District 5 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2023-07-06 15:30:10.361 22 22069\n", + "18234 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454509 False False False 238797 Natchitoches Parish Council - District 1 District 1 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2023-07-06 15:30:10.107 22 22069\n", + "18280 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454524 False False False 238800 Natchitoches Parish Council - District 4 District 4 None None LA county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2023-07-06 15:30:10.740 22 22069\n", + "18353 Louisiana Natchitoches Natchitoches Parish 4718 Louisiana General Election 2023-11-18 1454548 False False False 238798 Natchitoches Parish Council - District 2 District 2 None None LA 
county 3 False False 1 910 County Legislature//Executive Board [4] 2023 partisan 2020-01-14 23:12:39.199 2023-07-06 15:30:09.873 22 22069" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "la_races[la_races.position_name.str.contains(\"Natchitoches\")]" + ] + }, + { + "cell_type": "markdown", + "id": "8d053b85-e1d9-463d-9734-a195bf015346", + "metadata": {}, + "source": [ + "It looks like LA has some straight up duplicates?" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6324c5d6-d632-4134-ac49-c5e381d81192", + "metadata": {}, + "outputs": [], + "source": [ + "county_name_in_position = commissioner_races.apply(lambda row: row.county_name in row.position_name, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7aac74c7-0dca-4bbb-9c48-d19646297bc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True 23563\n", + "False 6005\n", + "dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "county_name_in_position.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3ed7a30a-2b37-46bd-bbb7-42462d407a70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
race_idcounty_namestate_nameraw_countyposition_name
7161371024BenewahIdahoBenewah CountyLatah County Commission - District 3
7171371024ClearwaterIdahoClearwater CountyLatah County Commission - District 3
7191371024Nez PerceIdahoNez Perce CountyLatah County Commission - District 3
7211371025CamasIdahoCamas CountyBlaine County Council - District 3
7221371025PowerIdahoPower CountyBlaine County Council - District 3
7241371026MinidokaIdahoMinidoka CountyCassia County Council - District 3
7251371027CaribouIdahoCaribou CountyFranklin County Council - District 3
7281371029IdahoIdahoIdaho CountyLewis County Council - District 3
7301371029Nez PerceIdahoNez Perce CountyLewis County Council - District 3
7311371030CanyonIdahoCanyon CountyPayette County Council - District 3
\n", + "
" + ], + "text/plain": [ + " race_id county_name state_name raw_county position_name\n", + "716 1371024 Benewah Idaho Benewah County Latah County Commission - District 3\n", + "717 1371024 Clearwater Idaho Clearwater County Latah County Commission - District 3\n", + "719 1371024 Nez Perce Idaho Nez Perce County Latah County Commission - District 3\n", + "721 1371025 Camas Idaho Camas County Blaine County Council - District 3\n", + "722 1371025 Power Idaho Power County Blaine County Council - District 3\n", + "724 1371026 Minidoka Idaho Minidoka County Cassia County Council - District 3\n", + "725 1371027 Caribou Idaho Caribou County Franklin County Council - District 3\n", + "728 1371029 Idaho Idaho Idaho County Lewis County Council - District 3\n", + "730 1371029 Nez Perce Idaho Nez Perce County Lewis County Council - District 3\n", + "731 1371030 Canyon Idaho Canyon County Payette County Council - District 3" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "incorrect_counties = commissioner_races[~county_name_in_position]\n", + "incorrect_counties[[\"race_id\", \"county_name\", \"state_name\", \"raw_county\", \"position_name\"]].head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "0b563bd1-99e2-48c2-8768-5c61e7721b2d", + "metadata": {}, + "source": [ + "- race_id = 1371024: Benewah, Clearwater, and Nez Perce are different counties than Latah county\n", + "- race_id (2778451 - 2778454): This one looks reasonable: nashville city council is in Davidson County.\n", + "- race_id = 1371025: camas and power are adjacent counties to Blaine.\n", + "- " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "831dd117-4c08-42e5-b9f3-e7bfc29ab0a6", + "metadata": {}, + "outputs": [], + "source": [ + "incorrect_counties.position_name.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aad66e6e-51ed-41ba-91ee-6b46b8bf9c9c", + "metadata": {}, + "outputs": [], 
+ "source": [ + "incorrect_counties.position_name.str.contains(\"-\").value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a56c2d5-bf9e-4592-8aa3-3605cd148beb", + "metadata": {}, + "outputs": [], + "source": [ + "# Janky string clustering\n", + "# incorrect_counties.position_name.str.split(\"-\").str[0].str.split(\" \").str[1:].str.join(\" \").value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26ee671f-225e-4db8-bf6c-7de7a77848e9", + "metadata": {}, + "outputs": [], + "source": [ + "incorrect_counties.position_name.str.contains(\"County|Parish\").value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "b64d9972-2699-457b-a147-574913b48e2c", + "metadata": {}, + "source": [ + "## Clean incorrect commissioner races" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5b7fd91-c778-447e-b62e-8237ad6bdd22", + "metadata": {}, + "outputs": [], + "source": [ + "county_name_in_position = commissioner_races.apply(lambda row: row.county_name in row.position_name, axis=1)\n", + "\n", + "commissioner_races_with_name = commissioner_races[county_name_in_position].copy()\n", + "commissioner_races_without_name = commissioner_races[~county_name_in_position].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb8b94a2-704f-4164-8126-3472498d9f07", + "metadata": {}, + "outputs": [], + "source": [ + "commissioner_races_without_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1971da07-6dc5-44ab-9aa5-37873ec863d3", + "metadata": {}, + "outputs": [], + "source": [ + "is_likely_incorrectly_geocoded_comissioner_races = commissioner_races_without_name.position_name.str.contains(\"County|Parish\")\n", + "is_likely_incorrectly_geocoded_comissioner_races.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac5e728c-53b3-47da-b2bf-1ffecb009991", + "metadata": {}, + "outputs": [], + "source": [ + 
"corrected_comissioner_races = commissioner_races_with_name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6741a43d-755a-4f00-ac2c-82c254f9cd47", + "metadata": {}, + "outputs": [], + "source": [ + "corrected_comissioner_races[~corrected_comissioner_races.position_name.str.contains(\"County Commission\")].sample(10)" + ] + }, + { + "cell_type": "markdown", + "id": "fa048c4a-4c14-4ce8-9659-409250f05e98", + "metadata": {}, + "source": [ + "- AS: Justice of the Peace\n", + "- LA: Police Juror\n", + "- WI: Board of Supervisors\n", + "- IL: County Board\n", + "- WA: County Council\n", + "- NY: County Legislature, County Legislature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9e08c6b-e24a-4733-a8dd-c89ed3e8235d", + "metadata": {}, + "outputs": [], + "source": [ + "corrected_comissioner_races.position_name.str.contains(\"City Council\").value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74b812b9-6105-4437-89e4-0617f7e6c93d", + "metadata": {}, + "outputs": [], + "source": [ + "corrected_comissioner_races = corrected_comissioner_races[~corrected_comissioner_races.position_name.str.contains(\"City Council\")]" + ] + }, + { + "cell_type": "markdown", + "id": "b70045cd-1206-4dc3-a1fb-34f2565762e3", + "metadata": {}, + "source": [ + "## relationships" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66ab35d0-aa33-4ff9-abd0-1cfc138eeed2", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_races = corrected_comissioner_races.groupby([\"election_id\", \"county_id_fips\"])\n", + "grouped_races.nunique().reference_year.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c17286a-e8d6-4045-ae1d-c588bcbb1f74", + "metadata": {}, + "outputs": [], + "source": [ + "grouped_races.nunique().reference_year[grouped_races.nunique().reference_year > 1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"948f9b24-3ace-4d8b-949d-c2e3b98082cb", + "metadata": {}, + "outputs": [], + "source": [ + "fields = [\"position_name\", \"election_name\", \"reference_year\"]\n", + "corrected_comissioner_races.query(\"election_id == 4245 & county_id_fips == '16001'\")[fields]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41bfde47-567a-4874-b883-bc4a9ca6a859", + "metadata": {}, + "outputs": [], + "source": [ + "mode = lambda x: x.value_counts().index[0]\n", + "\n", + "grp_fields = [\"election_id\", \"county_id_fips\", \"election_name\", \"election_day\"]\n", + "agg_funcs = {\n", + " \"position_id\": \"count\",\n", + " \"number_of_seats\": \"sum\",\n", + " \"position_name\": lambda x: ','.join(x),\n", + " \"frequency\": mode,\n", + " \"reference_year\": mode\n", + "}\n", + "\n", + "rename_dict = {\n", + " \"number_of_seats\": \"total_n_of_seats\",\n", + " \"position_id\": \"total_n_races\",\n", + " \"position_name\": \"all_race_names\",\n", + "}\n", + "\n", + "comissioner_elections = corrected_comissioner_races.groupby(grp_fields).agg(agg_funcs).reset_index()\n", + "comissioner_elections = comissioner_elections.rename(columns=rename_dict)\n", + "\n", + "assert ~comissioner_elections.duplicated(subset=[\"county_id_fips\", \"election_id\"]).any(), \"County comissioner election primary key is not unique.\"\n", + "assert (comissioner_elections.total_n_of_seats >= comissioner_elections.total_n_races).all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49b3441b-9c88-40ba-b4be-fe890be24328", + "metadata": {}, + "outputs": [], + "source": [ + "comissioner_elections" + ] + }, + { + "cell_type": "markdown", + "id": "8c2d15cf-3261-4d9f-90a6-196d50c70f4f", + "metadata": {}, + "source": [ + "## Create wide" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa13b3c1-9cb4-429c-9656-66c328815143", + "metadata": {}, + "outputs": [], + "source": [ + "from dbcp.data_mart.br_election_data import _create_br_election_data_mart, 
_create_county_commission_elections_long, _create_county_commission_elections_wide\n", + "import dbcp\n", + "\n", + "engine = dbcp.helpers.get_sql_engine()\n", + "\n", + "dfs = {}\n", + "\n", + "dfs[\"br_election_data\"] = _create_br_election_data_mart(engine)\n", + "\n", + "county_commission_elections_long = _create_county_commission_elections_long(\n", + " dfs[\"br_election_data\"]\n", + " )\n", + "\n", + "county_commission_elections_wide = _create_county_commission_elections_wide(\n", + " county_commission_elections_long\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b5d5c36-7c29-4697-bbe2-aa05a60cabb6", + "metadata": {}, + "outputs": [], + "source": [ + "county_commission_elections_wide" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66d2fec5-b7b9-4e11-8174-133cb2e27636", + "metadata": {}, + "outputs": [], + "source": [ + "e_type = \"primary\"\n", + "invalid = county_commission_elections_wide[f\"next_{e_type}_total_n_seats\"] >= county_commission_elections_wide[f\"next_{e_type}_total_n_races\"]\n", + "\n", + "invalid.all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a940a513-8336-432d-b4a1-df6a143ed7e0", + "metadata": {}, + "outputs": [], + "source": [ + "with engine.connect() as con:\n", + " df = pd.read_sql_table(\n", + " \"county_commission_election_info\", con, schema=\"data_mart\"\n", + " ).convert_dtypes()\n", + " \n", + "\n", + "assert (\n", + " county_commission_elections_wide.next_primary_total_n_seats >= county_commission_elections_wide.next_primary_total_n_races\n", + ").all(), \"Found more races than seats in county_commission_election_info!\"\n", + "assert (\n", + " df.next_general_total_n_seats >= df.next_general_total_n_races\n", + ").all(), \"Found more races than seats in county_commission_election_info!\"\n", + "assert (\n", + " df.next_run_off_total_n_seats >= df.next_run_off_total_n_races\n", + ").all(), \"Found more races than seats in 
county_commission_election_info!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abb6da3c-7ffb-4ac3-a15f-77b849be2b3e", + "metadata": {}, + "outputs": [], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8921acb-a7e6-46d2-bb6e-25882ba8c664", + "metadata": {}, + "outputs": [], + "source": [ + "invalid = ~(df.next_primary_total_n_seats >= df.next_primary_total_n_races)\n", + "\n", + "\n", + "df[invalid]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72969a5a-a832-4cd1-88a9-06c2cdc178c5", + "metadata": {}, + "outputs": [], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f85cbe8a-e81c-4ff7-8e55-8712b1dc1b8b", + "metadata": {}, + "outputs": [], + "source": [ + "county_commission_elections_wide.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16818fc4-2176-4853-819f-75f50e392e40", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10a3370e-d566-4291-b305-ba2d5b2d9a2f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ddf2cfa-2e2f-413e-9f3f-79e3bff75fca", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb04b59a-5b49-406d-bedb-f27cea0fc568", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ccef07f-3f26-4b4d-a7ae-55557bea5498", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "county_commission_elections_long[\"election_type\"] = pd.Series()\n", + "\n", + "county_commission_elections_long[\"election_type\"] = county_commission_elections_long.election_type.mask(county_commission_elections_long.is_primary, \"primary\")\n", + "county_commission_elections_long[\"election_type\"] = 
county_commission_elections_long.election_type.mask(county_commission_elections_long.is_runoff, \"run_off\")\n", + "county_commission_elections_long[\"election_type\"] = county_commission_elections_long[\"election_type\"].fillna(\"general\")\n", + "\n", + "county_commission_elections_long = county_commission_elections_long.drop(columns=[\"is_primary\", \"is_runoff\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40582a57-e5ea-4441-8c04-bda73391d261", + "metadata": {}, + "outputs": [], + "source": [ + "county_commission_elections_long[\"election_type\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e7f6268-a9da-483c-a1a2-67658609be57", + "metadata": {}, + "outputs": [], + "source": [ + "county_commission_elections_long" + ] + }, + { + "cell_type": "markdown", + "id": "6ec82c57-e35b-4006-8c07-50f2b0c1724c", + "metadata": {}, + "source": [ + "Can a county have multiple upcoming elections?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8f2983d-5e81-4012-a375-be4cb6b60e51", + "metadata": {}, + "outputs": [], + "source": [ + "for election_type in county_commission_elections_long.election_type.unique().tolist():\n", + " com_election_type = county_commission_elections_long.query(\"election_type == @election_type\")\n", + " print(com_election_type.groupby(\"county_id_fips\").election_id.nunique().value_counts())\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "9702a970-b882-46e2-bdc5-93ec236fbe86", + "metadata": {}, + "source": [ + "Yes!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "bf350810-9594-47ba-9bf8-8036b8ddb13c", + "metadata": {}, + "source": [ + "- select the closest upcoming election for each county/type\n", + "- pivot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8708f01d-51ee-4763-99bf-b17d8328fddc", + "metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_long = county_commission_elections_long.loc[county_commission_elections_long.groupby([\"county_id_fips\", \"election_type\"])[\"election_day\"].idxmax()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a324711-48a4-40e8-b2e0-33cc3fc6e046", + "metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_wide = next_county_commission_elections_long.pivot(index=[\"county_id_fips\",\"county_name\"], columns=[\"election_type\"])\n", + "\n", + "next_county_commission_elections_wide.columns = next_county_commission_elections_wide.swaplevel(axis=1).columns\n", + "next_county_commission_elections_wide = next_county_commission_elections_wide.sort_index(axis=1, level='election_type')\n", + "\n", + "next_county_commission_elections_wide.columns = next_county_commission_elections_wide.columns.map('_'.join)\n", + "next_county_commission_elections_wide.columns = [\"next_\" + col for col in next_county_commission_elections_wide.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b028d376-6eee-4865-801d-2a616f60adb4", + "metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_wide = next_county_commission_elections_wide.reset_index().convert_dtypes()\n", + "\n", + "assert next_county_commission_elections_wide.county_id_fips.is_unique, \"county_id_fips is not unique!\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec3efd84-55a8-4b02-b752-6e9561779a31", + "metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_wide.info()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "a66d257f-f757-45c0-9684-7d0599fe1a0a", + "metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_wide.next_run_off_election_id.isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aa599a8-e6c8-4f6e-be3c-6f7a171f6eac", + "metadata": {}, + "outputs": [], + "source": [ + "e_type = \"general\"\n", + "invalid = ~(next_county_commission_elections_wide[f\"next_{e_type}_total_n_seats\"] >= next_county_commission_elections_wide[f\"next_{e_type}_total_n_races\"])\n", + "next_county_commission_elections_wide[invalid]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e73f5d-2b57-41a5-b88f-4226500eb6d0", + "metadata": {}, + "outputs": [], + "source": [ + "\"next_general_total_n_seats\" == \"next_general_total_n_of_races\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cfb3976-baf9-4b13-8b6a-4aa5c53a1694", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c14d8454-46e5-4d02-bf50-ed753ef0d315", + "metadata": {}, + "outputs": [], + "source": [ + "data = {\n", + " ('A', 'X'): [1, 2, 3],\n", + " ('A', 'Y'): [4, 5, 6],\n", + " ('B', 'X'): [7, 8, 9],\n", + " ('B', 'Y'): [10, 11, 12],\n", + "}\n", + "\n", + "df = pd.DataFrame(data)\n", + "\n", + "# Concatenate the multi-index columns into a single index\n", + "df.columns = df.columns.map('_'.join)\n", + "\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "749155ad-0646-4164-a228-3783d6578d39", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1720fd6f-1f35-49eb-830d-dab87b98e9e6", + "metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_wide" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc7bbd30-b4bc-45d5-bbe2-a1b0cd20caeb", + 
"metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_wide.sort_index(axis=1, level='election_type').columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36646bdf-a903-41c3-8498-f56ebd547f20", + "metadata": {}, + "outputs": [], + "source": [ + "key=lambda x: x.map({'general': 1, 'primary': 2, 'runoff': 3}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87a2318c-521d-4719-8f1d-53255f654c9b", + "metadata": {}, + "outputs": [], + "source": [ + "list(next_county_commission_elections_wide)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71d9ee29-a057-46df-baec-b57944e30e8f", + "metadata": {}, + "outputs": [], + "source": [ + "next_county_commission_elections_wide.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7ec551e-7c83-4844-8ad3-07a2ba8321e5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89625c00-0145-4c25-af26-d31923986c85", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Create a sample DataFrame with multi-index columns\n", + "data = {\n", + " ('A', 'X'): [1, 2, 3],\n", + " ('A', 'Y'): [4, 5, 6],\n", + " ('B', 'X'): [7, 8, 9],\n", + " ('B', 'Y'): [10, 11, 12],\n", + "}\n", + "\n", + "df = pd.DataFrame(data)\n", + "\n", + "df.columns = df.columns.map('_'.join)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efaed5f1-4c23-4864-b58c-f37431fbe505", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40838292-2398-44d3-bb61-157831816fdf", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "623eb1fd-5ec0-4f70-ae11-d53dd1d30836", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"5a7e2fd2-355d-4cda-b89d-e7c688470ed0", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9026ad3f-c051-4645-b04d-99b091ef6573", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9517b78-42f3-4d17-8901-0c610eed2d1e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68eb2a8c-44ca-4354-aaf5-af230358739f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b2d29fa-d9db-4e7f-906e-bda81f166b6c", + "metadata": {}, + "outputs": [], + "source": [ + "generals.groupby(\"county_id_fips\").election_id.nunique().sort_values(ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db7efb13-0342-4c13-8b9d-b78eda1f0cd4", + "metadata": {}, + "outputs": [], + "source": [ + "generals.query(\"county_id_fips == '34027'\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8443d49-f3e0-4539-b452-fd64e254c5bf", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91a4ea61-c38c-41a3-b1d2-356a54713599", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a946eb1c-a975-431d-8bbf-02d507e4a56f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eadd8c13-4d70-4544-9e8e-369b3c9aad3c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "223c9af9-b432-451b-8bd1-966b7923db1e", + "metadata": {}, + "outputs": [], + "source": [ + "county_commission_elections_long.pivot(index=\"county_id_fips\", columns=[\"election_type\", \"election_name\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "9c3f39f4-8dcb-4bdf-b9a4-937c721d104a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/35-bdn-duplicate-raw-ballot-ready.ipynb b/notebooks/35-bdn-duplicate-raw-ballot-ready.ipynb new file mode 100644 index 00000000..ebdbdae2 --- /dev/null +++ b/notebooks/35-bdn-duplicate-raw-ballot-ready.ipynb @@ -0,0 +1,275 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "46b6a1ab-786a-4632-bc07-0bcbc79f93a0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/app/.local/lib/python3.10/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.3-CAPI-1.16.1). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/app/.local/lib/python3.10/site-packages/pudl/analysis/spatial.py:7: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. 
If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n" + ] + } + ], + "source": [ + "import dbcp" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4838a807-0f29-4c04-ac76-bcc39f6c870e", + "metadata": {}, + "outputs": [], + "source": [ + "source_uri = \"gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv\"\n", + "raw_df = dbcp.extract.ballot_ready.extract(source_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f671c540-12d0-4295-88aa-8d67730d5eb5", + "metadata": {}, + "outputs": [], + "source": [ + "raw_ballot_ready = raw_df[\"raw_ballot_ready\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1bde508d-217b-4475-b07d-0f171b185239", + "metadata": {}, + "outputs": [], + "source": [ + "ballot_ready = raw_ballot_ready.convert_dtypes()\n", + "\n", + "# Explode counties column\n", + "ballot_ready[\"counties\"] = (\n", + " ballot_ready.counties.str.replace('\"', \"\").str[1:-1].str.split(\", \")\n", + ")\n", + "\n", + "exp_ballot_ready = ballot_ready.explode(\"counties\").rename(\n", + " columns={\"counties\": \"county\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a42e77c8-4002-4979-9d51-29b84f89bf5b", + "metadata": {}, + "outputs": [], + "source": [ + "duplicate_race = exp_ballot_ready.duplicated(\n", + " subset=[\"county\", \"race_id\"], keep=False\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "07db98a7-1aca-4d27-bb25-e07f94b6ec33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 188780\n", + "True 20\n", + "dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"duplicate_race.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2cd0ebcd-4270-40ea-8264-1ab935d4ac55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "duplicate_race.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "73dd6752-26f3-4aa0-a95d-1a3f46fb47e1", + "metadata": {}, + "outputs": [], + "source": [ + "exp_ballot_ready[duplicate_race].to_csv(\"br_duplicate_raw_counties.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7e45a70c-c943-44b9-877d-2d88464a162d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idelection_idelection_nameelection_dayrace_idis_primaryis_runoffis_unexpiredposition_idposition_namesub_area_namesub_area_valuesub_area_name_secondarysub_area_value_secondarystateleveltieris_judicialis_retentionnumber_of_seatsnormalized_position_idnormalized_position_nameposition_descriptionfrequencyreference_yearpartisan_typecountiesrace_created_atrace_updated_at
21813710244245Idaho General Election2024-11-051371024FalseFalseFalse269679Latah County Commission - District 3District3NaNNaNIDcounty3FalseFalse1910County Legislature//Executive BoardThe County Legislature or Executive Board is t...[2, 4]2022partisan[\"Benewah County\", \"Clearwater County\", \"Latah...2020-01-14 23:08:36.6612020-01-14 23:08:36.661
\n", + "
" + ], + "text/plain": [ + " id election_id election_name election_day race_id is_primary is_runoff is_unexpired position_id position_name sub_area_name sub_area_value sub_area_name_secondary sub_area_value_secondary state level tier is_judicial is_retention number_of_seats normalized_position_id normalized_position_name position_description frequency reference_year partisan_type counties race_created_at race_updated_at\n", + "218 1371024 4245 Idaho General Election 2024-11-05 1371024 False False False 269679 Latah County Commission - District 3 District 3 NaN NaN ID county 3 False False 1 910 County Legislature//Executive Board The County Legislature or Executive Board is t... [2, 4] 2022 partisan [\"Benewah County\", \"Clearwater County\", \"Latah... 2020-01-14 23:08:36.661 2020-01-14 23:08:36.661" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_ballot_ready.query(\"race_id == 1371024\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c87ac13-1f9c-4a6b-aff1-2134fb8367f7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/36-tpb-ballot-ready-normalization.ipynb b/notebooks/36-tpb-ballot-ready-normalization.ipynb new file mode 100644 index 00000000..378130ad --- /dev/null +++ b/notebooks/36-tpb-ballot-ready-normalization.ipynb @@ -0,0 +1,1537 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "27b8d0af-63da-4b7e-8c03-ae2fec3d5b1e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "/app/.local/lib/python3.10/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.3-CAPI-1.16.1). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/app/.local/lib/python3.10/site-packages/pudl/analysis/spatial.py:7: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n", + "2023-09-12 03:57:46 [ INFO] catalystcoop.pudl.helpers:203 Assigned state FIPS codes for 100.00% of records.\n", + "2023-09-12 03:57:50 [ INFO] catalystcoop.pudl.helpers:219 Assigned county FIPS codes for 99.61% of records.\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import dbcp\n", + "\n", + "source_uri = \"gs://dgm-archive/ballot_ready/BallotReady_upcoming_races_with_counties_08_14_2023.csv\"\n", + "raw_dfs = dbcp.extract.ballot_ready.extract(source_uri)\n", + "raw_ballot_ready = raw_dfs[\"raw_ballot_ready\"]\n", + "br_election_data = dbcp.transform.ballot_ready._explode_counties(raw_ballot_ready)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5a67b7aa-9be1-4a9e-9787-02a968d452a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", 
+ "Int64Index: 188074 entries, 1543 to 82775\n", + "Data columns (total 29 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 election_id 188074 non-null Int64 \n", + " 1 election_name 188074 non-null string \n", + " 2 election_day 188074 non-null datetime64[ns]\n", + " 3 race_id 188074 non-null Int64 \n", + " 4 is_primary 188074 non-null boolean \n", + " 5 is_runoff 188074 non-null boolean \n", + " 6 is_unexpired 188074 non-null boolean \n", + " 7 position_id 188074 non-null Int64 \n", + " 8 position_name 188074 non-null string \n", + " 9 sub_area_name 114957 non-null string \n", + " 10 sub_area_value 125790 non-null string \n", + " 11 sub_area_name_secondary 11501 non-null string \n", + " 12 sub_area_value_secondary 12522 non-null string \n", + " 13 raw_state 188074 non-null string \n", + " 14 level 188074 non-null string \n", + " 15 tier 188074 non-null Int64 \n", + " 16 is_judicial 188074 non-null boolean \n", + " 17 is_retention 188074 non-null boolean \n", + " 18 number_of_seats 188074 non-null Int64 \n", + " 19 normalized_position_id 188074 non-null Int64 \n", + " 20 normalized_position_name 188074 non-null string \n", + " 21 frequency 188074 non-null string \n", + " 22 reference_year 188074 non-null Int64 \n", + " 23 partisan_type 188060 non-null string \n", + " 24 raw_county 188074 non-null object \n", + " 25 race_created_at 188074 non-null datetime64[ns]\n", + " 26 race_updated_at 188074 non-null datetime64[ns]\n", + " 27 state_id_fips 188074 non-null string \n", + " 28 county_id_fips 188074 non-null object \n", + "dtypes: Int64(7), boolean(5), datetime64[ns](3), object(2), string(12)\n", + "memory usage: 38.9+ MB\n" + ] + } + ], + "source": [ + "br_election_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a8870a44-0b83-44c4-b4be-e85e87e3431c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "election_id 298\n", + "election_name 189\n", + "election_day 
87\n", + "race_id 82760\n", + "is_primary 2\n", + "is_runoff 2\n", + "is_unexpired 2\n", + "position_id 37406\n", + "position_name 35306\n", + "sub_area_name 41\n", + "sub_area_value 2559\n", + "sub_area_name_secondary 18\n", + "sub_area_value_secondary 183\n", + "raw_state 51\n", + "level 5\n", + "tier 3\n", + "is_judicial 2\n", + "is_retention 2\n", + "number_of_seats 19\n", + "normalized_position_id 191\n", + "normalized_position_name 191\n", + "frequency 15\n", + "reference_year 8\n", + "partisan_type 3\n", + "raw_county 1878\n", + "race_created_at 27027\n", + "race_updated_at 40152\n", + "state_id_fips 51\n", + "county_id_fips 3143\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "0b7d489d", + "metadata": {}, + "source": [ + "### Frequency\n", + "Looks like it should be a position field, but there is one erroneous value. Apply a manual correction." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7acb7eaf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 37405\n", + "2 1\n", + "Name: frequency, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freq_group = br_election_data.groupby(\"position_id\")[\"frequency\"]\n", + "freq_group.nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "da2f2b13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "position_id\n", + "156594 [[4], [2]]\n", + "Name: frequency, dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freq_group.unique()[freq_group.nunique() > 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "045871f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
election_idelection_nameelection_dayrace_idis_primaryis_runoffis_unexpiredposition_idposition_namesub_area_namesub_area_valuesub_area_name_secondarysub_area_value_secondaryraw_stateleveltieris_judicialis_retentionnumber_of_seatsnormalized_position_idnormalized_position_namefrequencyreference_yearpartisan_typeraw_countyrace_created_atrace_updated_atstate_id_fipscounty_id_fips
783834317California General Election2024-11-052020782FalseFalseFalse156594San Jose City Mayor<NA><NA><NA><NA>CAcity3FalseFalse11500City Executive//Mayor[4]2024nonpartisanSanta Clara County2023-01-26 22:12:14.5442023-01-26 22:12:14.5440606085
783845367California Primary Election2024-03-052020783TrueFalseFalse156594San Jose City Mayor<NA><NA><NA><NA>CAcity3FalseFalse11500City Executive//Mayor[2]2022nonpartisanSanta Clara County2023-01-26 22:12:14.6462023-01-26 22:12:14.6460606085
\n", + "
" + ], + "text/plain": [ + " election_id election_name election_day race_id is_primary is_runoff is_unexpired position_id position_name sub_area_name sub_area_value sub_area_name_secondary sub_area_value_secondary raw_state level tier is_judicial is_retention number_of_seats normalized_position_id normalized_position_name frequency reference_year partisan_type raw_county race_created_at race_updated_at state_id_fips county_id_fips\n", + "78383 4317 California General Election 2024-11-05 2020782 False False False 156594 San Jose City Mayor CA city 3 False False 1 1500 City Executive//Mayor [4] 2024 nonpartisan Santa Clara County 2023-01-26 22:12:14.544 2023-01-26 22:12:14.544 06 06085\n", + "78384 5367 California Primary Election 2024-03-05 2020783 True False False 156594 San Jose City Mayor CA city 3 False False 1 1500 City Executive//Mayor [2] 2022 nonpartisan Santa Clara County 2023-01-26 22:12:14.646 2023-01-26 22:12:14.646 06 06085" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.query(\"position_id == 156594\")" + ] + }, + { + "cell_type": "markdown", + "id": "8e01b807", + "metadata": {}, + "source": [ + "Not sure if this one instance of a non-unique frequency is a Ballot Ready issue or expected."
+ ] + }, + { + "cell_type": "markdown", + "id": "9f9f578a", + "metadata": {}, + "source": [ + "### reference year" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fe1c5dc2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 37405\n", + "2 1\n", + "Name: reference_year, dtype: int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ref_group = br_election_data.groupby(\"position_id\")[\"reference_year\"]\n", + "ref_group.nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "adc897bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "position_id\n", + "156594 [2024, 2022]\n", + "Name: reference_year, dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ref_group.unique()[ref_group.nunique() > 1]" + ] + }, + { + "cell_type": "markdown", + "id": "cfa1fdf0", + "metadata": {}, + "source": [ + "OK, same problem as with frequency: it is position 156594 again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# manually assign a new position id\n", + "new_index = br_election_data.position_id.max() + 1\n", + "assert new_index not in br_election_data.position_id\n", + "br_election_data.loc[br_election_data.race_id == 2020783, \"position_id\"] = new_index" + ] + }, + { + "cell_type": "markdown", + "id": "87c956b2", + "metadata": {}, + "source": [ + "## Normalization" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "28aad8f5", + "metadata": {}, + "outputs": [], + "source": [ + "id_cols = [\"election_id\", \"position_id\", \"race_id\"]\n", + "levels = pd.concat([\n", + " br_election_data.groupby(id_col).nunique().le(1).all().rename(id_col) for id_col in id_cols\n", + " ], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9e684152", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
election_idposition_idrace_id
raw_stateTrueTrueTrue
state_id_fipsTrueTrueTrue
election_nameTrueFalseTrue
election_dayTrueFalseTrue
position_nameFalseTrueTrue
sub_area_nameFalseTrueTrue
sub_area_valueFalseTrueTrue
sub_area_name_secondaryFalseTrueTrue
sub_area_value_secondaryFalseTrueTrue
levelFalseTrueTrue
tierFalseTrueTrue
is_judicialFalseTrueTrue
is_retentionFalseTrueTrue
number_of_seatsFalseTrueTrue
normalized_position_idFalseTrueTrue
normalized_position_nameFalseTrueTrue
frequencyFalseTrueTrue
reference_yearFalseTrueTrue
partisan_typeFalseTrueTrue
is_primaryFalseFalseTrue
is_runoffFalseFalseTrue
is_unexpiredFalseFalseTrue
race_created_atFalseFalseTrue
race_updated_atFalseFalseTrue
raw_countyFalseFalseFalse
county_id_fipsFalseFalseFalse
\n", + "
" + ], + "text/plain": [ + " election_id position_id race_id\n", + "raw_state True True True\n", + "state_id_fips True True True\n", + "election_name True False True\n", + "election_day True False True\n", + "position_name False True True\n", + "sub_area_name False True True\n", + "sub_area_value False True True\n", + "sub_area_name_secondary False True True\n", + "sub_area_value_secondary False True True\n", + "level False True True\n", + "tier False True True\n", + "is_judicial False True True\n", + "is_retention False True True\n", + "number_of_seats False True True\n", + "normalized_position_id False True True\n", + "normalized_position_name False True True\n", + "frequency False True True\n", + "reference_year False True True\n", + "partisan_type False True True\n", + "is_primary False False True\n", + "is_runoff False False True\n", + "is_unexpired False False True\n", + "race_created_at False False True\n", + "race_updated_at False False True\n", + "raw_county False False False\n", + "county_id_fips False False False" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "levels.sort_values(id_cols, ascending=False).dropna() # dropna just removes the id fields themselves, which are 'missing' due to being in the index" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "54baf736", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# are all fields unique on some level?\n", + "levels.dropna().any(axis=1).all()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "69ca7359", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
election_idposition_idrace_id
race_idFalseFalseNaN
raw_countyFalseFalseFalse
county_id_fipsFalseFalseFalse
\n", + "
" + ], + "text/plain": [ + " election_id position_id race_id\n", + "race_id False False NaN\n", + "raw_county False False False\n", + "county_id_fips False False False" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# which ones fail?\n", + "levels.loc[~levels.any(axis=1)]" + ] + }, + { + "cell_type": "markdown", + "id": "8b17f290", + "metadata": {}, + "source": [ + "`race_id` is an ID, so it should fail in this test. Counties should be a m:m relationship with districts, which are either position level or maybe race level fields. I'm not sure which one because I'm not sure which (if either) encodes the temporal changes in geography due to redistricting. Have to test it. [update: it's position level. So it doesn't change over time, despite the fact that the underlying districts do.]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "19a4f91c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 82760\n", + "Name: race_id, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# a race is a specific instance of a position in an election. 
It is a m:m relationship between position and election.\n", + "br_election_data.groupby([\"election_id\", \"position_id\"])['race_id'].nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6ab203de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'election_day', 'election_name', 'raw_state', 'state_id_fips'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "election_fields = set(levels.loc[levels.election_id.fillna(False)].index)\n", + "election_fields" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "20531251", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'frequency',\n", + " 'is_judicial',\n", + " 'is_retention',\n", + " 'level',\n", + " 'normalized_position_id',\n", + " 'normalized_position_name',\n", + " 'number_of_seats',\n", + " 'partisan_type',\n", + " 'position_name',\n", + " 'reference_year',\n", + " 'sub_area_name',\n", + " 'sub_area_name_secondary',\n", + " 'sub_area_value',\n", + " 'sub_area_value_secondary',\n", + " 'tier'}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "position_fields = set(levels.loc[levels.position_id.fillna(False)].index) - election_fields - set(id_cols)\n", + "position_fields" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f657851c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'is_primary',\n", + " 'is_runoff',\n", + " 'is_unexpired',\n", + " 'race_created_at',\n", + " 'race_updated_at'}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "race_fields = set(levels.loc[levels.race_id.fillna(False)].index) - election_fields - position_fields - set(id_cols)\n", + "race_fields" + ] + }, + { + "cell_type": "markdown", + "id": "c140bcf8", + "metadata": {}, + "source": [ + "### Check 
geography relationships\n", + "It turns out that counties are consistent between races for the same position. So it is a position-level attribute." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bdd4cf06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(82760,)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counties_by_race = br_election_data.groupby(id_cols[1:])['county_id_fips'].agg(lambda x: set(x.unique()))\n", + "counties_by_race.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4b61be82", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "position_id race_id\n", + "2 1472432 {02070, 02013, 02180, 02063, 02158, 02195, 022...\n", + "3 1536258 {01079, 01057, 01025, 01067, 01065, 01075, 010...\n", + " 1729666 {01079, 01057, 01025, 01067, 01065, 01075, 010...\n", + "4 1446390 {04021, 04013, 04027, 04015, 04017, 04019, 040...\n", + "5 1377495 {05039, 05109, 05149, 05147, 05043, 05023, 050...\n", + "Name: county_id_fips, dtype: object" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counties_by_race.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "272c3f65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 37407\n", + "Name: county_id_fips, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Do counties differ within a position group? 
[no.]\n", + "from functools import reduce\n", + "diffs = counties_by_race.groupby(level='position_id').agg(lambda x: reduce(set.union, x) - reduce(set.intersection, x))\n", + "diffs.apply(len).value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "01a985ca-7f60-42ba-b3b7-71a7a4f7660d", + "metadata": {}, + "source": [ + "## Normalize" + ] + }, + { + "cell_type": "markdown", + "id": "6114388c-7179-46a7-8a60-2ae34c6b2510", + "metadata": {}, + "source": [ + "### Elections\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "411c7b67-855e-4181-88e9-4aa034d1cbde", + "metadata": {}, + "outputs": [], + "source": [ + "br_elections = br_election_data.drop_duplicates(subset='election_id')[list(election_fields) + ['election_id']].copy()\n", + "\n", + "assert br_elections.duplicated(subset=list(election_fields)).sum() == 0" + ] + }, + { + "cell_type": "markdown", + "id": "61237f44-1a90-46bb-bb2e-a31be82d0b5c", + "metadata": {}, + "source": [ + "### Positions" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9f3117c1", + "metadata": {}, + "outputs": [], + "source": [ + "br_positions = br_election_data.drop_duplicates(subset='position_id')[list(position_fields) + ['position_id']].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "b06f3db2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(37407, 16)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_positions.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1e5c0ddc", + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[24], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[39massert\u001b[39;00m br_positions\u001b[39m.\u001b[39mduplicated(subset\u001b[39m=\u001b[39m\u001b[39mlist\u001b[39m(position_fields))\u001b[39m.\u001b[39msum() \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "assert br_positions.duplicated(subset=list(position_fields)).sum() == 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d76b4925-92d8-4390-bddc-d4e7cf5453c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
is_judicialposition_namesub_area_namesub_area_value_secondarysub_area_valueis_retentiontierfrequencysub_area_name_secondarynumber_of_seatsnormalized_position_namereference_yearnormalized_position_idpartisan_typelevel
515FalseAdair County Sheriff<NA><NA><NA>False3[4]<NA>1County Sheriff2024980partisancounty
10357FalseAdair County Sheriff<NA><NA><NA>False3[4]<NA>1County Sheriff2024980partisancounty
21489FalseAdair County Sheriff<NA><NA><NA>False3[4]<NA>1County Sheriff2024980partisancounty
21175FalseAdams County Auditor<NA><NA><NA>False3[4]<NA>1County Auditor2024930partisancounty
21522FalseAdams County Auditor<NA><NA><NA>False3[4]<NA>1County Auditor2024930partisancounty
................................................
64396TrueFlorida Appeals Court Judge - District 1 (Reta...District<NA>1True2[6]<NA>1State Appellate Court Justice - Retention20244052nonpartisanstate
55388TrueLincoln County Probate Judge<NA><NA><NA>False3[4]<NA>1County Court Judge - Probate//County Court Jud...20244475partisancounty
82303TrueLincoln County Probate Judge<NA><NA><NA>False3[4]<NA>1County Court Judge - Probate//County Court Jud...20244475partisancounty
65265TrueNew York Supreme Court - District 10District<NA>10False3[14]<NA>1State Trial Court Judge - General20234027partisanstate
65273TrueNew York Supreme Court - District 10District<NA>10False3[14]<NA>1State Trial Court Judge - General20234027partisanstate
\n", + "

2106 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " is_judicial position_name sub_area_name sub_area_value_secondary sub_area_value is_retention tier frequency sub_area_name_secondary number_of_seats normalized_position_name reference_year normalized_position_id partisan_type level\n", + "515 False Adair County Sheriff False 3 [4] 1 County Sheriff 2024 980 partisan county\n", + "10357 False Adair County Sheriff False 3 [4] 1 County Sheriff 2024 980 partisan county\n", + "21489 False Adair County Sheriff False 3 [4] 1 County Sheriff 2024 980 partisan county\n", + "21175 False Adams County Auditor False 3 [4] 1 County Auditor 2024 930 partisan county\n", + "21522 False Adams County Auditor False 3 [4] 1 County Auditor 2024 930 partisan county\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "64396 True Florida Appeals Court Judge - District 1 (Reta... District 1 True 2 [6] 1 State Appellate Court Justice - Retention 2024 4052 nonpartisan state\n", + "55388 True Lincoln County Probate Judge False 3 [4] 1 County Court Judge - Probate//County Court Jud... 2024 4475 partisan county\n", + "82303 True Lincoln County Probate Judge False 3 [4] 1 County Court Judge - Probate//County Court Jud... 2024 4475 partisan county\n", + "65265 True New York Supreme Court - District 10 District 10 False 3 [14] 1 State Trial Court Judge - General 2023 4027 partisan state\n", + "65273 True New York Supreme Court - District 10 District 10 False 3 [14] 1 State Trial Court Judge - General 2023 4027 partisan state\n", + "\n", + "[2106 rows x 15 columns]" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# is normalized_position_id the de-duplicated version of position_id? 
[Update: no]\n", + "br_positions.loc[br_positions.duplicated(subset=list(position_fields), keep=False),:].sort_values(list(position_fields))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "5ab991a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 41\n", + "2 20\n", + "4 11\n", + "5 8\n", + "3 8\n", + " ..\n", + "282 1\n", + "1535 1\n", + "467 1\n", + "36 1\n", + "141 1\n", + "Name: position_id, Length: 84, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.groupby('normalized_position_id')['position_id'].nunique().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c531b35f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "frequency False\n", + "position_name False\n", + "sub_area_value_secondary False\n", + "is_retention True\n", + "reference_year False\n", + "number_of_seats False\n", + "sub_area_value False\n", + "sub_area_name_secondary False\n", + "level False\n", + "tier True\n", + "normalized_position_name True\n", + "partisan_type False\n", + "is_judicial True\n", + "normalized_position_id True\n", + "sub_area_name False\n", + "dtype: bool" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_election_data.groupby('normalized_position_id')[list(position_fields)].nunique().le(1).all()" + ] + }, + { + "cell_type": "markdown", + "id": "306b8352", + "metadata": {}, + "source": [ + "I guess I'll just leave the dupes in. The IDs are unique, and I need to be able to define them." 
+ ] + }, + { + "cell_type": "markdown", + "id": "5f87b801-d419-432d-bdb3-a64e0041b2f5", + "metadata": {}, + "source": [ + "### Races" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e61112a0-2ce0-4d52-a7ed-9edda948d40a", + "metadata": {}, + "outputs": [], + "source": [ + "race_fields_ids = race_fields | set(id_cols)\n", + "br_position_election_assoc = br_election_data.drop_duplicates('race_id')[list(race_fields_ids)].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "58e460fa-e018-4d55-91ec-457e2ea3cd70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(82760, 8)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_position_election_assoc.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "36b175a6", + "metadata": {}, + "outputs": [], + "source": [ + "assert br_position_election_assoc.duplicated(subset=race_fields_ids).sum() == 0" + ] + }, + { + "cell_type": "markdown", + "id": "71703591", + "metadata": {}, + "source": [ + "### Position : Counties" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "012dd2cc", + "metadata": {}, + "outputs": [], + "source": [ + "br_position_county_assoc = br_election_data.groupby(['position_id', 'county_id_fips'], as_index=False)['raw_county'].first()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "eb03693b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(90751, 3)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "br_position_county_assoc.shape" + ] + }, + { + "cell_type": "markdown", + "id": "f65f34cb-569e-4fa5-8ebe-f4b3f74f1ca0", + "metadata": {}, + "source": [ + "## Test" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "405170cc", + "metadata": {}, + "outputs": [], + "source": [ + "# check all columns are accounted for\n", + 
"assert set(br_election_data.columns).symmetric_difference(set(id_cols) | election_fields | position_fields | race_fields | {'raw_county', 'county_id_fips'}) == set()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/dbcp/data_mart/br_election_data.py b/src/dbcp/data_mart/br_election_data.py index c3306ee2..bbcda480 100644 --- a/src/dbcp/data_mart/br_election_data.py +++ b/src/dbcp/data_mart/br_election_data.py @@ -8,61 +8,154 @@ def _create_br_election_data_mart(engine: sa.engine.Engine) -> pd.DataFrame: - """Create a minimally transformed data mart table for Ballot Ready data.""" - query = """ + """Denormalize the ballot ready entities.""" + pos_county_query = """ SELECT cfips.county_name, sfips.state_name, br.* - FROM data_warehouse.br_election_data as br + FROM data_warehouse.br_positions_counties_assoc as br LEFT JOIN data_warehouse.county_fips as cfips USING (county_id_fips) LEFT JOIN data_warehouse.state_fips as sfips ON sfips.state_id_fips = br.state_id_fips """ with engine.connect() as con: - df = pd.read_sql(query, con) - return df + br_races = pd.read_sql_table("br_races", con, schema="data_warehouse") + br_elections = pd.read_sql_table("br_elections", con, schema="data_warehouse") + br_positions = pd.read_sql_table("br_positions", con, schema="data_warehouse") + br_positions_counties_assoc = pd.read_sql(pos_county_query, con) + br_election_data = br_races.merge( + br_elections, how="left", on="election_id", validate="m:1" + ) + br_election_data = br_election_data.merge( + br_positions, how="left", on="position_id", validate="m:1" + ) + br_election_data 
= br_election_data.merge( + br_positions_counties_assoc, how="left", on="position_id", validate="m:m" + ) + return br_election_data -def _create_county_commission_election_info(engine: sa.engine.Engine) -> pd.DataFrame: + +def _create_county_commission_elections_long( + br_election_data: pd.DataFrame, +) -> pd.DataFrame: """Create a data mart of county commission elections.""" - # Each row in this query describe an election in a county. - # I select the maximum frequency and reference_year because they describes a position, not an election. - query = """ - WITH - county_commission_positions AS ( - SELECT - county_id_fips, - election_id, - election_name, - election_day, - SUM(number_of_seats) AS total_n_of_seats, - COUNT(position_id) AS total_n_races, - STRING_AGG(position_name, ',') AS all_race_names, - MAX(reference_year) AS reference_year, - MAX(frequency) AS frequency - FROM - data_warehouse.br_election_data - WHERE - tier > 2 - AND is_judicial = FALSE - AND normalized_position_id IN (910, - 912) - GROUP BY 1, 2, 3, 4 - ORDER BY 3, 1 + commissioner_races = br_election_data.query( + "tier > 2 & is_judicial == False & normalized_position_id in (910,912)" ) - SELECT - cfips.county_name, - ccp.* - FROM county_commission_positions AS ccp - LEFT JOIN data_warehouse.county_fips AS cfips - USING (county_id_fips) - """ - with engine.connect() as con: - county_commission_election_info = pd.read_sql_query(query, con) - return county_commission_election_info + county_name_in_position = commissioner_races.apply( + lambda row: row.county_name in row.position_name, axis=1 + ) + # I think ballot ready incorrectly geocoded some races. For example, + # race_id = 1371024: Benewah, Clearwater, and Nez Perce have elections + # for Latah county comissioners. 
+ corrected_comissioner_races = commissioner_races[county_name_in_position].copy() + + # Remove city council races for now + corrected_comissioner_races = corrected_comissioner_races[ + ~corrected_comissioner_races.position_name.str.contains("City Council") + ] + + # Aggregate + mode = lambda x: x.value_counts().index[0] # noqa: E731 + + grp_fields = [ + "election_id", + "county_id_fips", + "county_name", + "election_name", + "election_day", + "is_primary", + "is_runoff", + ] + agg_funcs = { + "position_id": "count", + "number_of_seats": "sum", + "position_name": lambda x: ",".join(x), + "frequency": mode, # frequency describes position, not an election so we select the mode + "reference_year": mode, # reference_year describes a position, not an election, so we select the mode + } + + rename_dict = { + "number_of_seats": "total_n_seats", + "position_id": "total_n_races", + "position_name": "all_race_names", + } + + comissioner_elections = ( + corrected_comissioner_races.groupby(grp_fields).agg(agg_funcs).reset_index() + ) + comissioner_elections = comissioner_elections.rename(columns=rename_dict) + + assert ~comissioner_elections.duplicated( + subset=["county_id_fips", "election_id"] + ).any(), "County comissioner election primary key is not unique." + assert ( + comissioner_elections.total_n_seats >= comissioner_elections.total_n_races + ).all(), "Number of seats should always be greater or equal to number of races in a county." 
+ return comissioner_elections + + +def _create_county_commission_elections_wide( + county_commission_elections_long: pd.DataFrame, +) -> pd.DataFrame: + """Create a dataframe of county commissioner races where each row is a county with columns for the general, primary and run-off elections.""" + # Create election_type column to pivot on + county_commission_elections_long["election_type"] = pd.Series() + county_commission_elections_long[ + "election_type" + ] = county_commission_elections_long.election_type.mask( + county_commission_elections_long.is_primary, "primary" + ) + county_commission_elections_long[ + "election_type" + ] = county_commission_elections_long.election_type.mask( + county_commission_elections_long.is_runoff, "run_off" + ) + county_commission_elections_long[ + "election_type" + ] = county_commission_elections_long["election_type"].fillna("general") + county_commission_elections_long = county_commission_elections_long.drop( + columns=["is_primary", "is_runoff"] + ) + + # Keep one election per county and election type: the row with the latest election_day (idxmax). TODO: confirm this is the intended "next" election + next_county_commission_elections_long = county_commission_elections_long.loc[ + county_commission_elections_long.groupby(["county_id_fips", "election_type"])[ + "election_day" + ].idxmax() + ] + + # Pivot and rename columns + next_county_commission_elections_wide = next_county_commission_elections_long.pivot( + index=["county_id_fips", "county_name"], columns=["election_type"] + ) + + next_county_commission_elections_wide.columns = ( + next_county_commission_elections_wide.swaplevel(axis=1).columns + ) + next_county_commission_elections_wide = ( + next_county_commission_elections_wide.sort_index(axis=1, level="election_type") + ) + + next_county_commission_elections_wide.columns = ( + next_county_commission_elections_wide.columns.map("_".join) + ) + next_county_commission_elections_wide.columns = [ + "next_" + col for col in next_county_commission_elections_wide.columns + ] + + next_county_commission_elections_wide = 
( + next_county_commission_elections_wide.reset_index().convert_dtypes() + ) + + assert ( + next_county_commission_elections_wide.county_id_fips.is_unique + ), "county_id_fips is not unique!" + return next_county_commission_elections_wide def create_data_mart( @@ -83,7 +176,11 @@ def create_data_mart( dfs = {} dfs["br_election_data"] = _create_br_election_data_mart(engine) - dfs["county_commission_election_info"] = _create_county_commission_election_info( - engine + + county_commission_elections_long = _create_county_commission_elections_long( + dfs["br_election_data"] + ) + dfs["county_commission_election_info"] = _create_county_commission_elections_wide( + county_commission_elections_long ) return dfs diff --git a/src/dbcp/data_mart/counties.py b/src/dbcp/data_mart/counties.py index 77e9c743..ceea17f4 100644 --- a/src/dbcp/data_mart/counties.py +++ b/src/dbcp/data_mart/counties.py @@ -345,7 +345,7 @@ def _fossil_infrastructure_counties(engine: sa.engine.Engine) -> pd.DataFrame: # sum(co2e_tonnes_per_year) as co2e_tonnes_per_year, # sum(pm2_5_tonnes_per_year) as pm2_5_tonnes_per_year, # sum(nox_tonnes_per_year) as nox_tonnes_per_year, - # 'power plant' as facility_type, + # 'fossil infrastructure' as facility_type, # 'proposed' as status # from data_mart.fossil_infrastructure_projects # group by 1, 2 diff --git a/src/dbcp/metadata/data_mart.py b/src/dbcp/metadata/data_mart.py index 3a18cb90..74c916a5 100644 --- a/src/dbcp/metadata/data_mart.py +++ b/src/dbcp/metadata/data_mart.py @@ -415,13 +415,13 @@ br_election_data = Table( "br_election_data", metadata, + Column("race_id", Integer, nullable=False, primary_key=True), + Column("raw_county", String, nullable=False, primary_key=True), Column("state_name", String, nullable=False), Column("county_name", String), - Column("raw_county", String, nullable=False, primary_key=True), Column("election_id", Integer, nullable=False), Column("election_name", String, nullable=False), Column("election_day", DateTime, 
nullable=False), - Column("race_id", Integer, nullable=False, primary_key=True), Column("is_primary", Boolean, nullable=False), Column("is_runoff", Boolean, nullable=False), Column("is_unexpired", Boolean, nullable=False), @@ -452,15 +452,31 @@ county_commission_election_info = Table( "county_commission_election_info", metadata, - Column("county_name", String, nullable=False), Column("county_id_fips", String, nullable=False, primary_key=True), - Column("election_id", Integer, nullable=False, primary_key=True), - Column("election_name", String, nullable=False), - Column("election_day", DateTime, nullable=False), - Column("total_n_of_seats", Integer, nullable=False), - Column("total_n_races", Integer, nullable=False), - Column("all_race_names", String, nullable=False), - Column("frequency", String, nullable=False), - Column("reference_year", Integer, nullable=False), + Column("county_name", String, nullable=False), + Column("next_general_election_id", Integer), + Column("next_general_election_name", String), + Column("next_general_election_day", DateTime), + Column("next_general_total_n_seats", Integer), + Column("next_general_total_n_races", Integer), + Column("next_general_all_race_names", String), + Column("next_general_frequency", String), + Column("next_general_reference_year", Integer), + Column("next_primary_election_id", Integer), + Column("next_primary_election_name", String), + Column("next_primary_election_day", DateTime), + Column("next_primary_total_n_seats", Integer), + Column("next_primary_total_n_races", Integer), + Column("next_primary_all_race_names", String), + Column("next_primary_frequency", String), + Column("next_primary_reference_year", Integer), + Column("next_run_off_election_id", Integer), + Column("next_run_off_election_name", String), + Column("next_run_off_election_day", DateTime), + Column("next_run_off_total_n_seats", Integer), + Column("next_run_off_total_n_races", Integer), + Column("next_run_off_all_race_names", String), + 
# Normalized Ballot Ready entities. Each table below has a single-column
# primary key except the position/county association table, which uses a
# composite key.

# Elections, keyed by election_id.
br_elections = Table(
    "br_elections",
    metadata,
    Column("election_id", Integer, nullable=False, primary_key=True),
    Column("election_name", String, nullable=False),
    Column("election_day", DateTime, nullable=False),
    schema=schema,
)

# Positions, keyed by position_id. The sub_area_* and partisan_type columns
# are left nullable; every other column is required.
br_positions = Table(
    "br_positions",
    metadata,
    Column("position_id", Integer, nullable=False, primary_key=True),
    Column("position_name", String, nullable=False),
    Column("reference_year", Integer, nullable=False),
    Column("sub_area_name", String),
    Column("sub_area_value", String),
    Column("sub_area_name_secondary", String),
    Column("sub_area_value_secondary", String),
    Column("level", String, nullable=False),
    Column("tier", Integer, nullable=False),
    Column("is_judicial", Boolean, nullable=False),
    Column("is_retention", Boolean, nullable=False),
    Column("normalized_position_id", Integer, nullable=False),
    Column("normalized_position_name", String, nullable=False),
    Column("frequency", String, nullable=False),
    Column("partisan_type", String),
    schema=schema,
)

# Races, keyed by race_id. Each race references exactly one election and one
# position through required foreign keys.
br_races = Table(
    "br_races",
    metadata,
    Column("race_id", Integer, nullable=False, primary_key=True),
    Column("is_primary", Boolean, nullable=False),
    Column("is_runoff", Boolean, nullable=False),
    Column("is_unexpired", Boolean, nullable=False),
    Column("number_of_seats", Integer, nullable=False),
    Column("race_created_at", DateTime, nullable=False),
    Column("race_updated_at", DateTime, nullable=False),
    Column(
        "election_id",
        Integer,
        ForeignKey("data_warehouse.br_elections.election_id"),
        nullable=False,
    ),
    Column(
        "position_id",
        Integer,
        ForeignKey("data_warehouse.br_positions.position_id"),
        nullable=False,
    ),
    schema=schema,
)

# Association between positions and counties. The composite primary key is
# (position_id, raw_county).
br_positions_counties_assoc = Table(
    "br_positions_counties_assoc",
    metadata,
    Column(
        "position_id",
        Integer,
        ForeignKey("data_warehouse.br_positions.position_id"),
        nullable=False,
        primary_key=True,
    ),
    Column(
        "raw_county", String, nullable=False, primary_key=True
    ),  # Can't use county_id_fips because Connecticut changed its county system recently
    Column("raw_state", String, nullable=False),
    Column(
        "state_id_fips",
        String,
        ForeignKey("data_warehouse.state_fips.state_id_fips"),
        nullable=False,
    ),
    Column(
        "county_id_fips",
        String,
        ForeignKey("data_warehouse.county_fips.county_id_fips"),
        nullable=True,
    ),  # Should not be nullable in future updates
    schema=schema,
)
DATETIME_COLUMNS = ["race_created_at", "race_updated_at", "election_day"] -def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: - """Clean the Ballot Ready data. - - Transformations include: - * Correct datatypes - * Explode counties columns +def _normalize_entities(ballot_ready: pd.DataFrame) -> dict[str, pd.DataFrame]: + """Normalize ballot ready data into elections, position and race entities. Args: - raw_dfs: dictionary of dataframe names to raw dataframes. + ballot_ready: Exploded, lightly cleaned raw data. - Returns - trns_dfs: dictionary of dataframe names to cleaned dataframes. + Returns: + trns_dfs: dataframes for elecitons, positions and races. + """ + trns_dfs = {} + # Elections + election_pk_fields = ["election_id"] + + election_fields = [ + "election_id", + "election_name", + "election_day", + ] + assert ( + (ballot_ready.groupby(election_pk_fields)[election_fields].nunique() <= 1) + .all() + .all() + ), "There is duplicate entity information in the elections dataframe." + + br_elections = ballot_ready.drop_duplicates(subset=election_pk_fields)[ + election_fields + ].copy() + assert br_elections.election_id.is_unique, "election_id is not unique." 
+ + trns_dfs["br_elections"] = br_elections + + # Positions + position_pk_fields = ["position_id"] + + position_fields = [ + "reference_year", + "position_id", + "position_name", + "sub_area_name", + "sub_area_value", + "sub_area_name_secondary", + "sub_area_value_secondary", + "level", + "tier", + "is_judicial", + "is_retention", + "normalized_position_id", + "normalized_position_name", + "frequency", + "partisan_type", + ] + # position_id == 156594 is the only position with two frequencies and reference_years + # Create a new index for it + new_index = ballot_ready.position_id.max() + 1 + assert new_index not in ballot_ready.position_id + ballot_ready.loc[ballot_ready.race_id == 2020783, "position_id"] = new_index + assert ( + (ballot_ready.groupby(position_pk_fields)[position_fields].nunique() <= 1) + .all() + .all() + ), "There is duplicate entity information in the positions dataframe." + br_positions = ballot_ready.drop_duplicates(subset=position_pk_fields)[ + position_fields + ].copy() + trns_dfs["br_positions"] = br_positions + + # Races + race_pk_fields = ["race_id"] + + race_fields = [ + "race_id", + "is_primary", + "is_runoff", + "is_unexpired", + "number_of_seats", + "race_created_at", + "race_updated_at", + ] + assert ( + (ballot_ready.groupby(race_pk_fields)[race_fields].nunique() <= 1).all().all() + ), "There is duplicate entity informaiton in the races table." + # Add some one to many fields to the races table dataframe. + race_fields += [ + "election_id", + "position_id", + ] + br_races = ballot_ready.drop_duplicates(subset=race_pk_fields)[race_fields].copy() + assert len(br_races) < len(ballot_ready) + assert br_races.race_id.is_unique, "race_id is not unique!" 
+ + trns_dfs["br_races"] = br_races + + # Create a county and position association table + position_counties_fields = [ + "position_id", + "county_id_fips", + "raw_county", + "state_id_fips", + "raw_state", + ] + trns_dfs["br_positions_counties_assoc"] = ballot_ready.drop_duplicates( + subset=["position_id", "county_id_fips"] + )[position_counties_fields].copy() + return trns_dfs + + +def _explode_counties(raw_ballot_ready: pd.DataFrame) -> pd.DataFrame: + """Correct datatypes and explode counties columns. + + Args: + raw_ballot_ready: raw ballot ready data. + Returns: + ballot_ready: lightly cleaned and exploded dataframe. """ - ballot_ready = raw_dfs["raw_ballot_ready"] # Correct datatypes - ballot_ready = ballot_ready.convert_dtypes() + ballot_ready = raw_ballot_ready.convert_dtypes() for col in DATETIME_COLUMNS: ballot_ready[col] = pd.to_datetime(ballot_ready[col]) @@ -39,7 +139,7 @@ def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: ) # Initial batch of raw data has duplicates in counties assert ( - duplicate_race.sum() <= 506 + duplicate_race.sum() <= 20 ), "Found more duplicate county/race combinations that expected." # Drop duplicates. A later version of ballot ready data will remedy this problem. @@ -53,12 +153,51 @@ def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: ] = "La Salle Parish" ballot_ready = add_fips_ids(ballot_ready) + # Valdez-Cordova Census Area was split into two areas in 2019 + # https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes/2010.html + # All elections are state and federal level so I will duplicate the races for the two new census areas + valdez = ballot_ready.query("county_id_fips == '02261'") + assert valdez.level.isin( + ["state", "federal"] + ).all(), "Found a local election in the Valdez-Cordova Census Area!" 
def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
    """Clean the Ballot Ready data and split it into entity tables.

    The ``raw_ballot_ready`` dataframe is passed through ``_explode_counties``
    and the result is handed to ``_normalize_entities``.

    Args:
        raw_dfs: dictionary of dataframe names to raw dataframes. Must contain
            the key ``raw_ballot_ready``.

    Returns:
        trns_dfs: dictionary of dataframe names to cleaned dataframes, as
            produced by ``_normalize_entities``.
    """
    exploded = _explode_counties(raw_dfs["raw_ballot_ready"])
    normalized = _normalize_entities(exploded)
    return normalized