diff --git a/README.md b/README.md index 6f1c0d62..fd0ca66f 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,39 @@ conda activate dbcp-dev This conda environment has python, pip and pre-commit installed in it. This env is just for running pre-commits, the actual ETL development happens in docker. +## GCP Authentication + +The ETL requires access to some data stored in Google Cloud Platform (GCP). +To authenticate the docker container with GCP, install the [gcloud utilities](https://cloud.google.com/sdk/docs/install) on your +computer. There are several ways to do this. We recommend using ``conda`` or its faster +sibling ``mamba``. If you're not using ``conda`` environments, there are other +ways to install the Google Cloud SDK explained in the link above. + +``` +conda install -c conda-forge google-cloud-sdk +``` + +Then use ``gcloud`` to establish application default credentials: + +``` +gcloud auth application-default login +``` + +This will send you to an authentication page in your default browser. Once +authenticated, the command should print out a message: + +``` +Credentials saved to file: +``` + +Set the `GOOGLE_APPLICATION_CREDENTIALS_PATH` environment variable in +your `.env` file to this path (see the Environment Variables section below): + +`GOOGLE_APPLICATION_CREDENTIALS_PATH=<path to the credentials file>` + +The credentials file at `GOOGLE_APPLICATION_CREDENTIALS_PATH` will be mounted into the container so +the GCP APIs in the container can access the data stored in GCP. + ## Git Pre-commit Hooks Git hooks let you automatically run scripts at various points as you manage your source code. “Pre-commit” hook scripts are run when you try to make a new commit. These scripts can review your code and identify bugs, formatting errors, bad coding habits, and other issues before the code gets checked in. This gives you the opportunity to fix those issues before publishing them. diff --git a/data/raw/gridstatus/interconnection_queues/.gitignore b/data/raw/gridstatus/interconnection_queues/.gitignore new file mode 100644 index 00000000..d6b7ef32 --- /dev/null +++ b/data/raw/gridstatus/interconnection_queues/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/docker-compose.yaml b/docker-compose.yaml index cf5d74b4..b72a1acd 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -3,6 +3,9 @@ services: app: build: . env_file: local.env + environment: + - GOOGLE_APPLICATION_CREDENTIALS=/app/gcloud_application_default_credentials.json + - GOOGLE_CLOUD_PROJECT=dbcp-dev-350818 depends_on: - postgres volumes: @@ -10,6 +13,7 @@ - ./notebooks:/app/notebooks:rw - ./data:/app/data:rw - ./test:/app/test:r + - ${GOOGLE_APPLICATION_CREDENTIALS_PATH}:/app/gcloud_application_default_credentials.json:r ports: - ${JUPYTER_PORT}:${JUPYTER_PORT} command: jupyter lab --port ${JUPYTER_PORT} --ip 0.0.0.0 --no-browser diff --git a/notebooks/37-tpb-gridstatus_LBNL_status_mapping.ipynb b/notebooks/37-tpb-gridstatus_LBNL_status_mapping.ipynb new file mode 100644 index 00000000..c76a7549 --- /dev/null +++ b/notebooks/37-tpb-gridstatus_LBNL_status_mapping.ipynb @@ -0,0 +1,1834 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GridStatus <-> LBNL Status Mapping\n", + "\n", + "GridStatus reads raw ISO queues. The LBNL queue statuses are standardized versions of ISO-specific categories. This notebook attempts to reverse engineer the mappings between the two."
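, + "\n", + "\n", + "As a sketch of the goal: we want to recover a mapping from raw ISO status strings to LBNL's standardized categories, something like the dict below. The two pairs shown are borrowed from LBNL's own harmonization step later in this notebook; the rest of the mapping is what we are trying to find.\n", + "\n", + "```python\n", + "# Illustrative only -- the *shape* of what this notebook tries to recover,\n", + "# not the recovered mapping itself.\n", + "iso_to_lbnl_status = {\n", + "    \"Facilities Study\": \"Facility Study\",\n", + "    \"IA in Progress\": \"In Progress (unknown study)\",\n", + "}\n", + "```"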
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import numpy as np\n", + "\n", + "from dbcp.extract.lbnl_iso_queue import extract\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the data\n", + "### LBNL-Compiled Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# partial implementation of transform. I don't want to include deduplication.\n", + "def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Transform active iso queue data.\"\"\"\n", + " rename_dict = {\n", + " \"state\": \"raw_state_name\",\n", + " \"county\": \"raw_county_name\",\n", + " }\n", + " active_projects = active_projects.rename(columns=rename_dict) # copy\n", + " # Harmonize the interconnection_status_lbnl values.\n", + " mapping = {\n", + " \"Feasability Study\": \"Feasibility Study\",\n", + " \"Feasibility\": \"Feasibility Study\",\n", + " \"Facilities Study\": \"Facility Study\",\n", + " \"IA in Progress\": \"In Progress (unknown study)\",\n", + " \"Unknown\": \"In Progress (unknown study)\",\n", + " \"Withdrawn, Feasibility Study\": \"Withdrawn\",\n", + " }\n", + " active_projects.loc[:, \"interconnection_status_lbnl\"] = active_projects.loc[\n", + " :, \"interconnection_status_lbnl\"\n", + " ].replace(mapping)\n", + " # drop irrelevant columns (structurally all nan due to 'active' filter)\n", + " active_projects.drop(columns=[\"date_withdrawn\", \"date_operational\"], inplace=True)\n", + " return active_projects\n", + "\n", + "\n", + "source_path = Path(\"/app/data/raw/queues_2022_clean_data.xlsx\")\n", + "raw_lbnl = extract(source_path)[\"lbnl_iso_queue\"]\n", + "lbnl = partial_transform(raw_lbnl)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.shape, lbnl.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.head(2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GridStatus Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import dbcp\n", + "\n", + "# These are the revision numbers of the oldest archives we have\n", + "iso_queue_versions: dict[str, str] = {\n", + " \"miso\": \"1681775160487863\",\n", + " \"caiso\": \"1681775162586588\",\n", + " \"pjm\": \"1681775160979859\",\n", + " \"ercot\": \"1681775161342766\",\n", + " \"spp\": \"1681775162935809\",\n", + " \"nyiso\": \"1681775159356063\",\n", + " \"isone\": \"1681775162111351\",\n", + "}\n", + "\n", + "gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "{k: v.shape for k, v in gs_dfs.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # These are manually downloaded from our archives. 
I went back as far as I could,\n", + "# # which is April 17 2023.\n", + "# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.\n", + "# root_path = Path(\"/app/data/raw/gridstatus/interconnection_queues\")\n", + "# assert root_path.exists()\n", + "# # filenames are like \"interconnection_queues_caiso_4-17-2023.parquet\"\n", + "# gs_dfs = {\n", + "# path.name.split(\"_\")[2]: pd.read_parquet(path)\n", + "# for path in root_path.glob(\"*.parquet\")\n", + "# }\n", + "# {k: v.shape for k, v in gs_dfs.items()}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# wayyy fewer items in GridStatus than LBNL.\n", + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl.shape[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.region.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl_iso = lbnl[~lbnl[\"region\"].str.contains(\"non-ISO\", na=False)].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl_iso.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Gridstatus has more ISO projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# globals().update(gs_dfs) # this works fine but the static type checker/linter can't introspect it.\n", + "raw_ercot = gs_dfs[\"ercot\"]\n", + "raw_nyiso = gs_dfs[\"nyiso\"]\n", + "raw_isone = gs_dfs[\"isone\"]\n", + "raw_miso = gs_dfs[\"miso\"]\n", + "raw_pjm = gs_dfs[\"pjm\"]\n", + "raw_spp = gs_dfs[\"spp\"]\n", + "raw_caiso = gs_dfs[\"caiso\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LBNL_JOIN_COLS = [\n", + " \"queue_id\", # join key\n", + " \"project_name\", # for manually checking the joins\n", + " \"queue_date\", # for manually checking the joins\n", + " \"queue_status\", # for manually checking the joins\n", + " \"interconnection_status_raw\", # see what LBNL interpreted\n", + " \"interconnection_status_lbnl\", # final mapping value\n", + " \"capacity_mw_resource_1\",\n", + " \"resource_type_1\"\n", + " \n", + "]\n", + "\n", + "\n", + "def join_lbnl(\n", + " iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col=\"Queue ID\"\n", + ") -> pd.DataFrame:\n", + " \"\"\"Join LBNL queue data to GridStatus queue data.\"\"\"\n", + " assert iso_df[iso_id_col].is_unique, \"ID column not unique\"\n", + " lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(iso_name), LBNL_JOIN_COLS]\n", + " assert not lbnl_iso.empty, f\"Empty LBNL queue for {iso_name}\"\n", + " assert lbnl_iso[\"queue_id\"].is_unique, \"LBNL queue ID not unique\"\n", + " out = iso_df.merge(lbnl_iso, how=\"outer\", left_on=iso_id_col, right_on=\"queue_id\")\n", + " out[\"in_lbnl\"] = ~out[\"queue_id\"].isna()\n", + " out[\"in_gs\"] = ~out[iso_id_col].isna()\n", + " return out\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparisons\n", + "### ERCOT\n", + "* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. 
I'm not sure where LBNL gets the withdrawn projects from.\n", + " * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?\n", + "* GridStatus defines \"status\" as \"IA Signed\".isna(). LBNL calls the entire \"large active\" dataset \"active\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_ercot.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot = join_lbnl(raw_ercot, lbnl, \"ERCOT\")\n", + "ercot.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot.loc[\n", + " ercot[\"Queue Date\"]\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[[\"Status\", \"queue_status\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot.loc[\n", + " ercot[\"Queue Date\"]\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"Status\", \"queue_status\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[[\"GIM Study Phase\", \"queue_status\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[\"Queue Date\"].eq(pd.to_datetime(ercot[\"queue_date\"])).where(\n", + " (ercot[\"Queue Date\"].notna() & ercot[\"queue_date\"].notnull())\n", + ").value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get old version of ercot data from December 2022\n", + "old_ercot_path = Path(\"/app/data/raw/gridstatus/GIS_Report_December_2022.xlsx\")\n", + "assert old_ercot_path.exists()\n", + "\n", + "# copy and paste some GridStatus ETL code\n", + "def extract_ercot(path: Path) -> pd.DataFrame:\n", + " queue = pd.read_excel(\n", + " path,\n", + " sheet_name=\"Project Details - Large Gen\",\n", + " skiprows=30,\n", + " ).iloc[4:]\n", + "\n", + " queue[\"State\"] = \"Texas\"\n", + " queue[\"Queue Date\"] = queue[\"Screening Study Started\"]\n", + "\n", + " fuel_type_map = {\n", + " \"BIO\": \"Biomass\",\n", + " \"COA\": \"Coal\",\n", + " \"GAS\": \"Gas\",\n", + " \"GEO\": \"Geothermal\",\n", + " \"HYD\": \"Hydrogen\",\n", + " \"NUC\": \"Nuclear\",\n", + " \"OIL\": \"Fuel Oil\",\n", + " \"OTH\": \"Other\",\n", + " \"PET\": \"Petcoke\",\n", + " \"SOL\": \"Solar\",\n", + " \"WAT\": \"Water\",\n", + " \"WIN\": \"Wind\",\n", + " }\n", + "\n", + " technology_type_map = {\n", + " \"BA\": \"Battery Energy Storage\",\n", + " \"CC\": \"Combined-Cycle\",\n", + " \"CE\": \"Compressed Air Energy Storage\",\n", + " \"CP\": \"Concentrated Solar 
Power\",\n", + " \"EN\": \"Energy Storage\",\n", + " \"FC\": \"Fuel Cell\",\n", + " \"GT\": \"Combustion (gas) Turbine, but not part of a Combined-Cycle\",\n", + " \"HY\": \"Hydroelectric Turbine\",\n", + " \"IC\": \"Internal Combustion Engine, eg. Reciprocating\",\n", + " \"OT\": \"Other\",\n", + " \"PV\": \"Photovoltaic Solar\",\n", + " \"ST\": \"Steam Turbine other than Combined-Cycle\",\n", + " \"WT\": \"Wind Turbine\",\n", + " }\n", + "\n", + " queue[\"Fuel\"] = queue[\"Fuel\"].map(fuel_type_map)\n", + " queue[\"Technology\"] = queue[\"Technology\"].map(technology_type_map)\n", + "\n", + " queue[\"Generation Type\"] = queue[\"Fuel\"] + \" - \" + queue[\"Technology\"]\n", + "\n", + " queue[\"Status\"] = (\n", + " queue[\"IA Signed\"]\n", + " .isna()\n", + " .map(\n", + " {\n", + " True: \"Active\",\n", + " False: \"Completed\",\n", + " },\n", + " )\n", + " )\n", + "\n", + " queue[\"Actual Completion Date\"] = queue[\"Approved for Synchronization\"]\n", + "\n", + " rename = {\n", + " \"INR\": \"Queue ID\",\n", + " \"Project Name\": \"Project Name\",\n", + " \"Interconnecting Entity\": \"Interconnecting Entity\",\n", + " \"Projected COD\": \"Proposed Completion Date\",\n", + " \"POI Location\": \"Interconnection Location\",\n", + " \"County\": \"County\",\n", + " \"State\": \"State\",\n", + " \"Capacity (MW)\": \"Capacity (MW)\",\n", + " \"Queue Date\": \"Queue Date\",\n", + " \"Generation Type\": \"Generation Type\",\n", + " \"Actual Completion Date\": \"Actual Completion Date\",\n", + " \"Status\": \"Status\",\n", + " }\n", + " return queue.rename(columns=rename)\n", + "old_ercot = extract_ercot(old_ercot_path)\n", + "old_ercot.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot = join_lbnl(old_ercot, lbnl, \"ERCOT\", iso_id_col=\"Queue ID\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot[[\"Status\", \"queue_status\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.concat(\n", + " (\n", + " ercot[[\"Status\", \"queue_status\"]].value_counts(dropna=False).rename(\"gs\"),\n", + " old_ercot[[\"Status\", \"queue_status\"]].value_counts(dropna=False).rename(\"old\"),\n", + " ),\n", + " axis=1,\n", + " join=\"outer\",\n", + ").sort_index().drop_duplicates()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot[\"Queue Date\"].eq(pd.to_datetime(old_ercot[\"queue_date\"])).where(\n", + " (old_ercot[\"Queue Date\"].notna() & old_ercot[\"queue_date\"].notnull())\n", + ").value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot[\"Queue Date\"].sub(pd.to_datetime(old_ercot[\"queue_date\"])).lt(pd.Timedelta(days=2)).where(\n", + " (old_ercot[\"Queue Date\"].notna() & old_ercot[\"queue_date\"].notnull())\n", + ").value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot[\"Queue Date\"].sub(pd.to_datetime(old_ercot[\"queue_date\"])).dt.total_seconds().div(60 * 60 * 24).replace(0, 
1e-1).transform(np.log10).hist(bins=20, log=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mismatched_dates = (\n", + " old_ercot[\"Queue Date\"]\n", + " .ne(pd.to_datetime(old_ercot[\"queue_date\"]))\n", + " .where((old_ercot[\"Queue Date\"].notna() & old_ercot[\"queue_date\"].notnull()))\n", + ").fillna(False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# none of the dates match. No idea where LBNL queue date comes from. Not from the ISO data!\n", + "old_ercot.loc[mismatched_dates, [c for c in old_ercot.columns if 'date' in c.lower()]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot[['GIM Study Phase', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot['GIM Study Phase'].eq(old_ercot['interconnection_status_raw']).value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# LBNL status looks consistently applied (1:1) but the ordering doesn't really make sense to me.\n", + "raw_status_match = old_ercot['GIM Study Phase'].eq(old_ercot['interconnection_status_raw'])\n", + "old_ercot.loc[raw_status_match, ['GIM Study Phase', 'interconnection_status_lbnl']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "old_ercot.loc[raw_status_match, ['GIM Study Phase', 'queue_status']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# also includes technology type, which is nice.\n", + "raw_ercot['Generation Type'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NYISO\n", + "\n", + "* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. 
Key:\n", + " * 0=Withdrawn\n", + " * 1=Scoping Meeting Pending\n", + " * 2=FES Pending\n", + " * 3=FES in Progress\n", + " * 4=SRIS/SIS Pending\n", + " * 5=SRIS/SIS in Progress\n", + " * 6=SRIS/SIS Approved\n", + " * 7=FS Pending\n", + " * 8=Rejected Cost Allocation/Next FS Pending\n", + " * 9=FS in Progress\n", + " * 10=Accepted Cost Allocation/IA in Progress\n", + " * 11=IA Completed\n", + " * 12=Under Construction\n", + " * 13=In Service for Test\n", + " * 14=In Service Commercial\n", + " * 15=Partial In-Service\n", + "* Availability of Studies Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available\n", + "* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso[raw_nyiso[\"Queue ID\"].duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n", + "raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, \"NYISO\")\n", + "nyiso.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# marginal improvement from date filter\n", + "nyiso.loc[\n", + " nyiso[\"Queue Date\"]\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "nyiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why are there so many projects that have a status in GridStatus but no status in LBNL? NYISO has separate sheets for withdrawn, in service and active. Why is LBNL missing so much information?" 
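, + "\n", + "\n", + "One quick check, as a sketch (`in_gs`/`in_lbnl` are the join flags created by `join_lbnl` above): which GridStatus statuses dominate among the projects LBNL is missing?\n", + "\n", + "```python\n", + "# GS statuses of projects that did not match anything in LBNL\n", + "nyiso.loc[nyiso[\"in_gs\"] & ~nyiso[\"in_lbnl\"], \"Status\"].value_counts(dropna=False)\n", + "```"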
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso.loc[\n", + " nyiso[\"Queue Date\"]\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"Status\", \"queue_status\"],\n", + "].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso['interconnection_status_raw'].str.replace(' ', '').replace([np.nan, None, 'None'], '').str.split(',').apply(lambda x: ','.join(sorted(set(x)))).value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso['Availability of Studies'].str.replace(r'[ \\d]', '', regex=True).replace([np.nan, None, 'None'], '').str.split(',').apply(lambda x: ','.join(sorted(set(x).difference({'CY'})))).value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[\"simplified_studies\"] = (\n", + " nyiso[\"Availability of Studies\"]\n", + " .str.replace(r\"[ \\d]\", \"\", regex=True)\n", + " .replace([np.nan, None, \"None\"], \"\")\n", + " .str.split(\",\")\n", + " .apply(lambda x: \",\".join(sorted(set(x).difference({\"CY\"}))))\n", + " .replace('', np.nan)\n", + ")\n", + "nyiso[\"simplified_lbnl_raw\"] = (\n", + " nyiso[\"interconnection_status_raw\"]\n", + " .str.replace(\" \", \"\")\n", + " .replace([np.nan, None, \"None\"], \"\")\n", + " .str.split(\",\")\n", + " .apply(lambda x: \",\".join(sorted(set(x))))\n", + " .replace('', np.nan)\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso['S'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.to_numeric(nyiso['S'].astype('string').str.split(',').str[-1], errors='raise').astype(pd.UInt8Dtype()).value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[\"s_simplified\"] = pd.to_numeric(nyiso['S'].astype('string').str.split(',').str[-1], errors='raise').astype(pd.UInt8Dtype())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"queue_status\", \"simplified_studies\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"simplified_lbnl_raw\", \"simplified_studies\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[\"simplified_lbnl_raw\"].astype('string').eq(nyiso[\"simplified_studies\"].astype('string')).value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"queue_status\", \"s_simplified\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"s_simplified\", \"simplified_studies\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# cross-tabulate NYISO's ordinal status code against LBNL's simplified raw status strings\n", + "nyiso[[\"s_simplified\", \"simplified_lbnl_raw\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"interconnection_status_lbnl\", \"s_simplified\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[\"Queue Date\"].eq(pd.to_datetime(nyiso[\"queue_date\"])).where(\n", + " (nyiso[\"Queue Date\"].notna() & nyiso[\"queue_date\"].notnull())\n", + ").value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# needs standardization\n", + "raw_nyiso['Generation Type'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NYISO Capacity Comparison\n", + "- Compare total capacity for projects in lbnl and gs\n", + "- Compare total capacity for active projects in lbnl and gs\n", + "- Compare total capacity for active projects\n", + "- Compare capacity by fuel type? Might be challenging because the categories are all over the place" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Compare total capacity for projects in both" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "both_nyiso = nyiso[nyiso.in_lbnl & nyiso.in_gs].copy()\n", + "print(nyiso.shape)\n", + "print(both_nyiso.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "both_nyiso.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gs_nyiso_cap = both_nyiso[\"Capacity (MW)\"].sum()\n", + "lbnl_nyiso_cap = both_nyiso[\"capacity_mw_resource_1\"].sum()\n", + "\n", + "print(gs_nyiso_cap)\n", + "print(lbnl_nyiso_cap)\n", + "print(lbnl_nyiso_cap / gs_nyiso_cap)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok! Projects that exist in both have very similar total capacities! That's a good start. I could look into which projects have different capacity values, but the amount is so minor that I'm going to skip it for now." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Compare total capacity for **active** projects in both" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "both_nyiso[\"queue_status\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "both_nyiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are some active LBNL projects that have been withdrawn, maybe because the GS data is a bit fresher? If so, the withdrawal dates should be after 2022-12-31." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.to_datetime(both_nyiso[(both_nyiso[\"Status\"] == \"Withdrawn\") & (both_nyiso[\"queue_status\"] == \"active\")][\"Withdrawn Date\"]).dt.year.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great! That explains it. This means we can just compare projects marked active in LBNL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "active_both_nyiso = both_nyiso[(both_nyiso[\"queue_status\"] == \"active\")]\n", + "gs_nyiso_cap = active_both_nyiso[\"Capacity (MW)\"].sum()\n", + "lbnl_nyiso_cap = active_both_nyiso[\"capacity_mw_resource_1\"].sum()\n", + "\n", + "print(gs_nyiso_cap)\n", + "print(lbnl_nyiso_cap)\n", + "print(lbnl_nyiso_cap / gs_nyiso_cap)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar capacity totals; I'm comfortable with that." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Compare total capacity for all projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(nyiso[nyiso.in_gs & ~nyiso.in_lbnl][\"Capacity (MW)\"].sum() / nyiso[nyiso.in_gs][\"Capacity (MW)\"].sum())\n", + "print(nyiso[~nyiso.in_gs & nyiso.in_lbnl][\"capacity_mw_resource_1\"].sum() / nyiso[nyiso.in_lbnl][\"capacity_mw_resource_1\"].sum())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Projects in GS but not in lbnl account for 27% of total capacity in GS nyiso.\n", + "Projects in LBNL but not in GS account for 4% of total capacity in LBNL NYISO.\n", + "\n", + "Why does GS have so much more capacity than LBNL here? Do we care if GS has more capacity than LBNL, given that GS more closely resembles the source data? Is LBNL doing deduplication work behind the scenes?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MISO\n", + "Very good project coverage, but the IA status categories are a mess." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_miso.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n", + "# Nearly whole-row duplicate, except for \"studyPhase\"\n", + "raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# I don't know which is correct (if either), but I'll guess that the later one is. 
So keep='last'\n", + "# Nearly whole-row duplicate, except for \"studyPhase\"\n", + "lbnl.query('entity == \"MISO\"').loc[lbnl.query('entity == \"MISO\"').duplicated(subset='queue_id', keep=False), :].sort_values(by='queue_id')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso = join_lbnl(raw_miso.drop_duplicates(subset='Queue ID', keep='last'), lbnl.query('entity == \"MISO\"').drop_duplicates(subset='queue_id', keep='last'), \"MISO\")\n", + "miso.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso.sample(4)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# somehow makes things worse?\n", + "miso.loc[\n", + " pd.to_datetime(miso[\"Queue Date\"].str.replace(r'\\d{2}:\\d{2}:\\d{2}Z$', '', regex=True)) # remove time\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.to_datetime(miso[\"Queue Date\"].str.replace(r'\\d{2}:\\d{2}:\\d{2}Z$', '', regex=True)).describe()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.to_datetime(miso['queue_date']).describe()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso[['queue_status', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso['interconnection_status_lbnl'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso.loc[:, ['studyPhase', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "miso.loc[:, ['studyPhase', 'interconnection_status_lbnl']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# inconsistent mix of fuel/tech type\n", + "raw_miso['Generation Type'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SPP\n", + "\n", + "* neither LBNL nor GridStatus have withdrawn projects\n", + "* GridStatus destroys the detailed status information during their processing, so we lose the ability to distinguish between \"IA pending\" and \"System Integration Study\". But I don't think that is a problem because both are included in the \"actionable\" criteria in Synapse's model." 
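, + "\n", + "\n", + "A sketch of that claim (the raw status spellings and the actionable set here are assumptions, not values confirmed against Synapse's model):\n", + "\n", + "```python\n", + "# Assumption: both detailed SPP statuses collapse to \"actionable\" anyway,\n", + "# so losing the distinction costs us nothing for this flag.\n", + "assumed_actionable = {\"IA pending\", \"System Integration Study\"}\n", + "is_actionable = spp[\"interconnection_status_raw\"].isin(assumed_actionable)\n", + "```"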
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_spp.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp = join_lbnl(raw_spp, lbnl, \"SPP\")\n", + "spp.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp.sample(4)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp.loc[\n", + " pd.to_datetime(spp[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp[[\"Status\", \"queue_status\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp.loc[\n", + " pd.to_datetime(spp[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"Status\", \"queue_status\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# it turns out these values come from the raw \"status\" values, which GridStatus overwrites 😡\n", + "spp['interconnection_status_raw'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp['Current Cluster'].str.replace(r'-\\d{4}-\\d{3}(?:-\\d)?', '', regex=True).value_counts(dropna=False) # remove date/ID\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp['cluster_simplified'] = spp['Current Cluster'].str.replace(r'-\\d{4}-\\d{3}(?:-\\d)?', '', regex=True) # remove date/ID\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp[['interconnection_status_raw', 'cluster_simplified']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spp[['interconnection_status_lbnl', 'cluster_simplified']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# needs standardization\n", + "raw_spp['Generation Type'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PJM\n", + "Like MISO, good project coverage, but the IA status categories are a mess.\n", + "\n", + "\"Active\" applied up to IA execution. Then \"Engineering and Procurement\" applied to IA execution through COD. Then \"In Service\"." 
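, + "\n", + "\n", + "That lifecycle as an ordered sketch (the ordering is the point; exact label spellings in the data may differ):\n", + "\n", + "```python\n", + "# PJM project lifecycle per the note above\n", + "pjm_status_order = [\"Active\", \"Engineering and Procurement\", \"In Service\"]\n", + "```"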
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_pjm.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# \"Active\" stops at IA execution\n", + "raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm = join_lbnl(raw_pjm, lbnl, \"PJM\")\n", + "pjm.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm.sample(4)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm['Wholesale Market Participation Agreement'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# no status column for this one for some reason\n", + "pjm['wholesale_not_none'] = pjm['Wholesale Market Participation Agreement'].notna()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm.loc[\n", + " pd.to_datetime(pjm[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm[[\"queue_status\", \"Status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm['interconnection_status_raw'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# lots of IA status info encoded in various status columns. Have to figure out how to reconstruct the LBNL definitions\n", + "status_cols = [\n", + " 'Feasibility Study Status',\n", + " 'System Impact Study Status',\n", + " 'Facilities Study Status',\n", + " 'Interim/Interconnection Service Agreement Status',\n", + " # 'wholesale_not_none', # redundant with IA status \"Wholesale Market Participation Agreement\"\n", + " 'Construction Service Agreement Status',\n", + " 'Upgrade Construction Service Agreement Status'\n", + "]\n", + "with pd.option_context('display.max_rows', None):\n", + " display(pjm.loc[pjm['queue_status'].eq('active'), status_cols[:4] + ['interconnection_status_lbnl']].value_counts(dropna=False).sort_index())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# needs standardization. Also has a long tail of multivalued entries.\n", + "raw_pjm['Generation Type'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CAISO\n", + "Straightforward!" 
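, + "\n", + "\n", + "The expected status mapping looks like a pure case change (a sketch; only `ACTIVE` is directly verified below, the other two values are assumed):\n", + "\n", + "```python\n", + "# Assumed CAISO Status -> LBNL queue_status mapping\n", + "caiso_to_lbnl = {\"ACTIVE\": \"active\", \"COMPLETED\": \"completed\", \"WITHDRAWN\": \"withdrawn\"}\n", + "```"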
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_caiso.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso = join_lbnl(raw_caiso, lbnl, \"CAISO\")\n", + "caiso.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso.query(\"Status == 'ACTIVE'\").sample(8, random_state=42).sort_values('interconnection_status_lbnl')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso.loc[\n", + " pd.to_datetime(caiso[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remarkably easy to match status values\n", + "caiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso['interconnection_status_raw'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso['Study Process'].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# I don't see any pattern to the Study Process values.\n", + "status_cols_caiso = [\n", + " 'Interconnection Agreement Status',\n", + " 'Facilities Study (FAS) or Phase II Cluster Study',\n", + " 'System Impact Study or Phase I Cluster Study',\n", + " 'Study Process',\n", + "]\n", + "with pd.option_context('display.max_rows', None):\n", + " display(caiso[status_cols_caiso[:-1]].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with pd.option_context('display.max_rows', None):\n", + " display(caiso.query(\"queue_status == 'active'\")[status_cols_caiso[:-1] + ['interconnection_status_lbnl']].replace(['None'], np.nan).value_counts(dropna=False).sort_index())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# categories look standardized, but need to handle multivalued-ness\n", + "raw_caiso['Generation Type'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ISO-NE\n", + "\n", + "* what GridStatus calls \"Queue ID\" was actually \"Queue Position\" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects." 
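, + "\n", + "\n", + "A sketch of the workaround the cells below converge on: fall back to a compound key and keep the most recently updated duplicate.\n", + "\n", + "```python\n", + "# no natural key for withdrawn rows, so approximate one\n", + "raw_isone[\"Updated\"] = pd.to_datetime(raw_isone[\"Updated\"])\n", + "compound_key = [\"Queue ID\", \"Project Name\"]\n", + "deduped = raw_isone.sort_values(\"Updated\").drop_duplicates(subset=compound_key, keep=\"last\")\n", + "```"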
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.duplicated(subset=['Queue ID']).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compound_key = ['Queue ID', 'Status']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# there is no natural key for this data. But this looks like the best tradeoff between key complexity and uniqueness.\n", + "compound_key = ['Queue ID', 'Project Name']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.\n", + "raw_isone.query('Status == \"Active\"')['Queue ID'].is_unique\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# duplicates (none Active) have all kinds of variation in values. Maybe best approach is to sort by date updated and take the latest one. 
It doesn't always differentiate them though.\n", + "raw_isone.loc[raw_isone.duplicated(subset=compound_key, keep=False),:].sort_values(by=compound_key).tail(10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# join manually rather than refactoring the func to take compound key\n", + "lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(\"ISO-NE\"), LBNL_JOIN_COLS].astype({'queue_id': int})\n", + "isone = raw_isone.sort_values('Updated').drop_duplicates(subset=compound_key, keep='last')\n", + "isone = isone.merge(lbnl_iso, how=\"outer\", left_on=compound_key, right_on=[c.lower().replace(' ', '_') for c in compound_key])\n", + "isone[\"in_lbnl\"] = ~isone[\"queue_id\"].isna()\n", + "isone[\"in_gs\"] = ~isone[\"Queue ID\"].isna()\n", + "del lbnl_iso\n", + "isone.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone.head(2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# small improvement from date filter\n", + "isone.loc[\n", + " pd.to_datetime(isone[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# status values are decently aligned\n", + "isone[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone['interconnection_status_raw'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "status_cols_isone = [\n", + " \"FS\",\n", + " \"SIS\",\n", + " \"OS\",\n", + " \"FAC\",\n", + " \"IA\",\n", + " \"Project Status\",\n", + "]\n", + "with pd.option_context('display.max_rows', None):\n", + " display(isone[status_cols_isone].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# gridstatus doesn't parse the status values, so they are all null. In their defense, the ISONE encodes them as alt text behind icons, which is stupid. 
But still.\n", + "isone[status_cols_isone]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Generation Type'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/38-bdn-gridstatus-lbnl-capacity.ipynb b/notebooks/38-bdn-gridstatus-lbnl-capacity.ipynb new file mode 100644 index 00000000..8f8e6ec2 --- /dev/null +++ b/notebooks/38-bdn-gridstatus-lbnl-capacity.ipynb @@ -0,0 +1,1649 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GridStatus <-> LBNL Status Mapping\n", + "\n", + "GridStatus reads raw ISO queues. The LBNL queue statuses are standardized versions of ISO-specific categories. This notebook attempts to reverse engineer the mappings between the two." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import numpy as np\n", + "\n", + "from dbcp.extract.lbnl_iso_queue import extract\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the data\n", + "### LBNL-Compiled Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# partial implementation of transform. 
I don't want to include deduplication.\n", + "def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Transform active iso queue data.\"\"\"\n", + " rename_dict = {\n", + " \"state\": \"raw_state_name\",\n", + " \"county\": \"raw_county_name\",\n", + " }\n", + " active_projects = active_projects.rename(columns=rename_dict) # copy\n", + " # Harmonize the interconnection_status_lbnl values.\n", + " mapping = {\n", + " \"Feasability Study\": \"Feasibility Study\",\n", + " \"Feasibility\": \"Feasibility Study\",\n", + " \"Facilities Study\": \"Facility Study\",\n", + " \"IA in Progress\": \"In Progress (unknown study)\",\n", + " \"Unknown\": \"In Progress (unknown study)\",\n", + " \"Withdrawn, Feasibility Study\": \"Withdrawn\",\n", + " }\n", + " active_projects.loc[:, \"interconnection_status_lbnl\"] = active_projects.loc[\n", + " :, \"interconnection_status_lbnl\"\n", + " ].replace(mapping)\n", + " # drop irrelevant columns (structurally all nan due to 'active' filter)\n", + " active_projects.drop(columns=[\"date_withdrawn\", \"date_operational\"], inplace=True)\n", + " return active_projects\n", + "\n", + "\n", + "source_path = Path(\"/app/data/raw/queues_2022_clean_data.xlsx\")\n", + "raw_lbnl = extract(source_path)[\"lbnl_iso_queue\"]\n", + "lbnl = partial_transform(raw_lbnl)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.shape, lbnl.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.head(2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GridStatus Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import dbcp\n", + "\n", + "# These are the revision numbers of the oldest archives we have\n", + "iso_queue_versions: dict[str, str] = {\n", + " \"miso\": \"1681775160487863\",\n", + " \"caiso\": \"1681775162586588\",\n", + " \"pjm\": \"1681775160979859\",\n", + " \"ercot\": \"1681775161342766\",\n", + " \"spp\": \"1681775162935809\",\n", + " \"nyiso\": \"1681775159356063\",\n", + " \"isone\": \"1681775162111351\",\n", + "}\n", + "\n", + "gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "{k: v.shape for k, v in gs_dfs.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # These are manually downloaded from our archives. 
I went back as far as I could,\n", + "# # which is April 17 2023.\n", + "# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.\n", + "# root_path = Path(\"/app/data/raw/gridstatus/interconnection_queues\")\n", + "# assert root_path.exists()\n", + "# # filenames are like \"interconnection_queues_caiso_4-17-2023.parquet\"\n", + "# gs_dfs = {\n", + "# path.name.split(\"_\")[2]: pd.read_parquet(path)\n", + "# for path in root_path.glob(\"*.parquet\")\n", + "# }\n", + "# {k: v.shape for k, v in gs_dfs.items()}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# wayyy fewer items in GridStatus than LBNL.\n", + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl.shape[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.region.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl_iso = lbnl[~lbnl[\"region\"].str.contains(\"non-ISO\", na=False)].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl_iso.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Gridstatus has more ISO projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# globals().update(gs_dfs) # this works fine but the static type checker/linter can't introspect it.\n", + "raw_ercot = gs_dfs[\"ercot\"]\n", + "raw_nyiso = gs_dfs[\"nyiso\"]\n", + "raw_isone = gs_dfs[\"isone\"]\n", + "raw_miso = gs_dfs[\"miso\"]\n", + "raw_pjm = gs_dfs[\"pjm\"]\n", + "raw_spp = gs_dfs[\"spp\"]\n", + "raw_caiso = gs_dfs[\"caiso\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LBNL_JOIN_COLS = [\n", + " \"queue_id\", # join key\n", + " \"project_name\", # for manually checking the joins\n", + " \"queue_date\", # for manually checking the joins\n", + " \"queue_status\", # for manually checking the joins\n", + " \"interconnection_status_raw\", # see what LBNL interpreted\n", + " \"interconnection_status_lbnl\", # final mapping value\n", + " \"capacity_mw_resource_1\",\n", + " \"resource_type_1\"\n", + " \n", + "]\n", + "\n", + "\n", + "def join_lbnl(\n", + " iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col=\"Queue ID\"\n", + ") -> pd.DataFrame:\n", + " \"\"\"Join LBNL queue data to GridStatus queue data.\"\"\"\n", + " assert iso_df[iso_id_col].is_unique, \"ID column not unique\"\n", + " lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(iso_name), LBNL_JOIN_COLS]\n", + " assert not lbnl_iso.empty, f\"Empty LBNL queue for {iso_name}\"\n", + " assert lbnl_iso[\"queue_id\"].is_unique, \"LBNL queue ID not unique\"\n", + " out = iso_df.merge(lbnl_iso, how=\"outer\", left_on=iso_id_col, right_on=\"queue_id\")\n", + " out[\"in_lbnl\"] = ~out[\"queue_id\"].isna()\n", + " out[\"in_gs\"] = ~out[iso_id_col].isna()\n", + " return out\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Comparisons\n", + "### ERCOT\n", + "* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. 
I'm not sure where LBNL gets the withdrawn projects from.\n",
+    "  * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?\n",
+    "* GridStatus defines \"status\" as \"IA Signed\".isna(). LBNL calls the entire \"large active\" dataset \"active\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_ercot.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ercot = join_lbnl(raw_ercot, lbnl, \"ERCOT\")\n",
+    "ercot.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ercot[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "both_ercot = ercot[ercot.in_lbnl & ercot.in_gs]\n",
+    "\n",
+    "compare_capacity(both_ercot)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for **active** projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ercot[[\"Status\", \"queue_status\"]].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ercot_active_in_both = ercot[ercot[\"queue_status\"].eq(\"active\") & ercot[\"Status\"].eq(\"Active\")]\n",
+    "compare_capacity(ercot_active_in_both)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Very little mismatch in the status columns, and the capacities look good for projects that are active in both. There are a few dozen LBNL-active projects that are withdrawn or completed in GS."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Compare total capacity for all projects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gs_capacities = ercot[ercot.in_gs][\"Capacity (MW)\"]\n",
+    "lbnl_capacities = ercot[ercot.in_lbnl][\"capacity_mw_resource_1\"]\n",
+    "\n",
+    "print(f\"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}\")\n",
+    "print(f\"Ratio of total LBNL projects to total GS projects {len(lbnl_capacities) / len(gs_capacities)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Where is LBNL getting historic queue data from?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ercot[\"queue_status\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.to_datetime(ercot[ercot[\"queue_status\"].eq(\"withdrawn\")][\"queue_date\"]).dt.year.plot.hist()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The [ISO Queue sheet](https://www.ercot.com/misdownload/servlets/mirDownload?doclookupId=955158734) GridStatus uses has an Inactive sheet that only goes back to about 2019.
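The snapshot-diffing idea above is easy to prototype. A minimal sketch, assuming two archived GridStatus ERCOT parquet snapshots with hypothetical filenames; this is not part of the current ETL:

```
import pandas as pd

# Hypothetical snapshot paths; any two archived GridStatus ERCOT queues work.
old = pd.read_parquet("interconnection_queues_ercot_4-17-2023.parquet")
new = pd.read_parquet("interconnection_queues_ercot_5-17-2023.parquet")

# Projects present in the old snapshot but missing from the new one have
# exited the queue (withdrawn or completed); ERCOT's active-only
# publication can't distinguish those two cases by itself.
exited_ids = set(old["Queue ID"]) - set(new["Queue ID"])
exited = old[old["Queue ID"].isin(exited_ids)]
print(f"{len(exited)} projects exited the queue between snapshots")
```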
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NYISO\n", + "\n", + "* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. Key:\n", + " * 0=Withdrawn\n", + " * 1=Scoping Meeting Pending\n", + " * 2=FES Pending\n", + " * 3=FES in Progress\n", + " * 4=SRIS/SIS Pending\n", + " * 5=SRIS/SIS in Progress\n", + " * 6=SRIS/SIS Approved\n", + " * 7=FS Pending\n", + " * 8=Rejected Cost Allocation/Next FS Pending\n", + " * 9=FS in Progress\n", + " * 10=Accepted Cost Allocation/IA in Progress\n", + " * 11=IA Completed\n", + " * 12=Under Construction\n", + " * 13=In Service for Test\n", + " * 14=In Service Commercial\n", + " * 15=Partial In-Service\n", + "* Availability of Studies Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available\n", + "* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso[raw_nyiso[\"Queue ID\"].duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n", + "raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, \"NYISO\")\n", + "nyiso.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nyiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# marginal improvement from date filter\n", + "nyiso.loc[\n", + " nyiso[\"Queue Date\"]\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NYISO Capacity Comparison\n", + "- Compare total capacity for projects in lbnl and gs\n", + "- Compare total capacity for active projects in lbnl and gs\n", + "- Compare total capacity for active projects\n", + "- Compare capacity by fuel type? 
Might be challenging because the categories are all over the place."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Compare total capacity for projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compare_capacity(comb_iso: pd.DataFrame, gs_cap_col=\"Capacity (MW)\", lbnl_cap_col=\"capacity_mw_resource_1\"):\n",
+    "    gs_iso_cap = comb_iso[gs_cap_col].sum()\n",
+    "    lbnl_iso_cap = comb_iso[lbnl_cap_col].sum()\n",
+    "\n",
+    "    print(f\"GS Capacity for projects in GS and LBNL: {gs_iso_cap}\")\n",
+    "    print(f\"LBNL Capacity for projects in GS and LBNL: {lbnl_iso_cap}\")\n",
+    "    print(f\"Ratio of LBNL to GS Capacity for projects in GS and LBNL: {lbnl_iso_cap / gs_iso_cap}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "both_nyiso = nyiso[nyiso.in_lbnl & nyiso.in_gs]\n",
+    "\n",
+    "compare_capacity(both_nyiso)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Ok! Projects that exist in both have very similar total capacities! That's a good start. I could look into which projects have different capacity values, but the amount is so minor that I'm going to skip it for now."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Compare total capacity for **active** projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "both_nyiso[\"queue_status\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "both_nyiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There are some active LBNL projects that have been withdrawn in GS, maybe because the GS data is a bit fresher? If so, the withdrawal dates should be after 2022-12-31."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.to_datetime(both_nyiso[(both_nyiso[\"Status\"] == \"Withdrawn\") & (both_nyiso[\"queue_status\"] == \"active\")][\"Withdrawn Date\"]).dt.year.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Great! That explains it. This means we can just compare projects marked active in LBNL."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "active_both_nyiso = both_nyiso[(both_nyiso[\"queue_status\"] == \"active\")]\n",
+    "gs_nyiso_cap = active_both_nyiso[\"Capacity (MW)\"].sum()\n",
+    "lbnl_nyiso_cap = active_both_nyiso[\"capacity_mw_resource_1\"].sum()\n",
+    "\n",
+    "print(gs_nyiso_cap)\n",
+    "print(lbnl_nyiso_cap)\n",
+    "print(lbnl_nyiso_cap / gs_nyiso_cap)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The capacity totals are similar enough that I'm comfortable with them."
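If that small capacity mismatch ever needs chasing down, the offending projects can be listed directly. A quick sketch against the `both_nyiso` frame above; the 1 MW tolerance is an arbitrary choice:

```
# Per-project capacity deltas for projects matched in both datasets.
gs_cap = pd.to_numeric(both_nyiso["Capacity (MW)"], errors="coerce")
lbnl_cap = pd.to_numeric(both_nyiso["capacity_mw_resource_1"], errors="coerce")
delta = gs_cap - lbnl_cap

# Arbitrary 1 MW tolerance; tighten or loosen as needed.
mismatched = both_nyiso.loc[
    delta.abs() > 1, ["project_name", "Capacity (MW)", "capacity_mw_resource_1"]
]
print(f"{len(mismatched)} matched projects differ by more than 1 MW")
```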
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Compare total capacity for all projects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(nyiso[nyiso.in_gs & ~nyiso.in_lbnl][\"Capacity (MW)\"].sum() / nyiso[nyiso.in_gs][\"Capacity (MW)\"].sum())\n",
+    "print(nyiso[~nyiso.in_gs & nyiso.in_lbnl][\"capacity_mw_resource_1\"].sum() / nyiso[nyiso.in_lbnl][\"capacity_mw_resource_1\"].sum())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Projects in GS but not in LBNL account for 27% of total capacity in GS NYISO.\n",
+    "Projects in LBNL but not in GS account for 4% of total capacity in LBNL NYISO.\n",
+    "\n",
+    "Why does GS have so much more capacity than LBNL here? Do we care if GS has more capacity than LBNL, given GS more closely resembles the source data? Is LBNL doing deduplication work behind the scenes?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gs_capacities = nyiso[nyiso.in_gs][\"Winter Capacity (MW)\"]\n",
+    "lbnl_capacities = nyiso[nyiso.in_lbnl][\"capacity_mw_resource_1\"]\n",
+    "\n",
+    "print(f\"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / pd.to_numeric(gs_capacities, errors='coerce').sum()}\")\n",
+    "print(f\"Ratio of total LBNL projects to total GS projects {len(lbnl_capacities) / len(gs_capacities)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There are 190 transmission projects in GS that aren't in LBNL. See the generation type analysis notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### MISO\n",
+    "Very good project coverage, but the IA status categories are a mess."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_miso.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n",
+    "# Nearly whole-row duplicate, except for \"studyPhase\"\n",
+    "raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n",
+    "# Nearly whole-row duplicate, except for \"studyPhase\"\n",
+    "lbnl.query('entity == \"MISO\"').loc[lbnl.query('entity == \"MISO\"').duplicated(subset='queue_id', keep=False), :].sort_values(by='queue_id')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso = join_lbnl(raw_miso.drop_duplicates(subset='Queue ID', keep='last'), lbnl.query('entity == \"MISO\"').drop_duplicates(subset='queue_id', keep='last'), \"MISO\")\n",
+    "miso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "both_miso = miso[miso.in_lbnl & miso.in_gs]\n",
+    "\n",
+    "compare_capacity(both_miso)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for **active** projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "both_miso = miso[miso.in_lbnl & miso.in_gs]\n",
+    "\n",
+    "both_miso[[\"Status\", \"queue_status\"]].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "active_in_lbnl_not_active_in_gs = both_miso[both_miso[\"queue_status\"].eq(\"active\") & both_miso[\"Status\"].ne(\"Active\")]\n",
+    "active_in_lbnl_not_active_in_gs[\"Status\"].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.to_datetime(active_in_lbnl_not_active_in_gs.query(\"Status == 'Done'\")[\"inService\"]).dt.year.value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Seems like the bulk of the projects that are active in LBNL but considered done in GS were completed in the last five or so years. Only 43 projects went in service in 2023. Shouldn't LBNL have caught the other projects in service?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.to_datetime(active_in_lbnl_not_active_in_gs.query(\"Status == 'Withdrawn'\")[\"Withdrawn Date\"]).dt.year.value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Great! The active LBNL projects that are Withdrawn in GS were mostly withdrawn in 2023."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "active_both_miso = both_miso[both_miso[\"queue_status\"].eq(\"active\") & both_miso[\"Status\"].eq(\"Active\")]\n",
+    "\n",
+    "compare_capacity(active_both_miso)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Projects that are marked active in both LBNL and GS have very similar total capacities."
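To make the status correspondence explicit rather than eyeballing value counts, a crosstab of MISO's raw study phase against LBNL's harmonized status reads off the implied mapping directly. A sketch; `studyPhase` is the raw MISO column noted above, and it survives the join because `join_lbnl` keeps all GridStatus columns:

```
# Rows: raw MISO study phase; columns: LBNL's harmonized status.
# Large off-diagonal counts point at mapping rules worth encoding.
pd.crosstab(
    both_miso["studyPhase"],
    both_miso["interconnection_status_lbnl"],
)
```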
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gs_capacities = miso[miso.in_gs][\"Winter Capacity (MW)\"]\n",
+    "lbnl_capacities = miso[miso.in_lbnl][\"capacity_mw_resource_1\"]\n",
+    "\n",
+    "print(f\"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}\")\n",
+    "print(f\"Ratio of total LBNL projects to total GS projects {len(lbnl_capacities) / len(gs_capacities)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Total MISO capacity is pretty similar in LBNL and GS. There are some differences in queue status, but nothing alarming."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### SPP\n",
+    "\n",
+    "* neither LBNL nor GridStatus has withdrawn projects\n",
+    "* GridStatus destroys the detailed status information during their processing, so we lose the ability to distinguish between \"IA pending\" and \"System Integration Study\". But I don't think that is a problem because both are included in the \"actionable\" criteria in Synapse's model (a mapping sketch follows at the end of this section)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_spp.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp = join_lbnl(raw_spp, lbnl, \"SPP\")\n",
+    "spp.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp.sample(4)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "both_spp = spp[spp.in_lbnl & spp.in_gs]\n",
+    "\n",
+    "compare_capacity(both_spp)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for **active** projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[[\"Status\", \"queue_status\"]].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp_active_in_both = spp[spp[\"queue_status\"].eq(\"active\") & spp[\"Status\"].eq(\"Active\")]\n",
+    "compare_capacity(spp_active_in_both)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Some mismatch in the status columns, but for projects that are active in both datasets the capacities look good."
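If the collapsed SPP statuses ever need to drive the actionable flag directly, a small lookup table is enough. A sketch with illustrative category strings; these are not SPP's exact vocabulary or the production mapping:

```
# Illustrative mapping from SPP-style study stages to an "actionable" flag.
SPP_ACTIONABLE = {
    "IA pending": True,
    "System Integration Study": True,
    "Feasibility Study": False,
    "Withdrawn": False,
}

def is_actionable(status: str) -> bool:
    """Treat unknown statuses as not actionable."""
    return SPP_ACTIONABLE.get(status, False)
```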
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Compare total capacity for all projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gs_capacities = spp[spp.in_gs][\"Capacity (MW)\"]\n", + "lbnl_capacities = spp[spp.in_lbnl][\"capacity_mw_resource_1\"]\n", + "\n", + "print(f\"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}\")\n", + "print(f\"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PJM\n", + "Like MISO, good project coverage, but the IA status categories are a mess.\n", + "\n", + "\"Active\" applied up to IA execution. Then \"Engineering and Procurement\" applied to IA execution through COD. Then \"In Service\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_pjm.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# \"Active\" stops at IA execution\n", + "raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pjm = join_lbnl(raw_pjm, lbnl, \"PJM\")\n", + "pjm.info()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "#### Compare total capacity for projects in both" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "both_pjm = pjm[pjm.in_lbnl & pjm.in_gs].copy()\n", + "\n", + "compare_capacity(both_pjm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "GS has about 25% more capacity for projects in both. Which projects have super different values? Is it a units issue?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "both_pjm[\"capacity_diff\"] = both_pjm[\"Capacity (MW)\"] - both_pjm[\"capacity_mw_resource_1\"]\n", + "\n", + "both_pjm[\"capacity_diff\"].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(both_pjm[\"capacity_diff\"].ne(0).value_counts())\n", + "\n", + "cap_fields = [\n", + " \"Capacity (MW)\",\n", + " \"Summer Capacity (MW)\",\n", + " \"Winter Capacity (MW)\",\n", + " \"capacity_mw_resource_1\"\n", + "]\n", + "\n", + "both_pjm_diff_caps = both_pjm[both_pjm[\"capacity_diff\"].ne(0) & ~both_pjm[\"Capacity (MW)\"].isna()].sort_values(\"capacity_diff\").copy()\n", + "both_pjm_diff_caps[cap_fields].head(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What is the difference between \"MW Capacity\", \"MW Energy\", \"MFO\", \"MW In Service\"? It seems like LBNL used \"MW Capacity\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GS renames \"MW Energy\" to \"Winter Capacity (MW)\"\n", + "compare_capacity(both_pjm, gs_cap_col=\"Winter Capacity (MW)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks like lbnl used \"MW Energy\" to measure capacity. Which columns should we be using?!" 
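One way to answer the column question is to score every candidate GS capacity column against the LBNL values on the matched projects. A sketch using `both_pjm` and the capacity columns from above; the 0.5 MW agreement tolerance is an arbitrary choice:

```
# Share of matched projects where each GS capacity column agrees with
# LBNL's capacity_mw_resource_1 to within 0.5 MW.
lbnl_cap = pd.to_numeric(both_pjm["capacity_mw_resource_1"], errors="coerce")
for col in ["Capacity (MW)", "Summer Capacity (MW)", "Winter Capacity (MW)"]:
    gs_cap = pd.to_numeric(both_pjm[col], errors="coerce")
    agree = (gs_cap - lbnl_cap).abs().lt(0.5).mean()
    print(f"{col}: {agree:.1%} agree within 0.5 MW")
```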
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for **active** projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm[[\"Status\", \"queue_status\"]].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm_active_in_both = pjm[pjm[\"queue_status\"].eq(\"active\") & pjm[\"Status\"].eq(\"Active\")]\n",
+    "compare_capacity(pjm_active_in_both, gs_cap_col=\"Winter Capacity (MW)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Some mismatch in the status columns, but for projects that are active in both datasets the capacities look good."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Compare total capacity for all projects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gs_capacities = pjm[pjm.in_gs][\"Winter Capacity (MW)\"]\n",
+    "lbnl_capacities = pjm[pjm.in_lbnl][\"capacity_mw_resource_1\"]\n",
+    "\n",
+    "print(f\"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}\")\n",
+    "print(f\"Ratio of total LBNL projects to total GS projects {len(lbnl_capacities) / len(gs_capacities)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "GS has a little more capacity and a few more projects than LBNL."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CAISO\n",
+    "Straightforward!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_caiso.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso = join_lbnl(raw_caiso, lbnl, \"CAISO\")\n",
+    "caiso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso.query(\"Status == 'ACTIVE'\").sample(8, random_state=42).sort_values('interconnection_status_lbnl')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso.loc[\n",
+    "    pd.to_datetime(caiso[\"Queue Date\"])\n",
+    "    .fillna(pd.to_datetime(\"2020-01-01\"))\n",
+    "    .lt(pd.to_datetime(\"2023-01-01\")),\n",
+    "    [\"in_gs\", \"in_lbnl\"],\n",
+    "].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# remarkably easy to match status values\n",
+    "caiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "both_caiso = caiso[caiso.in_lbnl & caiso.in_gs]\n",
+    "\n",
+    "compare_capacity(both_caiso)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for **active** projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
"outputs": [], + "source": [ + "caiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "caiso_active_in_both = caiso[caiso[\"queue_status\"].eq(\"active\") & caiso[\"Status\"].eq(\"ACTIVE\")]\n", + "compare_capacity(caiso_active_in_both)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Very little mismatching in status columns and capacities look good for projects that are active in both. There a few dozen lbnl active projects that are withdrawn or completed in GS." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Compare total capacity for all projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gs_capacities = caiso[caiso.in_gs][\"Capacity (MW)\"]\n", + "lbnl_capacities = caiso[caiso.in_lbnl][\"capacity_mw_resource_1\"]\n", + "\n", + "print(f\"Ratio of total LBNL capacity to total GS capacity {lbnl_capacities.sum() / gs_capacities.sum()}\")\n", + "print(f\"Ratio of total LBNL project to total GS projects {len(lbnl_capacities) / len(gs_capacities)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Basically the same amount of capacity and number of projects. love it" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ISO-NE\n", + "\n", + "* what GridStatus calls \"Queue ID\" was actually \"Queue Position\" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.duplicated(subset=['Queue ID']).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compound_key = ['Queue ID', 'Status']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# there is no natural key for this data. But this looks like the best tradeoff between key complexity and uniqueness.\n", + "compound_key = ['Queue ID', 'Project Name']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.\n", + "raw_isone.query('Status == \"Active\"')['Queue ID'].is_unique\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# duplicates (none Active) have all kinds of variation in values. Maybe best approach is to sort by date updated and take the latest one. 
It doesn't always differentiate them though.\n", + "raw_isone.loc[raw_isone.duplicated(subset=compound_key, keep=False),:].sort_values(by=compound_key).tail(10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# join manually rather than refactoring the func to take compound key\n", + "lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(\"ISO-NE\"), LBNL_JOIN_COLS].astype({'queue_id': int})\n", + "isone = raw_isone.sort_values('Updated').drop_duplicates(subset=compound_key, keep='last')\n", + "isone = isone.merge(lbnl_iso, how=\"outer\", left_on=compound_key, right_on=[c.lower().replace(' ', '_') for c in compound_key])\n", + "isone[\"in_lbnl\"] = ~isone[\"queue_id\"].isna()\n", + "isone[\"in_gs\"] = ~isone[\"Queue ID\"].isna()\n", + "del lbnl_iso\n", + "isone.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone.head(2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# small improvement from date filter\n", + "isone.loc[\n", + " pd.to_datetime(isone[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# status values are decently aligned\n", + "isone[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone['interconnection_status_raw'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "status_cols_isone = [\n", + " \"FS\",\n", + " \"SIS\",\n", + " \"OS\",\n", + " \"FAC\",\n", + " \"IA\",\n", + " \"Project Status\",\n", + "]\n", + "with pd.option_context('display.max_rows', None):\n", + " display(isone[status_cols_isone].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# gridstatus doesn't parse the status values, so they are all null. In their defense, the ISONE encodes them as alt text behind icons, which is stupid. 
But still.\n",
+    "isone[status_cols_isone]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_isone['Generation Type'].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "both_isone = isone[isone.in_lbnl & isone.in_gs]\n",
+    "\n",
+    "compare_capacity(both_isone)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Compare total capacity for **active** projects in both"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "isone[[\"Status\", \"queue_status\"]].value_counts(dropna=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "isone_active_in_both = isone[isone[\"queue_status\"].eq(\"active\") & isone[\"Status\"].eq(\"Active\")]\n",
+    "compare_capacity(isone_active_in_both)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Decent alignment between project statuses. GS has a few more projects. Did their status change in 2023?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.to_datetime(isone[isone[\"queue_status\"].isna() & isone[\"Status\"].eq(\"Active\")][\"Queue Date\"]).dt.year.plot.hist()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Compare total capacity for all projects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gs_capacities = isone[isone.in_gs][\"Capacity (MW)\"]\n",
+    "lbnl_capacities = isone[isone.in_lbnl][\"capacity_mw_resource_1\"]\n",
+    "\n",
+    "print(f\"Percent difference of total GS capacity relative to LBNL capacity {(gs_capacities.sum() - lbnl_capacities.sum()) / lbnl_capacities.sum() * 100}\")\n",
+    "print(f\"Ratio of total LBNL projects to total GS projects {len(lbnl_capacities) / len(gs_capacities)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Hmmm, substantially more capacity in GS."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/39-bdn-gridstatus-lbnl-generation-type.ipynb b/notebooks/39-bdn-gridstatus-lbnl-generation-type.ipynb
new file mode 100644
index 00000000..7586be9d
--- /dev/null
+++ b/notebooks/39-bdn-gridstatus-lbnl-generation-type.ipynb
@@ -0,0 +1,1146 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# GridStatus <-> LBNL Generation Type Mapping\n",
+    "\n",
+    "GridStatus reads raw ISO queues. The LBNL queues use standardized resource type categories, while each ISO reports its own generation/fuel types. This notebook compares the two to see how hard a mapping between them would be."
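(The harmonization this notebook works toward would presumably end up as a lookup table, roughly shaped like the sketch below. All category strings here are illustrative placeholders, not the real mapping, which is what the value_counts comparisons later in the notebook are meant to pin down.)

```
# Illustrative shape of a GridStatus -> LBNL resource type harmonization.
GS_TO_LBNL_RESOURCE = {
    "Solar": "Solar",
    "Battery": "Battery Storage",
    "Wind": "Onshore Wind",
    "Offshore Wind": "Offshore Wind",
    "Natural Gas": "Gas",
}

def harmonize_resource(raw_type: str) -> str:
    """Fall back to 'Other' for unmapped raw categories."""
    return GS_TO_LBNL_RESOURCE.get(raw_type, "Other")
```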
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import numpy as np\n", + "\n", + "from dbcp.extract.lbnl_iso_queue import extract\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the data\n", + "### LBNL-Compiled Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# partial implementation of transform. I don't want to include deduplication.\n", + "def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Transform active iso queue data.\"\"\"\n", + " rename_dict = {\n", + " \"state\": \"raw_state_name\",\n", + " \"county\": \"raw_county_name\",\n", + " }\n", + " active_projects = active_projects.rename(columns=rename_dict) # copy\n", + " # Harmonize the interconnection_status_lbnl values.\n", + " mapping = {\n", + " \"Feasability Study\": \"Feasibility Study\",\n", + " \"Feasibility\": \"Feasibility Study\",\n", + " \"Facilities Study\": \"Facility Study\",\n", + " \"IA in Progress\": \"In Progress (unknown study)\",\n", + " \"Unknown\": \"In Progress (unknown study)\",\n", + " \"Withdrawn, Feasibility Study\": \"Withdrawn\",\n", + " }\n", + " active_projects.loc[:, \"interconnection_status_lbnl\"] = active_projects.loc[\n", + " :, \"interconnection_status_lbnl\"\n", + " ].replace(mapping)\n", + " # drop irrelevant columns (structurally all nan due to 'active' filter)\n", + " active_projects.drop(columns=[\"date_withdrawn\", \"date_operational\"], inplace=True)\n", + " return active_projects\n", + "\n", + "\n", + "source_path = Path(\"/app/data/raw/queues_2022_clean_data.xlsx\")\n", + "raw_lbnl = extract(source_path)[\"lbnl_iso_queue\"]\n", + "lbnl = partial_transform(raw_lbnl)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.shape, lbnl.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.head(2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GridStatus Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import dbcp\n", + "\n", + "# These are the revision numbers of the oldest archives we have\n", + "iso_queue_versions: dict[str, str] = {\n", + " \"miso\": \"1681775160487863\",\n", + " \"caiso\": \"1681775162586588\",\n", + " \"pjm\": \"1681775160979859\",\n", + " \"ercot\": \"1681775161342766\",\n", + " \"spp\": \"1681775162935809\",\n", + " \"nyiso\": \"1681775159356063\",\n", + " \"isone\": \"1681775162111351\",\n", + "}\n", + "\n", + "gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "{k: v.shape for k, v in gs_dfs.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # These are manually downloaded from our archives. 
I went back as far as I could,\n", + "# # which is April 17 2023.\n", + "# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.\n", + "# root_path = Path(\"/app/data/raw/gridstatus/interconnection_queues\")\n", + "# assert root_path.exists()\n", + "# # filenames are like \"interconnection_queues_caiso_4-17-2023.parquet\"\n", + "# gs_dfs = {\n", + "# path.name.split(\"_\")[2]: pd.read_parquet(path)\n", + "# for path in root_path.glob(\"*.parquet\")\n", + "# }\n", + "# {k: v.shape for k, v in gs_dfs.items()}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# wayyy fewer items in GridStatus than LBNL.\n", + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl.shape[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.region.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl_iso = lbnl[~lbnl[\"region\"].str.contains(\"non-ISO\", na=False)].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl_iso.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Gridstatus has more ISO projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# globals().update(gs_dfs) # this works fine but the static type checker/linter can't introspect it.\n", + "raw_ercot = gs_dfs[\"ercot\"]\n", + "raw_nyiso = gs_dfs[\"nyiso\"]\n", + "raw_isone = gs_dfs[\"isone\"]\n", + "raw_miso = gs_dfs[\"miso\"]\n", + "raw_pjm = gs_dfs[\"pjm\"]\n", + "raw_spp = gs_dfs[\"spp\"]\n", + "raw_caiso = gs_dfs[\"caiso\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LBNL_JOIN_COLS = [\n", + " \"queue_id\", # join key\n", + " \"project_name\", # for manually checking the joins\n", + " \"queue_date\", # for manually checking the joins\n", + " \"queue_status\", # for manually checking the joins\n", + " \"interconnection_status_raw\", # see what LBNL interpreted\n", + " \"interconnection_status_lbnl\", # final mapping value\n", + " \"capacity_mw_resource_1\",\n", + " \"resource_type_1\"\n", + " \n", + "]\n", + "\n", + "\n", + "def join_lbnl(\n", + " iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col=\"Queue ID\"\n", + ") -> pd.DataFrame:\n", + " \"\"\"Join LBNL queue data to GridStatus queue data.\"\"\"\n", + " assert iso_df[iso_id_col].is_unique, \"ID column not unique\"\n", + " lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(iso_name), LBNL_JOIN_COLS]\n", + " assert not lbnl_iso.empty, f\"Empty LBNL queue for {iso_name}\"\n", + " assert lbnl_iso[\"queue_id\"].is_unique, \"LBNL queue ID not unique\"\n", + " out = iso_df.merge(lbnl_iso, how=\"outer\", left_on=iso_id_col, right_on=\"queue_id\")\n", + " out[\"in_lbnl\"] = ~out[\"queue_id\"].isna()\n", + " out[\"in_gs\"] = ~out[iso_id_col].isna()\n", + " return out\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Comparisons\n", + "### ERCOT\n", + "* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. 
I'm not sure where LBNL gets the withdrawn projects from.\n", + " * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?\n", + "* GridStatus defines \"status\" as \"IA Signed\".isna(). LBNL calls the entire \"large active\" dataset \"active\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_ercot.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot = join_lbnl(raw_ercot, lbnl, \"ERCOT\")\n", + "ercot.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[\"resource_type_1\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[\"Fuel\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[\"Technology\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great! between GS.Fuel and GS.Technology it should be pretty easy to map to the values used in LBNL." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### NYISO\n", + "\n", + "* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. Key:\n", + " * 0=Withdrawn\n", + " * 1=Scoping Meeting Pending\n", + " * 2=FES Pending\n", + " * 3=FES in Progress\n", + " * 4=SRIS/SIS Pending\n", + " * 5=SRIS/SIS in Progress\n", + " * 6=SRIS/SIS Approved\n", + " * 7=FS Pending\n", + " * 8=Rejected Cost Allocation/Next FS Pending\n", + " * 9=FS in Progress\n", + " * 10=Accepted Cost Allocation/IA in Progress\n", + " * 11=IA Completed\n", + " * 12=Under Construction\n", + " * 13=In Service for Test\n", + " * 14=In Service Commercial\n", + " * 15=Partial In-Service\n", + "* Availability of Studies Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available\n", + "* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso[raw_nyiso[\"Queue ID\"].duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# I don't know which is correct (if either), but I'll guess that the later one is. 
So keep='last'\n",
+    "raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, \"NYISO\")\n",
+    "nyiso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# marginal improvement from date filter\n",
+    "nyiso.loc[\n",
+    "    nyiso[\"Queue Date\"]\n",
+    "    .fillna(pd.to_datetime(\"2020-01-01\"))\n",
+    "    .lt(pd.to_datetime(\"2023-01-01\")),\n",
+    "    [\"in_gs\", \"in_lbnl\"],\n",
+    "].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Compare fuel types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso_transmission = nyiso[nyiso[\"Generation Type\"].str.contains(\"Transmission\",na=False)]\n",
+    "nyiso_transmission.queue_id.isna().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Looks like LBNL removes transmission projects.\n",
+    "\n",
+    "GS has more generation types, but it shouldn't be too hard to consolidate them to LBNL's types."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### MISO\n",
+    "Very good project coverage, but the IA status categories are a mess."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_miso.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n",
+    "# Nearly whole-row duplicate, except for \"studyPhase\"\n",
+    "raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n",
+    "# Nearly whole-row duplicate, except for \"studyPhase\"\n",
+    "lbnl.query('entity == \"MISO\"').loc[lbnl.query('entity == \"MISO\"').duplicated(subset='queue_id', keep=False), :].sort_values(by='queue_id')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso = join_lbnl(raw_miso.drop_duplicates(subset='Queue ID', keep='last'), lbnl.query('entity == \"MISO\"').drop_duplicates(subset='queue_id', keep='last'), \"MISO\")\n",
+    "miso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks pretty straightforward."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### SPP\n",
+    "\n",
+    "* neither LBNL nor GridStatus has withdrawn projects\n",
+    "* GridStatus destroys the detailed status information during their processing, so we lose the ability to distinguish between \"IA pending\" and \"System Integration Study\". But I don't think that is a problem because both are included in the \"actionable\" criteria in Synapse's model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_spp.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp = join_lbnl(raw_spp, lbnl, \"SPP\")\n",
+    "spp.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp.sample(4)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks tractable."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### PJM\n",
+    "Like MISO, good project coverage, but the IA status categories are a mess.\n",
+    "\n",
+    "\"Active\" applies up to IA execution. Then \"Engineering and Procurement\" applies from IA execution through COD. Then \"In Service\"."
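That three-stage lifecycle translates into a coarse queue-status mapping; a sketch, assuming PJM's `Status` values are exactly the stage names above plus "Withdrawn":

```
# Coarse mapping implied by PJM's lifecycle: "Active" runs up to IA
# execution, "Engineering and Procurement" from IA execution to COD,
# then "In Service".
PJM_STAGE_TO_QUEUE_STATUS = {
    "Active": "active",
    "Engineering and Procurement": "active",  # IA executed, pre-COD
    "In Service": "operational",
    "Withdrawn": "withdrawn",  # assumed value, not confirmed above
}
```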
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_pjm.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# \"Active\" stops at IA execution\n",
+    "raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm = join_lbnl(raw_pjm, lbnl, \"PJM\")\n",
+    "pjm.info()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks tractable."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CAISO\n",
+    "Straightforward!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_caiso.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso = join_lbnl(raw_caiso, lbnl, \"CAISO\")\n",
+    "caiso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso.query(\"Status == 'ACTIVE'\").sample(8, random_state=42).sort_values('interconnection_status_lbnl')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso.loc[\n",
+    "    pd.to_datetime(caiso[\"Queue Date\"])\n",
+    "    .fillna(pd.to_datetime(\"2020-01-01\"))\n",
+    "    .lt(pd.to_datetime(\"2023-01-01\")),\n",
+    "    [\"in_gs\", \"in_lbnl\"],\n",
+    "].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# remarkably easy to match status values\n",
+    "caiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A bit messier than the others, but still looks tractable."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ISO-NE\n",
+    "\n",
+    "* What GridStatus calls \"Queue ID\" was actually \"Queue Position\" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects."
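Given that `Queue ID` alone can't key the withdrawn rows, one workaround is a surrogate key concatenating `Queue ID` with `Project Name`, the compound key the cells below settle on as the best uniqueness/complexity tradeoff. A sketch, not something the ETL currently does:

```
import pandas as pd

def surrogate_key(df: pd.DataFrame) -> pd.Series:
    """Build a string key from Queue ID plus Project Name."""
    return (
        df["Queue ID"].astype(str).str.strip()
        + "|"
        + df["Project Name"].astype(str).str.strip()
    )
```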
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.duplicated(subset=['Queue ID']).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compound_key = ['Queue ID', 'Status']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# there is no natural key for this data. But this looks like the best tradeoff between key complexity and uniqueness.\n", + "compound_key = ['Queue ID', 'Project Name']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.\n", + "raw_isone.query('Status == \"Active\"')['Queue ID'].is_unique\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# duplicates (none Active) have all kinds of variation in values. Maybe best approach is to sort by date updated and take the latest one. 
It doesn't always differentiate them though.\n", + "raw_isone.loc[raw_isone.duplicated(subset=compound_key, keep=False),:].sort_values(by=compound_key).tail(10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# join manually rather than refactoring the func to take compound key\n", + "lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(\"ISO-NE\"), LBNL_JOIN_COLS].astype({'queue_id': int})\n", + "isone = raw_isone.sort_values('Updated').drop_duplicates(subset=compound_key, keep='last')\n", + "isone = isone.merge(lbnl_iso, how=\"outer\", left_on=compound_key, right_on=[c.lower().replace(' ', '_') for c in compound_key])\n", + "isone[\"in_lbnl\"] = ~isone[\"queue_id\"].isna()\n", + "isone[\"in_gs\"] = ~isone[\"Queue ID\"].isna()\n", + "del lbnl_iso\n", + "isone.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone.head(2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# small improvement from date filter\n", + "isone.loc[\n", + " pd.to_datetime(isone[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# status values are decently aligned\n", + "isone[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone['interconnection_status_raw'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "status_cols_isone = [\n", + " \"FS\",\n", + " \"SIS\",\n", + " \"OS\",\n", + " \"FAC\",\n", + " \"IA\",\n", + " \"Project Status\",\n", + "]\n", + "with pd.option_context('display.max_rows', None):\n", + " display(isone[status_cols_isone].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# gridstatus doesn't parse the status values, so they are all null. In their defense, the ISONE encodes them as alt text behind icons, which is stupid. 
But still.\n",
+    "isone[status_cols_isone]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_isone['Generation Type'].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "isone[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "isone[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks tractable. Will have to look up the codes listed in the Excel sheets on [this ISONE site](https://www.iso-ne.com/isoexpress/web/reports/operations/-/tree/seasonal-claimed-capability)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lbnl[\"region\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Some fun bonus capacity analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "active_lbnl = lbnl.query(\"queue_status == 'active'\")\n",
+    "active_lbnl.groupby(lbnl[\"region\"].str.contains(\"non-ISO\", na=False))[\"capacity_mw_resource_1\"].sum() / active_lbnl[\"capacity_mw_resource_1\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### What are the top non-ISO utilities?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "non_iso_lbnl = active_lbnl[active_lbnl[\"region\"].str.contains(\"non-ISO\", na=False)]\n",
+    "non_iso_lbnl_mw = non_iso_lbnl.groupby(\"utility\")[\"capacity_mw_resource_1\"].sum().sort_values() / non_iso_lbnl[\"capacity_mw_resource_1\"].sum()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "top_mw_non_iso = []\n",
+    "\n",
+    "for i in range(5, len(non_iso_lbnl_mw), 5):\n",
+    "    top_mw_non_iso.append((i, non_iso_lbnl_mw.tail(i).sum()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(top_mw_non_iso, columns=(\"top_n_utilities\", \"pct_total_non_iso_capacity\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "non_iso_lbnl_mw.tail(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/40-bdn-gridstatus-lbnl-counties.ipynb b/notebooks/40-bdn-gridstatus-lbnl-counties.ipynb
new file mode 100644
index 00000000..360cfd66
--- /dev/null
+++ b/notebooks/40-bdn-gridstatus-lbnl-counties.ipynb
@@ -0,0 +1,1168 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# GridStatus <-> LBNL Status Mapping\n",
+    "\n",
+    "GridStatus 
reads raw ISO queues. The LBNL queue statuses are standardized versions of ISO-specific categories. This notebook attempts to reverse engineer the mappings between the two." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import numpy as np\n", + "\n", + "from dbcp.extract.lbnl_iso_queue import extract\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get the data\n", + "### LBNL-Compiled Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# partial implementation of transform. I don't want to include deduplication.\n", + "def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Transform active iso queue data.\"\"\"\n", + " rename_dict = {\n", + " \"state\": \"raw_state_name\",\n", + " \"county\": \"raw_county_name\",\n", + " }\n", + " active_projects = active_projects.rename(columns=rename_dict) # copy\n", + " # Harmonize the interconnection_status_lbnl values.\n", + " mapping = {\n", + " \"Feasability Study\": \"Feasibility Study\",\n", + " \"Feasibility\": \"Feasibility Study\",\n", + " \"Facilities Study\": \"Facility Study\",\n", + " \"IA in Progress\": \"In Progress (unknown study)\",\n", + " \"Unknown\": \"In Progress (unknown study)\",\n", + " \"Withdrawn, Feasibility Study\": \"Withdrawn\",\n", + " }\n", + " active_projects.loc[:, \"interconnection_status_lbnl\"] = active_projects.loc[\n", + " :, \"interconnection_status_lbnl\"\n", + " ].replace(mapping)\n", + " # drop irrelevant columns (structurally all nan due to 'active' filter)\n", + " active_projects.drop(columns=[\"date_withdrawn\", \"date_operational\"], inplace=True)\n", + " return active_projects\n", + "\n", + "\n", + "source_path = Path(\"/app/data/raw/queues_2022_clean_data.xlsx\")\n", + "raw_lbnl = extract(source_path)[\"lbnl_iso_queue\"]\n", + "lbnl = partial_transform(raw_lbnl)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.shape, lbnl.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.head(2)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GridStatus Queues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import dbcp\n", + "\n", + "# These are the revision numbers of the oldest archives we have\n", + "iso_queue_versions: dict[str, str] = {\n", + " \"miso\": \"1681775160487863\",\n", + " \"caiso\": \"1681775162586588\",\n", + " \"pjm\": \"1681775160979859\",\n", + " \"ercot\": \"1681775161342766\",\n", + " \"spp\": \"1681775162935809\",\n", + " \"nyiso\": \"1681775159356063\",\n", + " \"isone\": \"1681775162111351\",\n", + "}\n", + "\n", + "gs_dfs = dbcp.extract.gridstatus_isoqueues.extract(iso_queue_versions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "{k: v.shape for k, v in gs_dfs.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for iso, df in gs_dfs.items():\n", + " print(iso)\n", + " print(df[\"County\"].isna().value_counts(normalize=True))\n", + " print(df[\"State\"].isna().value_counts(normalize=True))\n", + " print()" + ] + }, + { + 
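"cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The per-ISO printout above is hard to scan side by side. A small sketch (illustrative only) that collects the same County/State missingness into one table from the `gs_dfs` dict loaded above:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# share of rows missing County/State, one row per ISO (sketch)\n",
+    "pd.DataFrame(\n",
+    "    {\n",
+    "        iso: {\n",
+    "            \"pct_county_missing\": df[\"County\"].isna().mean(),\n",
+    "            \"pct_state_missing\": df[\"State\"].isna().mean(),\n",
+    "        }\n",
+    "        for iso, df in gs_dfs.items()\n",
+    "    }\n",
+    ").T\n"
+   ]
+  },
+  {
+   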
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # These are manually downloaded from our archives. I went back as far as I could,\n", + "# # which is April 17 2023.\n", + "# # There is a 3.5 month gap between the LBNL queues and our oldest GridStatus archive.\n", + "# root_path = Path(\"/app/data/raw/gridstatus/interconnection_queues\")\n", + "# assert root_path.exists()\n", + "# # filenames are like \"interconnection_queues_caiso_4-17-2023.parquet\"\n", + "# gs_dfs = {\n", + "# path.name.split(\"_\")[2]: pd.read_parquet(path)\n", + "# for path in root_path.glob(\"*.parquet\")\n", + "# }\n", + "# {k: v.shape for k, v in gs_dfs.items()}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# wayyy fewer items in GridStatus than LBNL.\n", + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl.shape[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl.region.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lbnl_iso = lbnl[~lbnl[\"region\"].str.contains(\"non-ISO\", na=False)].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sum([v.shape[0] for v in gs_dfs.values()]) - lbnl_iso.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Gridstatus has more ISO projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# globals().update(gs_dfs) # this works fine but the static type checker/linter can't introspect it.\n", + "raw_ercot = gs_dfs[\"ercot\"]\n", + "raw_nyiso = gs_dfs[\"nyiso\"]\n", + "raw_isone = gs_dfs[\"isone\"]\n", + "raw_miso = gs_dfs[\"miso\"]\n", + "raw_pjm = gs_dfs[\"pjm\"]\n", + "raw_spp = gs_dfs[\"spp\"]\n", + "raw_caiso = gs_dfs[\"caiso\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LBNL_JOIN_COLS = [\n", + " \"queue_id\", # join key\n", + " \"project_name\", # for manually checking the joins\n", + " \"queue_date\", # for manually checking the joins\n", + " \"queue_status\", # for manually checking the joins\n", + " \"interconnection_status_raw\", # see what LBNL interpreted\n", + " \"interconnection_status_lbnl\", # final mapping value\n", + " \"capacity_mw_resource_1\",\n", + " \"resource_type_1\"\n", + " \n", + "]\n", + "\n", + "\n", + "def join_lbnl(\n", + " iso_df: pd.DataFrame, lbnl: pd.DataFrame, iso_name: str, iso_id_col=\"Queue ID\"\n", + ") -> pd.DataFrame:\n", + " \"\"\"Join LBNL queue data to GridStatus queue data.\"\"\"\n", + " assert iso_df[iso_id_col].is_unique, \"ID column not unique\"\n", + " lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(iso_name), LBNL_JOIN_COLS]\n", + " assert not lbnl_iso.empty, f\"Empty LBNL queue for {iso_name}\"\n", + " assert lbnl_iso[\"queue_id\"].is_unique, \"LBNL queue ID not unique\"\n", + " out = iso_df.merge(lbnl_iso, how=\"outer\", left_on=iso_id_col, right_on=\"queue_id\")\n", + " out[\"in_lbnl\"] = ~out[\"queue_id\"].isna()\n", + " out[\"in_gs\"] = ~out[iso_id_col].isna()\n", + " return out\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Comparisons\n", + 
"### ERCOT\n", + "* ERCOT's queue document has separate excel sheets for large and small (< 20MW) projects, and does not list withdrawn projects. GridStatus only includes large projects, whereas LBNL data includes the other categories. I'm not sure where LBNL gets the withdrawn projects from.\n", + " * We might be able to maintain a withdrawn list by 1) getting all the current withdrawn projects from the LBNL data, then 2) loading all the archived GridStatus ERCOT queues and checking for any projects that enter then exit the list. But I don't think we actually need a withdrawn list?\n", + "* GridStatus defines \"status\" as \"IA Signed\".isna(). LBNL calls the entire \"large active\" dataset \"active\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_ercot.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot = join_lbnl(raw_ercot, lbnl, \"ERCOT\")\n", + "ercot.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[\"resource_type_1\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[\"Fuel\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ercot[\"Technology\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great! between GS.Fuel and GS.Technology it should be pretty easy to map to the values used in LBNL." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### NYISO\n", + "\n", + "* LBNL appears to have used the column labeled `S`, which is an ordinal status number corresponding to NYISO's LFIP. Key:\n", + " * 0=Withdrawn\n", + " * 1=Scoping Meeting Pending\n", + " * 2=FES Pending\n", + " * 3=FES in Progress\n", + " * 4=SRIS/SIS Pending\n", + " * 5=SRIS/SIS in Progress\n", + " * 6=SRIS/SIS Approved\n", + " * 7=FS Pending\n", + " * 8=Rejected Cost Allocation/Next FS Pending\n", + " * 9=FS in Progress\n", + " * 10=Accepted Cost Allocation/IA in Progress\n", + " * 11=IA Completed\n", + " * 12=Under Construction\n", + " * 13=In Service for Test\n", + " * 14=In Service Commercial\n", + " * 15=Partial In-Service\n", + "* Availability of Studies Key: None=Not Available, FES=Feasibility Study Available, SRIS=System Reliability Impact Study Available, FS=Facilities Study and/or ATRA Available\n", + "* CY Completion/SGIA Tender refers to the Attachment X milestone used to apply the 4-year COD limitation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_nyiso[raw_nyiso[\"Queue ID\"].duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# I don't know which is correct (if either), but I'll guess that the later one is. 
So keep='last'\n",
+    "raw_nyiso.loc[raw_nyiso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso = join_lbnl(raw_nyiso.drop_duplicates(subset='Queue ID', keep='last'), lbnl, \"NYISO\")\n",
+    "nyiso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# marginal improvement from date filter\n",
+    "nyiso.loc[\n",
+    "    nyiso[\"Queue Date\"]\n",
+    "    .fillna(pd.to_datetime(\"2020-01-01\"))\n",
+    "    .lt(pd.to_datetime(\"2023-01-01\")),\n",
+    "    [\"in_gs\", \"in_lbnl\"],\n",
+    "].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Compare fuel types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nyiso_transmission = nyiso[nyiso[\"Generation Type\"].str.contains(\"Transmission\",na=False)]\n",
+    "nyiso_transmission.queue_id.isna().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Looks like LBNL removes transmission projects.\n",
+    "\n",
+    "GS has more generation types, but it shouldn't be too hard to consolidate them to LBNL's types."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### MISO\n",
+    "Very good project coverage, but the IA status categories are a mess."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_miso.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I don't know which is correct (if either), but I'll guess that the later one is. So keep='last'\n",
+    "# Nearly whole-row duplicate, except for \"studyPhase\"\n",
+    "raw_miso.loc[raw_miso.duplicated(subset='Queue ID', keep=False), :].sort_values(by='Queue ID')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I don't know which is correct (if either), but I'll guess that the later one is. 
So keep='last'\n",
+    "# Nearly whole-row duplicate, except for \"studyPhase\"\n",
+    "lbnl.query('entity == \"MISO\"').loc[lbnl.query('entity == \"MISO\"').duplicated(subset='queue_id', keep=False), :].sort_values(by='queue_id')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso = join_lbnl(raw_miso.drop_duplicates(subset='Queue ID', keep='last'), lbnl.query('entity == \"MISO\"').drop_duplicates(subset='queue_id', keep='last'), \"MISO\")\n",
+    "miso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "miso[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks pretty straightforward."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### SPP\n",
+    "\n",
+    "* neither LBNL nor GridStatus has withdrawn projects\n",
+    "* GridStatus destroys the detailed status information during its processing, so we lose the ability to distinguish between \"IA pending\" and \"System Integration Study\". But I don't think that is a problem because both are included in the \"actionable\" criteria in Synapse's model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_spp.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp = join_lbnl(raw_spp, lbnl, \"SPP\")\n",
+    "spp.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp.sample(4)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spp[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks tractable"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### PJM\n",
+    "Like MISO, good project coverage, but the IA status categories are a mess.\n",
+    "\n",
+    "\"Active\" applies up to IA execution, then \"Engineering and Procurement\" applies from IA execution through COD, then \"In Service\".\n",
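+    "\n",
+    "That ordering suggests a simple lifecycle mapping. A sketch (my reading of the statuses above, not an official PJM definition):\n",
+    "\n",
+    "```python\n",
+    "pjm_status_to_stage = {\n",
+    "    \"Active\": \"pre-IA\",\n",
+    "    \"Engineering and Procurement\": \"IA executed, pre-COD\",\n",
+    "    \"In Service\": \"operational\",\n",
+    "}\n",
+    "```"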
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_pjm.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# \"Active\" stops at IA execution\n",
+    "raw_pjm[['Interim/Interconnection Service Agreement Status', 'Status']].value_counts(dropna=False).sort_index()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm = join_lbnl(raw_pjm, lbnl, \"PJM\")\n",
+    "pjm.info()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pjm[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks tractable"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### CAISO\n",
+    "Straightforward!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_caiso.columns\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso = join_lbnl(raw_caiso, lbnl, \"CAISO\")\n",
+    "caiso.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso.query(\"Status == 'ACTIVE'\").sample(8, random_state=42).sort_values('interconnection_status_lbnl')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso.loc[\n",
+    "    pd.to_datetime(caiso[\"Queue Date\"])\n",
+    "    .fillna(pd.to_datetime(\"2020-01-01\"))\n",
+    "    .lt(pd.to_datetime(\"2023-01-01\")),\n",
+    "    [\"in_gs\", \"in_lbnl\"],\n",
+    "].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# remarkably easy to match status values\n",
+    "caiso[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caiso[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A bit messier than others but still looks tractable"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### ISO-NE\n",
+    "\n",
+    "* what GridStatus calls \"Queue ID\" was actually \"Queue Position\" in the original data. It is unique amongst active projects but not for withdrawn projects. There is no natural key for withdrawn projects (see the uniqueness check sketched below).\n",
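+    "\n",
+    "A quick way to test that uniqueness claim per status group (a sketch, not part of the pipeline):\n",
+    "\n",
+    "```python\n",
+    "raw_isone.groupby(\"Status\")[\"Queue ID\"].apply(lambda s: s.is_unique)\n",
+    "```"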
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.duplicated(subset=['Queue ID']).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone.loc[raw_isone.duplicated(subset=['Queue ID'], keep=False), 'Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Status'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compound_key = ['Queue ID', 'Status']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# there is no natural key for this data. But this looks like the best tradeoff between key complexity and uniqueness.\n", + "compound_key = ['Queue ID', 'Project Name']\n", + "raw_isone.duplicated(subset=compound_key).sum()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Queue ID is unique within the Active projects. Not sure how change over time will impact the join to LBNL.\n", + "raw_isone.query('Status == \"Active\"')['Queue ID'].is_unique\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# duplicates (none Active) have all kinds of variation in values. Maybe best approach is to sort by date updated and take the latest one. 
It doesn't always differentiate them though.\n", + "raw_isone.loc[raw_isone.duplicated(subset=compound_key, keep=False),:].sort_values(by=compound_key).tail(10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_isone['Updated'] = pd.to_datetime(raw_isone['Updated'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# join manually rather than refactoring the func to take compound key\n", + "lbnl_iso = lbnl.loc[lbnl[\"entity\"].eq(\"ISO-NE\"), LBNL_JOIN_COLS].astype({'queue_id': int})\n", + "isone = raw_isone.sort_values('Updated').drop_duplicates(subset=compound_key, keep='last')\n", + "isone = isone.merge(lbnl_iso, how=\"outer\", left_on=compound_key, right_on=[c.lower().replace(' ', '_') for c in compound_key])\n", + "isone[\"in_lbnl\"] = ~isone[\"queue_id\"].isna()\n", + "isone[\"in_gs\"] = ~isone[\"Queue ID\"].isna()\n", + "del lbnl_iso\n", + "isone.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone.head(2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[[\"in_gs\", \"in_lbnl\"]].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# small improvement from date filter\n", + "isone.loc[\n", + " pd.to_datetime(isone[\"Queue Date\"])\n", + " .fillna(pd.to_datetime(\"2020-01-01\"))\n", + " .lt(pd.to_datetime(\"2023-01-01\")),\n", + " [\"in_gs\", \"in_lbnl\"],\n", + "].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# status values are decently aligned\n", + "isone[[\"Status\", \"queue_status\"]].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone['interconnection_status_raw'].value_counts(dropna=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "isone[['interconnection_status_lbnl', 'interconnection_status_raw']].value_counts(dropna=False).sort_index()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "status_cols_isone = [\n", + " \"FS\",\n", + " \"SIS\",\n", + " \"OS\",\n", + " \"FAC\",\n", + " \"IA\",\n", + " \"Project Status\",\n", + "]\n", + "with pd.option_context('display.max_rows', None):\n", + " display(isone[status_cols_isone].replace(['None'], np.nan).value_counts(dropna=False).where(lambda x: x>2).dropna().astype(int).sort_index())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# gridstatus doesn't parse the status values, so they are all null. In their defense, the ISONE encodes them as alt text behind icons, which is stupid. 
But still.\n",
+    "isone[status_cols_isone]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_isone['Generation Type'].value_counts(dropna=False)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare generation types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "isone[\"resource_type_1\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "isone[\"Generation Type\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation type mapping looks tractable. Will have to look up the codes listed in the Excel sheets on [this ISONE site](https://www.iso-ne.com/isoexpress/web/reports/operations/-/tree/seasonal-claimed-capability)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lbnl[\"region\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Some fun bonus capacity analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "active_lbnl = lbnl.query(\"queue_status == 'active'\")\n",
+    "active_lbnl.groupby(lbnl[\"region\"].str.contains(\"non-ISO\", na=False))[\"capacity_mw_resource_1\"].sum() / active_lbnl[\"capacity_mw_resource_1\"].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### What are the top non-ISO utilities?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "non_iso_lbnl = active_lbnl[active_lbnl[\"region\"].str.contains(\"non-ISO\", na=False)]\n",
+    "non_iso_lbnl_mw = non_iso_lbnl.groupby(\"utility\")[\"capacity_mw_resource_1\"].sum().sort_values() / non_iso_lbnl[\"capacity_mw_resource_1\"].sum()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "top_mw_non_iso = []\n",
+    "\n",
+    "for i in range(5, len(non_iso_lbnl_mw), 5):\n",
+    "    top_mw_non_iso.append((i, non_iso_lbnl_mw.tail(i).sum()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(top_mw_non_iso, columns=(\"top_n_utilities\", \"pct_total_non_iso_capacity\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "non_iso_lbnl_mw.tail(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/41-bdn-queue-comparison.ipynb b/notebooks/41-bdn-queue-comparison.ipynb
new file mode 100644
index 00000000..86f31478
--- /dev/null
+++ b/notebooks/41-bdn-queue-comparison.ipynb
@@ -0,0 +1,718 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4012c949-f6fd-4fbc-8601-7fac99b753b4",
+   "metadata": {},
+   "source": [
+    "## Purpose\n",
+    "This notebook compares 
some high level metrics about two versions of the `iso_projects_long_format`. This is helpful for running sanity checks when updating queue data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57c9155c-f91f-4d71-a73e-16273b3d84bd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from dbcp.helpers import get_sql_engine\n", + "import pandas as pd\n", + "\n", + "engine = get_sql_engine()\n", + "\n", + "# with engine.connect() as con:\n", + "# projects_long = pd.read_sql_table(\"iso_projects_long_format\", con, schema=\"data_mart\")\n", + " \n", + "# projects_long = projects_long.convert_dtypes()\n", + "# print(projects_long.date_entered_queue.max())\n", + "# projects_long.to_parquet(\"iso_projects_long_format_gs.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "59a4aa6a-1bfa-4331-a8dc-a72b375a42f2", + "metadata": { + "tags": [] + }, + "source": [ + "## Load projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2144cc1-a3f2-436f-bc46-bbdf8b283f71", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5c14b8f-328e-4c7f-bc2a-c61d14997f50", + "metadata": {}, + "outputs": [], + "source": [ + "lbnl_projects = pd.read_parquet(\"iso_projects_long_format_lbnl.parquet\")\n", + "gs_projects = pd.read_parquet(\"iso_projects_long_format_gs.parquet\")\n", + "\n", + "print(gs_projects.date_entered_queue.max())\n", + "print(lbnl_projects.date_entered_queue.max())" + ] + }, + { + "cell_type": "markdown", + "id": "c5f96a1c-4936-49da-b618-de427278bc40", + "metadata": {}, + "source": [ + "## Aggregate project dataframes by county" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92d6bb46-fc02-4fc0-a916-d9d61d3f6673", + "metadata": {}, + "outputs": [], + "source": [ + "def aggregate_iso_projects_by_count(df):\n", + "\n", + " def contains_iso_project(grp):\n", + " return any([\"non-ISO\" not in region for region in grp if not isinstance(region, type(pd.NA))])\n", + " \n", + " def get_primary_iso(grp):\n", + " # There are 16 counties that have equal number of projects in multiple regions. 
Select the first one\n",
+    "        return grp.mode().head(1)\n",
+    "\n",
+    "    agg_df = df.groupby(\"county_id_fips\").agg(\n",
+    "        has_iso_project=pd.NamedAgg(column=\"iso_region\", aggfunc=contains_iso_project),\n",
+    "        primary_iso_region=pd.NamedAgg(column=\"iso_region\", aggfunc=get_primary_iso),\n",
+    "        capacity_mw=pd.NamedAgg(column=\"capacity_mw\", aggfunc=\"sum\"),\n",
+    "        co2e_tonnes_per_year=pd.NamedAgg(column=\"co2e_tonnes_per_year\", aggfunc=\"sum\")\n",
+    "    )\n",
+    "    \n",
+    "    def agg_actionable_mw(grp_df):\n",
+    "        return grp_df[grp_df.is_actionable].capacity_mw.sum()\n",
+    "\n",
+    "    def agg_certain_mw(grp_df):\n",
+    "        return grp_df[grp_df.is_nearly_certain].capacity_mw.sum()\n",
+    "\n",
+    "\n",
+    "    agg_df[\"actionable_capacity_mw\"] = df.groupby(\"county_id_fips\").apply(agg_actionable_mw)\n",
+    "    agg_df[\"nearly_certain_capacity_mw\"] = df.groupby(\"county_id_fips\").apply(agg_certain_mw)\n",
+    "    \n",
+    "    agg_df[\"actionable_n_projects\"] = df.groupby(\"county_id_fips\").is_actionable.sum()\n",
+    "    # .sum() counts projects flagged nearly certain, matching the actionable count above\n",
+    "    agg_df[\"nearly_certain_n_projects\"] = df.groupby(\"county_id_fips\").is_nearly_certain.sum()\n",
+    "    return agg_df.reset_index()\n",
+    "\n",
+    "new_projects_counties = aggregate_iso_projects_by_count(gs_projects)\n",
+    "old_projects_counties = aggregate_iso_projects_by_count(lbnl_projects)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "44d2ab53-6b17-40b6-b294-ba8654f59a66",
+   "metadata": {},
+   "source": [
+    "## Number of counties with projects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d42e25e-8366-4326-98c1-384f2f34b1bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_counties_with_projects_in_new_not_in_old = len(set(new_projects_counties.county_id_fips) - set(old_projects_counties.county_id_fips))\n",
+    "n_counties_with_projects_in_old_not_in_new = len(set(old_projects_counties.county_id_fips) - set(new_projects_counties.county_id_fips))\n",
+    "\n",
+    "print(n_counties_with_projects_in_new_not_in_old)\n",
+    "print(n_counties_with_projects_in_old_not_in_new)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c07c6cbf-4989-47f3-81f9-6a7ff08a30fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(len(old_projects_counties))\n",
+    "print(len(new_projects_counties))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f5303c5-9af8-4536-8644-b03d368e22f0",
+   "metadata": {},
+   "source": [
+    "## Make sure capacity in counties without any ISO projects remains unchanged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aeb284db-416f-494d-a7ce-19f0357c05ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project_counties = old_projects_counties.merge(new_projects_counties, on=\"county_id_fips\", how=\"outer\", validate=\"1:1\", suffixes=(\"_old\", \"_new\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6cee0549-f231-4e68-a5a1-e0626ff515b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project_counties[\"has_iso_project_old\"] = project_counties.has_iso_project_old.astype(\"boolean\")\n",
+    "project_counties[\"has_iso_project_new\"] = project_counties.has_iso_project_new.astype(\"boolean\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63a8a454-0bfc-4543-82d8-040a023c5cc8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print((~project_counties.has_iso_project_old).value_counts())\n",
+    "print()\n",
+    "print((~project_counties.has_iso_project_new).value_counts())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": 
"eded13f5-0897-4700-bf09-009c44c86a23", + "metadata": {}, + "source": [ + "Pretty similar number of counties that don't have any ISO projects. Lets plot them to make sure it makes sense with the ISO boundaries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea8dc6eb-38c4-4689-86c7-098d093d7e64", + "metadata": {}, + "outputs": [], + "source": [ + "is_county_without_iso_projects = (~project_counties.has_iso_project_new) & (~project_counties.has_iso_project_old)\n", + "print(is_county_without_iso_projects.value_counts())\n", + "\n", + "counties_without_iso_projects = project_counties[is_county_without_iso_projects]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71769541-b078-45b1-be27-05b40735fbca", + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.offline as pyo\n", + "pyo.init_notebook_mode()\n", + "\n", + "# https://stackoverflow.com/questions/52771328/plotly-chart-not-showing-in-jupyter-notebook\n", + "from urllib.request import urlopen\n", + "import json\n", + "with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:\n", + " counties = json.load(response)\n", + "\n", + "\n", + "import plotly.express as px\n", + "\n", + "fig = px.choropleth(counties_without_iso_projects, geojson=counties, locations='county_id_fips', color='has_iso_project_new',\n", + " color_continuous_scale=\"RdYlGn\",\n", + " range_color=(-4, 4),\n", + " scope=\"usa\",\n", + " labels={'has_iso_project_new': \"Counties that don't have any ISO projects in new and old data\"},\n", + " )\n", + "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n" + ] + }, + { + "cell_type": "markdown", + "id": "37c05446-e4aa-4129-997c-4b46892d44db", + "metadata": {}, + "source": [ + "Great! That rougly aligns with [ISO borders](https://hifld-geoplatform.opendata.arcgis.com/datasets/50f80920d36e435d9a34db2bd0fd3ad8/explore?location=32.163459%2C-97.519448%2C5.23)." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d1573b9-912f-4389-a305-c2d710f4fdc3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert (counties_without_iso_projects.capacity_mw_old.eq(counties_without_iso_projects.capacity_mw_new)).all(), \"Capacity in counties without ISO projects has changed!\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8db021e8-a705-4185-89a3-cd2209c8f1fe",
+   "metadata": {},
+   "source": [
+    "## Compare overall capacity between new and old data amongst counties with ISO projects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d673028-d8f7-4fb0-96fc-8e3414bf7eac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "is_county_with_iso_projects = project_counties.has_iso_project_new | project_counties.has_iso_project_old\n",
+    "print(is_county_with_iso_projects.value_counts())\n",
+    "\n",
+    "counties_with_iso_projects = project_counties[is_county_with_iso_projects].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebe10ba7-a880-4791-9310-d68fd4e7b9eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties_with_iso_projects[\"capacity_mw_diff\"] = (counties_with_iso_projects.capacity_mw_new - counties_with_iso_projects.capacity_mw_old)\n",
+    "counties_with_iso_projects[\"capacity_mw_pct_change\"] = (counties_with_iso_projects[\"capacity_mw_diff\"] / counties_with_iso_projects.capacity_mw_old) * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cea8bf6d-146e-498d-ae68-3a7fb54854dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(counties_with_iso_projects.capacity_mw_diff.describe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be3a9ead-a0b2-43c6-9299-b00a0d8db3e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties_with_iso_projects.groupby(\"primary_iso_region_new\").capacity_mw_diff.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25cf8753-b9fa-4be1-bfbe-7d30bd4ae8fa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(counties_with_iso_projects.capacity_mw_pct_change.abs().describe())\n",
+    "print()\n",
+    "counties_with_iso_projects.capacity_mw_pct_change.abs().plot.box()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "48843c63-dddd-4a03-b94a-bcfda6731446",
+   "metadata": {},
+   "source": [
+    "Pretty good! 50% of counties' capacity changed by no more than 1%. 75% of counties' capacity changed by no more than 25%."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b06925ca-450c-43fe-b4ec-9f3d9cc5eded",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties_with_iso_projects.primary_iso_region_new.apply(type).value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32db4dde-8c58-49d8-856f-7764e60b87b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties_with_iso_projects[\"capacity_mw_abs_pct_change\"] = counties_with_iso_projects.capacity_mw_pct_change.abs()\n",
+    "\n",
+    "counties_with_iso_projects.groupby(\"primary_iso_region_new\").capacity_mw_pct_change.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ea0a3c2-d594-4235-9d23-b93b96bd780d",
+   "metadata": {},
+   "source": [
+    "- CAISO's distribution suggests it mostly went unchanged. Variability could be explained by the 4-month delay of the data.\n",
+    "- ERCOT mostly went unchanged. Maybe a slight increase. 
Variability could be explained by the 4-month delay of the data.\n",
+    "- It looks like ISONE generally increased, which kind of makes sense given we have a whole other year of data for this ISO.\n",
+    "- MISO's distribution suggests it mostly went unchanged with a slight increase. Variability could be explained by the 4-month delay of the data.\n",
+    "- NYISO's distribution suggests it mostly went unchanged. Variability could be explained by the 4-month delay of the data.\n",
+    "- PJM's distribution suggests it mostly went unchanged. Variability could be explained by the 4-month delay of the data.\n",
+    "- Capacity change in SPP has a pretty wide distribution, which could be explained by the additional year of data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fa155859-95de-42ea-a55d-a0c2d4682af6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = px.choropleth(counties_with_iso_projects, geojson=counties, locations='county_id_fips', color='capacity_mw_pct_change',\n",
+    "                    color_continuous_scale=\"RdYlGn\",\n",
+    "                    range_color=(-100, 100),\n",
+    "                    scope=\"usa\",\n",
+    "                    labels={'capacity_mw_pct_change': \"Capacity MW % change between old and new data\"},\n",
+    "                    )\n",
+    "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ceeb7733-818c-495d-aceb-22acd0d7a5f4",
+   "metadata": {},
+   "source": [
+    "## Compare **actionable** capacity between new and old data amongst counties with ISO projects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51a9c92b-2bac-45b2-9e2c-3e6f5439b781",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties_with_iso_projects[\"actionable_capacity_mw_diff\"] = (counties_with_iso_projects.actionable_capacity_mw_new - counties_with_iso_projects.actionable_capacity_mw_old)\n",
+    "counties_with_iso_projects[\"actionable_capacity_mw_diff_pct_change\"] = (counties_with_iso_projects[\"actionable_capacity_mw_diff\"] / counties_with_iso_projects.actionable_capacity_mw_old) * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0d75223-5d5d-44d3-9768-90497fe83372",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "counties_with_iso_projects.groupby(\"primary_iso_region_new\").actionable_capacity_mw_diff_pct_change.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f13f19d6-f61c-4733-b96b-93bf9774eb48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lbnl_projects[\"iso_region\"] = lbnl_projects[\"iso_region\"].replace(\"ISO-NE\", \"ISONE\")\n",
+    "\n",
+    "n_actionable_by_iso = pd.concat([lbnl_projects.groupby(\"iso_region\").is_actionable.sum(), gs_projects.groupby(\"iso_region\").is_actionable.sum()], axis=1)\n",
+    "n_actionable_by_iso"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c028710c-dd68-46ed-a85e-c6cc4f53880b",
+   "metadata": {},
+   "source": [
+    "- Significantly more projects are marked actionable in the new CAISO data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19496362-cd19-46ee-a4ed-9f9a9e3230fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = px.choropleth(counties_with_iso_projects, geojson=counties, locations='county_id_fips', color='actionable_capacity_mw_diff_pct_change',\n",
+    "                    color_continuous_scale=\"RdYlGn\",\n",
+    "                    range_color=(-100, 100),\n",
+    "                    scope=\"usa\",\n",
+    "                    labels={'actionable_capacity_mw_diff_pct_change': \"Actionable Capacity MW % change\"},\n",
+    "                    )\n",
+    "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n"
+   ]
+  },
+  {
+   
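"cell_type": "markdown",
+   "id": "sketch-total-vs-actionable-md",
+   "metadata": {},
+   "source": [
+    "A compact side-by-side of the two change metrics can flag regions where actionable capacity moved differently from total capacity. A sketch using the columns computed above:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sketch-total-vs-actionable-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# median % change per region: total vs. actionable capacity (sketch)\n",
+    "counties_with_iso_projects.groupby(\"primary_iso_region_new\")[\n",
+    "    [\"capacity_mw_pct_change\", \"actionable_capacity_mw_diff_pct_change\"]\n",
+    "].median()\n"
+   ]
+  },
+  {
+   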
"cell_type": "markdown", + "id": "f783c0b5-1955-40a2-b362-9929224b2fb3", + "metadata": {}, + "source": [ + "### Compare nearly certain capacity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a19ac690-a89a-4e0a-931c-94eaa6bce42a", + "metadata": {}, + "outputs": [], + "source": [ + "counties_with_iso_projects[\"nearly_certain_capacity_mw_diff\"] = (counties_with_iso_projects.nearly_certain_capacity_mw_new - counties_with_iso_projects.nearly_certain_capacity_mw_old)\n", + "counties_with_iso_projects[\"nearly_certain_capacity_mw_diff_pct_change\"] = (counties_with_iso_projects[\"nearly_certain_capacity_mw_diff\"] / counties_with_iso_projects.nearly_certain_capacity_mw_old) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "772cd436-fa72-4222-87df-38a070c957f2", + "metadata": {}, + "outputs": [], + "source": [ + "counties_with_iso_projects.groupby(\"primary_iso_region_new\").nearly_certain_capacity_mw_diff_pct_change.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bec1205-ed7f-4e30-a71a-effbc8cbbfb1", + "metadata": {}, + "outputs": [], + "source": [ + "pd.concat([lbnl_projects.groupby(\"iso_region\").is_nearly_certain.sum(), gs_projects.groupby(\"iso_region\").is_nearly_certain.sum()], axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "18679cf5-d2c2-4faf-8526-9b222a9d3c85", + "metadata": {}, + "source": [ + "- Significantly less number of projects marked nearly certain in new MISO data.\n", + "- LBNL didn't mark any NYISO projects as nearly certain." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "901ad58b-5712-4a2e-847d-8fefc3df7ca3", + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.choropleth(counties_with_iso_projects, geojson=counties, locations='county_id_fips', color='nearly_certain_capacity_mw_diff_pct_change',\n", + " color_continuous_scale=\"RdYlGn\",\n", + " range_color=(-100, 100),\n", + " scope=\"usa\",\n", + " labels={'nearly_certain_capacity_mw_diff_pct_change': \"Counties that don't have any ISO projects in new and old data\"},\n", + " )\n", + "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n" + ] + }, + { + "cell_type": "markdown", + "id": "67a6b586-7e98-4ab9-9c89-a8d4e8fcc06e", + "metadata": {}, + "source": [ + "## Compare CO2 estimate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c63428f0-00b9-45fc-9a52-be23e9b80868", + "metadata": {}, + "outputs": [], + "source": [ + "counties_with_iso_projects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1b81cfc-6ad8-45d3-ab73-bfdd74a06d70", + "metadata": {}, + "outputs": [], + "source": [ + "counties_with_iso_projects[\"co2e_tonnes_per_year_diff\"] = counties_with_iso_projects.co2e_tonnes_per_year_new - counties_with_iso_projects.co2e_tonnes_per_year_old\n", + "counties_with_iso_projects[\"co2e_tonnes_per_year_pct_change\"] = counties_with_iso_projects[\"co2e_tonnes_per_year_diff\"].div(counties_with_iso_projects.co2e_tonnes_per_year_old) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52be0516-afc2-4040-bef6-fb8740be2077", + "metadata": {}, + "outputs": [], + "source": [ + "print(counties_with_iso_projects[counties_with_iso_projects.co2e_tonnes_per_year_old.ne(0)].co2e_tonnes_per_year_pct_change.describe())\n", + "print()\n", + "print(counties_with_iso_projects[counties_with_iso_projects.co2e_tonnes_per_year_new.ne(0)].co2e_tonnes_per_year_pct_change.describe())" + ] + }, + { + "cell_type": "markdown", + "id": 
"a3762ed1-ce91-4e9f-b4d8-59e6b7c02bc9", + "metadata": {}, + "source": [ + "Great it looks like co2 estimates mostly stayed the same for counties with fossil fuel projets in the queue." + ] + }, + { + "cell_type": "markdown", + "id": "6799c0e5-8d9a-40a9-b3a3-3c87ea24a239", + "metadata": {}, + "source": [ + "## Compare at project level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e07dab5-0c25-4820-ab1e-82edd0631753", + "metadata": {}, + "outputs": [], + "source": [ + "merged_projects = lbnl_projects.merge(gs_projects, how=\"outer\", on=(\"queue_id\", \"entity\"), suffixes=(\"_lbnl\", \"_gs\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3134f3c8-8e72-44d0-aafe-f3a2430f687a", + "metadata": {}, + "outputs": [], + "source": [ + "miso = merged_projects.query(\"entity == 'MISO'\")\n", + "miso.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2527f8cd-c403-4d4b-80e3-fd41fc49fc4c", + "metadata": {}, + "outputs": [], + "source": [ + "is_nearly_certain_lbnl = miso.is_nearly_certain_lbnl.fillna(False)\n", + "is_nearly_certain_gs = miso.is_nearly_certain_gs.fillna(False)\n", + "print(is_nearly_certain_lbnl.value_counts())\n", + "print(is_nearly_certain_gs.value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81768167-5881-4349-a165-863aeecedaee", + "metadata": {}, + "outputs": [], + "source": [ + "miso[is_nearly_certain_lbnl].iso_region_gs.isna().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "158cb3aa-f8da-49b1-930f-063ce7a35d9e", + "metadata": {}, + "outputs": [], + "source": [ + "miso[is_nearly_certain_lbnl].is_nearly_certain_gs.value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53eeb0eb-68cc-4551-9017-cfd1e3ad1720", + "metadata": {}, + "outputs": [], + "source": [ + "miso.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "523653da-15f0-4717-b13f-b14f57f7795d", + "metadata": {}, + "outputs": [], + "source": [ + "miso[is_nearly_certain_lbnl][[\"interconnection_status_lbnl\", \"interconnection_status_gs\"]].value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4631caaa-aa0b-41d2-9a6b-010624152044", + "metadata": {}, + "outputs": [], + "source": [ + "is_actionable_lbnl = miso.is_actionable_lbnl.fillna(False)\n", + "is_actionable_gs = miso.is_actionable_gs.fillna(False)\n", + "print(is_actionable_lbnl.value_counts())\n", + "print(is_actionable_gs.value_counts())\n", + "\n", + "miso[is_actionable_gs][[\"interconnection_status_lbnl\", \"interconnection_status_gs\"]].value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebd5c85d-dbe1-467b-8997-1dc68dacd032", + "metadata": {}, + "outputs": [], + "source": [ + "miso[[\"interconnection_status_gs\", \"interconnection_status_lbnl\"]].value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26ffe216-d41f-4dc6-ba74-70680ac9b710", + "metadata": {}, + "outputs": [], + "source": [ + "miso[miso.interconnection_status_gs.eq(\"PHASE 3\") & miso.interconnection_status_lbnl.eq(\"IA Executed\")][[\"date_proposed_online_lbnl\", \"date_proposed_online_gs\", \"date_entered_queue_lbnl\", \"date_entered_queue_gs\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b412c13c-884f-4ca5-bc73-8f488cabc282", + "metadata": {}, + "outputs": [], + "source": [ + "miso.head()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "55f06f84-0d57-48f6-b1db-6a08f3f6a026", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/43-tpb-gridstatus_dedupe.ipynb b/notebooks/43-tpb-gridstatus_dedupe.ipynb new file mode 100644 index 00000000..2061050d --- /dev/null +++ b/notebooks/43-tpb-gridstatus_dedupe.ipynb @@ -0,0 +1,1135 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Revisit De-duplication\n", + "LBNL queues were deduplicated based on a specific definition of \"duplicate\". This notebook revisits the deduplication process to see if it 1) should and 2) can be applied directly to GridStatus data.\n", + "\n", + "## Get Data\n", + "### LBNL Queues" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/app/.local/lib/python3.10/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.3-CAPI-1.16.1). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/app/.local/lib/python3.10/site-packages/pudl/analysis/spatial.py:7: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import numpy as np\n", + "\n", + "from dbcp.extract.lbnl_iso_queue import extract" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from dbcp.transform.lbnl_iso_queue import parse_date_columns\n", + "# partial implementation of transform. 
I don't want to include deduplication.\n", + "def partial_transform(active_projects: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Transform active iso queue data.\"\"\"\n", + " rename_dict = {\n", + " \"state\": \"raw_state_name\",\n", + " \"county\": \"raw_county_name\",\n", + " }\n", + " active_projects = active_projects.rename(columns=rename_dict) # copy\n", + " # Harmonize the interconnection_status_lbnl values.\n", + " mapping = {\n", + " \"Feasability Study\": \"Feasibility Study\",\n", + " \"Feasibility\": \"Feasibility Study\",\n", + " \"Facilities Study\": \"Facility Study\",\n", + " \"IA in Progress\": \"In Progress (unknown study)\",\n", + " \"Unknown\": \"In Progress (unknown study)\",\n", + " \"Withdrawn, Feasibility Study\": \"Withdrawn\",\n", + " }\n", + " active_projects.loc[:, \"interconnection_status_lbnl\"] = active_projects.loc[\n", + " :, \"interconnection_status_lbnl\"\n", + " ].replace(mapping)\n", + " # drop irrelevant columns (structurally all nan due to 'active' filter)\n", + " active_projects.drop(columns=[\"date_withdrawn\", \"date_operational\"], inplace=True)\n", + " parse_date_columns(active_projects)\n", + " return active_projects\n", + "\n", + "\n", + "source_path = Path(\"/app/data/raw/queues_2022_clean_data.xlsx\")\n", + "raw_lbnl = extract(source_path)[\"lbnl_iso_queue\"]\n", + "lbnl = partial_transform(raw_lbnl)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((29033, 30),\n", + " Index(['queue_id', 'queue_status', 'queue_date_raw', 'queue_year', 'interconnection_date_raw', 'entity', 'project_name', 'developer', 'utility', 'county_1', 'county_2', 'county_3', 'raw_state_name', 'region', 'interconnection_service_type', 'point_of_interconnection', 'date_proposed_raw', 'year_proposed', 'interconnection_status_raw', 'interconnection_status_lbnl', 'resource_type_lbnl', 'resource_type_1', 'resource_type_2', 'resource_type_3', 'capacity_mw_resource_1', 'capacity_mw_resource_2', 'capacity_mw_resource_3', 'queue_date', 'interconnection_date', 'date_proposed'], dtype='object'))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lbnl.shape, lbnl.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
queue_idqueue_statusqueue_date_rawqueue_yearinterconnection_date_rawentityproject_namedeveloperutilitycounty_1county_2county_3raw_state_nameregioninterconnection_service_typepoint_of_interconnectiondate_proposed_rawyear_proposedinterconnection_status_rawinterconnection_status_lbnlresource_type_lbnlresource_type_1resource_type_2resource_type_3capacity_mw_resource_1capacity_mw_resource_2capacity_mw_resource_3queue_dateinterconnection_datedate_proposed
0GIA-97withdrawn1/7/20222022.0NaNAECNaNNaNAECnew madridNaNNaNMOSoutheast (non-ISO)NetworkNew Madrid - Essex 345kV10/31/20242024.0WithdrawnWithdrawnSolarSolarNaNNaN350.0NaNNaN2022-01-07NaT2024-10-31
1GIA-40active10/24/20092009.0NaNAECNaNNaNAECnew madridNaNNaNMOSoutheast (non-ISO)Network ResourceNM Switchyard (345 kV Bus)11/1/20112011.0Upgrade ApprovedIA ExecutedCoalCoalNaNNaN20.0NaNNaN2009-10-24NaT2011-11-01
\n", + "
" + ], + "text/plain": [ + " queue_id queue_status queue_date_raw queue_year interconnection_date_raw entity project_name developer utility county_1 county_2 county_3 raw_state_name region interconnection_service_type point_of_interconnection date_proposed_raw year_proposed interconnection_status_raw interconnection_status_lbnl resource_type_lbnl resource_type_1 resource_type_2 resource_type_3 capacity_mw_resource_1 capacity_mw_resource_2 capacity_mw_resource_3 queue_date interconnection_date date_proposed\n", + "0 GIA-97 withdrawn 1/7/2022 2022.0 NaN AEC NaN NaN AEC new madrid NaN NaN MO Southeast (non-ISO) Network New Madrid - Essex 345kV 10/31/2024 2024.0 Withdrawn Withdrawn Solar Solar NaN NaN 350.0 NaN NaN 2022-01-07 NaT 2024-10-31\n", + "1 GIA-40 active 10/24/2009 2009.0 NaN AEC NaN NaN AEC new madrid NaN NaN MO Southeast (non-ISO) Network Resource NM Switchyard (345 kV Bus) 11/1/2011 2011.0 Upgrade Approved IA Executed Coal Coal NaN NaN 20.0 NaN NaN 2009-10-24 NaT 2011-11-01" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lbnl.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GridStatus" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/app/.local/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/app/.local/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/app/.local/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/app/.local/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/app/.local/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/app/.local/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/app/.local/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" + ] + }, + { + "data": { + "text/plain": [ + "((8233, 121),\n", + " Index(['queue_id', 'project_name', 'interconnecting_entity', 'county', 'state', 'point_of_interconnection', 'utility', 'resource', 'capacity_mw', 'summer_capacity_mw',\n", + " ...\n", + " 'Serv', 'I39', 'Dev', 'Zone', 'System Impact Study Completed', 'Feasiblity Study Status', 'Optional Interconnection Study Status', 'Project Status', 'project_id', 'resource_clean'], dtype='object', length=121))" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dbcp.extract.gridstatus_isoqueues import extract as extract_gs\n", + "from dbcp.transform.gridstatus import (\n", + " _transform_miso,\n", + " _transform_caiso,\n", + " _transform_pjm,\n", + " _transform_ercot,\n", + " _transform_spp,\n", + " _transform_nyiso,\n", + " _transform_isone,\n", + " COLUMN_RENAME_DICT,\n", + " _clean_resource_type,\n", + ")\n", + "def partial_transform_gs(raw_dfs: dict[str, pd.DataFrame]) -> pd.DataFrame:\n", + " # exclude the normalization step\n", + " # create one dataframe\n", + " iso_cleaning_functions = {\n", + " \"miso\": _transform_miso,\n", + " \"caiso\": _transform_caiso,\n", + " \"pjm\": _transform_pjm,\n", + " \"ercot\": _transform_ercot,\n", + " \"spp\": _transform_spp,\n", + " \"nyiso\": _transform_nyiso,\n", + " \"isone\": _transform_isone,\n", + " }\n", + "\n", + " projects = []\n", + " for iso, df in raw_dfs.items():\n", + " # Apply rename\n", + " renamed_df = df.rename(columns=COLUMN_RENAME_DICT).copy()\n", + "\n", + " # Apply iso specific cleaning functions\n", + " renamed_df = iso_cleaning_functions[iso](renamed_df)\n", + "\n", + " renamed_df[\"region\"] = iso\n", + " renamed_df[\"entity\"] = iso.upper()\n", + " projects.append(renamed_df)\n", + "\n", + " active_projects = pd.concat(projects)\n", + " active_projects[\"queue_status\"] = active_projects.queue_status.str.lower()\n", + "\n", + " # parse dates\n", + " date_cols = [col for col in list(active_projects) if \"date\" in col]\n", + " for col in date_cols:\n", + " active_projects[col] = pd.to_datetime(active_projects[col], utc=True)\n", + "\n", + " # create project_id\n", + " active_projects[\"project_id\"] = np.arange(len(active_projects), dtype=np.int32)\n", + "\n", + " # Normalize data\n", + " # (\n", + " # normalized_projects,\n", + " # normalized_capacities,\n", + " # normalized_locations,\n", + " # ) = _normalize_projects(active_projects)\n", + "\n", + " # harmonize types\n", + " active_projects = _clean_resource_type(active_projects)\n", + " return active_projects\n", + "\n", + 
"raw_gs = extract_gs()\n", + "gs = partial_transform_gs(raw_gs)\n", + "gs.shape, gs.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze Duplicates\n", + "### ID Duplicates in LBNL" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mean 0.017532\n", + "sum 509.000000\n", + "dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ids_lbnl = ['entity', 'queue_id']\n", + "lbnl.duplicated(subset=ids_lbnl, keep=False).agg(['mean', 'sum'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mean 0.011125\n", + "sum 114.000000\n", + "dtype: float64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# most ID dupes are from withdrawn or operational projects\n", + "lbnl.query('queue_status == \"active\"').duplicated(subset=ids_lbnl, keep=False).agg(['mean', 'sum'])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "active_lbnl = lbnl.query('queue_status == \"active\"').copy()\n", + "is_id_dupe_active = active_lbnl.duplicated(subset=ids_lbnl, keep=False)\n", + "id_dupe_lbnl_active = active_lbnl.loc[is_id_dupe_active,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Duke 34\n", + "WAPA 29\n", + "DominionSC 24\n", + "SRP 15\n", + "LADWP 10\n", + "PNM 2\n", + "Name: entity, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# only in non-ISO regions\n", + "id_dupe_lbnl_active['entity'].value_counts(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meansumcount
resource_type_30.0000000.01.0
developer0.0000000.017.0
interconnection_date0.0000000.02.0
interconnection_date_raw0.0000000.02.0
capacity_mw_resource_20.1428572.014.0
date_proposed_raw0.1607149.056.0
date_proposed0.1636369.055.0
year_proposed0.35849119.053.0
point_of_interconnection0.44339647.0106.0
queue_date0.50943454.0106.0
queue_date_raw0.50943454.0106.0
interconnection_status_raw0.55172448.087.0
capacity_mw_resource_10.55752263.0113.0
county_10.60714368.0112.0
resource_type_20.68571424.035.0
raw_state_name0.74774883.0111.0
resource_type_lbnl0.80701892.0114.0
resource_type_10.877193100.0114.0
queue_year0.90566096.0106.0
interconnection_status_lbnl0.938596107.0114.0
utility0.947368108.0114.0
queue_status1.000000114.0114.0
region1.000000114.0114.0
interconnection_service_type1.00000050.050.0
capacity_mw_resource_3NaN0.00.0
county_2NaN0.00.0
county_3NaN0.00.0
project_nameNaN0.00.0
\n", + "
" + ], + "text/plain": [ + " mean sum count\n", + "resource_type_3 0.000000 0.0 1.0\n", + "developer 0.000000 0.0 17.0\n", + "interconnection_date 0.000000 0.0 2.0\n", + "interconnection_date_raw 0.000000 0.0 2.0\n", + "capacity_mw_resource_2 0.142857 2.0 14.0\n", + "date_proposed_raw 0.160714 9.0 56.0\n", + "date_proposed 0.163636 9.0 55.0\n", + "year_proposed 0.358491 19.0 53.0\n", + "point_of_interconnection 0.443396 47.0 106.0\n", + "queue_date 0.509434 54.0 106.0\n", + "queue_date_raw 0.509434 54.0 106.0\n", + "interconnection_status_raw 0.551724 48.0 87.0\n", + "capacity_mw_resource_1 0.557522 63.0 113.0\n", + "county_1 0.607143 68.0 112.0\n", + "resource_type_2 0.685714 24.0 35.0\n", + "raw_state_name 0.747748 83.0 111.0\n", + "resource_type_lbnl 0.807018 92.0 114.0\n", + "resource_type_1 0.877193 100.0 114.0\n", + "queue_year 0.905660 96.0 106.0\n", + "interconnection_status_lbnl 0.938596 107.0 114.0\n", + "utility 0.947368 108.0 114.0\n", + "queue_status 1.000000 114.0 114.0\n", + "region 1.000000 114.0 114.0\n", + "interconnection_service_type 1.000000 50.0 50.0\n", + "capacity_mw_resource_3 NaN 0.0 0.0\n", + "county_2 NaN 0.0 0.0\n", + "county_3 NaN 0.0 0.0\n", + "project_name NaN 0.0 0.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what is the duplicate structure of the rest of the columns?\n", + "# Excluding nulls, count the number of duplicates for each column.\n", + "# Compare the fraction of duplicates (and absolute number of duplicates).\n", + "# Values <= ~0.5 indicate that the column is a good candidate for differentiating ID dupes.\n", + "pd.concat(\n", + " [\n", + " (\n", + " id_dupe_lbnl_active\n", + " .dropna(subset=c)\n", + " .duplicated(subset=ids_lbnl + [c], keep=False)\n", + " .agg(['mean', 'sum', 'count'])\n", + " .rename(c)\n", + " )\n", + " for c in id_dupe_lbnl_active.columns.difference(set(ids_lbnl))\n", + " ],\n", + " axis=1\n", + ").T.sort_values('mean')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on the above, the columns that usually differentiate ID duplicates are:\n", + "* `date_proposed`\n", + "* `year_proposed`\n", + "* `capacity_mw_resource_1`\n", + "* `county_1`\n", + "* `point_of_interconnection`\n", + "* `queue_date`\n", + "\n", + "Columns that are usually the same for ID duplicates are:\n", + "* `queue_year`\n", + "* `queue_status`\n", + "* `interconnection_service_type`\n", + "* `interconnection_status_lbnl`\n", + "* `utility`\n", + "* `resource_type_X`\n", + "\n", + "**Assumption / value judgment: if the only differences are dates, then the project is probably the same. The date differences are probably due to the project being resubmitted for contingency.**" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mean 0.0\n", + "sum 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# zero dupes in GS! This matches up with the zero dupes in active LBNL projects because it only includes ISO regions.\n", + "ids_gs = ['region', 'queue_id']\n", + "gs.duplicated(subset=ids_gs, keep=False).agg(['mean', 'sum'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I lost a bunch of work here when the container crashed while I updated drivers in the host OS. 
I'm not going to re-do it, but here were some takeaways:\n",
+ "* PJM is missing proposed_completion_date due to a bug in GridStatus's ETL code.\n",
+ "* fixed some misidentified columns in GS ETL code\n",
+ "* updated LBNL duplicate prioritization to keep the record with the latest `date_proposed`, `queue_date`, and `interconnection_status_lbnl`\n",
+ "  * this fixes all but one ID duplicate (excepting LADWP and DominionSC, which have errors in the source data for queue_id, and WAPA, whose queue_id needs to be combined with state into a composite key)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dupe_keys_lbnl = [\n",
+ "    \"point_of_interconnection_clean\",  # string normalization on point_of_interconnection\n",
+ "    \"capacity_mw_resource_1\",\n",
+ "    \"county_1\",\n",
+ "    \"raw_state_name\",  # not often useful but is a nearly certain differentiator\n",
+ "    \"utility_clean\",  # utility.fillna(region)\n",
+ "    \"resource_type_1\",  # not often useful but is a nearly certain differentiator\n",
+ "    ]\n",
+ "dupe_keys_gs = [\n",
+ "    \"point_of_interconnection_clean\",\n",
+ "    \"capacity_mw\",\n",
+ "    \"county\",\n",
+ "    \"state\",\n",
+ "    \"utility_clean\",\n",
+ "    \"resource\",\n",
+ "    ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def normalize_poi(ser: pd.Series) -> pd.Series:\n",
+ "    # Essentially a poor man's bag-of-words model.\n",
+ "    out = (\n",
+ "        ser\n",
+ "        .astype(\"string\")\n",
+ "        .str.lower()\n",
+ "        .str.replace(\"-| +\", \" \", regex=True)\n",
+ "        .str.replace(r\"(?:sub)station|kv| at |tbd\", \"\", regex=True)\n",
+ "        .fillna(\"\")\n",
+ "    )\n",
+ "    out = pd.Series(\n",
+ "        [\" \".join(sorted(x)) for x in out.str.split()],\n",
+ "        index=out.index,\n",
+ "        dtype=\"string\",\n",
+ "    ).str.strip()\n",
+ "    out.replace(\"\", pd.NA, inplace=True)\n",
+ "    return out\n",
+ "gs.loc[:, 'point_of_interconnection_clean'] = normalize_poi(gs['point_of_interconnection'])\n",
+ "gs['utility_clean'] = gs['utility'].fillna(gs['region'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'pandas.core.frame.DataFrame'>\n",
+ "Int64Index: 8233 entries, 0 to 987\n",
+ "Data columns (total 2 columns):\n",
+ " #   Column                          Non-Null Count  Dtype \n",
+ "---  ------                          --------------  ----- \n",
+ " 0   point_of_interconnection        7706 non-null   string\n",
+ " 1   point_of_interconnection_clean  7705 non-null   string\n",
+ "dtypes: string(2)\n",
+ "memory usage: 193.0 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "gs[['point_of_interconnection', 'point_of_interconnection_clean']].info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meansum
point_of_interconnection_clean0.064132528.0
capacity_mw0.00242920.0
county0.012268101.0
state0.00340128.0
utility_clean0.0000000.0
resource0.013118108.0
utility0.2100091729.0
point_of_interconnection0.064011527.0
\n", + "
" + ], + "text/plain": [ + " mean sum\n", + "point_of_interconnection_clean 0.064132 528.0\n", + "capacity_mw 0.002429 20.0\n", + "county 0.012268 101.0\n", + "state 0.003401 28.0\n", + "utility_clean 0.000000 0.0\n", + "resource 0.013118 108.0\n", + "utility 0.210009 1729.0\n", + "point_of_interconnection 0.064011 527.0" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gs[dupe_keys_gs + ['utility', 'point_of_interconnection']].isna().agg(['mean', 'sum']).T" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mean 0.091704\n", + "sum 755.000000\n", + "dtype: float64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "is_dupe_gs = gs.duplicated(subset=dupe_keys_gs, keep=False)\n", + "is_dupe_gs.agg(['mean', 'sum'])" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mean 0.109495\n", + "sum 1122.000000\n", + "dtype: float64" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# compare to LBNL\n", + "active_lbnl['utility_clean'] = active_lbnl['utility'].fillna(active_lbnl['entity'])\n", + "active_lbnl['point_of_interconnection_clean'] = normalize_poi(active_lbnl['point_of_interconnection'])\n", + "is_dupe_lbnl = active_lbnl.duplicated(subset=dupe_keys_lbnl, keep=False)\n", + "is_dupe_lbnl.agg(['mean', 'sum'])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10247, 32)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "active_lbnl.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meansum
point_of_interconnection_clean0.00751477.0
capacity_mw_resource_10.016005164.0
county_10.036791377.0
raw_state_name0.033961348.0
utility_clean0.0000000.0
resource_type_10.0003904.0
utility0.1479461516.0
point_of_interconnection0.00663668.0
\n", + "
" + ], + "text/plain": [ + " mean sum\n", + "point_of_interconnection_clean 0.007514 77.0\n", + "capacity_mw_resource_1 0.016005 164.0\n", + "county_1 0.036791 377.0\n", + "raw_state_name 0.033961 348.0\n", + "utility_clean 0.000000 0.0\n", + "resource_type_1 0.000390 4.0\n", + "utility 0.147946 1516.0\n", + "point_of_interconnection 0.006636 68.0" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "active_lbnl[dupe_keys_lbnl + ['utility', 'point_of_interconnection']].isna().agg(['mean', 'sum']).T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now determine which duplicate to keep. Can use the same general approach as for LBNL queues, but GS doesn't have a standardized interconnection status, and PJM is currently missing proposed completion date (but will not in the future)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 360ddad1..c2dbb0c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ pandas~=1.4.0 joblib~=1.2.0 beautifulsoup4~=4.11 plotly~=5.15.0 +gridstatus~=0.20.0 diff --git a/src/dbcp/__init__.py b/src/dbcp/__init__.py index ff7a3b70..d48bb323 100644 --- a/src/dbcp/__init__.py +++ b/src/dbcp/__init__.py @@ -9,6 +9,7 @@ import dbcp.extract.eip_infrastructure # noqa: F401 import dbcp.extract.epa_avert # noqa: F401 import dbcp.extract.fips_tables # noqa: F401 +import dbcp.extract.gridstatus_isoqueues # noqa: F401 import dbcp.extract.helpers # noqa: F401 import dbcp.extract.justice40 # noqa: F401 import dbcp.extract.lbnl_iso_queue # noqa: F401 @@ -24,6 +25,7 @@ import dbcp.transform.eip_infrastructure # noqa: F401 import dbcp.transform.epa_avert # noqa: F401 import dbcp.transform.fips_tables # noqa: F401 +import dbcp.transform.gridstatus # noqa: F401 import dbcp.transform.justice40 # noqa: F401 import dbcp.transform.lbnl_iso_queue # noqa: F401 import dbcp.transform.local_opposition # noqa: F401 diff --git a/src/dbcp/data_mart/projects.py b/src/dbcp/data_mart/projects.py index 9d705df8..0d387bba 100644 --- a/src/dbcp/data_mart/projects.py +++ b/src/dbcp/data_mart/projects.py @@ -15,20 +15,111 @@ from dbcp.helpers import get_sql_engine -def _get_and_join_iso_tables(engine: sa.engine.Engine) -> pd.DataFrame: - """Get ISO projects. +def _get_gridstatus_projects(engine: sa.engine.Engine) -> pd.DataFrame: + # drops transmission projects + query = """ + WITH + proj_res AS ( + SELECT + queue_id, + is_nearly_certain, + project_id, + project_name, + capacity_mw, + developer, + entity, + entity AS iso_region, -- these are different in non-ISO data from LBNL + utility, + proposed_completion_date AS date_proposed_online, + point_of_interconnection, + is_actionable, + resource_clean, + queue_status, + queue_date AS date_entered_queue, + interconnection_status_raw AS interconnection_status + FROM data_warehouse.gridstatus_projects as proj + LEFT JOIN data_warehouse.gridstatus_resource_capacity as res + USING (project_id) + WHERE resource_clean != 'Transmission' + ), + loc as ( + -- projects can have multiple locations, though 99 percent have only one. 
+    -- Can multiply capacity by frac_locations_in_county to allocate it equally.
+        SELECT
+            project_id,
+            state_id_fips,
+            county_id_fips,
+            (1.0 / count(*) over (partition by project_id))::real as frac_locations_in_county
+        FROM data_warehouse.gridstatus_locations
+    ),
+    gs as (
+        SELECT
+            proj_res.*,
+            loc.state_id_fips,
+            loc.county_id_fips,
+            -- projects with missing location info get full capacity allocation
+            coalesce(loc.frac_locations_in_county, 1.0) as frac_locations_in_county
+        FROM proj_res
+        LEFT JOIN loc
+            USING (project_id)
+    )
+    SELECT
+        sfip.state_name AS state,
+        cfip.county_name AS county,
+        gs.*,
+        'gridstatus' AS source,
+        ncsl.permitting_type AS state_permitting_type
+    FROM gs
+    LEFT JOIN data_warehouse.ncsl_state_permitting AS ncsl
+        on gs.state_id_fips = ncsl.state_id_fips
+    LEFT JOIN data_warehouse.state_fips AS sfip
+        ON gs.state_id_fips = sfip.state_id_fips
+    LEFT JOIN data_warehouse.county_fips AS cfip
+        ON gs.county_id_fips = cfip.county_id_fips
+    """
+    gs = pd.read_sql(query, engine)
+    return gs

-    PK should be (project_id, county_id_fips, resource_clean), but county_id_fips has nulls.

-    Note that this duplicates projects that have multiple prospective locations. Use the frac_locations_in_county
-    column to allocate capacity and co2e estimates to counties when aggregating.
-    Otherwise they will be double-counted.
+def _merge_lbnl_with_gridstatus(lbnl: pd.DataFrame, gs: pd.DataFrame) -> pd.DataFrame:
+    """Merge non-ISO LBNL projects with ISO projects in GridStatus.
+
+    Args:
+        lbnl: LBNL ISO queue projects
+        gs: GridStatus ISO queue projects
     """
-    query = """
+    is_non_iso = lbnl.iso_region.str.contains("non-ISO")
+    lbnl_non_isos = lbnl.loc[is_non_iso, :].copy()
+
+    # TODO (bendnorman): How should we handle project_ids? This hack
+    # isn't ideal because the GS data warehouse and data mart project
+    # ids aren't consistent
+    max_lbnl_id = lbnl_non_isos.project_id.max() + 1
+    gs["project_id"] = list(range(max_lbnl_id, max_lbnl_id + len(gs)))
+
+    shared_ids = set(gs.project_id).intersection(set(lbnl_non_isos.project_id))
+    assert len(shared_ids) == 0, f"Found duplicate ids between GS and LBNL {shared_ids}"
+
+    fields_in_gs_not_in_lbnl = gs.columns.difference(lbnl.columns)
+    fields_in_lbnl_not_in_gs = lbnl.columns.difference(gs.columns)
+    assert (
+        fields_in_gs_not_in_lbnl.empty
+    ), f"These columns are in Grid Status but not LBNL: {fields_in_gs_not_in_lbnl}"
+    assert (
+        fields_in_lbnl_not_in_gs.empty
+    ), f"These columns are in LBNL but not Grid Status: {fields_in_lbnl_not_in_gs}"
+
+    return pd.concat([gs, lbnl_non_isos], axis=0, ignore_index=True)
+
+
+def _get_lbnl_projects(engine: sa.engine.Engine, non_iso_only=True) -> pd.DataFrame:
+    where_clause = "WHERE region ~ 'non-ISO'" if non_iso_only else ""
+    query = f"""
     WITH
     iso_proj_res as (
         SELECT
             proj.project_id,
+            proj.queue_id,
             proj.date_proposed as date_proposed_online,
             proj.developer,
             proj.entity,
@@ -46,6 +137,7 @@ def _get_and_join_iso_tables(engine: sa.engine.Engine) -> pd.DataFrame:
         FROM data_warehouse.iso_projects as proj
         INNER JOIN data_warehouse.iso_resource_capacity as res
             ON proj.project_id = res.project_id
+        {where_clause}
     ),
     loc as (
         -- Remember that projects can have multiple locations, though 99 percent have only one.
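The `frac_locations_in_county` convention above is easy to misuse downstream, so here is a minimal pandas sketch of the intended arithmetic. The toy dataframes are illustrative stand-ins for the warehouse tables, not real data:

```
import pandas as pd

# Hypothetical stand-ins for gridstatus_projects and gridstatus_locations.
projects = pd.DataFrame({"project_id": [1, 2, 3], "capacity_mw": [100.0, 50.0, 10.0]})
locs = pd.DataFrame({
    "project_id": [1, 1, 2],  # project 1 lists two candidate counties; project 3 lists none
    "county_id_fips": ["01001", "01003", "06037"],
})

# Mirror the SQL window function: 1 / number of candidate locations per project.
locs["frac_locations_in_county"] = 1.0 / locs.groupby("project_id")["project_id"].transform("count")

df = projects.merge(locs, on="project_id", how="left")
# Mirror the SQL coalesce: projects with no location info keep their full capacity.
df["frac_locations_in_county"] = df["frac_locations_in_county"].fillna(1.0)

# Allocating before summing avoids double-counting multi-county projects.
df["allocated_mw"] = df["capacity_mw"] * df["frac_locations_in_county"]
assert df["allocated_mw"].sum() == 160.0  # 100 + 50 + 10
```
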
@@ -62,7 +154,8 @@ def _get_and_join_iso_tables(engine: sa.engine.Engine) -> pd.DataFrame:
         iso_proj_res.*,
         loc.state_id_fips,
         loc.county_id_fips,
-        loc.frac_locations_in_county
+        -- projects with missing location info get full capacity allocation
+        coalesce(loc.frac_locations_in_county, 1.0) as frac_locations_in_county
     from iso_proj_res
     LEFT JOIN loc
         ON iso_proj_res.project_id = loc.project_id
@@ -71,6 +164,7 @@ def _get_and_join_iso_tables(engine: sa.engine.Engine) -> pd.DataFrame:
         sfip.state_name as state,
         cfip.county_name as county,
         iso.*,
+        'lbnl' as source,
         ncsl.permitting_type as state_permitting_type
     from iso
     left join data_warehouse.state_fips as sfip
@@ -82,20 +176,44 @@ def _get_and_join_iso_tables(engine: sa.engine.Engine) -> pd.DataFrame:
     ;
     """
     df = pd.read_sql(query, engine)
-    # projects with missing location info get full capacity allocation
-    df["frac_locations_in_county"].fillna(1.0, inplace=True)
     # one whole-row duplicate due to a multi-county project with missing state value.
     # Makes both county_id_fips and state_id_fips null.
     dupes = df.duplicated(keep="first")
-    assert dupes.sum() == 1, f"Expected 1 duplicate row, got {dupes.sum()}."
-    assert (
-        df.loc[dupes, "project_id"].eq(9118).all()
-    ), f"Duplicate counties: {df.loc[dupes, ['project_id', 'county']]}"
-    df = df.loc[~dupes]
-    _estimate_proposed_power_co2e(df)
+    assert dupes.sum() == 0, f"Expected 0 duplicates, found {dupes.sum()}."
     return df


+def _get_and_join_iso_tables(
+    engine: sa.engine.Engine, use_gridstatus=True, use_proprietary_offshore=True
+) -> pd.DataFrame:
+    """Get ISO projects.
+
+    PK should be (project_id, county_id_fips, resource_clean), but county_id_fips has nulls.
+
+    Note that this duplicates projects that have multiple prospective locations. Use the frac_locations_in_county
+    column to allocate capacity and co2e estimates to counties when aggregating.
+    Otherwise they will be double-counted.
+
+    Args:
+        engine: engine to connect to the local postgres data warehouse
+        use_gridstatus: use gridstatus data for ISO projects.
+        use_proprietary_offshore: replace ISO queue offshore wind projects with proprietary offshore wind data.
+
+    Returns:
+        A dataframe of ISO projects with location, capacity, estimated co2 emissions and state permitting info.
+    """
+    if use_gridstatus:
+        lbnl = _get_lbnl_projects(engine, non_iso_only=True)
+        gs = _get_gridstatus_projects(engine)
+        out = _merge_lbnl_with_gridstatus(lbnl=lbnl, gs=gs)
+    else:
+        out = _get_lbnl_projects(engine, non_iso_only=False)
+    if use_proprietary_offshore:
+        offshore = _get_proprietary_proposed_offshore(engine)
+        out = _replace_iso_offshore_with_proprietary(out, offshore)
+    _estimate_proposed_power_co2e(out)
+    return out
+
+
 def _get_proprietary_proposed_offshore(engine: sa.engine.Engine) -> pd.DataFrame:
     """Get proprietary offshore wind data in a format that imitates the ISO queues.
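For orientation, a hedged sketch of how the new keyword arguments compose; this mirrors the call made in `create_long_format` further down and uses the `get_sql_engine` helper this module already imports:

```
from dbcp.helpers import get_sql_engine

engine = get_sql_engine()
# GridStatus supplies the ISO projects, LBNL supplies the non-ISO remainder,
# and proprietary data replaces any offshore wind rows from either source.
iso = _get_and_join_iso_tables(engine, use_gridstatus=True, use_proprietary_offshore=True)
# Every row is tagged with its provenance.
assert set(iso["source"].unique()) <= {"gridstatus", "lbnl", "proprietary"}
```
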
@@ -129,7 +247,13 @@ def _get_proprietary_proposed_offshore(engine: sa.engine.Engine) -> pd.DataFrame ) -- join the project, state, and county stuff SELECT - assoc.*, + assoc.project_id, + assoc.county_id_fips, + -- projects with missing location info get full capacity allocation + CASE WHEN assoc.frac_locations_in_county IS NULL + THEN 1.0 + ELSE assoc.frac_locations_in_county + END as frac_locations_in_county, substr(assoc.county_id_fips, 1, 2) as state_id_fips, proj.name as project_name, @@ -141,6 +265,7 @@ def _get_proprietary_proposed_offshore(engine: sa.engine.Engine) -> pd.DataFrame 0.0 as co2e_tonnes_per_year, proj.is_actionable, proj.is_nearly_certain, + 'proprietary' as source, sfip.state_name as state, cfip.county_name as county, @@ -171,11 +296,10 @@ def _replace_iso_offshore_with_proprietary( """ iso_to_keep = iso_queues.loc[iso_queues["resource_clean"] != "Offshore Wind", :] out = pd.concat( - [iso_to_keep, proprietary.assign(source="proprietary")], + [iso_to_keep, proprietary], axis=0, ignore_index=True, ) - out["source"].fillna("iso", inplace=True) return out @@ -352,10 +476,11 @@ def _add_derived_columns(mart: pd.DataFrame) -> None: "Solar; Storage": "renewable", "Solar": "renewable", "Steam": np.nan, + "Transmission": "transmission", "Unknown": np.nan, "Waste Heat": "fossil", "Wind; Storage": "renewable", - np.nan: np.nan, + np.nan: np.nan, # not technically necessary but make it explicit } # note that this classifies pure storage facilities as np.nan resources_in_data = set(mart["resource_clean"].unique()) @@ -383,9 +508,9 @@ def create_long_format(engine: sa.engine.Engine) -> pd.DataFrame: Returns: pd.DataFrame: long format table of ISO projects """ - iso = _get_and_join_iso_tables(engine) - offshore = _get_proprietary_proposed_offshore(engine) - iso = _replace_iso_offshore_with_proprietary(iso, offshore) + iso = _get_and_join_iso_tables( + engine, use_gridstatus=True, use_proprietary_offshore=True + ) all_counties = _get_county_fips_df(engine) all_states = _get_state_fips_df(engine) @@ -429,3 +554,9 @@ def create_data_mart( "iso_projects_long_format": long_format, "iso_projects_wide_format": wide_format, } + + +if __name__ == "__main__": + # debugging entry point + mart = create_data_mart() + print("yeehaw") diff --git a/src/dbcp/etl.py b/src/dbcp/etl.py index efca34ee..9368bab6 100644 --- a/src/dbcp/etl.py +++ b/src/dbcp/etl.py @@ -195,6 +195,13 @@ def etl_epa_avert() -> dict[str, pd.DataFrame]: return transformed +def etl_gridstatus_isoqueues(): + """ETL gridstatus ISO queues.""" + raw_dfs = dbcp.extract.gridstatus_isoqueues.extract() + transformed = dbcp.transform.gridstatus.transform(raw_dfs) + return transformed + + def etl_manual_ordinances() -> dict[str, pd.DataFrame]: """ETL manually maintained ordinances.""" raw_dfs = dbcp.extract.manual_ordinances.extract() @@ -214,6 +221,7 @@ def etl(args): SPATIAL_CACHE.reduce_size() etl_funcs = { + "gridstatus": etl_gridstatus_isoqueues, "manual_ordinances": etl_manual_ordinances, "epa_avert": etl_epa_avert, "eip_infrastructure": etl_eip_infrastructure, diff --git a/src/dbcp/extract/gridstatus_isoqueues.py b/src/dbcp/extract/gridstatus_isoqueues.py new file mode 100644 index 00000000..d9fe2772 --- /dev/null +++ b/src/dbcp/extract/gridstatus_isoqueues.py @@ -0,0 +1,44 @@ +""" +Extract gridstatus iso queues data from private bucket archive. + +gridstatus code points directly at interconnection queue spreadsheets +on ISO queues websites. These spreadsheets can change without notice +and break the gridstatus API. 
We have a private archive of the gridstatus data
+that allows us to pin the ETL code to a specific version of the raw
+data. The version numbers are automatically generated by Google Cloud Storage
+Object Versioning.
+"""
+import logging
+
+import pandas as pd
+
+import dbcp
+
+logger = logging.getLogger(__name__)
+
+# These are the earliest versions we have for each ISO,
+# except for spp and ISONE, where we use more recent versions
+# because they have columns the old versions don't.
+ISO_QUEUE_VERSIONS: dict[str, str] = {
+    "miso": "1704654953145483",
+    "caiso": "1704654953474846",
+    "pjm": "1704654953842777",
+    "ercot": "1704654954177109",
+    "spp": "1704654954488739",
+    "nyiso": "1702235705611699",
+    "isone": "1704654954804863",
+}
+
+
+def extract(iso_queue_versions: dict[str, str] = ISO_QUEUE_VERSIONS):
+    """Extract gridstatus ISO Queue data."""
+    iso_queues: dict[str, pd.DataFrame] = {}
+    for iso, revision_num in iso_queue_versions.items():
+        uri = f"gs://gridstatus-archive/interconnection_queues/{iso}.parquet"
+        path = dbcp.extract.helpers.cache_gcs_archive_file_locally(
+            uri=uri, revision_num=revision_num
+        )
+
+        iso_queues[iso] = pd.read_parquet(path)
+
+    return iso_queues
diff --git a/src/dbcp/extract/helpers.py b/src/dbcp/extract/helpers.py
index 1970470d..30cc1fdc 100644
--- a/src/dbcp/extract/helpers.py
+++ b/src/dbcp/extract/helpers.py
@@ -1,10 +1,9 @@
 """Helper functions for extracting data."""
 import logging
-import os
 import re
 from pathlib import Path

-import pydata_google_auth
+import google.auth
 from google.cloud import storage

 logger = logging.getLogger(__name__)
@@ -29,24 +28,19 @@ def cache_gcs_archive_file_locally(
         Path to the local cache of the file.
     """
     bucket_url, object_name = re.match("gs://(.*?)/(.*)", str(uri)).groups()
+    credentials, project_id = google.auth.default()
     local_cache_dir = Path(local_cache_dir)
     filepath = local_cache_dir / object_name
+    if revision_num:
+        filepath = Path(str(filepath) + f"#{revision_num}")
     if not filepath.exists():
         logger.info(
             f"{object_name} not found in {local_cache_dir}. Downloading from GCS bucket."
         )
-        GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID")
-        SCOPES = [
-            "https://www.googleapis.com/auth/cloud-platform",
-        ]
-        credentials = pydata_google_auth.get_user_credentials(
-            SCOPES, use_local_webserver=False
-        )
-
-        bucket = storage.Client(credentials=credentials, project=GCP_PROJECT_ID).bucket(
-            bucket_url, user_project=GCP_PROJECT_ID
+        bucket = storage.Client(credentials=credentials, project=project_id).bucket(
+            bucket_url, user_project=project_id
         )

         if revision_num:
diff --git a/src/dbcp/extract/manual_ordinances.py b/src/dbcp/extract/manual_ordinances.py
index 720b1120..4e236c01 100644
--- a/src/dbcp/extract/manual_ordinances.py
+++ b/src/dbcp/extract/manual_ordinances.py
@@ -1,9 +1,8 @@
 """Load manually maintained ordinances from BigQuery."""
+import google.auth
 import pandas as pd
 from pandas_gbq import read_gbq

-from dbcp.helpers import GCP_PROJECT_ID, _get_bigquery_credentials
-

 def extract() -> dict[str, pd.DataFrame]:
     """Extract manually maintained ordinances from BigQuery.
@@ -12,7 +11,7 @@ def extract() -> dict[str, pd.DataFrame]:
         dfs: dictionary of dataframe name to raw dataframe.
""" dfs = {} - credentials = _get_bigquery_credentials() + credentials, project_id = google.auth.default() dfs["manual_ordinances"] = read_gbq( """SELECT county_id_fips, @@ -23,7 +22,7 @@ def extract() -> dict[str, pd.DataFrame]: END as ordinance_via_self_maintained FROM `local-jobs-econ-dev-fund.airtable_data.county_permitting_info` """, - project_id=GCP_PROJECT_ID, + project_id=project_id, credentials=credentials, ) return dfs diff --git a/src/dbcp/helpers.py b/src/dbcp/helpers.py index 7deec064..878579d9 100644 --- a/src/dbcp/helpers.py +++ b/src/dbcp/helpers.py @@ -6,9 +6,9 @@ from pathlib import Path import boto3 +import google.auth import pandas as pd import pandas_gbq -import pydata_google_auth import sqlalchemy as sa from botocore import UNSIGNED from botocore.config import Config @@ -26,8 +26,14 @@ "BOOLEAN": "BOOL", "DATETIME": "DATETIME", } +SA_TO_PD_TYPES = { + "VARCHAR": "string", + "INTEGER": "Int64", + "FLOAT": "float64", + "BOOLEAN": "boolean", + "DATETIME": "datetime64[ns]", +} SA_TO_BQ_MODES = {True: "NULLABLE", False: "REQUIRED"} -GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID") def get_schema_sql_alchemy_metadata(schema: str) -> sa.MetaData: @@ -71,6 +77,23 @@ def get_bq_schema_from_metadata( return bq_schema +def enforce_dtypes(df: pd.DataFrame, table_name: str, schema: str): + """Apply dtypes to a dataframe using the sqlalchemy metadata.""" + table_name = f"{schema}.{table_name}" + metadata = get_schema_sql_alchemy_metadata(schema) + try: + table = metadata.tables[table_name] + except KeyError: + raise KeyError(f"{table_name} does not exist in metadata.") + + dtypes = { + col.name: SA_TO_PD_TYPES[str(col.type)] + for col in table.columns + if col.name in df.columns + } + return df.astype(dtypes) + + def get_sql_engine() -> sa.engine.Engine: """Create a sql alchemy engine from environment vars.""" user = os.environ["POSTGRES_USER"] @@ -136,14 +159,6 @@ def get_db_schema_tables(engine: sa.engine.Engine, schema: str) -> list[str]: return table_names -def _get_bigquery_credentials(): - SCOPES = [ - "https://www.googleapis.com/auth/cloud-platform", - ] - creds = pydata_google_auth.get_user_credentials(SCOPES, use_local_webserver=False) - return creds - - def upload_schema_to_bigquery(schema: str, dev: bool = True) -> None: """Upload a postgres schema to BigQuery.""" logger.info("Loading tables to BigQuery.") @@ -164,8 +179,8 @@ def upload_schema_to_bigquery(schema: str, dev: bool = True) -> None: ) # load to big query - credentials = _get_bigquery_credentials() - client = bigquery.Client(credentials=credentials, project=GCP_PROJECT_ID) + credentials, project_id = google.auth.default() + client = bigquery.Client(credentials=credentials, project=project_id) for table_name, df in loaded_tables.items(): schema_environment = f"{schema}{'_dev' if dev else ''}" @@ -175,13 +190,13 @@ def upload_schema_to_bigquery(schema: str, dev: bool = True) -> None: # Delete the table because pandas_gbq doesn't recreate the BQ # table schema which leads to problems when we change the metadata. 
- table_id = f"{GCP_PROJECT_ID}.{schema_environment}.{table_name}" + table_id = f"{project_id}.{schema_environment}.{table_name}" client.delete_table(table_id, not_found_ok=True) pandas_gbq.to_gbq( df, full_table_name, - project_id=GCP_PROJECT_ID, + project_id=project_id, if_exists="replace", credentials=credentials, table_schema=table_schema, @@ -218,31 +233,3 @@ def psql_insert_copy(table, conn, keys, data_iter): sql = f"COPY {table_name} ({columns}) FROM STDIN WITH CSV" cur.copy_expert(sql=sql, file=s_buf) dbapi_conn.commit() - - -SA_TO_PD_TYPES = { - "BOOLEAN": "boolean", - "DATETIME": "datetime64[ns]", - "FLOAT": "float64", - "INTEGER": "Int64", - "VARCHAR": "string", -} - - -def enforce_dtypes(df: pd.DataFrame, table_name: str, schema: str) -> pd.DataFrame: - """Enforce datatypes specified in the dbcp.metadata.sqlalchemy schemas.""" - schema_sa_metadata = { - "data_warehouse": dbcp.metadata.data_warehouse.metadata, - "data_mart": dbcp.metadata.data_mart.metadata, - } - metadata = schema_sa_metadata.get(schema, None) - if not metadata: - raise KeyError(f"Metadata for schema: {schema} does not exists.") - full_table_name = f"{schema}.{table_name}" - return df.astype( - { - column_name: SA_TO_PD_TYPES[str(col.type)] - for column_name, col in metadata.tables[full_table_name].columns.items() - if column_name in df.columns - } - ) diff --git a/src/dbcp/metadata/data_mart.py b/src/dbcp/metadata/data_mart.py index af63aa02..61f1fb22 100644 --- a/src/dbcp/metadata/data_mart.py +++ b/src/dbcp/metadata/data_mart.py @@ -311,6 +311,7 @@ Column("state", String), Column("county", String), Column("county_id_fips", String), + Column("queue_id", String), Column("resource_clean", String, nullable=False), Column("project_id", Integer, nullable=False), Column("date_proposed_online", DateTime), diff --git a/src/dbcp/metadata/data_warehouse.py b/src/dbcp/metadata/data_warehouse.py index d5e76523..e2208dfa 100644 --- a/src/dbcp/metadata/data_warehouse.py +++ b/src/dbcp/metadata/data_warehouse.py @@ -1287,6 +1287,75 @@ schema=schema, ) +############### +# Grid Status # +############### +gridstatus_projects = Table( + "gridstatus_projects", + metadata, + Column("project_id", Integer, primary_key=True, autoincrement=False), + Column("actual_completion_date", DateTime, nullable=True), + Column("interconnecting_entity", String, nullable=True), + Column("point_of_interconnection", String, nullable=True), + Column("project_name", String, nullable=True), + Column("proposed_completion_date", DateTime, nullable=True), + Column("queue_date", DateTime, nullable=True), + Column("queue_id", String, nullable=True), + Column("queue_status", String, nullable=True), + Column("interconnection_status_raw", String, nullable=True), + Column("utility", String, nullable=True), + Column("withdrawal_comment", String, nullable=True), + Column("withdrawn_date", DateTime, nullable=True), + Column("is_actionable", Boolean, nullable=True), + Column("is_nearly_certain", Boolean, nullable=True), + Column("region", String, nullable=False), + Column("entity", String, nullable=False), + Column("developer", String, nullable=True), + schema=schema, +) + +gridstatus_resource_capacity = Table( + "gridstatus_resource_capacity", + metadata, + Column( + "project_id", + Integer, + ForeignKey("data_warehouse.gridstatus_projects.project_id"), + ), + Column("resource", String), + Column("resource_clean", String), + Column("capacity_mw", Float), + schema=schema, +) + +gridstatus_locations = Table( + "gridstatus_locations", + metadata, + Column( + 
"project_id", + Integer, + ForeignKey("data_warehouse.gridstatus_projects.project_id"), + ), + Column("raw_county_name", String), + Column("raw_state_name", String), + Column( + "state_id_fips", + String, + ForeignKey("data_warehouse.state_fips.state_id_fips"), + nullable=True, + ), + Column( + "county_id_fips", + String, + ForeignKey("data_warehouse.county_fips.county_id_fips"), + nullable=True, + ), + Column("geocoded_locality_name", String), + Column("geocoded_locality_type", String), + Column("geocoded_containing_county", String), + schema=schema, +) + ##################### # MANUAL ORDINANCES # ##################### diff --git a/src/dbcp/transform/gridstatus.py b/src/dbcp/transform/gridstatus.py new file mode 100644 index 00000000..7a862344 --- /dev/null +++ b/src/dbcp/transform/gridstatus.py @@ -0,0 +1,918 @@ +"""Clean Grid Status Interconnection queue data.""" +import logging +from typing import Sequence + +import numpy as np +import pandas as pd + +from dbcp.helpers import enforce_dtypes +from dbcp.transform.helpers import ( + add_county_fips_with_backup_geocoding, + normalize_multicolumns_to_rows, +) +from dbcp.transform.lbnl_iso_queue import ( + _normalize_point_of_interconnection, + deduplicate_active_projects, +) + +COLUMN_RENAME_DICT = { + "Actual Completion Date": "actual_completion_date", + "Capacity (MW)": "capacity_mw", + "County": "county", + "Generation Type": "resource", + "Interconnecting Entity": "interconnecting_entity", + "Interconnection Location": "point_of_interconnection", + "Project Name": "project_name", + "Proposed Completion Date": "proposed_completion_date", + "Queue Date": "queue_date", + "Queue ID": "queue_id", + "State": "state", + "Status": "queue_status", + "Summer Capacity (MW)": "summer_capacity_mw", + "Winter Capacity (MW)": "winter_capacity_mw", + "Withdrawal Comment": "withdrawal_comment", + "Withdrawn Date": "withdrawn_date", + "Transmission Owner": "utility", + "is_actionable": "is_actionable", + "is_nearly_certain": "is_nearly_certain", + "region": "region", +} + +RESOURCE_DICT = { + "Battery Storage": { + "codes": { + "miso": ["Battery Storage"], + "caiso": [ + "Storage", + "Storage + Other", + "Storage + Storage", + "Storage + Photovoltaic + Wind Turbine", + "Battery", + ], + "pjm": ["Storage", "Storage; Solar", "Storage; Wind"], + "ercot": ["Other - Battery Energy Storage", "Other - Energy Storage"], + "spp": ["Battery/Storage"], + "nyiso": ["Energy Storage"], + "isone": ["BAT"], + }, + "type": "Renewable", + }, + "Biofuel": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Biomass": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": ["Biomass - Steam Turbine other than Combined-Cycle"], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Coal": { + "codes": { + "miso": ["Coal"], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Fossil", + }, + "Combustion Turbine": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Fossil", + }, + "Fuel Cell": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": ["FC"], + }, + "type": "Fossil", + }, + "Geothermal": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Hydro": { + 
"codes": { + "miso": ["Hydro"], + "caiso": ["Hydro"], + "pjm": ["Hydro"], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Landfill Gas": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Fossil", + }, + "Municipal Solid Waste": { + "codes": { + "miso": ["Waste Heat Recovery"], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Fossil", + }, + "Natural Gas": { + "codes": { + "miso": ["Gas"], + "caiso": [ + "Gas Turbine + Storage", + "Gas Turbine", + "Combined Cycle", + "Combined Cycle + Storage", + ], + "pjm": [ + "Natural Gas", + "Natural Gas; Other", + "Methane", + "Gas - Internal Combustion Engine, eg. Reciprocating", + "Gas - Combined-Cycle", + ], + "ercot": [ + "Gas - Combustion (gas) Turbine, but not part of a Combined-Cycle", + "Gas - Steam Turbine other than Combined-Cycle", + ], + "spp": [ + "Thermal - CTG", + "Thermal - CT", + "Thermal - Gas Turbine", + "Thermal - Reciprocating Engine", + "Thermal", + "Thermal - Gas", + "Thermal - Combined Cycle", + "Thermal - RICE", + ], + "nyiso": ["Combined Cycle"], + "isone": ["NG", "DFO NG", "NG SUN BAT"], + }, + "type": "Fossil", + }, + "Nuclear": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Offshore Wind": { + "codes": { + "miso": [], + "caiso": [], + "pjm": ["Offshore Wind"], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Oil": { + "codes": { + "miso": ["Diesel"], + "caiso": [], + "pjm": ["Diesel; Solar"], + "ercot": [], + "spp": ["Thermal - Diesel/Gas"], + "nyiso": [], + "isone": ["KER BAT"], + }, + "type": "Fossil", + }, + "Onshore Wind": { + "codes": { + "miso": ["Wind"], + "caiso": [ + "Wind Turbine", + "Wind Turbine + Storage", + "Storage + Wind Turbine", + "Wind Turbine + Photovoltaic + Storage", + "Wind Turbine + Storage + Photovoltaic", + ], + "pjm": ["Wind", "Wind; Solar", "Solar; Storage; Wind", "Wind; Storage"], + "ercot": ["Wind - Wind Turbine"], + "spp": ["Wind", "Hybrid - Wind/Storage", "Hybrid - Wind/Solar"], + "nyiso": ["Wind"], + "isone": ["WND"], + }, + "type": "Renewable", + }, + "Other": { + "codes": { + "miso": [], + "caiso": [], + "pjm": ["Other"], + "ercot": ["Other - Other"], + "spp": ["Hybrid - Solar\\RECIP Gas"], + "nyiso": [], + "isone": [], + }, + "type": "Unknown Resource", + }, + "Unknown": { + "codes": { + "miso": ["Hybrid", "Co-Gen"], + "caiso": ["Water", "Gravity via Rail"], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": ["Load"], + "isone": ["WAT", "WDS", "WAT BAT"], + }, + "type": "Unknown Resource", + }, + "Other Storage": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Pumped Storage": { + "codes": { + "miso": [], + "caiso": ["Pumped-Storage hydro"], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Renewable", + }, + "Solar": { + "codes": { + "miso": ["Solar"], + "caiso": [ + "Storage + Photovoltaic", + "Photovoltaic + Storage", + "Photovoltaic", + "Solar Thermal + Storage", + "Photovoltaic + Storage + Wind Turbine", + ], + "pjm": ["Solar", "Solar; Storage", "Solar; Wind", "Solar; Battery"], + "ercot": ["Solar - Photovoltaic Solar", "Other - Photovoltaic Solar"], + "spp": [ + "Solar", + "Hybrid - Solar/Storage", + "Hybrid - 
Solar/Battery", + "Hybrid - Solar", + "Hybrid - Solar/Battery/Wind", + ], + "nyiso": ["Solar"], + "isone": ["SUN", "SUN BAT", "SUN WAT"], + }, + "type": "Renewable", + }, + "Steam": { + "codes": { + "miso": [], + "caiso": ["Steam Turbine + Storage", "Steam Turbine"], + "pjm": [], + "ercot": [], + "spp": ["Thermal - Steam"], + "nyiso": [], + "isone": [], + }, + "type": "Fossil", + }, + "Transmission": { + "codes": { + "miso": ["High Voltage DC"], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": ["AC Transmission", "DC Transmission"], + "isone": [], + }, + "type": "Other", + }, + "Waste Heat": { + "codes": { + "miso": [], + "caiso": [], + "pjm": [], + "ercot": [], + "spp": [], + "nyiso": [], + "isone": [], + }, + "type": "Fossil", + }, +} + +logger = logging.getLogger(__name__) + + +def _clean_resource_type(resource_df: pd.DataFrame) -> pd.DataFrame: + """Harmonize resource type for all ISO queues.""" + resource_df = resource_df.copy() + long_dict = {} + + for clean_name, code_type_dict in RESOURCE_DICT.items(): + long_dict[clean_name] = clean_name + for _, codes in code_type_dict["codes"].items(): + for code in codes: + if code: + long_dict[code] = clean_name + + # There are a couple of empty string values + resource_df["resource"] = ( + resource_df["resource"].astype("string").str.strip().replace("", pd.NA) + ) + + resource_df["resource_clean"] = ( + resource_df["resource"].fillna("Unknown").map(long_dict) + ) + + unmapped = resource_df["resource_clean"].isna() + if unmapped.sum() != 0: + debug = resource_df[unmapped]["resource"].value_counts(dropna=False) + raise AssertionError(f"Unmapped resource types in: \n{debug}") + return resource_df + + +def _create_project_status_classification_from_single_column( + iso_df: pd.DataFrame, + status_col: str, + nearly_certain_vals: Sequence[str], + actionable_vals: Sequence[str], +) -> pd.DataFrame: + """Add columns is_actionable and is_nearly_certain that classify each project. + + This function handles data from ISOs that report project status information in + a single column. + + This model was created by a consultant in Excel and translated to python. + """ + iso_df["is_actionable"] = iso_df[status_col].isin(actionable_vals).fillna(False) + iso_df["is_nearly_certain"] = ( + iso_df[status_col].isin(nearly_certain_vals).fillna(False) + ) + + assert ( + ~iso_df[["is_actionable", "is_nearly_certain"]].all(axis=1) + ).all(), "Some projects are marked marked actionable and nearly certain." + + return iso_df + + +def _create_project_status_classification_from_multiple_columns( + iso_df: pd.DataFrame, + system_impact_study_col: str, + facilities_study_status_col: str, + ia_col: str, + completed_strings: Sequence[str], +): + """Add columns is_actionable and is_nearly_certain that classify each project. + + This function handles data from ISOs that report project status information in + a multiple columns. + + This model was created by a consultant in Excel and translated to python. 
+ """ + status_cols = {} + status_cols[system_impact_study_col] = "completed_system_impact_study" + status_cols[facilities_study_status_col] = "completed_facilities_study_status" + status_cols[ia_col] = "executed_ia" + + status_df = ( + iso_df.loc[:, list(status_cols.keys())] + .isin(set(completed_strings)) + .fillna(False) + ) + status_df.rename(columns=status_cols, inplace=True) + + iso_df["is_nearly_certain"] = status_df.loc[:, "executed_ia"] + iso_df["is_actionable"] = ( + status_df.completed_system_impact_study + | status_df.completed_facilities_study_status + ) & ~status_df.executed_ia + + assert ( + ~iso_df[["is_actionable", "is_nearly_certain"]].all(axis=1) + ).all(), "Some projects are marked marked actionable and nearly certain." + return iso_df + + +def _transform_miso(iso_df: pd.DataFrame) -> pd.DataFrame: + """Make miso specific transformations.""" + # Grab all projects that are "Active" and "Done" that are under construction or about to be. + is_active_project = iso_df["Post Generator Interconnection Agreement Status"].isin( + ("Under Construction", "Not Started") + ) & iso_df["queue_status"].isin(("Active", "Done")) + iso_df = iso_df[is_active_project].copy() + + actionable_vals = ( + "PHASE 2", + "PHASE 3", + ) + nearly_certain_vals = ("GIA",) + iso_df = _create_project_status_classification_from_single_column( + iso_df, + "studyPhase", + nearly_certain_vals, + actionable_vals, + ) + iso_df = iso_df.rename(columns={"studyPhase": "interconnection_status_raw"}) + + # GridStatus wrongly sources "Proposed Completion Date" from "negInService". + # It should come from "inService" + iso_df.rename(columns={"proposed_completion_date": "negInService"}, inplace=True) + iso_df.rename(columns={"inService": "proposed_completion_date"}, inplace=True) + + # There are about 30 projects that are duplciated because there is an + # addition record where studyPhase == "Network Upgrade". I don't fully + # understand why but it seems like a reasonable drop + iso_df = iso_df.drop_duplicates(subset="queue_id") + return iso_df + + +def _transform_caiso(iso_df: pd.DataFrame) -> pd.DataFrame: + """Make caiso specific transformations.""" + iso_df = iso_df.query("queue_status == 'ACTIVE'").copy() + + iso_df = _create_project_status_classification_from_multiple_columns( + iso_df, + facilities_study_status_col="Facilities Study (FAS) or Phase II Cluster Study", + system_impact_study_col="System Impact Study or Phase I Cluster Study", + ia_col="Interconnection Agreement Status", + completed_strings=("Executed", "Complete"), + ) + return iso_df + + +def _transform_pjm(iso_df: pd.DataFrame) -> pd.DataFrame: + """Make pjm specific transformations.""" + is_active_project = iso_df.queue_status.isin( + { + "Engineering and Procurement", + "Partially in Service - Under Construction", + "Under Construction", + "Active", + } + ) + iso_df = iso_df.loc[is_active_project, :].copy() + + iso_df = _create_project_status_classification_from_multiple_columns( + iso_df, + facilities_study_status_col="Facilities Study Status", + system_impact_study_col="System Impact Study Status", + ia_col="Interim/Interconnection Service Agreement Status", + completed_strings=("Document Posted",), + ) + + # I think GridStatus wrongly assigned the raw "Name" column to "Project Name" + # instead of "Interconnection Location". 
+    # 97% of the values for Active projects refer to transmission lines
+    # ("asdf XXX kV").
+    stats = (
+        iso_df.query('queue_status == "Active"')["project_name"]
+        .str.lower()
+        .str.contains(r"\d *kv")
+        .agg(["mean", "sum"])
+    )
+    assert (
+        stats["mean"] > 0.9
+    ), f"Only {stats['mean']:.2%} of Active project_name look like transmission lines."
+
+    iso_df.drop(columns="point_of_interconnection", inplace=True)
+    iso_df.rename(columns={"project_name": "point_of_interconnection"}, inplace=True)
+
+    # GridStatus also wrongly sources "Proposed Completion Date" from
+    # "Revised In Service Date". It should come from "Commercial Operation Milestone".
+    # Switch the column names.
+    iso_df.rename(
+        columns={"proposed_completion_date": "Revised In Service Date"}, inplace=True
+    )
+    iso_df.rename(
+        columns={"Commercial Operation Milestone": "proposed_completion_date"},
+        inplace=True,
+    )
+
+    # winter_capacity_mw in pjm aligns with the LBNL data
+    iso_df["capacity_mw"] = iso_df["winter_capacity_mw"]
+    return iso_df
+
+
+def _transform_ercot(iso_df: pd.DataFrame) -> pd.DataFrame:
+    """Make ercot specific transformations."""
+    actionable_vals = (
+        "SS Completed, FIS Started, No IA",
+        "SS Completed, FIS Completed, No IA",
+    )
+    nearly_certain_vals = (
+        "SS Completed, FIS Completed, IA",
+        "SS Completed, FIS Started, IA",
+        "SS Completed, FIS Not Started, IA",
+    )
+
+    iso_df = _create_project_status_classification_from_single_column(
+        iso_df,
+        "GIM Study Phase",
+        nearly_certain_vals,
+        actionable_vals,
+    )
+
+    iso_df = iso_df.rename(columns={"GIM Study Phase": "interconnection_status_raw"})
+
+    iso_df = iso_df.rename(
+        columns={"interconnecting_entity": "developer"}, errors="raise"
+    )
+    return iso_df
+
+
+def _transform_spp(iso_df: pd.DataFrame) -> pd.DataFrame:
+    """Make spp specific transformations."""
+    # SPP queue does not include withdrawn projects.
+    # Grab all projects that aren't operational.
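+    # For example, a row whose "Status (Original)" is
+    # "IA FULLY EXECUTED/COMMERCIAL OPERATION" is dropped as operational here,
+    # while "DISIS STAGE" rows survive and are marked actionable below.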
+    is_active_project = ~iso_df["Status (Original)"].isin(
+        ("IA FULLY EXECUTED/COMMERCIAL OPERATION",)
+    )
+    iso_df = iso_df[is_active_project].copy()
+    # Impute missing status values.
+    # If queue_status is missing and withdrawn date is not missing, mark
+    # the project as withdrawn.
+    iso_df["queue_status"] = iso_df["queue_status"].mask(
+        iso_df["queue_status"].isna() & ~iso_df["withdrawn_date"].isna(), "Withdrawn"
+    )
+    # If queue_status is missing and commercial operation date is not missing, mark
+    # the project as completed.
+    iso_df["queue_status"] = iso_df["queue_status"].mask(
+        iso_df["queue_status"].isna() & ~iso_df["Commercial Operation Date"].isna(),
+        "Completed",
+    )
+    assert (
+        ~iso_df["queue_status"].isna().any()
+    ), f"{iso_df['queue_status'].isna().sum()} SPP projects are missing queue_status"
+
+    # Categorize certain and actionable projects
+    actionable_vals = ("DISIS STAGE", "FACILITY STUDY STAGE")
+    nearly_certain_vals = (
+        "IA FULLY EXECUTED/ON SCHEDULE",
+        "IA FULLY EXECUTED/ON SUSPENSION",
+        "IA PENDING",
+    )
+    iso_df = _create_project_status_classification_from_single_column(
+        iso_df,
+        "Status (Original)",
+        nearly_certain_vals,
+        actionable_vals,
+    )
+
+    iso_df = iso_df.rename(columns={"Status (Original)": "interconnection_status_raw"})
+
+    return iso_df
+
+
+def _transform_nyiso(iso_df: pd.DataFrame) -> pd.DataFrame:
+    """Make nyiso specific transformations."""
+    # NYISO status mapping from the excel sheet
+    status_mapping = {
+        0: "Withdrawn",
+        1: "Scoping Meeting Pending",
+        2: "FES Pending",
+        3: "FES in Progress",
+        4: "SRIS/SIS Pending",
+        5: "SRIS/SIS in Progress",
+        6: "SRIS/SIS Approved",
+        7: "FS Pending",
+        8: "Rejected Cost Allocation/Next FS Pending",
+        9: "FS in Progress",
+        10: "Accepted Cost Allocation/IA in Progress",
+        11: "IA Completed",
+        12: "Under Construction",
+        13: "In Service for Test",
+        14: "In Service Commercial",
+        15: "Partial In-Service",
+    }
+
+    # Some projects have multiple values listed. Grab the largest value.
+    if pd.api.types.is_string_dtype(iso_df["S"]) or pd.api.types.is_object_dtype(
+        iso_df["S"]
+    ):
+        iso_df["S"] = (
+            iso_df["S"]
+            .str.split(",")
+            .apply(lambda lst: max([pd.to_numeric(x) for x in lst]))
+        )
+    iso_df["S"] = pd.to_numeric(iso_df["S"])
+
+    # Remove all withdrawn (S == 0) and commercially in-service (S == 14) projects
+    iso_df = iso_df.loc[iso_df["S"].ne(0) & iso_df["S"].ne(14), :].copy()
+
+    # Categorize project status
+    iso_df["is_actionable"] = (iso_df["S"].ge(6) & iso_df["S"].lt(11)).fillna(False)
+    iso_df["is_nearly_certain"] = iso_df["S"].ge(11).fillna(False)
+    assert (
+        ~iso_df[["is_actionable", "is_nearly_certain"]].all(axis=1)
+    ).all(), "Some projects are marked as both actionable and nearly certain."
+
+    iso_df["interconnection_status_raw"] = iso_df["S"].map(status_mapping)
+    return iso_df
+
+
+def _transform_isone(iso_df: pd.DataFrame) -> pd.DataFrame:
+    """Make isone specific transformations."""
+    # Grab all active projects
+    iso_df = iso_df.query("queue_status == 'Active'").copy()
+
+    iso_df = _create_project_status_classification_from_multiple_columns(
+        iso_df,
+        facilities_study_status_col="Facilities Study Status",
+        system_impact_study_col="System Impact Study Status",
+        ia_col="Interconnection Agreement Status",
+        completed_strings=("Document Posted", "Executed"),
+    )
+
+    return iso_df
+
+
+def _normalize_project_locations(iso_df: pd.DataFrame) -> pd.DataFrame:
+    """Create a dataframe of project locations.
+
+    Some projects list multiple counties in the `county` field.
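+    For example, a raw value like "Greene, Macon" (an invented value, purely
+    illustrative) names two counties for a single project.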
+    This function explodes and geocodes the county names.
+
+    Args:
+        iso_df: the complete denormalized iso dataframe.
+    Returns:
+        geocoded_locations: a dataframe of geocoded project locations.
+
+    """
+    location_cols = [
+        "project_id",
+        "raw_county_name",
+        "raw_state_name",
+        "state_id_fips",
+        "county_id_fips",
+        "geocoded_locality_name",
+        "geocoded_locality_type",
+        "geocoded_containing_county",
+    ]
+
+    # Create a location table.
+    locations = iso_df.assign(
+        county=iso_df["county"].str.split(",|/|-|&| and ")
+    ).explode("county")
+    # geocode the projects
+    locations["county_project_id"] = range(0, len(locations))
+    locations = locations.set_index("county_project_id")
+
+    geocoded_locations = add_county_fips_with_backup_geocoding(
+        locations, state_col="state", locality_col="county"
+    )
+    geocoded_locations["raw_county_name"] = locations["county"]
+    geocoded_locations["raw_state_name"] = locations["state"]
+    geocoded_locations = geocoded_locations.reset_index(drop=True)
+    # correct some fips codes
+    geocoded_locations.loc[
+        geocoded_locations.county_id_fips.eq("51515"), "county_id_fips"
+    ] = "51019"  # https://www.ddorn.net/data/FIPS_County_Code_Changes.pdf
+
+    geocoded_locations = geocoded_locations[location_cols].copy()
+    duplicate_locations = geocoded_locations[
+        geocoded_locations[["county_id_fips", "project_id"]].duplicated(keep=False)
+    ]
+    assert (
+        len(duplicate_locations) < 30
+    ), f"Found more duplicate locations in Grid Status location table than expected:\n {duplicate_locations}"
+    return geocoded_locations
+
+
+def _normalize_project_capacity(iso_df: pd.DataFrame) -> pd.DataFrame:
+    """Create a dataframe of project capacities.
+
+    California lists multiple fuel types and capacity values for a single project.
+
+    Args:
+        iso_df: the complete denormalized iso dataframe.
+    Returns:
+        capacity_df: a dataframe of project capacities.
+    """
+    capacity_cols = ["project_id", "resource", "capacity_mw"]
+
+    is_caiso = iso_df.region.eq("caiso")
+    caiso = iso_df[is_caiso]
+
+    n_multicolumns = 3
+    caiso_capacity_cols = ["MW-" + str(n) for n in range(1, n_multicolumns + 1)]
+    attr_columns = {
+        "resource": ["Fuel-" + str(n) for n in range(1, n_multicolumns + 1)],
+        "capacity_mw": caiso_capacity_cols,
+    }
+    caiso_capacity_df = normalize_multicolumns_to_rows(
+        caiso,
+        attribute_columns_dict=attr_columns,
+        preserve_original_names=False,
+        index_cols=["project_id"],
+        dropna=True,
+    )
+    assert (
+        ~caiso_capacity_df[["project_id", "resource"]].duplicated().any()
+    ), "Found duplicate CAISO capacities."
+    assert (
+        caiso[caiso_capacity_cols].sum().sum() == caiso_capacity_df["capacity_mw"].sum()
+    ), "Total CAISO capacity not preserved after normalization."
+
+    capacity_df = pd.concat(
+        [iso_df[~is_caiso][capacity_cols], caiso_capacity_df[capacity_cols]]
+    )
+    return capacity_df
+
+
+def _normalize_projects(
+    iso_df: pd.DataFrame,
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Normalize Gridstatus projects into projects and capacities.
+
+    CAISO is the only ISO that has multiple "capacities" per project.
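+
+    For example, a single CAISO row can carry up to three Fuel-N / MW-N column
+    pairs; _normalize_project_capacity above reshapes them into one
+    (project_id, resource, capacity_mw) row per fuel.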
+
+    """
+    project_cols = [
+        "project_id",
+        "actual_completion_date",
+        "interconnecting_entity",
+        "point_of_interconnection",
+        "project_name",
+        "proposed_completion_date",
+        "queue_date",
+        "queue_id",
+        "queue_status",
+        "interconnection_status_raw",
+        "utility",
+        "withdrawal_comment",
+        "withdrawn_date",
+        "is_actionable",
+        "is_nearly_certain",
+        "region",
+        "entity",
+        "developer",
+    ]
+    location_df = _normalize_project_locations(iso_df)
+    # Create a capacity table
+    capacity_df = _normalize_project_capacity(iso_df)
+
+    return iso_df[project_cols], capacity_df, location_df
+
+
+def _prep_for_deduplication(df: pd.DataFrame) -> None:
+    """Add the derived columns used by the deduplication key."""
+    df["point_of_interconnection_clean"] = _normalize_point_of_interconnection(
+        df["point_of_interconnection"]
+    )
+    df["utility_clean"] = df["utility"].fillna(df["region"])
+    return
+
+
+def transform(raw_dfs: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
+    """Clean Grid Status Interconnection Queue data.
+
+    Args:
+        raw_dfs: raw dataframes for each ISO.
+
+    Returns:
+        A dictionary of cleaned Grid Status data queues.
+    """
+    # create one dataframe
+    iso_cleaning_functions = {
+        "miso": _transform_miso,
+        "caiso": _transform_caiso,
+        "pjm": _transform_pjm,
+        "ercot": _transform_ercot,
+        "spp": _transform_spp,
+        "nyiso": _transform_nyiso,
+        "isone": _transform_isone,
+    }
+
+    projects = []
+    for iso, df in raw_dfs.items():
+        # Apply rename
+        renamed_df = df.rename(columns=COLUMN_RENAME_DICT).copy()
+
+        # Apply iso specific cleaning functions
+        renamed_df = iso_cleaning_functions[iso](renamed_df)
+
+        renamed_df["region"] = iso
+        renamed_df["entity"] = iso.upper()
+        projects.append(renamed_df)
+
+    active_projects = pd.concat(projects)
+    active_projects["queue_status"] = active_projects.queue_status.str.lower()
+
+    # parse dates
+    date_cols = [col for col in list(active_projects) if "date" in col]
+    for col in date_cols:
+        active_projects[col] = pd.to_datetime(active_projects[col], utc=True)
+
+    # create project_id
+    active_projects["project_id"] = np.arange(len(active_projects), dtype=np.int32)
+
+    # deduplicate active projects
+    pre_dedupe = len(active_projects)
+    active_projects = deduplicate_active_projects(
+        active_projects,
+        key=[
+            "point_of_interconnection_clean",  # derived in _prep_for_deduplication
+            "capacity_mw",
+            "county",
+            "state",
+            "utility_clean",  # derived in _prep_for_deduplication
+            "resource",
+        ],
+        tiebreak_cols=["queue_date", "proposed_completion_date"],
+        intermediate_creator=_prep_for_deduplication,
+    )
+    dupes = pre_dedupe - len(active_projects)
+    logger.info(f"Deduplicated {dupes} ({dupes/pre_dedupe:.2%}) projects.")
+
+    # Normalize data
+    (
+        normalized_projects,
+        normalized_capacities,
+        normalized_locations,
+    ) = _normalize_projects(active_projects)
+
+    # harmonize types
+    normalized_capacities = _clean_resource_type(normalized_capacities)
+
+    # Correct dtypes
+    normalized_capacities["capacity_mw"] = pd.to_numeric(
+        normalized_capacities.capacity_mw
+    )
+    normalized_projects = enforce_dtypes(
+        normalized_projects, table_name="gridstatus_projects", schema="data_warehouse"
+    )
+    normalized_capacities = enforce_dtypes(
+        normalized_capacities,
+        table_name="gridstatus_resource_capacity",
+        schema="data_warehouse",
+    )
+
+    dfs = {}
+    dfs["gridstatus_projects"] = normalized_projects
+    dfs["gridstatus_resource_capacity"] = normalized_capacities
+    dfs["gridstatus_locations"] = normalized_locations
+    return dfs
diff --git a/src/dbcp/transform/helpers.py b/src/dbcp/transform/helpers.py
index 542284c7..47320a8e 100644
--- a/src/dbcp/transform/helpers.py
+++ b/src/dbcp/transform/helpers.py
@@ -1,6 +1,6 @@
 """Common transform operations."""
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Sequence
 
 import pandas as pd
 from joblib import Memory
@@ -31,7 +31,7 @@ def normalize_multicolumns_to_rows(
     df: pd.DataFrame,
-    attribute_columns_dict: Dict[str, List[str]],
+    attribute_columns_dict: Dict[str, Sequence[str]],
     index_cols: Optional[List[str]] = None,
     preserve_original_names=True,
     dropna=True,
@@ -43,7 +43,7 @@
 
     Args:
         df (pd.DataFrame): dataframe with multivalued column(s) encoded as multiple columns
-        attribute_columns_dict (Dict[str,List[str]]): dict mapping new value names to a list of
+        attribute_columns_dict (Dict[str,Sequence[str]]): dict mapping new value names to a list of
            columns containing that value. If there are multiple such lists, the order of associated
            columns must be the same (eg. if numbered, sorted in same order). See example below.
        index_cols (Optional[List[str]], optional): Columns to use as IDs in original dataframe. If
diff --git a/src/dbcp/transform/justice40.py b/src/dbcp/transform/justice40.py
index 95964790..7a236576 100644
--- a/src/dbcp/transform/justice40.py
+++ b/src/dbcp/transform/justice40.py
@@ -160,21 +160,22 @@ def transform(raw_j40: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
     percent_cols = list(filter(lambda col: col.endswith("_percent"), list(out_df)))
     for col in percent_cols:
         col_max = out_df[col].max()
-        if col_max > 1 and col_max <= 100:
-            out_df[col] = out_df[col] / 100
-        elif col_max <= 1:
-            continue
+        col_min = out_df[col].min()
+        if 0 <= col_max and 0 <= col_min:
+            if col_max <= 1:
+                continue
+            elif col_max <= 105:  # 105 to account for minor numerical errors
+                out_df[col] /= 100
+            elif col_max <= 10500:
+                # sometimes percents have been multiplied by 100 twice
+                out_df[col] /= 10000
+            else:
+                logger.info(f"{col} is not a percent")
         else:
             logger.info(f"{col} is not a percent")
 
     # tract_within_tribal_areas_percent has a couple of values that are over 100%
-    is_over_100_pct = out_df["tract_within_tribal_areas_percent"] > 100
-    out_df["tract_within_tribal_areas_percent"] = out_df[
-        "tract_within_tribal_areas_percent"
-    ].mask(is_over_100_pct, 100)
-    out_df["tract_within_tribal_areas_percent"] = (
-        out_df["tract_within_tribal_areas_percent"] / 100
-    )
+    out_df["tract_within_tribal_areas_percent"] = out_df[
+        "tract_within_tribal_areas_percent"
+    ].clip(upper=1)
 
     # Correct percentiles
     percentile_cols = list(
diff --git a/src/dbcp/transform/lbnl_iso_queue.py b/src/dbcp/transform/lbnl_iso_queue.py
index 5db8e29c..e1401794 100644
--- a/src/dbcp/transform/lbnl_iso_queue.py
+++ b/src/dbcp/transform/lbnl_iso_queue.py
@@ -1,6 +1,6 @@
 """Functions to transform LBNL ISO queue tables."""
-
-from typing import Dict, List
+import logging
+from typing import Callable, Dict, List, Sequence
 
 import numpy as np
 import pandas as pd
@@ -12,6 +12,8 @@
 )
 from pudl.helpers import add_fips_ids as _add_fips_ids
 
+logger = logging.getLogger(__name__)
+
 RESOURCE_DICT = {
     "Battery Storage": {
         "codes": ["Battery", "Batteries", "BAT", "ES"],
@@ -126,8 +128,31 @@ def active_iso_queue_projects(active_projects: pd.DataFrame) -> pd.DataFrame:
     )
     # drop irrelevant columns (structurally all nan due to 'active' filter)
     active_projects.drop(columns=["date_withdrawn", "date_operational"], inplace=True)
-    active_projects = remove_duplicates(active_projects)  # sets index to project_id
     parse_date_columns(active_projects)
+    # deduplicate
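+    # The key below treats two rows as the same physical project when they agree
+    # on interconnection point, capacity, county, state, utility, and resource;
+    # the tiebreak columns then keep the most recently dated, most advanced record.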
+    pre_dedupe = len(active_projects)
+    active_projects = deduplicate_active_projects(
+        active_projects,
+        key=[
+            "point_of_interconnection_clean",  # derived in _prep_for_deduplication
+            "capacity_mw_resource_1",
+            "county_1",
+            "raw_state_name",
+            "utility_clean",  # derived in _prep_for_deduplication
+            "resource_type_1",
+        ],
+        tiebreak_cols=[  # first priority to last
+            "date_proposed",
+            "status_rank",  # derived in _prep_for_deduplication
+            "queue_date",
+        ],
+        intermediate_creator=_prep_for_deduplication,
+    )
+    n_dupes = pre_dedupe - len(active_projects)
+    logger.info(f"Deduplicated {n_dupes} ({n_dupes/pre_dedupe:.2%}) projects.")
+
+    active_projects.set_index("project_id", inplace=True)
+    active_projects.sort_index(inplace=True)
     # manual fix for duplicate resource type in raw data
     bad_proj_id = 1606
     assert (
@@ -324,8 +349,74 @@ def clean_resource_type(resource_df: pd.DataFrame) -> pd.DataFrame:
     return resource_df
 
 
-def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
-    """First draft deduplication of ISO queues.
+def _normalize_point_of_interconnection(ser: pd.Series) -> pd.Series:
+    """String normalization for point_of_interconnection.
+
+    Essentially a poor man's bag-of-words model.
+    """
+    out = (
+        ser.astype("string")
+        .str.lower()
+        .str.replace("-", " ")
+        .str.replace(r"(?:sub)station|kv| at |tbd", "", regex=True)
+        .fillna("")
+    )
+    out = pd.Series(  # make permutation invariant by sorting
+        [" ".join(sorted(x)) for x in out.str.split()],
+        index=out.index,
+        dtype="string",
+    ).str.strip()
+    out.replace("", pd.NA, inplace=True)
+    return out
+
+
+def _prep_for_deduplication(df: pd.DataFrame) -> None:
+    """Add the derived columns used by the deduplication key and tiebreaks."""
+    df["point_of_interconnection_clean"] = _normalize_point_of_interconnection(
+        df["point_of_interconnection"]
+    )
+    df["utility_clean"] = df["utility"].fillna(df["region"])
+
+    status_order = [  # from most to least advanced; will be assigned values N to 0
+        # Put withdrawn and suspended near the top (assume they are final statuses)
+        "operational",
+        "construction",
+        "withdrawn",
+        "suspended",
+        "ia executed",
+        "ia pending",
+        "facility study",
+        "system impact study",
+        "phase 4 study",
+        "feasibility study",
+        "cluster study",
+        "in progress (unknown study)",
+        "combined",
+        "not started",
+    ]
+    # assign numerical values for sorting. Largest value is prioritized.
+    status_map = dict(zip(reversed(status_order), range(len(status_order))))
+    df["status_rank"] = (
+        df["interconnection_status_lbnl"]
+        .str.strip()
+        .str.lower()
+        .map(status_map)
+        .fillna(-1)
+    ).astype(int)
+    return
+
+
+def deduplicate_active_projects(
+    df: pd.DataFrame,
+    key: Sequence[str],
+    tiebreak_cols: Sequence[str],
+    intermediate_creator: Callable[[pd.DataFrame], None],
+) -> pd.DataFrame:
+    """First draft deduplication of ISO queues (for active projects only).
+
+    The intention here is to identify rows that likely describe the same physical
+    project, but are duplicated due to different proposed start dates or IA statuses.
+    My assumption is that those kinds of duplicates exist to cover contingencies or
+    were created by mistake, and that only one project can actually be built.
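+
+    For example (toy values): two rows that share the full dedupe key but have
+    proposed dates of 2021 vs 2022 collapse to the 2022 row, because rows are
+    sorted descending on ``key + tiebreak_cols`` and only the first row of each
+    group is kept.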
 
     Args:
         df (pd.DataFrame): a queue dataframe
@@ -334,51 +425,25 @@ def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame: queue dataframe with duplicates removed
     """
     df = df.copy()
-    # do some string cleaning on point_of_interconnection
-    # for now "tbd" is mapped to "nan"
-    df["point_of_interconnection_clean"] = (
-        df["point_of_interconnection"]
-        .astype(str)
-        .str.lower()
-        .str.replace("substation", "")
-        .str.replace("kv", "")
-        .str.replace("-", " ")
-        .str.replace("station", "")
-        .str.replace(",", "")
-        .str.replace("at", "")
-        .str.replace("tbd", "nan")
+    original_cols = df.columns
+    # create whatever derived columns are needed
+    intermediate_creator(df)
+    intermediate_cols = df.columns.difference(original_cols)
+
+    tiebreak_cols = list(tiebreak_cols)
+    key = list(key)
+    # Where there are duplicates, keep the row with the largest values in tiebreak_cols
+    # (usually date_proposed, queue_date, and interconnection_status).
+    # Note that NaT is always sorted to the end, so nth(0) will always choose it last.
+    dedupe = (
+        df.sort_values(key + tiebreak_cols, ascending=False)
+        .groupby(key, as_index=False, dropna=False)
+        .nth(0)
     )
-    df["point_of_interconnection_clean"] = [
-        " ".join(sorted(x)) for x in df["point_of_interconnection_clean"].str.split()
-    ]
-    df["point_of_interconnection_clean"] = df[
-        "point_of_interconnection_clean"
-    ].str.strip()
-
-    key = [
-        "point_of_interconnection_clean",
-        "capacity_mw_resource_1",
-        "county_1",
-        "raw_state_name",
-        "region",
-        "resource_type_1",
-    ]
-    df["len_resource_type"] = df.resource_type_lbnl.str.len()
-    df.reset_index(drop=True, inplace=True)
-    dups = df.copy()
-    dups = dups.groupby(key, as_index=False, dropna=False).len_resource_type.max()
-    df = dups.merge(df, on=(key + ["len_resource_type"]))
-    # merge added duplicates with same len_resource_type, drop these
-    df = df[~(df.duplicated(key, keep="first"))]
-
-    # some final cleanup
-    df = (
-        df.drop(["len_resource_type", "point_of_interconnection_clean"], axis=1)
-        .set_index("project_id")
-        .sort_index()
-    )
-    return df
+    # remove whatever derived columns were created
+    dedupe.drop(columns=intermediate_cols, inplace=True)
+    return dedupe
 
 
 def _fix_independent_city_fips(location_df: pd.DataFrame) -> pd.DataFrame:
diff --git a/src/dbcp/validation/tests.py b/src/dbcp/validation/tests.py
index 4f8f3bef..b5c2d9d6 100644
--- a/src/dbcp/validation/tests.py
+++ b/src/dbcp/validation/tests.py
@@ -63,81 +63,147 @@ def test_j40_county_fips_coverage(engine: Engine):
     pd.testing.assert_frame_equal(actual, expected)
 
 
-def test_iso_projects_data_mart_aggregates_are_close(engine: Engine):
-    """Test that data mart aggregates are close to simple aggregates of the source tables.
+def test_gridstatus_fips_coverage(engine: Engine):
+    """Make sure we have high coverage of county_id_fips codes for gridstatus_projects."""
+    with engine.connect() as con:
+        gridstatus_locations = pd.read_sql_table(
+            "gridstatus_locations", con, schema="data_warehouse"
+        )
+    assert (
+        gridstatus_locations.county_id_fips.isna().sum() / len(gridstatus_locations)
+        < 0.02
+    ), "More than 2 percent of Grid Status locations could not be geocoded."
 
-    These aggregates don't exactly match (I should figure out why), but they're within 0.1%.
-    Probably either null handling or join logic on multi-location projects.
+
+def test_iso_projects_sources(engine: Engine):
+    """Check that the right resources come from the right sources."""
+    # all offshore wind projects from the proprietary source
+    proprietary_offshore = """
+    SELECT
+        source,
+        count(*) as n_offshore
+    from data_mart.iso_projects_long_format
+    where resource_clean ~* 'offshore'
+    group by 1
     """
-    data_mart_query = """
+    expected_source = {"proprietary"}
+    offshore_test = pd.read_sql(
+        proprietary_offshore, engine, index_col="source"
+    ).squeeze(
+        axis=1
+    )  # make series
+    actual_source = set(offshore_test.index)
+    assert (
+        actual_source == expected_source
+    ), f"Found offshore wind projects from the wrong source. {offshore_test}"
+
+    # all ISO projects from the gridstatus source
+    iso_projects = """
     SELECT
-        count(*) as n_project_location_resource,
-        sum(capacity_mw::float * frac_locations_in_county) as total_capacity
+        source,
+        count(*) as n_iso
     from data_mart.iso_projects_long_format
+    where iso_region ~* 'caiso|ercot|miso|nyiso|pjm|spp|isone'
+    group by 1
     """
-    source_query = """
-    WITH
-    iso_loc as (
-        SELECT
-            project_id,
-            county_id_fips,
-            1.0 / count(*) OVER (PARTITION BY project_id) as frac_county
-        FROM data_warehouse.iso_locations
-        FULL OUTER JOIN data_warehouse.iso_projects
-        USING (project_id)
-    ),
-    iso as (
+    expected_source = {"gridstatus"}  # region is currently NULL for offshore wind
+    iso_test = pd.read_sql(iso_projects, engine, index_col="source").squeeze(axis=1)
+    actual_source = set(iso_test.index)
+    assert (
+        actual_source == expected_source
+    ), f"Found ISO projects from the wrong source. {iso_test}"
+    # remaining projects from LBNL (non-ISO, non-offshore)
+    return
+
+
+def test_iso_projects_capacity_aggs(engine: Engine):
+    """Check that the capacity aggregates equal the source tables."""
+    data_mart_capacity = """
+    select
+        source,
+        resource_clean,
+        count(*) as n_project_locations,
+        -- double count capacity when there are multiple locations. Simplifies the test
+        sum(capacity_mw) as capacity_double_count_county
+    from data_mart.iso_projects_long_format
+    group by 1, 2
+    order by 1, 2
+    """
+    # simplified reimplementation of the data_mart.iso_projects_long_format table.
+    # This skips over the multi-county allocation stuff for simplicity.
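+    # Capacity is intentionally double counted for multi-county projects on
+    # both sides of the comparison, so the double counting cancels out.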
+    source_capacity = """
+    with
+    lbnl as (
         select
-            -- count project-county-resource items, not projects
-            count(*) as n_project_location_resource,
-            sum(capacity_mw * frac_county ) as total_capacity
-        from data_warehouse.iso_resource_capacity
-        full outer join iso_loc
-        USING (project_id)
-        where resource != 'Offshore Wind'
+            'lbnl' as source,
+            res.resource_clean,
+            count(*) as n_project_locations,
+            sum(res.capacity_mw) as capacity_double_count_county
+        FROM data_warehouse.iso_projects as proj
+        LEFT JOIN data_warehouse.iso_resource_capacity as res
+        ON proj.project_id = res.project_id
+        LEFT JOIN data_warehouse.iso_locations as loc
+        ON proj.project_id = loc.project_id
+        WHERE proj.region ~ 'non-ISO'
+        AND resource_clean != 'Offshore Wind'
+        group by 1, 2
     ),
-    offshore_proj as (
+    gridstatus as (
         select
-            -- count project-county-resource items (only one resource here)
-            -- not project-location-resource items or projects themselves
-            count(distinct (project_id, loc.county_id_fips)) as n_project_location_resource,
-            NULL::REAL as total_capacity
-        from data_warehouse.offshore_wind_projects
-        full outer join data_warehouse.offshore_wind_cable_landing_association
-        USING (project_id)
-        full outer join data_warehouse.offshore_wind_locations as loc
-        USING (location_id)
-        where construction_status != 'Operating'
-    ),
-    offshore_cap as (
-        SELECT
-            NULL::INTEGER as n_project_location_resource,
-            -- calc total capacity separately because the capacity is split
-            -- between locations. This is an easy way to get the same total
-            -- without re-implementing the splitting logic.
-            sum(capacity_mw) as total_capacity
-        from data_warehouse.offshore_wind_projects
-        where construction_status != 'Operating'
+            'gridstatus' as source,
+            res.resource_clean,
+            count(*) as n_project_locations,
+            sum(res.capacity_mw) as capacity_double_count_county
+        FROM data_warehouse.gridstatus_projects as proj
+        LEFT JOIN data_warehouse.gridstatus_resource_capacity as res
+        ON proj.project_id = res.project_id
+        LEFT JOIN data_warehouse.gridstatus_locations as loc
+        ON proj.project_id = loc.project_id
+        WHERE resource_clean not in ('Offshore Wind', 'Transmission')
+        group by 1, 2
     ),
-    combined as (
-        select * from iso
-        UNION ALL
-        select * from offshore_proj
-        UNION ALL
-        select * from offshore_cap
+    offshore as (
+        select
+            'proprietary' as source,
+            'Offshore Wind' as resource_clean,
+            count(*) as n_project_locations,
+            sum(proj.capacity_mw) as capacity_double_count_county
+        FROM data_warehouse.offshore_wind_projects as proj
+        LEFT JOIN data_warehouse.offshore_wind_cable_landing_association as loc
+        ON proj.project_id = loc.project_id
+        WHERE proj.construction_status != 'Online'
+        group by 1, 2
     )
-    select
-        sum(n_project_location_resource) as n_project_location_resource,
-        sum(total_capacity) as total_capacity
-    from combined
+    select * from lbnl
+    UNION ALL
+    select * from gridstatus
+    UNION ALL
+    select * from offshore
+    order by 1, 2
     """
-    source_totals = pd.read_sql(source_query, engine)
-    data_mart_totals = pd.read_sql(data_mart_query, engine)
-    absolute_diff = data_mart_totals - source_totals
-    relative_diff = absolute_diff / source_totals
+    data_mart = pd.read_sql(
+        data_mart_capacity, engine, index_col=["source", "resource_clean"]
+    )
+    source = pd.read_sql(
+        source_capacity, engine, index_col=["source", "resource_clean"]
+    )
+    absolute_diff = data_mart - source
+    relative_diff = absolute_diff / source
     assert (
-        relative_diff.lt(0.0015).all().all()
-    ), f"relative_difference too large: {relative_diff}"
+        relative_diff.lt(1e-5).all().all()
+    ), f"Aggregate resource metrics have a large relative difference: {relative_diff}"
+    return
+
+
+def test_iso_projects_data_mart(engine: Engine):
+    """Test that data mart aggregates match simple aggregates of the source tables.
+
+    This is a thin wrapper around the source-attribution and capacity
+    aggregation tests above.
+    """
+    test_iso_projects_sources(engine)
+    test_iso_projects_capacity_aggs(engine)
+    return
 
 
 def test_county_commission_election_info(engine: Engine):
@@ -270,6 +336,7 @@ def validate_warehouse(engine: Engine):
     """Run data warehouse validation tests."""
     logger.info("Validating data warehouse")
     test_j40_county_fips_coverage(engine)
+    test_gridstatus_fips_coverage(engine)
 
 
 def validate_data_mart(engine: Engine):
@@ -277,7 +344,7 @@
     logger.info("Validating data mart")
     test_county_long_vs_wide(engine)
     test_county_wide_coverage(engine)
-    test_iso_projects_data_mart_aggregates_are_close(engine)
+    test_iso_projects_data_mart(engine)
     test_county_commission_election_info(engine)
 
 
@@ -288,5 +355,6 @@
 
 if __name__ == "__main__":
+    # debugging entry point
     engine = get_sql_engine()
     validate_all(engine)