diff --git a/notebooks/work-in-progress/output_asset_aggregations.ipynb b/notebooks/work-in-progress/output_asset_aggregations.ipynb new file mode 100644 index 0000000000..296378ccd3 --- /dev/null +++ b/notebooks/work-in-progress/output_asset_aggregations.ipynb @@ -0,0 +1,1152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c07231ee-a317-405b-9aec-56d5131ffb0d", + "metadata": {}, + "source": [ + "# Purpose\n", + "This notebook compares the unaggregated EIA denormalized tables with their monthly aggregated counterparts. Most of the unaggregated tables are already reported monthly, yet we still create monthly aggregations of them. Which aggregated tables are actually different? Do we need to retain all of the monthly aggregated tables?" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "de97d7ba-22f7-433e-9f2f-0b9df8b64fc7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "assert os.environ.get(\"DAGSTER_HOME\"), (\n", + " \"The DAGSTER_HOME env var is not set so dagster won't be able to find the assets.\"\n", + " \" Set the DAGSTER_HOME env var in this notebook or kill the jupyter server and set\"\n", + " \" the DAGSTER_HOME env var in your terminal and relaunch jupyter.\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c54503cc-19a2-4cd0-8724-f371eebf54e4", + "metadata": {}, + "source": [ + "## denorm_generation_eia923" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "d9d4dc6a-4539-436b-bc1a-c887cc5e9d57", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from dagster import AssetKey\n", + "\n", + "from pudl.etl import defs\n", + "import sqlalchemy as sa\n", + "import pandas as pd\n", + "\n", + "asset_key = \"denorm_generation_eia923\"\n", + "\n", + "assert False, \"Replace the sqlite connection string with a full local pudl db!\"\n", + "\n", + "engine = 
sa.create_engine(\"sqlite:////Users/bendnorman/catalyst/dagster-pudl-work/pudl_output/full_eia_pudl.sqlite\")\n", + "with engine.connect() as con:\n", + " denorm_generation_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "e185f2b3-491a-47fd-a0bf-836e632451ff", + "metadata": {}, + "outputs": [], + "source": [ + "asset_key = \"denorm_generation_monthly_eia923\"\n", + "\n", + "with engine.connect() as con:\n", + " denorm_generation_monthly_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "d6c0c078-35ce-4f36-be3b-b25f1aea032f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(645742, 10)\n", + "(645742, 10)\n" + ] + } + ], + "source": [ + "print(denorm_generation_eia923.shape)\n", + "print(denorm_generation_monthly_eia923.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "f2c1850a-b516-441f-aeef-cb3aa3bda9aa", + "metadata": {}, + "outputs": [], + "source": [ + "denorm_generation_eia923 = denorm_generation_eia923.drop(columns=[\"data_maturity\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "01f3ed0a-64a3-4fbd-9854-dc1388ca8bdf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
report_dateplant_id_eiaplant_id_pudlplant_name_eiautility_id_eiautility_id_pudlutility_name_eiagenerator_idunit_id_pudlnet_generation_mwh
02008-01-01332Barry19518Alabama Power Co1NaN96021.0
12008-02-01332Barry19518Alabama Power Co1NaN79256.0
22008-03-01332Barry19518Alabama Power Co1NaN91687.0
32008-04-01332Barry19518Alabama Power Co1NaN73693.0
42008-05-01332Barry19518Alabama Power Co1NaN68161.0
\n", + "
" + ], + "text/plain": [ + " report_date plant_id_eia plant_id_pudl plant_name_eia utility_id_eia utility_id_pudl utility_name_eia generator_id unit_id_pudl net_generation_mwh\n", + "0 2008-01-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 96021.0\n", + "1 2008-02-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 79256.0\n", + "2 2008-03-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 91687.0\n", + "3 2008-04-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 73693.0\n", + "4 2008-05-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 68161.0" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "denorm_generation_eia923.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "dd9c0b87-0c22-4190-baa1-401f3e7c9daa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
report_dateplant_id_eiaplant_id_pudlplant_name_eiautility_id_eiautility_id_pudlutility_name_eiagenerator_idunit_id_pudlnet_generation_mwh
02008-01-01332Barry19518Alabama Power Co1NaN96021.0
12008-02-01332Barry19518Alabama Power Co1NaN79256.0
22008-03-01332Barry19518Alabama Power Co1NaN91687.0
32008-04-01332Barry19518Alabama Power Co1NaN73693.0
42008-05-01332Barry19518Alabama Power Co1NaN68161.0
\n", + "
" + ], + "text/plain": [ + " report_date plant_id_eia plant_id_pudl plant_name_eia utility_id_eia utility_id_pudl utility_name_eia generator_id unit_id_pudl net_generation_mwh\n", + "0 2008-01-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 96021.0\n", + "1 2008-02-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 79256.0\n", + "2 2008-03-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 91687.0\n", + "3 2008-04-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 73693.0\n", + "4 2008-05-01 3 32 Barry 195 18 Alabama Power Co 1 NaN 68161.0" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "denorm_generation_monthly_eia923.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "8f3a402a-b674-4766-a302-4b6f2ebb9c38", + "metadata": {}, + "outputs": [], + "source": [ + "pk_fields = [\"plant_id_eia\", \"generator_id\", \"report_date\"]\n", + "\n", + "sorted_denorm_generation_monthly_eia923 = denorm_generation_monthly_eia923.sort_values(by=pk_fields).reset_index(drop=True)\n", + "sorted_denorm_generation_eia923 = denorm_generation_eia923.sort_values(by=pk_fields).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "58d78c46-b8e3-41fd-9c59-31c90cc834e9", + "metadata": {}, + "outputs": [], + "source": [ + "compare_df = sorted_denorm_generation_monthly_eia923.compare(sorted_denorm_generation_eia923)\n", + "assert compare_df.empty" + ] + }, + { + "cell_type": "markdown", + "id": "c0e37e82-79dd-4599-98af-c78b59de7852", + "metadata": {}, + "source": [ + "It looks like `denorm_generation_monthly_eia923` and `denorm_generation_eia923` are identical which is expected." 
+ ] + }, + { + "cell_type": "markdown", + "id": "82d28bb4-80b8-4274-96a3-eb596115cdc9", + "metadata": {}, + "source": [ + "## denorm_generation_fuel_combined_eia923" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "4f32df3e-2413-4e12-ba3e-3fcab58bbe16", + "metadata": {}, + "outputs": [], + "source": [ + "asset_key = \"denorm_generation_fuel_combined_eia923\"\n", + "\n", + "with engine.connect() as con:\n", + " denorm_generation_fuel_combined_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "3d001dfb-3200-4f5a-a408-7816bc65962b", + "metadata": {}, + "outputs": [], + "source": [ + "asset_key = \"denorm_generation_fuel_combined_monthly_eia923\"\n", + "\n", + "with engine.connect() as con:\n", + " denorm_generation_fuel_combined_monthly_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "82f3c640-b28f-4340-a09d-16cb21b358b2", + "metadata": {}, + "outputs": [], + "source": [ + "pk_fields = [\n", + " \"plant_id_eia\",\n", + " \"report_date\",\n", + " \"prime_mover_code\",\n", + " \"energy_source_code\",\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "99f2b6b7-2945-4243-b9da-23a2262454f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2859269, 18)\n", + "(2859269, 16)\n" + ] + } + ], + "source": [ + "print(denorm_generation_fuel_combined_eia923.shape)\n", + "print(denorm_generation_fuel_combined_monthly_eia923.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "f557d61b-ef31-44bc-9cce-9d3c4837d67e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data_maturity', 'fuel_type_code_aer'}" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "missing_cols = set(denorm_generation_fuel_combined_eia923.columns) - 
set(denorm_generation_fuel_combined_monthly_eia923.columns)\n", + "missing_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "ba878b08-ff15-4915-a28e-e5e14c15065e", + "metadata": {}, + "outputs": [], + "source": [ + "compare_cols = list(set(denorm_generation_fuel_combined_eia923.columns) - missing_cols)\n", + "\n", + "sorted_denorm_generation_fuel_combined_eia923 = denorm_generation_fuel_combined_eia923[compare_cols].sort_values(by=pk_fields).reset_index(drop=True)\n", + "sorted_denorm_generation_fuel_combined_monthly_eia923 = denorm_generation_fuel_combined_monthly_eia923[compare_cols].sort_values(by=pk_fields).reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "7d22621b-e88c-4e8e-80aa-7a14dc760371", + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[106], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m compare_df \u001b[38;5;241m=\u001b[39m sorted_denorm_generation_fuel_combined_eia923\u001b[38;5;241m.\u001b[39mcompare(sorted_denorm_generation_fuel_combined_monthly_eia923)\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compare_df\u001b[38;5;241m.\u001b[39mempty\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "compare_df = sorted_denorm_generation_fuel_combined_eia923.compare(sorted_denorm_generation_fuel_combined_monthly_eia923)\n", + "assert compare_df.empty" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "a8585149-05db-4887-9844-66a01dba7bc3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fuel_mmbtu_per_unit
selfother
05.8255.826211
10.000inf
25.8255.823980
30.000inf
45.8255.824742
.........
28592640.000inf
28592650.000inf
28592660.000inf
28592670.000inf
28592680.000inf
\n", + "

2607882 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " fuel_mmbtu_per_unit \n", + " self other\n", + "0 5.825 5.826211\n", + "1 0.000 inf\n", + "2 5.825 5.823980\n", + "3 0.000 inf\n", + "4 5.825 5.824742\n", + "... ... ...\n", + "2859264 0.000 inf\n", + "2859265 0.000 inf\n", + "2859266 0.000 inf\n", + "2859267 0.000 inf\n", + "2859268 0.000 inf\n", + "\n", + "[2607882 rows x 2 columns]" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare_df" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "3bde1303-1000-4761-bc07-0e663e6ebcb0", + "metadata": {}, + "outputs": [], + "source": [ + "diff_gen_fuel = sorted_denorm_generation_fuel_combined_eia923.loc[compare_df.index]" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "3d28a20d-6846-46c6-97ed-401eceeeee51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NG 674250\n", + "DFO 530256\n", + "WAT 373010\n", + "SUN 297877\n", + "WND 174413\n", + "BIT 94935\n", + "LFG 70394\n", + "RFO 61129\n", + "SUB 52729\n", + "WDS 37293\n", + "OG 25569\n", + "OBG 19689\n", + "MSB 19617\n", + "MSN 19335\n", + "KER 17052\n", + "GEO 15649\n", + "BLQ 14734\n", + "PC 13806\n", + "JF 11751\n", + "WO 10507\n", + "OTH 8982\n", + "TDF 7896\n", + "PG 6313\n", + "WH 6289\n", + "WC 5746\n", + "LIG 5193\n", + "RC 4634\n", + "SLW 4374\n", + "OBS 3968\n", + "AB 3624\n", + "MWH 3570\n", + "BFG 3112\n", + "OBL 3016\n", + "PUR 2941\n", + "SC 2385\n", + "WDL 988\n", + "SGC 468\n", + "SGP 206\n", + "ANT 182\n", + "Name: energy_source_code, dtype: int64" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diff_gen_fuel.energy_source_code.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "bc1c390e-a9cb-4268-9b1c-3cd75dc6e7c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 124, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "(diff_gen_fuel.energy_source_code == \"NUC\").any()" + ] + }, + { + "cell_type": "markdown", + "id": "a83e530b-c89c-42c7-a8b1-91fa7311974e", + "metadata": {}, + "source": [ + "It seems like the calculated `fuel_mmbtu_per_unit` introduces some rounding errors. Are all of the differences rounding errors?" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "b381ac76-fe26-4a76-8975-d6a78d7840fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 1.163572e+06\n", + "mean inf\n", + "std NaN\n", + "min -3.098800e+04\n", + "25% -2.462380e-05\n", + "50% -2.018916e-08\n", + "75% 1.896324e-05\n", + "max inf\n", + "dtype: float64" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "non_inf_compare_df = compare_df[compare_df.fuel_mmbtu_per_unit.other != np.inf]\n", + "fuel_mmbtu_per_unit_diff = (non_inf_compare_df.fuel_mmbtu_per_unit.self - non_inf_compare_df.fuel_mmbtu_per_unit.other)\n", + "fuel_mmbtu_per_unit_diff.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "f6bcb8a3-629f-4b3e-8f12-f9cf92b21d28", + "metadata": {}, + "source": [ + "Looks like the differences are mostly rounding errors and `inf` values produced by the aggregation step. Note that `mean` and `max` above are still `inf`, so filtering on `other != np.inf` alone does not remove every non-finite difference. Can we just use the unaggregated tables for downstream assets?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "36167a17-6ead-477f-ba24-7e9ce83d5378", + "metadata": {}, + "source": [ + "## denorm_boiler_fuel_eia923" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "0c301f07-9b73-4d9d-b80d-0a4f29804b8f", + "metadata": {}, + "outputs": [], + "source": [ + "asset_key = \"denorm_boiler_fuel_eia923\"\n", + "\n", + "with engine.connect() as con:\n", + " denorm_boiler_fuel_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "cebacd15-37c8-442c-af44-e18e5fa9a166", + "metadata": {}, + "outputs": [], + "source": [ + "asset_key = \"denorm_boiler_fuel_monthly_eia923\"\n", + "\n", + "with engine.connect() as con:\n", + " denorm_boiler_fuel_monthly_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "af500fc3-382a-47c2-a74f-73e8da50df24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1521304, 17)\n", + "(1521304, 17)\n" + ] + } + ], + "source": [ + "print(denorm_boiler_fuel_eia923.shape)\n", + "print(denorm_boiler_fuel_monthly_eia923.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "11784201-6296-42fd-9b6d-ddd9de79a87a", + "metadata": {}, + "outputs": [], + "source": [ + "pk_fields = [\n", + " \"plant_id_eia\",\n", + " \"boiler_id\",\n", + " \"energy_source_code\",\n", + " \"prime_mover_code\",\n", + " \"report_date\",\n", + " ]\n", + "\n", + "\n", + "sorted_denorm_boiler_fuel_eia923 = denorm_boiler_fuel_eia923.sort_values(by=pk_fields).reset_index(drop=True)\n", + "sorted_denorm_boiler_fuel_monthly_eia923 = denorm_boiler_fuel_monthly_eia923.sort_values(by=pk_fields).reset_index(drop=True)\n", + "\n", + "\n", + "compare_df = sorted_denorm_boiler_fuel_eia923.compare(sorted_denorm_boiler_fuel_monthly_eia923)" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "id": "301c9d58-35f6-445a-80f0-b33d966ca71c", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fuel_mmbtu_per_unitsulfur_content_pctash_content_pct
selfotherselfotherselfother
0NaNNaN0.490.49NaNNaN
5NaNNaN0.460.46NaNNaN
6NaNNaN0.460.46NaNNaN
9NaNNaN0.640.64NaNNaN
13NaNNaN0.460.46NaNNaN
.....................
15212870.0NaN0.00NaN0.0NaN
15212880.0NaN0.00NaN0.0NaN
15212890.0NaN0.00NaN0.0NaN
15212900.0NaN0.00NaN0.0NaN
15212910.0NaN0.00NaN0.0NaN
\n", + "

646210 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " fuel_mmbtu_per_unit sulfur_content_pct ash_content_pct \n", + " self other self other self other\n", + "0 NaN NaN 0.49 0.49 NaN NaN\n", + "5 NaN NaN 0.46 0.46 NaN NaN\n", + "6 NaN NaN 0.46 0.46 NaN NaN\n", + "9 NaN NaN 0.64 0.64 NaN NaN\n", + "13 NaN NaN 0.46 0.46 NaN NaN\n", + "... ... ... ... ... ... ...\n", + "1521287 0.0 NaN 0.00 NaN 0.0 NaN\n", + "1521288 0.0 NaN 0.00 NaN 0.0 NaN\n", + "1521289 0.0 NaN 0.00 NaN 0.0 NaN\n", + "1521290 0.0 NaN 0.00 NaN 0.0 NaN\n", + "1521291 0.0 NaN 0.00 NaN 0.0 NaN\n", + "\n", + "[646210 rows x 6 columns]" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "compare_df" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "1eeddd1a-822d-484c-81dd-4fed71cecce8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 2.213200e+04\n", + "mean 1.812164e-19\n", + "std 2.300727e-16\n", + "min -8.881784e-16\n", + "25% -1.110223e-16\n", + "50% -1.734723e-18\n", + "75% 1.110223e-16\n", + "max 8.881784e-16\n", + "dtype: float64" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(compare_df.sulfur_content_pct.self - compare_df.sulfur_content_pct.other).describe()" + ] + }, + { + "cell_type": "markdown", + "id": "9f7bad9c-e672-4369-a022-bd47cc69d00d", + "metadata": {}, + "source": [ + "I think this is another case of weird rounding and NaN errors from dividing values during the aggregation." 
+ ] + }, + { + "cell_type": "markdown", + "id": "c468bc02-97cd-44a6-96be-d7f24f31ca7e", + "metadata": {}, + "source": [ + "## denorm_fuel_receipts_costs_eia923" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "id": "9f41751f-43db-4d91-b37f-13d031251b12", + "metadata": {}, + "outputs": [], + "source": [ + "asset_key = \"denorm_fuel_receipts_costs_eia923\"\n", + "\n", + "with engine.connect() as con:\n", + " denorm_fuel_receipts_costs_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "2532a5ba-2597-43bb-8113-5682e76d9e67", + "metadata": {}, + "outputs": [], + "source": [ + "asset_key = \"denorm_fuel_receipts_costs_monthly_eia923\"\n", + "\n", + "with engine.connect() as con:\n", + " denorm_fuel_receipts_costs_monthly_eia923 = pd.read_sql_table(asset_key, con)" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "aa543614-c33c-468a-92cf-a44f19299a00", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(631975, 36)\n", + "(258115, 20)\n" + ] + } + ], + "source": [ + "print(denorm_fuel_receipts_costs_eia923.shape)\n", + "print(denorm_fuel_receipts_costs_monthly_eia923.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "8ca784a1-4002-47bf-9189-f5375436bd96", + "metadata": {}, + "source": [ + "Ok! The unaggregated and monthly aggregated tables are clearly different which makes sense given:\n", + "> There can be multiple deliveries of the same type of fuel from the same supplier to the same plant in a single month, so the table has no natural primary key." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}