diff --git a/data/PP/standby_verbruik_demo.ipynb b/demo_standby_verbruik.ipynb similarity index 96% rename from data/PP/standby_verbruik_demo.ipynb rename to demo_standby_verbruik.ipynb index 57ef794..2e0db94 100644 --- a/data/PP/standby_verbruik_demo.ipynb +++ b/demo_standby_verbruik.ipynb @@ -2,14 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 22, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# imports\n", "import polars as pl\n", "import json\n", - "import altair as alt" + "import altair as alt\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "# %autoreload?" ] }, { @@ -28,28 +32,28 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "5.01 μs ± 161 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" + "6.07 μs ± 115 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" ] } ], "source": [ "%%timeit\n", "energy_use_df = pl.scan_ndjson(\n", - " \"energy_use_test1.ndjson\",\n", + " \"data/PP/energy_use_test1.ndjson\",\n", " schema={\"timestamp\": pl.Datetime(time_zone=\"Europe/Brussels\"), \"total\": pl.Float64},\n", ")" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -79,14 +83,14 @@ "└───────────────────────────────┴───────┘" ] }, - "execution_count": 24, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "energy_use_lf_1 = pl.scan_ndjson(\n", - " \"energy_use_test1.ndjson\",\n", + " \"data/PP/energy_use_test1.ndjson\",\n", " schema={\"timestamp\": pl.Datetime(time_zone=\"Europe/Brussels\"), \"total\": pl.Float64},\n", ")\n", "energy_use_lf_1.collect().head()" @@ -101,21 +105,21 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "35.1 ms ± 2.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "34.3 ms ± 1.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", "# Read the JSON file\n", - "with open(\"energy_use.json\", \"r\") as file:\n", + "with open(\"data/PP/energy_use.json\", \"r\") as file:\n", " data = json.load(file)\n", "\n", "# Convert the data into a list of dictionaries\n", @@ -131,15 +135,46 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Defining the analysis \n", + "# Base Load analysis\n", "\n", - "and defining the frames\n", - "## WHAT is the standby?\n" + "## loading in the data" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base Load: 90570.4W\n", + "Daily Usage: 2173.7 kWh\n", + "Base Percentage: 65.5%\n" + ] + } + ], + "source": [ + "from openenergyid.baseload.main import main\n", + "\n", + "metrics = main(\"data/PP/energy_use_big.ndjson\")\n", + "# display(metrics)\n", + "print(f\"Base Load: {metrics.base_load_watts:.1f}W\")\n", + "print(f\"Daily Usage: {metrics.daily_usage_kwh:.1f} kWh\")\n", + "print(f\"Base Percentage: {metrics.base_percentage:.1f}%\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -177,11 +212,41 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
timestamptotal
datetime[μs, Europe/Brussels]f64
2024-01-01 00:00:00 CET51.625
2024-01-01 00:15:00 CET50.75
2024-01-01 00:30:00 CET38.5
2024-01-01 00:45:00 CET40.25
2024-01-01 01:00:00 CET59.500004
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌───────────────────────────────┬───────────┐\n", + "│ timestamp ┆ total │\n", + "│ --- ┆ --- │\n", + "│ datetime[μs, Europe/Brussels] ┆ f64 │\n", + "╞═══════════════════════════════╪═══════════╡\n", + "│ 2024-01-01 00:00:00 CET ┆ 51.625 │\n", + "│ 2024-01-01 00:15:00 CET ┆ 50.75 │\n", + "│ 2024-01-01 00:30:00 CET ┆ 38.5 │\n", + "│ 2024-01-01 00:45:00 CET ┆ 40.25 │\n", + "│ 2024-01-01 01:00:00 CET ┆ 59.500004 │\n", + "└───────────────────────────────┴───────────┘" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ "energy_use_lf_1 = pl.scan_ndjson(\n", - " \"energy_use_big.ndjson\",\n", + " \"data/PP/energy_use_big.ndjson\",\n", " schema={\"timestamp\": pl.Datetime(time_zone=\"Europe/Brussels\"), \"total\": pl.Float64},\n", ")\n", "testframe = (\n", @@ -189,19 +254,20 @@ " # .with_columns(pl.col(\"timestamp\").interpolate(method=\"linear\"))\n", ")\n", "tf = testframe.collect()\n", - "display(tf)" + "display(tf)\n", + "display(energy_use_lf_1.collect().head())" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_44261/1445527080.py:2: DeprecationWarning: `GroupBy.count` is deprecated. It has been renamed to `len`.\n", + "/tmp/ipykernel_22265/3491075188.py:2: DeprecationWarning: `GroupBy.count` is deprecated. It has been renamed to `len`.\n", " value_counts = tf.group_by(\"total\").count().sort(\"total\")\n" ] }, @@ -210,23 +276,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -314,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -380,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -432,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -440,23 +506,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -549,35 +615,31 @@ }, { "cell_type": "code", - "execution_count": 47, - "metadata": { - "vscode": { - "languageId": "javascript" - } - }, + "execution_count": 23, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -671,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -679,23 +741,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 32, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -800,7 +862,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -868,7 +930,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -876,23 +938,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -974,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -982,23 +1044,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 35, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1069,7 +1131,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1077,23 +1139,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 36, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1184,7 +1246,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1192,23 +1254,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 37, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1273,7 +1335,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1378,7 +1440,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1386,23 +1448,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ @@ -1476,114 +1538,6 @@ "kde_chart.display()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### testing differences between 1. absolute lowest value 2. 1 percent or 3. 5 percent" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "# import altair as alt\n", - "\n", - "# # Assuming df is your DataFrame from the previous analysis\n", - "# # clear lowest1percent variable\n", - "# # df = df.with_columns(pl.lit(None).alias(\"lowest_1_percent\"))\n", - "# # 1. Comparison of Different Standby Measures Over Time\n", - "# standby_comparison = (\n", - "# alt.Chart(df)\n", - "# .transform_fold([\"low_end\", \"lowest_1_percent\", \"lowest_5_percent\"], as_=[\"measure\", \"value\"])\n", - "# .mark_line()\n", - "# .encode(\n", - "# x=\"timestamp:T\",\n", - "# y=\"value:Q\",\n", - "# color=alt.Color(\"measure:N\", scale=alt.Scale(scheme=\"category10\")),\n", - "# tooltip=[\"timestamp:T\", \"value:Q\", \"measure:N\"],\n", - "# )\n", - "# .properties(\n", - "# width=800, height=400, title=\"Comparison of Different Standby Usage Measures Over Time\"\n", - "# )\n", - "# )\n", - "\n", - "# # 2. Daily Percentage of Total for Each Measure\n", - "# percentage_comparison = (\n", - "# alt.Chart(df)\n", - "# .transform_fold([\"low_end\", \"lowest_1_percent\", \"lowest_5_percent\"], as_=[\"measure\", \"value\"])\n", - "# .transform_calculate(percentage=\"datum.value / datum.total * 100\")\n", - "# .mark_line()\n", - "# .encode(\n", - "# x=\"timestamp:T\",\n", - "# y=alt.Y(\"percentage:Q\", axis=alt.Axis(format=\"%\")),\n", - "# color=alt.Color(\"measure:N\", scale=alt.Scale(scheme=\"category10\")),\n", - "# tooltip=[\"timestamp:T\", alt.Tooltip(\"percentage:Q\", format=\".2%\"), \"measure:N\"],\n", - "# )\n", - "# .properties(\n", - "# width=800, height=400, title=\"Daily Percentage of Total Energy Use for Each Standby Measure\"\n", - "# )\n", - "# )\n", - "\n", - "# # 3. Box Plot of Different Measures\n", - "# box_plot = (\n", - "# alt.Chart(df)\n", - "# .transform_fold([\"low_end\", \"lowest_1_percent\", \"lowest_5_percent\"], as_=[\"measure\", \"value\"])\n", - "# .mark_boxplot()\n", - "# .encode(x=\"measure:N\", y=\"value:Q\")\n", - "# .properties(width=400, height=300, title=\"Distribution of Different Standby Measures\")\n", - "# )\n", - "\n", - "# # 4. Scatter Plot: Lowest 1% vs Lowest 5%\n", - "# scatter_1_5 = (\n", - "# alt.Chart(df)\n", - "# .mark_circle()\n", - "# .encode(\n", - "# x=\"lowest_1_percent:Q\",\n", - "# y=\"lowest_5_percent:Q\",\n", - "# color=alt.Color(\"month(timestamp):N\", scale=alt.Scale(scheme=\"category10\")),\n", - "# tooltip=[\"timestamp:T\", \"lowest_1_percent:Q\", \"lowest_5_percent:Q\"],\n", - "# )\n", - "# .properties(width=400, height=400, title=\"Lowest 1% vs Lowest 5%\")\n", - "# )\n", - "\n", - "# # Combine charts\n", - "# combined_chart = (standby_comparison & percentage_comparison) | (box_plot & scatter_1_5)\n", - "\n", - "# # Display the combined chart\n", - "# combined_chart.display()" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "\n", - "# import plotly.express as px\n", - "# #\n", - "# # Create a histogram\n", - "# fig = px.histogram(\n", - "# df,\n", - "# x=\"timestamp\",\n", - "# histfunc=\"avg\",\n", - "# y=[\"total\", \"lowest_1_percent\"],\n", - "# title=\"Energy use\",\n", - "# labels={\n", - "# \"total\": \"Energy use (kWh)\",\n", - "# \"min_power_usage_per_day\": \"Min Power Usage per Day (kWh)\",\n", - "# },\n", - "# barmode=\"overlay\",\n", - "# )\n", - "# fig.update_traces(xbins_size=\"604800000\")\n", - "\n", - "# fig.show()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1593,7 +1547,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1604,6 +1558,14 @@ "An exception has occurred, use %tb to see the full traceback.\n", "\u001b[0;31mSystemExit\u001b[0m\u001b[0;31m:\u001b[0m Stopping the notebook execution here.\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/.cache/pypoetry/virtualenvs/openenergyid-Nm3FK_LY-py3.11/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3585: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.\n", + " warn(\"To exit: use 'exit', 'quit', or Ctrl-D.\", stacklevel=1)\n" + ] } ], "source": [ @@ -1646,7 +1608,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1829,7 +1791,7 @@ "import matplotlib.pyplot as plt\n", "\n", "# Read in pandas series from a json file\n", - "energy_use_lf_1 = pd.read_json(\"energy_use.json\", orient=\"index\")\n", + "energy_use_lf_1 = pd.read_json(\"data/PP/energy_use.json\", orient=\"index\")\n", "energy_use_lf_1.columns = [\"energy_use\"]\n", "energy_use_lf_1.Name = \"energy_use\"\n", "display(energy_use_lf_1)\n", @@ -1975,7 +1937,7 @@ "import numpy as np\n", "\n", "# Read in pandas series from a json file\n", - "energy_use_lf_1 = pd.read_json(\"energy_use.json\", orient=\"index\")\n", + "energy_use_lf_1 = pd.read_json(\"data/PP/energy_use.json\", orient=\"index\")\n", "energy_use_lf_1.columns = [\"energy_use\"]\n", "energy_use_lf_1.Name = \"energy_use\"\n", "display(energy_use_lf_1)\n", @@ -2020,7 +1982,7 @@ "\n", "# Load and preprocess the data\n", "\n", - "data = pd.read_json(\"energy_use.json\", orient=\"index\")\n", + "data = pd.read_json(\"data/PP/energy_use.json\", orient=\"index\")\n", "data.columns = [\"usage\"]\n", "data.index.name = \"timestamp\"\n", "data.index = pd.to_datetime(data.index)" diff --git a/openenergyid/baseload/__init__.py b/openenergyid/baseload/__init__.py new file mode 100644 index 0000000..cbf7ffb --- /dev/null +++ b/openenergyid/baseload/__init__.py @@ -0,0 +1,15 @@ +"""Base Load analysis for Open Energy ID.""" + +from .main import ( + BaseLoadMetrics, + EnergySchema, + load_data, + calculate_base_load, +) + +__all__ = [ + "BaseLoadMetrics", + "EnergySchema", + "load_data", + "calculate_base_load", +] diff --git a/openenergyid/baseload/main.py b/openenergyid/baseload/main.py new file mode 100644 index 0000000..b02f6bf --- /dev/null +++ b/openenergyid/baseload/main.py @@ -0,0 +1,114 @@ +""" +This module provides functionality for loading, validating, and analyzing energy usage data. + +Classes: + BaseLoadMetrics: A NamedTuple container for base load analysis metrics. + EnergySchema: A pandera DataFrameModel for validating energy usage data. + +Functions: + load_data(path: str) -> pl.LazyFrame: + Loads and validates energy usage data from an NDJSON file. + + calculate_base_load(lf: pl.LazyFrame) -> BaseLoadMetrics: + Calculates base load metrics from energy usage data. + + main(file_path: str) -> BaseLoadMetrics: + Processes energy data and returns base load metrics. + + test_energy_validation(): + Tests various data validation scenarios using pytest. +""" + +from typing import NamedTuple +import polars as pl +import pandera.polars as pa +## VERY important to use pandera.polars instead of pandera to avoid pandas errors + + +class BaseLoadMetrics(NamedTuple): + """Container for base load analysis metrics""" + + base_load_watts: float # Average base load in watts + daily_usage_kwh: float # Average daily usage in kWh + base_percentage: float # Base load as percentage of total + + +class EnergySchema(pa.DataFrameModel): + """Schema for energy usage data validation""" + + timestamp: pl.Datetime = pa.Field( + nullable=False, + coerce=True, + title="Measurement Timestamp", + description="Time of energy measurement in Europe/Brussels timezone", + ) + total: float = pa.Field( + ge=0, # Power should be non-negative + nullable=False, + title="Total Power", + description="Total power measurement in kW", + ) + + # Add example of pandera validation: dataframe-level validation + @pa.dataframe_check + def timestamps_are_ordered(self, data: pl.DataFrame) -> bool: + """Check if timestamps are in chronological order""" + return data["timestamp"].is_sorted() + + +def load_data(path: str) -> pl.LazyFrame: + """Load and validate energy usage data from NDJSON file""" + lf = pl.scan_ndjson( + path, + schema={"timestamp": pl.Datetime(time_zone="Europe/Brussels"), "total": pl.Float64}, + ) + # Convert to DataFrame for data-level validation, then back to LazyFrame for processing + validated_df = EnergySchema.validate(lf).collect() # type: ignore + return pl.LazyFrame(validated_df) + + +def calculate_base_load(lf: pl.LazyFrame) -> BaseLoadMetrics: + """ + Calculate base load metrics from energy usage data. + + Takes lowest 10 totals per day to determine base load. + Returns watts, kwh, and percentage metrics. + """ + metrics_df = ( + lf.filter(pl.col("total") >= 0) + .sort("timestamp") + .group_by_dynamic("timestamp", every="1d") + .agg( + [ + pl.col("total").sum().alias("total_daily_usage"), + (pl.col("total").sort().head(10).mean() * 4 * 24).alias("base_load_daily_kwh"), + ] + ) + .with_columns( + [ + (pl.col("base_load_daily_kwh") / pl.col("total_daily_usage") * 100).alias( + "base_percentage" + ) + ] + ) + .select( + [ + pl.col("base_load_daily_kwh").mean().alias("avg_daily_kwh"), + (pl.col("base_load_daily_kwh") * 1000 / 24).mean().alias("avg_watts"), + pl.col("base_percentage").mean().alias("avg_percentage"), + ] + ) + .collect() # TODO add validation for input data: correct format, not null, etc. + ) + + return BaseLoadMetrics( + base_load_watts=metrics_df[0, "avg_watts"], + daily_usage_kwh=metrics_df[0, "avg_daily_kwh"], + base_percentage=metrics_df[0, "avg_percentage"], + ) + + +def main(file_path: str) -> BaseLoadMetrics: + """Process energy data and return base load metrics""" + lf = load_data(file_path) + return calculate_base_load(lf) diff --git a/vis/basislast/monthly_bar.png b/vis/basislast/monthly_bar.png new file mode 100644 index 0000000..19d45d2 Binary files /dev/null and b/vis/basislast/monthly_bar.png differ diff --git a/vis/basislast/visualization.png b/vis/basislast/visualization.png new file mode 100644 index 0000000..c4bdb97 Binary files /dev/null and b/vis/basislast/visualization.png differ diff --git a/vis/basislast/visualization_month.png b/vis/basislast/visualization_month.png new file mode 100644 index 0000000..ed05fbe Binary files /dev/null and b/vis/basislast/visualization_month.png differ