diff --git a/demo_baseLoad.ipynb b/demo_baseLoad.ipynb index 8471c63..555fe76 100644 --- a/demo_baseLoad.ipynb +++ b/demo_baseLoad.ipynb @@ -55,24 +55,38 @@ "}" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "> ⚠️**Note:** we have several example files available, \n", + "> *energy_use_big* is from a giant building with incredible base load. \n", + "> *energy_use_test1* is from a regular family residence.\n" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from openenergyid.baseload.main import main\n", + "from openenergyid.baseload.main import load_energy_data, analyze_base_load\n", "from openenergyid.enums import Granularity\n", "\n", + "# example_file = \"data/PP/energy_use_big.ndjson\"\n", + "example_file = \"data/PP/energy_use_test1.ndjson\"\n", + "# Load energy data\n", + "energy_data = load_energy_data(example_file)\n", "\n", "# Monthly analysis\n", - "monthly_metrics = main(\"data/PP/energy_use_big.ndjson\", Granularity.P1M)\n", + "monthly_metrics = analyze_base_load(energy_data, Granularity.P1M)\n", "\n", "# Daily analysis\n", - "daily_metrics = main(\"data/PP/energy_use_big.ndjson\", Granularity.P1D)\n", + "daily_metrics = analyze_base_load(energy_data, Granularity.P1D)\n", "\n", "# Hourly analysis\n", - "hourly_metrics = main(\"data/PP/energy_use_big.ndjson\", Granularity.PT1H)\n", + "hourly_metrics = analyze_base_load(energy_data, Granularity.PT1H)\n", "\n", "display(monthly_metrics)\n", "display(daily_metrics)\n", @@ -178,7 +192,7 @@ "outputs": [], "source": [ "energy_use_lf_1 = pl.scan_ndjson(\n", - " \"data/PP/energy_use_big.ndjson\",\n", + " example_file,\n", " schema={\"timestamp\": pl.Datetime(time_zone=\"Europe/Brussels\"), \"total\": pl.Float64},\n", ")\n", "testframe = (\n", @@ -197,7 +211,7 @@ "outputs": [], "source": [ "# Compute the value counts using Polars\n", - "value_counts = tf.group_by(\"total\").count().sort(\"total\")\n", + "value_counts = 
tf.group_by(\"total\").agg(pl.count(\"total\").alias(\"count\")).sort(\"total\")\n", "\n", "\n", "bar_chart = (\n", @@ -231,7 +245,7 @@ "outputs": [], "source": [ "lf = (\n", - " energy_use_lf_1.filter(pl.col(\"total\") >= 0)\n", + " energy_data.filter(pl.col(\"total\") >= 0)\n", " .sort(\"timestamp\")\n", " .group_by_dynamic(\"timestamp\", every=\"1d\")\n", " .agg(\n", @@ -407,7 +421,7 @@ " alt.Tooltip(\"month(timestamp):T\", title=\"Month\"),\n", " alt.Tooltip(\"total_monthly_usage:Q\", title=\"Total Usage (kWh)\"),\n", " alt.Tooltip(\"basislast_monthly_kwh:Q\", title=\"Basislast (kWh)\"),\n", - " alt.Tooltip(\"basislast_percentage:Q\", title=\"Basislast %\"),\n", + " # alt.Tooltip(\"basislast_percentage:Q\", title=\"Basislast %\"),\n", " ],\n", ")\n", "text = (\n", @@ -490,7 +504,14 @@ "outputs": [], "source": [ "alt.data_transformers.enable(\"vegafusion\")\n", - "alt.Chart(tf).transform_density(\n", + "# Calculate the 98th percentile\n", + "qx = tf[\"total\"].quantile(0.98)\n", + "\n", + "# Filter the data to include only the data up to the 98th percentile\n", + "filtered_tf = tf.filter(pl.col(\"total\") <= qx)\n", + "\n", + "# Create the density plot\n", + "alt.Chart(filtered_tf).transform_density(\n", " \"total\",\n", " as_=[\"total\", \"density\"],\n", ").mark_area().encode(\n", @@ -499,23 +520,52 @@ ").properties(width=900, height=400)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The chart plots the kernel density estimate (KDE) of the `total` energy usage values, filtered to include only the data below the 98th percentile. 
\n", + "\n", + "### Context:\n", + "- **X-axis (`total:Q`)**: Represents the energy usage values.\n", + "- **Y-axis (`density:Q`)**: Represents the estimated probability density of the energy usage values.\n", + "\n", + "### Interpretation:\n", + "- The KDE plot provides a smoothed estimate of the distribution of energy usage values.\n", + "- Peaks in the density plot indicate the most common energy usage values.\n", + "- Filtering to the 98th percentile removes extreme values (outliers), focusing on the more typical usage patterns.\n", + "\n", + "This helps in understanding the distribution and common ranges of energy usage in the dataset, highlighting where most of the energy usage values lie." + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "max = (\n", + "import altair as alt\n", + "import polars as pl\n", + "\n", + "# Assuming df_extended is already defined and contains the correct columns\n", + "\n", + "# Create the max chart\n", + "max_chart = (\n", " alt.Chart(df_extended)\n", " .mark_area()\n", " .encode(\n", " alt.X(\"timestamp:T\", axis=alt.Axis(title=\"Time\")),\n", " alt.Y(\"total_daily_usage:Q\", axis=alt.Axis(title=\"Energy use (kWh)\")),\n", - " tooltip=[\"timestamp:T\", \"total:Q\"],\n", + " tooltip=[\n", + " alt.Tooltip(\"timestamp:T\", title=\"Date\"),\n", + " alt.Tooltip(\"total_daily_usage:Q\", title=\"Total Daily Usage\"),\n", + " ],\n", " )\n", " .properties(width=1000, height=400)\n", ")\n", - "lowest = (\n", + "\n", + "# Create the lowest chart\n", + "lowest_chart = (\n", " alt.Chart(df_extended)\n", " .mark_area()\n", " .encode(\n", @@ -523,16 +573,18 @@ " alt.Y(\"min_power_usage_per_day:Q\", axis=alt.Axis(title=\"Energy use (kWh)\")),\n", " tooltip=[\n", " alt.Tooltip(\"timestamp:T\", title=\"Date\"),\n", - " alt.Tooltip(\"min_power_usage_per_day:Q\", title=\"Minimal recorded peak usage\"),\n", + " alt.Tooltip(\"min_power_usage_per_day:Q\", title=\"Minimal Recorded Peak Usage\"),\n", " 
],\n", " color=alt.value(\"orange\"),\n", " )\n", " .properties(width=1000, height=400)\n", ")\n", "\n", - "# )\n", + "# Combine the charts\n", + "final_chart = max_chart + lowest_chart\n", "\n", - "max + lowest" + "# Display the final chart\n", + "final_chart.display()" ] }, { @@ -541,8 +593,42 @@ "metadata": {}, "outputs": [], "source": [ - "# comparing the options\n", - "max + lowest + lowest_new" + "# Clean calculation of percentage with proper handling of edge cases\n", + "df_extended = df_extended.with_columns(\n", + " [\n", + " # Calculate percentage only when conditions make sense\n", + " pl.when(\n", + " (pl.col(\"total_daily_usage\") > 0.1) # Meaningful total usage threshold\n", + " & (\n", + " pl.col(\"min_power_usage_per_day\") < pl.col(\"total_daily_usage\")\n", + " ) # Logical constraint\n", + " )\n", + " .then((pl.col(\"min_power_usage_per_day\") / pl.col(\"total_daily_usage\") * 100))\n", + " .otherwise(None) # Use None for invalid cases\n", + " .alias(\"percentage_standby_usage\")\n", + " ]\n", + ")\n", + "\n", + "# Update the standby_percentage chart to handle the cleaned data\n", + "standby_percentage = (\n", + " alt.Chart(df_extended)\n", + " .mark_area()\n", + " .encode(\n", + " x=\"timestamp:T\",\n", + " y=alt.Y(\n", + " \"percentage_standby_usage:Q\",\n", + " axis=alt.Axis(format=\"%\"),\n", + " scale=alt.Scale(domain=[0, 100]), # Force scale from 0-100%\n", + " ),\n", + " tooltip=[\n", + " \"timestamp:T\",\n", + " alt.Tooltip(\"percentage_standby_usage:Q\", format=\".1f\", title=\"Standby %\"),\n", + " alt.Tooltip(\"total_daily_usage:Q\", format=\".2f\", title=\"Total Usage\"),\n", + " alt.Tooltip(\"min_power_usage_per_day:Q\", format=\".2f\", title=\"Min Power\"),\n", + " ],\n", + " )\n", + " .properties(width=800, height=300, title=\"Daily Standby Usage Percentage\")\n", + ")" ] }, { @@ -551,12 +637,27 @@ "metadata": {}, "outputs": [], "source": [ - "# Assuming df is your DataFrame from the previous analysis\n", + "import altair as alt\n", 
+ "import polars as pl\n", + "\n", + "# Assuming df_extended is already defined and contains the correct columns\n", + "\n", + "# Define a threshold for very small total_daily_usage values\n", + "threshold = 0.01 # Adjust this value as needed\n", + "\n", + "# Create the percentage_standby_usage column if it doesn't exist\n", + "if \"percentage_standby_usage\" not in df_extended.columns:\n", + " df_extended = df_extended.with_columns(\n", + " pl.when(pl.col(\"total_daily_usage\") > threshold)\n", + " .then((pl.col(\"min_power_usage_per_day\") / pl.col(\"total_daily_usage\") * 100))\n", + " .otherwise(0)\n", + " .alias(\"percentage_standby_usage\")\n", + " )\n", "\n", "# 1. Standby Usage vs Total Usage Over Time\n", "standby_vs_total = (\n", " alt.Chart(df_extended)\n", - " .transform_fold([\"total\", \"min_power_usage_per_day\"], as_=[\"variable\", \"value\"])\n", + " .transform_fold([\"total_daily_usage\", \"min_power_usage_per_day\"], as_=[\"variable\", \"value\"])\n", " .mark_line()\n", " .encode(\n", " x=\"timestamp:T\",\n", @@ -609,17 +710,16 @@ " alt.Chart(df_extended)\n", " .mark_circle()\n", " .encode(\n", - " x=\"total:Q\",\n", + " x=\"total_daily_usage:Q\",\n", " y=\"min_power_usage_per_day:Q\",\n", " color=alt.Color(\"month(timestamp):N\", scale=alt.Scale(scheme=\"category10\")),\n", - " tooltip=[\"timestamp:T\", \"total:Q\", \"min_power_usage_per_day:Q\"],\n", + " tooltip=[\"timestamp:T\", \"total_daily_usage:Q\", \"min_power_usage_per_day:Q\"],\n", " )\n", " .properties(width=500, height=500, title=\"Total Daily Usage vs Standby Usage\")\n", ")\n", "\n", "# Combine charts\n", "combined_chart = (standby_vs_total & standby_percentage) | (heatmap & (histogram & scatter))\n", - "\n", "# Display the combined chart\n", "combined_chart.display()" ] @@ -630,20 +730,265 @@ "metadata": {}, "outputs": [], "source": [ - "# Create a KDE plot for the 'total' column\n", - "kde_chart = (\n", - " alt.Chart(df_extended)\n", - " .transform_density(\n", - " 
density=\"total\",\n", - " as_=[\"total\", \"density\"],\n", + "# Add seasonal decomposition to detect patterns\n", + "from statsmodels.tsa.seasonal import seasonal_decompose\n", + "\n", + "\n", + "def analyze_seasonality(df):\n", + " # Aggregate to daily means (note: gaps between days are not filled here)\n", + " daily = (\n", + " df.select([pl.col(\"timestamp\"), pl.col(\"total\").alias(\"value\")])\n", + " .group_by_dynamic(\"timestamp\", every=\"1d\")\n", + " .agg([pl.col(\"value\").mean()])\n", + " )\n", + "\n", + " # Decompose into trend, seasonal, residual\n", + " result = seasonal_decompose(\n", + " daily.sort(\"timestamp\").get_column(\"value\"),\n", + " period=7, # Weekly seasonality\n", + " )\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import altair as alt\n", + "import polars as pl\n", + "\n", + "\n", + "def analyze_and_plot_seasonality(df: pl.LazyFrame) -> tuple[alt.VConcatChart, pl.DataFrame]:\n", + " \"\"\"\n", + " Analyze seasonality of energy usage data and create visualization\n", + "\n", + " Args:\n", + " df: LazyFrame with timestamp and total columns\n", + " Returns:\n", + " tuple containing (altair chart, results dataframe)\n", + " \"\"\"\n", + " # Prepare daily data\n", + " daily = (\n", + " df.filter(pl.col(\"total\") >= 0)\n", + " .group_by_dynamic(\"timestamp\", every=\"1d\")\n", + " .agg(pl.col(\"total\").mean().alias(\"value\"))\n", + " .sort(\"timestamp\")\n", + " .collect()\n", + " )\n", + "\n", + " # Get numpy array for decomposition\n", + " values = daily.get_column(\"value\").to_numpy()\n", + "\n", + " # Perform decomposition\n", + " decomposition = seasonal_decompose(\n", + " values,\n", + " period=7, # Weekly seasonality\n", + " extrapolate_trend=True,\n", " )\n", - " .mark_area()\n", - " .encode(x=\"total:Q\", y=\"density:Q\", tooltip=[\"total:Q\", \"density:Q\"])\n", - " .properties(width=800, height=400, title=\"Kernel Density Estimate of Total Energy Usage\")\n", 
- ")\n", "\n", - "# Display the KDE chart\n", - "kde_chart.display()" + " # Create results DataFrame\n", + " result_df = pl.DataFrame(\n", + " {\n", + " \"timestamp\": daily.get_column(\"timestamp\"),\n", + " \"observed\": values,\n", + " \"trend\": decomposition.trend,\n", + " \"seasonal\": decomposition.seasonal,\n", + " \"residual\": decomposition.resid,\n", + " }\n", + " )\n", + "\n", + " # Create separate charts for each component\n", + " def create_component_chart(title: str, y_column: str) -> alt.Chart:\n", + " return (\n", + " alt.Chart(result_df)\n", + " .mark_line()\n", + " .encode(\n", + " x=alt.X(\"timestamp:T\", title=\"Date\"),\n", + " y=alt.Y(f\"{y_column}:Q\", title=\"Value\"),\n", + " tooltip=[\n", + " alt.Tooltip(\"timestamp:T\", title=\"Date\"),\n", + " alt.Tooltip(f\"{y_column}:Q\", format=\".2f\"),\n", + " ],\n", + " )\n", + " .properties(width=800, height=150, title=title)\n", + " )\n", + "\n", + " # Combine charts vertically\n", + " chart = alt.vconcat(\n", + " create_component_chart(\"Original Data\", \"observed\"),\n", + " create_component_chart(\"Trend Component\", \"trend\"),\n", + " create_component_chart(\"Seasonal Component\", \"seasonal\"),\n", + " create_component_chart(\"Residual Component\", \"residual\"),\n", + " ).resolve_scale(y=\"independent\")\n", + "\n", + " return chart, result_df\n", + "\n", + "\n", + "# Usage example\n", + "chart, decomp_data = analyze_and_plot_seasonality(energy_data)\n", + "chart.display()\n", + "\n", + "# Calculate insights using Polars expressions\n", + "insights = decomp_data.select(\n", + " [\n", + " pl.col(\"observed\").mean().alias(\"daily_avg\"),\n", + " pl.col(\"seasonal\").abs().mean().alias(\"seasonal_var\"),\n", + " (pl.col(\"trend\").diff().mean() > 0).alias(\"is_increasing\"),\n", + " ]\n", + ").row(0)\n", + "\n", + "print(\"\\nKey Insights:\")\n", + "print(f\"Average daily consumption: {insights[0]:.2f} kWh\")\n", + "print(f\"Seasonal variation range: {insights[1]:.2f} kWh\")\n", + 
"print(f\"Trend direction: {'Increasing' if insights[2] else 'Decreasing'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Some notes about the difference between the large building and the small residential building\n", + "\n", + "perhaps some future optimisations for B2B clients." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Comparing Residential vs Commercial Energy Patterns\n", + "\n", + "### Large Building Analysis\n", + "- High baseline consumption (41.57 kWh daily average)\n", + "- Strong weekly cycles (±8.16 kWh variations)\n", + "- Gradual decrease in consumption trend over the year\n", + "- Clear business hours pattern visible in seasonal component\n", + "- Several significant drops visible in residuals (possibly holidays/closures)\n", + "\n", + "### Key Differences from Residential Pattern\n", + "| Aspect | Residential | Commercial Building |\n", + "|--------|-------------|-------------------|\n", + "| Daily Average | 0.10 kWh | 41.57 kWh |\n", + "| Seasonal Impact | ±0.01 kWh | ±8.16 kWh |\n", + "| Trend | Increasing | Decreasing |\n", + "| Pattern | Less structured | Strong weekly rhythm |\n", + "\n", + "### Valuable Insights\n", + "1. **Operational Efficiency**\n", + " - Commercial building shows improving efficiency (decreasing trend)\n", + " - Residential consumption gradually increasing, might need attention\n", + "\n", + "2. **Usage Patterns**\n", + " - Commercial: Very predictable weekly cycles\n", + " - Residential: More irregular, lifestyle-dependent\n", + "\n", + "3. **Optimization Opportunities**\n", + " - Commercial: Focus on reducing weekend/night consumption\n", + " - Residential: Address growing base load trend\n", + "\n", + "4. 
**Anomaly Detection**\n", + " - Commercial building shows clear operational vs non-operational periods\n", + " - Makes it easier to spot inefficiencies or system issues" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Seasonal Decomposition Analysis for the Residential Example (`energy_use_test1`)\n", + "\n", + "The time series analysis of energy consumption reveals several interesting patterns:\n", + "\n", + "### Overall Consumption Pattern\n", + "- Average daily consumption is relatively low at 0.10 kWh\n", + "- Shows consistent base load with regular peaks throughout the year\n", + "- Notably higher consumption periods in winter months (November-December)\n", + "\n", + "### Trend Analysis\n", + "- Overall increasing trend throughout 2023\n", + "- Noticeable dip in August (possibly vacation period)\n", + "- Stronger upward trend in the latter part of the year\n", + "- Base consumption has gradually increased from ~0.05 kWh to ~0.15 kWh\n", + "\n", + "### Seasonal Patterns\n", + "- Clear weekly cycles in consumption (±0.01 kWh variation)\n", + "- Weekend/weekday patterns are visible in the seasonal component\n", + "- Relatively stable seasonal pattern suggests consistent weekly routines\n", + "\n", + "### Unusual Events (Residuals)\n", + "- Several significant spikes above normal usage\n", + "- August shows lower variability, supporting the vacation hypothesis\n", + "- December shows increased volatility, possibly due to holiday activities\n", + "\n", + "This analysis suggests a household with a very efficient base load but gradually increasing consumption over the year. The weekly patterns are highly regular, indicating consistent lifestyle habits. The increasing trend might warrant attention to prevent further base load growth." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# General Analysis Remarks and Future Improvements\n", + "\n", + "## Key Visualization Insights\n", + "1. 
**Most Effective Client Visualizations**:\n", + " - Monthly bar chart (base load vs total usage) - Most intuitive for clients\n", + " - Daily pattern visualization - Helps identify anomalies\n", + " - Simple cost overlays - Translates energy into financial impact\n", + " \n", + "2. **Advanced Visualizations** (for expert analysis):\n", + " - Heatmap view - Reveals seasonal/weekly patterns\n", + " - Distribution plots - Shows consumption patterns\n", + " - Load curve analysis - Technical energy analysis\n", + "\n", + "## Potential Analysis Enhancements\n", + "\n", + "### Time Series Analysis\n", + "- Implement seasonal decomposition to detect patterns\n", + "- Add change point detection for base load shifts\n", + "- Correlate with weather data\n", + "- Analyze weekend vs weekday patterns\n", + "\n", + "### Smart KPIs and Benchmarks\n", + "1. **Comparative Metrics**:\n", + " - Base load vs similar buildings/households\n", + " - Historical improvement tracking\n", + " - Seasonal adjustments\n", + "\n", + "2. **Financial Insights**:\n", + " - Yearly standby power cost\n", + " - Potential savings calculations\n", + " - Return on investment for improvements\n", + "\n", + "### Edge Cases and Data Quality\n", + "- Vacation periods and very low usage days\n", + "- Grid outages and meter issues\n", + "- Solar PV impact handling\n", + "- Seasonal base load variations\n", + "\n", + "## Recommended Client Features\n", + "\n", + "### Actionable Insights\n", + "1. Simple traffic light system for base load performance\n", + "2. Concrete energy-saving recommendations\n", + "3. Clear cost implications of current base load\n", + "4. Easy comparison between different time periods\n", + "\n", + "### Smart Analytics\n", + "- Device-level consumption estimates\n", + "- Anomaly detection and alerts\n", + "- Predictive base load trends\n", + "- Custom benchmarking\n", + "\n", + "# Next Steps\n", + "\n", + "1. Implement basic benchmarking system\n", + "2. 
Add financial impact calculations\n", + "3. Develop anomaly detection\n", + "4. Create automated recommendations\n", + "5. Add weather correlation analysis\n", + "6. Implement device-level disaggregation\n" ] } ], diff --git a/openenergyid/baseload/__init__.py b/openenergyid/baseload/__init__.py index c13131d..daac3fc 100644 --- a/openenergyid/baseload/__init__.py +++ b/openenergyid/baseload/__init__.py @@ -3,15 +3,15 @@ from .main import ( BaseLoadMetrics, EnergySchema, - load_data, - calculate_base_load, + load_energy_data, + analyze_base_load, Granularity, ) __all__ = [ "BaseLoadMetrics", "EnergySchema", - "load_data", - "calculate_base_load", + "load_energy_data", + "analyze_base_load", "Granularity", ] diff --git a/tests/test_baseload.py b/tests/test_baseload.py new file mode 100644 index 0000000..a4cd92e --- /dev/null +++ b/tests/test_baseload.py @@ -0,0 +1,58 @@ +"""tests for main""" + +from datetime import datetime +import polars as pl +from openenergyid.baseload.main import Granularity, analyze_base_load + + +def test_granularity_feature(): + """ + Test the analyze_base_load function with different granularities. + + This function creates a sample LazyFrame with test data and tests the + analyze_base_load function with DAILY and HOURLY granularities. It verifies + the shape of the resulting DataFrame and checks for the presence of the + 'base_load_watts' column. + + Test Cases: + - DAILY granularity: Expects 2 rows in the result. + - HOURLY granularity: Expects 6 rows in the result. + + Raises: + AssertionError: If the shape of the resulting DataFrame does not match + the expected number of rows or if the 'base_load_watts' + column is not present in the result. 
+ """ + # Create a sample LazyFrame with test data + data = { + "timestamp": [ + datetime(2023, 1, 1, 0, 0), + datetime(2023, 1, 1, 1, 0), + datetime(2023, 1, 1, 2, 0), + datetime(2023, 1, 2, 0, 0), + datetime(2023, 1, 2, 1, 0), + datetime(2023, 1, 2, 2, 0), + ], + "total": [10, 20, 30, 40, 50, 60], + } + lf = pl.LazyFrame(data) + + # Test with DAILY granularity + result_daily = analyze_base_load(lf, Granularity.P1D) + print("Daily Granularity Result:") + print(result_daily) + assert result_daily.shape[0] == 2, "Expected 2 rows for DAILY granularity" + assert "base_load_watts" in result_daily.columns, "Expected 'base_load_watts' column in result" + + # Test with HOURLY granularity + result_hourly = analyze_base_load(lf, Granularity.PT1H) + print("\nHourly Granularity Result:") + print(result_hourly) + assert result_hourly.shape[0] == 6, "Expected 6 rows for HOURLY granularity" + assert "base_load_watts" in result_hourly.columns, "Expected 'base_load_watts' column in result" + + print("\nAll tests passed!") + + +if __name__ == "__main__": + test_granularity_feature()