Data gaps sphinx documentation #135

Merged · 16 commits · Mar 3, 2022
Changes from 11 commits
85 changes: 85 additions & 0 deletions docs/examples/data-completeness.py
@@ -0,0 +1,85 @@
"""
Missing Data Periods
====================

Identifying days with missing data using a daily "completeness" score,
and filtering out consecutive days with low completeness scores before
further data analysis.
"""

# %%
# Identifying days with missing data and filtering these days out reduces noise
# when performing data analysis. This example shows how to use a
# daily data "completeness" score to identify and filter out days with missing
# data. This includes using
# :py:func:`pvanalytics.quality.gaps.completeness_score`,
# :py:func:`pvanalytics.quality.gaps.complete`, and
# :py:func:`pvanalytics.quality.gaps.trim_incomplete`.

import pvanalytics
from pvanalytics.quality import gaps
import matplotlib.pyplot as plt
import pandas as pd
import pathlib

# %%
# First, we import the AC power data stream that we are going to check for
# completeness. The time series we download is a normalized AC power time
# series from the PV Fleets Initiative, and is available via the DuraMAT
# DataHub:
# https://datahub.duramat.org/dataset/inverter-clipping-ml-training-set-real-data

pvanalytics_dir = pathlib.Path(pvanalytics.__file__).parent
file = pvanalytics_dir / 'data' / 'ac_power_inv_2173.csv'
data = pd.read_csv(file, index_col=0, parse_dates=True)
data = data.asfreq("15T")

# %%
# Now, use :py:func:`pvanalytics.quality.gaps.completeness_score` to get the
# percentage of daily data that isn't NaN.
data_completeness_score = gaps.completeness_score(data['value_normalized'])
Reviewer (Member):
Maybe add a few comments here about what the data are, e.g., what is 'value_normalized'? Is it any column of numbers or are there constraints? I think one assumption that is not so obvious is that the completeness score is a fraction of a 24 hour day, thus nighttime values are expected.
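To make the reviewer's point concrete: assuming the completeness score is the fraction of non-NaN samples out of the number expected in a full 24-hour day at the series frequency (so nighttime values count toward the expectation), a hand-rolled equivalent might look like this sketch, with all data and names hypothetical:

```python
import numpy as np
import pandas as pd

# Hypothetical 15-minute series spanning two days; 96 samples expected per day.
idx = pd.date_range("2022-01-01", periods=192, freq="15T")
values = pd.Series(1.0, index=idx)
values.iloc[10:58] = np.nan  # knock out 48 of the first day's 96 samples

expected_per_day = pd.Timedelta("1D") / pd.Timedelta("15T")  # 96.0
score = values.notna().resample("D").sum() / expected_per_day
print(score.tolist())  # first day 0.5, second day 1.0
```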


# Visualize data completeness score as a time series.
data_completeness_score.plot()
plt.xlabel("Date")
plt.ylabel("Daily Completeness Score (Fractional)")
plt.tight_layout()
plt.show()

# %%
# Mask complete days, based on daily completeness score, using
# :py:func:`pvanalytics.quality.gaps.complete`.
min_completeness = 0.333
daily_completeness_mask = gaps.complete(data['value_normalized'],
minimum_completeness=min_completeness)

# Visualize the daily completeness score with the completeness mask applied
data_completeness_score.plot()
data_completeness_score.loc[daily_completeness_mask].plot(ls='', marker='.')
data_completeness_score.loc[~daily_completeness_mask].plot(ls='', marker='.')
plt.axhline(y=min_completeness, color='r', linestyle='--')
plt.legend(labels=["Completeness Score", "Threshold met",
"Threshold not met", "Completeness Threshold (.33)"],
loc="upper left")
plt.xlabel("Date")
plt.ylabel("Daily Completeness Score (Fractional)")
plt.tight_layout()
plt.show()

# %%
# Trim the time series based on the completeness score, where the time
# series must have at least 10 consecutive days of data that meet the
# completeness threshold. This is done using
# :py:func:`pvanalytics.quality.gaps.trim_incomplete`.
number_consecutive_days = 10
completeness_trim_mask = gaps.trim_incomplete(data['value_normalized'],
days=number_consecutive_days)
# Re-visualize the time series with the data masked by the trim mask
data.loc[completeness_trim_mask, 'value_normalized'].plot()
data.loc[~completeness_trim_mask, 'value_normalized'].plot()
plt.legend(labels=["True", "False"],
           title="Daily Data Passing")
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()
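The trimming step above keeps only the data between the first and last runs of `number_consecutive_days` complete days. A rough sketch of that idea on a synthetic daily mask (a sketch under stated assumptions, not the library's implementation):

```python
import pandas as pd

# Hypothetical daily completeness mask: two runs of complete days with a gap.
daily = pd.Series(False,
                  index=pd.date_range("2022-01-01", periods=16, freq="D"))
daily.iloc[2:7] = True    # first run: 5 complete days
daily.iloc[9:15] = True   # second run: 6 complete days

days = 5  # required run length
# True on the last day of any run of `days` consecutive complete days.
run_end = daily.astype(int).rolling(days).sum() == days
if run_end.any():
    first_start = run_end.idxmax() - pd.Timedelta(days=days - 1)
    last_end = run_end[::-1].idxmax()
    keep = (daily.index >= first_start) & (daily.index <= last_end)
    trim_mask = pd.Series(keep, index=daily.index)
else:
    trim_mask = pd.Series(False, index=daily.index)
print(int(trim_mask.sum()))  # 13 days retained
```

Days between the two runs are kept, matching trimming only at the series ends.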
68 changes: 68 additions & 0 deletions docs/examples/interpolated-periods.py
@@ -0,0 +1,68 @@
"""
Interpolated Data Periods
=========================

Identifying periods in a time series where the data has been
linearly interpolated.
"""

# %%
# Identifying periods where time series data has been linearly interpolated
# and removing these periods may help to reduce noise when performing future
# data analysis. This example shows how to use
# :py:func:`pvanalytics.quality.gaps.interpolation_diff`, which identifies and
# masks linearly interpolated periods.

import pvanalytics
from pvanalytics.quality import gaps
import matplotlib.pyplot as plt
import pandas as pd
import pathlib

# %%
# First, we import the AC power data stream that we are going to check for
# interpolated periods. The time series we download is a normalized AC power
# time series from the PV Fleets Initiative, and is available via the DuraMAT
# DataHub:
# https://datahub.duramat.org/dataset/inverter-clipping-ml-training-set-real-data
Reviewer (Member):
Similar comment as above - add comments describing data requirements, e.g., is it assumed (or required) that the input AC power is serially complete? In this example, it would help to point out that there are gaps in the data which will be filled in by interpolation, for the purpose of creating a dataset with linear features to illustrate their identification. Otherwise, it may be confusing why we are interpolating in order to detect interpolations.

pvanalytics_dir = pathlib.Path(pvanalytics.__file__).parent
file = pvanalytics_dir / 'data' / 'ac_power_inv_2173.csv'
data = pd.read_csv(file, index_col=0, parse_dates=True)
data = data.asfreq("15T")

# %%
# We plot the time series before linearly interpolating missing data periods.
data.plot()
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()

# %%
# The time series contains NaN periods, which we fill via linear
# interpolation so that
# :py:func:`pvanalytics.quality.gaps.interpolation_diff` has linear
# features to catch. We then re-visualize the data with those
# interpolated periods highlighted.
interpolated_data_mask = data['value_normalized'].isna()
data = data.interpolate(method='linear', limit_direction='forward', axis=0)
data['value_normalized'].plot()
data.loc[interpolated_data_mask, "value_normalized"].plot(ls='', marker='.')
plt.legend(labels=["AC Power", "Interpolated Data"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()

# %%
# Now, we use :py:func:`pvanalytics.quality.gaps.interpolation_diff` to
# identify linearly interpolated periods in the time series. We re-plot
# the data with this mask.
Reviewer (Member):
I would add a remark that nighttime periods are identified as interpolated data, because there is little to no change in AC power at night.

detected_interpolated_data_mask = gaps.interpolation_diff(
data['value_normalized'])
data['value_normalized'].plot()
data.loc[detected_interpolated_data_mask,
"value_normalized"].plot(ls='', marker='.')
plt.legend(labels=["AC Power", "Detected Interpolated Data"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()
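Linearly interpolated points share a repeated first difference (constant slope). As a minimal sketch of that detection idea, not the library's exact algorithm (whose window length and tolerance handling may differ):

```python
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0, 4.5, 7.0, 2.0, 1.0])
d = s.diff()
# Flag a point when its difference equals the previous difference,
# i.e., it is the third (or later) point on a straight line.
flagged = (d == d.shift()) & d.notna()
print(flagged.tolist())
```

This also explains the nighttime false positives the reviewer mentions: flat overnight AC power has a constant (zero) slope, so it looks interpolated.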
95 changes: 95 additions & 0 deletions docs/examples/stale-data.py
@@ -0,0 +1,95 @@
"""
Stale Data Periods
==================

Identifying stale data periods, defined as periods of
consecutive repeating values, in time series.
"""

# %%
# Identifying and removing stale, or consecutive repeating, values in time
# series data reduces noise when performing data analysis. This example shows
# how to use two PVAnalytics functions,
# :py:func:`pvanalytics.quality.gaps.stale_values_diff`
# and :py:func:`pvanalytics.quality.gaps.stale_values_round`, to identify
# and mask stale data periods in time series data.

import pvanalytics
from pvanalytics.quality import gaps
import matplotlib.pyplot as plt
import pandas as pd
import pathlib
import numpy as np

# %%
# First, we import the AC power data stream that we are going to check for
# stale data periods. The time series we download is a normalized AC power time
# series from the PV Fleets Initiative, and is available via the DuraMAT
# DataHub:
# https://datahub.duramat.org/dataset/inverter-clipping-ml-training-set-real-data
Reviewer (Member):
Same as above.


pvanalytics_dir = pathlib.Path(pvanalytics.__file__).parent
file = pvanalytics_dir / 'data' / 'ac_power_inv_2173.csv'
data = pd.read_csv(file, index_col=0, parse_dates=True)
data = data.asfreq("15T")

# %%
# We plot the time series before inserting artificial stale data periods.
data.plot()
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.legend(labels=["AC Power"])
plt.tight_layout()
plt.show()

# %%
# We insert some repeating/stale data periods into the time series for the
# stale data functions to catch, and re-visualize the data, with those stale
# periods masked.

data[460:520] = data.iloc[460]
data[755:855] = data.iloc[755]
data[1515:1600] = data.iloc[1515]
stale_data_insert_mask = pd.Series(False, index=data.index)
# Numpy.r_ translates slice objects to concatenation along the first axis.
# See here:
# https://numpy.org/doc/stable/reference/generated/numpy.r_.html
stale_data_insert_mask.iloc[np.r_[460:520, 755:855, 1515:1600]] = True

data['value_normalized'].plot()
data.loc[stale_data_insert_mask, "value_normalized"].plot(ls='', marker='.')
plt.legend(labels=["AC Power", "Inserted Stale Data"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()
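As the comment above notes, `numpy.r_` concatenates slice ranges into a single index array; a quick illustration:

```python
import numpy as np

# Three position ranges would collapse the same way; two shown here.
idx = np.r_[2:5, 9:11]
print(idx.tolist())  # [2, 3, 4, 9, 10]
```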

# %%
# Now, we use :py:func:`pvanalytics.quality.gaps.stale_values_diff` to
# identify stale values in data. We visualize the detected stale periods
# graphically.

stale_data_mask = gaps.stale_values_diff(data['value_normalized'])
data['value_normalized'].plot()
data.loc[stale_data_mask, "value_normalized"].plot(ls='', marker='.')
plt.legend(labels=["AC Power", "Detected Stale Data"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()
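A run of identical values shows up as a run of zero first differences. As a minimal sketch of the diff-based idea behind stale-value detection (the library's exact window and boundary handling may differ):

```python
import pandas as pd

def stale_mask(series, window=3):
    """Flag values inside a run of at least `window` identical values."""
    same_as_prev = series.diff() == 0
    # True at the last point of each run of `window` identical values.
    run_end = same_as_prev.astype(int).rolling(window - 1).sum() == window - 1
    mask = run_end.copy()
    # Extend the flag backwards to cover the whole run.
    for k in range(1, window):
        mask = mask | run_end.shift(-k, fill_value=False)
    return mask

s = pd.Series([1.0, 5.0, 5.0, 5.0, 2.0, 3.0, 3.0])
print(stale_mask(s).tolist())  # only the three-value run is flagged
```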

# %%
# Now, we use :py:func:`pvanalytics.quality.gaps.stale_values_round` to
# identify stale values in data, using rounded data. This function yields
# results similar to :py:func:`pvanalytics.quality.gaps.stale_values_diff`,
# except it looks for consecutive repeating data that has been rounded to
# a settable number of decimal places.

stale_data_round_mask = gaps.stale_values_round(data['value_normalized'])
data['value_normalized'].plot()
data.loc[stale_data_round_mask, "value_normalized"].plot(ls='', marker='.')
plt.legend(labels=["AC Power", "Detected Stale Data"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()
Reviewer (Member):
Similar comment about stale periods at night.
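The rounding variant's idea, sketched: round first so that values differing by less than the rounding precision (e.g. sensor noise) also register as repeats. A minimal illustration with hypothetical data:

```python
import pandas as pd

s = pd.Series([0.5001, 0.5002, 0.4999, 0.9, 1.2])
# After rounding to 3 decimal places, the first three values are identical.
repeats = s.round(3).diff() == 0
print(repeats.tolist())  # [False, True, True, False, False]
```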

8 changes: 8 additions & 0 deletions docs/whatsnew/0.1.2.rst
Original file line number Diff line number Diff line change
@@ -20,6 +20,14 @@ Documentation

* Added an example for
:py:func:`pvanalytics.features.clipping.geometric` (:issue:`133`, :pull:`134`)
* Added examples for the quality.gaps module, including
:py:func:`pvanalytics.quality.gaps.stale_values_diff`,
:py:func:`pvanalytics.quality.gaps.stale_values_round`,
:py:func:`pvanalytics.quality.gaps.interpolation_diff`,
:py:func:`pvanalytics.quality.gaps.completeness_score`,
:py:func:`pvanalytics.quality.gaps.complete`, and
:py:func:`pvanalytics.quality.gaps.trim_incomplete`
(:issue:`133`, :pull:`135`)

Contributors
~~~~~~~~~~~~