Skip to content

Commit

Permalink
Move extracting curve data from oedb turbine library to own function …
Browse files Browse the repository at this point in the history
…to allow testing independent from oedb data
  • Loading branch information
birgits committed Feb 9, 2024
1 parent d57801e commit b4db824
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 51 deletions.
46 changes: 46 additions & 0 deletions tests/test_data_handling.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
get_turbine_types,
restore_default_turbine_data,
store_turbine_data_from_oedb,
_process_and_save_oedb_data,
)


Expand Down Expand Up @@ -99,6 +100,51 @@ def test_store_turbine_data_from_oedb(self, caplog):
assert "No cp-curve but has_cp_curve=True" not in caplog.text
assert "No power curve but has_power_curve=True" not in caplog.text

def test__prepare_and_save_oedb_turbine_curve_data(self, caplog):
    """Test `_process_and_save_oedb_data` with default and high threshold.

    Three dummy turbines are used: turbine 0 is fine, turbine 1 has
    duplicated power curve wind speeds and turbine 2 has a broken power
    curve value string. With the default threshold the files in
    `self.orig_path` must stay untouched; with a threshold of 0.95 they
    must be rewritten and the faulty turbines flagged.
    """
    ok_curve = "[15, 20, 25]"
    duplicated_speeds = "[15, 15, 25]"
    broken_values = "[15, 20, [25]"

    # dummy turbine library as it would be downloaded from the oedb
    turbine_data = pd.DataFrame(
        data={
            "id": [0, 1, 2],
            "turbine_type": ["turbine 0", "turbine 1", "turbine 2"],
            "has_power_curve": [True, True, True],
            "has_cp_curve": [True, True, True],
            "power_curve_wind_speeds": [
                ok_curve, duplicated_speeds, ok_curve
            ],
            "power_curve_values": [ok_curve, ok_curve, broken_values],
            "power_coefficient_curve_wind_speeds": [
                ok_curve, ok_curve, ok_curve
            ],
            "power_coefficient_curve_values": [
                ok_curve, ok_curve, ok_curve
            ],
            "thrust_coefficient_curve_wind_speeds": [0, 1, 2],
            "thrust_coefficient_curve_values": [0, 1, 2],
            "nominal_power": [0, 1, 2],
        },
        index=[0, 1, 2],
    )

    def modification_times():
        # last modification time of every file currently in the directory
        return {
            file_name: os.path.getmtime(
                os.path.join(self.orig_path, file_name)
            )
            for file_name in os.listdir(self.orig_path)
        }

    # low / default threshold: too many faulty turbines, nothing is written
    mtimes_before = modification_times()
    with caplog.at_level(logging.WARNING):
        _process_and_save_oedb_data(turbine_data)
    for file_name, mtime_after in modification_times().items():
        assert mtimes_before[file_name] == mtime_after
    assert "The turbine library data contains too many faulty " in caplog.text

    # high threshold: data is written despite the faulty turbines
    mtimes_before = modification_times()
    with caplog.at_level(logging.WARNING):
        _process_and_save_oedb_data(turbine_data, threshold=0.95)
    for file_name, mtime_after in modification_times().items():
        assert mtimes_before[file_name] < mtime_after
    assert "The turbine library data contains faulty power_curves" in caplog.text
    # faulty power curves are flagged, valid data keeps its flags
    assert not turbine_data.at[2, "has_power_curve"]
    assert not turbine_data.at[1, "has_power_curve"]
    assert turbine_data.at[1, "has_cp_curve"]
    assert turbine_data.at[0, "has_power_curve"]

def test_wrong_url_load_turbine_data(self):
"""Load turbine data from oedb with a wrong schema."""
with pytest.raises(
Expand Down
159 changes: 108 additions & 51 deletions windpowerlib/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,21 +158,34 @@ def load_turbine_data_from_oedb(schema="supply", table="wind_turbine_library"):


def store_turbine_data_from_oedb(
schema="supply", table="wind_turbine_library"
schema="supply", table="wind_turbine_library", threshold=0.2
):
r"""
Loads turbine library from the OpenEnergy database (oedb).
Turbine data is saved to csv files ('oedb_power_curves.csv',
'oedb_power_coefficient_curves.csv' and 'oedb_nominal_power') for offline
usage of the windpowerlib. If the files already exist they are overwritten.
In case the turbine library on the oedb contains too many faulty turbines,
the already existing files are not overwritten. The accepted percentage of faulty
turbines can be set through the parameter `threshold`.
Parameters
----------
schema : str
Database schema of the turbine library.
table : str
Table name of the turbine library.
threshold : float
In case there are turbines in the turbine library with faulty data (e.g.
duplicate wind speed entries in the power (coefficient) curve data), the
threshold defines the share of accepted faulty turbine data up to which the
existing turbine data is overwritten by the newly downloaded data.
For example, a threshold of 0.1 means that more than 10% of the
turbines would need to have invalid data in order to discard the downloaded
data. This is to make sure that in the rare case the oedb data is too buggy,
the turbine data that is by default provided with the windpowerlib is not
overwritten by poor data.
Returns
-------
Expand All @@ -182,11 +195,40 @@ def store_turbine_data_from_oedb(
"""
turbine_data = fetch_turbine_data_from_oedb(schema=schema, table=table)
# standard file name for saving data
filename = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
turbine_data = _process_and_save_oedb_data(
turbine_data, threshold=threshold
)
check_turbine_data(
filename = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
)
return turbine_data


def _process_and_save_oedb_data(turbine_data, threshold=0.2):
"""
Helper function to extract power (coefficient) curve data from the turbine library.
Parameters
-----------
turbine_data : :pandas:`pandas.DataFrame<frame>`
Raw turbine data downloaded from the oedb with
:func:`fetch_turbine_data_from_oedb`.
threshold : float
See parameter `threshold` in :func:`store_turbine_data_from_oedb`
for more information.
Returns
--------
:pandas:`pandas.DataFrame<frame>`
Turbine data of different turbines such as 'manufacturer',
'turbine_type', 'nominal_power'.
# get all power (coefficient) curves and save them to file
for curve_type in ["power_curve", "power_coefficient_curve"]:
"""
curve_types = ["power_curve", "power_coefficient_curve"]
# get all power (coefficient) curves
curve_dict = {}
broken_turbines_dict = {}
for curve_type in curve_types:
broken_turbine_data = []
curves_df = pd.DataFrame(columns=["wind_speed"])
for index in turbine_data.index:
Expand Down Expand Up @@ -222,67 +264,82 @@ def store_turbine_data_from_oedb(
curves_df = pd.merge(
left=curves_df, right=df, how="outer", on="wind_speed"
)
else:
broken_turbine_data.append(
turbine_data.loc[index, "turbine_type"])
except:
broken_turbine_data.append(turbine_data.loc[index, "turbine_type"])

# warning in case of broken turbine data
if len(broken_turbine_data) > 0:
issue_link = ("https://github.com/OpenEnergyPlatform/data-preprocessing"
"/issues/28")
# in case only some data is faulty, only give out warning
if len(broken_turbine_data) < 0.2 * len(turbine_data):
logging.warning(
f"The turbine library data contains faulty {curve_type}s. The "
f"{curve_type} data can therefore not be loaded for the following "
f"turbines: {broken_turbine_data}. "
f"Please report this in the following issue, in case it hasn't "
f"already been reported: {issue_link}"
)
save_turbine_data = True
curve_dict[curve_type] = curves_df
broken_turbines_dict[curve_type] = broken_turbine_data

# check if there are faulty turbines and if so, raise warning
# if there are too many, don't save downloaded data to disk but keep existing data
if any(len(_) > 0 for _ in broken_turbines_dict.values()):
issue_link = ("https://github.com/OpenEnergyPlatform/data-preprocessing"
"/issues/28")
# in case only some data is faulty, only give out warning
if all(len(_) < threshold * len(turbine_data)
for _ in broken_turbines_dict.values()):
save_turbine_data = True
for curve_type in curve_types:
if len(broken_turbines_dict[curve_type]) > 0:
logging.warning(
f"The turbine library data contains faulty {curve_type}s. The "
f"{curve_type} data can therefore not be loaded for the "
f"following turbines: {broken_turbine_data}. "
f"Please report this in the following issue, in case it hasn't "
f"already been reported: {issue_link}"
)
# set has_power_(coefficient)_curve to False for faulty turbines
for turb in broken_turbine_data:
for turb in broken_turbines_dict[curve_type]:
ind = turbine_data[turbine_data.turbine_type == turb].index[0]
col = ("has_power_curve" if curve_type == "power_curve"
else "has_cp_curve")
turbine_data.at[ind, col] = False
# in case most data is faulty, do not store downloaded data
else:
logging.warning(
f"The turbine library data contains too many faulty {curve_type}s,"
f"wherefore {curve_type} data is not loaded from the oedb. "
f"Please report this in the following issue, in case it hasn't "
f"already been reported: {issue_link}"
)
save_turbine_data = False
# in case most data is faulty, do not store downloaded data
else:
save_turbine_data = True

if save_turbine_data:
curves_df = curves_df.set_index("wind_speed").sort_index().transpose()
logging.warning(
f"The turbine library data contains too many faulty turbine datasets "
f"wherefore it is not loaded from the oedb. "
f"In case you want to circumvent this behaviour, you can specify a "
f"higher tolerance through the parameter 'threshold'."
f"Please report this in the following issue, in case it hasn't "
f"already been reported: {issue_link}"
)
save_turbine_data = False
else:
save_turbine_data = True

if save_turbine_data:
# standard file name for saving data
filename = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
# save curve data to csv
for curve_type in curve_types:
curves_df = curve_dict[curve_type].set_index(
"wind_speed").sort_index().transpose()
# power curve values in W
if curve_type == "power_curve":
curves_df *= 1000
curves_df.index.name = "turbine_type"
curves_df.sort_index(inplace=True)
curves_df.to_csv(filename.format("{}s".format(curve_type)))

# get turbine data and save to file (excl. curves)
turbine_data_df = turbine_data.drop(
[
"power_curve_wind_speeds",
"power_curve_values",
"power_coefficient_curve_wind_speeds",
"power_coefficient_curve_values",
"thrust_coefficient_curve_wind_speeds",
"thrust_coefficient_curve_values",
],
axis=1,
).set_index("turbine_type")
# nominal power in W
turbine_data_df["nominal_power"] *= 1000
turbine_data_df.sort_index(inplace=True)
turbine_data_df.to_csv(filename.format("turbine_data"))
check_turbine_data(filename)
# save turbine data to file (excl. curves)
turbine_data_df = turbine_data.drop(
[
"power_curve_wind_speeds",
"power_curve_values",
"power_coefficient_curve_wind_speeds",
"power_coefficient_curve_values",
"thrust_coefficient_curve_wind_speeds",
"thrust_coefficient_curve_values",
],
axis=1,
).set_index("turbine_type")
# nominal power in W
turbine_data_df["nominal_power"] *= 1000
turbine_data_df.sort_index(inplace=True)
turbine_data_df.to_csv(filename.format("turbine_data"))
return turbine_data


Expand Down

0 comments on commit b4db824

Please sign in to comment.