Skip to content

Commit

Permalink
added dataframe_to_xarray function (#86)
Browse files Browse the repository at this point in the history
* dataframe_to_xarray function

* added testcase

* updated docs

* updated history.rst

* added dataframe_to_xarray to measurement jupyter notebook
  • Loading branch information
veenstrajelmer authored Apr 8, 2024
1 parent 63c2018 commit d343edf
Show file tree
Hide file tree
Showing 7 changed files with 668 additions and 87 deletions.
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ UNRELEASED
------------------
* added `catalog_filter` argument to `ddlpy.locations()` to enable retrieving the extended catalog in https://github.com/Deltares/ddlpy/pull/87
* pass all Code parameters to measurements request instead of only four in https://github.com/Deltares/ddlpy/pull/88
* added `ddlpy.dataframe_to_xarray()` function in https://github.com/Deltares/ddlpy/pull/86

0.3.0 (2023-03-13)
------------------
Expand Down
4 changes: 3 additions & 1 deletion ddlpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
measurements_latest,
measurements_available,
measurements_amount,
simplify_dataframe,
)
from ddlpy.utils import (simplify_dataframe,
dataframe_to_xarray)

__all__ = ['locations',
'measurements',
'measurements_latest',
'measurements_available',
'measurements_amount',
'simplify_dataframe',
'dataframe_to_xarray'
]
21 changes: 0 additions & 21 deletions ddlpy/ddlpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,24 +401,3 @@ def measurements_latest(location):
if result['Succesvol']:
df = _combine_waarnemingenlijst(result, location)
return df


def simplify_dataframe(df: pd.DataFrame):
    """
    Drop columns with constant values from the dataframe and collect them
    in a dictionary which is added as attrs of the dataframe.

    A column counts as constant when every value compares equal to the value
    in the first row. The input dataframe is left untouched; the returned
    dataframe is an independent copy that holds only the varying columns.
    """

    # per column: True if all rows are equal to the first row
    bool_constant = (df == df.iloc[0]).all()

    # constant columns are flattened and converted to dict of attrs
    df_attrs = df.loc[:, bool_constant].iloc[0].to_dict()

    # varying columns are kept in output dataframe, the explicit copy makes
    # sure that the result is never treated as a slice of the input dataframe
    df_simple = df.loc[:, ~bool_constant].copy()

    # attach as attrs to dataframe
    df_simple.attrs = df_attrs

    return df_simple
101 changes: 101 additions & 0 deletions ddlpy/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import dateutil.rrule
import itertools
import pandas as pd


def date_series(start, end, freq=dateutil.rrule.MONTHLY):
"""return a list of start and end date over the timespan start[->end following the frequency rule"""
Expand All @@ -26,3 +28,102 @@ def pairwise(it):
# remove it
del result[-1]
return result


def simplify_dataframe(df: pd.DataFrame):
    """
    Split a dataframe in varying columns and constant columns.

    The columns in which every value equals the value of the first row are
    removed from the result and stored (column name -> value) in the attrs
    of the returned dataframe. The returned dataframe is a copy that only
    contains the remaining (varying) columns.
    """

    first_row = df.iloc[0]

    # boolean per column, True when all rows equal the first row
    is_constant = (df == first_row).all()
    is_varying = ~is_constant

    # output dataframe with the varying columns only
    df_varying = df.loc[:, is_varying].copy()

    # the first row of the constant columns becomes the attrs dictionary
    constant_part = df.loc[:, is_constant]
    df_varying.attrs = constant_part.iloc[0].to_dict()

    return df_varying


def code_description_attrs_from_dataframe(df: pd.DataFrame):
    """
    Collect a code-to-description lookup for the Code columns of the dataframe.

    For every column with ".Code" in its name for which the accompanying
    ".Omschrijving" column is present, the unique (code, omschrijving)
    combinations are collected.

    Returns a dictionary with the Code column names as keys and a
    {code: omschrijving} dictionary as values. Code columns without an
    accompanying Omschrijving column are skipped.
    """
    var_attrs_dict = {}

    # regex=False: the dot has to be matched literally. With the default
    # regex matching the dot matches any character, so a column like
    # "PostCode" would incorrectly be treated as a Code column.
    bool_code = df.columns.str.contains(".Code", regex=False)

    for colname_code in df.columns[bool_code]:
        colname_oms = colname_code.replace(".Code", ".Omschrijving")
        if colname_oms not in df.columns:
            # no description available for this Code column
            continue
        # one row per unique code/omschrijving combination
        meas_twocol = df[[colname_code, colname_oms]].drop_duplicates()
        attr_dict = meas_twocol.set_index(colname_code)[colname_oms].to_dict()
        var_attrs_dict[colname_code] = attr_dict
    return var_attrs_dict


def dataframe_to_xarray(df: pd.DataFrame, drop_if_constant=None):
    """
    Converts the measurement dataframe to a xarray dataset,
    including several cleanups to minimize the size of the netcdf dataset on disk:
    - The column 'Parameter_Wat_Omschrijving' is dropped (combination of information in other columns)
    - The column 'Meetwaarde.Waarde_Alfanumeriek' is dropped if 'Meetwaarde.Waarde_Numeriek' is present (contains duplicate values in that case)
    - All Omschrijving columns are dropped and added as attributes to the Code variables
    - All NVT-only Code columns are dropped and added as ds attributes
    - All location columns that are present are dropped and added as ds attributes
    - All drop_if_constant columns are dropped and added as ds attributes (if the values are indeed constant)

    Parameters
    ----------
    df : pd.DataFrame
        Measurement dataframe. A timezone-aware DatetimeIndex is converted to
        timezone-naive UTC, the input dataframe itself is not modified.
    drop_if_constant : list of str, optional
        Column names that are moved to the ds attributes if they contain a
        single unique value. The default is None (no additional columns).

    Returns
    -------
    ds : xarray.Dataset
        Result of ``DataFrame.to_xarray()`` on the cleaned dataframe, with the
        dropped constant/NVT/location values as ds attributes.

    Raises
    ------
    AssertionError
        If a column from drop_if_constant is not present in the dataframe.
    """

    # None instead of a mutable default argument, [] is still accepted
    drop_if_constant = [] if drop_if_constant is None else list(drop_if_constant)

    # create list of columns with duplicate info (often not constant), will be dropped
    cols_bulky = ["Parameter_Wat_Omschrijving"]
    if "Meetwaarde.Waarde_Alfanumeriek" in df.columns and "Meetwaarde.Waarde_Numeriek" in df.columns:
        # drop alfanumeriek if duplicate of numeriek # TODO: should not be returned by ddl
        cols_bulky.append("Meetwaarde.Waarde_Alfanumeriek")

    # create list of all omschrijving columns, will be dropped (added as ds[varn].attrs via code_description_attrs_from_dataframe())
    # regex=False: match the dot literally instead of as "any character".
    # 'Parameter_Wat_Omschrijving' is still dropped since it is part of cols_bulky.
    bool_omschrijving = df.columns.str.contains(".Omschrijving", regex=False)
    cols_omschrijving = df.columns[bool_omschrijving].tolist()

    # create list of all-NVT *.Code columns, will be dropped (codes added as ds.attrs)
    bool_onlynvt_code = (df == 'NVT').all(axis=0)
    cols_onlynvt_code = [x for x in df.columns[bool_onlynvt_code] if x.endswith(".Code")]

    # create list of location columns, will be dropped (added as ds.attrs)
    # only the available columns are used, consistent with errors='ignore' in the drop below
    cols_location = [x for x in ['Code', 'Naam', 'Coordinatenstelsel', 'X', 'Y']
                     if x in df.columns]

    # add drop_if_constant colums to list if values are indeed constant, will be dropped (added as ds.attrs)
    cols_constant = []
    for colname in drop_if_constant:
        if colname not in df.columns:
            # explicit raise (instead of an assert statement) so the check
            # is not stripped when python runs with -O
            raise AssertionError(
                f"drop_if_constant column '{colname}' not present in dataframe"
            )
        if len(df[colname].drop_duplicates()) == 1:
            cols_constant.append(colname)

    # create ds attrs for all nvt/location/constant columns (value of first row)
    attrs_columns = cols_onlynvt_code + cols_constant + cols_location
    ds_attrs = {colname: df[colname].iloc[0] for colname in attrs_columns}

    # drop columns, this returns a new dataframe so the input is not altered
    drop_columns = (cols_bulky + cols_location + cols_constant +
                    cols_onlynvt_code + cols_omschrijving)
    df_simple = df.drop(drop_columns, axis=1, errors='ignore')

    # convert to UTC to please xarray, tz_convert(None) converts to UTC and removes the timezone
    # a timezone-naive index is left as is, since tz_convert() raises a TypeError for it
    # TODO: adding tzone to time.encoding['units'] raises "ValueError: invalid time units: 1970-01-01 00:00:00 +01:00"
    index = df_simple.index
    if isinstance(index, pd.DatetimeIndex) and index.tz is not None:
        df_simple.index = index.tz_convert(None)

    # convert to xarray dataset and add ds_attrs
    ds = df_simple.to_xarray()
    ds = ds.assign_attrs(ds_attrs)

    # assign attrs with code+omschrijving to each *.Code variable
    # loop over the attrs dict (not over ds.data_vars) to avoid updating ds while iterating over it
    var_attrs_dict = code_description_attrs_from_dataframe(df)
    for varn, var_attrs in var_attrs_dict.items():
        if varn in ds.data_vars:
            ds[varn] = ds[varn].assign_attrs(var_attrs)

    return ds
1 change: 1 addition & 0 deletions docs/modules.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ ddlpy module
:members:
:undoc-members:
:show-inheritance:
:member-order: bysource
579 changes: 522 additions & 57 deletions notebooks/measurements.ipynb

Large diffs are not rendered by default.

48 changes: 40 additions & 8 deletions tests/test_ddlpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,6 @@ def test_measurements_duplicated(measurements):
assert isinstance(meas_clean.index, pd.DatetimeIndex)


def test_simplify_dataframe(measurements):
    """
    Check the column/attrs counts before and after simplifying
    the measurements fixture.
    """
    n_columns_input = len(measurements.columns)
    assert n_columns_input == 53

    meas_simple = ddlpy.simplify_dataframe(measurements)
    assert hasattr(meas_simple, "attrs")

    # 50 columns are expected to end up in the attrs, 3 remain as columns
    n_attrs = len(meas_simple.attrs)
    assert n_attrs == 50
    n_columns_output = len(meas_simple.columns)
    assert n_columns_output == 3


datetype_list = ["string", "pd.Timestamp", "dt.datetime", "mixed"]
@pytest.mark.parametrize("datetype", datetype_list)
def test_check_convert_dates(datetype):
Expand Down Expand Up @@ -193,3 +185,43 @@ def test_check_convert_wrongorder():
with pytest.raises(ValueError):
start_date_out, end_date_out = ddlpy.ddlpy._check_convert_dates(end_date, start_date)


def test_simplify_dataframe(measurements):
    """
    The measurements fixture has 53 columns, after simplifying
    50 of them are expected in the attrs and 3 as columns.
    """
    expected_columns_in, expected_attrs, expected_columns_out = 53, 50, 3

    assert len(measurements.columns) == expected_columns_in

    simplified = ddlpy.simplify_dataframe(measurements)

    assert hasattr(simplified, "attrs")
    assert len(simplified.attrs) == expected_attrs
    assert len(simplified.columns) == expected_columns_out


def test_dataframe_to_xarray(measurements):
    """
    Convert the measurements fixture to a xarray dataset and check which
    columns end up as data variables and which as ds attributes.
    """
    # this column is requested to be dropped, but the test expects it to remain
    # a data variable (presumably it is not constant in the fixture)
    colname_kept = "WaarnemingMetadata.OpdrachtgevendeInstantieLijst"
    colnames_dropped = [
        "WaarnemingMetadata.BemonsteringshoogteLijst",
        "WaarnemingMetadata.ReferentievlakLijst",
        "AquoMetadata_MessageID",
        "BemonsteringsSoort.Code",
        "Compartiment.Code",
        "Eenheid.Code",
        "Grootheid.Code",
        "Hoedanigheid.Code",
    ]
    drop_if_constant = [colname_kept] + colnames_dropped
    ds_clean = ddlpy.dataframe_to_xarray(measurements, drop_if_constant)

    # check if constant value that was not in drop_if_constant list is indeed not dropped
    assert "MeetApparaat.Code" in ds_clean.data_vars
    assert len(ds_clean["MeetApparaat.Code"]) > 0

    # the dropped columns moved from the data variables to the ds attributes
    for colname in colnames_dropped:
        assert colname not in ds_clean.data_vars
        assert colname in ds_clean.attrs.keys()
    assert colname_kept in ds_clean.data_vars
    assert colname_kept not in ds_clean.attrs.keys()

    # these variables are expected to be available as data variables
    expected_data_vars = (
        "WaarnemingMetadata.StatuswaardeLijst",
        "WaarnemingMetadata.KwaliteitswaardecodeLijst",
        "MeetApparaat.Code",
        "WaardeBepalingsmethode.Code",
        "Meetwaarde.Waarde_Numeriek",
    )
    for colname in expected_data_vars:
        assert colname in ds_clean.data_vars

    # location information is expected in the ds attributes
    assert "X" in ds_clean.attrs.keys()

assert "X" in ds_clean.attrs.keys()

0 comments on commit d343edf

Please sign in to comment.