added dataframe_to_xarray function (#86)

* dataframe_to_xarray function
* added testcase
* updated docs
* updated history.rst
* added dataframe_to_xarray to measurement jupyter notebook
Deltares · Apr 8, 2024 · d343edf · d343edf
1 parent 63c2018
commit d343edf
Show file tree

Hide file tree

Showing 7 changed files with 668 additions and 87 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -6,6 +6,7 @@ UNRELEASED
 ------------------
 * added `catalog_filter` argument to `ddlpy.locations()` to enable retrieving the extended catalog in https://github.com/Deltares/ddlpy/pull/87
 * pass all Code parameters to measurements request instead of only four in https://github.com/Deltares/ddlpy/pull/88
+* added `ddlpy.dataframe_to_xarray()` function in https://github.com/Deltares/ddlpy/pull/86
 
 0.3.0 (2023-03-13)
 ------------------

diff --git a/ddlpy/__init__.py b/ddlpy/__init__.py
@@ -11,13 +11,15 @@
                          measurements_latest, 
                          measurements_available, 
                          measurements_amount,
-                         simplify_dataframe,
                          )
+from ddlpy.utils import (simplify_dataframe,
+                         dataframe_to_xarray)
 
 __all__ = ['locations', 
            'measurements',
            'measurements_latest', 
            'measurements_available', 
            'measurements_amount',
            'simplify_dataframe',
+           'dataframe_to_xarray'
            ]
diff --git a/ddlpy/ddlpy.py b/ddlpy/ddlpy.py
@@ -401,24 +401,3 @@ def measurements_latest(location):
     if result['Succesvol']:
         df = _combine_waarnemingenlijst(result, location)
         return df
-
-
-def simplify_dataframe(df: pd.DataFrame):
-    """
-    drop columns with constant values from the dataframe
-    and collect them in a dictionary which is 
-    added as attrs of the dataframe
-    """
-
-    bool_constant = (df == df.iloc[0]).all()
-
-    # constant columns are flattened and converted to dict of attrs
-    df_attrs = df.loc[:, bool_constant].iloc[0].to_dict()
-
-    # varying columns are kept in output dataframe
-    df_simple = df.loc[:, ~bool_constant]
-
-    # attach as attrs to dataframe
-    df_simple.attrs = df_attrs
-
-    return df_simple
diff --git a/ddlpy/utils.py b/ddlpy/utils.py
@@ -1,5 +1,7 @@
 import dateutil.rrule
 import itertools
+import pandas as pd
+
 
 def date_series(start, end, freq=dateutil.rrule.MONTHLY):
     """return a list of start and end date over the timespan start[->end following the frequency rule"""
@@ -26,3 +28,102 @@ def pairwise(it):
         # remove it
         del result[-1]
     return result
+
+
+def simplify_dataframe(df: pd.DataFrame):
+    """
+    Drop columns with constant values from the dataframe and collect them 
+    in a dictionary which is added as attrs of the dataframe.
+    """
+
+    bool_constant = (df == df.iloc[0]).all()
+
+    # constant columns are flattened and converted to dict of attrs
+    df_attrs = df.loc[:, bool_constant].iloc[0].to_dict()
+
+    # varying columns are kept in output dataframe
+    df_simple = df.loc[:, ~bool_constant].copy()
+
+    # attach as attrs to dataframe
+    df_simple.attrs = df_attrs
+
+    return df_simple
+
+
+def code_description_attrs_from_dataframe(df: pd.DataFrame):
+    # create var_attrs_dict
+    colname_code_list = df.columns[df.columns.str.contains(".Code")]
+    var_attrs_dict = {}
+    for colname_code in colname_code_list:
+        colname_oms = colname_code.replace(".Code",".Omschrijving")
+        meas_twocol = df[[colname_code,colname_oms]].drop_duplicates()
+        attr_dict = meas_twocol.set_index(colname_code)[colname_oms].to_dict()
+        var_attrs_dict[colname_code] = attr_dict
+    return var_attrs_dict
+
+
+def dataframe_to_xarray(df: pd.DataFrame, drop_if_constant=[]):
+    """
+    Converts the measurement dataframe to a xarray dataset,
+    including several cleanups to minimize the size of the netcdf dataset on disk:
+    
+    - The column 'Parameter_Wat_Omschrijving' is dropped (combination of information in other columns)
+    - The column 'Meetwaarde.Waarde_Alfanumeriek' is dropped if 'Meetwaarde.Waarde_Numeriek' is present (contains duplicate values in that case)
+    - All Omschrijving columns are dropped and added as attributes to the Code variables
+    - All NVT-only Code columns are dropped and added as ds attributes
+    - All location columns are dropped and added as ds attributes
+    - All drop_if_constant columns are dropped and added as ds attributes (if the values are indeed constant)
+    
+    """
+
+    # create list of columns with duplicate info (often not constant), will be dropped
+    cols_bulky = ["Parameter_Wat_Omschrijving"]
+    if "Meetwaarde.Waarde_Alfanumeriek" in df.columns and 'Meetwaarde.Waarde_Numeriek' in df.columns:
+        # drop alfanumeriek if duplicate of numeriek # TODO: should not be returned by ddl
+        cols_bulky.append("Meetwaarde.Waarde_Alfanumeriek")
+
+    # create list of all omschrijving columns, will be dropped (added as ds[varn].attrs via code_description_attrs_from_dataframe())
+    cols_omschrijving = df.columns[df.columns.str.contains(".Omschrijving")].tolist()
+
+    # create list of all-NVT *.Code columns, will be dropped (codes added as ds.attrs)
+    bool_onlynvt_code = (df=='NVT').all(axis=0)
+    cols_onlynvt_code = df.columns[bool_onlynvt_code].tolist()
+    cols_onlynvt_code = [x for x in cols_onlynvt_code if x.endswith(".Code")]
+
+    # create list of location columns, will be dropped (added as ds.attrs)
+    cols_location = ['Code', 'Naam', 'Coordinatenstelsel', 'X', 'Y']
+
+    # add drop_if_constant columns to list if values are indeed constant, will be dropped (added as ds.attrs)
+    cols_constant = []
+    for colname in drop_if_constant:
+        assert colname in df.columns
+        if len(df[colname].drop_duplicates()) == 1:
+            cols_constant.append(colname)
+
+    # create ds attrs for all nvt/location/constant columns
+    ds_attrs = {}
+    attrs_columns = cols_onlynvt_code + cols_constant + cols_location
+    for colname in attrs_columns:
+        ds_attrs[colname] = df[colname].iloc[0]
+
+    # drop columns 
+    drop_columns = (cols_bulky + cols_location + cols_constant +
+                    cols_onlynvt_code + cols_omschrijving)
+    df_simple = df.drop(drop_columns, axis=1, errors='ignore')
+
+    # convert to UTC to please xarray
+    # TODO: adding tzone to time.encoding['units'] raises "ValueError: invalid time units: 1970-01-01 00:00:00 +01:00"
+    df_simple.index = df_simple.index.tz_convert(None)
+
+    # convert to xarray dataset and add ds_attrs
+    ds = df_simple.to_xarray()
+    ds = ds.assign_attrs(ds_attrs)
+
+    # assign attrs with code+omschrijving to each *.Code variable
+    var_attrs_dict = code_description_attrs_from_dataframe(df)
+    for varn in ds.data_vars:
+        if varn in var_attrs_dict.keys():
+            var_attrs = var_attrs_dict[varn]
+            ds[varn] = ds[varn].assign_attrs(var_attrs)
+
+    return ds
diff --git a/docs/modules.rst b/docs/modules.rst
@@ -17,3 +17,4 @@ ddlpy module
    :members:
    :undoc-members:
    :show-inheritance:
+   :member-order: bysource
diff --git a/notebooks/measurements.ipynb b/notebooks/measurements.ipynb
diff --git a/tests/test_ddlpy.py b/tests/test_ddlpy.py
@@ -155,14 +155,6 @@ def test_measurements_duplicated(measurements):
     assert isinstance(meas_clean.index, pd.DatetimeIndex)
 
 
-def test_simplify_dataframe(measurements):
-    assert len(measurements.columns) == 53
-    meas_simple = ddlpy.simplify_dataframe(measurements)
-    assert hasattr(meas_simple, "attrs")
-    assert len(meas_simple.attrs) == 50
-    assert len(meas_simple.columns) == 3
-
-
 datetype_list = ["string", "pd.Timestamp", "dt.datetime", "mixed"]
 @pytest.mark.parametrize("datetype", datetype_list)
 def test_check_convert_dates(datetype):
@@ -193,3 +185,43 @@ def test_check_convert_wrongorder():
     with pytest.raises(ValueError):
         start_date_out, end_date_out = ddlpy.ddlpy._check_convert_dates(end_date, start_date)
 
+
+def test_simplify_dataframe(measurements):
+    assert len(measurements.columns) == 53
+    meas_simple = ddlpy.simplify_dataframe(measurements)
+    assert hasattr(meas_simple, "attrs")
+    assert len(meas_simple.attrs) == 50
+    assert len(meas_simple.columns) == 3
+
+
+def test_dataframe_to_xarray(measurements):
+    drop_if_constant = ["WaarnemingMetadata.OpdrachtgevendeInstantieLijst",
+                        "WaarnemingMetadata.BemonsteringshoogteLijst",
+                        "WaarnemingMetadata.ReferentievlakLijst",
+                        "AquoMetadata_MessageID", 
+                        "BemonsteringsSoort.Code", 
+                        "Compartiment.Code", "Eenheid.Code", "Grootheid.Code", "Hoedanigheid.Code",
+                        ]
+    ds_clean = ddlpy.dataframe_to_xarray(measurements, drop_if_constant)
+
+    # check if constant value that was not in drop_if_constant list is indeed not dropped
+    assert "MeetApparaat.Code" in ds_clean.data_vars
+    assert len(ds_clean["MeetApparaat.Code"]) > 0
+
+    for varname in drop_if_constant:
+        if varname == "WaarnemingMetadata.OpdrachtgevendeInstantieLijst":
+            continue
+        assert varname not in ds_clean.data_vars
+        assert varname in ds_clean.attrs.keys()
+    assert "WaarnemingMetadata.OpdrachtgevendeInstantieLijst" in ds_clean.data_vars
+    assert "WaarnemingMetadata.OpdrachtgevendeInstantieLijst" not in ds_clean.attrs.keys()
+
+    data_vars_list = ['WaarnemingMetadata.StatuswaardeLijst',
+     'WaarnemingMetadata.KwaliteitswaardecodeLijst',
+     'MeetApparaat.Code',
+     'WaardeBepalingsmethode.Code',
+     'Meetwaarde.Waarde_Numeriek']
+    for varname in data_vars_list:
+        assert varname in ds_clean.data_vars
+
+    assert "X" in ds_clean.attrs.keys()