From 24c1c9929c6415d181a3a818fc8c852b93a1ce13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thorbjoernl=20=28Thorbj=C3=B8rn=29?= <51087536+thorbjoernl@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:52:21 +0000 Subject: [PATCH 1/6] Set defaults explicitly and test for undocumented xarray behaviour --- pyaerocom/stats/mda8/mda8.py | 3 ++- tests/stats/mda8/test_mda8.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pyaerocom/stats/mda8/mda8.py b/pyaerocom/stats/mda8/mda8.py index 54b031c34..32cbb027e 100644 --- a/pyaerocom/stats/mda8/mda8.py +++ b/pyaerocom/stats/mda8/mda8.py @@ -100,10 +100,11 @@ def _calc_mda8(data: xr.DataArray) -> xr.DataArray: def _rolling_average_8hr(arr: xr.DataArray) -> xr.DataArray: + # Labeling should be right which probably is the default in xarray. return arr.rolling(time=8, min_periods=6).mean() def _daily_max(arr: xr.DataArray) -> xr.DataArray: - return arr.resample(time="24H", offset="1h").reduce( + return arr.resample(time="24H", origin="start_day", label="left", offset="1h").reduce( lambda x, axis: np.apply_along_axis(min_periods_max, 1, x, min_periods=18) ) diff --git a/tests/stats/mda8/test_mda8.py b/tests/stats/mda8/test_mda8.py index 476c5650d..fbf9a6995 100644 --- a/tests/stats/mda8/test_mda8.py +++ b/tests/stats/mda8/test_mda8.py @@ -154,6 +154,26 @@ def test_rollingaverage(test_data, exp_ravg): assert all(ravg[0, :, 0] == pytest.approx(exp_ravg, abs=0, nan_ok=True)) +def test_rollingaverage_label(): + """ + Checks that the labels of rolling average is correct (we want it to be labeled based + on the "left-most" interval, ie. earliest measurement). This seems to be the + current behaviour of xarray (but not well documented) and this test should + detect if this behaviour changes in xarray in the future. + """ + data = xr.DataArray( + [[[x] for x in range(24)]], + dims=["data_source", "time", "station_name"], + coords={"time": xr.date_range(start="2024-01-01 00:00", periods=24, freq="1h")}, + ) + + ravg = _rolling_average_8hr(data) + + assert np.all( + ravg.get_index("time") == xr.date_range(start="2024-01-01 00:00", periods=24, freq="1h") + ) + + @pytest.mark.parametrize( "time,values,exp_daily_max", ( From 338984a6386117793e4db306c3f6e219b74ec63d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thorbjoernl=20=28Thorbj=C3=B8rn=29?= <51087536+thorbjoernl@users.noreply.github.com> Date: Fri, 23 Aug 2024 07:21:05 +0000 Subject: [PATCH 2/6] Fix mda8 misaligned --- pyaerocom/stats/mda8/mda8.py | 9 +++++++-- tests/stats/mda8/test_mda8.py | 30 +++++++++++++++--------------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pyaerocom/stats/mda8/mda8.py b/pyaerocom/stats/mda8/mda8.py index 32cbb027e..88e118f99 100644 --- a/pyaerocom/stats/mda8/mda8.py +++ b/pyaerocom/stats/mda8/mda8.py @@ -100,8 +100,13 @@ def _calc_mda8(data: xr.DataArray) -> xr.DataArray: def _rolling_average_8hr(arr: xr.DataArray) -> xr.DataArray: - # Labeling should be right which probably is the default in xarray. - return arr.rolling(time=8, min_periods=6).mean() + # Xarray labels the data left, while we want it to be labeled right, so we add 8h to + # the time index. + new_arr = arr.rolling(time=8, min_periods=6).mean() + # Xarray labels the data left, while we want it to be labeled right, so we add 8h to + # the time index. + new_arr["time"] = new_arr.get_index("time") + pd.Timedelta("8h") + return new_arr def _daily_max(arr: xr.DataArray) -> xr.DataArray: diff --git a/tests/stats/mda8/test_mda8.py b/tests/stats/mda8/test_mda8.py index fbf9a6995..64f53000f 100644 --- a/tests/stats/mda8/test_mda8.py +++ b/tests/stats/mda8/test_mda8.py @@ -1,6 +1,7 @@ import numpy as np import pytest import xarray as xr +import pandas as pd from pyaerocom.colocation.colocated_data import ColocatedData from pyaerocom.stats.mda8.mda8 import ( @@ -32,7 +33,7 @@ def test_data(time, values) -> xr.DataArray: pytest.param( xr.date_range(start="2024-01-01 01:00", periods=49, freq="1h"), np.linspace(start=1, stop=49, num=49), - [20.5, 44.5, np.nan], + [np.nan, 36.5, np.nan], id="incrementing-by-1", ), pytest.param( @@ -49,6 +50,13 @@ def test_data(time, values) -> xr.DataArray: [np.nan] * 3, id="with-nans", ), + # https://github.com/metno/pyaerocom/issues/1323 + pytest.param( + xr.date_range(start="2024-01-01 06:00:00", periods=30, freq="1h"), + np.arange(30), + [np.nan, 25.5], + id="#1323", + ), ), ) def test_calc_mda8(test_data, exp_mda8): @@ -90,26 +98,17 @@ def test_coldata_to_mda8(coldata): assert mda8.shape == (2, 8, 1) assert mda8.data.values[0, :, 0] == pytest.approx( - [ - np.nan, - np.nan, - 1.18741556, - 1.18777241, - 1.18869106, - 1.18879322, - 1.18807846, - 1.18700801, - ], + [np.nan, np.nan, 1.18853785, 1.18604125, 1.18869106, 1.18879322, 1.18807846, 1.18700801], abs=10**-5, nan_ok=True, ) assert mda8.data.values[1, :, 0] == pytest.approx( [ - 1.57327333, + np.nan, 1.28884431, - 1.28741556, - 1.28777241, + 1.28853785, + 1.28604125, 1.28869106, 1.28879322, 1.28807846, @@ -170,7 +169,8 @@ def test_rollingaverage_label(): ravg = _rolling_average_8hr(data) assert np.all( - ravg.get_index("time") == xr.date_range(start="2024-01-01 00:00", periods=24, freq="1h") + ravg.get_index("time") + == xr.date_range(start="2024-01-01 00:00", periods=24, freq="1h") + pd.Timedelta("8h") ) From 7b70bb18e2b631b8ae1875a3953e2a5c294e856d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thorbjoernl=20=28Thorbj=C3=B8rn=29?= <51087536+thorbjoernl@users.noreply.github.com> Date: Fri, 23 Aug 2024 07:27:00 +0000 Subject: [PATCH 3/6] fix --- tests/stats/mda8/test_mda8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/stats/mda8/test_mda8.py b/tests/stats/mda8/test_mda8.py index 64f53000f..1b9d7ee70 100644 --- a/tests/stats/mda8/test_mda8.py +++ b/tests/stats/mda8/test_mda8.py @@ -27,7 +27,7 @@ def test_data(time, values) -> xr.DataArray: pytest.param( xr.date_range(start="2024-01-01 01:00", periods=49, freq="1h"), [0.0] * 49, - [0, 0, np.nan], + [np.nan, 0, np.nan], id="zeros", ), pytest.param( From 1fa4729e7eb106baa252676f8f581ac03140322c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thorbjoernl=20=28Thorbj=C3=B8rn=29?= <51087536+thorbjoernl@users.noreply.github.com> Date: Fri, 23 Aug 2024 07:52:59 +0000 Subject: [PATCH 4/6] . --- tests/stats/mda8/test_mda8.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/stats/mda8/test_mda8.py b/tests/stats/mda8/test_mda8.py index 1b9d7ee70..707a727d2 100644 --- a/tests/stats/mda8/test_mda8.py +++ b/tests/stats/mda8/test_mda8.py @@ -156,9 +156,9 @@ def test_rollingaverage(test_data, exp_ravg): def test_rollingaverage_label(): """ Checks that the labels of rolling average is correct (we want it to be labeled based - on the "left-most" interval, ie. earliest measurement). This seems to be the - current behaviour of xarray (but not well documented) and this test should - detect if this behaviour changes in xarray in the future. + on the "right-most" interval, ie. latest measurement in the interval). + + https://github.com/metno/pyaerocom/issues/1323 """ data = xr.DataArray( [[[x] for x in range(24)]], From f08893760b1626493027440c9b4d4f329425ec37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thorbjoernl=20=28Thorbj=C3=B8rn=29?= <51087536+thorbjoernl@users.noreply.github.com> Date: Fri, 23 Aug 2024 12:01:44 +0000 Subject: [PATCH 5/6] Undo right-shifting --- pyaerocom/stats/mda8/mda8.py | 9 ++------- tests/stats/mda8/test_mda8.py | 21 ++++++++++----------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/pyaerocom/stats/mda8/mda8.py b/pyaerocom/stats/mda8/mda8.py index 88e118f99..99ce4bbe4 100644 --- a/pyaerocom/stats/mda8/mda8.py +++ b/pyaerocom/stats/mda8/mda8.py @@ -100,13 +100,8 @@ def _calc_mda8(data: xr.DataArray) -> xr.DataArray: def _rolling_average_8hr(arr: xr.DataArray) -> xr.DataArray: - # Xarray labels the data left, while we want it to be labeled right, so we add 8h to - # the time index. - new_arr = arr.rolling(time=8, min_periods=6).mean() - # Xarray labels the data left, while we want it to be labeled right, so we add 8h to - # the time index. - new_arr["time"] = new_arr.get_index("time") + pd.Timedelta("8h") - return new_arr + # Xarray labels the data right based on the last data point in the period for rolling. + return arr.rolling(time=8, center=False, min_periods=6).mean() def _daily_max(arr: xr.DataArray) -> xr.DataArray: diff --git a/tests/stats/mda8/test_mda8.py b/tests/stats/mda8/test_mda8.py index 707a727d2..d81fb8294 100644 --- a/tests/stats/mda8/test_mda8.py +++ b/tests/stats/mda8/test_mda8.py @@ -1,7 +1,6 @@ import numpy as np import pytest import xarray as xr -import pandas as pd from pyaerocom.colocation.colocated_data import ColocatedData from pyaerocom.stats.mda8.mda8 import ( @@ -27,13 +26,13 @@ def test_data(time, values) -> xr.DataArray: pytest.param( xr.date_range(start="2024-01-01 01:00", periods=49, freq="1h"), [0.0] * 49, - [np.nan, 0, np.nan], + [0, 0, np.nan], id="zeros", ), pytest.param( xr.date_range(start="2024-01-01 01:00", periods=49, freq="1h"), np.linspace(start=1, stop=49, num=49), - [np.nan, 36.5, np.nan], + [20.5, 44.5, np.nan], id="incrementing-by-1", ), pytest.param( @@ -54,7 +53,7 @@ def test_data(time, values) -> xr.DataArray: pytest.param( xr.date_range(start="2024-01-01 06:00:00", periods=30, freq="1h"), np.arange(30), - [np.nan, 25.5], + [np.nan, np.nan], id="#1323", ), ), @@ -98,17 +97,17 @@ def test_coldata_to_mda8(coldata): assert mda8.shape == (2, 8, 1) assert mda8.data.values[0, :, 0] == pytest.approx( - [np.nan, np.nan, 1.18853785, 1.18604125, 1.18869106, 1.18879322, 1.18807846, 1.18700801], + [np.nan, np.nan, 1.18741556, 1.18777241, 1.18869106, 1.18879322, 1.18807846, 1.18700801], abs=10**-5, nan_ok=True, ) assert mda8.data.values[1, :, 0] == pytest.approx( [ - np.nan, + 1.57327333, 1.28884431, - 1.28853785, - 1.28604125, + 1.28741556, + 1.28777241, 1.28869106, 1.28879322, 1.28807846, @@ -156,7 +155,8 @@ def test_rollingaverage(test_data, exp_ravg): def test_rollingaverage_label(): """ Checks that the labels of rolling average is correct (we want it to be labeled based - on the "right-most" interval, ie. latest measurement in the interval). + on the "right-most" interval, ie. latest measurement in the interval). This is currently + the case in xarray but not greatly documented, so this test checks for that. https://github.com/metno/pyaerocom/issues/1323 """ @@ -169,8 +169,7 @@ def test_rollingaverage_label(): ravg = _rolling_average_8hr(data) assert np.all( - ravg.get_index("time") - == xr.date_range(start="2024-01-01 00:00", periods=24, freq="1h") + pd.Timedelta("8h") + ravg.get_index("time") == xr.date_range(start="2024-01-01 00:00", periods=24, freq="1h") ) From ce69c9c2c4c2b3969a56e4dfd8c3b86a1e46b4f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?thorbjoernl=20=28Thorbj=C3=B8rn=29?= <51087536+thorbjoernl@users.noreply.github.com> Date: Fri, 23 Aug 2024 12:09:33 +0000 Subject: [PATCH 6/6] Address code review --- tests/stats/mda8/test_mda8.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/stats/mda8/test_mda8.py b/tests/stats/mda8/test_mda8.py index d81fb8294..583ac2896 100644 --- a/tests/stats/mda8/test_mda8.py +++ b/tests/stats/mda8/test_mda8.py @@ -168,9 +168,8 @@ def test_rollingaverage_label(): ravg = _rolling_average_8hr(data) - assert np.all( - ravg.get_index("time") == xr.date_range(start="2024-01-01 00:00", periods=24, freq="1h") - ) + assert ravg["time"].isel(time=0) == np.datetime64("2024-01-01 00:00") + assert ravg["time"].isel(time=23) == np.datetime64("2024-01-01 23:00") @pytest.mark.parametrize(