Skip to content

Commit

Permalink
Merge pull request #30 from metno/22-duplicate-filter
Browse files Browse the repository at this point in the history
adding duplicate data filter
  • Loading branch information
heikoklein authored May 27, 2024
2 parents 81a2ae3 + 54cfe1d commit 628f3bd
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 37 deletions.
6 changes: 3 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ long_description = file: README.md
long_description_content_type = text/markdown
classifiers =
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
Operating System :: OS Independent
Development Status :: 4 - Beta
Expand All @@ -21,7 +21,7 @@ url = https://pyaro.readthedocs.io


[options]
python_version = >=3.9
python_version = >=3.10
install_requires =
numpy
importlib-metadata >= 3.6; python_version < "3.10"
Expand All @@ -33,9 +33,9 @@ test_require = tox:tox
[tox:tox]
min_version = 4.0
env_list =
py312
py311
py310
py39
depends =
pandas

Expand Down
21 changes: 8 additions & 13 deletions src/pyaro/timeseries/AutoFilterReaderEngine.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .Filter import VariableNameFilter, Filter, filters, FilterFactory


class UnkownFilterException(Exception):
class UnknownFilterException(Exception):
pass


Expand All @@ -26,20 +26,15 @@ class AutoFilterReader(Reader):
"""

@classmethod
def supported_filters(cls) -> [Filter]:
def supported_filters(cls) -> list[Filter]:
"""Get the default list of implemented filters.
:return: list of filters
"""
filts = []
for (
f
) in "variables,stations,countries,bounding_boxes,time_bounds,flags,time_variable_station".split(
supported = "variables,stations,countries,bounding_boxes,duplicates,time_bounds,flags,time_variable_station".split(
","
):
filts.append(filters.get(f))

return filts
)
return [filters.get(name) for name in supported]

def _set_filters(self, filters):
supported = set()
Expand All @@ -53,12 +48,12 @@ def _set_filters(self, filters):
filters = filtlist
for filt in filters:
if filt.__class__ not in supported:
raise UnkownFilterException(
raise UnknownFilterException(
f"Filter {filt.__class__} not supported in {supported}."
)
self._filters = filters

def _get_filters(self) -> [Filter]:
def _get_filters(self) -> list[Filter]:
"""Get a list of filters actually set during initialization of this object.
:return: list of filters
Expand Down Expand Up @@ -116,7 +111,7 @@ def reader_class(self) -> Reader:
"""
pass

def supported_filters(self) -> [Filter]:
def supported_filters(self) -> list[Filter]:
"""The supported filters by this Engine. Maps to the Readers supported_filters.
:return: a list of filters
Expand Down
71 changes: 52 additions & 19 deletions src/pyaro/timeseries/Filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def name(self) -> str:
:return: a string to be used by FilterFactory
"""

def filter_data(self, data: Data, stations: [Station], variables: [str]) -> Data:
def filter_data(
self, data: Data, stations: list[Station], variables: list[str]
) -> Data:
"""Filtering of data
:param data: Data of e.g. a Reader.data(varname) call
Expand All @@ -61,7 +63,7 @@ def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]:
"""
return stations

def filter_variables(self, variables: [str]) -> [str]:
def filter_variables(self, variables: list[str]) -> list[str]:
"""Filtering of variables
:param variables: List of variables, e.g. from a Reader.variables() call
Expand Down Expand Up @@ -203,8 +205,8 @@ class VariableNameFilter(Filter):
def __init__(
self,
reader_to_new: dict[str, str] = {},
include: [str] = [],
exclude: [str] = [],
include: list[str] = [],
exclude: list[str] = [],
):
"""Create a new variable name filter.
Expand Down Expand Up @@ -251,7 +253,7 @@ def filter_data(self, data, stations, variables):
data._set_variable(self._reader_to_new.get(data.variable, data.variable))
return data

def filter_variables(self, variables: [str]) -> [str]:
def filter_variables(self, variables: list[str]) -> list[str]:
"""change variable name and reduce variables applying include and exclude parameters
:param variables: variable names as in the reader
Expand Down Expand Up @@ -308,7 +310,7 @@ def filter_data_idx(self, data: Data, stations: dict[str, Station], variables: s

@registered_filter
class StationFilter(StationReductionFilter):
def __init__(self, include: [str] = [], exclude: [str] = []):
def __init__(self, include: list[str] = [], exclude: list[str] = []):
self._include = set(include)
self._exclude = set(exclude)
return
Expand All @@ -333,7 +335,7 @@ def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]:

@registered_filter
class CountryFilter(StationReductionFilter):
def __init__(self, include: [str] = [], exclude: [str] = []):
def __init__(self, include: list[str] = [], exclude: list[str] = []):
"""Filter countries by ISO2 names (capitals!)
:param include: countries to include, defaults to [], meaning all countries
Expand Down Expand Up @@ -371,8 +373,8 @@ class BoundingBoxFilter(StationReductionFilter):

def __init__(
self,
include: [(float, float, float, float)] = [],
exclude: [(float, float, float, float)] = [],
include: list[(float, float, float, float)] = [],
exclude: list[(float, float, float, float)] = [],
):
"""Filter using geographical bounding-boxes. Coordinates should be given in the range
[-180,180] (degrees_east) for longitude and [-90,90] (degrees_north) for latitude.
Expand Down Expand Up @@ -460,7 +462,7 @@ def filter_stations(self, stations: dict[str, Station]) -> dict[str, Station]:

@registered_filter
class FlagFilter(DataIndexFilter):
def __init__(self, include: [Flag] = [], exclude: [Flag] = []):
def __init__(self, include: list[Flag] = [], exclude: list[Flag] = []):
"""Filter data by Flags
:param include: flags to include, defaults to [], meaning all flags
Expand Down Expand Up @@ -498,12 +500,12 @@ class TimeBoundsException(Exception):
class TimeBoundsFilter(DataIndexFilter):
def __init__(
self,
start_include: [(str, str)] = [],
start_exclude: [(str, str)] = [],
startend_include: [(str, str)] = [],
startend_exclude: [(str, str)] = [],
end_include: [(str, str)] = [],
end_exclude: [(str, str)] = [],
start_include: list[(str, str)] = [],
start_exclude: list[(str, str)] = [],
startend_include: list[(str, str)] = [],
startend_exclude: list[(str, str)] = [],
end_include: list[(str, str)] = [],
end_exclude: list[(str, str)] = [],
):
"""Filter data by start and/or end-times of the measurements. Each timebound consists
of a bound-start and bound-end (both included). Timestamps are given as YYYY-MM-DD HH:MM:SS
Expand All @@ -527,7 +529,7 @@ def __init__(
def name(self):
return "time_bounds"

def _str_list_to_datetime_list(self, tuple_list: [(str, str)]):
def _str_list_to_datetime_list(self, tuple_list: list[(str, str)]):
retlist = []
for start, end in tuple_list:
start_dt = datetime.strptime(start, self.time_format)
Expand All @@ -539,7 +541,7 @@ def _str_list_to_datetime_list(self, tuple_list: [(str, str)]):
retlist.append((start_dt, end_dt))
return retlist

def _datetime_list_to_str_list(self, tuple_list) -> [(str, str)]:
def _datetime_list_to_str_list(self, tuple_list) -> list[(str, str)]:
retlist = []
for start_dt, end_dt in tuple_list:
retlist.append(
Expand Down Expand Up @@ -579,7 +581,7 @@ def has_envelope(self):
or len(self._end_include)
)

def envelope(self) -> (datetime, datetime):
def envelope(self) -> tuple[datetime, datetime]:
"""Get the earliest and latest time possible for this filter.
:return: earliest start and end-time (approximately)
Expand Down Expand Up @@ -709,6 +711,37 @@ def filter_data_idx(self, data: Data, stations: dict[str, Station], variables: s
return idx


@registered_filter
class DuplicateFilter(DataIndexFilter):
default_keys = ["stations", "start_times", "end_times"]

def __init__(self, duplicate_keys: list[str] | None = None):
"""remove duplicates from the data. By default, data with common
station, start_time, end_time are consider duplicates. Only one of the duplicates
is kept.
:param duplicate_keys: list of data-fields/columns, defaults to None, being the same
as ["stations", "start_times", "end_times"]
"""
self._keys = duplicate_keys

def init_kwargs(self):
if self._keys is None:
return {}
else:
return {"duplicate_keys": self._keys}

def name(self):
return "duplicates"

def filter_data_idx(self, data: Data, stations: dict[str, Station], variables: str):
if self._keys is None:
xkeys = self.default_keys
else:
xkeys = self._keys
return np.unique(data[xkeys], return_index=True)[1]


if __name__ == "__main__":
for name, fil in filters._filters.items():
assert name == fil.name()
Expand Down
15 changes: 15 additions & 0 deletions tests/test_CSVTimeSeriesReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,21 @@ def test_variables_filter(self):
self.assertEqual(ts.data(newsox).variable, newsox)
pass

def test_duplicate_filter(self):
engine = pyaro.list_timeseries_engines()["csv_timeseries"]
with engine.open(
self.multifile_dir + "/csvReader_testdata2.csv",
filters={"duplicates": {"duplicate_keys": None}},
) as ts:
self.assertEqual(len(ts.data("NOx")), 8)
with engine.open(
self.multifile_dir + "/csvReader_testdata2.csv",
filters={
"duplicates": {"duplicate_keys": ["stations", "start_times", "values"]}
},
) as ts:
self.assertEqual(len(ts.data("NOx")), 10)

def test_filterFactory(self):
filters = pyaro.timeseries.filters.list()
print(filters["variables"])
Expand Down
4 changes: 2 additions & 2 deletions tests/testdata/datadir/csvReader_testdata2.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
NOx,station1,8,58,9.050856687678724,Gg,1998-01-01 00:00:00,1998-01-02 00:00:00,Rural
NOx,station1,8,58,14.281596629076395,Gg,1998-01-02 00:00:00,1998-01-03 00:00:00,Rural
NOx,station1,8,58,15.28751469312069,Gg,1998-01-03 00:00:00,1998-01-04 00:00:00,Rural
NOx,station1,8,58,14.281596629076395,Gg,1998-01-01 00:00:00,1998-01-02 00:00:00,Rural
NOx,station1,8,58,15.28751469312069,Gg,1998-01-01 00:00:00,1998-01-02 00:00:00,Rural
NOx,station1,8,58,7.9515805412807055,Gg,1998-01-04 00:00:00,1998-01-05 00:00:00,Rural
NOx,station1,8,58,13.455696217690287,Gg,1998-01-05 00:00:00,1998-01-06 00:00:00,Rural
NOx,station1,8,58,12.07610491450203,Gg,1998-01-06 00:00:00,1998-01-07 00:00:00,Rural
Expand Down

0 comments on commit 628f3bd

Please sign in to comment.