diff --git a/README.md b/README.md index e35c4c9..e248fb3 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ print(pyaro.timeseries.filters.list) with engines['csv_timeseries'].open( filename=TEST_FILE, - filters=[pyaro.timeseries.filters.get('countries', include=['NO'])] + filters={'countries': {'include': ['NO']}} ) as ts: for var in ts.variables(): # stations diff --git a/docs/tutorials/basic_api.ipynb b/docs/tutorials/basic_api.ipynb index d32b406..2bef6c3 100644 --- a/docs/tutorials/basic_api.ipynb +++ b/docs/tutorials/basic_api.ipynb @@ -6,12 +6,12 @@ "source": [ "# Pyaro basic example\n", "\n", - "Install pyaro and check if installation is new enough:" + "* Install pyaro and check if installation is new enough:" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -20,7 +20,7 @@ "'0.0.5'" ] }, - "execution_count": 2, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -34,13 +34,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Check a list of installed engines. The most basic installation will install only the `csv_timeseries` engine.\n", + "* Check a list of installed engines. The most basic installation will install only the `csv_timeseries` engine.\n", "Install e.g. `https://github.com/metno/pyaro-readers` for many more engines." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -49,7 +49,7 @@ "{'csv_timeseries': }" ] }, - "execution_count": 3, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -62,12 +62,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Learn a bit about the engine." + "* Learn a bit about the engine." 
] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -144,12 +144,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Check the description and the open-arguments to open a database with this engine:" + "* Check the description and the open-arguments to open a database with this engine:" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -170,6 +170,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Opening a datasource with an engine\n", + "\n", "Open now the timeseries `ts` with a table. You could do that with a `with` clause in larger code, \n", "but for simplicity, we don't do that here. `columns` map the files columns to the data, starting\n", "with first column as 0, which contains the variable-name in our example file.\n", @@ -180,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -212,12 +214,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`ts` is now the handle to all data." + "`ts` is now the handle to the data-source.\n", + "\n", + "* Accessing metadata in the datasource, like available variables and stations" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -225,7 +229,7 @@ "output_type": "stream", "text": [ "dict_keys(['SOx', 'NOx'])\n", - "{'station1': , 'station2': }\n" + "{'station1': , 'station2': }\n" ] } ], @@ -234,11 +238,26 @@ "print(ts.stations())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* The timeseries must be accessed per variable. It will be returned for all\n", + "stations. 
The data-columns can be accessed by `keys()`:" + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 24, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('values', 'stations', 'latitudes', 'longitudes', 'altitudes', 'start_times', 'end_times', 'flags', 'standard_deviations')\n" + ] + }, { "data": { "text/plain": [ @@ -265,7 +284,7 @@ " 93.348305 , 97.57919 , 19.217777 , 11.676097 ], dtype=float32)" ] }, - "execution_count": 15, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -273,6 +292,7 @@ "source": [ "var = 'SOx'\n", "ts_data = ts.data(var)\n", + "print(ts_data.keys())\n", "ts_data.stations\n", "ts_data.start_times\n", "ts_data.end_times\n", @@ -287,12 +307,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Conversion to pandas\n", + "\n", "For pandas users, the timeseries data can be converted to a dataframe:" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -495,7 +517,7 @@ "[104 rows x 9 columns]" ] }, - "execution_count": 16, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/tutorials/filters.ipynb b/docs/tutorials/filters.ipynb new file mode 100644 index 0000000..dae4390 --- /dev/null +++ b/docs/tutorials/filters.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working with Pyaro filters\n", + "\n", + "Filters in Pyaro exist to reduce or modify the amount of data delivered by a database.\n", + "\n", + "Pyaro has a set of built-in filters under `pyaro.timeseries.filters`. 
In addition, engines can add\n", + "additional filters for their specific engine.\n", + "\n", + "## Listing the default filters" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mappingproxy({'variables': VariableNameFilter(**{'reader_to_new': {}, 'include': [], 'exclude': []}),\n", + " 'stations': StationFilter(**{'include': [], 'exclude': []}),\n", + " 'countries': CountryFilter(**{'include': [], 'exclude': []}),\n", + " 'bounding_boxes': BoundingBoxFilter(**{'include': [], 'exclude': []}),\n", + " 'flags': FlagFilter(**{'include': [], 'exclude': []}),\n", + " 'time_bounds': TimeBoundsFilter(**{'start_include': [], 'start_exclude': [], 'startend_include': [], 'startend_exclude': [], 'end_include': [], 'end_exclude': []})})" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyaro\n", + "\n", + "pyaro.timeseries.filters.list()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The keys of the returned dictionary, i.e. variables, stations, ... should be used to get an initialized filter, e.g. 
a\n", + "country-filter selecting only Norway:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "countries\n", + "CountryFilter(**{'include': ['NO'], 'exclude': []})\n" + ] + } + ], + "source": [ + "norway_filter = pyaro.timeseries.filters.get('countries', **{'include': ['NO']})\n", + "print(norway_filter.name())\n", + "print(norway_filter)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Listing the filters of an engine" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[VariableNameFilter(**{'reader_to_new': {}, 'include': [], 'exclude': []}),\n", + " StationFilter(**{'include': [], 'exclude': []}),\n", + " CountryFilter(**{'include': [], 'exclude': []}),\n", + " BoundingBoxFilter(**{'include': [], 'exclude': []}),\n", + " TimeBoundsFilter(**{'start_include': [], 'start_exclude': [], 'startend_include': [], 'startend_exclude': [], 'end_include': [], 'end_exclude': []}),\n", + " FlagFilter(**{'include': [], 'exclude': []})]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pr_csv = pyaro.list_timeseries_engines()['csv_timeseries']\n", + "pr_csv.supported_filters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Programmatic and Declarative usage of filters\n", + "\n", + "When opening the data-source/the database, these filters can be given as a dictionary or a list.\n", + "The following two open-calls are equivalent. The first one is programmatic, while the second one\n", + "is declarative. The declarative version is often preferred for use in larger programs like pyaerocom." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"{'station': 'station1', 'latitude': 10.5, 'longitude': 172.5, 'altitude': 0.0, 'long_name': 'station1', 'country': 'NO', 'url': ''}\", \"{'station': 'station2', 'latitude': 45.5, 'longitude': -103.2, 'altitude': 0.0, 'long_name': 'station2', 'country': 'NO', 'url': ''}\"]\n", + "[\"{'station': 'station1', 'latitude': 10.5, 'longitude': 172.5, 'altitude': 0.0, 'long_name': 'station1', 'country': 'NO', 'url': ''}\", \"{'station': 'station2', 'latitude': 45.5, 'longitude': -103.2, 'altitude': 0.0, 'long_name': 'station2', 'country': 'NO', 'url': ''}\"]\n" + ] + } + ], + "source": [ + "ts = pyaro.open_timeseries('csv_timeseries', filename=\"../../tests/testdata/csvReader_testdata.csv\", filters=[norway_filter])\n", + "print([str(stat) for stat in ts.stations().values()])\n", + "ts = pyaro.open_timeseries('csv_timeseries', filename=\"../../tests/testdata/csvReader_testdata.csv\", filters={'countries': {'include': ['NO']}})\n", + "print([str(stat) for stat in ts.stations().values()])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filter-Usage outside of a Reader\n", + "\n", + "Sometimes users want to work with an existing reader with different sets of filters. Many filters (all that inherit\n", + "from `DataIndexFilter`) can work with an existing reader. The `FilterCollection` helps to bundle these filters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "all SOx data: 104\n", + "filtered SOx data: 52\n" + ] + } + ], + "source": [ + "ts = pyaro.open_timeseries('csv_timeseries', filename=\"../../tests/testdata/csvReader_testdata.csv\")\n", + "fc = pyaro.timeseries.FilterCollection({\n", + " \"countries\": {\"include\": [\"NO\"]},\n", + " \"stations\": {\"include\": [\"station1\"]},\n", + " })\n", + "print(\"all SOx data:\", len(ts.data(\"SOx\"))) # 104\n", + "print(\"filtered SOx data:\", len(fc.filter(ts, \"SOx\"))) # 52" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to apply several filter collections to the same data without re-reading it from the reader,\n", + "you can use `FilterCollection.filter_data`, as shown here for filtering the data more explicitly." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data-points for SOx and station1: 52\n", + "Data-points for SOx and station2: 52\n" + ] + } + ], + "source": [ + "# store some information the filters might need\n", + "stations = ts.stations()\n", + "variables = ts.variables()\n", + "all_data = ts.data(\"SOx\")\n", + "for station in stations.keys():\n", + " fc = pyaro.timeseries.FilterCollection({\n", + " \"countries\": {\"include\": [\"NO\"]},\n", + " \"stations\": {\"include\": [station]},\n", + " })\n", + " data = fc.filter_data(all_data, stations, variables)\n", + " print(f\"Data-points for {data.variable} and {data.stations[0]}: {len(data)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filtering data without using a Filter\n", + "\n", + "The `Data` returned from a reader can also be sliced with a numpy-index array (boolean array with the same\n", + "size as data). 
The following example will only give data-points for low latitudes between 20° south and 20° north (i.e. only station1,\n", + "see the stations listing above.)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "low latitude data-points: 52\n" + ] + } + ], + "source": [ + "low_lat_data = all_data.slice((all_data.latitudes < 20) & (all_data.latitudes > -20))\n", + "print(\"low latitude data-points: \", len(low_lat_data))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst index d7b8425..0429898 100644 --- a/docs/tutorials/index.rst +++ b/docs/tutorials/index.rst @@ -10,9 +10,10 @@ Getting started :hidden: basic_api.ipynb + filters.ipynb This section contains tutorials that are meant to help you getting started quickly with pyaro. 
- `Basic pyaro API usage `__ | *basic_api.ipynb* - +- `pyaro Filter usage `__ | *filters.ipynb* diff --git a/setup.cfg b/setup.cfg index 3bc5392..99fc906 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = pyaro -version = 0.0.5 +version = 0.0.6 author = Heiko Klein, Daniel Heinesen author_email = Heiko.Klein@met.no description = pyaro py-aerocom reader objects diff --git a/src/pyaro/__version__.py b/src/pyaro/__version__.py deleted file mode 100644 index 8b13789..0000000 --- a/src/pyaro/__version__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/src/pyaro/timeseries/Filter.py b/src/pyaro/timeseries/Filter.py index 68dc180..ffe444a 100644 --- a/src/pyaro/timeseries/Filter.py +++ b/src/pyaro/timeseries/Filter.py @@ -69,6 +69,23 @@ def __repr__(self): return f"{type(self).__name__}(**{self.init_kwargs()})" +class DataIndexFilter(Filter): + """An abstract baseclass implementing filter_data by an abstract method + filter_data_idx""" + + @abc.abstractmethod + def filter_data_idx(self, data: Data, stations: dict[str, Station], variables: str): + """Filter data to an index which can be applied to Data.slice(idx) later + + :return: an index for Data.slice(idx) + """ + pass + + def filter_data(self, data: Data, stations: dict[str, Station], variables: str): + idx = self.filter_data_idx(data, stations, variables) + return data.slice(idx) + + class FilterFactoryException(Exception): pass @@ -110,6 +127,62 @@ def list(self) -> dict[str, Filter]: filters = FilterFactory() +class FilterCollectionException(Exception): + pass + + +class FilterCollection: + def __init__(self, filterlist=()): + """A collection of DataIndexFilters which can be applied together. 
+ + :param filterlist: a list of DataIndexFilter instances, or a dict mapping filter-names to the keyword-arguments passed to filters.get, defaults to () + :raises FilterCollectionException: if an entry is not a DataIndexFilter + """ + self._filters = [] + tmp_filterlist = [] + if isinstance(filterlist, dict): + for name, kwargs in filterlist.items(): + tmp_filterlist.append(filters.get(name, **kwargs)) + else: + tmp_filterlist = filterlist + for f in tmp_filterlist: + self.add(f) + + def add(self, difilter: DataIndexFilter): + if not isinstance(difilter, DataIndexFilter): + raise FilterCollectionException( + "filter not a DataIndexFilter, so can't add to collection" + ) + else: + self._filters.append(difilter) + + def filter_data( + self, data: Data, stations: dict[str, Station], variables: str + ) -> Data: + """Filter data with all filters in this collection. + + :param data: Data from a timeseries-reader, i.e. retrieved by ts.data(varname) + :param stations: stations-dict of a reader, i.e. retrieved by ts.stations() + :param variables: variables of a reader, i.e. retrieved by ts.variables() + :return: the data left after applying every filter of the collection, in the order the filters were added + """ + for fi in self._filters: + data = fi.filter_data(data, stations, variables) + return data + + def filter(self, ts_reader, variable: str) -> Data: + """Filter the data for a variable of a reader with all filters in this collection. 
+ + :param ts_reader: a timeseries-reader instance + :param variable: a valid variable-name + :return: filtered data + """ + stations = ts_reader.stations() + variables = ts_reader.variables() + data = ts_reader.data(variable) + return self.filter_data(data, stations, variables) + + class VariableNameFilter(Filter): """Filter to change variable-names and/or include/exclude variables""" @@ -203,23 +276,6 @@ def has_reader_variable(self, variable) -> bool: filters.register(VariableNameFilter()) -class DataIndexFilter(Filter): - """A abstract baseclass implementing filter_data by an abstract method - filter_data_idx""" - - @abc.abstractmethod - def filter_data_idx(self, data: Data, stations: dict[str, Station], variables: str): - """Filter data to an index which can be applied to Data.slice(idx) later - - :return: a index for Data.slice(idx) - """ - pass - - def filter_data(self, data: Data, stations: dict[str, Station], variables: str): - idx = self.filter_data_idx(data, stations, variables) - return data.slice(idx) - - class StationReductionFilter(DataIndexFilter): """Abstract method for all filters, which work on reducing the number of stations only. 
diff --git a/src/pyaro/timeseries/__init__.py b/src/pyaro/timeseries/__init__.py index 20a42ac..a841d6a 100644 --- a/src/pyaro/timeseries/__init__.py +++ b/src/pyaro/timeseries/__init__.py @@ -2,4 +2,4 @@ from .Engine import Engine from .Reader import Reader from .Station import Station -from .Filter import filters +from .Filter import filters, FilterCollection diff --git a/tests/test_CSVTimeSeriesReader.py b/tests/test_CSVTimeSeriesReader.py index 19e665d..61d8b21 100644 --- a/tests/test_CSVTimeSeriesReader.py +++ b/tests/test_CSVTimeSeriesReader.py @@ -216,6 +216,21 @@ def test_filterFactory(self): print(filters["variables"]) self.assertTrue(True) + def test_filterCollection(self): + with pyaro.open_timeseries( + "csv_timeseries", + filename=self.file, + ) as ts: + filters = pyaro.timeseries.FilterCollection( + { + "countries": {"include": ["NO"]}, + "stations": {"include": ["station1"]}, + } + ) + data1 = ts.data("SOx")  # unfiltered + data2 = filters.filter(ts, "SOx")  # filtered by the collection + self.assertEqual(len(data1), 2 * len(data2))  # expect exactly half the rows + @unittest.skipUnless(has_pandas, "no pandas installed") def test_timeseries_data_to_pd(self): with pyaro.open_timeseries(