Merge pull request #1315 from metno/cloud_filtering_1310
EARLINET cloud filtering
lewisblake authored Aug 20, 2024
2 parents f73e411 + 9e48af9 commit 7c6d8b8
Showing 3 changed files with 74 additions and 19 deletions.
46 changes: 33 additions & 13 deletions pyaerocom/aeroval/coldatatojson_helpers.py
@@ -16,10 +16,17 @@
 from pyaerocom._warnings import ignore_warnings
 from pyaerocom.aeroval.exceptions import ConfigError, TrendsError
 from pyaerocom.aeroval.fairmode_stats import fairmode_stats
-from pyaerocom.aeroval.helpers import _get_min_max_year_periods, _period_str_to_timeslice
+from pyaerocom.aeroval.helpers import (
+    _get_min_max_year_periods,
+    _period_str_to_timeslice,
+)
 from pyaerocom.config import ALL_REGION_NAME
 from pyaerocom.exceptions import DataCoverageError, TemporalResolutionError
-from pyaerocom.region import Region, find_closest_region_coord, get_all_default_region_ids
+from pyaerocom.region import (
+    Region,
+    find_closest_region_coord,
+    get_all_default_region_ids,
+)
 from pyaerocom.region_defs import HTAP_REGIONS_DEFAULT, OLD_AEROCOM_REGIONS
 from pyaerocom.stats.stats import _init_stats_dummy, calculate_statistics
 from pyaerocom.trends_engine import TrendsEngine

@@ -627,7 +634,7 @@ def _make_regional_trends(
             station_obs_trends.append(obs_trend)
         except TrendsError as e:
             msg = f"Failed to calculate trends, and will skip. This was due to {e}"
-            logger.warning(msg)
+            logger.info(msg)

     if len(station_obs_trends) == 0 or len(station_mod_trends) == 0:
         trends_successful = False

@@ -813,7 +820,7 @@ def process_trends(
             trends_successful = True
         except TrendsError as e:
             msg = f"Failed to calculate trends, and will skip. This was due to {e}"
-            logger.warning(msg)
+            logger.info(msg)

         if trends_successful:
             # The whole trends dicts are placed in the stats dict

@@ -1028,7 +1035,7 @@ def _process_map_and_scat(

                 except TrendsError as e:
                     msg = f"Failed to calculate trends, and will skip. This was due to {e}"
-                    logger.warning(msg)
+                    logger.info(msg)

                 perstr = f"{per}-{season}"
                 map_stat[freq][perstr] = stats

@@ -1046,7 +1053,11 @@ def _process_map_and_scat(
                     obs = mod = jsdate = scat_dummy
                 else:
                     obs, mod = obs_vals.tolist(), mod_vals.tolist()
-                scat_data[site][perstr] = {"obs": obs, "mod": mod, "date": jsdate}
+                scat_data[site][perstr] = {
+                    "obs": obs,
+                    "mod": mod,
+                    "date": jsdate,
+                }

     return (map_data, scat_data)
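
For orientation, the reformatted entry above stores, per site and per period-season key, the co-located obs/mod value lists and their dates. A sketch of the resulting structure; the site name, the values, and the millisecond-epoch reading of jsdate are illustrative assumptions, not taken from the diff:

    # Illustrative shape of scat_data after _process_map_and_scat;
    # "Leipzig" and all values are made up for the example.
    scat_data = {
        "Leipzig": {
            "2010-2020-all": {
                "obs": [0.12, 0.08],  # observed values
                "mod": [0.10, 0.09],  # co-located model values
                "date": [1262304000000, 1262390400000],  # assumed ms-since-epoch timestamps
            }
        }
    }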

@@ -1547,13 +1558,17 @@ def process_profile_data_for_regions(
                     subset = per_season_subset.filter_region(
                         region_id=region_id, check_country_meta=use_country
                     )

-                    output["obs"][freq][perstr] = np.nanmean(subset.data[0, :, :])
-                    output["mod"][freq][perstr] = np.nanmean(subset.data[1, :, :])
+                    with ignore_warnings(
+                        RuntimeWarning,
+                        "Mean of empty slice",
+                        "All-NaN slice encountered",
+                    ):
+                        output["obs"][freq][perstr] = np.nanmean(subset.data[0, :, :])
+                        output["mod"][freq][perstr] = np.nanmean(subset.data[1, :, :])

                 except (DataCoverageError, TemporalResolutionError) as e:
                     msg = f"Failed to access subset timeseries, and will skip. Reason was: {e}"
-                    logger.warning(msg)
+                    logger.info(msg)

                     output["obs"][freq][perstr] = np.nan
                     output["mod"][freq][perstr] = np.nan

@@ -1614,12 +1629,17 @@ def process_profile_data_for_stations(
                     == station_name,  # in this case a station
                 ]  # Assumes ordering of station name matches

-                output["obs"][freq][perstr] = np.nanmean(subset.data[0, :, :])
-                output["mod"][freq][perstr] = np.nanmean(subset.data[1, :, :])
+                with ignore_warnings(
+                    RuntimeWarning,
+                    "Mean of empty slice",
+                    "All-NaN slice encountered",
+                ):
+                    output["obs"][freq][perstr] = np.nanmean(subset.data[0, :, :])
+                    output["mod"][freq][perstr] = np.nanmean(subset.data[1, :, :])

                 except (DataCoverageError, TemporalResolutionError) as e:
                     msg = f"Failed to access subset timeseries, and will skip. Reason was: {e}"
-                    logger.warning(msg)
+                    logger.info(msg)

                     output["obs"][freq][perstr] = np.nan
                     output["mod"][freq][perstr] = np.nan
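
Both hunks above wrap the np.nanmean calls so that the RuntimeWarnings NumPy emits for empty or all-NaN slices no longer flood the log; the NaN result itself is kept and handled downstream. A minimal stand-in for that behaviour using only the standard library (an assumption about what pyaerocom's ignore_warnings does, based on the call signature shown in the diff):

    import warnings

    import numpy as np

    # An all-NaN 2-D slice is normal for sparse regions/periods: np.nanmean
    # returns NaN but also emits RuntimeWarning "Mean of empty slice".
    data = np.full((2, 4, 4), np.nan)

    with warnings.catch_warnings():
        # Suppress only the two messages named in the diff; any other
        # RuntimeWarning still surfaces as usual.
        warnings.filterwarnings("ignore", message="Mean of empty slice", category=RuntimeWarning)
        warnings.filterwarnings("ignore", message="All-NaN slice encountered", category=RuntimeWarning)
        obs_mean = np.nanmean(data[0, :, :])

    print(obs_mean)  # nan, with no warning printed
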
31 changes: 26 additions & 5 deletions pyaerocom/aeroval/experiment_output.py
@@ -8,7 +8,12 @@
 import aerovaldb

 from pyaerocom import const
-from pyaerocom._lowlevel_helpers import DirLoc, StrType, TypeValidator, sort_dict_by_name
+from pyaerocom._lowlevel_helpers import (
+    DirLoc,
+    StrType,
+    TypeValidator,
+    sort_dict_by_name,
+)
 from pyaerocom.aeroval.collections import ObsCollection
 from pyaerocom.aeroval.glob_defaults import (
     VariableInfo,

@@ -32,7 +37,8 @@
 from pyaerocom.variable_helpers import get_aliases

 MapInfo = namedtuple(
-    "MapInfo", ["obs_network", "obs_var", "vert_code", "mod_name", "mod_var", "time_period"]
+    "MapInfo",
+    ["obs_network", "obs_var", "vert_code", "mod_name", "mod_var", "time_period"],
 )

 logger = logging.getLogger(__name__)

@@ -94,9 +100,11 @@ def __init__(self, cfg: EvalSetup):
         self.cfg = cfg
         super().__init__(
             cfg.proj_id,
-            cfg.path_manager.json_basedir
-            if cfg.path_manager.avdb_resource is None
-            else cfg.path_manager.json_basedir,
+            (
+                cfg.path_manager.json_basedir
+                if cfg.path_manager.avdb_resource is None
+                else cfg.path_manager.json_basedir
+            ),
         )

         # dictionary that will be filled by json cleanup methods to check for

@@ -1068,6 +1076,19 @@ def add_profile_entry(
         ):  # only store incremental increases in the layers
             current[model_name]["z"].append(midpoint)

+        # old boilerplate to get around recursive_default_dict issues
+        if "obs" not in current[model_name]:
+            current[model_name]["obs"] = {}
+
+        if freq not in current[model_name]["obs"]:
+            current[model_name]["obs"][freq] = {}
+
+        if "mod" not in current[model_name]:
+            current[model_name]["mod"] = {}
+
+        if freq not in current[model_name]["mod"]:
+            current[model_name]["mod"][freq] = {}
+
         for per in periods:
             for season in seasons:
                 perstr = f"{per}-{season}"
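
The four membership guards added above only pre-create nested keys before they are written to. A compact equivalent, sketched with dict.setdefault under the assumption that current[model_name] behaves like a plain dict at this point:

    # Runnable sketch: each setdefault inserts an empty dict only when the
    # key is missing, mirroring the four "if ... not in ..." guards above.
    current = {"ModelA": {}}
    model_name, freq = "ModelA", "monthly"

    entry = current[model_name]
    entry.setdefault("obs", {}).setdefault(freq, {})
    entry.setdefault("mod", {}).setdefault(freq, {})

    assert current == {"ModelA": {"obs": {"monthly": {}}, "mod": {"monthly": {}}}}
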
16 changes: 15 additions & 1 deletion pyaerocom/io/read_earlinet.py
@@ -24,7 +24,7 @@ class ReadEarlinet(ReadUngriddedBase):
     _FILEMASK = "*.*"

     #: version log of this class (for caching)
-    __version__ = "0.17_" + ReadUngriddedBase.__baseversion__
+    __version__ = "0.18_" + ReadUngriddedBase.__baseversion__

     #: Name of dataset (OBS_ID)
     DATA_ID = const.EARLINET_NAME

@@ -35,6 +35,14 @@
     #: default variables for read method
     DEFAULT_VARS = ["bsc532aer", "ec532aer"]

+    # These are applied to all files. If new cloud filter names are discovered they should be added here.
+    # As of 20.08.24, however, there are still files with less reliable data which get through the filters.
+    # https://github.com/metno/pyaerocom/issues/1310
+    CLOUD_FILTERS = {
+        "cloud_mask_type": 0,  # "no_cloudmask_available manual_cloudmask automatic_cloudmask"
+        "cirrus_contamination": 2,  # "not_available no_cirrus cirrus_detected"
+    }
+
     #: all data values that exceed this number will be set to NaN on read. This
     #: is because iris, xarray, etc. assign a FILL VALUE of the order of e36
     #: to missing data in the netcdf files
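
The integer values in CLOUD_FILTERS index into the netCDF flag-meaning strings quoted in the comments: cloud_mask_type == 0 corresponds to "no_cloudmask_available" and cirrus_contamination == 2 to "cirrus_detected". A small sketch of checking those flags outside the reader, reusing the same xarray calls the diff itself makes; the file name is a placeholder:

    import xarray

    CLOUD_FILTERS = {
        "cloud_mask_type": 0,  # 0 -> "no_cloudmask_available"
        "cirrus_contamination": 2,  # 2 -> "cirrus_detected"
    }

    # "example_profile.nc" is a hypothetical EARLINET file path.
    with xarray.open_dataset("example_profile.nc", engine="netcdf4") as ds:
        for name, bad_value in CLOUD_FILTERS.items():
            # Same check the reader performs: a flag variable present in
            # the file and equal to the flagged value marks it unreliable.
            if name in ds.variables and ds.variables[name].item() == bad_value:
                print(f"{name}={bad_value}: file would be cloud-filtered")
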
@@ -219,6 +227,12 @@ def read_file(self, filename, vars_to_retrieve=None, read_err=None, remove_outli
         self.logger.debug(f"Reading file {filename}")

         with xarray.open_dataset(filename, engine="netcdf4") as data_in:
+            for filter in self.CLOUD_FILTERS:
+                if filter in data_in.variables:
+                    if data_in.variables[filter].item() == self.CLOUD_FILTERS[filter]:
+                        self.logger.debug(f"Skipping {filename} due to cloud filtering")
+                        continue
+
             # getting the coords since no longer in metadata
             # Put also just in the attributes. not sure why appears twice
             data_out["station_coords"]["longitude"] = data_out["longitude"] = np.float64(
