diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 849a9a579e..2b2230a59c 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -857,7 +857,7 @@ def _update_timerange(self): dataset.facets.pop('timerange') dataset.supplementaries = [] check.data_availability(dataset) - intervals = [_get_start_end_date(f.name) for f in dataset.files] + intervals = [_get_start_end_date(f) for f in dataset.files] min_date = min(interval[0] for interval in intervals) max_date = max(interval[1] for interval in intervals) diff --git a/esmvalcore/esgf/_search.py b/esmvalcore/esgf/_search.py index 87d4cdf095..62882a5345 100644 --- a/esmvalcore/esgf/_search.py +++ b/esmvalcore/esgf/_search.py @@ -168,7 +168,7 @@ def select_by_time(files, timerange): for file in files: start_date, end_date = _parse_period(timerange) try: - start, end = _get_start_end_date(file.name) + start, end = _get_start_end_date(file) except ValueError: # If start and end year cannot be read from the filename # just select everything. 
diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 67007e9608..9ebee00073 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -7,7 +7,7 @@ import re from glob import glob from pathlib import Path -from typing import Any, Union +from typing import TYPE_CHECKING, Any, Union import iris import isodate @@ -17,17 +17,19 @@ from .exceptions import RecipeError from .typing import Facets, FacetValue +if TYPE_CHECKING: + from .esgf import ESGFFile + logger = logging.getLogger(__name__) def _get_from_pattern(pattern, date_range_pattern, stem, group): """Get time, date or datetime from date range patterns in file names.""" - # # Next string allows to test that there is an allowed delimiter (or # string start or end) close to date range (or to single date) start_point = end_point = None context = r"(?:^|[-_]|$)" - # + # First check for a block of two potential dates date_range_pattern_with_context = context + date_range_pattern + context daterange = re.search(date_range_pattern_with_context, stem) @@ -37,6 +39,7 @@ def _get_from_pattern(pattern, date_range_pattern, stem, group): date_range_pattern_with_context = (context + date_range_pattern + context) daterange = re.search(date_range_pattern_with_context, stem) + if daterange: start_point = daterange.group(group) end_group = '_'.join([group, 'end']) @@ -59,41 +62,72 @@ def _get_from_pattern(pattern, date_range_pattern, stem, group): return start_point, end_point -def _get_start_end_date(filename): +def _get_start_end_date( + file: str | Path | LocalFile | ESGFFile +) -> tuple[str, str]: """Get the start and end dates as a string from a file name. - Examples of allowed dates : 1980, 198001, 19801231, - 1980123123, 19801231T23, 19801231T2359, 19801231T235959, - 19801231T235959Z (ISO 8601). + Examples of allowed dates: 1980, 198001, 1980-01, 19801231, 1980-12-31, + 1980123123, 19801231T23, 19801231T2359, 19801231T235959, 19801231T235959Z + (ISO 8601). + + Dates must be surrounded by '-', '_' or '.' 
(the latter is used by CMIP3 + data), or string start or string end (after removing filename suffix). + + Look first for two dates separated by '-', '_' or '_cat_' (the latter is + used by CMIP3 data), then for one single date, and if there are multiple, + for one date at start or end. + + Parameters + ---------- + file: + The file to read the start and end date from. + + Returns + ------- + tuple[str, str] + The start and end date. - Dates must be surrounded by - or _ or string start or string end - (after removing filename suffix). + Raises + ------ + ValueError + Start or end date cannot be determined. - Look first for two dates separated by - or _, then for one single - date, and if they are multiple, for one date at start or end. """ - stem = Path(filename).stem + if hasattr(file, 'name'): # Path, LocalFile, ESGFFile + stem = Path(file.name).stem + else: # str + stem = Path(file).stem + start_date = end_date = None - # + + # Build regex time_pattern = (r"(?P<hour>[0-2][0-9]" r"(?P<minute>[0-5][0-9]" r"(?P<second>[0-5][0-9])?)?Z?)") date_pattern = (r"(?P<year>[0-9]{4})" - r"(?P<month>[01][0-9]" - r"(?P<day>[0-3][0-9]" + r"(?P<month>-?[01][0-9]" + r"(?P<day>-?[0-3][0-9]" rf"(T?{time_pattern})?)?)?") datetime_pattern = (rf"(?P<datetime>{date_pattern})") - # end_datetime_pattern = datetime_pattern.replace(">", "_end>") - date_range_pattern = datetime_pattern + r"[-_]" + end_datetime_pattern + + # Dates can either be delimited by '-', '_', or '_cat_' (the latter for + # CMIP3) + date_range_pattern = ( + datetime_pattern + r"[-_](?:cat_)?" 
+ end_datetime_pattern + ) + + # Find dates using the regex start_date, end_date = _get_from_pattern(datetime_pattern, date_range_pattern, stem, 'datetime') # As final resort, try to get the dates from the file contents - if (start_date is None or end_date is None) and Path(filename).exists(): - logger.debug("Must load file %s for daterange ", filename) - cubes = iris.load(filename) + if ((start_date is None or end_date is None) and + isinstance(file, (str, Path)) and Path(file).exists()): + logger.debug("Must load file %s for daterange ", file) + cubes = iris.load(file) for cube in cubes: logger.debug(cube) @@ -109,12 +143,30 @@ def _get_start_end_date(filename): break if start_date is None or end_date is None: - raise ValueError(f'File {filename} dates do not match a recognized ' - 'pattern and time can not be read from the file') + raise ValueError( + f"File {file} datetimes do not match a recognized pattern and " + f"time coordinate can not be read from the file" + ) + + # Remove potential '-' characters from datetimes + start_date = start_date.replace('-', '') + end_date = end_date.replace('-', '') return start_date, end_date +def _get_start_end_year( + file: str | Path | LocalFile | ESGFFile +) -> tuple[int, int]: + """Get the start and end year as int from a file name. + + See :func:`_get_start_end_date`. + + """ + (start_date, end_date) = _get_start_end_date(file) + return (int(start_date[:4]), int(end_date[:4])) + + def _dates_to_timerange(start_date, end_date): """Convert ``start_date`` and ``end_date`` to ``timerange``. @@ -162,72 +214,6 @@ def _replace_years_with_timerange(variable): variable.pop('end_year', None) -def _get_start_end_year(file): - """Get the start and end year from a file name. - - Examples of allowed dates : 1980, 198001, 19801231, - 1980123123, 19801231T23, 19801231T2359, 19801231T235959, - 19801231T235959Z (ISO 8601). - - Dates must be surrounded by - or _ or string start or string end - (after removing filename suffix). 
- - Look first for two dates separated by - or _, then for one single - date, and if they are multiple, for one date at start or end. - - Parameters - ---------- - file: LocalFile or esmvalcore.esgf.ESGFFile - The file to read the start and end year from. - - Returns - ------- - tuple[int, int] - The start and end year. - - Raises - ------ - ValueError - When start or end year cannot be determined. - - """ - start_year = end_year = None - - time_pattern = (r"(?P<hour>[0-2][0-9]" - r"(?P<minute>[0-5][0-9]" - r"(?P<second>[0-5][0-9])?)?Z?)") - date_pattern = (r"(?P<year>[0-9]{4})" - r"(?P<month>[01][0-9]" - r"(?P<day>[0-3][0-9]" - rf"(T?{time_pattern})?)?)?") - - end_date_pattern = date_pattern.replace(">", "_end>") - date_range_pattern = date_pattern + r"[-_]" + end_date_pattern - start_year, end_year = _get_from_pattern(date_pattern, date_range_pattern, - Path(file.name).stem, 'year') - # As final resort, try to get the dates from the file contents - if ((start_year is None or end_year is None) and isinstance(file, Path) - and file.exists()): - logger.debug("Must load file %s for daterange ", file) - cubes = iris.load(file) - - for cube in cubes: - logger.debug(cube) - try: - time = cube.coord('time') - except iris.exceptions.CoordinateNotFoundError: - continue - start_year = time.cell(0).point.year - end_year = time.cell(-1).point.year - break - - if start_year is None or end_year is None: - raise ValueError(f'File {file} dates do not match a recognized ' - 'pattern and time can not be read from the file') - - return int(start_year), int(end_year) - - def _parse_period(timerange): """Parse `timerange` values given as duration periods. 
diff --git a/tests/unit/local/test_time.py b/tests/unit/local/test_time.py index 57dd8252c7..a01f1b4d05 100644 --- a/tests/unit/local/test_time.py +++ b/tests/unit/local/test_time.py @@ -1,7 +1,11 @@ """Unit tests for time related functions in `esmvalcore.local`.""" +from pathlib import Path + import iris +import pyesgf import pytest +from esmvalcore.esgf import ESGFFile from esmvalcore.local import ( LocalFile, _dates_to_timerange, @@ -11,6 +15,22 @@ _truncate_dates, ) + +def _get_esgf_file(path): + """Get ESGFFile object.""" + result = pyesgf.search.results.FileResult( + json={ + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], + 'project': ['CMIP6'], + 'size': 10, + 'title': path, + }, + context=None, + ) + return ESGFFile([result]) + + FILENAME_CASES = [ ['var_whatever_1980-1981', 1980, 1981], ['var_whatever_1980.nc', 1980, 1980], @@ -31,7 +51,13 @@ 2015, 2015 ], ['pr_A1.186101-200012.nc', 1861, 2000], - ['tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc', None, None], + ['tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc', 1990, 1999], + ['E5sf00_1M_1940_032.grb', 1940, 1940], + ['E5sf00_1D_1998-04_167.grb', 1998, 1998], + ['E5sf00_1H_1986-04-11_167.grb', 1986, 1986], + ['E5sf00_1M_1940-1941_032.grb', 1940, 1941], + ['E5sf00_1D_1998-01_1999-12_167.grb', 1998, 1999], + ['E5sf00_1H_2000-01-01_2001-12-31_167.grb', 2000, 2001], ] FILENAME_DATE_CASES = [ @@ -57,7 +83,13 @@ '20150101T000000Z', '20150101T000000Z' ], ['pr_A1.186101-200012.nc', '186101', '200012'], - ['tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc', None, None], + ['tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc', '199001', '199912'], + ['E5sf00_1M_1940_032.grb', '1940', '1940'], + ['E5sf00_1D_1998-04_167.grb', '199804', '199804'], + ['E5sf00_1H_1986-04-11_167.grb', '19860411', '19860411'], + ['E5sf00_1M_1940-1941_032.grb', '1940', '1941'], + ['E5sf00_1D_1998-01_1999-12_167.grb', '199801', '199912'], + ['E5sf00_1H_2000-01-01_2001-12-31_167.grb', 
'20000101', '20011231'], ] @@ -65,38 +97,68 @@ def test_get_start_end_year(case): """Tests for _get_start_end_year function.""" filename, case_start, case_end = case - filename = LocalFile(filename) + + # If the filename is inconclusive or too difficult we resort to reading the + # file, which fails here because the file is not there. if case_start is None and case_end is None: - # If the filename is inconclusive or too difficult - # we resort to reading the file, which fails here - # because the file is not there. with pytest.raises(ValueError): _get_start_end_year(filename) + with pytest.raises(ValueError): + _get_start_end_year(Path(filename)) + with pytest.raises(ValueError): + _get_start_end_year(LocalFile(filename)) + with pytest.raises(ValueError): + _get_start_end_year(_get_esgf_file(filename)) + else: start, end = _get_start_end_year(filename) assert case_start == start assert case_end == end + start, end = _get_start_end_year(Path(filename)) + assert case_start == start + assert case_end == end + start, end = _get_start_end_year(LocalFile(filename)) + assert case_start == start + assert case_end == end + start, end = _get_start_end_year(_get_esgf_file(filename)) + assert case_start == start + assert case_end == end @pytest.mark.parametrize('case', FILENAME_DATE_CASES) def test_get_start_end_date(case): """Tests for _get_start_end_date function.""" filename, case_start, case_end = case - filename = LocalFile(filename) + + # If the filename is inconclusive or too difficult we resort to reading the + # file, which fails here because the file is not there. if case_start is None and case_end is None: - # If the filename is inconclusive or too difficult - # we resort to reading the file, which fails here - # because the file is not there. 
with pytest.raises(ValueError): _get_start_end_date(filename) + with pytest.raises(ValueError): + _get_start_end_date(Path(filename)) + with pytest.raises(ValueError): + _get_start_end_date(LocalFile(filename)) + with pytest.raises(ValueError): + _get_start_end_date(_get_esgf_file(filename)) + else: start, end = _get_start_end_date(filename) assert case_start == start assert case_end == end + start, end = _get_start_end_date(Path(filename)) + assert case_start == start + assert case_end == end + start, end = _get_start_end_date(LocalFile(filename)) + assert case_start == start + assert case_end == end + start, end = _get_start_end_date(_get_esgf_file(filename)) + assert case_start == start + assert case_end == end -def test_read_time_from_cube(monkeypatch, tmp_path): - """Try to get time from cube if no date in filename.""" +def test_read_years_from_cube(monkeypatch, tmp_path): + """Try to get years from cube if no date in filename.""" monkeypatch.chdir(tmp_path) temp_file = LocalFile('test.nc') cube = iris.cube.Cube([0, 0], var_name='var') @@ -111,7 +173,7 @@ def test_read_time_from_cube(monkeypatch, tmp_path): def test_read_datetime_from_cube(monkeypatch, tmp_path): - """Try to get time from cube if no date in filename.""" + """Try to get datetime from cube if no date in filename.""" monkeypatch.chdir(tmp_path) temp_file = 'test.nc' cube = iris.cube.Cube([0, 0], var_name='var') @@ -133,12 +195,15 @@ def test_raises_if_unable_to_deduce(monkeypatch, tmp_path): iris.save(cube, temp_file) with pytest.raises(ValueError): _get_start_end_date(temp_file) + with pytest.raises(ValueError): + _get_start_end_year(temp_file) def test_fails_if_no_date_present(): """Test raises if no date is present.""" - with pytest.raises((ValueError, OSError)): + with pytest.raises((ValueError)): _get_start_end_date('var_whatever') + with pytest.raises((ValueError)): _get_start_end_year('var_whatever')