diff --git a/monetio/sat/__init__.py b/monetio/sat/__init__.py index adbe2029..5e9cd6ad 100644 --- a/monetio/sat/__init__.py +++ b/monetio/sat/__init__.py @@ -8,9 +8,11 @@ _tropomi_l2_no2_mm, goes, modis_ornl, - nesdis_edr_viirs, - nesdis_eps_viirs, + nesdis_avhrr_aot_aws_gridded, + nesdis_eps_viirs_aod_nrt, nesdis_frp, + nesdis_viirs_aod_aws_gridded, + nesdis_viirs_ndvi_aws_gridded, ) __all__ = [ @@ -23,9 +25,11 @@ "_tropomi_l2_no2_mm", "goes", "modis_ornl", - "nesdis_edr_viirs", - "nesdis_eps_viirs", + "nesdis_avhrr_aot_aws_gridded", + "nesdis_eps_viirs_aod_nrt", "nesdis_frp", + "nesdis_viirs_aod_aws_gridded", + "nesdis_viirs_ndvi_aws_gridded", ] __name__ = "sat" diff --git a/monetio/sat/nesdis_avhrr_aot_aws_gridded.py b/monetio/sat/nesdis_avhrr_aot_aws_gridded.py new file mode 100644 index 00000000..dae9b0ee --- /dev/null +++ b/monetio/sat/nesdis_avhrr_aot_aws_gridded.py @@ -0,0 +1,232 @@ +""" +NOAA Climate Data Record (CDR) Aerosol Optical Depth (AOD) Dataset Access Module + +This module provides access to NOAA's satellite-derived Aerosol Optical Depth data: + +Aerosol Optical Depth (AOD): + - Source: NOAA CDR AVHRR AOT (Aerosol Optical Thickness) + - Period: 1981-present + - Sensor: Advanced Very High Resolution Radiometer (AVHRR) + - Resolution: 0.1° x 0.1° (approximately 11km at equator) + - Coverage: Global over ocean + - Temporal Resolution: + * Daily averages + * Monthly averages + - Key Variables: + * aot_550: Aerosol Optical Thickness at 550nm + * number_of_retrievals: Number of valid retrievals + * quality_flags: Quality assurance flags + - AWS Path: noaa-cdr-aerosol-optical-thickness-pds/ + +Dataset Description: + The AVHRR AOT CDR provides a consistent, long-term record of aerosol optical + thickness over global oceans. This parameter is crucial for: + - Climate change studies + - Atmospheric correction + - Air quality monitoring + - Radiative forcing calculations + +Data Access: + Files are stored in NetCDF format on AWS S3, organized by: + - Daily data: /data/daily/YYYY/ + - Monthly data: /data/monthly/YYYY/ + +Usage: + >>> # Single date access (daily) + >>> dataset = open_dataset("2023-01-01") + + >>> # Monthly data + >>> dataset = open_dataset("2023-01-01", averaging_time=AveragingTime.MONTHLY) + + >>> # Multiple dates + >>> dates = pd.date_range("2023-01-01", "2023-01-10") + >>> dataset_multi = open_mfdataset(dates) + +References: + - Dataset Documentation: https://www.ncdc.noaa.gov/cdr/atmospheric/aerosol-optical-thickness + - Algorithm Theoretical Basis Document (ATBD): + https://www.ncdc.noaa.gov/cdr/atmospheric/aerosol-optical-thickness/documentation + +Notes: + - Data is only available over ocean surfaces + - Quality flags should be consulted for optimal data usage + - Monthly averages are computed from daily data +""" +from s3fs import S3FileSystem +from enum import Enum +from datetime import datetime +from pathlib import Path +from typing import List, Tuple, Union +import warnings +import pandas as pd +import s3fs +import xarray as xr + +AOD_BASE_PATH = "noaa-cdr-aerosol-optical-thickness-pds/data/daily" +AOD_FILE_PATTERN = "AOT_AVHRR_*_daily-avg_" + +class AveragingTime(Enum): + DAILY = "daily" + MONTHLY = "monthly" + +def create_daily_aod_list( + date_generated: List[datetime], + fs: S3FileSystem, + warning: bool = False +) -> Tuple[List[str], int]: + """ + Creates a list of daily AOD (Aerosol Optical Depth) files and calculates the total size of the files. + + Parameters: + date_generated (list): A list of dates for which to check the existence of AOD files. 
+ fs (FileSystem): The file system object used to check file existence and size. + warning (bool, optional): If True, warns instead of raising error when file not found. Defaults to False. + + Returns: + tuple[list[str | None], int]: A tuple containing: + - List of file paths (str) or None for missing files if warning=True + - Total size of the files in bytes + """ + # Loop through observation dates & check for files + nodd_file_list = [] + nodd_total_size = 0 + for date in date_generated: + file_date = date.strftime("%Y%m%d") + year = file_date[:4] + prod_path = Path(AOD_BASE_PATH) / year + file_names = fs.glob(str(prod_path / f"{AOD_FILE_PATTERN}{file_date}_*.nc")) + # If file exists, add path to list and add file size to total + if file_names: + nodd_file_list.extend(file_names) + nodd_total_size += sum(fs.size(f) for f in file_names) + else: + msg = f"File does not exist on AWS: {prod_path}/{AOD_FILE_PATTERN}{file_date}_*.nc" + if warning: + warnings.warn(msg) + nodd_file_list.append(None) + else: + raise ValueError(msg) + + return nodd_file_list, nodd_total_size + +def create_monthly_aod_list(date_generated, fs, warning=False): + """ + Creates a list of daily AOD (Aerosol Optical Depth) files and calculates the total size of the files. + + Parameters: + date_generated (list): A list of dates for which to check the existence of AOD files. + fs (FileSystem): The file system object used to check file existence and size. + + Returns: + tuple: A tuple containing the list of file paths and the total size of the files. + """ + # Loop through observation dates & check for files + nodd_file_list = [] + nodd_total_size = 0 + for date in date_generated: + file_date = date.strftime("%Y%m%d") + year = file_date[:4] + prod_path = "noaa-cdr-aerosol-optical-thickness-pds/data/monthly/" + year + "/" + patt = "AOT_AVHRR_*_daily-avg_" + file_names = fs.glob(prod_path + patt + file_date + "_*.nc") + # If file exists, add path to list and add file size to total + if file_names: + nodd_file_list.extend(file_names) + nodd_total_size += sum(fs.size(f) for f in file_names) + else: + msg = "File does not exist on AWS: " + prod_path + patt + file_date + "_*.nc" + if warning: + warnings.warn(msg) + nodd_file_list.append(None) + else: + raise ValueError(msg) + + return nodd_file_list, nodd_total_size + +def open_dataset( + date: Union[str, datetime], + averaging_time: AveragingTime = AveragingTime.DAILY +) -> xr.Dataset: + """ + Opens a dataset for the given date, satellite, data resolution, and averaging time. + + Parameters: + date (str or datetime.datetime): The date for which to open the dataset. + averaging_time (str, optional): The averaging time. + Valid values are 'daily', or 'monthly'. Defaults to 'daily'. + + Returns: + xarray.Dataset: The opened dataset. + + Raises: + ValueError: If the input values are invalid. 
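+
+    Example:
+        A minimal sketch of typical use (assumes anonymous read access to the
+        public AWS bucket and a working network connection):
+
+        >>> ds = open_dataset("2023-01-01")
+        >>> ds_monthly = open_dataset("2023-01-01", averaging_time=AveragingTime.MONTHLY)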
+ """ + if isinstance(date, str): + date_generated = [pd.Timestamp(date)] + else: + date_generated = [date] + + # Access AWS using anonymous credentials + fs = s3fs.S3FileSystem(anon=True) + + if averaging_time == AveragingTime.MONTHLY: + file_list, _ = create_monthly_aod_list(date_generated, fs) + elif averaging_time == AveragingTime.DAILY: + file_list, _ = create_daily_aod_list(date_generated, fs) + else: + raise ValueError( + f"Invalid input for 'averaging_time' {averaging_time!r}: " + "Valid values are 'daily' or 'monthly'" + ) + + if len(file_list) == 0 or all(f is None for f in file_list): + raise ValueError(f"Files not available for product and date: {date_generated[0]}") + + with fs.open(file_list[0]) as aws_file: + dset = xr.open_dataset(aws_file) + + return dset + +def open_mfdataset(dates, averaging_time: AveragingTime = AveragingTime.DAILY, error_missing=False): + """ + Opens and combines multiple NetCDF files into a single xarray dataset. + + Parameters: + dates (pandas.DatetimeIndex): The dates for which to retrieve the data. + averaging_time (str, optional): The averaging time. + Valid values are 'daily', 'weekly', or 'monthly'. Defaults to 'daily'. + + Returns: + xarray.Dataset: The combined dataset containing the data for the specified dates. + + Raises: + ValueError: If the input parameters are invalid. + + """ + from collections.abc import Iterable + + if isinstance(dates, Iterable) and not isinstance(dates, str): + dates = pd.DatetimeIndex(dates) + else: + dates = pd.DatetimeIndex([dates]) + + # Access AWS using anonymous credentials + fs = s3fs.S3FileSystem(anon=True) + + if averaging_time == AveragingTime.MONTHLY: + file_list, _ = create_monthly_aod_list(dates, fs, warning=not error_missing) + elif averaging_time == AveragingTime.DAILY: + file_list, _ = create_daily_aod_list(dates, fs, warning=not error_missing) + else: + raise ValueError( + f"Invalid input for 'averaging_time' {averaging_time!r}: " + "Valid values are 'daily' or 'monthly'" + ) + + if len(file_list) == 0 or all(f is None for f in file_list): + raise ValueError(f"Files not available for product and dates: {dates}") + + aws_files = [fs.open(f) for f in file_list if f is not None] + + with xr.open_mfdataset(aws_files, concat_dim="time", combine="nested") as dset: + return dset diff --git a/monetio/sat/nesdis_edr_viirs.py b/monetio/sat/nesdis_edr_viirs.py deleted file mode 100644 index 19512ea3..00000000 --- a/monetio/sat/nesdis_edr_viirs.py +++ /dev/null @@ -1,110 +0,0 @@ -import os - -import xarray as xr - -server = "ftp.star.nesdis.noaa.gov" -base_dir = "/pub/smcd/jhuang/npp.viirs.aerosol.data/edraot550/" - - -def open_dataset(date, resolution="high", datapath="."): - current = change_dir(datapath) # noqa: F841 - # check resolution; by default 0.1 degree data is assumed - if resolution in {"high", "h"}: - # this is the 0.1 degree data - nlat = 1800 - nlon = 3600 - lon, lat = _get_latlons(nlat, nlon) - fname, date = download_data(date, resolution="high") - else: - nlat = 720 - nlon = 1440 - lon, lat = _get_latlons(nlat, nlon) - fname, date = download_data(date, resolution=0.25) - # unzip fname - fname = _unzip_file(fname) - # read the data - data = read_data(fname, lat, lon, date) - return data - - -def open_mfdataset(dates, resolution="high", datapath="."): - das = [] - for i in dates: - das.append(open_dataset(i, resolution=resolution, datapath=datapath)) - ds = xr.concat(das, dim="time") - return ds - - -def read_data(fname, lat, lon, date): - from numpy import float32, fromfile, nan - from pandas 
import to_datetime - - f = fromfile(fname, dtype=float32) - nlat, nlon = lon.shape - aot = f.reshape(2, nlat, nlon)[0, :, :].reshape(1, nlat, nlon) - aot[aot < -999] = nan - datearr = to_datetime([date]) - da = xr.DataArray(aot, coords=[datearr, range(nlat), range(nlon)], dims=["time", "y", "x"]) - da["latitude"] = (("y", "x"), lat) - da["longitude"] = (("y", "x"), lon) - da.attrs["units"] = "" - da.name = "VIIRS EDR AOD" - da.attrs["long_name"] = "Aerosol Optical Depth" - da.attrs["source"] = ( - "ftp://ftp.star.nesdis.noaa.gov/pub/smcd/jhuang/npp.viirs.aerosol.data/edraot550" - ) - return da - - -def _unzip_file(fname): - import subprocess - - subprocess.run(["gunzip", "-f", fname]) - return fname[:-3] - - -def change_dir(to_path): - current = os.getcwd() - os.chdir(to_path) - return current - - -def download_data(date, resolution="high"): - import ftplib - from datetime import datetime - - if isinstance(date, datetime): - year = date.strftime("%Y") - yyyymmdd = date.strftime("%Y%m%d") - else: - from pandas import Timestamp - - date = Timestamp(date) - year = date.strftime("%Y") - yyyymmdd = date.strftime("%Y%m%d") - if resolution == "high": - file = f"npp_aot550_edr_gridded_0.10_{yyyymmdd}.high.bin.gz" - else: - file = f"npp_aot550_edr_gridded_0.25_{yyyymmdd}.high.bin.gz" - ftp = ftplib.FTP(server) - ftp.login() - # print(base_dir) - # print(year) - # print(base_dir + year) - ftp.cwd(base_dir + year) - # print(file) - ftp.retrbinary("RETR " + file, open(file, "wb").write) - return file, date - - -def _get_latlons(nlat, nlon): - from numpy import linspace, meshgrid - - lon_min = -179.875 - lon_max = -1 * lon_min - lat_min = -89.875 - lat_max = -1.0 * lat_min - lons = linspace(lon_min, lon_max, nlon) - lats = linspace(lat_min, lat_max, nlat) - lon, lat = meshgrid(lons, lats) - return lon, lat diff --git a/monetio/sat/nesdis_eps_viirs.py b/monetio/sat/nesdis_eps_viirs.py deleted file mode 100644 index 71dcd42c..00000000 --- a/monetio/sat/nesdis_eps_viirs.py +++ /dev/null @@ -1,192 +0,0 @@ -import os - -import xarray as xr - -server = "ftp.star.nesdis.noaa.gov" -base_dir = "/pub/smcd/VIIRS_Aerosol/npp.viirs.aerosol.data/epsaot550/" - - -def open_dataset(date, datapath="."): - """Short summary. - - Parameters - ---------- - date : type - Description of parameter `date`. - datapath : type - Description of parameter `datapath`. - - Returns - ------- - type - Description of returned object. - - """ - current = change_dir(datapath) - nlat = 720 - nlon = 1440 - lon, lat = _get_latlons(nlat, nlon) - if isinstance(date, str): - fname, date = download_data(date) - else: - fname, date = download_data(date) - print(fname) - data = read_data(fname, lat, lon, date) - change_dir(current) - return data.where(data > 0) - - -def open_mfdataset(dates, datapath="."): - """Short summary. - - Parameters - ---------- - dates : type - Description of parameter `dates`. - datapath : type - Description of parameter `datapath`. - - Returns - ------- - type - Description of returned object. - - """ - from xarray import concat - - das = [] - for i in dates: - print(i) - das.append(open_dataset(i, datapath=datapath)) - ds = concat(das, dim="time") - return ds - - -def read_data(fname, lat, lon, date): - """Short summary. - - Parameters - ---------- - fname : type - Description of parameter `fname`. - lat : type - Description of parameter `lat`. - lon : type - Description of parameter `lon`. - date : type - Description of parameter `date`. - - Returns - ------- - type - Description of returned object. 
- - """ - from pandas import to_datetime - - f = xr.open_dataset(fname) - datearr = to_datetime([date]) - da = f["aot_ip_out"] - da = da.rename({"nlat": "y", "nlon": "x"}) - da["latitude"] = (("y", "x"), lat) - da["longitude"] = (("y", "x"), lon) - da = da.expand_dims("time") - da["time"] = datearr - da.attrs["units"] = "" - da.name = "VIIRS EPS AOT" - da.attrs["long_name"] = "Aerosol Optical Thickness" - da.attrs["source"] = ( - "ftp://ftp.star.nesdis.noaa.gov/pub/smcd/VIIRS_Aerosol/npp.viirs.aerosol.data/epsaot550" - ) - return da - - -def change_dir(to_path): - """Short summary. - - Parameters - ---------- - to_path : type - Description of parameter `to_path`. - - Returns - ------- - type - Description of returned object. - - """ - current = os.getcwd() - os.chdir(to_path) - return current - - -def download_data(date, resolution="high"): - """Short summary. - - Parameters - ---------- - date : type - Description of parameter `date`. - resolution : type - Description of parameter `resolution`. - - Returns - ------- - type - Description of returned object. - - """ - import ftplib - from datetime import datetime - - from pandas import DatetimeIndex - - if isinstance(date, datetime) or isinstance(date, DatetimeIndex): - year = date.strftime("%Y") - yyyymmdd = date.strftime("%Y%m%d") - else: - from pandas import Timestamp - - date = Timestamp(date) - year = date.strftime("%Y") - yyyymmdd = date.strftime("%Y%m%d") - # npp_eaot_ip_gridded_0.25_20181222.high.nc - # print(year, yyyymmdd) - file = f"npp_eaot_ip_gridded_0.25_{yyyymmdd}.high.nc" - exists = os.path.isfile(file) - if ~exists: - ftp = ftplib.FTP(server) - ftp.login() - ftp.cwd(base_dir + year) - ftp.retrbinary("RETR " + file, open(file, "wb").write) - else: - print(f"File Already Exists! Reading: {file}") - return file, date - - -def _get_latlons(nlat, nlon): - """Short summary. - - Parameters - ---------- - nlat : type - Description of parameter `nlat`. - nlon : type - Description of parameter `nlon`. - - Returns - ------- - type - Description of returned object. - - """ - from numpy import linspace, meshgrid - - lon_min = -179.875 - lon_max = -1 * lon_min - lat_min = -89.875 - lat_max = -1.0 * lat_min - lons = linspace(lon_min, lon_max, nlon) - lats = linspace(lat_max, lat_min, nlat) - lon, lat = meshgrid(lons, lats) - return lon, lat diff --git a/monetio/sat/nesdis_eps_viirs_aod_nrt.py b/monetio/sat/nesdis_eps_viirs_aod_nrt.py new file mode 100644 index 00000000..fe094e2f --- /dev/null +++ b/monetio/sat/nesdis_eps_viirs_aod_nrt.py @@ -0,0 +1,173 @@ +import pandas as pd + + +def build_urls(dates, *, daily=True, data_resolution=0.1, satellite="NOAA20"): + """Construct URLs for downloading NEPS data. + + Parameters + ---------- + dates : pd.DatetimeIndex or iterable of datetime + Dates to download data for. + daily : bool, optional + Whether to download daily (default) or monthly data. + data_resolution : float, optional + Resolution of data in degrees (0.1 or 0.25). + satellite : str, optional + Satellite platform, 'SNPP' or 'NOAA20'. + + Returns + ------- + pd.Series + Series with URLs and corresponding file names. + + Notes + ----- + The `res` and `sat` parameters are only used for sub-daily data. 
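+
+    Examples
+    --------
+    Illustrative only; the URLs are assembled from the directory templates in
+    this function and are not checked against the server here:
+
+    >>> urls, fnames = build_urls(pd.date_range("2023-08-01", "2023-08-02"))
+    >>> urls, fnames = build_urls("2023-08-01", daily=False, data_resolution=0.25)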
+ """ + import warnings + from collections.abc import Iterable + + if isinstance(dates, Iterable) and not isinstance(dates, str): + dates = pd.DatetimeIndex(dates) + else: + dates = pd.DatetimeIndex([dates]) + + if daily: + dates = dates.floor("D").unique() + else: # monthly + dates = dates.to_period("M").to_timestamp().unique() + + if data_resolution != 0.25 and not daily: + warnings.warn( + "Monthly data is only available at 0.25 deg resolution, " + f"got 'data_resolution' {data_resolution!r}" + ) + + sat_dirname = satellite.lower() + if satellite.upper() == "SNPP": + sat = "npp" if daily else "snpp" + elif satellite.upper() == "NOAA20": + sat = "noaa20" + res = str(data_resolution).ljust(5, "0") + aod_dirname = "aod/eps" if daily else "aod_monthly" + + urls = [] + fnames = [] + + print("Building VIIRS URLs...") + base_url = ( + "https://www.star.nesdis.noaa.gov/pub/smcd/VIIRS_Aerosol/viirs_aerosol_gridded_data/" + f"{sat_dirname}/{aod_dirname}/" + ) + + for date in dates: + if daily: + fname = "{}/viirs_eps_{}_aod_{}_deg_{}_nrt.nc".format( + date.strftime("%Y"), + sat, + res, + date.strftime("%Y%m%d"), + ) + else: + fname = "viirs_aod_monthly_{}_{}_deg_{}_nrt.nc".format( + sat, + res, + date.strftime("%Y%m"), + ) + url = base_url + fname + urls.append(url) + fnames.append(fname) + + # Note: files needed for comparison + urls = pd.Series(urls, index=None) + fnames = pd.Series(fnames, index=None) + + return urls, fnames + + +def open_dataset(date, *, satellite="NOAA20", data_resolution=0.1, daily=True): + """ + Parameters + ---------- + date : str or datetime-like + The date for which to open the dataset. + """ + from io import BytesIO + + import pandas as pd + import requests + import xarray as xr + + if not isinstance(date, pd.Timestamp): + d = pd.to_datetime(date) + else: + d = date + + if satellite.lower() not in ("noaa20", "snpp"): + raise ValueError( + f"Invalid input for 'satellite' {satellite!r}: " "Valid values are 'NOAA20' or 'SNPP'" + ) + + if data_resolution not in {0.1, 0.25}: + raise ValueError( + f"Invalid input for 'data_resolution' {data_resolution!r}: " + "Valid values are 0.1 or 0.25" + ) + + urls, _ = build_urls(d, satellite=satellite, data_resolution=data_resolution, daily=daily) + + r = requests.get(urls[0], stream=True) + r.raise_for_status() + dset = xr.open_dataset(BytesIO(r.content)) + + dset = dset.expand_dims(time=[d]).set_coords(["time"]) + + return dset + + +def open_mfdataset(dates, satellite="NOAA20", data_resolution=0.1, daily=True, error_missing=False): + import warnings + from collections.abc import Iterable + from io import BytesIO + + import pandas as pd + import requests + import xarray as xr + + if isinstance(dates, Iterable) and not isinstance(dates, str): + dates = pd.DatetimeIndex(dates) + else: + dates = pd.DatetimeIndex([dates]) + + if satellite.lower() not in ("noaa20", "snpp"): + raise ValueError( + f"Invalid input for 'satellite' {satellite!r}: " "Valid values are 'NOAA20' or 'SNPP'" + ) + + if data_resolution not in {0.1, 0.25}: + raise ValueError( + f"Invalid input for 'data_resolution' {data_resolution!r}: " + "Valid values are 0.1 or 0.25" + ) + + urls, _ = build_urls(dates, satellite=satellite, data_resolution=data_resolution, daily=daily) + + dsets = [] + for url, date in zip(urls, dates): + r = requests.get(url, stream=True) + if r.status_code != 200: + msg = f"Failed to access file on NESDIS FTP server: {url}" + if error_missing: + raise RuntimeError(msg) + else: + warnings.warn(msg) + else: + ds = 
xr.open_dataset(BytesIO(r.content)).expand_dims(time=[date]).set_coords(["time"]) + dsets.append(ds) + + if len(dsets) == 0: + raise ValueError(f"Files not available for product and dates: {dates}") + + dset = xr.concat(dsets, dim="time") + + return dset diff --git a/monetio/sat/nesdis_viirs_aod_aws_gridded.py b/monetio/sat/nesdis_viirs_aod_aws_gridded.py new file mode 100644 index 00000000..7acbb937 --- /dev/null +++ b/monetio/sat/nesdis_viirs_aod_aws_gridded.py @@ -0,0 +1,399 @@ +""" +NOAA VIIRS Aerosol Optical Depth (AOD) Dataset Access Module + +This module provides access to NOAA's VIIRS-derived Aerosol Optical Depth data: + +Data Products: + 1. Daily AOD: + - Resolution options: 0.05°, 0.10°, 0.25° + - Coverage: Global over ocean + - Variables: AOD at 550nm, quality flags + - Path: noaa-jpss/{satellite}/VIIRS/{resolution}_Degrees_Daily/ + + 2. Weekly AOD: + - Fixed resolution: 0.25° + - Coverage: Global over ocean + - Variables: Weekly averaged AOD + - Path: noaa-jpss/{satellite}/VIIRS/0.25_Degrees_Weekly/ + + 3. Monthly AOD: + - Fixed resolution: 0.25° + - Coverage: Global over ocean + - Variables: Monthly averaged AOD + - Path: noaa-jpss/{satellite}/VIIRS/0.25_Degrees_Monthly/ + +Satellites: + - SNPP: Data available from 2012-01-19 to present + - NOAA20: Data available from 2018-01-01 to present + +References: + - VIIRS AOD Algorithm: https://www.star.nesdis.noaa.gov/jpss/documents/ATBD/ATBD_EPS_Aerosol_AOD_v3.0.1.pdf + - Data Access: https://www.avl.class.noaa.gov/saa/products/welcome +""" +# Standard library imports +from datetime import datetime +from enum import Enum +from functools import lru_cache +from pathlib import Path +from typing import List, Tuple, Union +import warnings + +# Third-party imports +import pandas as pd +import s3fs +import xarray as xr + +class AveragingTime(str, Enum): + """Enumeration of valid averaging time periods.""" + DAILY = "daily" + WEEKLY = "weekly" + MONTHLY = "monthly" + +class Satellite(str, Enum): + """Enumeration of valid satellites.""" + SNPP = "SNPP" + NOAA20 = "NOAA20" + +# Configuration dictionary for data products +PRODUCT_CONFIG = { + AveragingTime.DAILY: { + "path_template": "noaa-jpss/{satellite}/VIIRS/{satellite}_VIIRS_Aerosol_Optical_Depth_Gridded_Reprocessed/{resolution}_Degrees_Daily/{year}/", + "file_template": "viirs_eps_{sat_name}_aod_{resolution}_deg_{date}.nc", + "resolutions": {"0.050", "0.100", "0.250"}, + }, + AveragingTime.WEEKLY: { + "path_template": "noaa-jpss/{satellite}/VIIRS/{satellite}_VIIRS_Aerosol_Optical_Depth_Gridded_Reprocessed/0.25_Degrees_Weekly/{year}/", + "resolutions": {"0.250"}, + }, + AveragingTime.MONTHLY: { + "path_template": "noaa-jpss/{satellite}/VIIRS/{satellite}_VIIRS_Aerosol_Optical_Depth_Gridded_Reprocessed/0.25_Degrees_Monthly/", + "file_template": "viirs_aod_monthly_{sat_name}_0.250_deg_{date}.nc", + "resolutions": {"0.250"}, + } +} + +@lru_cache(maxsize=128) +def _get_satellite_name(satellite: str) -> str: + """Get the lowercase satellite name used in file paths.""" + return "npp" if satellite == "SNPP" else "noaa20" + +def validate_inputs(satellite: str, data_resolution: str, averaging_time: str) -> None: + """ + Validate input parameters. + + Args: + satellite: Satellite name + data_resolution: Data resolution + averaging_time: Averaging period + + Raises: + ValueError: If inputs are invalid + """ + if satellite not in {s.value for s in Satellite}: + raise ValueError(f"Invalid satellite: {satellite}. 
Must be one of {list(Satellite)}") + + if averaging_time not in {t.value for t in AveragingTime}: + raise ValueError(f"Invalid averaging_time: {averaging_time}. Must be one of {list(AveragingTime)}") + + if data_resolution not in PRODUCT_CONFIG[averaging_time]["resolutions"]: + raise ValueError( + f"Invalid resolution {data_resolution} for {averaging_time} data. " + f"Valid resolutions: {PRODUCT_CONFIG[averaging_time]['resolutions']}" + ) + +def create_daily_aod_list( + data_resolution: str, + satellite: str, + date_generated: List[datetime], + fs: s3fs.S3FileSystem, + warning: bool = False +) -> Tuple[List[str], int]: + """ + Creates a list of daily AOD files and calculates their total size. + + Args: + data_resolution: Data resolution + satellite: Satellite name + date_generated: List of dates to process + fs: S3 filesystem object + warning: Whether to warn instead of raise errors + + Returns: + Tuple of (file_list, total_size) + """ + validate_inputs(satellite, data_resolution, AveragingTime.DAILY) + + file_list = [] + total_size = 0 + sat_name = _get_satellite_name(satellite) + config = PRODUCT_CONFIG[AveragingTime.DAILY] + + for date in date_generated: + file_date = date.strftime("%Y%m%d") + year = file_date[:4] + + file_name = config["file_template"].format( + sat_name=sat_name, + resolution=data_resolution, + date=file_date + ) + + prod_path = config["path_template"].format( + satellite=satellite, + resolution=data_resolution[:4], + year=year + ) + + full_path = prod_path + file_name + + if fs.exists(full_path): + file_list.extend(fs.ls(full_path)) + total_size += fs.size(full_path) + else: + msg = f"File does not exist: {full_path}" + if warning: + warnings.warn(msg, stacklevel=2) + file_list.append(None) + else: + raise ValueError(msg) + + return file_list, total_size + +def create_monthly_aod_list( + satellite: str, + date_generated: List[datetime], + fs: s3fs.S3FileSystem, + warning: bool = False +) -> Tuple[List[str], int]: + """ + Creates a list of monthly AOD files and calculates their total size. + + Args: + satellite: Satellite name + date_generated: List of dates to process + fs: S3 filesystem object + warning: Whether to warn instead of raise errors + + Returns: + Tuple of (file_list, total_size) + """ + validate_inputs(satellite, "0.250", AveragingTime.MONTHLY) + + file_list = [] + total_size = 0 + processed_months = set() + sat_name = _get_satellite_name(satellite) + config = PRODUCT_CONFIG[AveragingTime.MONTHLY] + + for date in date_generated: + year_month = date.strftime("%Y%m") + if year_month in processed_months: + continue + + processed_months.add(year_month) + file_name = config["file_template"].format( + sat_name=sat_name, + date=year_month + ) + + prod_path = config["path_template"].format(satellite=satellite) + full_path = prod_path + file_name + + if fs.exists(full_path): + file_list.extend(fs.ls(full_path)) + total_size += fs.size(full_path) + else: + msg = f"File does not exist: {full_path}" + if warning: + warnings.warn(msg, stacklevel=2) + file_list.append(None) + else: + raise ValueError(msg) + + return file_list, total_size + +def create_weekly_aod_list( + satellite: str, + date_generated: List[datetime], + fs: s3fs.S3FileSystem, + warning: bool = False +) -> Tuple[List[str], int]: + """ + Creates a list of weekly AOD files and calculates their total size. 
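+
+    Weekly files cover a date range encoded in their names; a requested date matches a
+    file when it falls inside that range. Illustrative call (assumes anonymous access
+    to the public S3 bucket):
+
+        >>> fs = s3fs.S3FileSystem(anon=True)
+        >>> files, size = create_weekly_aod_list("SNPP", [datetime(2020, 1, 15)], fs)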
+ + Args: + satellite: Satellite name + date_generated: List of dates to process + fs: S3 filesystem object + warning: Whether to warn instead of raise errors + + Returns: + Tuple of (file_list, total_size) + """ + validate_inputs(satellite, "0.250", AveragingTime.WEEKLY) + + file_list = [] + total_size = 0 + config = PRODUCT_CONFIG[AveragingTime.WEEKLY] + + for date in date_generated: + file_date = date.strftime("%Y%m%d") + year = file_date[:4] + + prod_path = config["path_template"].format( + satellite=satellite, + year=year + ) + + try: + all_files = fs.ls(prod_path) + for file in all_files: + file_name = Path(file).name + date_range = file_name.split("_")[7].split(".")[0] + file_start, file_end = date_range.split("-") + + if file_start <= file_date <= file_end and file not in file_list: + file_list.append(file) + total_size += fs.size(file) + except Exception as e: + if warning: + warnings.warn(str(e), stacklevel=2) + else: + raise ValueError(str(e)) + + return file_list, total_size + +def open_dataset( + date: Union[str, datetime], + satellite: str = "SNPP", + data_resolution: Union[float, str] = 0.1, + averaging_time: str = "daily" +) -> xr.Dataset: + """ + Load VIIRS AOD data from AWS for the given parameters. + + Args: + date: The date for which to open the dataset. + SNPP has data from 2012-01-19 to present. + NOAA20 has data from 2018-01-01 to present. + satellite: The satellite to retrieve data from. + Valid values are 'SNPP' or 'NOAA20'. + data_resolution: The data resolution. + Valid values are '0.050', '0.100', or '0.250'. Defaults to 0.1°. + Only has effect when averaging_time is 'daily'. + For 'weekly' and 'monthly' data, resolution is always 0.25. + averaging_time: The averaging time period. + Valid values are 'daily', 'weekly', or 'monthly'. + + Returns: + xarray.Dataset: The opened dataset. + + Raises: + ValueError: If input parameters are invalid. + """ + validate_inputs(satellite, str(data_resolution).ljust(5, "0"), averaging_time) + + if isinstance(date, str): + date_generated = [pd.Timestamp(date)] + else: + date_generated = [date] + + fs = s3fs.S3FileSystem(anon=True) + + # Get file list based on averaging time + if averaging_time == AveragingTime.MONTHLY: + file_list, _ = create_monthly_aod_list(satellite, date_generated, fs) + elif averaging_time == AveragingTime.WEEKLY: + file_list, _ = create_weekly_aod_list(satellite, date_generated, fs) + else: # daily + data_resolution = str(data_resolution).ljust(5, "0") + file_list, _ = create_daily_aod_list(data_resolution, satellite, date_generated, fs) + + if len(file_list) == 0 or all(f is None for f in file_list): + raise ValueError(f"Files not available for {averaging_time} data and date: {date_generated[0]}") + + # Open and process dataset + dset = xr.open_dataset(fs.open(file_list[0])) + dset = dset.expand_dims(time=date_generated).set_coords(["time"]) + + return dset + +def open_mfdataset( + dates: Union[pd.DatetimeIndex, datetime, str], + satellite: str = "SNPP", + data_resolution: Union[float, str] = 0.1, + averaging_time: str = "daily", + error_missing: bool = False +) -> xr.Dataset: + """ + Opens and combines multiple NetCDF files into a single dataset. + + Args: + dates: The dates for which to retrieve the data. + SNPP has data from 2012-01-19 to present. + NOAA20 has data from 2018-01-01 to present. + satellite: The satellite name. + Valid values are 'SNPP' or 'NOAA20'. + data_resolution: The data resolution. + Valid values are '0.050', '0.100', or '0.250'. Defaults to 0.1°. 
+ Only has effect when averaging_time is 'daily'. + For 'weekly' and 'monthly' data, resolution is always 0.25. + averaging_time: The averaging time period. + Valid values are 'daily', 'weekly', or 'monthly'. + error_missing: If False (default), skip missing files with warning + and continue processing. Otherwise, raise an error. + + Returns: + xarray.Dataset: The combined dataset for specified dates. + + Raises: + ValueError: If input parameters are invalid. + """ + # Validate inputs + validate_inputs(satellite, str(data_resolution).ljust(5, "0"), averaging_time) + + # Convert dates to DatetimeIndex + if isinstance(dates, (str, datetime)): + dates = pd.DatetimeIndex([dates]) + elif not isinstance(dates, pd.DatetimeIndex): + dates = pd.DatetimeIndex(dates) + + fs = s3fs.S3FileSystem(anon=True) + + # Get file list based on averaging time + if averaging_time == AveragingTime.MONTHLY: + file_list, _ = create_monthly_aod_list( + satellite, dates, fs, warning=not error_missing + ) + elif averaging_time == AveragingTime.WEEKLY: + file_list, _ = create_weekly_aod_list( + satellite, dates, fs, warning=not error_missing + ) + else: # daily + data_resolution = str(data_resolution).ljust(5, "0") + file_list, _ = create_daily_aod_list( + data_resolution, satellite, dates, fs, warning=not error_missing + ) + + if len(file_list) == 0 or all(f is None for f in file_list): + raise ValueError(f"Files not available for {averaging_time} data and dates: {dates}") + + if not len(file_list) == len(dates): + raise ValueError( + "'dates' and discovered file list are not the same length. " + f"Check your dates input for {averaging_time} frequency." + ) + + # Process valid files and dates + dates_good = [] + aws_files = [] + for d, f in zip(dates, file_list): + if f is not None: + aws_files.append(fs.open(f)) + dates_good.append(d) + + # Combine datasets + dset = xr.open_mfdataset(aws_files, concat_dim="time", combine="nested") + dset["time"] = dates_good + + return dset \ No newline at end of file diff --git a/monetio/sat/nesdis_viirs_ndvi_aws_gridded.py b/monetio/sat/nesdis_viirs_ndvi_aws_gridded.py new file mode 100644 index 00000000..02365839 --- /dev/null +++ b/monetio/sat/nesdis_viirs_ndvi_aws_gridded.py @@ -0,0 +1,280 @@ +""" +NOAA Climate Data Record (CDR) and Near Real-Time (NRT) Dataset Access Module + +This module provides access to various NOAA satellite-derived environmental datasets: + +1. Vegetation Health Index (VHI): + - Available from both VIIRS (2012-present) and AVHRR (1981-2012) sensors + - Source: NOAA CDR + - Resolution: 4km global + - Frequency: Daily + - Variables: NDVI, VCI, TCI, VHI + - AWS Path: noaa-cdr-vegetation-health-pds/ + +2. Leaf Area Index (LAI) and Fraction of Photosynthetically Active Radiation (FPAR): + - Available from VIIRS sensor (2012-present) + - Source: NOAA VIIRS + - Resolution: 500m global + - Frequency: Daily + - Variables: LAI, FPAR + - AWS Path: noaa-viirs-lai-fpar/ + +3. Snow Cover Extent: + - Available from Interactive Multisensor Snow and Ice Mapping System (IMS) + - Source: NOAA CDR + - Resolution: 4km Northern Hemisphere + - Frequency: Daily + - Variables: Snow Cover, Sea Ice + - AWS Path: noaa-cdr-snow-cover-extent-ims-nrt/ + +Data Access: + All datasets are accessed through AWS S3 buckets in NetCDF format. + Files are organized by year and contain daily observations. 
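+
+    For example, a daily VHI/NDVI file from VIIRS for 2023-01-01 is located with a glob
+    of the form (bucket and pattern taken from DATA_CONFIGS below):
+    noaa-cdr-ndvi-pds/data/2023/VIIRS-Land_*20230101_*.nc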
+ +Usage: + >>> # Single date access + >>> dataset = open_dataset("2023-01-01", data_type="vhi", sensor="viirs") + + >>> # Multiple dates + >>> dates = pd.date_range("2023-01-01", "2023-01-10") + >>> dataset_multi = open_mfdataset(dates, data_type="snow", sensor="ims") + + >>> # Historical AVHRR data + >>> dataset_avhrr = open_dataset("2000-01-01", data_type="vhi", sensor="avhrr") + +References: + - VHI: https://www.ncdc.noaa.gov/cdr/terrestrial/vegetation-health + - LAI/FPAR: https://www.star.nesdis.noaa.gov/jpss/EDRs/products_Vegetation.php + - Snow Cover: https://www.ncdc.noaa.gov/snow-and-ice/snow-cover + +Note: + This module requires active internet connection and access to AWS S3 buckets. + Some datasets might have temporal gaps or missing data. +""" +# Standard library imports +from datetime import datetime +from enum import Enum +from functools import lru_cache +from pathlib import Path +from typing import List, Tuple, Union +import warnings + +# Third-party imports +import pandas as pd +import s3fs +import xarray as xr + +# Configuration dictionary for different data products +DATA_CONFIGS = { + "vhi": { + "viirs": { + "path": "noaa-cdr-ndvi-pds/data/", + "pattern": "VIIRS-Land_*" + }, + "avhrr": { + "path": "noaa-cdr-vegetation-health-pds/data/", + "pattern": "AVHRR-Land_*" + } + }, + "lai_fpar": { + "viirs": { + "path": "noaa-cdr-leaf-area-index-fapar-pds/data/", + "pattern": "VIIRS-Land_*" + }, + 'avhrr': { + "path": "noaa-cdr-leaf-area-index-fapar-pds/data/", + "pattern": "AVHRR-Land_*" + } + }, + "snow": { + "ims": { + "path": "noaa-cdr-snow-cover-extent-ims-nrt/", + "pattern": "snow_cover_extent_*" + } + } +} + +def validate_inputs(date_generated: List[datetime], data_type: str, sensor: str) -> None: + """ + Validates input parameters. + + Args: + date_generated: List of dates to process + data_type: Type of data product + sensor: Sensor type + + Raises: + ValueError: If inputs are invalid + """ + if data_type not in DATA_CONFIGS: + raise ValueError(f"Unsupported data type: {data_type}. Available types: {list(DATA_CONFIGS.keys())}") + + if sensor not in DATA_CONFIGS[data_type]: + raise ValueError( + f"Unsupported sensor '{sensor}' for data type '{data_type}'. " + f"Available sensors: {list(DATA_CONFIGS[data_type].keys())}" + ) + +@lru_cache(maxsize=128) +def _get_cached_file_list(year: str, prod_path: str, pattern: str, file_date: str) -> List[str]: + """ + Cached version of file listing to improve performance for repeated requests. + """ + fs = s3fs.S3FileSystem(anon=True) + return fs.glob(f"{prod_path}{year}/{pattern}{file_date}_*.nc") + +def create_daily_data_list( + date_generated: List[datetime], + fs: s3fs.S3FileSystem, + data_type: str = "vhi", + sensor: str = "viirs", + warning: bool = False +) -> Tuple[List[str], int]: + """ + Creates a list of daily data files and calculates the total size of the files. 
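+
+    Illustrative call (assumes anonymous access to the public NOAA CDR buckets):
+
+        >>> fs = s3fs.S3FileSystem(anon=True)
+        >>> files, size = create_daily_data_list([datetime(2023, 1, 1)], fs, "vhi", "viirs")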
+ + Args: + date_generated: List of dates to process + fs: S3 filesystem object + data_type: Type of data product + sensor: Sensor type + warning: Whether to warn instead of raising an error for missing files + + Returns: + Tuple containing list of file paths and total size + """ + validate_inputs(date_generated, data_type, sensor) + + file_list = [] + total_size = 0 + config = DATA_CONFIGS[data_type][sensor] + + for date in date_generated: + file_date = date.strftime("%Y%m%d") + year = file_date[:4] + + try: + file_names = _get_cached_file_list( + year, + config["path"], + config["pattern"], + file_date + ) + + if file_names: + file_list.extend(file_names) + total_size += sum(fs.size(f) for f in file_names) + else: + raise FileNotFoundError( + f"No files found for {data_type} ({sensor}) on {file_date}" + ) + + except Exception as e: + if warning: + warnings.warn(str(e)) + file_list.append(None) + else: + raise ValueError(str(e)) + + return file_list, total_size + +def process_timeofday(dataset: xr.Dataset) -> xr.Dataset: + """ + Process TIMEOFDAY variable in dataset. + + Args: + dataset: Input xarray dataset + + Returns: + Processed dataset + """ + if "TIMEOFDAY" in dataset: + m = dataset["TIMEOFDAY"].attrs.pop("scale_factor") + b = dataset["TIMEOFDAY"].attrs.pop("add_offset") + fv = dataset["TIMEOFDAY"].attrs.pop("_FillValue") + + dataset["TIMEOFDAY"] = dataset["TIMEOFDAY"] * m + b + dataset["TIMEOFDAY"].attrs.update(units="hours") + dataset = xr.decode_cf(dataset) + + dataset["TIMEOFDAY"] = dataset["TIMEOFDAY"].where( + dataset["TIMEOFDAY"] != pd.Timedelta(fv * m + b, unit="hours") + ) + else: + dataset = xr.decode_cf(dataset) + + return dataset + +def open_dataset( + date: Union[str, datetime], + data_type: str = "vhi", + sensor: str = "viirs" +) -> xr.Dataset: + """ + Opens a dataset for the given date. + + Args: + date: Date to process + data_type: Type of data product + sensor: Sensor type + + Returns: + Opened xarray dataset + """ + date_generated = [pd.Timestamp(date)] if isinstance(date, str) else [date] + + fs = s3fs.S3FileSystem(anon=True) + file_list, _ = create_daily_data_list(date_generated, fs, data_type, sensor) + + if len(file_list) == 0 or all(f is None for f in file_list): + raise ValueError(f"Files not available for {data_type} ({sensor}) and date: {date_generated[0]}") + + dset = xr.open_dataset(fs.open(file_list[0]), decode_cf=False) + return process_timeofday(dset) + +def open_mfdataset( + dates: Union[pd.DatetimeIndex, datetime, str], + data_type: str = "vhi", + sensor: str = "viirs", + error_missing: bool = False, + **kwargs +) -> xr.Dataset: + """ + Opens and combines multiple NetCDF files into a single dataset. 
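+
+    Example (requires network access to the public AWS buckets):
+
+        >>> dates = pd.date_range("2023-01-01", "2023-01-03")
+        >>> ds = open_mfdataset(dates, data_type="vhi", sensor="viirs")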
+ + Args: + dates: Dates to process + data_type: Type of data product + sensor: Sensor type + error_missing: Whether to raise error on missing files + + Returns: + Combined xarray dataset + """ + if isinstance(dates, (str, datetime)): + dates = pd.DatetimeIndex([dates]) + elif not isinstance(dates, pd.DatetimeIndex): + dates = pd.DatetimeIndex(dates) + + fs = s3fs.S3FileSystem(anon=True) + file_list, _ = create_daily_data_list( + dates, + fs, + data_type=data_type, + sensor=sensor, + warning=not error_missing + ) + + if len(file_list) == 0 or all(f is None for f in file_list): + raise ValueError(f"Files not available for {data_type} ({sensor}) and dates: {dates}") + + aws_files = [fs.open(f) for f in file_list if f is not None] + dset = xr.open_mfdataset( + aws_files, + concat_dim="time", + combine="nested", + decode_cf=False, + **kwargs + ) + + return process_timeofday(dset) \ No newline at end of file diff --git a/tests/test_avhrr_aot.py b/tests/test_avhrr_aot.py new file mode 100644 index 00000000..3ced3050 --- /dev/null +++ b/tests/test_avhrr_aot.py @@ -0,0 +1,49 @@ +import sys + +import pandas as pd +import pytest + +from monetio.sat.nesdis_avhrr_aot_aws_gridded import open_dataset, open_mfdataset + +if sys.version_info < (3, 7): + pytest.skip("s3fs requires Python 3.7+", allow_module_level=True) + + +def test_open_dataset_no_data(): + with pytest.raises(ValueError, match="File does not exist on AWS:"): + open_dataset("1900-01-01") + + +def test_open_dataset(): + date = "2023-01-01" + ds = open_dataset(date) + assert set(ds.dims) >= {"time", "latitude", "longitude"} + assert ds.sizes["time"] == 1 + assert ds.sizes["latitude"] == 1800 + assert ds.sizes["longitude"] == 3600 + assert ds["time"] == pd.to_datetime(date) + assert "aot1" in ds.data_vars + assert ds["aot1"].dims == ("time", "latitude", "longitude") + + +def test_open_mfdataset(): + dates = ["2023-01-01", "2023-01-02"] + ds = open_mfdataset(dates) + assert (ds["time"] == pd.DatetimeIndex(dates)).all() + + +def test_open_mfdataset_error(): + dates = ["1900-01-01", "2023-01-01"] + + with pytest.warns(UserWarning, match="File does not exist on AWS:"): + ds = open_mfdataset(dates) + assert ds.sizes["time"] == 1 + assert ds["time"] == pd.to_datetime(dates[-1]) + + with pytest.raises(ValueError, match="File does not exist on AWS:"): + _ = open_mfdataset(dates, error_missing=True) + + with pytest.raises(ValueError, match="Files not available for product and dates"), pytest.warns( + UserWarning, match="File does not exist on AWS:" + ): + _ = open_mfdataset(dates[:1], error_missing=False) diff --git a/tests/test_viirs_aod.py b/tests/test_viirs_aod.py new file mode 100644 index 00000000..2a841d72 --- /dev/null +++ b/tests/test_viirs_aod.py @@ -0,0 +1,130 @@ +import sys + +import pandas as pd +import pytest + +from monetio.sat.nesdis_viirs_aod_aws_gridded import open_dataset, open_mfdataset + +if sys.version_info < (3, 7): + pytest.skip("s3fs requires Python 3.7+", allow_module_level=True) + + +@pytest.mark.parametrize("sat", ["SNPP", "NOAA20"]) +@pytest.mark.parametrize("res", [0.05, 0.1, 0.25]) +def test_open_dataset_daily(sat, res): + date = "2020-01-01" # a date when we have both SNPP and NOAA-20 data available + s_res = f"{res:.3f}" + + ds = open_dataset(date, sat, s_res) + assert set(ds.dims) == {"time", "lat", "lon"} + assert ds.sizes["time"] == 1 + assert ds.sizes["lat"] == int(180 / res) + assert ds.sizes["lon"] == int(360 / res) + assert ds.attrs["satellite_name"] == ("NPP" if sat == "SNPP" else "NOAA 20") + assert 
ds.attrs["spatial_resolution"].strip().startswith(str(res)) + assert (ds.time == pd.DatetimeIndex([date])).all() + + +def test_open_dataset_bad_input(): + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", satellite="GOES-16") + + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", satellite="both") + + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", data_resolution=100) + + with pytest.raises(ValueError, match="Invalid input"): + open_dataset("2020-01-01", averaging_time="asdf") + + +def test_open_dataset_no_data(): + with pytest.raises(ValueError, match="File does not exist on AWS:"): + open_dataset("1900-01-01") + + +def test_open_mfdataset_bad_input(): + cases = [ + {"satellite": "GOES-16"}, + {"satellite": "both"}, + {"data_resolution": 100}, + {"averaging_time": "asdf"}, + ] + for case in cases: + with pytest.raises(ValueError, match="Invalid input"): + open_mfdataset(["2020-01-01"], **case) + + +def test_open_mfdataset_daily(): + ds = open_mfdataset(["2020-01-01", "2020-01-02"], satellite="SNPP", data_resolution=0.25) + assert set(ds.dims) == {"time", "lat", "lon"} + assert ds.sizes["time"] == 2 + assert ds.attrs["spatial_resolution"].strip().startswith("0.25") + assert (ds.time == pd.DatetimeIndex(["2020-01-01", "2020-01-02"])).all() + + +def test_open_mfdataset_monthly(): + with pytest.raises(ValueError, match="not the same length"): + open_mfdataset(["2020-01-01", "2020-01-02"], averaging_time="monthly") + + months = pd.date_range(start="2020-01-01", freq="MS", periods=2) + ds = open_mfdataset(months, averaging_time="monthly") + assert ds.sizes["time"] == 2 + + +def test_open_mfdataset_daily_warning(): + dates = ["2012-01-18", "2012-01-19"] # 2012-01-19 is the first available date + + # Warn and skip by default + with pytest.warns(match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="daily", + ) + assert ds.sizes["time"] == 1 + assert (ds.time == pd.DatetimeIndex([dates[1]])).all() + + # Error optionally + with pytest.raises(ValueError, match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="daily", + error_missing=True, + ) + + +def test_open_mfdataset_monthly_warning(): + dates = ["2011-12-01", "2012-01-01"] + + # Warn and skip by default + with pytest.warns(match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="monthly", + ) + assert ds.sizes["time"] == 1 + assert (ds.time == pd.DatetimeIndex([dates[1]])).all() + + # Error optionally + with pytest.raises(ValueError, match="File does not exist on AWS:"): + ds = open_mfdataset( + dates, + satellite="SNPP", + data_resolution=0.25, + averaging_time="monthly", + error_missing=True, + ) + + +def test_open_mfdataset_no_data(): + with pytest.raises(ValueError, match="Files not available"), pytest.warns( + match="File does not exist on AWS:" + ): + open_mfdataset(["1900-01-01"]) diff --git a/tests/test_viirs_aod_nrt.py b/tests/test_viirs_aod_nrt.py new file mode 100644 index 00000000..d07e8608 --- /dev/null +++ b/tests/test_viirs_aod_nrt.py @@ -0,0 +1,84 @@ +import warnings + +import pandas as pd +import pytest + +from monetio.sat.nesdis_eps_viirs_aod_nrt import open_dataset, open_mfdataset + +NOW = pd.Timestamp.now("UTC") +TODAY = NOW.floor("D") + +with warnings.catch_warnings(): + warnings.filterwarnings( + 
"ignore", + message="Converting to Period representation will drop timezone information.", + ) + THIS_MONTH = TODAY.to_period("M").to_timestamp() + +LAST_MONTH = THIS_MONTH - pd.DateOffset(months=1) +LAST_LAST_MONTH = LAST_MONTH - pd.DateOffset(months=1) + + +@pytest.mark.parametrize("res", [0.25, 0.1]) +@pytest.mark.parametrize("sat", ["NOAA20", "SNPP"]) +def test_open_dataset_daily(sat, res): + # Note: only NRT + date = (TODAY - pd.Timedelta(days=2)).tz_localize(None) + ds = open_dataset(date, satellite=sat, data_resolution=res) + + assert date.strftime(r"%Y%m%d") in ds.attrs["dataset_name"] + assert ds.attrs["spatial_resolution"].strip() == f"{res:.2f} degree" + assert ds.attrs["satellite_name"] == ("Suomi NPP" if sat == "SNPP" else "NOAA 20") + + assert set(ds.dims) == {"time", "lat", "lon"} + assert ds.sizes["time"] == 1 + assert ds.sizes["lat"] == int(180 / res) + assert ds.sizes["lon"] == int(360 / res) + assert (ds.time == pd.DatetimeIndex([date])).all() + assert "AOD550" in ds.data_vars + + +@pytest.mark.parametrize("sat", ["NOAA20", "SNPP"]) +def test_open_dataset_monthly(sat): + # Seems like only one is stored + if NOW - THIS_MONTH.tz_localize("UTC") > pd.Timedelta(hours=12): + date = LAST_MONTH + else: + date = LAST_LAST_MONTH + + ds = open_dataset(date, satellite=sat, daily=False, data_resolution=0.25) + assert ds.sizes["time"] == 1 + + +def test_open_mfdataset(): + today = TODAY.tz_localize(None) + dates = [today - pd.Timedelta(days=2), today - pd.Timedelta(days=3)] + ds = open_mfdataset(dates) + assert ds.sizes["time"] == len(dates) + + +def test_missing_date(): + from requests.exceptions import HTTPError + + with pytest.raises(HTTPError): + open_dataset("1900-01-01") + + +def test_missing_date_mf(): + # No dsets collected + with pytest.raises(ValueError, match="Files not available for product and dates"), pytest.warns( + UserWarning, match="Failed to access file" + ): + open_mfdataset("1900-01-01") + + # Error during dsets collection + with pytest.raises(RuntimeError, match="Failed to access file"): + open_mfdataset("1900-01-01", error_missing=True) + + one_good = ["1900-01-01", TODAY.tz_localize(None) - pd.Timedelta(days=2)] + with pytest.warns(UserWarning, match="Failed to access file"): + ds = open_mfdataset(one_good) + assert ds.sizes["time"] == 1 + + with pytest.raises(RuntimeError, match="Failed to access file"): + open_mfdataset(one_good, error_missing=True) diff --git a/tests/test_viirs_ndvi.py b/tests/test_viirs_ndvi.py new file mode 100644 index 00000000..b599f53c --- /dev/null +++ b/tests/test_viirs_ndvi.py @@ -0,0 +1,61 @@ +import sys + +import numpy as np +import pandas as pd +import pytest + +from monetio.sat.nesdis_viirs_ndvi_aws_gridded import open_dataset, open_mfdataset + +if sys.version_info < (3, 7): + pytest.skip("s3fs requires Python 3.7+", allow_module_level=True) + + +def test_open_dataset_no_data(): + with pytest.raises(ValueError, match="File does not exist on AWS:"): + open_dataset("1900-01-01") + + +def test_open_dataset(): + date = "2023-01-01" + ds = open_dataset(date) + + assert set(ds.dims) >= {"time", "latitude", "longitude"} + assert ds.sizes["time"] == 1 + assert ds.sizes["latitude"] == 3600 + assert ds.sizes["longitude"] == 7200 + assert ds["time"] == pd.to_datetime(date) + + assert "NDVI" in ds.data_vars + assert ds["NDVI"].dims == ("time", "latitude", "longitude") + + assert np.abs(ds["NDVI"]).max().item() < 5 + q = 0.02 + a, b = ds["NDVI"].quantile((q, 1 - q)).values + assert -1 < a < b < 1 + + assert ds["TIMEOFDAY"].isnull().sum() > 0 
+ assert ds["TIMEOFDAY"].to_series().min() >= pd.Timedelta("0h") + assert ds["TIMEOFDAY"].to_series().max() < pd.Timedelta("24h") + + +def test_open_mfdataset(): + dates = ["2023-01-01", "2023-01-02"] + ds = open_mfdataset(dates) + assert (ds["time"] == pd.DatetimeIndex(dates)).all() + + +def test_open_mfdataset_error(): + dates = ["1900-01-01", "2023-01-01"] + + with pytest.warns(UserWarning, match="File does not exist on AWS:"): + ds = open_mfdataset(dates) + assert ds.sizes["time"] == 1 + assert ds["time"] == pd.to_datetime(dates[-1]) + + with pytest.raises(ValueError, match="File does not exist on AWS:"): + _ = open_mfdataset(dates, error_missing=True) + + with pytest.raises(ValueError, match="Files not available for product and dates"), pytest.warns( + UserWarning, match="File does not exist on AWS:" + ): + _ = open_mfdataset(dates[:1], error_missing=False)