SDAP-498 - Satellite Units #296

Draft: wants to merge 98 commits into base branch develop

Commits (98):
6b1b7cc  Separated NTS backends (Jun 29, 2023)
4f3f611  n/a (Jul 5, 2023)
e32d5ad  More nts backend stuff (Jul 6, 2023)
ccc0de4  Working(?) np backend (Jul 10, 2023)
743fb1d  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Jul 10, 2023)
b77aa11  Working(?) np backend (Jul 10, 2023)
4ccec2e  gitignore ini (Jul 10, 2023)
736a44e  ASF headers (Jul 10, 2023)
70bdab1  First functioning test of 2 simultaneous backends (Jul 11, 2023)
f3981cd  Removed accidentally committed ini files (Jul 12, 2023)
26f6220  Working zarr backend ds list (Jul 12, 2023)
91de6ef  Capture and handle NTS requests routed to backend that doesn't (yet) … (Jul 12, 2023)
df23919  analysis setup fails to find VERSION.txt when building locally (Jul 12, 2023)
07404f0  Implemented more NTS functions in zarr backend (Jul 12, 2023)
72888aa  Added misc backend time metrics record field in NCSH (Jul 12, 2023)
1c4a0e4  fixes (Jul 13, 2023)
0a7cd7f  Dynamic dataset management (Jul 17, 2023)
c8e7dbb  Dynamic dataset management (Jul 18, 2023)
e78f7ad  Dataset management (Jul 20, 2023)
a84d77e  Timeseriesspark support (Jul 27, 2023)
53190e2  Update backend dict on dataset mgmt query (Jul 31, 2023)
2e7a0dc  Fixes and improvements (Jul 31, 2023)
0869375  Adapted matchup to work with zarr backends (Jul 31, 2023)
c156826  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Jul 31, 2023)
1eb680b  Zarr support (Aug 1, 2023)
0aef0f1  DDAS adjustments (Aug 2, 2023)
42b912e  find_tile_by_polygon_and_most_recent_day_of_year impl (Aug 3, 2023)
1559fba  Don't sel by time if neither max nor min time are given (Aug 8, 2023)
2bb52af  Fix not calling partial when needed (Aug 15, 2023)
f9dc2ae  Pinned s3fs and fsspec versions (Aug 18, 2023)
a6f602d  Fixed some dependencies to ensure image builds properly + s3fs works (Aug 18, 2023)
1a451eb  Config override for backends (Aug 21, 2023)
6f8f7b1  Deps update (Aug 21, 2023)
5baf9ec  Merge remote-tracking branch 'RKuttruff/master' into SDAP-472-gridded… (Aug 21, 2023)
8cc9d5d  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Aug 22, 2023)
492be4b  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Aug 23, 2023)
483ad9f  Add metadata from Zarr collection to /list (Aug 31, 2023)
4b24ec3  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Sep 6, 2023)
6077ac2  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Sep 7, 2023)
0d3c0fc  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Sep 14, 2023)
8d51337  Merge branch 'apache:master' into SDAP-472-gridded-zarr (RKuttruff, Sep 14, 2023)
f5750c3  Zarr: Probe lat order and flip if necessary (Sep 14, 2023)
7fc260a  Strip quotes from variable names (Sep 20, 2023)
b5df944  removed resultSizeLimit param from matchup (skorper, Sep 25, 2023)
5e0fbb2  Add # of primaries/avergae secondaries to job output (skorper, Sep 25, 2023)
fbad6b7  rename to executionId (skorper, Sep 25, 2023)
e0a5999  update changelog (skorper, Sep 25, 2023)
8942afc  add totalSecondaryMatched field to /job output (skorper, Sep 29, 2023)
dd73036  num unique secondaries addition (skorper, Sep 29, 2023)
db68d4f  updated docs to use correct sea_water_temperature param name (skorper, Oct 13, 2023)
7e11a4c  Merge remote-tracking branch 'origin' into SDAP-493 (skorper, Nov 1, 2023)
a8be9b8  bugfix (skorper, Nov 1, 2023)
62de867  fix division by zero bug (skorper, Nov 6, 2023)
972f3dd  add params to dataset management handler classes (Nov 8, 2023)
9f0a107  Merge remote-tracking branch 'origin/master' into SDAP-472-gridded-zarr (Nov 8, 2023)
831ca37  add page number to default filename for matchup output (skorper, Nov 16, 2023)
4ab2f9b  pagination improvements (skorper, Nov 16, 2023)
3677c11  removed debugging line (skorper, Nov 16, 2023)
86f1348  changelog (skorper, Nov 16, 2023)
1e8cc4e  Update helm cassandra dependency (#289) (RKuttruff, Nov 27, 2023)
faed801  Register dataset docs with nexusproto backend + static getters (Dec 14, 2023)
20902eb  Matchup impl (Dec 14, 2023)
1af0c41  Add vars to headers in CDMS subsetter (Dec 18, 2023)
8a069db  Add units to all matchup result formats (Dec 21, 2023)
0c39b07  Formatting for units in subsetter headers (Dec 21, 2023)
32ca3d7  stac catalog (skorper, Jan 5, 2024)
3563ae9  Updated openapi spec (skorper, Jan 6, 2024)
0691d87  move stac endpoints to matchup tag in openapi spec (skorper, Jan 6, 2024)
61e6223  Meta field in matchup result - all formats (Jan 8, 2024)
e02fc78  SDAP-507 - Changes to remove geos sub-dependency (Jan 11, 2024)
51231ca  SDAP-507 - Changelog (Jan 11, 2024)
5c75573  SDAP-507 - Changes to remove geos sub-dependency (Jan 11, 2024)
7f717c0  SDAP-507 - Changelog (Jan 11, 2024)
2b6efa6  Merge remote-tracking branch 'origin/SDAP-507' into SDAP-507 (Jan 11, 2024)
9779f40  delete instead of comment out (Jan 19, 2024)
3e700b8  Merge branch 'SDAP-500' into SDAP-499 (skorper, Jan 19, 2024)
9378760  Revert "Update helm cassandra dependency (#289)" (skorper, Jan 19, 2024)
092c87b  Merge branch 'SDAP-499' into SDAP-506 (skorper, Jan 19, 2024)
e6730eb  Merge branch 'release/1.2.0' into SDAP-506 (skorper, Jan 19, 2024)
3aafb7e  Merge remote-tracking branch 'origin/SDAP-506' into SDAP-472-gridded-… (Jan 19, 2024)
8d9bf73  Merge branch 'SDAP-472-gridded-zarr' into SDAP-498-satellite-units (Jan 19, 2024)
5303146  deleted disabled endpoint files (Jan 19, 2024)
d6a75e3  Merge branch 'SDAP-507' into SDAP-472-gridded-zarr (Jan 19, 2024)
f8983a0  Merge branch 'SDAP-507' into SDAP-498-satellite-units (Jan 19, 2024)
2ed29fd  Merge branch 'develop' into SDAP-493 (skorper, Jan 19, 2024)
2a340dc  Merge branch 'SDAP-493' into SDAP-500 (skorper, Jan 19, 2024)
718067c  Merge branch 'SDAP-500' into SDAP-499 (skorper, Jan 19, 2024)
681ba5a  Merge branch 'SDAP-499' into SDAP-506 (skorper, Jan 19, 2024)
935000b  Merge branch 'release/1.2.0' into SDAP-472-gridded-zarr (Jan 19, 2024)
ee5e5c8  fix bug where still-running jobs failed /job endpoint due to missing … (skorper, Jan 25, 2024)
0f388a3  Merge branch 'release/1.2.0' into SDAP-506 (skorper, Jan 25, 2024)
6bd5f0e  Merge remote-tracking branch 'origin/SDAP-506' into SDAP-472-gridded-… (Jan 29, 2024)
2c7a803  Merge branch 'SDAP-472-gridded-zarr' into SDAP-498-satellite-units (Jan 29, 2024)
40a80e2  Don't write an empty row between meta blocks in CSV writer (Jan 29, 2024)
322fd68  Merge remote-tracking branch 'origin/develop' into SDAP-498-satellite… (Feb 1, 2024)
34b7b95  Moved changelog entries (Feb 1, 2024)
6de825b  SDAP-472 changelog entries (Feb 1, 2024)
2aaf07b  SDAP-498 changelog entries (Feb 1, 2024)
Files changed:
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,5 +4,6 @@
 *.idea
 *.DS_Store
 analysis/webservice/algorithms/doms/domsconfig.ini
-data-access/nexustiles/config/datastores.ini
+data-access/nexustiles/backends/nexusproto/config/datastores.ini
+data-access/nexustiles/config/datasets.ini
 venv/
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -8,12 +8,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - SDAP-506:
   - Added STAC Catalog endpoint for matchup outputs
+- SDAP-472:
+  - Support for Zarr backend (gridded data only)
+  - Dataset management endpoints for Zarr datasets
+- SDAP-498: Support for satellite units & other dataset-level metadata
 ### Changed
 - SDAP-493:
   - Updated /job endpoint to use `executionId` terminology for consistency with existing `/cdmsresults` endpoint
   - Updated /job endpoint with details about number of primary and secondary tiles.
 - SDAP-500: Improvements to SDAP Asynchronous Jobs
 - SDAP-499: Added page number to default filename for matchup output
+- SDAP-472: Overhauled `data-access` to support multiple backends for simultaneous support of multiple ARD formats
 ### Deprecated
 ### Removed
 - SDAP-493:
5 changes: 3 additions & 2 deletions analysis/conda-requirements.txt
@@ -22,7 +22,8 @@ pytz==2021.1
 utm==0.6.0
 shapely==1.7.1
 backports.functools_lru_cache==1.6.1
-boto3==1.16.63
+boto3>=1.16.63
+botocore==1.24.21
 pillow==8.1.0
 mpld3=0.5.1
 tornado==6.1
@@ -33,4 +34,4 @@ gdal==3.2.1
 mock==4.0.3
 importlib_metadata==4.11.4
 #singledispatch==3.4.0.3
-
+schema
7 changes: 5 additions & 2 deletions analysis/setup.py
@@ -17,8 +17,11 @@
 import setuptools
 from subprocess import check_call, CalledProcessError
 
-with open('../VERSION.txt', 'r') as f:
-    __version__ = f.read()
+try:
+    with open('../VERSION.txt', 'r') as f:
+        __version__ = f.read()
+except:
+    __version__ = None
 
 
 try:
3 changes: 2 additions & 1 deletion analysis/webservice/algorithms/DailyDifferenceAverage.py
@@ -21,7 +21,8 @@
 
 import numpy as np
 import pytz
-from nexustiles.nexustiles import NexusTileService, NexusTileServiceException
+from nexustiles.nexustiles import NexusTileService
+from nexustiles.exception import NexusTileServiceException
 from shapely.geometry import box
 
 from webservice.NexusHandler import nexus_handler
2 changes: 1 addition & 1 deletion analysis/webservice/algorithms/StandardDeviationSearch.py
@@ -19,7 +19,7 @@
 from datetime import datetime
 from functools import partial
 
-from nexustiles.nexustiles import NexusTileServiceException
+from nexustiles.exception import NexusTileServiceException
 from pytz import timezone
 
 from webservice.NexusHandler import nexus_handler
70 changes: 61 additions & 9 deletions analysis/webservice/algorithms/doms/BaseDomsHandler.py
@@ -78,14 +78,15 @@ def default(self, obj):
 
 class DomsQueryResults(NexusResults):
     def __init__(self, results=None, args=None, bounds=None, count=None, details=None, computeOptions=None,
-                 executionId=None, status_code=200, page_num=None, page_size=None):
-        NexusResults.__init__(self, results=results, meta=None, stats=None, computeOptions=computeOptions,
+                 executionId=None, status_code=200, page_num=None, page_size=None, meta=None):
+        NexusResults.__init__(self, results=results, meta=meta, stats=None, computeOptions=computeOptions,
                               status_code=status_code)
         self.__args = args
         self.__bounds = bounds
         self.__count = count
         self.__details = details
         self.__executionId = str(executionId)
+        self.__meta = meta if meta is not None else {}
 
         if self.__details is None:
             self.__details = {}
@@ -98,26 +99,27 @@ def toJson(self):
         bounds = self.__bounds.toMap() if self.__bounds is not None else {}
         return json.dumps(
             {"executionId": self.__executionId, "data": self.results(), "params": self.__args, "bounds": bounds,
-             "count": self.__count, "details": self.__details}, indent=4, cls=DomsEncoder)
+             "count": self.__count, "details": self.__details, "metadata": self.__meta}, indent=4, cls=DomsEncoder)
 
     def toCSV(self):
-        return DomsCSVFormatter.create(self.__executionId, self.results(), self.__args, self.__details)
+        return DomsCSVFormatter.create(self.__executionId, self.results(), self.__args, self.__details, self.__meta)
 
     def toNetCDF(self):
-        return DomsNetCDFFormatter.create(self.__executionId, self.results(), self.__args, self.__details)
+        return DomsNetCDFFormatter.create(self.__executionId, self.results(), self.__args, self.__details, self.__meta)
 
     def filename(self):
         return f'CDMS_{self.__executionId}_page{self.__details["pageNum"]}'
 
 
 class DomsCSVFormatter:
     @staticmethod
-    def create(executionId, results, params, details):
+    def create(executionId, results, params, details, metadata):
 
         csv_mem_file = io.StringIO()
         try:
             DomsCSVFormatter.__addConstants(csv_mem_file)
             DomsCSVFormatter.__addDynamicAttrs(csv_mem_file, executionId, results, params, details)
+            DomsCSVFormatter.__addMetadata(csv_mem_file, metadata)
             csv.writer(csv_mem_file).writerow([])
 
             DomsCSVFormatter.__packValues(csv_mem_file, results)
@@ -135,7 +137,11 @@ def is_empty(s):
 
             name = variable['cf_variable_name']
 
-            return name if not is_empty(name) else variable['variable_name']
+            header_name = name if not is_empty(name) else variable['variable_name']
+
+            unit = variable.get('variable_unit', None)
+
+            return f'{header_name} ({unit})' if unit is not None else header_name
 
     @staticmethod
     def __packValues(csv_mem_file, results):
@@ -288,10 +294,31 @@ def __addDynamicAttrs(csvfile, executionId, results, params, details):
 
         writer.writerows(global_attrs)
 
+    @staticmethod
+    def __addMetadata(csvfile, meta):
+        def meta_dict_to_list(meta_dict: dict, prefix='metadata') -> list:
+            attrs = []
+
+            for key in meta_dict:
+                new_key = key if prefix == '' else f'{prefix}.{key}'
+                value = meta_dict[key]
+
+                if isinstance(value, dict):
+                    attrs.extend(meta_dict_to_list(value, new_key))
+                else:
+                    attrs.append(dict(MetadataAttribute=new_key, Value=value))
+
+            return attrs
+
+        metadata_attrs = meta_dict_to_list(meta)
+
+        writer = csv.DictWriter(csvfile, sorted(next(iter(metadata_attrs)).keys()))
+        writer.writerows(metadata_attrs)
+
 
 class DomsNetCDFFormatter:
     @staticmethod
-    def create(executionId, results, params, details):
+    def create(executionId, results, params, details, metadata):
 
         t = tempfile.mkstemp(prefix="cdms_", suffix=".nc")
         tempFileName = t[1]
@@ -335,6 +362,30 @@ def create(executionId, results, params, details):
         dataset.CDMS_page_num = details["pageNum"]
         dataset.CDMS_page_size = details["pageSize"]
 
+        ####TEST
+
+        def meta_dict_to_list(meta_dict: dict, prefix='metadata') -> list:
+            attrs = []
+
+            for key in meta_dict:
+                new_key = key if prefix == '' else f'{prefix}.{key}'
+                value = meta_dict[key]
+
+                if value is None:
+                    value = 'NULL'
+                elif isinstance(value, list):
+                    value = json.dumps(value)
+
+                if isinstance(value, dict):
+                    attrs.extend(meta_dict_to_list(value, new_key))
+                else:
+                    attrs.append((new_key, value))
+
+            return attrs
+
+        for attr in meta_dict_to_list(metadata):
+            setattr(dataset, *attr)
+
         insituDatasets = params["matchup"]
         insituLinks = set()
         for insitu in insituDatasets:
@@ -534,7 +585,8 @@ def writeGroup(self):
             self.__enrichVariable(data_variable, min_data, max_data, has_depth=None, unit=units[variable])
             data_variable[:] = np.ma.masked_invalid(variables[variable])
             data_variable.long_name = name
-            data_variable.standard_name = cf_name
+            if cf_name:
+                data_variable.standard_name = cf_name
 
             #
             # Lists may include 'None" values, to calc min these must be filtered out
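For context, a minimal standalone sketch of the recursive flattening that __addMetadata performs above; the nested input dict and the dataset name 'EXAMPLE_SST' are hypothetical, not taken from a real collection:

# Sketch of the flattening used by the CSV writer's __addMetadata above.
# The input dict and dataset name are hypothetical.
def meta_dict_to_list(meta_dict: dict, prefix='metadata') -> list:
    attrs = []
    for key in meta_dict:
        new_key = key if prefix == '' else f'{prefix}.{key}'
        value = meta_dict[key]
        if isinstance(value, dict):
            attrs.extend(meta_dict_to_list(value, new_key))
        else:
            attrs.append(dict(MetadataAttribute=new_key, Value=value))
    return attrs

meta = {'datasets': {'primary': {'EXAMPLE_SST': {'variable_unit': 'kelvin'}}}}
print(meta_dict_to_list(meta))
# [{'MetadataAttribute': 'metadata.datasets.primary.EXAMPLE_SST.variable_unit',
#   'Value': 'kelvin'}]

The NetCDF variant of the same helper first maps None to the string 'NULL' and JSON-encodes list values, presumably because netCDF global attributes cannot hold None or nested structures, then writes each flattened (name, value) pair with setattr(dataset, *attr).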
24 changes: 23 additions & 1 deletion analysis/webservice/algorithms/doms/ResultsRetrieval.py
@@ -19,6 +19,11 @@
 from . import ResultsStorage
 from webservice.NexusHandler import nexus_handler
 from webservice.webmodel import NexusProcessingException
+from nexustiles.nexustiles import NexusTileService
+
+import logging
+
+log = logging.getLogger(__name__)
 
 
 @nexus_handler
@@ -48,5 +53,22 @@ def calc(self, computeOptions, **args):
         with ResultsStorage.ResultsRetrieval(self.config) as storage:
             params, stats, data = storage.retrieveResults(execution_id, trim_data=simple_results, page_num=page_num, page_size=page_size)
 
+        try:
+            ds_metadata = {}
+            ds_meta_primary_name = params['primary']
+
+            primary_metadata = NexusTileService.get_metadata_for_dataset(ds_meta_primary_name)
+
+            ds_metadata['primary'] = {ds_meta_primary_name: primary_metadata}
+
+            ds_metadata['secondary'] = {}
+
+            for secondary_ds_name in params['matchup'].split(','):
+                ds_metadata['secondary'][secondary_ds_name] = NexusTileService.get_metadata_for_dataset(secondary_ds_name)
+        except:
+            log.warning('Could not build dataset metadata dict due to an error')
+            ds_metadata = {}
+
         return BaseDomsHandler.DomsQueryResults(results=data, args=params, details=stats, bounds=None, count=len(data),
-                                                computeOptions=None, executionId=execution_id, page_num=page_num, page_size=page_size)
+                                                computeOptions=None, executionId=execution_id, page_num=page_num,
+                                                page_size=page_size, meta=dict(datasets=ds_metadata))
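For illustration, the meta payload built above takes roughly the following shape; the dataset names are hypothetical, and each inner dict is whatever NexusTileService.get_metadata_for_dataset returns for that dataset:

# Approximate shape of meta=dict(datasets=ds_metadata); names are hypothetical.
meta = {
    'datasets': {
        'primary': {
            'EXAMPLE_PRIMARY_SST': {}    # metadata for params['primary']
        },
        'secondary': {
            'EXAMPLE_INSITU_A': {},      # one entry per comma-separated
            'EXAMPLE_INSITU_B': {}       # name in params['matchup']
        }
    }
}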
34 changes: 31 additions & 3 deletions analysis/webservice/algorithms/doms/subsetter.py
@@ -24,6 +24,8 @@
 from webservice.algorithms.doms.insitu import query_insitu
 from webservice.webmodel import NexusProcessingException, NexusResults
 
+from nexustiles.nexustiles import NexusTileService
+
 from . import BaseDomsHandler
 
 ISO_8601 = '%Y-%m-%dT%H:%M:%S%z'
@@ -302,20 +304,46 @@ def toCsv(self):
         logging.info('Converting result to CSV')
 
         for dataset_name, results in dataset_results.items():
+            try:
+                ds_metadata = NexusTileService.get_metadata_for_dataset(dataset_name)
+            except:
+                ds_metadata = {}
+
+            ds_vars = ds_metadata.get('variables', [])
+
+            variable_dict = {}
+            variable_dict_cf = {}
+
+            for v in ds_vars:
+                variable_dict[v['name']] = v
+                variable_dict_cf[v['cf_standard_name']] = v
+
             rows = []
 
             headers = [
                 'longitude',
                 'latitude',
                 'time'
             ]
-            data_variables = list(set([keys for result in results for keys in result['data'].keys()]))
-            data_variables.sort()
+
+            data_variables = []
+            data_variable_headers = []
+
+            for dv in sorted(list(set([keys for result in results for keys in result['data'].keys()]))):
+                data_variables.append(dv)
+
+                if dv in variable_dict_cf and variable_dict_cf[dv]["unit"] is not None:
+                    data_variable_headers.append(f'{dv} ({variable_dict_cf[dv]["unit"]})')
+                elif dv in variable_dict and variable_dict[dv]["unit"] is not None:
+                    data_variable_headers.append(f'{dv} ({variable_dict[dv]["unit"]})')
+                else:
+                    data_variable_headers.append(dv)
 
             if 'id' in list(set([keys for result in results for keys in result.keys()])):
                 headers.append('id')
 
-            headers.extend(data_variables)
+            headers.extend(data_variable_headers)
 
             for i, result in enumerate(results):
                 cols = []
 
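As a quick check of the header logic above, a sketch with hypothetical dataset metadata: a variable whose metadata carries a unit gets the unit appended to its column header, while the others fall back to the bare name.

# Sketch of the unit-labeled header construction; metadata values hypothetical.
variable_dict_cf = {'sea_surface_temperature': {'unit': 'kelvin'}}
variable_dict = {'analysed_sst': {'unit': None}}

headers = ['longitude', 'latitude', 'time']
for dv in sorted(['sea_surface_temperature', 'analysed_sst']):
    if dv in variable_dict_cf and variable_dict_cf[dv]['unit'] is not None:
        headers.append(f"{dv} ({variable_dict_cf[dv]['unit']})")
    elif dv in variable_dict and variable_dict[dv]['unit'] is not None:
        headers.append(f"{dv} ({variable_dict[dv]['unit']})")
    else:
        headers.append(dv)

print(headers)
# ['longitude', 'latitude', 'time', 'analysed_sst', 'sea_surface_temperature (kelvin)']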
(file name not captured in page export)
@@ -324,7 +324,7 @@ def calculate_diff(tile_service_factory, tile_ids, bounding_wkt, dataset, climat
     for tile_id in tile_ids:
         # Get the dataset tile
         try:
-            dataset_tile = get_dataset_tile(tile_service, wkt.loads(bounding_wkt.value), tile_id)
+            dataset_tile = get_dataset_tile(tile_service, wkt.loads(bounding_wkt.value), tile_id, dataset.value)
         except NoDatasetTile:
             # This should only happen if all measurements in a tile become masked after applying the bounding polygon
             continue
@@ -348,12 +348,12 @@
     return chain(*diff_generators)
 
 
-def get_dataset_tile(tile_service, search_bounding_shape, tile_id):
+def get_dataset_tile(tile_service, search_bounding_shape, tile_id, dataset):
     the_time = datetime.now()
 
     try:
         # Load the dataset tile
-        dataset_tile = tile_service.find_tile_by_id(tile_id)[0]
+        dataset_tile = tile_service.find_tile_by_id(tile_id, ds=dataset)[0]
         # Mask it to the search domain
         dataset_tile = tile_service.mask_tiles_to_polygon(search_bounding_shape, [dataset_tile])[0]
     except IndexError:
8 changes: 4 additions & 4 deletions analysis/webservice/algorithms_spark/HofMoellerSpark.py
@@ -44,12 +44,12 @@ class HofMoellerCalculator(object):
     def hofmoeller_stats(tile_service_factory, metrics_callback, tile_in_spark):
 
         (latlon, tile_id, index,
-         min_lat, max_lat, min_lon, max_lon) = tile_in_spark
+         min_lat, max_lat, min_lon, max_lon, dataset) = tile_in_spark
 
         tile_service = tile_service_factory()
         try:
             # Load the dataset tile
-            tile = tile_service.find_tile_by_id(tile_id, metrics_callback=metrics_callback)[0]
+            tile = tile_service.find_tile_by_id(tile_id, metrics_callback=metrics_callback, ds=dataset)[0]
             calculation_start = datetime.now()
             # Mask it to the search domain
             tile = tile_service.mask_tiles_to_bbox(min_lat, max_lat,
@@ -352,7 +352,7 @@ def calc(self, compute_options, **args):
 
         min_lon, min_lat, max_lon, max_lat = bbox.bounds
 
-        nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon) for x, tile in
+        nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon, tile.dataset) for x, tile in
                              enumerate(self._get_tile_service().find_tiles_in_box(min_lat, max_lat, min_lon, max_lon,
                                                                                   ds, start_time, end_time,
                                                                                   metrics_callback=metrics_record.record_metrics,
@@ -408,7 +408,7 @@ def calc(self, compute_options, **args):
 
         min_lon, min_lat, max_lon, max_lat = bbox.bounds
 
-        nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon) for x, tile in
+        nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon, tile.dataset) for x, tile in
                              enumerate(self._get_tile_service().find_tiles_in_box(min_lat, max_lat, min_lon, max_lon,
                                                                                   ds, start_time, end_time,
                                                                                   metrics_callback=metrics_record.record_metrics,
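A small sketch of the widened Spark work item above: each tuple now carries the tile's dataset name so the executor-side find_tile_by_id call can be routed to the correct backend. All values below are hypothetical.

# Hypothetical work item matching the widened tuple in hofmoeller_stats.
tile_in_spark = ('latlon', 'example-tile-id', 0, -90.0, 90.0, -180.0, 180.0, 'EXAMPLE_DATASET')

(latlon, tile_id, index,
 min_lat, max_lat, min_lon, max_lon, dataset) = tile_in_spark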