
Commit

merge in main
callumrollo committed Aug 7, 2024
2 parents 24b8de0 + 96fb528 commit 0740359
Showing 90 changed files with 1,125 additions and 419 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/tests.yml
@@ -7,8 +7,7 @@ jobs:
strategy:
matrix:
os: ["ubuntu-latest"]
python-version: ["3.7", "3.8", "3.9"]

python-version: ["3.9", "3.10"]
steps:
- uses: actions/checkout@v2
- name: Cache conda
@@ -22,15 +21,15 @@ jobs:
${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{hashFiles('environment.yml') }}
- uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: pyglider
activate-environment: pyglider-test
environment-file: tests/environment.yml
python-version: ${{ matrix.python-version }}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
- name: Conda info
shell: bash -l {0}
run: conda info
- name: install pyglider
run: conda info; conda list
- name: install pyglider source
shell: bash -l {0}
run: which pip; pip install -e .
- name: Process seaexplorer
1 change: 1 addition & 0 deletions docs-requirements.txt
@@ -13,3 +13,4 @@ autoapi
numpydoc
myst-parser
dbdreader
polars>=0.16
5 changes: 2 additions & 3 deletions environment.yml
@@ -1,9 +1,8 @@
name: pyglider
channels:
- conda-forge
- defaults
dependencies:
- python
- python>=3.10
- numpy
- pip
- xarray
@@ -13,6 +12,6 @@ dependencies:
- scipy
- bitstring
- pooch
- polars
- pip:
- dbdreader
- polars
150 changes: 118 additions & 32 deletions pyglider/ncprocess.py
@@ -14,7 +14,7 @@
_log = logging.getLogger(__name__)


def extract_timeseries_profiles(inname, outdir, deploymentyaml):
def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False):
"""
Extract and save each profile from a timeseries netCDF.
@@ -29,16 +29,18 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
deploymentyaml : str or Path
location of deployment yaml file for the netCDF file. This should
be the same yaml file that was used to make the timeseries file.
force : bool, default False
Force an overwrite even if the profile netCDF already exists
"""
try:
os.mkdir(outdir)
except FileExistsError:
pass

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
meta = deployment['metadata']
deployment = utils._get_deployment(deploymentyaml)

meta = deployment['metadata']
with xr.open_dataset(inname) as ds:
_log.info('Extracting profiles: opening %s', inname)
profiles = np.unique(ds.profile_index)
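
A minimal usage sketch for the new force flag, with hypothetical paths (force=True re-extracts profiles even when the per-profile netCDF files already exist):

    from pyglider import ncprocess

    # All paths below are hypothetical placeholders.
    ncprocess.extract_timeseries_profiles(
        'timeseries/deployment_timeseries.nc',  # timeseries netCDF input
        'profiles/',                            # output directory
        'deployment.yaml',                      # same yaml used for the timeseries
        force=True)                             # overwrite existing profile files
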
@@ -49,7 +51,7 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
dss = ds.isel(time=ind)
outname = outdir + '/' + utils.get_file_id(dss) + '.nc'
_log.info('Checking %s', outname)
if not os.path.exists(outname):
if force or (not os.path.exists(outname)):
# this is the id for the whole file, not just this profile.
dss['trajectory'] = utils.get_file_id(ds).encode()
trajlen = len(utils.get_file_id(ds).encode())
@@ -67,19 +69,40 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):

dss['v'] = dss.water_velocity_northward.mean()
dss['v'].attrs = profile_meta['v']
elif 'u' in profile_meta:
dss['u'] = profile_meta['u'].get('_FillValue', np.NaN)
dss['u'].attrs = profile_meta['u']

dss['v'] = profile_meta['v'].get('_FillValue', np.NaN)
dss['v'].attrs = profile_meta['v']
else:
dss['u'] = np.NaN
dss['v'] = np.NaN


dss['profile_id'] = np.array(p*1.0)
dss['profile_id'] = np.int32(p)
dss['profile_id'].attrs = profile_meta['profile_id']
if '_FillValue' not in dss['profile_id'].attrs:
dss['profile_id'].attrs['_FillValue'] = -1
dss['profile_id'].attrs['valid_min'] = np.int32(dss['profile_id'].attrs['valid_min'])
dss['profile_id'].attrs['valid_max'] = np.int32(dss['profile_id'].attrs['valid_max'])

dss['profile_time'] = dss.time.mean()
dss['profile_time'].attrs = profile_meta['profile_time']
# remove units so they can be encoded later:
try:
del dss.profile_time.attrs['units']
del dss.profile_time.attrs['calendar']
except KeyError:
pass
dss['profile_lon'] = dss.longitude.mean()
dss['profile_lon'].attrs = profile_meta['profile_lon']
dss['profile_lat'] = dss.latitude.mean()
dss['profile_lat'].attrs = profile_meta['profile_lat']

dss['lat'] = dss['latitude']
dss['lon'] = dss['longitude']
dss['platform'] = np.NaN
dss['platform'] = np.int32(1)
comment = (meta['glider_model'] + ' operated by ' +
meta['institution'])
dss['platform'].attrs['comment'] = comment
@@ -90,6 +113,9 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
meta['glider_model'] + dss['platform'].attrs['id'])
dss['platform'].attrs['type'] = 'platform'
dss['platform'].attrs['wmo_id'] = meta['wmo_id']
if '_FillValue' not in dss['platform'].attrs:
dss['platform'].attrs['_FillValue'] = -1


dss['lat_uv'] = np.NaN
dss['lat_uv'].attrs = profile_meta['lat_uv']
@@ -98,32 +124,51 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
dss['time_uv'] = np.NaN
dss['time_uv'].attrs = profile_meta['time_uv']

dss['instrument_ctd'] = np.NaN
dss['instrument_ctd'] = np.int32(1.0)
dss['instrument_ctd'].attrs = profile_meta['instrument_ctd']
if '_FillValue' not in dss['instrument_ctd'].attrs:
dss['instrument_ctd'].attrs['_FillValue'] = -1

dss.attrs['date_modified'] = str(np.datetime64('now')) + 'Z'

# ancillary variables::
# ancillary variables: link and create with values of 2. If
# we don't want them all 2, then create these variables in the
# time series
to_fill = ['temperature', 'pressure', 'conductivity',
'salinity', 'density', 'lon', 'lat', 'depth']
'salinity', 'density', 'lon', 'lat', 'depth']
for name in to_fill:
dss[name].attrs['ancillary_variables'] = name + '_qc'
qcname = name + '_qc'
dss[name].attrs['ancillary_variables'] = qcname
if qcname not in dss.keys():

dss[qcname] = ('time', 2 * np.ones(len(dss[name]), np.int8))
dss[qcname].attrs = utils.fill_required_qcattrs({}, name)
# 2 is "not eval"
# outname = outdir + '/' + utils.get_file_id(dss) + '.nc'
_log.info('Writing %s', outname)
timeunits = 'seconds since 1970-01-01T00:00:00Z'
timecalendar = 'gregorian'
try:
del dss.profile_time.attrs['_FillValue']
del dss.profile_time.attrs['units']
except KeyError:
pass
dss.to_netcdf(outname, encoding={'time': {'units': timeunits,
'calendar': timecalendar},
'profile_time':
{'units': timeunits}})
'calendar': timecalendar,
'dtype': 'float64'},
'profile_time':
{'units': timeunits,
'_FillValue': -99999.0,
'dtype': 'float64'},
}

)

# add traj_strlen using bare netCDF4 to make IOOS happy
with netCDF4.Dataset(outname, 'r+') as nc:
nc.renameDimension('string%d' % trajlen, 'traj_strlen')
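
The *_qc variables created above are initialised to flag value 2, i.e. "not evaluated" in the IOOS QARTOD flag scheme (1 pass, 2 not evaluated, 3 suspect, 4 fail, 9 missing). A short sketch of inspecting those flags in one extracted profile file, assuming a hypothetical file name:

    import numpy as np
    import xarray as xr

    # Hypothetical output file written by extract_timeseries_profiles.
    with xr.open_dataset('profiles/glider_profile_0001.nc') as dss:
        # Until real QC is applied, every flag should still be 2 ("not eval").
        print(np.unique(dss['temperature_qc'].values))
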


def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01'):
"""
Turn a timeseries netCDF file into a vertically gridded netCDF.
@@ -154,11 +199,12 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
except FileExistsError:
pass

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

profile_meta = deployment['profile_variables']

ds = xr.open_dataset(inname)
ds = xr.open_dataset(inname, decode_times=True)
ds = ds.where(ds.time > np.datetime64(starttime), drop=True)
_log.info(f'Working on: {inname}')
_log.debug(str(ds))
_log.debug(str(ds.time[0]))
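
A usage sketch for the extended make_gridfiles signature, with hypothetical paths (the keyword-only arguments are the new part):

    from pyglider import ncprocess

    # Hypothetical paths. dz sets the depth-bin size in metres, starttime
    # drops samples before the given date, and fnamesuffix is appended to
    # the '_grid' output name before '.nc'.
    outname = ncprocess.make_gridfiles(
        'timeseries/deployment_timeseries.nc', 'gridfiles/', 'deployment.yaml',
        fnamesuffix='_delayed', dz=5, starttime='2024-01-01')
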
@@ -168,17 +214,24 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1)
and (p > 0))]
profile_bins = np.hstack((np.array(profiles) - 0.5, [profiles[-1]+0.5]))

_log.debug(profile_bins)
Nprofiles = len(profiles)
_log.info(f'Nprofiles {Nprofiles}')
depth_bins = np.arange(0, 1100.1, dz)
depths = depth_bins[:-1] + 0.5

xdimname = 'time'
dsout = xr.Dataset(
coords={'depth': ('depth', depths),
'profile': ('time', profiles)})
'profile': (xdimname, profiles)})
dsout['depth'].attrs = {'units': 'm',
'long_name': 'Depth',
'standard_name': 'depth',
'positive': 'down',
'coverage_content_type': 'coordinate',
'comment': 'center of depth bins'}

ds['time_1970'] = ds.temperature.copy()
ds['time_1970'].values = ds.time.values.astype(np.float64)/1e9
ds['time_1970'].values = ds.time.values.astype(np.float64)
for td in ('time_1970', 'longitude', 'latitude'):
good = np.where(~np.isnan(ds[td]) & (ds['profile_index'] % 1 == 0))[0]
dat, xedges, binnumber = stats.binned_statistic(
@@ -187,19 +240,19 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
bins=[profile_bins])
if td == 'time_1970':
td = 'time'
dat = dat.astype('timedelta64[s]') + np.datetime64('1970-01-01T00:00:00')
dat = dat.astype('timedelta64[ns]') + np.datetime64('1970-01-01T00:00:00')
_log.info(f'{td} {len(dat)}')
dsout[td] = (('time'), dat, ds[td].attrs)
ds.drop('time_1970')
good = np.where(~np.isnan(ds['time']) & (ds['profile_index'] % 1 == 0))[0]
_log.info(f'Done times! {len(dat)}')
dsout['profile_time_start'] = (
('time'), dat, profile_meta['profile_time_start'])
(xdimname), dat, profile_meta['profile_time_start'])
dsout['profile_time_end'] = (
('time'), dat, profile_meta['profile_time_end'])
(xdimname), dat, profile_meta['profile_time_end'])

for k in ds.keys():
if k in ['time', 'longitude', 'latitude', 'depth'] or 'time' in k:
if k in ['time', 'profile', 'longitude', 'latitude', 'depth'] or 'time' in k:
continue
_log.info('Gridding %s', k)
good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0]
@@ -216,15 +269,14 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
"scipy.stats.gmean")
else:
average_method = "mean"

dat, xedges, yedges, binnumber = stats.binned_statistic_2d(
ds['profile_index'].values[good],
ds['depth'].values[good],
values=ds[k].values[good], statistic=average_method,
bins=[profile_bins, depth_bins])

_log.debug(f'dat{np.shape(dat)}')
dsout[k] = (('depth', 'time'), dat.T, ds[k].attrs)
dsout[k] = (('depth', xdimname), dat.T, ds[k].attrs)

# fill gaps in data:
dsout[k].values = utils.gappy_fill_vertical(dsout[k].values)
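
For reference, a self-contained sketch of the binning technique used above, on synthetic data (all names are local to the example):

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    # Synthetic scatter: profile index, depth (m), and a measured value.
    profile_index = rng.integers(1, 4, 1000).astype(float)
    depth = rng.uniform(0, 100, 1000)
    temperature = 20.0 - 0.1 * depth + rng.normal(0, 0.1, 1000)

    profile_bins = np.array([0.5, 1.5, 2.5, 3.5])
    depth_bins = np.arange(0, 101, 10)

    # Mean of each (profile, depth-bin) cell, as in the gridding loop above.
    grid, xedges, yedges, binnumber = stats.binned_statistic_2d(
        profile_index, depth, values=temperature, statistic='mean',
        bins=[profile_bins, depth_bins])
    print(grid.shape)  # (3, 10): 3 profiles by 10 depth bins
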
@@ -240,11 +292,45 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
dsout = dsout.drop(['water_velocity_eastward',
'water_velocity_northward'])
dsout.attrs = ds.attrs
dsout.attrs.pop('cdm_data_type')
# fix to be ISO parsable:
if len(dsout.attrs['deployment_start']) > 18:
dsout.attrs['deployment_start'] = dsout.attrs['deployment_start'][:19]
dsout.attrs['deployment_end'] = dsout.attrs['deployment_end'][:19]
dsout.attrs['time_coverage_start'] = dsout.attrs['time_coverage_start'][:19]
dsout.attrs['time_coverage_end'] = dsout.attrs['time_coverage_end'][:19]
# fix standard_name so they don't overlap!
try:
dsout['waypoint_latitude'].attrs.pop('standard_name')
dsout['waypoint_longitude'].attrs.pop('standard_name')
dsout['profile_time_start'].attrs.pop('standard_name')
dsout['profile_time_end'].attrs.pop('standard_name')
except:
pass
# set some attributes for cf guidance
# see H.6.2. Profiles along a single trajectory
# https://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/aphs06.html
dsout.attrs['featureType'] = 'trajectoryProfile'
dsout['profile'].attrs['cf_role'] = 'profile_id'
dsout['mission_number'] = int(1)
dsout['mission_number'].attrs['cf_role'] = 'trajectory_id'
dsout = dsout.set_coords(['latitude', 'longitude', 'time'])
for k in dsout:
if k in ['profile', 'depth', 'latitude', 'longitude', 'time', 'mission_number']:
dsout[k].attrs['coverage_content_type'] = 'coordinate'
else:
dsout[k].attrs['coverage_content_type'] = 'physicalMeasurement'


outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid.nc'
outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid' + fnamesuffix + '.nc'
_log.info('Writing %s', outname)
timeunits = 'seconds since 1970-01-01T00:00:00Z'
dsout.to_netcdf(outname, encoding={'time': {'units': timeunits}})
# timeunits = 'nanoseconds since 1970-01-01T00:00:00Z'
dsout.to_netcdf(
outname,
encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z',
'_FillValue': np.NaN,
'calendar': 'gregorian',
'dtype': 'float64'}})
_log.info('Done gridding')

return outname