
Commit

merge in main
callumrollo committed Aug 7, 2024
2 parents 24b8de0 + 96fb528 commit 0740359
Showing 90 changed files with 1,125 additions and 419 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/tests.yml
@@ -7,8 +7,7 @@ jobs:
strategy:
matrix:
os: ["ubuntu-latest"]
python-version: ["3.7", "3.8", "3.9"]

python-version: ["3.9", "3.10"]
steps:
- uses: actions/checkout@v2
- name: Cache conda
@@ -22,15 +21,15 @@ jobs:
${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{hashFiles('environment.yml') }}
- uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: pyglider
activate-environment: pyglider-test
environment-file: tests/environment.yml
python-version: ${{ matrix.python-version }}
channel-priority: strict
use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
- name: Conda info
shell: bash -l {0}
run: conda info
- name: install pyglider
run: conda info; conda list
- name: install pyglider source
shell: bash -l {0}
run: which pip; pip install -e .
- name: Process seaexplorer
1 change: 1 addition & 0 deletions docs-requirements.txt
@@ -13,3 +13,4 @@ autoapi
numpydoc
myst-parser
dbdreader
polars>=0.16
5 changes: 2 additions & 3 deletions environment.yml
@@ -1,9 +1,8 @@
name: pyglider
channels:
- conda-forge
- defaults
dependencies:
- python
- python>=3.10
- numpy
- pip
- xarray
@@ -13,6 +12,6 @@ dependencies:
- scipy
- bitstring
- pooch
- polars
- pip:
- dbdreader
- polars
150 changes: 118 additions & 32 deletions pyglider/ncprocess.py
@@ -14,7 +14,7 @@
_log = logging.getLogger(__name__)


def extract_timeseries_profiles(inname, outdir, deploymentyaml):
def extract_timeseries_profiles(inname, outdir, deploymentyaml, force=False):
"""
Extract and save each profile from a timeseries netCDF.
@@ -29,16 +29,18 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
deploymentyaml : str or Path
location of deployment yaml file for the netCDF file. This should
be the same yaml file that was used to make the timeseries file.
force : bool, default False
Force an overwrite even if the profile netCDF already exists
"""
try:
os.mkdir(outdir)
except FileExistsError:
pass

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
meta = deployment['metadata']
deployment = utils._get_deployment(deploymentyaml)

meta = deployment['metadata']
with xr.open_dataset(inname) as ds:
_log.info('Extracting profiles: opening %s', inname)
profiles = np.unique(ds.profile_index)
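
A minimal usage sketch for the new force flag, with hypothetical paths (force=True re-extracts profiles even when the per-profile netCDF files already exist):

    from pyglider import ncprocess

    # All paths below are hypothetical placeholders.
    ncprocess.extract_timeseries_profiles(
        'timeseries/deployment_timeseries.nc',  # timeseries netCDF input
        'profiles/',                            # output directory
        'deployment.yaml',                      # same yaml used for the timeseries
        force=True)                             # overwrite existing profile files
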
@@ -49,7 +51,7 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
dss = ds.isel(time=ind)
outname = outdir + '/' + utils.get_file_id(dss) + '.nc'
_log.info('Checking %s', outname)
if not os.path.exists(outname):
if force or (not os.path.exists(outname)):
# this is the id for the whole file, not just this profile.
dss['trajectory'] = utils.get_file_id(ds).encode()
trajlen = len(utils.get_file_id(ds).encode())
@@ -67,19 +69,40 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):

dss['v'] = dss.water_velocity_northward.mean()
dss['v'].attrs = profile_meta['v']
elif 'u' in profile_meta:
dss['u'] = profile_meta['u'].get('_FillValue', np.NaN)
dss['u'].attrs = profile_meta['u']

dss['v'] = profile_meta['v'].get('_FillValue', np.NaN)
dss['v'].attrs = profile_meta['v']
else:
dss['u'] = np.NaN
dss['v'] = np.NaN


dss['profile_id'] = np.array(p*1.0)
dss['profile_id'] = np.int32(p)
dss['profile_id'].attrs = profile_meta['profile_id']
if '_FillValue' not in dss['profile_id'].attrs:
dss['profile_id'].attrs['_FillValue'] = -1
dss['profile_id'].attrs['valid_min'] = np.int32(dss['profile_id'].attrs['valid_min'])
dss['profile_id'].attrs['valid_max'] = np.int32(dss['profile_id'].attrs['valid_max'])

dss['profile_time'] = dss.time.mean()
dss['profile_time'].attrs = profile_meta['profile_time']
# remove units so they can be encoded later:
try:
del dss.profile_time.attrs['units']
del dss.profile_time.attrs['calendar']
except KeyError:
pass
dss['profile_lon'] = dss.longitude.mean()
dss['profile_lon'].attrs = profile_meta['profile_lon']
dss['profile_lat'] = dss.latitude.mean()
dss['profile_lat'].attrs = profile_meta['profile_lat']

dss['lat'] = dss['latitude']
dss['lon'] = dss['longitude']
dss['platform'] = np.NaN
dss['platform'] = np.int32(1)
comment = (meta['glider_model'] + ' operated by ' +
meta['institution'])
dss['platform'].attrs['comment'] = comment
@@ -90,6 +113,9 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
meta['glider_model'] + dss['platform'].attrs['id'])
dss['platform'].attrs['type'] = 'platform'
dss['platform'].attrs['wmo_id'] = meta['wmo_id']
if '_FillValue' not in dss['platform'].attrs:
dss['platform'].attrs['_FillValue'] = -1


dss['lat_uv'] = np.NaN
dss['lat_uv'].attrs = profile_meta['lat_uv']
@@ -98,32 +124,51 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
dss['time_uv'] = np.NaN
dss['time_uv'].attrs = profile_meta['time_uv']

dss['instrument_ctd'] = np.NaN
dss['instrument_ctd'] = np.int32(1.0)
dss['instrument_ctd'].attrs = profile_meta['instrument_ctd']
if '_FillValue' not in dss['instrument_ctd'].attrs:
dss['instrument_ctd'].attrs['_FillValue'] = -1

dss.attrs['date_modified'] = str(np.datetime64('now')) + 'Z'

# ancillary variables::
# ancillary variables: link and create with values of 2. If
# we don't want them all 2, then create these variables in the
# time series
to_fill = ['temperature', 'pressure', 'conductivity',
'salinity', 'density', 'lon', 'lat', 'depth']
'salinity', 'density', 'lon', 'lat', 'depth']
for name in to_fill:
dss[name].attrs['ancillary_variables'] = name + '_qc'
qcname = name + '_qc'
dss[name].attrs['ancillary_variables'] = qcname
if qcname not in dss.keys():

dss[qcname] = ('time', 2 * np.ones(len(dss[name]), np.int8))
dss[qcname].attrs = utils.fill_required_qcattrs({}, name)
# 2 is "not eval"
# outname = outdir + '/' + utils.get_file_id(dss) + '.nc'
_log.info('Writing %s', outname)
timeunits = 'seconds since 1970-01-01T00:00:00Z'
timecalendar = 'gregorian'
try:
del dss.profile_time.attrs['_FillValue']
del dss.profile_time.attrs['units']
except KeyError:
pass
dss.to_netcdf(outname, encoding={'time': {'units': timeunits,
'calendar': timecalendar},
'profile_time':
{'units': timeunits}})
'calendar': timecalendar,
'dtype': 'float64'},
'profile_time':
{'units': timeunits,
'_FillValue': -99999.0,
'dtype': 'float64'},
}

)

# add traj_strlen using bare netCDF4 to make IOOS happy
with netCDF4.Dataset(outname, 'r+') as nc:
nc.renameDimension('string%d' % trajlen, 'traj_strlen')
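
The *_qc variables created above are initialised to flag value 2, i.e. "not evaluated" in the IOOS QARTOD flag scheme (1 pass, 2 not evaluated, 3 suspect, 4 fail, 9 missing). A short sketch of inspecting those flags in one extracted profile file, assuming a hypothetical file name:

    import numpy as np
    import xarray as xr

    # Hypothetical output file written by extract_timeseries_profiles.
    with xr.open_dataset('profiles/glider_profile_0001.nc') as dss:
        # Until real QC is applied, every flag should still be 2 ("not eval").
        print(np.unique(dss['temperature_qc'].values))
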


def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, starttime='1970-01-01'):
"""
Turn a timeseries netCDF file into a vertically gridded netCDF.
@@ -154,11 +199,12 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
except FileExistsError:
pass

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

profile_meta = deployment['profile_variables']

ds = xr.open_dataset(inname)
ds = xr.open_dataset(inname, decode_times=True)
ds = ds.where(ds.time > np.datetime64(starttime), drop=True)
_log.info(f'Working on: {inname}')
_log.debug(str(ds))
_log.debug(str(ds.time[0]))
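
A usage sketch for the extended make_gridfiles signature, with hypothetical paths (the keyword-only arguments are the new part):

    from pyglider import ncprocess

    # Hypothetical paths. dz sets the depth-bin size in metres, starttime
    # drops samples before the given date, and fnamesuffix is appended to
    # the '_grid' output name before '.nc'.
    outname = ncprocess.make_gridfiles(
        'timeseries/deployment_timeseries.nc', 'gridfiles/', 'deployment.yaml',
        fnamesuffix='_delayed', dz=5, starttime='2024-01-01')
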
@@ -168,17 +214,24 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
profiles = [p for p in profiles if (~np.isnan(p) and not (p % 1)
and (p > 0))]
profile_bins = np.hstack((np.array(profiles) - 0.5, [profiles[-1]+0.5]))

_log.debug(profile_bins)
Nprofiles = len(profiles)
_log.info(f'Nprofiles {Nprofiles}')
depth_bins = np.arange(0, 1100.1, dz)
depths = depth_bins[:-1] + 0.5

xdimname = 'time'
dsout = xr.Dataset(
coords={'depth': ('depth', depths),
'profile': ('time', profiles)})
'profile': (xdimname, profiles)})
dsout['depth'].attrs = {'units': 'm',
'long_name': 'Depth',
'standard_name': 'depth',
'positive': 'down',
'coverage_content_type': 'coordinate',
'comment': 'center of depth bins'}

ds['time_1970'] = ds.temperature.copy()
ds['time_1970'].values = ds.time.values.astype(np.float64)/1e9
ds['time_1970'].values = ds.time.values.astype(np.float64)
for td in ('time_1970', 'longitude', 'latitude'):
good = np.where(~np.isnan(ds[td]) & (ds['profile_index'] % 1 == 0))[0]
dat, xedges, binnumber = stats.binned_statistic(
@@ -187,19 +240,19 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
bins=[profile_bins])
if td == 'time_1970':
td = 'time'
dat = dat.astype('timedelta64[s]') + np.datetime64('1970-01-01T00:00:00')
dat = dat.astype('timedelta64[ns]') + np.datetime64('1970-01-01T00:00:00')
_log.info(f'{td} {len(dat)}')
dsout[td] = (('time'), dat, ds[td].attrs)
ds.drop('time_1970')
good = np.where(~np.isnan(ds['time']) & (ds['profile_index'] % 1 == 0))[0]
_log.info(f'Done times! {len(dat)}')
dsout['profile_time_start'] = (
('time'), dat, profile_meta['profile_time_start'])
(xdimname), dat, profile_meta['profile_time_start'])
dsout['profile_time_end'] = (
('time'), dat, profile_meta['profile_time_end'])
(xdimname), dat, profile_meta['profile_time_end'])

for k in ds.keys():
if k in ['time', 'longitude', 'latitude', 'depth'] or 'time' in k:
if k in ['time', 'profile', 'longitude', 'latitude', 'depth'] or 'time' in k:
continue
_log.info('Gridding %s', k)
good = np.where(~np.isnan(ds[k]) & (ds['profile_index'] % 1 == 0))[0]
@@ -216,15 +269,14 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
"scipy.stats.gmean")
else:
average_method = "mean"

dat, xedges, yedges, binnumber = stats.binned_statistic_2d(
ds['profile_index'].values[good],
ds['depth'].values[good],
values=ds[k].values[good], statistic=average_method,
bins=[profile_bins, depth_bins])

_log.debug(f'dat{np.shape(dat)}')
dsout[k] = (('depth', 'time'), dat.T, ds[k].attrs)
dsout[k] = (('depth', xdimname), dat.T, ds[k].attrs)

# fill gaps in data:
dsout[k].values = utils.gappy_fill_vertical(dsout[k].values)
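
For reference, a self-contained sketch of the binning technique used above, on synthetic data (all names are local to the example):

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    # Synthetic scatter: profile index, depth (m), and a measured value.
    profile_index = rng.integers(1, 4, 1000).astype(float)
    depth = rng.uniform(0, 100, 1000)
    temperature = 20.0 - 0.1 * depth + rng.normal(0, 0.1, 1000)

    profile_bins = np.array([0.5, 1.5, 2.5, 3.5])
    depth_bins = np.arange(0, 101, 10)

    # Mean of each (profile, depth-bin) cell, as in the gridding loop above.
    grid, xedges, yedges, binnumber = stats.binned_statistic_2d(
        profile_index, depth, values=temperature, statistic='mean',
        bins=[profile_bins, depth_bins])
    print(grid.shape)  # (3, 10): 3 profiles by 10 depth bins
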
@@ -240,11 +292,45 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1):
dsout = dsout.drop(['water_velocity_eastward',
'water_velocity_northward'])
dsout.attrs = ds.attrs
dsout.attrs.pop('cdm_data_type')
# fix to be ISO parsable:
if len(dsout.attrs['deployment_start']) > 18:
dsout.attrs['deployment_start'] = dsout.attrs['deployment_start'][:19]
dsout.attrs['deployment_end'] = dsout.attrs['deployment_end'][:19]
dsout.attrs['time_coverage_start'] = dsout.attrs['time_coverage_start'][:19]
dsout.attrs['time_coverage_end'] = dsout.attrs['time_coverage_end'][:19]
# fix standard_name so they don't overlap!
try:
dsout['waypoint_latitude'].attrs.pop('standard_name')
dsout['waypoint_longitude'].attrs.pop('standard_name')
dsout['profile_time_start'].attrs.pop('standard_name')
dsout['profile_time_end'].attrs.pop('standard_name')
except:
pass
# set some attributes for cf guidance
# see H.6.2. Profiles along a single trajectory
# https://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/build/aphs06.html
dsout.attrs['featureType'] = 'trajectoryProfile'
dsout['profile'].attrs['cf_role'] = 'profile_id'
dsout['mission_number'] = int(1)
dsout['mission_number'].attrs['cf_role'] = 'trajectory_id'
dsout = dsout.set_coords(['latitude', 'longitude', 'time'])
for k in dsout:
if k in ['profile', 'depth', 'latitude', 'longitude', 'time', 'mission_number']:
dsout[k].attrs['coverage_content_type'] = 'coordinate'
else:
dsout[k].attrs['coverage_content_type'] = 'physicalMeasurement'


outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid.nc'
outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid' + fnamesuffix + '.nc'
_log.info('Writing %s', outname)
timeunits = 'seconds since 1970-01-01T00:00:00Z'
dsout.to_netcdf(outname, encoding={'time': {'units': timeunits}})
# timeunits = 'nanoseconds since 1970-01-01T00:00:00Z'
dsout.to_netcdf(
outname,
encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z',
'_FillValue': np.NaN,
'calendar': 'gregorian',
'dtype': 'float64'}})
_log.info('Done gridding')

return outname