Skip to content

Commit

Permalink
ENH: allow multiple deploymentyaml
Browse files Browse the repository at this point in the history
  • Loading branch information
C-PROOF committed Jul 4, 2024
1 parent b2a31ae commit fc59e43
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 22 deletions.
8 changes: 4 additions & 4 deletions pyglider/ncprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml):
except FileExistsError:
pass

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

meta = deployment['metadata']
with xr.open_dataset(inname) as ds:
_log.info('Extracting profiles: opening %s', inname)
Expand Down Expand Up @@ -172,8 +172,8 @@ def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1, star
except FileExistsError:
pass

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

profile_meta = deployment['profile_variables']

ds = xr.open_dataset(inname, decode_times=True)
Expand Down
17 changes: 8 additions & 9 deletions pyglider/seaexplorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True,
# Try to read the file with polars. If the file is corrupted (rare), file read will fail and file
# is appended to badfiles
try:
out = pl.read_csv(f, separator=';')
out = pl.read_csv(f, sep=';')
except Exception as e:
_log.warning(f'Exception reading {f}: {e}')
_log.warning(f'Could not read {f}')
Expand All @@ -137,11 +137,11 @@ def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True,
# Parse the datetime from nav files (called Timestamp) and pld1 files (called PLD_REALTIMECLOCK)
if "Timestamp" in out.columns:
out = out.with_columns(
pl.col("Timestamp").str.strptime(pl.Datetime, format="%d/%m/%Y %H:%M:%S"))
pl.col("Timestamp").str.strptime(pl.Datetime, fmt="%d/%m/%Y %H:%M:%S"))
out = out.rename({"Timestamp": "time"})
else:
out = out.with_columns(
pl.col("PLD_REALTIMECLOCK").str.strptime(pl.Datetime, format="%d/%m/%Y %H:%M:%S.%3f"))
pl.col("PLD_REALTIMECLOCK").str.strptime(pl.Datetime, fmt="%d/%m/%Y %H:%M:%S.%3f"))
out = out.rename({"PLD_REALTIMECLOCK": "time"})
for col_name in out.columns:
if "time" not in col_name.lower():
Expand All @@ -150,7 +150,7 @@ def raw_to_rawnc(indir, outdir, deploymentyaml, incremental=True,
if 'AD2CP_TIME' in out.columns:
# Set datestamps with date 00000 to None
out = out.with_columns(
pl.col('AD2CP_TIME').str.strptime(pl.Datetime, format="%m%d%y %H:%M:%S", strict=False))
pl.col('AD2CP_TIME').str.strptime(pl.Datetime, fmt="%m%d%y %H:%M:%S", strict=False))

# subsetting for heavily oversampled raw data:
if rawsub == 'raw' and dropna_subset is not None:
Expand Down Expand Up @@ -232,8 +232,8 @@ def merge_parquet(indir, outdir, deploymentyaml, incremental=False, kind='raw'):
Only add new files....
"""

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

metadata = deployment['metadata']
id = metadata['glider_name']
outgli = outdir + '/' + id + '-rawgli.parquet'
Expand Down Expand Up @@ -297,7 +297,6 @@ def _remove_fill_values(df, fill_value=9999):
pl.when(pl.col(pl.Float64) == fill_value)
.then(None)
.otherwise(pl.col(pl.Float64))
.name.keep()
)
return df

Expand All @@ -309,8 +308,8 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw',
A little different than above, for the 4-file version of the data set.
"""

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

metadata = deployment['metadata']
ncvar = deployment['netcdf_variables']
device_data = deployment['glider_devices']
Expand Down
22 changes: 13 additions & 9 deletions pyglider/slocum.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import time
import xarray as xr
import xml.etree.ElementTree as ET
import yaml
from collections.abc import Iterable

import pyglider.utils as utils

Expand Down Expand Up @@ -621,8 +621,8 @@ def merge_rawnc(indir, outdir, deploymentyaml,

scisuffix = scisuffix.lower()
glidersuffix = glidersuffix.lower()
with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

metadata = deployment['metadata']
id = metadata['glider_name'] + metadata['glider_serial']

Expand Down Expand Up @@ -684,8 +684,7 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, *,
name of the new merged netcdf file.
"""

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)
metadata = deployment['metadata']
ncvar = deployment['netcdf_variables']
device_data = deployment['glider_devices']
Expand Down Expand Up @@ -807,8 +806,13 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *,
outdir : string
Directory to put the merged timeseries files.
deploymentyaml : str
YAML text file with deployment information for this glider.
deploymentyaml : str or list
Name of YAML text file with deployment information for this glider.
If a list, then the YAML files are read in order, and any top-level entry in a
later file overwrites the entry of the same name from the earlier files. The
advantage of this is that it allows metadata that is common to multiple ways of
processing the data to come from the first file, while subsequent files change
"netcdf_variables" if desired.
profile_filt_time : float
time in seconds over which to smooth the pressure time series for
Expand All @@ -827,8 +831,8 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *,
if not have_dbdreader:
raise ImportError('Cannot import dbdreader')

with open(deploymentyaml) as fin:
deployment = yaml.safe_load(fin)
deployment = utils._get_deployment(deploymentyaml)

ncvar = deployment['netcdf_variables']
device_data = deployment['glider_devices']
thenames = list(ncvar.keys())
Expand Down
20 changes: 20 additions & 0 deletions pyglider/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from scipy.signal import argrelextrema
import gsw
import logging
import yaml


_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -674,6 +676,24 @@ def example_gridplot(filename, outname,
fig.savefig(outname, dpi=dpi)


def _get_deployment(deploymentyaml):
    """
    Load deployment information from one or more YAML files and merge it.

    Parameters
    ----------
    deploymentyaml : str, path-like, or iterable of str/path-like
        Name of a single YAML file, or several names.  The files are read
        in the order given.

    Returns
    -------
    deployment : dict
        The merged top-level entries of all files.  If the same top-level
        key appears in more than one file, the value from the later file
        replaces the earlier value wholesale (nested dictionaries are not
        merged key-by-key).

    Raises
    ------
    TypeError
        If the top level of a YAML file is not a mapping.
    """
    import os

    # A lone file name must be wrapped in a list: iterating a ``str`` would
    # yield single characters, and a ``pathlib.Path`` is not iterable at all.
    if isinstance(deploymentyaml, (str, bytes, os.PathLike)):
        deploymentyaml = [deploymentyaml]

    deployment = {}
    for fname in deploymentyaml:
        with open(fname) as fin:
            loaded = yaml.safe_load(fin)
        if loaded is None:
            # ``yaml.safe_load`` returns None for an empty document; such a
            # file simply contributes nothing to the merged result.
            continue
        if not isinstance(loaded, dict):
            raise TypeError(
                f'Top level of deployment YAML {fname!r} must be a mapping, '
                f'got {type(loaded).__name__}')
        # Later files win: same-named top-level keys are replaced.
        deployment.update(loaded)

    return deployment


# Names exported by a star-import (``from pyglider.utils import *``) of this
# module.
__all__ = ['get_distance_over_ground', 'get_glider_depth', 'get_profiles_new',
'get_derived_eos_raw', "fill_metadata", "nmea2deg",
"gappy_fill_vertical", "oxygen_concentration_correction"]

0 comments on commit fc59e43

Please sign in to comment.