coastal_boundary dataframe with hourly SCHISM data in Hyfeature unit test #601

Open · wants to merge 1 commit into base: master
76 changes: 70 additions & 6 deletions src/troute-network/troute/hyfeature_network_utilities.py
@@ -11,6 +11,7 @@
from joblib import delayed, Parallel
import pyarrow as pa
import pyarrow.parquet as pq
import xarray as xr

import troute.nhd_io as nhd_io

@@ -23,11 +24,12 @@ def build_forcing_sets(
t0
):

run_sets = forcing_parameters.get("qlat_forcing_sets", None)
nexus_input_folder = forcing_parameters.get("nexus_input_folder", None)
nts = forcing_parameters.get("nts", None)
max_loop_size = forcing_parameters.get("max_loop_size", 12)
dt = forcing_parameters.get("dt", None)
run_sets = forcing_parameters.get("qlat_forcing_sets", None)
nexus_input_folder = forcing_parameters.get("nexus_input_folder", None)
downstream_boundary_input_folder = forcing_parameters.get("downstream_boundary_input_folder", None)
nts = forcing_parameters.get("nts", None)
max_loop_size = forcing_parameters.get("max_loop_size", 12)
dt = forcing_parameters.get("dt", None)

try:
nexus_input_folder = pathlib.Path(nexus_input_folder)
@@ -38,7 +40,8 @@ def build_forcing_sets(
raise AssertionError("Aborting simulation because the nexus_input_folder:", qlat_input_folder,"does not exist. Please check the the nexus_input_folder variable is correctly entered in the .yaml control file") from None

forcing_glob_filter = forcing_parameters.get("nexus_file_pattern_filter", "*.NEXOUT")

downstream_boundary_glob_filter = forcing_parameters.get("downstream_boundary_file_pattern_filter", "*SCHISM.nc")

if forcing_glob_filter=="nex-*":
print("Reformating qlat nexus files as hourly binary files...")
binary_folder = forcing_parameters.get('binary_nexus_file_folder', None)
@@ -161,6 +164,64 @@
k += max_loop_size
j += 1

# create the downstream boundary forcing file list
if downstream_boundary_input_folder:
# get the first and second files from an ordered list of all forcing files
downstream_boundary_input_folder = pathlib.Path(downstream_boundary_input_folder)
all_files = sorted(downstream_boundary_input_folder.glob(downstream_boundary_glob_filter))
first_file = all_files[0]
second_file = all_files[1]

# Deduce the time interval of the forcing data from the output timestamps of the first two files
df = read_file(first_file)
t1_str = pd.to_datetime(df.time.iloc[0]).strftime("%Y-%m-%d_%H:%M:%S")
t1 = datetime.strptime(t1_str,"%Y-%m-%d_%H:%M:%S")
df = read_file(second_file)
t2_str = pd.to_datetime(df.time.iloc[0]).strftime("%Y-%m-%d_%H:%M:%S")
t2 = datetime.strptime(t2_str,"%Y-%m-%d_%H:%M:%S")
dt_downstream_boundary_timedelta = t2 - t1
dt_downstream_boundary = dt_downstream_boundary_timedelta.seconds

bts_subdivisions = dt_downstream_boundary / dt

# the number of files required for the simulation.
# For example, for a 4 hr total simulation and a 2 hr max_loop, two sets of boundary data are needed:
# one at (t0, t0+1hr, t0+2hr) and the other at (t0+2hr, t0+3hr, t0+4hr).
nfiles = int(np.ceil(nts / bts_subdivisions)) + 1

# list of downstream boundary file datetimes
datetime_list = [t0 + dt_downstream_boundary_timedelta * n for n in range(nfiles)]
datetime_list_str = [datetime.strftime(d, '%Y%m%d%H%M') for d in datetime_list]
# list of downstream boundary files
downstream_boundary_filename_list = [d_str + downstream_boundary_glob_filter[1:] for d_str in datetime_list_str]

# check that all forcing files exist
for f in downstream_boundary_filename_list:
try:
J = pathlib.Path(downstream_boundary_input_folder.joinpath(f))
assert J.is_file()
except AssertionError:
raise AssertionError("Aborting simulation because downstream boundary file", J, "cannot be found.") from None

# build run sets list (run_sets already exists from the qlat pass above;
# this loop only attaches the boundary files to each set)
k = 0
j = 0
nts_accum = 0
nts_last = 0

while k < len(downstream_boundary_filename_list)-1:
if k + max_loop_size < len(downstream_boundary_filename_list):
run_sets[j]['downstream_boundary_files'] = downstream_boundary_filename_list[k:k + max_loop_size + 1]
else:
run_sets[j]['downstream_boundary_files'] = downstream_boundary_filename_list[k:]
k += max_loop_size
j += 1

return run_sets

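To make the run-set arithmetic above concrete, here is a minimal standalone sketch of the same scheduling logic (all values are hypothetical and independent of t-route):

```python
import math
from datetime import datetime, timedelta

dt = 300                    # routing timestep [s] (hypothetical)
nts = 48                    # 48 * 300 s = 4 hr total simulation
dt_boundary = 3600          # deduced SCHISM output interval [s]
max_loop_size = 2           # boundary steps per run set

bts_subdivisions = dt_boundary / dt                    # routing steps per boundary step -> 12.0
nfiles = int(math.ceil(nts / bts_subdivisions)) + 1    # -> 5 files (t0 .. t0+4hr)

t0 = datetime(2023, 4, 1, 0, 0)
names = [(t0 + timedelta(seconds=dt_boundary) * n).strftime('%Y%m%d%H%M') + '.SCHISM.nc'
         for n in range(nfiles)]

# Consecutive run sets share one file at the seam, hence the "+1" in the slice:
k, sets = 0, []
while k < len(names) - 1:
    sets.append(names[k:k + max_loop_size + 1])
    k += max_loop_size
# sets -> [['...0000.SCHISM.nc', '...0100...', '...0200...'],
#          ['...0200...', '...0300...', '...0400...']]
```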
def build_qlateral_array(
Expand Down Expand Up @@ -296,5 +357,8 @@ def read_file(file_name):
elif extension=='.parquet':
df = pq.read_table(file_name).to_pandas().reset_index()
df.index.name = None
elif extension=='.nc':
df = xr.open_dataset(file_name)
df = df.to_dataframe()

return df
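For reference, a hypothetical use of `read_file` that mirrors how `build_forcing_sets` deduces the boundary timestep above (the paths are made up; the new `.nc` branch returns the flattened xarray Dataset as a DataFrame with a `time` column):

```python
import pathlib
import pandas as pd

# Made-up paths following the "*SCHISM.nc" pattern assumed by the glob filter.
first = pathlib.Path("boundary/202304010100.SCHISM.nc")
second = pathlib.Path("boundary/202304010200.SCHISM.nc")

t1 = pd.to_datetime(read_file(first).time.iloc[0])
t2 = pd.to_datetime(read_file(second).time.iloc[0])
dt_boundary = (t2 - t1).seconds   # 3600 for hourly SCHISM output
```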
49 changes: 30 additions & 19 deletions src/troute-network/troute/hyfeature_preprocess.py
@@ -694,30 +694,31 @@ def hyfeature_forcing(
"""

# Unpack user-specified forcing parameters
dt = forcing_parameters.get("dt", None)
qts_subdivisions = forcing_parameters.get("qts_subdivisions", None)
nexus_input_folder = forcing_parameters.get("nexus_input_folder", None)
qlat_file_index_col = forcing_parameters.get("qlat_file_index_col", "feature_id")
qlat_file_value_col = forcing_parameters.get("qlat_file_value_col", "q_lateral")
qlat_file_gw_bucket_flux_col = forcing_parameters.get("qlat_file_gw_bucket_flux_col", "qBucket")
qlat_file_terrain_runoff_col = forcing_parameters.get("qlat_file_terrain_runoff_col", "qSfcLatRunoff")
dt = forcing_parameters.get("dt", None)
qts_subdivisions = forcing_parameters.get("qts_subdivisions", None)
nexus_input_folder = forcing_parameters.get("nexus_input_folder", None)
qlat_file_index_col = forcing_parameters.get("qlat_file_index_col", "feature_id")
qlat_file_value_col = forcing_parameters.get("qlat_file_value_col", "q_lateral")
qlat_file_gw_bucket_flux_col = forcing_parameters.get("qlat_file_gw_bucket_flux_col", "qBucket")
qlat_file_terrain_runoff_col = forcing_parameters.get("qlat_file_terrain_runoff_col", "qSfcLatRunoff")
downstream_boundary_input_folder = forcing_parameters.get("downstream_boundary_input_folder", None)


# TODO: find a better way to deal with these defaults and overrides.
run["t0"] = run.get("t0", t0)
run["nts"] = run.get("nts")
run["dt"] = run.get("dt", dt)
run["qts_subdivisions"] = run.get("qts_subdivisions", qts_subdivisions)
run["nexus_input_folder"] = run.get("nexus_input_folder", nexus_input_folder)
run["qlat_file_index_col"] = run.get("qlat_file_index_col", qlat_file_index_col)
run["qlat_file_value_col"] = run.get("qlat_file_value_col", qlat_file_value_col)
run["qlat_file_gw_bucket_flux_col"] = run.get("qlat_file_gw_bucket_flux_col", qlat_file_gw_bucket_flux_col)
run["qlat_file_terrain_runoff_col"] = run.get("qlat_file_terrain_runoff_col", qlat_file_terrain_runoff_col)
run["t0"] = run.get("t0", t0)
run["nts"] = run.get("nts")
run["dt"] = run.get("dt", dt)
run["qts_subdivisions"] = run.get("qts_subdivisions", qts_subdivisions)
run["nexus_input_folder"] = run.get("nexus_input_folder", nexus_input_folder)
run["qlat_file_index_col"] = run.get("qlat_file_index_col", qlat_file_index_col)
run["qlat_file_value_col"] = run.get("qlat_file_value_col", qlat_file_value_col)
run["qlat_file_gw_bucket_flux_col"] = run.get("qlat_file_gw_bucket_flux_col", qlat_file_gw_bucket_flux_col)
run["qlat_file_terrain_runoff_col"] = run.get("qlat_file_terrain_runoff_col", qlat_file_terrain_runoff_col)
run["downstream_boundary_input_folder"] = run.get("downstream_boundary_input_folder", downstream_boundary_input_folder)

#---------------------------------------------------------------------------
# Assemble lateral inflow data
#---------------------------------------------------------------------------

start_time = time.time()
LOG.info("Creating a DataFrame of lateral inflow forcings ...")

@@ -741,7 +742,8 @@ def hyfeature_forcing(
# Assemble coastal coupling data [WIP]
#---------------------------------------------------------------------
# Run if coastal_boundary_depth_df has not already been created:
if coastal_boundary_depth_df.empty:
#if coastal_boundary_depth_df.empty:
Contributor commented:
Should this be uncommented?

Contributor (Author) replied:
It's a temporary solution: for hourly SCHISM data, coastal_boundary_depth_df has to be updated at every time loop, so the lines under the if statement need to be executed on every loop. For multi-hourly SCHISM data, they need to be executed only once, so the if statement should stay.
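For illustration only, a hypothetical permanent form of this guard (the hourly threshold and the `dt_boundary_s` argument are assumptions, not part of this PR):

```python
import pandas as pd

def needs_coastal_rebuild(coastal_boundary_depth_df: pd.DataFrame,
                          dt_boundary_s: int) -> bool:
    # Hourly SCHISM data must be rebuilt on every time loop; multi-hourly
    # data only needs to be built once, i.e. while the frame is still empty.
    return coastal_boundary_depth_df.empty or dt_boundary_s <= 3600
```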

if 1==1:
coastal_boundary_elev_files = forcing_parameters.get('coastal_boundary_input_file', None)
coastal_boundary_domain_files = hybrid_parameters.get('coastal_boundary_domain', None)

@@ -750,11 +752,20 @@ def hyfeature_forcing(
LOG.info("creating coastal dataframe ...")

coastal_boundary_domain = nhd_io.read_coastal_boundary_domain(coastal_boundary_domain_files)

# create dataframe for hourly SCHISM data
coastal_boundary_depth_df = nhd_io.build_coastal_dataframe(
run,
coastal_boundary_domain,
)

# create dataframe for multi-hourly SCHISM data
'''
coastal_boundary_depth_df = nhd_io.build_coastal_ncdf_dataframe(
coastal_boundary_elev_files,
coastal_boundary_domain,
)
'''
LOG.debug(
"coastal boundary elevation observation DataFrame creation complete in %s seconds." \
% (time.time() - start_time)
44 changes: 42 additions & 2 deletions src/troute-network/troute/nhd_io.py
@@ -1563,11 +1563,12 @@ def build_coastal_dataframe(coastal_boundary_elev):
)
return coastal_df


'''
def build_coastal_ncdf_dataframe(coastal_ncdf):
with xr.open_dataset(coastal_ncdf) as ds:
coastal_ncdf_df = ds[["elev", "depth"]]
return coastal_ncdf_df.to_dataframe()
'''

def build_coastal_ncdf_dataframe(
coastal_files,
@@ -1594,7 +1595,7 @@ def build_coastal_ncdf_dataframe(
tfin = start_date + dt_timeslice*len(timesteps)
timestamps = pd.date_range(start_date, tfin, freq=dt_timeslice)
timestamps = timestamps.strftime('%Y-%m-%d %H:%M:%S')

# create a dataframe of water depth at coastal domain nodes
timeslice_schism_list=[]
for t in range(0, len(timesteps)+1):
@@ -1619,6 +1620,45 @@
# linearly extrapolate depth value at start date
coastal_boundary_depth_df.iloc[:,0] = 2.0*coastal_boundary_depth_df.iloc[:,1] - coastal_boundary_depth_df.iloc[:,2]

return coastal_boundary_depth_df

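The start-date fill above is one-step backward linear extrapolation on a uniform time grid, x(t0) = 2·x(t0+Δt) − x(t0+2Δt); a quick numeric check (made-up depths):

```python
# x0 = x1 - (x2 - x1) = 2*x1 - x2
x1, x2 = 1.8, 2.0       # depths at t0+dt and t0+2*dt (made-up values)
x0 = 2.0 * x1 - x2      # extrapolated depth at t0 -> 1.6
```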
def build_coastal_dataframe(
run,
coastal_boundary_domain,
):

downstream_boundary_input_folder = run.get("downstream_boundary_input_folder",None)

if downstream_boundary_input_folder:
downstream_boundary_input_folder = pathlib.Path(downstream_boundary_input_folder)
if "downstream_boundary_files" in run:
downstream_boundary_files = run.get("downstream_boundary_files")
downstream_boundary_files = [downstream_boundary_input_folder.joinpath(f) for f in downstream_boundary_files]

timeslice_schism_list=[]
for f in downstream_boundary_files:
ds = xr.open_dataset(f)
df = ds.to_dataframe()
tws = []
timestamps = []
depths = []
for tw, boundary_node in coastal_boundary_domain.items():
tws.append(tw)
df2 = df[df['schism_id']==boundary_node]
date = df2.time.iloc[0]
timestamps.append(pd.to_datetime(date).strftime('%Y-%m-%d %H:%M:%S'))
depths.append(df2.elev.iat[0] + df2.depth.iat[0])
timeslice_schism = (pd.DataFrame({
'stationId' : tws,
'datetime' : timestamps,
'depth' : depths
}).
set_index(['stationId', 'datetime']).
unstack(1, fill_value = np.nan)['depth'])
timeslice_schism_list.append(timeslice_schism)
coastal_boundary_depth_df = pd.concat(timeslice_schism_list, axis=1, ignore_index=False)

return coastal_boundary_depth_df

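For orientation, the `set_index(...).unstack(...)` step in `build_coastal_dataframe` produces a wide frame with one row per tailwater `stationId` and one column per timestamp; one such slice is built per boundary file and the slices are concatenated column-wise. A toy reconstruction (station IDs and depths are made up):

```python
import numpy as np
import pandas as pd

tws = ['tw-1', 'tw-2']
timestamps = ['2023-04-01 01:00:00', '2023-04-01 01:00:00']
depths = [1.8, 2.4]   # elev + depth per boundary node (made-up values)

timeslice_schism = (pd.DataFrame({'stationId': tws,
                                  'datetime': timestamps,
                                  'depth': depths})
                      .set_index(['stationId', 'datetime'])
                      .unstack(1, fill_value=np.nan)['depth'])
# timeslice_schism: index = stationId, columns = datetime.
# pd.concat(list_of_slices, axis=1) then stacks the per-file columns side by side.
```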
def lastobs_df_output(
72 changes: 72 additions & 0 deletions src/troute-network/troute/nhd_network_utilities_v02.py
@@ -8,6 +8,7 @@
import numpy as np
import netCDF4
from joblib import delayed, Parallel
import xarray as xr

import troute.nhd_io as nhd_io
import troute.nhd_network as nhd_network
@@ -253,6 +254,7 @@ def build_forcing_sets(

run_sets = forcing_parameters.get("qlat_forcing_sets", None)
qlat_input_folder = forcing_parameters.get("qlat_input_folder", None)
downstream_boundary_input_folder = forcing_parameters.get("downstream_boundary_input_folder", None)
nts = forcing_parameters.get("nts", None)
max_loop_size = forcing_parameters.get("max_loop_size", 12)
dt = forcing_parameters.get("dt", None)
@@ -266,6 +268,7 @@
raise AssertionError("Aborting simulation because the qlat_input_folder:", qlat_input_folder,"does not exist. Please check the the qlat_input_folder variable is correctly entered in the .yaml control file") from None

forcing_glob_filter = forcing_parameters.get("qlat_file_pattern_filter", "*.CHRTOUT_DOMAIN1")
downstream_boundary_glob_filter = forcing_parameters.get("downstream_boundary_file_pattern_filter", "*SCHISM.nc")

# TODO: Throw errors if insufficient input data are available
if run_sets:
@@ -357,6 +360,64 @@
nts_last = nts_accum
k += max_loop_size
j += 1

# create the downstream boundary forcing file list
if downstream_boundary_input_folder:
# get the first and second files from an ordered list of all forcing files
downstream_boundary_input_folder = pathlib.Path(downstream_boundary_input_folder)
all_files = sorted(downstream_boundary_input_folder.glob(downstream_boundary_glob_filter))
first_file = all_files[0]
second_file = all_files[1]

# Deduce the time interval of the forcing data from the output timestamps of the first two files
df = read_file(first_file)
t1_str = pd.to_datetime(df.time.iloc[0]).strftime("%Y-%m-%d_%H:%M:%S")
t1 = datetime.strptime(t1_str,"%Y-%m-%d_%H:%M:%S")
df = read_file(second_file)
t2_str = pd.to_datetime(df.time.iloc[0]).strftime("%Y-%m-%d_%H:%M:%S")
t2 = datetime.strptime(t2_str,"%Y-%m-%d_%H:%M:%S")
dt_downstream_boundary_timedelta = t2 - t1
dt_downstream_boundary = dt_downstream_boundary_timedelta.seconds

bts_subdivisions = dt_downstream_boundary / dt
# the number of files required for the simulation
nfiles = int(np.ceil(nts / bts_subdivisions)) + 1

# list of downstream boundary file datetimes
# n+1 because the downstream boundary values for, e.g., the next 2 hr of simulation are
# the values at t0, t0+1hr, and t0+2hr, and SCHISM currently does not store the value at
# t0 (it has to be interpolated)
datetime_list = [t0 + dt_downstream_boundary_timedelta * (n+1) for n in range(nfiles)]
datetime_list_str = [datetime.strftime(d, '%Y%m%d%H%M') for d in datetime_list]
# list of downstream boundary files
downstream_boundary_filename_list = [d_str + downstream_boundary_glob_filter[1:] for d_str in datetime_list_str]

# check that all forcing files exist
for f in downstream_boundary_filename_list:
try:
J = pathlib.Path(downstream_boundary_input_folder.joinpath(f))
assert J.is_file()
except AssertionError:
raise AssertionError("Aborting simulation because downstream boundary file", J, "cannot be found.") from None

# build run sets list (run_sets already exists from the qlat pass above;
# this loop only attaches the boundary files to each set)
k = 0
j = 0
nts_accum = 0
nts_last = 0
while k < len(downstream_boundary_filename_list)-1:
if k + max_loop_size < len(downstream_boundary_filename_list):
run_sets[j]['downstream_boundary_files'] = downstream_boundary_filename_list[k:k + max_loop_size + 1]
else:
run_sets[j]['downstream_boundary_files'] = downstream_boundary_filename_list[k:]
k += max_loop_size
j += 1

return run_sets

def build_qlateral_array(
Expand Down Expand Up @@ -872,3 +933,14 @@ def build_refac_connections(diff_network_parameters):
)

return connections

def read_file(file_name):
extension = file_name.suffix
if extension=='.csv':
df = pd.read_csv(file_name)
elif extension=='.parquet':
df = pq.read_table(file_name).to_pandas().reset_index()
df.index.name = None
elif extension=='.nc':
df = xr.open_dataset(file_name)
df = df.to_dataframe()
return df