-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnutrient-transect-eml-assembly.Rmd
277 lines (207 loc) · 9.59 KB
/
nutrient-transect-eml-assembly.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
---
title: "Creating a Data Package for NES-LTER Nutrient Transect Cruise Data"
author: "Kate Morkeski, Joanne Koch, Stace Beaulieu and Joe Futrelle"
date: "2024-12-09"
output: html_document
---
## Setup
```{r setup, include=FALSE}
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
# two of the required packages are installed from GitHub
# library(remotes)
# remotes::install_github("EDIorg/EMLassemblyline")
# remotes::install_github("WHOIGit/ediutilities")
library(EMLassemblyline)
library(ediutilities)
library(here)
library(lubridate)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(readxl)
library(readr)
library(compareDF)
library(devtools)
library(EML)
library(maps)
library(xml2)
base_api_url <- 'https://nes-lter-data.whoi.edu/api'
```
## Read in Compiled Cruise Data
This chunk loads in the cruise data compiled from the REST API.
```{r}
cruise_list <- api_list_cruises()
all_cruises <- NA
print(cruise_list)
for(cruise_id in cruise_list) {
cruise_id <- str_to_lower(cruise_id)
nut_url <- glue::glue('{base_api_url}/nut/{cruise_id}.csv')
print(nut_url)
cruise_nut <- tryCatch(read_csv(nut_url),
error=function(cond)
{ message(cond)
return(NULL) })
if(nrow(cruise_nut) > 0 && !is.null(cruise_nut)) {
if(length(all_cruises) == 1 && is.na(all_cruises)) {
all_cruises <- cruise_nut
} else {
all_cruises <- plyr::rbind.fill(all_cruises, cruise_nut)
}
}
}
```
```{r}
all_cruises <- all_cruises[order(all_cruises$date), ]
# rename "distance_km"
names(all_cruises)[names(all_cruises) == 'distance_km'] <- 'station_distance'
names(all_cruises)[names(all_cruises) == 'date'] <- 'date_time_utc'
# define columns to round
round_cols <- c("nitrate_nitrite", "ammonium", "phosphate", "silicate", "depth", "station_distance")
# round nutrient, distance, and depth columns
all_cruises[, round_cols] <- round(all_cruises[, round_cols], 3)
# round lat and lon columns
all_cruises[, c("latitude", "longitude")] <- round(all_cruises[, c("latitude", "longitude")], 4)
# round depth
all_cruises[, c("depth")] <- round(all_cruises[, c("depth")], 1)
```
## Read in Sosik data that are not out put from API because they lack Niskin metadata
```{r}
# # samples that do not have metadata output from API
# # Read in the samples lacking bottle metadata csv file.
# # If the entry exists in all_cruises, copy pertinent data from the samples file
# # to all_cruises. If the entry does not exist in all_cruises, do not add it to
# # all_cruises because the nutient data is missing from the samples file.
need_metadata <- read_csv('samples_lacking_bottle_metadata_nutrients_nearest_station.csv')
need_metadata <- need_metadata %>%
mutate(depth = round(depth, 1)) %>%
rename(date_time_utc = date) %>%
mutate(station_distance = case_when(station_distance > 2 ~ NA, TRUE ~ station_distance)) %>%
mutate(nearest_station = case_when(station_distance == NA ~ NA, TRUE ~ nearest_station))
need_metadata$date_time_utc <- as.POSIXct(need_metadata$date_time_utc, tz = "GMT", "%m/%d/%Y %H:%M")
all_cruises$cast <- as.character(all_cruises$cast)
need_metadata$cast <- as.character(need_metadata$cast)
all_cruises$niskin <- as.character(all_cruises$niskin)
need_metadata$niskin <- as.character(need_metadata$niskin)
# prevents adding duplicate rows
all_cruises <- all_cruises %>% anti_join(need_metadata, by = c("cruise", "cast", "niskin", "sample_id", "replicate"))
# add rows rename new data frame
all_nutrients <- bind_rows(all_cruises, need_metadata)
all_nutrients <- all_nutrients %>% arrange(date_time_utc, cruise, cast, niskin, replicate)
# all_cruises$nearest_station[is.na(all_cruises$nearest_station)] <- ""
```
```{r}
# read new locally-produced table
v3 = all_nutrients
# read v1 table from GitHub
v1_commit_hash='06badc2f5fc07d3a995b72af260ce68f0c69119d' # version 1
v2_commit_hash='4236d91cd1289ce6b0958e01d2a45e286de789e7' # version 2
#v1 = read_csv(glue::glue('https://raw.githubusercontent.com/WHOIGit/nes-lter-nutrient-transect/{v1_commit_hash}/nes-lter-nutrient-transect.csv'))
v2 = read_csv(glue::glue('https://raw.githubusercontent.com/WHOIGit/nes-lter-nutrient-transect/{v2_commit_hash}/nes-lter-nutrient-transect.csv'))
names(v2)[names(v2) == 'date'] <- 'date_time_utc'
comparison <- compare_df(v3, v2, c("cruise", "cast", "niskin"))
create_output_table(comparison, output_type='xlsx', file_name='version_comparison.xlsx')
```
## QA: Nutrient Outlier Check
Check if there are severe differences in nutrient values between the replicates across cruise, cast, Niskin, and depth. Plot these differences to perform a visual check.
```{r}
# calculate the difference between the replicates across all nutrients
nut_check <- all_nutrients %>%
group_by(cruise, cast, niskin, depth) %>%
mutate(nitrate_nitrite_diff = abs(nitrate_nitrite - lead(nitrate_nitrite)),
ammonium_diff = abs(ammonium - lead(ammonium)),
phosphate_diff = abs(phosphate - lead(phosphate)),
silicate_diff = abs(silicate - lead(silicate)))
# isolate the outlier value
# nut_check[which.max(nut_check$nitrate_nitrite_diff),]
# define the nutrient outlier columns to gather on
nut_diff_cols <- c("nitrate_nitrite_diff", "phosphate_diff", "ammonium_diff", "silicate_diff")
# convert to long for ease of plotting
nut_check_long <- nut_check %>%
select(nitrate_nitrite_diff, ammonium_diff, phosphate_diff, silicate_diff) %>%
filter(nitrate_nitrite_diff != 0) %>%
gather(nutrient_diffs, value, nut_diff_cols, factor_key = TRUE)
# plot the differences and look for outliers
ggplot(data = nut_check_long, aes(x = nutrient_diffs, y = value)) +
geom_boxplot(outlier.size = 0.5) +
# scale_x_datetime(date_breaks = "6 weeks") +
ylab(paste0("Concentration (µmol/L)")) +
theme_classic()
```
## QA: Map Sampling Locations
Call the map_locs function from edi-utility.R to map the sampling locations. Perform a visual check.
```{r}
# Map Check
map_locs(df = all_nutrients, xvar = "longitude", yvar = "latitude",
region = "transect", colorvar = "cruise")
```
## QA: Plot nutrients as a check
Plot the distribution of nutrient values across all cruises using a boxplot.
```{r}
# define the nutrient columns
nut_cols <- c("nitrate_nitrite", "phosphate", "ammonium", "silicate")
# convert to long for ease of plotting
cruises_long <- all_nutrients %>%
gather(nutrients, value, nut_cols, factor_key = TRUE)
# loop through nutrient columns
for (i in 1:length(nut_cols)) {
nut_subset <- cruises_long %>% filter(nutrients == nut_cols[i])
# ggplot where x = cast, y = value and the lineplots are grouped by cruise
p <- ggplot(data = nut_subset, aes(x = date_time_utc, y = value, color = cruise)) +
geom_boxplot(outlier.size = 0.5) +
# scale_x_datetime(date_breaks = "6 weeks") +
ylab(paste0(nut_cols[i], " concentration (µmol/L)")) +
theme_classic()
print(p)
}
```
## QA: Determine if any nutrient values exceed expectations
According to a global range check: nitrate less than 30 umol/l and ammonium less than 5 umol/l based on [Rees et al. 2006](https://doi.org/10.1016/j.dsr2.2006.05.008), phosphate less than 3 (no great reference but this appears to be upper for Atlantic), silicate less than 60 (Elements of Physical Oceanography chapter on marine silica cycle, for deep Atlantic, this is probably too high).
```{r}
summary(all_nutrients) # visual min max for all columns
if(any(all_nutrients$nitrate_nitrite > 30)) cat("nitrate_nitrite_exceeds")
if(any(all_nutrients$ammonium > 5)) cat("ammonium_exceeds")
if(any(all_nutrients$phosphate > 3)) cat("phosphate_exceeds")
if(any(all_nutrients$silicate > 60)) cat("silicate_exceeds")
```
## Output final data file
```{r}
all_nutrients$alternate_sample_id[is.nan(all_nutrients$alternate_sample_id)] <- ""
#all_nutrients$alternate_sample_id[all_nutrients$alternate_sample_id == NaN] <- ""
all_nutrients$nearest_station[is.na(all_nutrients$nearest_station)] <- ""
#all_nutrients <- all_nutrients[order(all_nutrients$date), ]
# Write to the csv file
write.csv(all_nutrients, "nes-lter-nutrient-transect.csv", na = "NaN", row.names = FALSE, quote = FALSE)
```
## EML Assembly
Generate the package and insert the parent project node into the resulting EML
```{r}
# define input files
metadata <- "nutrient-transect-info"
edi_filename <- "nes-lter-nutrient-transect"
pkg_id <- "knb-lter-nes.4.3"
# Make EML Templates
excel_to_template(here(metadata), edi_filename, rights='CC0', file_type=".md", del_rights = FALSE)
# Data Coverage
# isolate date and geospatial columns for input
date_col <- as.Date(all_nutrients$date_time_utc)
lat_col <- all_nutrients$latitude
lon_col <- all_nutrients$longitude
# run function to determine geospatial and temporal coverage
coverage <- data_coverage(dates = date_col, lat = lat_col, lon = lon_col)
# Make EML
make_eml(path = here(),
dataset.title = "Dissolved inorganic nutrients from NES-LTER Transect cruises, including 4 macro-nutrients from water column bottle samples, ongoing since 2017",
data.table.name = edi_filename,
data.table = paste0(edi_filename, ".csv"),
data.table.description = "Dissolved inorganic nutrients from water column bottle samples taken on NES-LTER Transect cruises",
temporal.coverage = c(coverage$startdate, coverage$enddate),
geographic.description = "NES-LTER Transect",
geographic.coordinates = c(coverage$North, coverage$East, coverage$South, coverage$West),
maintenance.description = "ongoing",
user.id = "NES",
user.domain = "LTER",
package.id = pkg_id)
# Insert Custom Project Node
project_insert(edi_pkg = pkg_id, filename = 'parent_project_NESI-II_RAPID_OOI.txt')
```