nutrient-transect-eml-assembly.Rmd

---
title: "Creating a Data Package for NES-LTER Nutrient Transect Cruise Data"
author: "Kate Morkeski, Joanne Koch, Stace Beaulieu and Joe Futrelle"
date: "2024-12-09"
output: html_document
---

## Setup

```{r setup, include=FALSE}
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

# two of the required packages are installed from GitHub
# library(remotes)
# remotes::install_github("EDIorg/EMLassemblyline")
# remotes::install_github("WHOIGit/ediutilities")

library(EMLassemblyline)
library(ediutilities)
library(here)
library(lubridate)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(readxl)
library(readr)
library(compareDF)
library(devtools)
library(EML)
library(maps)
library(xml2)

base_api_url <- 'https://nes-lter-data.whoi.edu/api'

```

## Read in Compiled Cruise Data

This chunk loads in the cruise data compiled from the REST API.

```{r}
cruise_list <- api_list_cruises()

all_cruises <- NA
print(cruise_list)
for(cruise_id in cruise_list) {
  cruise_id <- str_to_lower(cruise_id)
  nut_url <- glue::glue('{base_api_url}/nut/{cruise_id}.csv')
  print(nut_url)

  cruise_nut <- tryCatch(read_csv(nut_url), 
                 error=function(cond) 
                     { message(cond)
                       return(NULL) })

  if(nrow(cruise_nut) > 0 && !is.null(cruise_nut)) {
    if(length(all_cruises) == 1 && is.na(all_cruises)) {
      all_cruises <- cruise_nut
    } else {
      all_cruises <- plyr::rbind.fill(all_cruises, cruise_nut)
    }
  }
}
```

```{r}
all_cruises <- all_cruises[order(all_cruises$date), ]

# rename "distance_km" 
names(all_cruises)[names(all_cruises) == 'distance_km'] <- 'station_distance'
names(all_cruises)[names(all_cruises) == 'date'] <- 'date_time_utc'

# define columns to round
round_cols <- c("nitrate_nitrite", "ammonium", "phosphate", "silicate", "depth", "station_distance")
# round nutrient, distance, and depth columns
all_cruises[, round_cols] <- round(all_cruises[, round_cols], 3)
# round lat and lon columns
all_cruises[, c("latitude", "longitude")] <- round(all_cruises[, c("latitude", "longitude")], 4)
# round depth
all_cruises[, c("depth")] <- round(all_cruises[, c("depth")], 1)

```
## Read in Sosik data that are not out put from API because they lack Niskin metadata 

```{r}

# # samples that do not have metadata output from API
# # Read in the samples lacking bottle metadata csv file.
# # If the entry exists in all_cruises, copy pertinent data from the samples file
# # to all_cruises. If the entry does not exist in all_cruises, do not add it to 
# # all_cruises because the nutient data is missing from the samples file.

need_metadata <- read_csv('samples_lacking_bottle_metadata_nutrients_nearest_station.csv')
need_metadata <- need_metadata %>%
  mutate(depth = round(depth, 1)) %>%
  rename(date_time_utc = date) %>%
  mutate(station_distance = case_when(station_distance > 2 ~ NA, TRUE ~ station_distance)) %>%
  mutate(nearest_station = case_when(station_distance == NA ~ NA, TRUE ~ nearest_station)) 
need_metadata$date_time_utc <- as.POSIXct(need_metadata$date_time_utc, tz = "GMT", "%m/%d/%Y %H:%M")

all_cruises$cast <- as.character(all_cruises$cast)
need_metadata$cast <- as.character(need_metadata$cast)
all_cruises$niskin <- as.character(all_cruises$niskin)
need_metadata$niskin <- as.character(need_metadata$niskin)

# prevents adding duplicate rows
all_cruises <- all_cruises %>% anti_join(need_metadata, by = c("cruise", "cast", "niskin", "sample_id", "replicate"))

# add rows rename new data frame  
all_nutrients <- bind_rows(all_cruises, need_metadata)

all_nutrients <- all_nutrients %>% arrange(date_time_utc, cruise, cast, niskin, replicate)

# all_cruises$nearest_station[is.na(all_cruises$nearest_station)] <- ""

```


```{r}

# read new locally-produced table
v3 = all_nutrients

# read v1 table from GitHub
v1_commit_hash='06badc2f5fc07d3a995b72af260ce68f0c69119d'  # version 1
v2_commit_hash='4236d91cd1289ce6b0958e01d2a45e286de789e7'  # version 2

#v1 = read_csv(glue::glue('https://raw.githubusercontent.com/WHOIGit/nes-lter-nutrient-transect/{v1_commit_hash}/nes-lter-nutrient-transect.csv'))
v2 = read_csv(glue::glue('https://raw.githubusercontent.com/WHOIGit/nes-lter-nutrient-transect/{v2_commit_hash}/nes-lter-nutrient-transect.csv'))

names(v2)[names(v2) == 'date'] <- 'date_time_utc'
comparison <- compare_df(v3, v2, c("cruise", "cast", "niskin"))

create_output_table(comparison, output_type='xlsx', file_name='version_comparison.xlsx')

```


## QA: Nutrient Outlier Check

Check if there are severe differences in nutrient values between the replicates across cruise, cast, Niskin, and depth. Plot these differences to perform a visual check.

```{r}

# calculate the difference between the replicates across all nutrients
nut_check <- all_nutrients %>%
  group_by(cruise, cast, niskin, depth) %>%
  mutate(nitrate_nitrite_diff = abs(nitrate_nitrite - lead(nitrate_nitrite)),
         ammonium_diff = abs(ammonium - lead(ammonium)),
         phosphate_diff = abs(phosphate - lead(phosphate)),
         silicate_diff = abs(silicate - lead(silicate))) 
# isolate the outlier value
# nut_check[which.max(nut_check$nitrate_nitrite_diff),]

# define the nutrient outlier columns to gather on
nut_diff_cols <- c("nitrate_nitrite_diff", "phosphate_diff", "ammonium_diff", "silicate_diff")

# convert to long for ease of plotting
nut_check_long <- nut_check %>%
  select(nitrate_nitrite_diff, ammonium_diff, phosphate_diff, silicate_diff) %>%
  filter(nitrate_nitrite_diff != 0) %>%
  gather(nutrient_diffs, value, nut_diff_cols, factor_key = TRUE)

# plot the differences and look for outliers
ggplot(data = nut_check_long, aes(x = nutrient_diffs, y = value)) +
    geom_boxplot(outlier.size = 0.5) +
    # scale_x_datetime(date_breaks = "6 weeks") +
    ylab(paste0("Concentration (µmol/L)")) +
    theme_classic()

```

## QA: Map Sampling Locations

Call the map_locs function from edi-utility.R to map the sampling locations. Perform a visual check.

```{r}

# Map Check
map_locs(df = all_nutrients, xvar = "longitude", yvar = "latitude",
         region = "transect", colorvar = "cruise")

```


## QA: Plot nutrients as a check

Plot the distribution of nutrient values across all cruises using a boxplot. 

```{r}
# define the nutrient columns 
nut_cols <- c("nitrate_nitrite", "phosphate", "ammonium", "silicate")

# convert to long for ease of plotting
cruises_long <- all_nutrients %>%
    gather(nutrients, value, nut_cols, factor_key = TRUE)

# loop through nutrient columns
for (i in 1:length(nut_cols)) {
  nut_subset <- cruises_long %>% filter(nutrients == nut_cols[i])
  
  # ggplot where x = cast, y = value and the lineplots are grouped by cruise
  p <- ggplot(data = nut_subset, aes(x = date_time_utc, y = value, color = cruise)) +
    geom_boxplot(outlier.size = 0.5) +
    # scale_x_datetime(date_breaks = "6 weeks") +
    ylab(paste0(nut_cols[i], " concentration (µmol/L)")) +
    theme_classic()
  print(p)
}

```

## QA: Determine if any nutrient values exceed expectations

According to a global range check: nitrate less than 30 umol/l and ammonium less than 5 umol/l based on [Rees et al. 2006](https://doi.org/10.1016/j.dsr2.2006.05.008), phosphate less than 3 (no great reference but this appears to be upper for Atlantic), silicate less than 60 (Elements of Physical Oceanography chapter on marine silica cycle, for deep Atlantic, this is probably too high).

```{r}
summary(all_nutrients) # visual min max for all columns

if(any(all_nutrients$nitrate_nitrite > 30)) cat("nitrate_nitrite_exceeds") 
if(any(all_nutrients$ammonium > 5)) cat("ammonium_exceeds")
if(any(all_nutrients$phosphate > 3)) cat("phosphate_exceeds")
if(any(all_nutrients$silicate > 60)) cat("silicate_exceeds")

```
## Output final data file

```{r}

all_nutrients$alternate_sample_id[is.nan(all_nutrients$alternate_sample_id)] <- ""
#all_nutrients$alternate_sample_id[all_nutrients$alternate_sample_id == NaN] <- ""
all_nutrients$nearest_station[is.na(all_nutrients$nearest_station)] <- ""
#all_nutrients <- all_nutrients[order(all_nutrients$date), ]

# Write to the csv file
write.csv(all_nutrients, "nes-lter-nutrient-transect.csv", na = "NaN", row.names = FALSE, quote = FALSE)

```

## EML Assembly
Generate the package and insert the parent project node into the resulting EML

```{r}
# define input files
metadata <- "nutrient-transect-info"
edi_filename <- "nes-lter-nutrient-transect"
pkg_id <- "knb-lter-nes.4.3"

# Make EML Templates 
excel_to_template(here(metadata), edi_filename, rights='CC0', file_type=".md", del_rights = FALSE)

# Data Coverage
# isolate date and geospatial columns for input
date_col <- as.Date(all_nutrients$date_time_utc)
lat_col <- all_nutrients$latitude
lon_col <- all_nutrients$longitude
# run function to determine geospatial and temporal coverage
coverage <- data_coverage(dates = date_col, lat = lat_col, lon = lon_col)

# Make EML
make_eml(path = here(),
         dataset.title = "Dissolved inorganic nutrients from NES-LTER Transect cruises, including 4 macro-nutrients from water column bottle samples, ongoing since 2017",
         data.table.name = edi_filename,
         data.table = paste0(edi_filename, ".csv"),
         data.table.description = "Dissolved inorganic nutrients from water column bottle samples taken on NES-LTER Transect cruises",
         temporal.coverage = c(coverage$startdate, coverage$enddate),
         geographic.description = "NES-LTER Transect",
         geographic.coordinates = c(coverage$North, coverage$East, coverage$South, coverage$West),
         maintenance.description = "ongoing",
         user.id = "NES",
         user.domain = "LTER",
         package.id = pkg_id)

# Insert Custom Project Node
project_insert(edi_pkg = pkg_id, filename = 'parent_project_NESI-II_RAPID_OOI.txt')
```