nes-lter-picoeuk-mvco-eml.Rmd

---
title: "NES-LTER picoeuk and Syn at MVCO"
author: "Stace Beaulieu"
date: "2022-11-08"
output: html_document
---

## R Markdown Setup

```{r setup, include=FALSE}
# Setup: chunk options, timezone, project directory, helper functions, packages.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)

# clear workspace for local development
rm(list = ls())

# set environment timezone to UTC
Sys.setenv(TZ = "UTC")

# assign path to the project directory
# NOTE: keep the trailing slash; later chunks build paths with paste0(dir, ...)
# dir <- "/Users/sbeaulieu/Desktop/github/WHOIGit/nes-lter-picoeuk-mvco/" 
dir <- "/Users/sbeaulieu/Desktop/github/nes-lter-picoeuk-mvco/"
# set as working directory (applies to the rest of this chunk)
setwd(dir)
# knitr restores the working directory when a chunk finishes, so also register
# the project directory as the root for every later chunk
knitr::opts_knit$set(root.dir = dir)

# define source for functions developed for the EDI packaging workflow
source("edi-utilities.R")

# install necessary libraries
# install.packages("devtools")
# install_github("EDIorg/EMLassemblyline")
# remotes::install_github("EDIorg/dataCleanr")

# define R packages to require
libs <- c("tidyverse", "readxl", "lubridate", "devtools", "EMLassemblyline", "EML", "maps", "xml2", "dataCleanr")
# load libraries; require() only returns FALSE when a package is missing, so
# collect the results and stop here with the full list of missing packages
libs_loaded <- vapply(libs, require, logical(1), character.only = TRUE)
if (!all(libs_loaded)) {
  stop(
    "Required package(s) could not be loaded: ",
    paste(libs[!libs_loaded], collapse = ", "),
    call. = FALSE
  )
}
```

## Load data table, revise column headers, and add columns

```{r}

# Load the concentration table, drop incomplete rows, convert the timestamp to
# ISO 8601, rename columns, add fixed location columns and round concentrations.

# earlier versions of this workflow read picoeuk-only files:
# picoeuk <- read_csv("Picoeukaryote_Cellsperml.csv") # data file through 2016
# picoeuk <- read_csv("Picoeukaryote_Raw_Counts.csv")  # data file through 2018
pico <- read_csv("AllPicos_Cellsperml.csv")  # data file inclusive of Syn and picoeuk

# keep only complete rows, i.e. drop every row that has an NA in any column
# (if_all() replaces the superseded filter_all()/all_vars() pair)
picoclean <- filter(pico, if_all(everything(), ~ !is.na(.x)))

# convert picoclean$Time_UTC to ISO8601 format
# with return.format = TRUE the result is a data frame holding the input (x),
# the converted value (x_converted) and the format used (format)
newtime <- iso8601_convert(picoclean$Time_UTC, orders = "dby HMS", return.format = TRUE)
newtime <- rename(newtime, Time_UTC = x)
# keep one lookup row per timestamp so that a repeated Time_UTC value cannot
# multiply rows of picoclean in the join
newtime <- distinct(newtime, Time_UTC, .keep_all = TRUE)
picoclean <- left_join(picoclean, newtime, by = "Time_UTC")

# Note the table does not have any remaining NaT or NaN
# view summary stats
summary(picoclean)

# rename columns
picoclean <- rename(
  picoclean,
  date = x_converted,
  redeuk_leq_20um_cells_per_ml = Picoeukaryote_Concentration_Cells_per_mL,
  syn_cells_per_ml = Synechococcus_Concentration_Cells_per_mL
)

# add constant columns latitude, longitude, depth (same value on every row)
picoclean$latitude <- 41.3250
picoclean$longitude <- -70.5667
picoclean$depth <- 4

# strip extraneous columns Time_UTC and format
picoclean <- select(picoclean, -Time_UTC, -format)

# round abundance to 2 past decimal
picoclean <- mutate(
  picoclean,
  across(
    c(redeuk_leq_20um_cells_per_ml, syn_cells_per_ml),
    ~ round(.x, digits = 2)
  )
)

```


## QA: Map Sampling Locations

Call the `map_locs` function from edi-utilities.R to map the sampling locations. Perform a visual check.

```{r}

# Visual check of the sampling location on a map.
# Package `maps` must be installed because `map_data` needs it.

map_locs(
  df = picoclean,
  xvar = "longitude",
  yvar = "latitude",
  region = "transect",
  colorvar = NULL
)


```

## Additional quality assurance for data values

```{r}
# these steps in addition to summary stats for quality assurance

# all date must be unique: show the unique count next to the row count and
# stop with the offending values if any date occurs more than once
length(unique(picoclean$date))
nrow(picoclean)
duplicated_dates <- unique(picoclean$date[duplicated(picoclean$date)])
if (length(duplicated_dates) > 0) {
  stop(
    "Duplicate values found in picoclean$date: ",
    paste(head(duplicated_dates, 10), collapse = ", "),
    call. = FALSE
  )
}

# legacy checks from the picoeuk-only version of this workflow (column names
# below no longer exist in picoclean):
# summary stats above make sure abundance_picoeuk does not exceed
# toomany <- picoeukclean %>% filter(abundance_picoeuk > 1000000)

# summary stats above make sure no NAs in abundance
# abundNA <- picoeukclean %>% filter(is.na(abundance_picoeuk))


```

## Column Header Organization and write files for upload to EDI
```{r}
# column names of the output table, listed in the order they should appear
nes_lter_pico_mvco_headers <- c(
  "date",
  "redeuk_leq_20um_cells_per_ml",
  "syn_cells_per_ml",
  "latitude",
  "longitude",
  "depth"
)

# keep exactly those columns, in that order
nes_lter_pico_mvco <- select(picoclean, all_of(nes_lter_pico_mvco_headers))

# export the table without row names for upload to EDI
write.csv(
  nes_lter_pico_mvco,
  file = file.path(dir, "/project_folder/nes-lter-pico-mvco-T.csv"),
  row.names = FALSE
)

```

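## Manual step: edit the exported CSV

Use a text editor to manually remove the quotation marks and replace the `T` with a space in the datetime values of `nes-lter-pico-mvco-T.csv`, then save the result as a new CSV file without the `-T` in its name (`nes-lter-pico-mvco.csv`).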

## compare to previously published version

```{r}
# drop the Syn column, then export a temporary csv that can be compared with
# the previous version using diffchecker
compare2old <- nes_lter_pico_mvco %>%
  select(-syn_cells_per_ml)
write.csv(
  compare2old,
  file = file.path(dir, "/project_folder/compare2old.csv"),
  row.names = FALSE
)
# result of the comparison: the previous version has different column headers
# but exactly the same values; none missing, just added rows

```


## EML Assembly: nes-lter-picoeuk-mvco

This chunk outputs the final xml file for EDI through the following steps:

- Step 1: Populating EML Assembly Line templates with metadata
- Step 2: Calculating the geospatial and temporal coverage
- Step 3: Making the XML file
- Step 4: Inserting a custom NES-LTER parent project node

```{r}

# Inputs for the EML assembly
metadata <- "nes-lter-pico-mvco-info"  # xlsx file
project_folder <- "project_folder/"
# edi_data <- "nes-lter-picoeuk-mvco"  # data file created by above script
# The data file named below is the manually edited csv: quotations removed and
# the T in the datetime replaced with a space, saved without the -T in its name
edi_data <- "nes-lter-pico-mvco" # manually edited data file
file_descriptions <- "Data table with picoeuk and Syn cell concentration and time sampled"
pkg_id <- "knb-lter-nes.10.2"

# folder that holds the templates and the data table
eml_path <- paste0(dir, project_folder)

# Step 1: EML templates
# The abstract, additional_info, and methods txt files must already be copied
# into project_folder; otherwise this step produces empty files
xlsx_to_template(
  metadata.path = paste0(dir, metadata),
  output.path = eml_path,
  edi.filename = edi_data,
  rights = "CCBY"
)

# Step 2: data coverage
# pull out the date and geospatial columns used as input
date_col <- as.Date(picoclean$date)
lat_col <- picoclean$latitude
lon_col <- picoclean$longitude
# derive the geospatial and temporal coverage from those columns
coverage <- data_coverage(dates = date_col, lat = lat_col, lon = lon_col)

# Step 3: build the EML file
make_eml(
  path = eml_path,
  dataset.title = "Abundance of eukaryote picophytoplankton and Synechococcus from a moored submersible flow cytometer at Martha's Vineyard Coastal Observatory, ongoing since 2003 (NES-LTER since 2017)",
  data.table = paste0(edi_data, ".csv"),
  data.table.description = file_descriptions,
  temporal.coverage = c(coverage$startdate, coverage$enddate),
  geographic.description = "Martha's Vineyard Coastal Observatory",
  geographic.coordinates = c(
    coverage$North,
    coverage$East,
    coverage$South,
    coverage$West
  ),
  maintenance.description = "ongoing",
  user.id = "NES",
  user.domain = "LTER",
  package.id = pkg_id
)

# Step 4: insert the custom project node
project_insert(
  edi_pkg = pkg_id,
  xml.path = eml_path
)
```

## Add semantic annotations to EML 2.2