forked from WHOIGit/nes-lter-picoeuk-mvco
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnes-lter-picoeuk-mvco-eml.Rmd
204 lines (150 loc) · 7 KB
/
nes-lter-picoeuk-mvco-eml.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
---
title: "NES-LTER picoeuk and Syn at MVCO"
author: "Stace Beaulieu"
date: "2022-11-08"
output: html_document
---
## R Markdown Setup
```{r setup, include=FALSE}
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
# clear workspace for local development
rm(list = ls())
# set environment timezone to UTC
Sys.setenv(TZ = "UTC")
# absolute path to the local clone of the repository (edit for your machine);
# the trailing slash is relied on by the later paste0(dir, ...) calls
# dir <- "/Users/sbeaulieu/Desktop/github/WHOIGit/nes-lter-picoeuk-mvco/"
dir <- "/Users/sbeaulieu/Desktop/github/nes-lter-picoeuk-mvco/"
# set as working directory for this chunk (and for plain console sessions)
setwd(dir)
# knitr restores the working directory when a chunk finishes, so also register
# the directory as root.dir to make the following chunks run inside it
knitr::opts_knit$set(root.dir = dir)
# define source for functions developed for the EDI packaging workflow
source("edi-utilities.R")
# install necessary libraries
# install.packages("devtools")
# install_github("EDIorg/EMLassemblyline")
# remotes::install_github("EDIorg/dataCleanr")
# define R packages to require
libs <- c("tidyverse", "readxl", "lubridate", "devtools", "EMLassemblyline", "EML", "maps", "xml2", "dataCleanr")
# load libraries; require() only returns FALSE for a package that cannot be
# loaded, so collect the results and stop with the names of any that failed
loaded <- vapply(libs, require, logical(1), character.only = TRUE)
if (!all(loaded)) {
  stop(
    "Missing required package(s): ",
    paste(libs[!loaded], collapse = ", "),
    call. = FALSE
  )
}
```
## Load data table, revise column headers, and add columns
```{r}
# Earlier versions of this workflow read "Picoeukaryote_Cellsperml.csv"
# (data file through 2016) or "Picoeukaryote_Raw_Counts.csv" (data file
# through 2018) into `picoeuk`; the current input covers both Syn and picoeuk.
pico <- read_csv("AllPicos_Cellsperml.csv") # data file inclusive of Syn and picoeuk
# keep only complete rows: a row is dropped as soon as ANY column is NA
picoclean <- drop_na(pico)
# convert picoclean$Time_UTC to ISO8601 format; the steps below rely on the
# returned columns x (input), x_converted (result) and format
newtime <- iso8601_convert(picoclean$Time_UTC, orders = 'dby HMS', return.format = TRUE)
newtime <- rename(newtime, Time_UTC = x)
# join against a de-duplicated lookup so that a repeated Time_UTC value cannot
# multiply rows (n identical timestamps would otherwise become n^2 rows)
picoclean <- left_join(picoclean, distinct(newtime), by = "Time_UTC")
# Note the table does not have any remaining NaT or NaN
# view summary stats
summary(picoclean)
# rename columns to the published headers
picoclean <- rename(
  picoclean,
  date = x_converted,
  redeuk_leq_20um_cells_per_ml = Picoeukaryote_Concentration_Cells_per_mL,
  syn_cells_per_ml = Synechococcus_Concentration_Cells_per_mL
)
# add constant columns latitude, longitude, depth (same value on every row)
picoclean$latitude <- 41.3250
picoclean$longitude <- -70.5667
picoclean$depth <- 4
# strip extraneous columns Time_UTC and format
picoclean <- select(picoclean, -Time_UTC, -format)
# round concentrations to 2 past decimal
picoclean$redeuk_leq_20um_cells_per_ml <- round(picoclean$redeuk_leq_20um_cells_per_ml, digits = 2)
picoclean$syn_cells_per_ml <- round(picoclean$syn_cells_per_ml, digits = 2)
```
## QA: Map Sampling Locations
Call the `map_locs` function from edi-utilities.R to map the sampling locations, then perform a visual check of the result.
```{r}
# Visual QA: plot the longitude/latitude pairs of picoclean on a map
# (package `maps` must be available because `map_data` depends on it)
map_locs(
  df = picoclean,
  xvar = "longitude",
  yvar = "latitude",
  region = "transect",
  colorvar = NULL
)
```
## Additional quality assurance for data values
```{r}
# these steps in addition to summary stats for quality assurance
# every date value must be unique: report the number of distinct values ...
length(unique(picoclean$date))
# ... and stop when a value is repeated, rather than relying on a visual
# comparison of that count with the number of rows
if (anyDuplicated(picoclean$date) > 0) {
  stop("picoclean$date contains duplicated values", call. = FALSE)
}
# summary stats above make sure abundance_picoeuk does not exceed
# toomany <- picoeukclean %>% filter(abundance_picoeuk > 1000000)
# summary stats above make sure no NAs in abundance
# abundNA <- picoeukclean %>% filter(is.na(abundance_picoeuk))
```
## Column Header Organization and write files for upload to EDI
```{r}
# Columns of the table to publish, listed in the order they should appear
nes_lter_pico_mvco_headers <- c(
  "date",
  "redeuk_leq_20um_cells_per_ml",
  "syn_cells_per_ml",
  "latitude",
  "longitude",
  "depth"
)
# Pull exactly those columns, in that order
nes_lter_pico_mvco <- picoclean[nes_lter_pico_mvco_headers]
# Export the "-T" csv (the version that is later hand-edited) without row names
write.csv(
  nes_lter_pico_mvco,
  file = file.path(dir, "/project_folder/nes-lter-pico-mvco-T.csv"),
  row.names = FALSE
)
```
## Use a text editor to manually remove the quotation marks and replace the "T" with a space in the datetime values
## Save the edited table as a new csv file without the "-T" suffix (nes-lter-pico-mvco.csv)
## compare to previously published version
```{r}
# Build a temporary table without the syn column so it can be compared with
# the previously published version in diffchecker
compare2old <- nes_lter_pico_mvco %>%
  select(-syn_cells_per_ml)
write.csv(
  compare2old,
  file = file.path(dir, "/project_folder/compare2old.csv"),
  row.names = FALSE
)
# Outcome of that comparison: the previous version has different column
# headers but exactly the same values; none missing, just added rows
```
## EML Assembly: nes-lter-picoeuk-mvco
This chunk outputs the final XML file for EDI through the following steps:

1. Populate the EML Assembly Line templates with metadata
2. Calculate the geospatial and temporal coverage
3. Make the XML file
4. Insert a custom NES-LTER parent project node
```{r}
# ---- Inputs for the EML assembly ----
metadata <- "nes-lter-pico-mvco-info" # xlsx file
project_folder <- "project_folder/"
# edi_data <- "nes-lter-picoeuk-mvco" # data file created by above script
# The data file named below is the hand-edited csv: quotations removed,
# T replaced with space in datetime, saved without the -T
edi_data <- "nes-lter-pico-mvco" # manually edited data file
file_descriptions <- "Data table with picoeuk and Syn cell concentration and time sampled"
pkg_id <- "knb-lter-nes.10.2"

# ---- Make EML templates ----
# The abstract, additional_info, and methods txt files have to be copied into
# project_folder beforehand, or else this step will produce empty files
xlsx_to_template(
  metadata.path = paste0(dir, metadata),
  output.path = paste0(dir, project_folder),
  edi.filename = edi_data,
  rights = "CCBY"
)

# ---- Data coverage ----
# Isolate the date and position columns that feed the coverage calculation
date_col <- as.Date(picoclean$date)
lat_col <- picoclean$latitude
lon_col <- picoclean$longitude
# Determine the geospatial and temporal coverage from those columns
coverage <- data_coverage(dates = date_col, lat = lat_col, lon = lon_col)

# ---- Make EML ----
make_eml(
  path = paste0(dir, project_folder),
  dataset.title = "Abundance of eukaryote picophytoplankton and Synechococcus from a moored submersible flow cytometer at Martha's Vineyard Coastal Observatory, ongoing since 2003 (NES-LTER since 2017)",
  data.table = paste0(edi_data, ".csv"),
  data.table.description = file_descriptions,
  temporal.coverage = c(coverage$startdate, coverage$enddate),
  geographic.description = "Martha's Vineyard Coastal Observatory",
  geographic.coordinates = c(coverage$North, coverage$East, coverage$South, coverage$West),
  maintenance.description = "ongoing",
  user.id = "NES",
  user.domain = "LTER",
  package.id = pkg_id
)

# ---- Insert custom project node ----
project_insert(
  edi_pkg = pkg_id,
  xml.path = paste0(dir, project_folder)
)
```
## Add semantic annotations to EML 2.2