-
Notifications
You must be signed in to change notification settings - Fork 0
/
clear-invalid-env-PacoEzpela.Rmd
89 lines (67 loc) · 2.71 KB
/
clear-invalid-env-PacoEzpela.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
In this notebook we identify the periods when the sensors were moved and, therefore, the logged data is not valid.
```{r}
# Load the project helpers and the packages used throughout the notebook.
# NOTE(review): 'lib-dendro.R' is resolved against the working directory that
# is current *before* the setwd() call below; it presumably defines
# read.all.env.processed(), used in the next chunk -- confirm.
source('lib-dendro.R')
library(plotly)
library(readr) # for write_csv() function
library(tidyverse) # for %>%
library(glue)

# Global variables ----
# Folder that contains this notebook. The RStudio editor is only queried when
# an RStudio session is actually attached; otherwise (e.g. when the document
# is rendered in a separate R process) fall back to the current working
# directory, which is assumed to be the notebook's folder -- the source() call
# above already relies on that.
PATH <- if (requireNamespace("rstudioapi", quietly = TRUE) &&
  rstudioapi::isAvailable()) {
  dirname(rstudioapi::getSourceEditorContext()$path)
} else {
  getwd()
}
setwd(PATH)
# Site identifier, interpolated into the input and output folder names.
PLACE <- "PacoEzpela"
BUFFER_ENV_DIR <- glue("processed/{PLACE}-env-buffer-toclear")
OUTPUT_ENV_DIR <- glue("processed/{PLACE}-env-processed")
```
```{r}
# Import the buffered environmental data that still has to be cleaned.
# `pattern` is a regular expression, not a shell glob: the dot is escaped and
# the match is anchored at the end, so only file names ending in ".csv" are
# listed (the previous "*.csv$" also accepted names such as "datacsv").
list_files <- list.files(
  file.path(PATH, BUFFER_ENV_DIR),
  pattern = "\\.csv$",
  full.names = TRUE
)
# read.all.env.processed() is not defined in this notebook (presumably it
# comes from lib-dendro.R); it receives the full paths of all matched files.
db.env <- read.all.env.processed(list_files)
str(db.env)
```
VWC makes it easiest to see when a sensor has been moved (VWC stays at 0 for a long period):
```{r}
# Interactive line plot of VWC over time, coloured by series.
f <- db.env %>%
  plot_ly(
    x = ~ts,
    y = ~vwc,
    color = ~series,
    type = "scatter",
    mode = "lines"
  )
f
# library(datacleanr); dcr_app(db.env)
```
Defining and clearing the periods of invalid data. There is quite a lot of invalid data (time frames in which a sensor had been removed by an animal), and we need to remove it:
First, we remove all the data prior to installation:
```{r}
# Discard every record logged before the installation timestamp.
ts_start <- "2023-03-23 14:00:00"
head(db.env)
# which() returns only the positions where the comparison is TRUE, so rows
# whose comparison evaluates to NA are dropped as well.
after_install <- which(db.env$ts >= ts_start)
db.env <- db.env[after_install, ]
head(db.env)
```
Then, we set the measurements to NA wherever the data is not valid:
```{r}
# Blank out the invalid readings of each series.
# Measurement columns that are set to NA inside an invalid interval; the
# identifying columns (series, ts) are left untouched.
env_cols <- c("vwc", "soil.temp", "surface.temp", "air.temp")

# Series 94252898: invalid from 2023-06-18 04:15:00 onwards.
series1 <- db.env$series == "94252898"
interval1.1 <- db.env$ts >= "2023-06-18 04:15:00"
db.env[series1 & interval1.1, env_cols] <- NA

# Series 94252899: invalid from 2023-08-27 20:45:00 onwards.
series2 <- db.env$series == "94252899"
interval2.1 <- db.env$ts >= "2023-08-27 20:45:00"
db.env[series2 & interval2.1, env_cols] <- NA
```
This is the result after setting the invalid periods to NA:
```{r}
f <- plot_ly(db.env, x = ~ts, y = ~vwc, color = ~series, type = 'scatter', mode = 'lines')
f
```
Save the result:
```{r}
# Write the cleaned per-series data, then the per-timestamp mean across all
# series, to CSV files under the output folder.
OUTPUT_PATH <- file.path(PATH, OUTPUT_ENV_DIR)
if (!dir.exists(OUTPUT_PATH)) {
  # recursive = TRUE also creates any missing parent folder.
  dir.create(OUTPUT_PATH, recursive = TRUE)
}
write_csv(
  db.env,
  file.path(OUTPUT_PATH, "proc-env.csv"),
  append = FALSE,
  col_names = TRUE
)

# Do mean of all sensors
# (summarise() keeps only `ts` and the four averaged columns, so no prior
# column selection is needed.)
# Note: with na.rm = TRUE, a timestamp at which every series is NA yields NaN.
db.agg <- db.env %>%
  # filter(ts < ymd_hms("2022-12-31 00:00:00")) %>%
  group_by(ts) %>%
  dplyr::summarise(
    soil.temp = mean(soil.temp, na.rm = TRUE),
    surface.temp = mean(surface.temp, na.rm = TRUE),
    air.temp = mean(air.temp, na.rm = TRUE),
    vwc = mean(vwc, na.rm = TRUE)
  )
summary(db.agg)

# write aggregated data to file.
agg_dir <- file.path(OUTPUT_PATH, "aggregated")
if (!dir.exists(agg_dir)) {
  dir.create(agg_dir, recursive = TRUE)
}
write_csv(
  db.agg,
  file.path(agg_dir, "proc-agg-env.csv"),
  append = FALSE,
  col_names = TRUE
)
```