This repository was archived by the owner on Jan 10, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02-tidy_data.R
80 lines (65 loc) · 3.21 KB
/
02-tidy_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
library(tidyverse)
# Load the combined temperature data set.
# file.path() builds the path with a separator that works on every OS; the
# previous hard-coded "\\" separator only resolved on Windows.
combined.tempdata <- read.csv(file.path("combined data",
                                        "combined.tempdata.csv"))
head(combined.tempdata)
# Work on a separate copy so the raw import stays available for later checks
cleaned.tempdata <- combined.tempdata
# Convert totalgrowth.mm into a numeric column capped at the maximum plate area.
#
# Some rows hold the string "max" instead of a measurement. They are replaced
# with the area of a circle of diameter 135 (presumably the plate diameter in
# mm -- confirm), rounded to 2 decimal places, so the column can be numeric.
pi * ((135 / 2)^2)
# Single source of truth for the cap, as a number and as the matching string
max.plate.area <- 14313.88
max.plate.area.chr <- as.character(max.plate.area)

cleaned.tempdata$totalgrowth.mm <- gsub("max", max.plate.area.chr,
                                        cleaned.tempdata$totalgrowth.mm,
                                        fixed = TRUE)

# Check all values of "max" have been changed into the max plate area:
# the first count should be 0, the second is the number of capped plates
sum(cleaned.tempdata$totalgrowth.mm == "max", na.rm = TRUE)
sum(cleaned.tempdata$totalgrowth.mm == max.plate.area.chr, na.rm = TRUE)
# Sanity check: every "max" in the original data frame should now be the
# max plate area in the cleaned one (prints TRUE when the counts agree)
sum(combined.tempdata$totalgrowth.mm == "max", na.rm = TRUE) ==
  sum(cleaned.tempdata$totalgrowth.mm == max.plate.area.chr, na.rm = TRUE)

# Turn totalgrowth.mm into a numeric vector for further analysis
cleaned.tempdata$totalgrowth.mm <- as.numeric(cleaned.tempdata$totalgrowth.mm)

# Report how many results exceed the max plate area, then cap them at it.
# pmin() leaves NA values as NA, matching the previous indexed assignment.
sum(cleaned.tempdata$totalgrowth.mm > max.plate.area, na.rm = TRUE)
cleaned.tempdata$totalgrowth.mm <- pmin(cleaned.tempdata$totalgrowth.mm,
                                        max.plate.area)

# Confirm nothing is left above the max plate area. The condition is a plain
# scalar count rather than a comparison on a whole data frame.
n.above.max <- sum(cleaned.tempdata$totalgrowth.mm > max.plate.area,
                   na.rm = TRUE)
if (n.above.max > 0) {
  print(filter(cleaned.tempdata, totalgrowth.mm > max.plate.area))
} else {
  print("No variables above max plate size")
}
# Count the missing totalgrowth.mm values, then list the affected rows so
# the missing measurements can be tracked down where possible
cleaned.tempdata$totalgrowth.mm %>%
  is.na() %>%
  sum()
cleaned.tempdata %>%
  filter(is.na(totalgrowth.mm))
# Keep only plates that were neither contaminated nor had hit the plate edge,
# as either of these will affect the growth rate calculation
cleaned.tempdata <- filter(cleaned.tempdata, hit.edge == "no",
                           contaminated == "no")
# Then write this to a csv file for access in future calculations
###### Write cleaned.tempdata.csv
# file.path() keeps the output path valid on non-Windows systems, where the
# previous "\\" separator would have become part of the file name
write.csv(cleaned.tempdata,
          file = file.path("cleaned data", "cleaned.tempdata.csv"),
          row.names = FALSE)
##### Checking work so far
# See how many rows there are at each temperature point after cleaning
group_by(cleaned.tempdata, temperature) %>%
  summarise(n())
# Read in the previous per-temperature combined files; file.path() keeps the
# paths portable instead of relying on the Windows-only "\\" separator
fifteen <- read.csv(file.path("combined data", "15degreescombined.csv"))
twentyfive <- read.csv(file.path("combined data", "25degreescombined.csv"))
thirty <- read.csv(file.path("combined data", "30degreescombined.csv"))
# Combine these csv's
combined.rawdata <- rbind(fifteen, twentyfive, thirty)
# Filter out the hit edges and contaminated plates as above
filter(combined.rawdata, hit.edge == "no", contaminated == "no") %>%
  group_by(temperature) %>%
  summarise(n())
# Check that the per-temperature counts match. all.equal() prints TRUE or a
# description of the differences; `==` on two tables with different numbers
# of rows stops with an error instead of reporting the mismatch.
all.equal(
  as.data.frame(
    group_by(cleaned.tempdata, temperature) %>% summarise(n())
  ),
  as.data.frame(
    filter(combined.rawdata, hit.edge == "no", contaminated == "no") %>%
      group_by(temperature) %>% summarise(n())
  )
)
##### Add in time in culture and species isolated from
# Paths are built with file.path() so the same script runs on any OS; the
# original mixed a doubled "/" for reading with a Windows-only "\\" for writing
daysinculture <- read.csv(file.path("raw data", "days.in.culture.csv"))
head(daysinculture)
head(cleaned.tempdata)
# Attach the culture information to each cleaned row, matching on isolate and
# temperature; rows with no match keep NA in the added columns
cultureinfo.tempdata <- left_join(cleaned.tempdata, daysinculture,
                                  by = c("isolate", "temperature"))
write.csv(cultureinfo.tempdata,
          file = file.path("cleaned data", "cultureinfo.tempdata.csv"),
          row.names = FALSE)