forked from astaaudzi/RLSanalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRLSsiteGrouping.Rmd
408 lines (283 loc) · 21.5 KB
/
RLSsiteGrouping.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
---
title: "Grouping of RLS sites"
author: "Asta and Freddie"
date: "17 November 2017"
output:
html_document:
code_folding: show
toc: yes
toc_float: no
pdf_document:
toc:yes
---
```{r warning=FALSE, message=FALSE, warning=FALSE}
rm(list=ls())
list.files()
list.of.packages <- c("tidyverse", "dplyr", "ggplot2", "ggmap")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only = TRUE)
#load RLS_MPA bassian ecoregion data
load(file="rlsbas.RData")
nsites <- length(unique(rls_bas$SiteCode)) #531 site codes in the data
## sumarrise bassian sites
sites_bas <- rls_bas %>%
group_by(SiteCode) %>%
summarise (name = first(`Site name`), location = first(Location), latt = round(mean(SiteLat),3), long = round(mean(SiteLong),3), nsp = n_distinct(SpeciesID), year = n_distinct(year), surveys = n_distinct(SurveyID), depth = mean(Depth), visibility = round(mean(Visibility, na.rm = TRUE),3))
# there are 531 sites.
# Plot the sites
tasmania = get_map(location = c(lon = 145, lat = -40), zoom = 6, maptype = "satellite")
## TODO - add a title and axis names to this
ggmap(tasmania, extent = "panel", legend = "bottomright") +
ggtitle("Reef Life Survey Sites") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_point(aes(x = sites_bas$long, y = sites_bas$latt), data = sites_bas, size = 1, color = as.numeric(factor(sites_bas$location))) +
scale_y_continuous(limits = c(-44.00, -37.00), expand = c(0, 0)) +
scale_x_continuous(limits = c(143.00, 150.00), expand = c(0, 0))
# TITLE ADDED BUT NOT SURE WHY THE ERROR?
#Table of sites
#knitr::kable(sites_bas)
```
There are 531 sites. They are roughly grouped into locations as shown on the map, but Rick suggests that location name should not be used. So let's group them differently. First we define a grid of 10 intervals and group the sites into the grid.
```{r warning=FALSE, message=FALSE, warning=FALSE}
## Group the sites into
n_groups <- 10
int <- (max(sites_bas$latt) - min(sites_bas$latt)) / n_groups #latitude interval
sites_bas$group <- ceiling((sites_bas$latt - (min(sites_bas$latt)-0.001)) / int )
int2 <- (max(sites_bas$long) - min(sites_bas$long)) / n_groups #long based interval
sites_bas$group2 <- ceiling((sites_bas$long - (min(sites_bas$long)-0.001)) / int2 )
sites_bas$group3 <- ((sites_bas$group - 1) * n_groups) + sites_bas$group2 # final grouping, the groups are not consequtive numbers
length(unique(sites_bas$group3)) # this gives 31 groups
#and map these sites
hlines <- seq(min(sites_bas$latt),max(sites_bas$latt),by=int)
vlines <- seq(min(sites_bas$long),max(sites_bas$long),by=int2)
ggmap(tasmania, extent = "panel", legend = "bottomright") +
geom_point(aes(x = sites_bas$long, y = sites_bas$latt), data = sites_bas, size = 1, color = sites_bas$group3) +
scale_y_continuous(limits = c(-44.00, -37.00), expand = c(0, 0)) +
scale_x_continuous(limits = c(143.00, 150.00), expand = c(0, 0)) +
geom_vline(xintercept = vlines) +
geom_hline(yintercept = hlines)
# Now put these groups into the main rls_bas dataset
rls_bas$group <- sites_bas$group3[match(rls_bas$SiteCode, sites_bas$SiteCode)]
```
Next I get site covariate inforamtion. The following chunk is used to load all the data from .csv files and compile it into one data file saved as SiteData_bas.RData. It has a message eval=FALSE which means that it will not run upon knitting.
```{r warning=FALSE, message=FALSE, warning=FALSE, eval=FALSE, echo=FALSE}
#this chunk only needs to be run once or not at all if the output of site infomation is loaded from RData
#but I keep it here for the future, in case we get new .csv files from Rick and new inforamtion
#Get other information about the sites, like protection status, exposure, environmental covariates
rls_exp <- read_csv("RLSexposures.csv")
rls_exp <- rls_exp %>% select(SiteCode, `Wave exposure`)
rls_cov <- read_csv("RLSsitecovar.csv") # files with various covariates
rls_cov <- rls_cov[,c(1:2,15:31)]
rls_prot <- read_csv("Neoli.csv") #file with protection status
rls_prot <- rls_prot %>% select(SiteCode, Governance)
#in neoli.csv data empty values for governance means that the site is not protected. So we replace them with 0
rls_prot$Governance[is.na(rls_prot$Governance)] <- 0
## These are datafiles from German for the MPA sites
mpa1 <- read_csv("TASprotect1.csv") #MPA sites and their protection status
mpa2 <- read_csv("TASprotect2.csv")
sites_mpa1 <- mpa1 %>%
group_by(SiteCode) %>%
summarise(protection = first(RESERVE_STATUS_CODE))
a = sapply(sites_mpa1$protection,switch,'NTZ'=1,'EXT'=0, 'RTZ' = 0.5)
rls_mpa1 <- cbind(sites_mpa1[,1],a)
sites_mpa2 <- mpa2 %>%
group_by(SiteCode) %>%
summarise(protection = first(RESERVE_STATUS_CODE))
b = sapply(sites_mpa2$protection,switch,'NTZ'=1,'EXT'=0, 'RTZ' = 0.5)
rls_mpa2 <- cbind(sites_mpa2[,1],b)
algae <- read_csv("TASalgae.csv") #more MPA sites and protection status as well as algal abundance
sites_algae <- algae %>%
group_by(SiteCode) %>%
summarise(protection = first(RESERVE_STATUS_CODE), abundA = mean(TOTAL_NUMBER))
c = sapply(sites_algae$protection,switch,'NTZ'=1,'EXT'=0, 'RTZ' = 0.5)
rls_algae = cbind(sites_algae[,c(1,3)],c)
##Now I need to merge all the matrices
sites1 <- merge(x = sites_bas, y = rls_exp, by.x = "SiteCode", by.y = "SiteCode", all.x = TRUE)
sites2 <- merge(x = sites1, y = rls_cov, by.x = "SiteCode", by.y = "SiteCode", all.x = TRUE)
sites3 <- merge(x = sites2, y = rls_prot, by.x = "SiteCode", by.y = "SiteCode", all.x = TRUE)
sites4 <- merge(x = sites3, y = rls_mpa1, by.x = "SiteCode", by.y = "SiteCode", all.x = TRUE)
sites5 <- merge(x = sites4, y = rls_mpa2, by.x = "SiteCode", by.y = "SiteCode", all.x = TRUE)
sites6 <- merge(x = sites5, y = rls_algae, by.x = "SiteCode", by.y = "SiteCode", all.x = TRUE)
# further manipulation of data
# select the four columns that have the protection status for different sites from different files and merge them to remove NAs
protection <- sites6 %>% select(Governance, a, b, c)
mpa = t(apply(protection, 1, sort, na.last = T)) # now the first column has the protection information
mpa = as.numeric(mpa[,1]) # pick the first column now, if it has NA then the site does not have protection information. Note, 1 means no take, 0.5 means restricted take, and 0 means open to fishing
sites6$latt <- abs(sites6$latt) #convert lattitude to positive values for PCA analyses
siteData <- cbind(sites6, mpa)
siteData <- siteData[!duplicated(siteData), ] #remove duplicates
save(siteData, file = "SiteData_bas.RData") # to avoid reading all the matrices again
```
#Now group sites based on environmental data
```{r warning=FALSE, message=FALSE, warning=FALSE}
load (file = "SiteData_bas.RData")
# Select which variables should be included in the final analsyes. I don't include visibility because it is missing in many sites
siteData1 <- siteData %>% select(SiteCode, latt, long, nsp, depth, damax, chlomean, chlomax, phos, silicate, sstmean, sstrange, sstmax, sstmin, chlomin, parmean, salinity, dissox, cloudmean, nitrate, calcite, ph, abundA, `Wave exposure`)
## The data set above can include any range of selected variables. NOTE, the protection status or variable 'mpa' probably should not be included in clustering. We should cluster sites by different variables. Then the models will be run with and without fishing and emergent data compared to sites with and without protection
plot(siteData1[,-1])
#It is interesting how latitude correlates with phos and silicate so strongly! Looks a big weird. Latitude correlates with sstmean and sstmax but not sstrange. damax (turbidity) correlates strongly with chlomean and chlomax. So it looks that we could reduce the strongly correlated variables
#in the new selection I exclude strongly correlated variables and also abundA (algal abudnance) and wave exposure due to missing data
siteData1 <- siteData %>% select(SiteCode, latt, long, nsp, depth, chlomean, phos, sstmean, sstrange, sstmax, sstmin, parmean, salinity, dissox, cloudmean, calcite, ph)
plot(siteData1[,-1])
## Freddie, this is a super clumsy code and I am sure this could be done better - I need to turn it into a numeric matrix wtih site names as row names and variables as columns. This is because clustering analyses do not run on lists but need numeric values
siteForPCA <- matrix(unlist(siteData1),ncol=length(siteData1[,]),byrow=FALSE) #turn the list into matrix
siteForPCAt <- siteForPCA[,-1] #remove the first column which is the site code
storage.mode(siteForPCAt) <- "numeric"
siteForPCAt[is.nan(siteForPCAt)] <- NA # some NaN still found in the data and will mess up the analyses. Replace them with NA
rownames(siteForPCAt) <- siteData1[,1] #site code as row names
colnames(siteForPCAt) <- colnames(siteData1[,-1]) #variables as column names
# This is now the main data for PCA analysis of sites based on environmental conditions
envirPCA <- siteForPCAt
##Principal component analysis.
# At this stage we simply remove observations with missing data. There are some methods that can to interpolate missing data and it might be better to use that in the future
sitePCA = prcomp(na.omit(envirPCA[,-c(1:2)]), center = TRUE, scale. = TRUE, retx = TRUE) #conduct PCA but don't use lat and long as variables. I use scaled nad rotated axis, but it does not make any difference
summary(sitePCA)
plot(sitePCA)
biplot(sitePCA, cex = 0.5, expand = 1)
plot(sitePCA$x[,1], sitePCA$x[,2])
sitePCA$rotation[,c(1:2)] #check correlation of variables with the first two axis. Strong correlation show they are most important determinants of grouping
pscores = sitePCA$x[,c(1,2)]
siteData2 = merge(x = siteData1, y = pscores, by.x = "SiteCode", by.y = "row.names", all.x = TRUE)
# TODO - The following is not a very rigorous or automated way to define groups. This done by simple eyeballing of the PCA plot to identify four main groups on PC1 and PC2. This should be done better
sitesWithGroups <- siteData2 %>% mutate(g = case_when((siteData2$PC1 < 0 & siteData2$PC2 < 0) ~ 1, siteData2$PC1 > 4 ~ 2, (siteData2$PC1 < -2 & siteData2$PC2 >0) ~ 3))
sitesWithGroups$g[is.na(sitesWithGroups$g)] <- 4 # assign other sites a code of 4.
## And plot them now - looks pretty cool! Three clear groups based on environmental variables only
# We just need to change colours so they are better
tasmania = get_map(location = c(lon = 145, lat = -40), zoom = 6, maptype = "satellite")
ggmap(tasmania, extent = "panel", legend = "bottomright") +
geom_point(aes(x = sitesWithGroups$long, y = -(sitesWithGroups$latt)), data = sitesWithGroups, size = 0.7, color = sitesWithGroups$g) +
scale_y_continuous(limits = c(-44.00, -37.00), expand = c(0, 0)) +
scale_x_continuous(limits = c(143.00, 150.00), expand = c(0, 0))
#Finally add the envirogroups to the main dataset
rls_bas$envgroup <- sitesWithGroups$g[match(rls_bas$SiteCode, sitesWithGroups$SiteCode)]
```
The results above show that when 14 variables are included, the first two PC components explain 73% of variation, which is good. The PCA shows that the first axis is mostly determined by the contrast between temperature(sstmean, sstmax), salinity, ph against nitrate, phos, silicate and dissox. The three groups visible on PC1 are further separated on PC2, which is loaded by damax (turbidity), cholorpyl and sstrage agaist sstmin and depth. The four groups define S, Port Phillip Bay, Derwent and the rest.
Next we can look whether clustering gives us similar groupings
```{r warning=FALSE, message=FALSE, warning=FALSE}
clus <- kmeans(matrixForPCA[,-c(1:2)], 3, nstart = 20) # don't include lat long in clustering, only use environmental covariates
siteclus <- clus$cluster
## add cluster identity to the sizeWithGroups matrix (it is called y now but should be renamed)
sitesWithGroups = merge(x = sitesWithGroups, y = siteclus, by.x = "SiteCode", by.y = "row.names", all.x = TRUE)
ggmap(tasmania, extent = "panel", legend = "bottomright") +
geom_point(aes(x = sitesWithGroups$long, y = -(sitesWithGroups$latt)), data = sitesWithGroups, size = 0.7, color = sitesWithGroups$y) +
scale_y_continuous(limits = c(-44.00, -37.00), expand = c(0, 0)) +
scale_x_continuous(limits = c(143.00, 150.00), expand = c(0, 0))
```
*Result* - The k means clustering into 3 groups is not that good. The groups are all intermixed.
The next thing I want to do is to look at the grouping based on biological data only
```{r warning=FALSE, message=FALSE, warning=FALSE, eval = FALSE}
#First filter out species that have only been observed over a certain number of years, here I choose species seen for 5 or more years
# I can either apply a filter by selecting key groups that I think are important, or use all groups
mainSpeciesOccur <- rls_bas %>%
# filter (( PHYLUM == "Chordata" & GENUS != "NA" & CLASS != "Ascidiacea") | CLASS == "Cephalopoda") %>%
# filter (( PHYLUM == "Chordata" & GENUS != "NA" & CLASS != "Ascidiacea") | CLASS == "Cephalopoda" | ORDER == "Decapoda" | GENUS == "Haliotis" | GENUS == "Turbo" | CLASS == "Asteroidea" | CLASS == "Echinoidea" | CLASS == "Echiuroidea" ) %>%
group_by (TAXONOMIC_NAME, GENUS, CLASS, FAMILY, ORDER) %>%
summarise (years = n_distinct(year), years2000 = n_distinct(ye2000), years2010 = n_distinct(ye2010)) %>%
filter(years > 4)
MainSpeciesList <- mainSpeciesOccur[[1]] ## list of 163 species filtered by occurrence in years, or 220 species without any filters
## Common benthic invertebrates - include lobsters (ORDER == "Decapoda", not many of them), abalone (GENUS == "Haliotis", 2 species), periwinkles (GENUS == "Turbo", Turbo undulatus), seastars (Class == "Asteroidea"), and urchins as one grazer group (CLASS == "Echinoidea" & "Echiuroidea"). Also Maoricolpus roseus is a common gastropod and an invader from NZ. Dicathais orbita (sea kangaroo?) is a common gastropod, could be pooled with two other presumably herbivorous gastropods? Alternatively we just pool all gastopods except for Haliotis
#Second calculate mean abundance and biomass of these MORE COMMON species per survey and filter out rare species
SpPerSurvey <- rls_bas %>%
filter (TAXONOMIC_NAME %in% MainSpeciesList) %>%
group_by (TAXONOMIC_NAME, GENUS, CLASS, FAMILY, ORDER, SurveyID) %>% ## group by day, month, year and location - survery
summarise (abundSur = round(sum(Abundance),2), biomSur = round(sum(BioMass, na.rm=TRUE),2)) %>%
group_by (TAXONOMIC_NAME, GENUS, CLASS, FAMILY, ORDER) %>%
summarise(meanAbun = round(mean(abundSur),3), meanBiom = round(mean(biomSur),3)) %>%
filter (meanAbun > 4 | meanBiom > 1000)
MainSpecies <- SpPerSurvey[[1]] ## if abund > 5 I get final list of common species with 3 filters applied = 68 species, or 78 species if using all taxa. If abund > 6 I get 68 for final list of taxa.
#Get summary about the groups
groups_bas <- rls_bas %>%
group_by(group) %>%
summarise (location = first(Location), latt = round(mean(SiteLat),3), long = round(mean(SiteLong),3), nsp = n_distinct(SpeciesID), year = n_distinct(year), surveys = n_distinct(SurveyID), locatnum = n_distinct(Location), depth = mean(Depth), visibility = round(mean(Visibility, na.rm = TRUE),3))
# Create sitexspecies abundance matrix. First, summarise species records for the main sites. One can apply various filters to exclude locations that have large deviation on sites
SpeciesforPCA <- rls_bas %>%
filter (TAXONOMIC_NAME %in% MainSpecies) %>%
# filter (Location != "Port Phillip Bay" & Location != "Port Phillip Heads" & Location != "Rocky Cape" & Location != "Derwent" & Location != "Derwent Estuary" & Location != "Victoria (other)") %>%
# filter (Location != "Port Phillip Bay" & Location != "Port Phillip Heads") %>%
filter (SiteLat < -39) %>% # filter victorian sites
# filter(Method == 1) %>%
# filter (year < 2006) %>%
group_by(group, SurveyID, TAXONOMIC_NAME) %>%
summarise (abund = round(sum(Abundance),3)) %>%
group_by(group, TAXONOMIC_NAME) %>%
summarise(abundMean = round(mean(abund), 3))
## This gives the mean abundance of key species per site - a large matrix
#Freddie, here again is the same issue of converting a list into a matrix
spPCAdata <- spread(SpeciesforPCA, key = group, value = abundMean, fill = 0)
spPCAdata1 <- matrix(unlist(spPCAdata),ncol=length(spPCAdata[,]),byrow=FALSE) #turn the list into matrix
spPCAdatat <- spPCAdata1[,-1]
storage.mode(spPCAdatat) <- "numeric"
rownames(spPCAdatat) <- spPCAdata1[,1]
colnames(spPCAdatat) <- colnames(spPCAdata[,-1])
speciesPCA <- t(spPCAdatat)
#Freddie, we need to give more informative titles to the groups we now define from the coordinate grid. For example, in the plots below it would be good to identify what those groups are
## now we do PCA
sitePCAsp = prcomp(speciesPCA, scale. = TRUE, retx = TRUE)
summary(sitePCAsp)
plot(sitePCAsp)
biplot(sitePCAsp, cex = 0.5, expand = 1)
plot(sitePCAsp$x[,1], sitePCAsp$x[,2]) # How can we identify the groups better?
pscores = sitePCAsp$x[,c(1,2)]
groupData = merge(x = groups_bas, y = pscores, by.x = "group", by.y = "row.names", all.x = FALSE)
# The following is not a very rigorous or automated way to define groups. This done by simple eyeballing of the PCA plot to identify four main groups on PC1 and PC2
groupData <- groupData %>% mutate(grPC = case_when(groupData$PC1 > 2 ~ 1, (groupData$PC1 <1 & groupData$PC2 < -1) ~ 2, (groupData$PC1 <1 & groupData$PC2 > -1) ~ 3))
ggmap(tasmania, extent = "panel", legend = "bottomright") +
geom_point(aes(x = groupData$long, y = groupData$latt), data = groupData, size = 1, color = (groupData$grPC+1)) + # I add +1 to colour because black colour is almost not visible
scale_y_continuous(limits = c(-44.00, -37.00), expand = c(0, 0)) +
scale_x_continuous(limits = c(143.00, 150.00), expand = c(0, 0))
```
The last chunk looks at various ordinations based on location
```{r warning=FALSE, message=FALSE, warning=FALSE, eval = FALSE}
# The PCA on sites does not look good as there is too much effect of some deviant sites and variation cannot be summarised in the first few axes. Let's try to do the PCA on Locations instaed
#I exclude victorian locations and derwent river as they seem to be too different. Also the analysis can be done for two main study periods - before and after 2005
LocForPCA <- rls_bas %>%
filter (TAXONOMIC_NAME %in% MainSpecies) %>%
filter (Location != "Port Phillip Bay" & Location != "Port Phillip Heads" & Location != "Rocky Cape" & Location != "Derwent" & Location != "Derwent Estuary" & Location != "Victoria (other)" & Location != "Tasmania (other)") %>%
filter (( PHYLUM == "Chordata" & GENUS != "NA" & CLASS != "Ascidiacea") | CLASS == "Cephalopoda") %>% # try grouping on vertebrates only
filter(Method == 1) %>%
filter (year > 2005) %>%
group_by(Location, SurveyID, TAXONOMIC_NAME) %>%
summarise (abund = round(sum(Abundance),3)) %>%
group_by(Location, TAXONOMIC_NAME) %>%
summarise(abundMean = round(mean(abund), 3))
locPCAdata <- spread(LocForPCA, key = Location, value = abundMean, fill = 0)
locPCAdata1 <- matrix(unlist(locPCAdata),ncol=length(locPCAdata[,]),byrow=FALSE) #turn the list into matrix
locPCAdatat <- locPCAdata1[,-1]
storage.mode(locPCAdatat) <- "numeric"
rownames(locPCAdatat) <- locPCAdata1[,1]
colnames(locPCAdatat) <- colnames(locPCAdata[,-1])
locationPCA <-t(locPCAdatat) ## This is the main biological data for sites
locatPCAsp = prcomp(locationPCA[], scale. = TRUE, retx = TRUE) #conduct PCA on species abundance data only. We need to scale the abundance matrix because it is not correlation matrix and otherwise some common species have a huge effect
summary(locatPCAsp)
plot(locatPCAsp)
biplot(locatPCAsp, cex = c(0.6, 0.5), expand = 1.5, xlim = c(-0.8, 0.8), ylim = c(-0.8, 0.4)) # REMEMBER that the first 2 axes only explain 25% of total variation
plot(locatPCAsp$x[,1], locatPCAsp$x[,3])
locatPCAsp$rotation[,c(1,2)]
## alternatively try MDS on the location abundance data
#
#mdstest = metaMDS(locationPCA, k=2, trymax = 200) # this was first done with most data, but Flinders island affects the data too much
mdstest = metaMDS(locationPCA [-8,], k=2, trymax = 200) # exclude row 8 which is flinders island
ordiplot(mdstest,type="n")
orditorp(mdstest,display="sites",cex=1,air=0.1)
orditorp(mdstest,display="species",col="red",air=0.01)
stressplot(mdstest)
plot(mdstest)
## Or alternatively try k-means clustering on locationxspecies data. This is not a very intersting result, because just a few locations are separated and most others are grouped into 1. K-means clustering always separates Ninepin (internal and external) into its own group
clus <- kmeans(locationPCA, 5, nstart = 20)
locationClusters <- clus$cluster
# Summarise location data
locatSum <- rls_bas %>%
group_by(Location) %>%
summarise (latt = mean(SiteLat), long = mean(SiteLong), nsp = n_distinct(SpeciesID), year = n_distinct(year), dep_min = min(Depth), dep_max = max(Depth), visib = round(mean(Visibility, na.rm = TRUE),3))
## add cluster identity to the location datda
locationsWithGroups = merge(x = locatSum, y = locationClusters, by.x = "Location", by.y = "row.names", all.x = FALSE)
## so this clu
ggmap(tasmania, extent = "panel", legend = "bottomright") +
geom_point(aes(x = sitesWithGroups$long, y = -(sitesWithGroups$latt)), data = sitesWithGroups, size = 0.7, color = sitesWithGroups$y) +
scale_y_continuous(limits = c(-44.00, -37.00), expand = c(0, 0)) +
scale_x_continuous(limits = c(143.00, 150.00), expand = c(0, 0))
## some other code bits and pieces
#temp[, colSums(temp != 0) > 0]
#SelectVar[, colSums(SelectVar == 0) == 0]
```