.Rhistory

##### Match every 'wrong' genus / species name in nametest against the full
##### set of 'good' names and store what bestmatch() returns for that row.
# seq_len(nrow()) rather than 1:dim()[1]: on an empty data frame the latter
# expands to c(1, 0) and the loop would touch rows that do not exist.
for (i in seq_len(nrow(nametest))) {
  nametest$genusmatches[i] <- bestmatch(nametest$WrongGen[i], nametest$GoodGen)
  nametest$sppmatches[i] <- bestmatch(nametest$WrongSpp[i], nametest$GoodSpp)
}
nametest
# Package setup for the metagear session.
# The installer script has to be sourced before biocLite() exists, and
# EBImage has to be installed before the packages that are loaded after it
# (the recorded session called biocLite() first and loaded metagear before
# EBImage was available). Installation is skipped when EBImage is present.
# NOTE(review): the hard-coded lib.loc ("C:/Program Files/R/R-3.4.0/library")
# was dropped; library() searches .libPaths() by default - confirm metagear
# is installed in a library on that path.
library(tcltk)
library(tcltk2)
if (!requireNamespace("EBImage", quietly = TRUE)) {
  source("https://bioconductor.org/biocLite.R")
  biocLite("EBImage")
}
library(EBImage)
library(metagear)
# Interactive help look-ups (open documentation pages only).
?plot_PRISMA
?abstract_screener
library(metagear)
?PDFs_collect
?abstract_screener
?effort_initialize
# Load the example reference data set, preview it, and export a CSV copy.
data(example_references_metagear)
head(example_references_metagear)
write.csv(example_references_metagear,'~/gleon/Geisha/metagear_ex.csv')
# NOTE(review): no object called `lm` is created in the visible history, so
# this presumably summarises the stats::lm function itself - confirm.
summary(lm)
# Scratch arithmetic typed at the console.
.011/0.017
5/8
176+120+18
314/40
# Load saved R data objects from an absolute, machine-specific path and
# inspect mfg.csr in the data viewer (interactive only).
load("C:/Users/vpatil/Documents/gleon/Geisha/phyto_package/github/MFG_CSR.rda")
View(mfg.csr)
View(mfg.csr)
load("C:/Users/vpatil/Documents/gleon/Geisha/phyto_package/github/sppMFG.rda")
# NOTE(review): ipm.lsgo is not created anywhere in the visible history.
print(ipm.lsgo,digits=3)
# Scratch arithmetic: 2 %, 3 % and 4 % of 44693.
44693*.02
44693*.03
44693*.04
#------
# CSR validation
#------
library(ggplot2)
library(scales)
library(vegan)
# NOTE(review): rm(list = ls()) and setwd() are kept so the relative paths
# below resolve as in the recorded session; both wipe / change global state
# and should be removed if this is turned into a reusable script.
rm(list = ls())
setwd('~/gleon/Geisha/phyto_package/GEISHA_phytoplankton_github_shared/')
dat.m <- read.csv("madgwick_csr_data.csv", stringsAsFactors = FALSE)
source('species_mfg_convert02042018.r')
source('mfg_csr.r')

# Derived axes for the plot: SV = surface.area / volume, MSV = mld * SV.
dat.m$SV <- dat.m$surface.area / dat.m$volume
dat.m$MSV <- dat.m$mld * dat.m$SV

# Split the Species column into genus / species and attach the MFG.
dat.m <- genus.species.extract(dat.m, 'Species')
dat.m <- phyto.convert.df(dat.m)
# Manual MFG assignments for two names, spelled as they appear in the data.
dat.m$MFG[dat.m$Species == 'Osscilatoria'] <- '5a-FilaCyano'
dat.m$MFG[dat.m$Species == 'Elaktothrix'] <- '11b-GelaChlor'

# MFG.number = the part of the MFG label before the first '-' or '_'.
# vapply() pins the result to character even when dat.m has zero rows.
dat.m$MFG.number <- vapply(
  dat.m$MFG,
  function(x) strsplit(gsub('-', '_', x), split = '_')[[1]][1],
  character(1)
)
dat.m <- mfg.csr.convert.df(dat.m, 'MFG.number')
dat.m$CSR[dat.m$Species %in% c('Small non-flagellates', 'Small flagellates')] <- 'C'
dat.m

# which() drops rows whose CSR is NA; a bare logical index would return an
# all-NA row for each of them.
s.group <- dat.m[which(dat.m$CSR == 'S'), ]

# final plot
csr.plot <- ggplot(dat.m, aes(x = MSV, y = SV, color = CSR)) +
  geom_point(size = 3, shape = 19,
             position = position_jitter(width = 0.1, height = 0.1)) +
  theme(text = element_text(face = "bold", size = 20)) +
  scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(2, 1000)) +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x)),
                limits = c(0.01, 10)) +
  xlab("msv-1") + ylab("sv-1") +
  annotate("text", x = 10, y = 10, label = "C", fontface = 2, size = 10) +
  annotate("text", x = 1000, y = 10, label = "R", fontface = 2, size = 10) +
  annotate("text", x = 50, y = 0.02, label = "S", fontface = 2, size = 10)
# geom_text(aes(label = sh.name), hjust = 0, vjust = 0)
csr.plot
# Compare the MFG and MFG_new columns of MFG_in_out_mugg.csv.
# The recorded session repeated this analysis five times (overwriting the
# same output file each time); it is collapsed here into a single pass.
setwd('~/gleon/Geisha/phyto_package/mee ms/')
# Read text columns as character: `==` on two factor columns stops with
# "level sets of factors are different" when their levels are not identical.
mfg.mugg <- read.csv('MFG_in_out_mugg.csv', stringsAsFactors = FALSE)
head(mfg.mugg)

# Cross-tabulation of the two classifications, saved for the manuscript.
confusion <- table(mfg.mugg$MFG, mfg.mugg$MFG_new)
write.csv(confusion, 'mugg_mfg_confusion.csv')

# Agreement as counts and as a fraction of all rows (rows where either
# column is NA are not counted as TRUE or FALSE).
table(mfg.mugg$MFG == mfg.mugg$MFG_new)
table(mfg.mugg$MFG == mfg.mugg$MFG_new) / nrow(mfg.mugg)
table(is.na(mfg.mugg$MFG))
table(mfg.mugg$Cause.of.Difference)

# Scratch arithmetic recorded during the session.
3/163
16/19
2/19
1/19
16/21
###don't save- only written to create lake-specific temporary missing species lists for expert mfg classification filling
##there is something wrong with what you are doing- need to re-run the validation of nico's model versus trait-based classification and make sure you get the same correspondence that torsten described.
# First attempt: source the conversion scripts from a Desktop debug folder.
setwd('~/../Desktop/random files for mfg debug/')
source('species_mfg_convert02042018.r')
source('mfg_csr.r')
# Keep only library rows whose source column equals 'Nico'.
species.mfg.library=species.mfg.library[species.mfg.library$source=='Nico',] ###need to make sure you match this to the true original library, then you can send nico the library extension based on the best available size criteria.
source('~/../Desktop/random files for mfg debug/traits_to_mfg02042018.R')
# Read the trait table via a path relative to the QA/QC data folder.
setwd('~/gleon/Geisha/datasets/rawphytodata/phytodata_qaqc/')
mfg.traits.dbase<-read.csv('../../../phyto_package/github/orlane_fortraitstomfg.csv',stringsAsFactors = F)
# Exact repeat of the four statements above (re-run in the session).
species.mfg.library=species.mfg.library[species.mfg.library$source=='Nico',] ###need to make sure you match this to the true original library, then you can send nico the library extension based on the best available size criteria.
source('~/../Desktop/random files for mfg debug/traits_to_mfg02042018.R')
setwd('~/gleon/Geisha/datasets/rawphytodata/phytodata_qaqc/')
mfg.traits.dbase<-read.csv('../../../phyto_package/github/orlane_fortraitstomfg.csv',stringsAsFactors = F)
frederic.new=read.csv('~/../Desktop/random files for mfg debug/Appendix-1-Phytoplankton metrics database-revised.csv',stringsAsFactors = F)
# Second attempt: same steps run from the shared GitHub working copy. These
# later statements overwrite species.mfg.library (after re-sourcing),
# mfg.traits.dbase and frederic.new created above.
# NOTE(review): presumably the Desktop and shared copies of the scripts are
# the same versions - confirm before deleting the first attempt.
setwd('~/gleon/Geisha/phyto_package/GEISHA_phytoplankton_github_shared/')
source('species_mfg_convert02042018.r')
source('mfg_csr.r')
species.mfg.library=species.mfg.library[species.mfg.library$source=='Nico',] ###need to make sure you match this to the true original library, then you can send nico the library extension based on the best available size criteria.
source('traits_to_mfg02042018.R')
mfg.traits.dbase<-read.csv('../github/orlane_fortraitstomfg.csv',stringsAsFactors = F)
frederic.new=read.csv('../mee ms/Appendix-1-Phytoplankton metrics database-revised.csv',stringsAsFactors = F)
# Merge the three extra trait columns into frederic.new and reduce it to
# the columns used downstream.
# The recorded session also contained a subset() on mis-encoded column names
# ("...ÂµmÂ²"), which cannot match the real names, and several grep() calls
# with swapped / invalid arguments; those failing statements are removed.

# Rename the first column so both tables share the key 'Taxa.name'.
names(frederic.new)[1] <- 'Taxa.name'
mfg.traits.dbase <- subset(
  mfg.traits.dbase,
  select = c('Taxa.name', 'aerotopes', 'Gelatinous', 'Centric')
)
mfg.traits.dbase$phyto_name <- mfg.traits.dbase$Taxa.name
# Left join: every frederic.new row is kept; rows without a match get NA in
# the merged columns (including phyto_name).
frederic.new <- merge(frederic.new, mfg.traits.dbase, all.x = TRUE)

f.newcols <- c(
  "Class", "Order", "Mobility.apparatus", "Heterotrophic", "Mixotrophic",
  "Autotrophic", "Nano.microphytoplankton", "Filament", "Colonial",
  "Geometrical.shape.of.the.cell", "Gelatinous", "aerotopes", "Centric",
  "phyto_name",
  "Cell.surface.area.µm²",
  "Cell.biovolume.µm3",
  "Number.of.cells.per.colony",
  "Cumulated.surface.area.of.cells.in.a.colony.µm²",
  "Colony.length..takes.into.account.mucilage..µm",
  "Colony.width..takes.into.account.mucilage..µm",
  "Colony.thickness..takes.into.account.mucilage..µm",
  "Colony.surface.area.µm²..with.mucilage.",
  "Colony.biovolume.µm3..without.mucilage.",
  "Maximal.length.of.the.algal.object"
)

# Diagnostics: requested columns that are absent (should print
# character(0)), plus the actual names containing the literal text "Cell.".
names(frederic.new)
f.newcols[!f.newcols %in% names(frederic.new)]
grep('Cell.', names(frederic.new), value = TRUE, fixed = TRUE)

frederic.new <- subset(frederic.new, select = f.newcols)
# Drop rows that found no partner in the merge above.
frederic.new <- frederic.new[!is.na(frederic.new$phyto_name), ]
# 'large' when flagged colonial or when the maximal length exceeds 63,
# otherwise 'small' (NA when neither condition can be decided).
frederic.new$size <- ifelse(
  frederic.new$Colonial == 1 |
    frederic.new$Maximal.length.of.the.algal.object > 63,
  'large', 'small'
)
frederic.new
dim(frederic.new)
mfg.traits.dbase <- frederic.new
# Classify every taxon from its traits with traits.to.mfg(), one row at a
# time, and compare the result with the library-based MFG.
mfg.traits.dbase$trait.MFG <- ''
# seq_len(nrow()) instead of 1:dim()[1] so an empty table is skipped
# cleanly; the per-row print(i) debug output was dropped.
for (i in seq_len(nrow(mfg.traits.dbase))) {
  mfg.traits.dbase$trait.MFG[i] <- traits.to.mfg(
    flagella = mfg.traits.dbase$Mobility.apparatus[i],
    size = mfg.traits.dbase$size[i],
    colonial = mfg.traits.dbase$Colonial[i],
    filament = mfg.traits.dbase$Filament[i],
    centric = mfg.traits.dbase$Centric[i],
    gelatinous = mfg.traits.dbase$Gelatinous[i],
    aerotypes = mfg.traits.dbase$aerotopes[i],
    class = mfg.traits.dbase$Class[i],
    order = mfg.traits.dbase$Order[i]
  )
}
names(mfg.traits.dbase)

# Split phyto_name into genus / species, strip 'cf.' and de-duplicate.
mfg.traits.dbase <- genus.species.extract(mfg.traits.dbase, 'phyto_name')
# NOTE(review): the '.' in 'cf.' is a regex wildcard, so this removes "cf"
# plus any following character, not only a literal "cf." - confirm intent.
mfg.traits.dbase$species <- gsub('cf.', '', mfg.traits.dbase$species)
mfg.traits.dbase <- mfg.traits.dbase[!duplicated(mfg.traits.dbase), ]
mfg.traits.dbase <- phyto.convert.df(mfg.traits.dbase)

# Give both classifications the same level set so the table is square.
# NOTE(review): the levels come from MFG only, so a trait.MFG value that
# never occurs in MFG becomes NA and drops out of the table - confirm that
# this is intended.
mfg.levels <- unique(as.character(mfg.traits.dbase$MFG))
mfg.traits.dbase$MFG <- factor(mfg.traits.dbase$MFG, levels = mfg.levels)
mfg.traits.dbase$trait.MFG <- factor(mfg.traits.dbase$trait.MFG,
                                     levels = mfg.levels)

## validating trait and expert classifications
confusion.mat <- table(mfg.traits.dbase$MFG, mfg.traits.dbase$trait.MFG)
write.csv(confusion.mat, '~/../Desktop/random files for mfg debug/mfg_confusion_newlarge.csv')
confusion.mat
mfg.traits.dbase$MFG
View(mfg.traits.dbase)
## size cutoff modeling
# The recorded session rebuilt size.dat three times; this is the final
# version as a single pass, with three corrections:
#  * numeric coercion now happens BEFORE the NA back-fill, so entries that
#    only become NA through as.numeric() are back-filled too;
#  * columns whose names contain non-ASCII characters are addressed with
#    [[ "name" ]] because `$name` only parses in some locales;
#  * the response is a factor and the `family` argument is gone:
#    randomForest() has no such argument (it was silently ignored) and fits
#    a regression forest unless the response is a factor.

# Label each row 'L' or 'S' from the digits found in its MFG label.
# NOTE(review): the patterns are unanchored, so e.g. any label containing a
# "1" (including 10x / 11x) is first marked 'L' - confirm this is intended.
mfg.traits.dbase$large.MFG <- NA
mfg.traits.dbase$large.MFG[grep('1|6|8', mfg.traits.dbase$MFG)] <- 'L'
mfg.traits.dbase$large.MFG[grep('2|7|9', mfg.traits.dbase$MFG)] <- 'S'
# mfg.traits.dbase$diatom=0
# mfg.traits.dbase$diatom[grep(c('6|7'),mfg.traits.dbase$MFG)]=1
# mfg.traits.dbase$cyano=0
# mfg.traits.dbase$cyano[grep(c('4|5'),mfg.traits.dbase$MFG)]=1
# mfg.traits.dbase$flagellate=0
# mfg.traits.dbase$flagellate[grep(c('1|2|3'),mfg.traits.dbase$flagellate)]=1

# Predictor columns, selected by position in mfg.traits.dbase.
# NOTE(review): positional selection breaks if the column order changes.
size.vars <- names(mfg.traits.dbase)[c(7, 15, 16, 17, 9, 18, 22, 23, 24)]
size.dat <- mfg.traits.dbase[, size.vars]

# Coerce the three surface-area columns to numeric first.
for (area.col in c("Cell.surface.area.µm²",
                   "Cumulated.surface.area.of.cells.in.a.colony.µm²",
                   "Colony.surface.area.µm²..with.mucilage.")) {
  size.dat[[area.col]] <- as.numeric(size.dat[[area.col]])
}
sapply(size.dat, function(x) sum(is.na(x)))

# Where a colony-level value is missing, fall back to the cell-level one.
na.area <- is.na(size.dat[["Colony.surface.area.µm²..with.mucilage."]])
size.dat[["Colony.surface.area.µm²..with.mucilage."]][na.area] <-
  size.dat[["Cell.surface.area.µm²"]][na.area]
na.vol <- is.na(size.dat[["Colony.biovolume.µm3..without.mucilage."]])
size.dat[["Colony.biovolume.µm3..without.mucilage."]][na.vol] <-
  size.dat[["Cell.biovolume.µm3"]][na.vol]

# Recode to 0/1: response (1 = 'L'), nano/micro flag (0 = nano), and force
# Colonial to 1 whenever two or more cells per colony are recorded.
size.dat$large.MFG <- ifelse(mfg.traits.dbase$large.MFG == 'L', 1, 0)
size.dat$Nano.microphytoplankton <-
  ifelse(size.dat$Nano.microphytoplankton == 'Nanophytoplankton', 0, 1)
size.dat$Colonial[size.dat$Number.of.cells.per.colony >= 2] <- 1

dim(size.dat)
head(size.dat)
table(size.dat$large.MFG)

# Random forest on complete cases; factor response => classification.
library(randomForest)
size.dat <- na.omit(size.dat)
size.dat$large.MFG <- factor(size.dat$large.MFG)
size.rf <- randomForest(large.MFG ~ ., data = size.dat)
# Debugging session for the species -> MFG conversion: inspect the current
# columns, re-run the conversion, and print the function source.
mfg.traits.dbase$MFG
names(mfg.traits.dbase)
mfg.traits.dbase$genus
mfg.traits.dbase$species
mfg.traits.dbase<-phyto.convert.df(mfg.traits.dbase)
mfg.traits.dbase$MFG
# Typing a function name without parentheses prints its definition.
phyto.convert.df
# Re-source the conversion script (re-defines whatever it defines).
source('species_mfg_convert02042018.r')
head(mfg.traits.dbase)
tail(mfg.traits.dbase)
# Spot checks: look individual genera up in the library by hand.
genus='Woronichinia'
species='naegeliana'
species.mfg.library
grep('Woronichinia',species.mfg.library$genus)
dim(species.mfg.library)
mfg.traits.dbase$phyto_name
grep('Scenedesmus',species.mfg.library$genus)
genus='Scenedesmus'
species='smithii'
species.phyto.convert(genus,species)
phyto.convert.df
# Manual row-by-row conversion on a copy of the table, storing the result
# of species.phyto.convert() for each genus / species pair.
phyto.df=mfg.traits.dbase
# vector(length = n) starts as logical; it is coerced on first assignment.
mfgs<-vector(length=dim(phyto.df)[1])
# NOTE(review): 1:dim(x)[1] iterates over c(1, 0) when phyto.df has no rows.
for(i in 1:dim(phyto.df)[1])
{
mfgs[i]=species.phyto.convert(phyto.df$genus[i],phyto.df$species[i])
}
mfgs
phyto.convert.df
# Same loop re-run, this time storing the result in the copy.
mfgs<-vector(length=dim(phyto.df)[1])
for(i in 1:dim(phyto.df)[1])
{
mfgs[i]=species.phyto.convert(phyto.df$genus[i],phyto.df$species[i])
}
phyto.df$MFG=mfgs
#phyto.df$MFG.numb
phyto.df$MFG
# Compare with the column on the real table, then convert it for real.
mfg.traits.dbase$MFG
mfg.traits.dbase<-phyto.convert.df(mfg.traits.dbase)
mfg.traits.dbase$MFG
mfg.traits.dbase$trait.MFG
# Re-run the trait-based classification after the library conversion was
# repeated, then rebuild the confusion matrix.
mfg.traits.dbase$trait.MFG <- ''
# seq_len(nrow()) instead of 1:dim()[1] so an empty table is skipped
# cleanly; the per-row print(i) debug output was dropped.
for (i in seq_len(nrow(mfg.traits.dbase))) {
  mfg.traits.dbase$trait.MFG[i] <- traits.to.mfg(
    flagella = mfg.traits.dbase$Mobility.apparatus[i],
    size = mfg.traits.dbase$size[i],
    colonial = mfg.traits.dbase$Colonial[i],
    filament = mfg.traits.dbase$Filament[i],
    centric = mfg.traits.dbase$Centric[i],
    gelatinous = mfg.traits.dbase$Gelatinous[i],
    aerotypes = mfg.traits.dbase$aerotopes[i],
    class = mfg.traits.dbase$Class[i],
    order = mfg.traits.dbase$Order[i]
  )
}
mfg.traits.dbase$trait.MFG

# Shared level set (taken from MFG) so the cross-tabulation is square.
mfg.levels <- unique(as.character(mfg.traits.dbase$MFG))
mfg.traits.dbase$MFG <- factor(mfg.traits.dbase$MFG, levels = mfg.levels)
mfg.traits.dbase$trait.MFG <- factor(mfg.traits.dbase$trait.MFG,
                                     levels = mfg.levels)
confusion.mat <- table(mfg.traits.dbase$MFG, mfg.traits.dbase$trait.MFG)
confusion.mat
table(mfg.traits.dbase$MFG == mfg.traits.dbase$trait.MFG)
# Share of agreeing rows among rows where both classifications are present,
# computed from the data instead of the hand-typed 632/(632+190).
mean(mfg.traits.dbase$MFG == mfg.traits.dbase$trait.MFG, na.rm = TRUE)
# Rebuild the size data set from the re-converted MFG column and fit the
# random forest and conditional inference tree.
# The recorded session built size.dat once with a stale large.MFG column and
# then again after re-deriving it; only the second build is kept.
# Corrections:
#  * numeric coercion runs BEFORE the NA back-fill;
#  * non-ASCII column names are addressed with [[ "name" ]];
#  * the response is a factor and `family` is gone (randomForest() has no
#    such argument; a numeric 0/1 response gives a regression forest);
#  * library(party) is loaded before ctree() is called.

# mfg.traits.dbase<-phyto.convert.df(mfg.traits.dbase)
# NOTE(review): the digit patterns are unanchored (a "1" also matches
# 10x / 11x labels) - confirm this is intended.
mfg.traits.dbase$large.MFG <- NA
mfg.traits.dbase$large.MFG[grep('1|6|8', mfg.traits.dbase$MFG)] <- 'L'
mfg.traits.dbase$large.MFG[grep('2|7|9', mfg.traits.dbase$MFG)] <- 'S'
mfg.traits.dbase$large.MFG

# NOTE(review): positional selection breaks if the column order changes.
size.vars <- names(mfg.traits.dbase)[c(7, 15, 16, 17, 9, 18, 22, 23, 24)]
size.dat <- mfg.traits.dbase[, size.vars]

for (area.col in c("Cell.surface.area.µm²",
                   "Cumulated.surface.area.of.cells.in.a.colony.µm²",
                   "Colony.surface.area.µm²..with.mucilage.")) {
  size.dat[[area.col]] <- as.numeric(size.dat[[area.col]])
}
sapply(size.dat, function(x) sum(is.na(x)))

# Where a colony-level value is missing, fall back to the cell-level one.
na.area <- is.na(size.dat[["Colony.surface.area.µm²..with.mucilage."]])
size.dat[["Colony.surface.area.µm²..with.mucilage."]][na.area] <-
  size.dat[["Cell.surface.area.µm²"]][na.area]
na.vol <- is.na(size.dat[["Colony.biovolume.µm3..without.mucilage."]])
size.dat[["Colony.biovolume.µm3..without.mucilage."]][na.vol] <-
  size.dat[["Cell.biovolume.µm3"]][na.vol]

size.dat$large.MFG <- ifelse(mfg.traits.dbase$large.MFG == 'L', 1, 0)
size.dat$Nano.microphytoplankton <-
  ifelse(size.dat$Nano.microphytoplankton == 'Nanophytoplankton', 0, 1)
size.dat$Colonial[size.dat$Number.of.cells.per.colony >= 2] <- 1
head(size.dat)
size.dat

# Random forest on complete cases; factor response => classification.
library(randomForest)
size.dat <- na.omit(size.dat)
size.dat$large.MFG <- factor(size.dat$large.MFG)
size.rf <- randomForest(large.MFG ~ ., data = size.dat)
varImpPlot(size.rf)
# For a classification forest partialPlot() defaults to the first class;
# which.class = "1" makes the curves refer to the 'large' class.
partialPlot(size.rf, pred.data = size.dat, x.var = 'Colonial',
            which.class = "1")
mld.pp <- partialPlot(size.rf, pred.data = size.dat,
                      x.var = 'Maximal.length.of.the.algal.object',
                      which.class = "1")

# Conditional inference tree on the same data; the terminal node of each
# row is stored after fitting so it is not used as a predictor.
library(party)
size.ctree <- ctree(large.MFG ~ ., data = size.dat)
size.dat$ctree.node <- predict(size.ctree, size.dat, type = 'node')
plot(size.ctree)
?ctree
# Accuracy summaries for the trait-based vs library-based classification.
table(mfg.traits.dbase$MFG == mfg.traits.dbase$trait.MFG)
# Per-class agreement: diagonal divided by the row totals (prod) and by the
# column totals (users). Classes with a zero total give NaN and are left
# out of the means.
prod <- diag(confusion.mat) / rowSums(confusion.mat)
users <- diag(confusion.mat) / colSums(confusion.mat)
mean(prod, na.rm = TRUE)
mean(users, na.rm = TRUE)
# Overall agreement as proportions; prop.table() divides by the table's own
# total instead of the hand-typed sum(632, 190).
prop.table(table(mfg.traits.dbase$MFG == mfg.traits.dbase$trait.MFG))
# Build the list of taxa that have no library MFG but do have a trait-based
# one, export it, and load the expanded library.
# Removed from the recorded session: two save() calls without `file =`
# (save() treats unnamed arguments as object names, so both fail) and a
# subset() that selected the MFG column, which is all NA for these rows.
source('species_mfg_convert.r')
?pvclust
library(pvclust)

table(is.na(mfg.traits.dbase$MFG))
# Scratch arithmetic recorded during the session.
239/(901+239)
239-188
51/1140

new.mfg.library <- mfg.traits.dbase[
  is.na(mfg.traits.dbase$MFG) & !is.na(mfg.traits.dbase$trait.MFG),
]
new.mfg.library <- subset(new.mfg.library,
                          select = c('genus', 'species', 'trait.MFG'))
head(new.mfg.library)
write.csv(new.mfg.library, '../mee ms/newsize_mfg_library_add.csv')
write.csv(species.mfg.library, '../mee ms/old_mfg_library.csv')

# Read the expanded library, store it as an .rda file and make it the
# active library for the rest of the session.
newspecies.mfg.library <- read.csv('../mee ms/expanded_mfg_library.csv')
save(newspecies.mfg.library, file = 'sppMFG_new.rda')
species.mfg.library <- newspecies.mfg.library
# Ordination and clustering of the trait columns.
# Removed from the recorded session because they cannot run as written:
#  * ifelse(mfg.traits.dbase == 'large', 1, 0) compared the whole data frame
#    (not the size column), and its result was discarded by the re-subset;
#  * the first metaMDS() call included the character columns Class / Order;
#  * plot(Test2) (object is test2), and 1:dim(x) without [1];
#  * pvclust(traitdist), pvclust(..., method = 'gower'),
#    method.dist = 'gower' and method.hclust = 'gower' - pvclust() expects a
#    data matrix, has no `method` argument, and 'gower' is not one of its
#    named distance or linkage methods; consequently test3 never existed and
#    plot(test3, ...) is dropped as well.
names(mfg.traits.dbase)
?metaMDS
mfg.traits.forord <- subset(
  mfg.traits.dbase,
  select = c('Heterotrophic', 'Mixotrophic', 'Autotrophic', 'Filament',
             'Colonial', 'Gelatinous', 'aerotopes', 'Centric')
)
# Missing trait values are treated as 0.
mfg.traits.forord[is.na(mfg.traits.forord)] <- 0
str(mfg.traits.forord)
mfg.traits.forord

mfg.ord <- metaMDS(mfg.traits.forord)
plot(mfg.ord)

# pvclust() with its defaults clusters the columns (the eight traits).
test <- pvclust(mfg.traits.forord)
plot(test)

# Hierarchical clustering of the rows on Gower distance.
?vegdist
traitdist <- vegdist(mfg.traits.forord, method = 'gower')
test2 <- hclust(traitdist)
plot(test2)
?pvclust
?hclust

head(mfg.traits.forord)
# Unique row names: running index + taxon name.
row.names(mfg.traits.forord) <- paste(seq_len(nrow(mfg.traits.forord)),
                                      mfg.traits.dbase$phyto_name, sep = '_')

# Dendrogram labelled with the library-based MFG of each row.
plot(test2, labels = mfg.traits.dbase$MFG)
plot(test2, labels = mfg.traits.dbase$MFG, cex = .3)
table(is.na(mfg.traits.dbase$MFG))
mfg.csr
write.csv(mfg.csr, '../mee ms/mfg_csr_library.csv')
# First pass at the Lake Geneva comparison. Everything created here is wiped
# by the rm(list=ls()) that opens the example script following this block.
setwd('~/gleon/Geisha/phyto_package/GEISHA_phytoplankton_github_shared/')
# Initial read of the '_original' file; replaced by the '_gelFix' file below.
orlane<-read.csv('orlaneTraitData_MFGcompare_original.csv',stringsAsFactors=F)
# The gelatinous column has been flipped so that 1 means gelatinous and 0 means no mucilage.
# NOTE(review): presumably this describes the '_gelFix' file, not the '_original' one read above - confirm.
# Names of the columns in orlane's dataset that correspond, in order, to the arguments of traits.to.mfg.
orlane.trait.names=c('Motile','size','Colonial','Filament','Centric','Gelatinous','aerotopes','Class','Order')
orlane<-read.csv('orlaneTraitData_MFGcompare_gelFix.csv',stringsAsFactors=F)
orlane<-read.csv('orlaneTraitData_MFGcompare_gelFix.csv',stringsAsFactors=F)
# The gelatinous column has been flipped so that 1 means gelatinous and 0 means no mucilage.
# Names of the columns in orlane's dataset that correspond, in order, to the arguments of traits.to.mfg.
orlane.trait.names=c('Motile','size','Colonial','Filament','Centric','Gelatinous','aerotopes','Class','Order')
###########################################################
# Load the scripts.
source('species_mfg_convert02042018.r')
source('mfg_csr.r')
source('mfg_to_traits.r')
source('traits_to_mfg02042018.r')
# Re-source mfg_to_traits.r by absolute path, echoing each statement (run twice in the session).
source('~/gleon/Geisha/phyto_package/GEISHA_phytoplankton_github_shared/mfg_to_traits.r', echo=TRUE)
source('~/gleon/Geisha/phyto_package/GEISHA_phytoplankton_github_shared/mfg_to_traits.r', echo=TRUE)
#### Example script demonstrating how to classify MFG and CSR using the
#### R scripts for the Geisha R package
#### 2/4/2018
# NOTE(review): rm(list = ls()) and setwd() are kept so the relative paths
# below resolve as in the recorded session; both change global state and
# should be removed before this is shared as a reusable script.
rm(list = ls())

## first set the working directory
setwd('~/gleon/Geisha/phyto_package/GEISHA_phytoplankton_github_shared/')

## the directory should contain the following scripts:
##   species_mfg_convert.r
##   MFG_to_CSR.r
##   traits_to_mfg.r
##   mfg_to_traits.r
## you will also need the following R data objects:
##   sppMFG.rda
##   MFG_CSR.rda
## current versions of these scripts are labelled with date stamps, so the
## file names sourced below differ from the generic names listed here.

#####################################################################################
# load geneva dataset with species names and trait matrix
orlane <- read.csv('orlaneTraitData_MFGcompare_gelFix.csv',
                   stringsAsFactors = FALSE)
# the gelatinous column has been flipped so that 1 means gelatinous and
# 0 means no mucilage.
## names of the columns in orlane's dataset that correspond, in order, to
## the arguments of the traits.to.mfg function
orlane.trait.names <- c('Motile', 'size', 'Colonial', 'Filament', 'Centric',
                        'Gelatinous', 'aerotopes', 'Class', 'Order')

###########################################################
# load the scripts
source('species_mfg_convert02042018.r')
source('mfg_csr.r')
source('mfg_to_traits.r')
source('traits_to_mfg02042018.r')
load('MFG_CSR.rda')
load('sppMFG.rda')

#############################################################
## first, split the Taxa.name column into a genus and species
orlane <- genus.species.extract(orlane, 'Taxa.name')

## now create MFG classification based on Nico's expert assessment
orlane <- phyto.convert.df(orlane)

## making second MFG column based on traits
## it will be called mfg.from.traits
orlane <- trait.to.mfg.df(orlane, orlane.trait.names)

## making MFG number column associated with MFG.from.Nico: the part of the
## label before the first '-'. vapply() guarantees a character vector even
## for an empty table; as.character() keeps strsplit() from failing on
## non-character input.
orlane$MFG.num <- vapply(
  orlane$MFG,
  function(x) strsplit(as.character(x), split = '-')[[1]][1],
  character(1)
)

## creating expert CSR classifications from Nico MFG classifications
orlane <- mfg.csr.convert.df(orlane, 'MFG.num')

# comparisons
## missing classifications
dim(orlane)[1]
sum(is.na(orlane$mfg.from.traits))
sum(is.na(orlane$MFG))
table(orlane$mfg.from.traits == orlane$MFG)
### only about 60% agreement currently for lake geneva data.

### fraction caused by discrepancies in size
# Row indices whose MFG label contains a capital 'L' / the text 'Small',
# and whose size trait contains 'large' / 'small'.
# (This block appeared twice in the recorded session; one copy is kept.)
MFG.large <- grep('L', orlane$MFG, ignore.case = FALSE)
traits.large <- grep('large', orlane$size)
MFG.small <- grep('Small', orlane$MFG)
traits.small <- grep('small', orlane$size)
## number of large MFG's listed as small in frederic's dataset
length(intersect(MFG.large, traits.small))
## number of small MFG's listed as large in frederic's dataset
length(intersect(MFG.small, traits.large))
# so 254 out of 371 mismatches between trait-based and expert
# classifications were associated with size.

# colonial mfgs
MFG.colonial <- orlane$MFG.num %in% c('5a', '5b', '5c', '5d', '5e', '6a1',
                                      '6b1', '10a', '10b', '10c', '11a',
                                      '11b', '11c')
MFG.mismatch <- orlane$MFG != orlane$mfg.from.traits
# species listed as unicellular in trait data
mfg.traits.unicell <- orlane$Colonial == 0
# na.rm = TRUE: a single NA in Colonial would otherwise turn the count into
# NA (the gelatinous count below already discards NAs).
sum(MFG.colonial & mfg.traits.unicell, na.rm = TRUE)
## 21 species listed as unicellular that nico classified as colonial

## discrepancies based on presence of mucilage
sum(orlane$MFG == '11b-GelaChlor' & orlane$Gelatinous == 0, na.rm = TRUE)
## 52 mismatches due to gelatinous

head(orlane) ### view dataset with new columns