adw96 · adw96 · Jul 28, 2018 · Jul 28, 2018 · Jul 28, 2018
diff --git a/r/.DS_Store b/r/.DS_Store
diff --git a/r/Updated Tutorials 2018/additionalcode.R b/r/Updated Tutorials 2018/additionalcode.R
@@ -0,0 +1,229 @@
+###############################################
+# 
+# additional R-code from previous R-tutorials B & C
+#
+###############################################
+
+##########################################
+#
+#  Introduction to R 
+#  Option B: For users with some experience with R
+#  A tutorial for STAMPS 2017
+# 
+#########################################
+#
+#  This tutorial is written by Amy Willis, July 2017
+#  Contributions also made by [please feel free to contribute!]
+#
+#########################################
+#
+# If you downloaded this from the website, 
+# be sure to rename as "STAMPS_Intro2R_OptionB.R", 
+# R and R Studio will not interpret is as an R script with ".txt" suffix
+
+#########################################
+#
+# Disclaimer: The level of hand-holding in this tutorial will be
+# less than in Option A. There will be some expectation on you to
+# attempt to solve your issues. By all means request help from the TAs,
+# but in order to obtain mastery of any language it is important to practice 
+# correcting yourself, and now is a great opportunity to practice :)
+# 
+#########################################
+
+
+###############################################
+# 
+# Get some context 
+#
+###############################################
+# Set your working directory (refer to Option A if you're not sure how)
+setwd("~/Documents/MyProjectDirectory") 
+
+# If you already worked through Option A, feel free to skip until the next section
+
+# In Option A, we loaded in some covariate and OTU data and briefly
+# made some plots and looked at some summaries
+covariates <- read.table("FWS_covariates.txt", header=TRUE, sep="\t", as.is=TRUE)
+abundances <- read.table("FWS_OTUs.txt", header=TRUE, row.names = 1, sep="\t", as.is=TRUE)
+
+# take a moment to have a look at the data here
+
+###############################################
+# 
+# ggplot
+#
+###############################################
+
+# Last night you learned how to install a package from CRAN
+# We're now going to use the package ggplot2
+
+# If this doesn't work for you, look at the Installing R website
+library(ggplot2)
+
+# check that the following produces a plot
+ggplot(data.frame("x"=c(1,2,3), "y"=c(4,4.6, 5)))+geom_point(aes(x,y))
+# and don't worry about what this means for now
+
+# base graphics in R, such as plot() and boxplot(), are not very pretty
+# enter: ggplot! (everyone calls the package ggplot2 "ggplot") 
+
+# ggplot makes simple things difficult, difficult things simple, and
+# everything beautiful
+
+# ggplot has a different structure to base graphics
+# here is how you make a ggplot
+abundances <- data.frame(abundances)
+ggplot(abundances) 
+
+# nothing happened! why?
+# we need to add components, such as a scatterplot or a histogram, 
+# to build up a plot
+ggplot(abundances, aes(x = JPA_Jan)) +
+  geom_histogram()
+
+# this says that "abundances" is the data frame that we want to plot, 
+# JPA_Jan is the only variable that we care about
+# and that we want a histogram
+
+# obviously there are lots of taxa that were not observed in this sample, hence the zeros
+# let's restrict our attention to taxa that we did see
+# Subsetting 2 different variables is done like this
+ggplot(subset(abundances, JPA_Jan > 0), aes(JPA_Jan)) + 
+  geom_histogram()
+
+# the first argument is the data frame of interest
+# aes() specifies which column we want
+# and the geom_histogram() says that we want a histogram
+
+# The range of this histogram is huge, so we want to focus on the distribution of
+# OTUs that were observed once or more but less than 60 times, we would do
+ggplot(subset(abundances, JPA_Jan > 0 & JPA_Jan < 60), aes(JPA_Jan)) + 
+  geom_histogram()
+
+# a scatterplot shows how correlated the abundances are between january and february
+ggplot(subset(abundances, JPA_Jan > 0 & JPA_Feb > 0), 
+       aes(JPA_Jan, JPA_Feb)) + 
+  geom_point()
+# again, ggplot() specifies what we want plotted
+# geom_point() specifies that we want a scatterplot
+
+ggplot(subset(abundances, JPA_Jan > 0 & JPA_Feb > 0), 
+       aes(JPA_Jan, JPA_Feb)) + 
+  geom_point() + 
+  theme_classic() # changes the styling
+
+ggplot(abundances, aes(JPA_Jan, JPA_Feb)) + 
+  geom_point() + 
+  labs(title="My first scatterplot") + # adds a title
+  xlab("January OTU counts") + # adds labels 
+  ylab("February OTU counts")
+
+# one tedious thing about ggplot is that all of the information that
+# you want to plot must be in the same data frame
+# So incorporating Season (from the data frame "covariates") 
+# to colour the points our scatterplot is not trivial
+
+# We need to create a new data frame that contains all this information
+# together... 
+
+# Exercise: Use the internet to learn how to change the colour
+# of the points. Create a new data frame that contains a
+# logical [TRUE/FALSE] variable indicating if the January abundance
+# is higher than 4000. Use this variable to differentially colour the single
+# observation that fits this category
+
+# Don't look at the solution until you've given it a good attempt 
+# (this is equally good practice for troubleshooting your problems)
+# but the solution is available in STAMPS_Intro2R_OptionB_solutions.R
+
+
+##########################################
+#
+#  Introduction to R
+#  Option C: For users with a lot of experience with R
+#  A tutorial for STAMPS 2017
+#
+#########################################
+#
+#  This tutorial was written in July 2017
+#  A collaborative effort by 
+#  Julia Fukuyama, Lan Huong Nguyen, 
+#  Kris Sankaran, and Amy Willis
+# 
+#  [please feel free to contribute!]
+#
+#########################################
+#
+# Disclaimer: TA support is primarily for Options A and B.
+# Please make every effort to resolve any difficulties that you encounter
+# on your own. That said, we hope that you learn something new from this
+# tutorial!
+#
+#########################################
+# Set your working directory and load data
+setwd("~/Documents/MyProjectDirectory")
+covariates <- read.table("FWS_covariates.txt", header=TRUE, sep="\t", as.is=TRUE)
+abundances <- read.table("FWS_OTUs.txt", header=TRUE, row.names = 1, sep="\t", as.is=TRUE)
+
+# take a moment to have a look at the data here
+
+###############################################
+#
+# more ggplot, reshape, cast, melt
+# By Kris Sankaran
+#
+###############################################
+library(ggplot2)
+library(reshape2) # melt and cast
+library(dplyr) # filter, group_by, and mutate
+
+# let's try to plot the abundance of the dominant 2
+# taxa against each other, and colour the points
+# based on Season
+
+# We need to reshape the data so that what will go
+# in the x and y for the scatterplot are their own
+# columns. We need to *melt* the data. This takes it
+# from "wide" (one sample per column) to tall
+# (different samples stacked on top of one another).
+
+# First let's retrieve the taxonomic information,
+# using strsplit to split the rownames according to
+# the ";" character
+taxa <- strsplit(rownames(abundances), split = ";")
+taxa <- do.call(rbind, taxa) # bind the list into a matrix
+colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")
+
+# We can use the %>% operation so that we don't need
+# to keep track of temporary variables. Melt converts
+# the data from wide to tall
+melted_abundances <- data.frame(taxa, abundances) %>%
+  melt(
+    id.vars = colnames(taxa),
+    variable.name = "SampleName",
+    value.name = "abundance"
+  )
+
+# We can incorporate the season information using
+# left_join, and get the taxa abundances using
+# group_by_at and summarise
+melted_abundances <- melted_abundances %>%
+  group_by_at(colnames(taxa)) %>%
+  mutate(total = sum(abundance)) %>%
+  left_join(covariates) %>%
+  ungroup()
+
+# To get the most abundant taxa, we filter down
+# to the top two totals. Then, we cast the data.frame
+# so that there is one column for what will become
+# each axis in the plot
+top_abundances <- melted_abundances %>%
+  filter(total >= sort(rowSums(abundances), decreasing = TRUE)[2]) %>%
+  dcast(SampleName + Season ~ Genus, value.var = "abundance")
+
+# Now we can make the plot! Notice that most of
+# the work was in defining data with the appropriate
+# input format
+ggplot(top_abundances) +
+  geom_point(aes(x = Burkholderia, y = Ralstonia, col = Season))
diff --git a/r/Updated Tutorials 2018/apply.R b/r/Updated Tutorials 2018/apply.R
@@ -0,0 +1,34 @@
+###############################################
+# 
+# The apply family
+#
+###############################################
+
+# Unfortunately, for loops, while intuitive and broadly useful, 
+# are relatively slow in R
+
+# apply() is a much better way of doing this!
+
+fast_total_reads <- apply(abundances, 2, sum)
+# what does the second argument 2 do? Compare to second argument of 1
+# Find out with ?apply
+
+# sweep() is related to apply()
+sweep(abundances, 2, fast_total_reads, "/")
+# "go down the columns of abundances, applying "/" (division) to each column,
+# with the second argument of the division being the element of
+# fast_total_reads corresponding to the column number of abundances
+all(sweep(abundances, 2, fast_total_reads, "/") == relative_abundances)
+
+
+# careful! dividing a matrix by a vector divides across the rows, 
+# not the columns!
+all(abundances/fast_total_reads == relative_abundances)
+
+# mapply(), tapply(), lapply() are all variations on apply for different 
+# situations
+
+# In writing this tutorial, I just learnt about prop.table,
+# which turns a matrix into its relative abundances across a fixed margin.
+# Cool!
+all(prop.table(as.matrix(abundances), margin = 2) == relative_abundances)
diff --git a/r/Updated Tutorials 2018/functions.R b/r/Updated Tutorials 2018/functions.R
@@ -0,0 +1,37 @@
+###############################################
+# 
+# Function writing
+#
+###############################################
+
+# writing functions is easy in R!
+my_first_function <- function() {
+  cat("Hello World!")
+}
+my_first_function()
+# functions can have no arguments, or several arguments:
+my_second_function <- function(times) {
+  counter <- 1
+  while (counter <= times) { # this is a different type of loop
+    cat("Hello World!\n")
+    counter <- counter + 1
+  }
+}
+my_second_function(3)
+
+# If you are dealing with one OTU table, turning it into
+# relative abundance table is pretty easy. But what if you have 
+# to do it for many OTU tables?
+
+# Exercise: Write a function that takes in an OTU table
+# and returns the relative abundance table
+
+# Note that you can construct functions inline:
+all(apply(abundances, 2, function(x) x/sum(x)) == relative_abundances)
+
+# Save your work!
+
+# Congratulations! You are at a level with your R understanding
+# that will get you through most of STAMPS! If you feel so inclined, 
+# have a look through source_system.R, parallel.R, and markdown.R, but at this point, you probably deserve
+# a cold beverage and a break!
diff --git a/r/Updated Tutorials 2018/lists.R b/r/Updated Tutorials 2018/lists.R
@@ -0,0 +1,46 @@
+###############################################
+# 
+# Lists
+#
+###############################################
+
+# Lists are great! Lists allow you to combine many different types of data
+# of many different sizes into groups
+
+# We're going to make a list corresponding to all of the information
+# about the first sample
+
+sample1 <- list()
+sample1$name <- covariates$SampleName[1]
+sample1$season <- covariates$Season[1]
+sample1$place <- covariates$Location[1]
+sample1
+sample1$counts <- abundances[,1]
+
+# to see all the information in a list, go
+sample1
+# and to see a particular element of the list, 
+# place the name of that element after the sign $
+sample1$name
+
+# you can also make a list of lists!
+all_samples <- list()
+all_samples$sample1 <- sample1
+all_samples$sample2 <- list()
+all_samples$sample2$name <- covariates$SampleName[2]
+all_samples
+
+# Exercise: Use a loop to loop through all of the samples, 
+# creating a list of the information corresponding to that sample.
+# (name, location, month, relative abundance table, abundance table)
+
+# Hint:
+all_samples[[1]]
+all_samples[[2]]
+# Double brackets can also be used to refer to the elements of a list, 
+# instead of naming them individually
+names(all_samples)
+
+
+# Bonus: do this with a apply loop (maybe after finishing the function
+# writing section)
diff --git a/r/Updated Tutorials 2018/loops.R b/r/Updated Tutorials 2018/loops.R
@@ -0,0 +1,33 @@
+###############################################
+# 
+# loops
+#
+###############################################
+
+# R is a vectorised language, so for a function that takes 1 argument, 
+# you can apply across a vector
+abundances[, 1] + 5
+exp(abundances[, 1])
+
+# what about for more complex functions?
+
+# In Option A we calculated the relative abundance of the taxa
+# using scale()
+# This was a good way to do it, but are we going to try to do the
+# same thing with a loop
+
+relative_abundances <- abundances # create a matrix of the same size as abundances
+for (current_index in 1:(dim(abundances)[2])) { # cycling through the columns...
+  number_reads <- sum(abundances[, current_index]) # find the column total
+  relative_abundance <- abundances[, current_index]/number_reads # use it to find relative abundance
+  relative_abundances[, current_index] <- relative_abundance # save the current column into our new matrix
+}
+
+# we can track that this matches our original calculation with
+all(relative_abundances == scale(abundances, center = F, scale = colSums(abundances)))
+
+# what does all() do?
+?all
+
+# Exercise: use a for loop to find which genus was observed most frequently
+# when combining all samples