From c73fee4a2e2799a118be42f5266c74076b111de8 Mon Sep 17 00:00:00 2001 From: Hobbeist Date: Wed, 22 May 2019 22:50:52 +0800 Subject: [PATCH] #3 added a tutorial R script to R and worked a bit on the presentation --- .DS_Store | Bin 8196 -> 8196 bytes R/03_analysis_modelling.R | 216 +++++++++++++++++++++ vignettes/.DS_Store | Bin 0 -> 6148 bytes vignettes/03_analysis_modelling.Rmd | 56 +++--- vignettes/assets/.DS_Store | Bin 0 -> 6148 bytes vignettes/assets/header-includes/.DS_Store | Bin 0 -> 6148 bytes 6 files changed, 243 insertions(+), 29 deletions(-) create mode 100644 R/03_analysis_modelling.R create mode 100644 vignettes/.DS_Store create mode 100644 vignettes/assets/.DS_Store create mode 100644 vignettes/assets/header-includes/.DS_Store diff --git a/.DS_Store b/.DS_Store index 6eb70d63ab1a5557c26d77f597bc494625a0468c..804a4d8c73f4ed0178fc63389214f0afdd340044 100644 GIT binary patch literal 8196 zcmeHM%Wl&^6ur|J>I5OOKx&q=Vdq6$9%&Ypth6k;Lm0sVP;1LkYbmv>*l9%FAb$q* zi{K0R9nL(cg41jPg*(#RGmm?1os*k!u0up(G)sJ<9uc|d3cC+r?r3UX^_8||S2{p} zc#7x6MLfx4%)hUA^8zct3a|pK04u->{0j=;nXQ_;=DBZn*|Gwxz<;TLIv*su!oXr} zP#+yIx�^Sk?{Wt{MQP#lT{15IbnXgaS>duqB2t;aHc}FR)k}G~py{@geM;g{@G8 zzB{fjwL6KxAX`>|6&cn{f@GPH) zWmL|JQ8az8X0M|1EVkn}c|N%vNAcTqJU{L}KFG5&O0)4)2dBvxE+5{dS(493`7BEc zeVx#3xNh5RA9s6Ip9E@%gLaB<#6B zj~X0rmI+>l(`=Gwr&v?|4yBY)LFMKWw*(u+TXc?3Jlj;-mR^Ijpw~-;zy0k5->ItI z;n0)FY2^1{(D&qvp}&&fUrau)&2N?eVAkN;rc31ZDYAQvFLnuAY$*3Tm@aF<&wrCY zY*~S(0xh%9Q}zGL_3!_iVLWFASb+^vKsblt;Ru9&{f2IGQ?<54KS5Vb`PBw>2+VIU f0x-Y*!w_Q$R}~XjtPNrXjb8+48Q8D_e^h~=nPFOv literal 8196 zcmeHMTWs7!6umc()^<|jgp|ag+bm53X`8g02YHmVY+kf!Q&O>;XVWC!wMi@-d%N~- z(xyQ{6;B}$4~dT!feL@EgoK0y6`wpR6(Lnbyd?O5A3hNXB);H|$BEiJKHvkW>XByd zc<#O9xo3_&d%XaFof)kjpd0`Ms!YlS)T~pOp4Bxa#~;ZfiR2I9f(bfkiCHJL#yS`X z5C{+m5C{+m5C{sg=RVg4ZGb?4z-@^Ddq0G!G8y&dBA4E)gBmXdAjwhy zuTY=z0M94v%cw6Gx%6CVPM$rW?}~nkf!v++VP;M;>dQqgxjP_t2lVfZeujeH-N`TP znFB_-4B7yJ0D-j#u*XM$0k9}{H*U<|6Q-RY-L*u@G#ob)`3iG#^YS-s7FgLLoQzG| zvoTk7Gft7K#uXwf6LQ7;e0`+3u{PY&+PF{~p096eYN!o2*EcULh(cL){ej-enbT+H&n`TE zo=1oN8v=?}MSEerP)#S>#Y4?`Z8_AhmqQh$0;Tte)HflQWGRCFlxx_QGpMH>k`r=~ zLs9o8ZR>biXZoR}ZDwXHXF@Jcs%A1{s;=H)nla;y?hL#7oXdKnj-7V9Ji`fD&N%ub zI@uXny_>9^VS}b)W-^MG3b%%K+`A`o|NaNNm$uwdBugby=}^{ijD)EV%o>_IJ)$~E z-O>zeYLq~;T;owgPm9I5(_-9GXY{4J%F1P>LfN*FmmpUO!ZCUlbzwMcD(d{i9Day0p1yL3<}##(VWY0u11JPsf3P`U*9sxZ#(-KRQke@eIL z_Fi0+gTEq9geO(ElIamk&fTenx$)HSrUcdTsZsvW$M zOdfwYK|JCt6T{oao7qVFu2kGDg$9UU`rPU#VVUdlX2v9eu|%m1he9%=5Wn16O_w>u za(=mMcSw?RiNm62ymy}zlJkkKT-(?rNpc?XnQL3yNV|!c%e4m%k`~w#$JJtWND{=q z&%%pv39i5=#J}tCC43J*!q4yrF)tsBu#9-O4MVsc@55cV2Wznoo3WJ`_yBRR2M=K{ z9w8paaS(@b1V=H6I" into +#' the console, for example: ?read_csv + + +#### Import libraries ---- +library(biometrics) +library(ggplot2) +library(gridExtra) +library(grid) +library(tidyverse) +library(ggpubr) +library(ggfortify) #autoplot function, see below in diagnostics +theme_set(theme_classic()) + +#### Read in data ---- +# Read in a CSV files + +demoData <- read_csv("data/demo.csv") + + +#### Summary of the data + +summary(demoData) + + +#### Getting to kow the distribution of variables ---- +#' Here we first focus on the day1 - day3 variables +#' Boxplot the data to look for outlier + +demoData %>% + gather(Days, measurement, day1:day3, factor_key=TRUE) %>% + ggplot(aes(y=measurement,x=Days, fill=Days)) + + labs(title = "Days: 1 to 3 with outlier", x = "", y = "Measurment") + + geom_boxplot() + + scale_color_telethonkids("light") + + theme_minimal() + +#' Q: What do you see in this exercise? how do you handle the observation? + +#' A: We need to exclude this definite outlier. It is so far away from any other data point, that it +#' is very likely a measurment error rather than a misclassification + + +#### Remove the outlier: ---- +#' here we can see that no value is above 50, so we can use dplyr +#' to filter the data set for values that are < 50 days and store the results in +#' a new data frame, so we don't override the full data set. After that, we plot +#' the data in a boxplot again + +no_out <- demoData %>% + filter(day2 < 100) + + +no_out %>% + gather(Days, measurement, day1:day3, factor_key=TRUE) %>% + ggplot(aes(y=measurement,x=Days, fill=Days)) + + labs(title = "Days: 1 to 3 outlier removed", x = "", y = "Measurment") + + geom_boxplot() + + scale_color_telethonkids("light") + + theme_minimal() + + + + +#'Now we use the quantile-quantile plot (QQ plot) of the variables to check for normal distribution of the data +no_out %>% # Select the demo data and utilise the pipe + gather(Days, measurement, day1:day3, factor_key=TRUE) %>% + ggplot(aes(sample=measurement)) + + stat_qq() + + stat_qq_line() + # This ads a line to the plot to better judge the deviation + theme_minimal() + + facet_wrap(~Days, nrow = 1) + +#' We want the plots to be aproximately on one line in order to show some normality + +#### Let's plot the three day measurements against each other to see if there is a tendency of +#' association. ---- + +plot_day1_day2 <- + no_out %>% + ggplot(aes(x=day1, y=day2)) + + labs(title = "Day 1 and Day 2", x = "day 1", y = "day 2") + + geom_point(size = 4) + + geom_smooth(method='lm')+ + scale_color_telethonkids("light") + + theme_minimal() + +plot_day2_day3 <- + no_out %>% + ggplot(aes(x=day2, y=day3)) + + labs(title = "Day 2 and Day 3", x = "day 2", y = "day 3") + + geom_point(size = 4) + + geom_smooth(method='lm')+ + scale_color_telethonkids("light") + + theme_minimal() + +plot_day1_day3 <- + no_out %>% + ggplot(aes(x=day1, y=day3)) + + labs(title = "Day 1 and Day 3", x = "day 1", y = "day 3") + + geom_point(size = 4) + + geom_smooth(method='lm')+ + scale_color_telethonkids("light") + + theme_minimal() + + + +grid.arrange(plot_day1_day2, plot_day2_day3, plot_day1_day3, nrow=2, ncol=2) + +###Linear regression model ---- +#' In the plots, we can see that day 2 and day 3 seem to be correlated. Let's perform a linear model +#' and see if we can find a significant association + +lm(day3 ~ day2, data=no_out) + +#' This function alone does not tell you anything about significane. We need to use the wrapper function "summary" +#' to get more information about the model. Therefore, we store the model in a variable: + +model_da2_day3 <- lm(day3 ~ day2, data=no_out) + +#' Once we have done this, we can use the model name and query it for more information +#' Let's use teh summary() function to get the p-value and R and R2 values, as well as +#' other relevant model statistics + +summary(model_da2_day3) + +#' What do you see in the R output? + + + +#### Model Diagnostic plots ---- +#' With base R it is very easy to look at the model diagnostic plots. +#' just wrap the function plot() around your variable that you created to store the model + +plot(model_da2_day3) + +#' R will prompt you now to hit return to go through all the plots. Anicer way, in line with the tidy +#' way of coding, is by simpply wrapping the "autoplot()" function around the model. + +autoplot(model_da2_day3) + +#' Here you get all the plots on one page and it looks nice and tidy if you want to include it +#' in a publication. + + + +#### Multiple Linear regression ---- +#' The function for the multiple linear regression is the same as for the simple linear regression. +#' It is "lm()". But now we add as many variables as we want with "+" on the right side of the +#' equation: "lm(y ~ x + covar1 + covar2 + ... + covar3, data=data)" +#' +#' Let's try it in our example data. Check the available variables again: + +names(no_out) + +#' We have three variables that might be interesting in the model: sex, as represented in the "males" +#' variable, "intervention" and "smoker". Let's start with adding one by one and look at the results + +modelIntervention <- lm(day3 ~ day2 + intervention, data=no_out) +summary(modelIntervention) + +#' As we can see, the intervention is highly significantly associated with the day3 value and the day1 +#' value is no longer significant in the model. +#' Let's try and visualise this, by coloring the intervention variable in a scatterplot + +no_out %>% + ggplot(aes(x=day3, y=day2, col=intervention)) + + geom_point() + +#' In the plot you can see that there are now three clusters. The day2 and day3 values are actually +#' dependent on the intervention group. We can visualise this in a boxplot to show the different levels + +no_out %>% + ggplot(aes(x=intervention, y=day3, col=intervention)) + + geom_boxplot() + +#' Let's see if sex also influences the association between day 2 and day3 + +modelSex <- lm(day3 ~ day2 + male, data=no_out) +summary(modelSex) + +#' As we can see, the sex does not matter in the association between the day 2 and day3 values. +#' Now let's check the last variabel, smoking status + +modelSmoker <- lm(day3 ~ day2 + smoker, data=no_out) +summary(modelSmoker) + +#'Smoking status is also not changeing the association between day2 and day3. + + + +#### Logistic regression ---- +#'For the logistic regression, we have to switch to the function for a "generalised linear model" +#'in R: "glm()". To specify that we are indeed interested in a logistic regression, we have to set the +#'family to binomial(link='logit'). Let's see if being a smoker is associated with being male. + +logModelSmoker <- glm(smoker ~ male, data=no_out, family=binomial(link='logit')) +summary(logModelSmoker) + + + + diff --git a/vignettes/.DS_Store b/vignettes/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d0def6fcde8c45cd063ba211177bfdc6218ac053 GIT binary patch literal 6148 zcmeHKu};G<5VhL^ZV@5_QW?Flb!Zu=G9X2v3~We2FaT;3wW5(qQZ*?RgrIx|{1Jb^ z?{H_km8!zPf)Krn&hLD_v*nk_7c<7VJMFg^vlwFn6tPr=<_p1b)DIbdkv)=vVQ}&#{ylP@Oe1l4M4bI^WjHNw3 zc#~)pM|a?#=geaXi2-7O7+41e%-JWl*I~1?M`C~&_z?qmK1fhR$6%pR9Uai%?<0;^ z5K+L!y9A;%=olEM?p&M{bM)ai_?m0=#UGB-CAu2u)XRN;&} z8mT1)h=FAW%DQV|{XhNw{=b|=Jz{_u_*V?@O2_NCuq9JlS2l;WR)Y3GQ82F1xJUs* im12m+QoIkU1pE>WK*wOA5j-IDBcN!Yh8Xx&20j6yPf~gS literal 0 HcmV?d00001 diff --git a/vignettes/03_analysis_modelling.Rmd b/vignettes/03_analysis_modelling.Rmd index f46988a..a045929 100644 --- a/vignettes/03_analysis_modelling.Rmd +++ b/vignettes/03_analysis_modelling.Rmd @@ -33,9 +33,9 @@ data(iris) ## What we cover -- Linear Regression -- Multiple Linear Regression -- Logistic Regression +>- Linear Regression +>- Multiple Linear Regression +>- Logistic Regression ```{r echo=FALSE, error=FALSE, message=FALSE, warning=FALSE, out.extra = 'class="centre" style="width: 500px;"', warnings=FALSE} setwd("/Users/srauschert/Desktop/Work/20.) Git_GitHub/RWorkshop/") @@ -55,42 +55,33 @@ ggplot( aes(day2, day3)) + ``` # Linear Regression -##Linear Regression in R -In a linear regression, we aim to find a model:
+## Linear Regression in R -- that represents our data and +In a linear regression, we aim to find a model:
-- can give information about the association between our variables of interest. +>- that represents our data and +>- can give information about the association between our variables of interest. -The command in R for a linear model is
+The command in R for a linear model is
lm(y~x). **y** is the outcome variable that interests us and **x** is the variable that we want to test in its association with **y** -## Example: the iris data set -The Iris data set consists of information about three different species of iris flowers, namely **setosa, virginica and versicolor**.
- -It holds information on: - -- Sepal length - -- Sepal width +## Example: +## Data set summary +Let's first have a look at the summary table of the example data set, by using the summary() command: -- Petal length +```{r, echo = FALSE, out.extra = 'class="centre" style="width: 100px;"',warning=FALSE} +#kable(summary(tki_demo[,c(6:8)])) -- Petal width +summary(tki_demo[,c(6:8)]) -## Data set summary -Let's first have a look at the summary table of the Iris data set, by using the summary() command: - -```{r, echo = FALSE, results='asis',out.extra = 'class="centre" style="width: 100px;"',warning=FALSE} -kable(summary(tki_demo[,c(6:8)])) ``` -#Visualisation of data distributions -##Helpful plots before modelling +# Visualisation of data distributions +## Helpful plots before modelling Before we start with the linear regression model, we need to get an idea of the underlying data and its distribution. We know that the linear regression has the assumtptions: @@ -173,7 +164,7 @@ grid.arrange(plot1, plot2, plot3, plot4, nrow=2, ncol=2) ``` -##Linear Regression +## Linear Regression In the plots, we could already see, that petal length and petal width seem to be associated. This is obvious when drawing a line in the plot. Let's now perform a linear regression model in R. @@ -185,14 +176,14 @@ Let's now perform a linear regression model in R. - We also specify the data set that holds the variables we specified as **x** and **y**. -##Linear Regression Results +## Linear Regression Results Now we want to look at the results of the linear regression. So how do we get the p-value and \(\beta\)-coefficient for the association? In R, we add the summary() function to the lm() function, like so: summary(lm(y~x, data=data)) -##Example Results +## Example Results ```{r, message=FALSE, warning=FALSE, error=FALSE, echo = FALSE,out.extra = 'class="centre" style="width: 500px;"'} #summary(lm(Petal.Length~Petal.Width, data=iris)) @@ -201,7 +192,7 @@ library(sjPlot) tab_model(lm1, file="output.html") ``` -##Diagnostics +## Diagnostics
```{r error=FALSE, message=FALSE, warning=FALSE, include=FALSE} @@ -250,3 +241,10 @@ ggplot(model.diag.metrics, aes(Petal.Length, Petal.Width)) + ```
+ +# Multiple Linear Regression +## How to? +Multiple linear regression works with the same function inR : lm(y ~ x + covar1 + covar2 + ... + covarx , data=data) + +# Logistic regression + diff --git a/vignettes/assets/.DS_Store b/vignettes/assets/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..6b0b1efcf3c6c0518e4018fb73125e033b32ee84 GIT binary patch literal 6148 zcmeHK%}T>S5T4bl+lttOP>*}@R-yG^=|Kp!9=wSWJ*e2k1RF?`(xeuxl{|*Nk#FGZ zIJ3JdZ50oKNZ5g$Z+3QO!+h*d764dl=+yzr0Kl;k7D}kT5E`d8CkxuMjL761J`7+R z0*K&xF4{N@1BQYB#sKZzb?AZ*4tS_P_iqS8=}FvC(s#qy#mE=89|lQUtG%*?#lq6^ z3g;zWS}R@J-FRfDPC86l&fuJSM^4&xwS5xD;fL+Gr-47ZDsSw^QR)PdKak0R=VQpl zSrB>gs1*+*FOl=uYJl@1FJ6@^dWiz!FA;p!@gFb|~D#*_38u+5p z87|&eOde714!)Yl;!I730mHzr7@+e(U?a3NW(wuefl59B5DVy*f;PP+C`V|tG-e8M z1cj+oM3u_)6N9OA^a~wlY0MO=bYS}N!St7zexWe=b(~+waA1}~qYVRwfoTR7Rkuv{ z|H1d~|LG*tGYl98a>W2EwC#2aOVW4iTyb>Q3Tz#0Bovn^lqsn6bu10KiZ`)I!7)Jw WqNOoYh#nO4BOqxo$}sS!47>rcYux+* literal 0 HcmV?d00001 diff --git a/vignettes/assets/header-includes/.DS_Store b/vignettes/assets/header-includes/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..ff731b9751be4129a5a482f6e88a2752819c8a9b GIT binary patch literal 6148 zcmeHK%}xR_5S}V(!9NZrdhEp;8rFkk4<;MbgEx~kdQbzqg1QNA4J;5v%sz&`k#FGZ zIMWs*BHlDgI?42#wlm%BmrUCM09J0-9DqCka8$xf4x2B8=1E6n%8XLrV5T~b#ys9 ziQ<~2Mqu#?LdDYSiJtO!2|Ymzk+hnEE=dFO)klPor%u1D1hd2Bvj4Pxt@+`1*f1 z$o4D)mVt?4fMxdly)tge+^r*b%7 literal 0 HcmV?d00001