forked from cmbosma/r_templates
-
Notifications
You must be signed in to change notification settings - Fork 0
/
new_projects.r
200 lines (148 loc) · 6.47 KB
/
new_projects.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
## [Insert Title]

## Open Useful Websites or Documentation
## -----------------------
# Opens each reference page in the default browser, in the order listed.
# Wrapped in local() so the helper vector does not linger in the workspace.
local({
  reference_pages <- c(
    # Quick-R website - great reference for data analysis in R
    "http://www.statmethods.net/",
    # Available CRAN Packages by Name
    "http://cran.r-project.org/web/packages/available_packages_by_name.html",
    # List of packages by category
    "http://cran.r-project.org/web/views/",
    # Flow chart for choosing the correct analyses
    "http://www.ats.ucla.edu/stat/mult_pkg/whatstat/",
    # Flow chart for choosing type of analysis to run
    "http://psych.colorado.edu/~mcclella/psych3101h/statfinder/start.html",
    # Tidyverse package website
    "http://tidyverse.org",
    # The tidyverse style guide
    "http://style.tidyverse.org"
  )
  for (page in reference_pages) {
    browseURL(page)
  }
})
## PACKAGES
## -----------------------
# Install-if-missing, then attach.
# requireNamespace() only checks whether the package is installed; library()
# then attaches it and stops with an error if that fails. (require() merely
# returns FALSE on failure, so the script would carry on without the package.)
#
# Template:
# if (!requireNamespace("packagename", quietly = TRUE)) install.packages("packagename")
# library(packagename)

# Basic upgrades to the base R functions tailored to psychology
if (!requireNamespace("psych", quietly = TRUE)) install.packages("psych")
library(psych)

# Packages by Hadley Wickham. Includes dplyr, tidyr, and ggplot2
if (!requireNamespace("tidyverse", quietly = TRUE)) install.packages("tidyverse")
library(tidyverse)

# Package for importing SPSS, Stata, and SAS data sets (use readxl for Excel)
if (!requireNamespace("haven", quietly = TRUE)) install.packages("haven")
library(haven)

# Package for checking style of code
if (!requireNamespace("lintr", quietly = TRUE)) install.packages("lintr")
library(lintr)

# Outlier detection
if (!requireNamespace("outliers", quietly = TRUE)) install.packages("outliers")
library(outliers)

# Correcting for multiple comparisons: p.adjust() is a function in the stats
# package, which ships with R and is attached by default. There is no
# "p.adjust" package to install or load.

# Package for winsorizing and critical values
if (!requireNamespace("robustHD", quietly = TRUE)) install.packages("robustHD")
library(robustHD)

# Cleans up the output of linear models into a tabular output
if (!requireNamespace("broom", quietly = TRUE)) install.packages("broom")
library(broom)
# More efficient way to load packages
# Packages needed
# Note: "p.adjust" is deliberately not listed. p.adjust() is a function in the
# stats package (attached by default), not a package; listing it makes
# library() fail and stops the remaining packages from being attached.
packages <- c(
  "tidyverse",
  "here",
  "readxl",
  "psych",
  "haven",
  "magrittr",
  "lintr",
  "outliers",
  "robustHD",
  "broom",
  "VIM"
)

# Install packages not yet installed
# installed_packages is a logical vector parallel to `packages`.
installed_packages <- packages %in% rownames(installed.packages())
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}

# Load packages into workspace
# invisible() suppresses the list of search paths that lapply() would print.
invisible(lapply(packages, library, character.only = TRUE))
## IMPORT DATA (from flat .csv file)
# Note: use read_csv2() / read.csv2() for European data sets (semicolon
# separated, commas as decimals). Use read.table() to set parameters to read
# other flat data formats.
# data.table::fread() imports data and makes it easy to drop/select variables,
# ex: data <- data.table::fread("data.csv", select = c(), drop = c())
## -----------------------
# Pick ONE of the import options below; each one overwrites `data`.

# Shows the project root that here() builds paths from (it does not change the
# working directory) - works best when you are using R projects
here::here()

# Option 1: path relative to the project root
data <- readr::read_csv(here::here("datafolder", "data.csv"))

# Option 2: explicit path - read_csv() comes from readr (part of the
# tidyverse), not haven; use its col_types argument to set data types on import
data <- readr::read_csv("[data.csv]")

# Option 3: opens a file chooser for you to pick the file
data <- readr::read_csv(file.choose())

names(data) <- tolower(names(data)) # Change all variable names to lowercase
# Replace "-" with "_" in variable names (change based on variable names).
# fixed = TRUE matches the pattern literally; this matters if you switch the
# pattern to ".", which as a regular expression would match every character.
names(data) <- gsub("-", "_", names(data), fixed = TRUE)
names(data) # Checking changes to variable names

head(data, 10) # Print first 10 rows of data set
tail(data, 10) # Print last 10 rows of data set
str(data) # Check to see which variables may need to be converted to a different class
View(data) # Opens the data viewer (interactive sessions only)
## SUBSETTING COLUMNS OF INTEREST
# Replace vec1, vec2, vec3 with the names of the columns to keep.
# Both versions start from `data` and produce the same `df`; use one of them.

# base: select columns by name with a character vector in the column position
df <- data.frame(data[, c("vec1", "vec2", "vec3")])

# tidyverse
df <- data %>%
  dplyr::select(vec1, vec2, vec3)
## FORMATTING DATA - Basic (if needed)
## -----------------------
# Replace df$vec with the column you are inspecting or converting.
# The conversion calls below only print their result; assign it back
# (e.g. df$vec <- as.numeric(df$vec)) to keep the converted column.
class(df$vec) # Class of a single column
str(df) # Shows type of data (class and other parameters)

# Factors
factor(df$vec)

# Characters
is.character(df$vec)
as.character(df$vec)

# Numeric
is.numeric(df$vec)
as.numeric(df$vec)

# Centering
scale(df$vec, center = TRUE, scale = FALSE)
df$vec - mean(df$vec) # manually
## LOOK AT THE DATA
## -----------------------
# Replace df / df$vec with your own data frame and column.
ls() # Look at what is in your workspace
summary(df) # Per-column summaries
psych::describe(df) # Descriptive statistics from the psych package
table(df$vec) # Useful for looking at frequencies of categorical variables
VIM::aggr(df) # from the VIM package; investigate instances of NAs
## PLOTTING DATA AND OUTLIER DETECTION
## ------------------------
# Using ggplot2
# Replace xvar_rand_norm with the column you want to plot.

# Save the mean of the plotted column, so the reference line and label below
# describe the same variable as the histogram. na.rm = TRUE keeps the mean
# defined when the column contains missing values.
xvar_mean <- mean(data$xvar_rand_norm, na.rm = TRUE)

# Histogram with a vertical line and text label at the mean
ggplot(data = data, aes(x = xvar_rand_norm)) +
  geom_histogram() +
  geom_vline(xintercept = xvar_mean, color = "dark red") +
  annotate(
    "text",
    label = paste("Mean: ", round(xvar_mean, digits = 2)),
    x = xvar_mean, y = 30, color = "white", size = 5
  )
# Boxplots using ggplot2
# Replace xvar_rand_norm with the column you want to plot.
ggplot(data = data, aes(y = xvar_rand_norm)) +
  geom_boxplot()

# Using base graphics
# Replace data$xvar / data$yvar / data$vec with your own columns. Columns are
# referenced explicitly instead of attach()-ing the data frame, which would
# put its columns on the search path and can mask other objects.
par() # view current settings
# Copy only the settings that can be set again; a plain par() also returns
# read-only parameters, and restoring those produces warnings.
opar <- par(no.readonly = TRUE)
plot(data$xvar, data$yvar)
boxplot(data$vec)
hist(data$vec)
par(opar) # restore original settings
# Outlier detection
# Replace df$vec with the column being screened, `id` with the identifier
# column, and x with the cutoff value.
psych::describe(df$vec)
boxplot(df$vec)
scale(df$vec, center = TRUE, scale = TRUE) # calculate z-scores for each value in vector
outliers::outlier(df$vec) # using the 'outliers' package

# if univariate, continuous variable
outlier_values <- boxplot.stats(df$vec)$out # points beyond the boxplot whiskers
boxplot(df$vec, main = "variable name", boxwex = 0.1)
mtext(paste("Outliers: ", paste(outlier_values, collapse = ", ")), cex = 0.6)

# Rows at or above the cutoff (change argument accordingly)
outlier_ids_upper <- subset(df, subset = vec >= x, select = c(id, vec))
outlier_ids_upper
# Rows at or below the cutoff (change argument accordingly)
outlier_ids_lower <- subset(df, subset = vec <= x, select = c(id, vec))
outlier_ids_lower

# Converting to z-score
scale(df$vec, center = TRUE, scale = TRUE)
# Manually: the parentheses are required, because division binds tighter than
# subtraction and the deviation from the mean must be formed first.
(df$vec - mean(df$vec)) / sd(df$vec)
# List cases based on cutoff values with dplyr
# Replace lower_cutoff / upper_cutoff with the cutoffs for the variable.
dplyr::filter(df[c("id", "vec")], vec < lower_cutoff | vec > upper_cutoff)

# Winsorizing
# Replace id_number, row_index, col_index, x, and new_value with real values.
# Note the assignment operator `<-`: `==` only compares and changes nothing.

# base: replace the value for one case, located by its id
df[which(df$id == id_number), "vec"] <- new_value
# base: replace the value at a known row/column position
df[row_index, col_index] <- new_value

# dplyr: replace the value for one case, located by its id (e.g., "AAE012")
df <- df %>% mutate(vec = replace(vec, id == id_number, new_value))
# dplyr: replace every value at or below the cutoff x
df <- df %>% mutate(vec = replace(vec, vec <= x, new_value))
## DESCRIPTIVE STATISTICS
## -----------------------
# Replace vec, group_vec, vec1, vec2, and x with your own columns and values.

# Using tidyverse
# By variable
df %>%
  summarise(
    mean = mean(vec),
    sd = sd(vec)
  )

# Grouped by a grouping variable
# Steps are chained with %>%; a trailing comma here is a syntax error.
df %>%
  group_by(group_vec) %>%
  summarise(
    mean = mean(vec),
    sd = sd(vec)
  )

# Based on a value
df %>%
  filter(vec == x) %>% # can use any comparison or logical operator
  summarise(
    mean = mean(vec),
    sd = sd(vec)
  )

# Base R: apply a function to each selected column.
# Depending on type of data, use lapply(), or vapply() for more control
sapply(df[c("vec1", "vec2")], mean)
sapply(df[c("vec1", "vec2")], sd)
sapply(df[c("vec1", "vec2")], range)
## DATA ANALYSIS
## -----------------------