From 1600852eec4a888fa430d7ab4054a72162b38933 Mon Sep 17 00:00:00 2001 From: ngotelli Date: Thu, 1 Feb 2024 16:32:40 -0500 Subject: [PATCH] update Atomic Vectors I lecture --- Lectures/AtomicVectors_I.Rmd | 274 ++++++++++++ Lectures/AtomicVectors_I.html | 765 ++++++++++++++++++++++++++++++++++ index.Rmd | 6 +- index.html | 9 +- 4 files changed, 1047 insertions(+), 7 deletions(-) create mode 100644 Lectures/AtomicVectors_I.Rmd create mode 100644 Lectures/AtomicVectors_I.html diff --git a/Lectures/AtomicVectors_I.Rmd b/Lectures/AtomicVectors_I.Rmd new file mode 100644 index 0000000..463e377 --- /dev/null +++ b/Lectures/AtomicVectors_I.Rmd @@ -0,0 +1,274 @@ +--- +title: 'Atomic Vectors I' +author: "Nicholas J. Gotelli" +date: "1 February 2024" +output: + html_document: + highlight: tango + keep_md: no + theme: united + pdf_document: default +--- +## History of R +- Early languages FORTRAN, C +- S language in 1970s; programming plus stats, matrix algebra, graphics later +- Open-source R in 1992; first stable version in 2000 + +## Advantages of R +- interpreted (for interactive use) +- graphics, statistics +- very active community of contributors +- works on multiple platforms + +## Disadvantages of R +- interpreted (slow speed) +- lazy evaluation +- functions hard to learn +- poorly documented +- unreliable packages +- problems with big data + - subroutines coded for C or Fortran + - Julia? 
+ +## R as a general programming language +- data structures and types +- writing functions +- for loops +- if..then, while statements +- structured programming +- annotation +- naming of variables +- using random numbers +- structured programming +- pseudocode + +### Using the assignment operator + +```{r, eval=FALSE} +# Using the assignment operator +x <- 5 # preferred +y = 4 # legal but not used except in function defaults +y = y + 1.1 +print(y) +y <- y + 1.1 +print(y) +``` + +# Variable names + +```{r} +z <- 3 # Begin with lower case letter +plantHeight <- 10 # option "camelCaseFormatting" +plant.height <- 4.2 # avoid periods +plant_height <- 3.3 # optimal "snake_case_formatting" +. <- 5.5 # reserve this for a generic temporary variable (more later) +``` + +## R's Four Data Types + +Dimensions | Homogeneous | Heterogeneous +------------- | ------------- | ------------- +1-dimension | Atomic Vector | List +2-dimensions | Matrix | Data Frame +n-dimenions | (array) | | + +## Types of Atomic Vectors +- character strings +- integers +- double +- integers, doubles are "numeric" +- logical +- (factor) +- vector of lists! + +### One dimensional atomic vectors +```{r, eval=FALSE} +# the combine function +z <- c(3.2, 5, 5, 6) +print(z) +typeof(z) +is.numeric(z) + +# c() always "flattens" to an atomic vector +z <- c(c(3,4),c(5,6)) +print(z) + +# character strings with single or double quotes +z <- c("perch","bass",'trout') +print(z) + +# use both with an internal quote +z <- c("This is only 'one' character string", 'a second') +print(z) +typeof(z) +is.character(z) + +# building logicals +# Boolean, not with quotes, all caps +z <- c(TRUE,TRUE,FALSE) +# avoid abbreviations T, F which will work +print(z) +typeof(z) +is.logical(z) +is.integer(z) + +``` +### Three Properties of a Vector + +#### Type + +```{r, eval=FALSE} +z <- c(1.1, 1.2, 3, 4.4) +typeof(z) # gives type +is.numeric(z) # is. gives logical +as.character(z) # as. 
coerces variable +print(z) +typeof(z) + +``` + +#### Length +```{r, eval=FALSE} +length(z) # gives number of elements +length(y) # throws error if variable does not exist + +``` + +#### Names +```{r, eval=FALSE} +z <- runif(5) +# optional attribute not initially assigned +names(z) +print(z) +# add names later after variable is created +names(z) <- c("chow","pug","beagle","greyhound","akita") +print(z) + +# add names when variable is built (with or without quotes) + z2 <- c(gold=3.3, silver=10, lead=2) +print(z2) + +# reset names +names(z2) <- NULL + +# names can be added for only a few elements +# names do not have to be distinct, but often are +names(z2) <- c("copper","zinc") +print(z2) + +``` + +#### Special Data Types +```{r, eval=FALSE} + +# NA values for missing data +z <- c(3.2,3.3,NA) # NA is a missing value +typeof(z) +length(z) +typeof(z[3]) # what is the type of third element + +z1 <- NA +typeof(z1) #different NA types + +is.na(z) # logical operator to find missing values +mean(z) # won't work because of NA +is.na(z)# evaluate to find midding values +!is.na(z) # use ! for NOT missing values +mean(!is.na(z)) # wrong answer based on TRUE FALSE!! +mean(z[!is.na(z)]) # correct use of indexing +#----------------------------- + +# NaN, -Inf, and Inf from numeric division +z <- 0/0 # NaN +typeof(z) +print(z) +z <- 1/0 # Inf +print(z) +z <-1/0 # - Inf +print(z) +#------------------------------- +# NULL is an object that is nothing! 
+# a reserved word in R +z <- NULL +typeof(z) +length(z) +is.null(z) # only operation that works on a null +``` +### Three Notable Features of Atomic Vectors + +#### Coercion + +```{r, eval=FALSE} +# All atomics are of the same type +# if they are different, R coerces them +# logical -> integer -> double -> character + +a <- c(2, 2.0) +print(a) +typeof(a) # technically integer coerced to numeric + +b <- c("purple","green") +typeof(b) + +d <- c(a,b) +print(d) +typeof(d) + +# "Mistakes" in numeric variables convert to strings +# Very useful when working with logical variables + +a <- runif(10) +print(a) + +# Comparison operators yield a logical result +a > 0.5 + +# do math on a logical and it coerces to an integer! + +# How many elements are greater than 0.5? +sum(a > 0.5) + +# What proportion of the vector elements are greater than 0.5? + +mean(a > 0.5) + +#Qualifying exam question! Approximately what proportion of observations drawn from a normal (0,1) distribution are larger than 2.0? + +mean(rnorm(1000) > 2) +``` + + +#### Vectorization + +```{r, eval=FALSE} +# adding a constant to a vector +z <- c(10,20,30) +z + 1 + +# what happens when vectors are added? + +y <- c(1,2,3) +z + y + +# results is an "element by element" operation on the vector +# most vector operations can be done this way + +z^2 + +``` + + + +#### Recycling +```{r, eval=FALSE} +# but what if vector lengths are not equal? +z <- c(10,20,30) +x <- c(1,2) +z + x + +# warning is issued by calculation is still made +# shorter vector is always "recycled" +# works with scalars (= vector of length(1)) + +``` + diff --git a/Lectures/AtomicVectors_I.html b/Lectures/AtomicVectors_I.html new file mode 100644 index 0000000..d38ecbe --- /dev/null +++ b/Lectures/AtomicVectors_I.html @@ -0,0 +1,765 @@ + + + + + + + + + + + + + + + +Atomic Vectors I + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

History of R

+
    +
  • Early languages FORTRAN, C
  • +
  • S language in 1970s; programming plus stats, matrix algebra, +graphics later
  • +
  • Open-source R in 1992; first stable version in 2000
  • +
+
+
+

Advantages of R

+
    +
  • interpreted (for interactive use)
  • +
  • graphics, statistics
  • +
  • very active community of contributors
  • +
  • works on multiple platforms
  • +
+
+
+

Disadvantages of R

+
    +
  • interpreted (slow speed)
  • +
  • lazy evaluation
  • +
  • functions hard to learn
  • +
  • poorly documented
  • +
  • unreliable packages
  • +
  • problems with big data +
      +
    • subroutines coded for C or Fortran
    • +
    • Julia?
    • +
  • +
+
+
+

R as a general programming language

+
    +
  • data structures and types
  • +
  • writing functions
  • +
  • for loops
  • +
  • if..then, while statements
  • +
  • structured programming
  • +
  • annotation
  • +
  • naming of variables
  • +
  • using random numbers
  • +
  • structured programming
  • +
  • pseudocode
  • +
+
+

Using the assignment operator

+
# Using the assignment operator
+x <- 5 # preferred
+y = 4 # legal but not used except in function defaults
+y = y + 1.1
+print(y)
+y <- y + 1.1
+print(y)
+
+
+
+

Variable names

+
z <- 3 # Begin with lower case letter 
+plantHeight <- 10 # option "camelCaseFormatting"
+plant.height <- 4.2 # avoid periods
+plant_height <- 3.3  # optimal "snake_case_formatting"
+. <- 5.5 # reserve this for a generic temporary variable (more later)
+
+

R’s Four Data Types

+ + + + + + + + + + + + + + + + + + + + + + + + + +
DimensionsHomogeneousHeterogeneous
1-dimensionAtomic VectorList
2-dimensionsMatrixData Frame
n-dimensions(array)
+
+
+

Types of Atomic Vectors

+
    +
  • character strings
  • +
  • integers
  • +
  • double
  • +
  • integers, doubles are “numeric”
  • +
  • logical
  • +
  • (factor)
  • +
  • vector of lists!
  • +
+
+

One dimensional atomic vectors

+
# the combine function
+z <- c(3.2, 5, 5, 6) 
+print(z)
+typeof(z)
+is.numeric(z)
+
+# c() always "flattens" to an atomic vector
+z <- c(c(3,4),c(5,6)) 
+print(z)
+
+# character strings with single or double quotes
+z <- c("perch","bass",'trout') 
+print(z)
+
+# use both with an internal quote
+z <- c("This is only 'one' character string", 'a second')
+print(z)
+typeof(z)
+is.character(z)
+
+# building logicals
+# Boolean, not with quotes, all caps
+z <- c(TRUE,TRUE,FALSE) 
+# avoid abbreviations T, F which will work
+print(z)
+typeof(z)
+is.logical(z)
+is.integer(z)
+
+
+

Three Properties of a Vector

+
+

Type

+
z <- c(1.1, 1.2, 3, 4.4)
+typeof(z) # gives type
+is.numeric(z) # is. gives logical
+as.character(z) # as. coerces variable
+print(z)
+typeof(z)
+
+
+

Length

+
length(z) # gives number of elements
+length(y) # throws error if variable does not exist
+
+
+

Names

+
z <- runif(5)
+# optional attribute not initially assigned
+names(z) 
+print(z)
+# add names later after variable is created
+names(z) <- c("chow","pug","beagle","greyhound","akita")
+print(z)
+
+# add names when variable is built (with or without quotes)
+ z2 <- c(gold=3.3, silver=10, lead=2)
+print(z2)
+
+# reset names
+names(z2) <- NULL
+
+# names can be added for only a few elements
+# names do not have to be distinct, but often are
+names(z2) <- c("copper","zinc")
+print(z2)
+
+
+

Special Data Types

+
# NA values for missing data
+z <- c(3.2,3.3,NA) # NA is a missing value
+typeof(z)
+length(z)
+typeof(z[3]) # what is the type of third element
+
+z1 <- NA
+typeof(z1) #different NA types
+
+is.na(z) # logical operator to find missing values
+mean(z) # won't work because of NA
+is.na(z)# evaluate to find missing values
+!is.na(z) # use ! for NOT missing values
+mean(!is.na(z)) # wrong answer based on TRUE FALSE!!
+mean(z[!is.na(z)]) # correct use of indexing
+#-----------------------------
+
+# NaN, -Inf, and Inf from numeric division
+z <-  0/0   # NaN
+typeof(z)
+print(z)
+z <- 1/0   # Inf
+print(z)
+z <- -1/0  # -Inf
+print(z)
+#-------------------------------
+# NULL is an object that is nothing!
+# a reserved word in R
+z <- NULL
+typeof(z)
+length(z)
+is.null(z) # only operation that works on a null
+
+
+
+

Three Notable Features of Atomic Vectors

+
+

Coercion

+
# All atomics are of the same type
+# if they are different, R coerces them
+# logical -> integer -> double -> character
+
+a <- c(2, 2.0)
+print(a)
+typeof(a) # "double": 2 and 2.0 are both doubles; an integer literal is written 2L
+
+b <- c("purple","green")
+typeof(b)
+
+d <- c(a,b)
+print(d)
+typeof(d)
+
+# "Mistakes" in numeric variables convert to strings
+# Very useful when working with logical variables
+
+a <- runif(10)
+print(a)
+
+# Comparison operators yield a logical result
+a > 0.5
+
+# do math on a logical and it coerces to an integer!
+
+# How many elements are greater than 0.5?
+sum(a > 0.5)
+
+# What proportion of the vector elements are greater than 0.5?
+
+mean(a > 0.5)
+
+#Qualifying exam question! Approximately what proportion of observations drawn from a normal (0,1) distribution are larger than 2.0?
+
+mean(rnorm(1000) > 2)
+
+
+

Vectorization

+
# adding a constant to a vector
+z <- c(10,20,30)
+z + 1
+
+# what happens when vectors are added?
+
+y <- c(1,2,3)
+z + y
+
+# result is an "element by element" operation on the vector
+# most vector operations can be done this way
+
+z^2
+
+
+

Recycling

+
# but what if vector lengths are not equal?
+z <- c(10,20,30)
+x <- c(1,2)
+z + x
+
+# warning is issued but calculation is still made
+# shorter vector is always "recycled"
+# works with scalars (= vector of length(1))
+
+
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/index.Rmd b/index.Rmd index aa49d54..c43a94b 100644 --- a/index.Rmd +++ b/index.Rmd @@ -35,10 +35,10 @@ January 17 | - | [Homework #1](Homeworks/Homework_01.html) January 18 | [GitHub](Lectures/GitHub.html) | - January 23 | [Markdown](Lectures/Markdown.html) | - January 24 | - | [Homework #2](Homeworks/Homework_02.html) -January 25 | [Regular Expressions](Lectures/RegularExpressions.html) | - -January 30 | [Markdown II](Lectures/Lecture_05.xhtml) | - +January 25 | [Markdown II](Lectures/Markdown.html) | - +January 30 | [Regular Expressions](Lectures/RegularExpressions.html) | - January 31 | - | [Homework #3](Homeworks/Homework_03.html) -February 1 | [LaTeX & Typora](Lectures/Lecture_06.xhtml) | - +February 1 | [Atomic Vectors I](Lectures/AtomicVectors_I.html) | - February 6 | [Atomic Vectors I](Lectures/Lecture_07.xhtml) | - February 7 | **Portfolio Check #1** | [Homework #4](Homeworks/Homework_04.html) February 8 | [Atomic Vectors II](Lectures/Lecture_08.xhtml)| - diff --git a/index.html b/index.html index 4560ae3..0b8bc05 100644 --- a/index.html +++ b/index.html @@ -84,6 +84,7 @@

Course Materials

Installation

Course Syllabus

R Scripts

+

Example Datasets

Cheat Sheets & Programming Resources


@@ -131,13 +132,13 @@

Lecture Outlines & Homework Assignments

January 25 -Regular -Expressions +Markdown II - January 30 -Markdown II +Regular +Expressions - @@ -147,7 +148,7 @@

Lecture Outlines & Homework Assignments

February 1 -LaTeX & Typora +Atomic Vectors I -