Introduction to R - 2nd lesson (classes and data)

Nathalie Villa-Vialaneix - http://www.nathalievialaneix.eu
September 14-16th, 2015

Master TIDE, Université Paris 1

Class

a <- c(3, 4, 5); class(a)

[1] "numeric"

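For atomic vectors, class generally coincides with mode (one exception: an integer vector such as 1:3 has class "integer" but mode "numeric"). More complicated objects have different classes, such as: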

matrix and array
data.frame
list
…

Matrices and arrays

class matrix
definition, subsetting
operations
functions
class array

Matrix

matrix: a two-dimensional array whose entries all have the same mode

A <- matrix(0, ncol=3, nrow=2); A

     [,1] [,2] [,3]
[1,]    0    0    0
[2,]    0    0    0

B <- matrix(1:10, ncol=5); B

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    3    5    7    9
[2,]    2    4    6    8   10

Matrix definition and subsetting

C <- matrix(c("A", 1:9), ncol=5, byrow=TRUE); C

     [,1] [,2] [,3] [,4] [,5]
[1,] "A"  "1"  "2"  "3"  "4" 
[2,] "5"  "6"  "7"  "8"  "9"

C[2,3] <- 1; C

     [,1] [,2] [,3] [,4] [,5]
[1,] "A"  "1"  "2"  "3"  "4" 
[2,] "5"  "6"  "1"  "8"  "9"

Matrix block subsetting

B

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    3    5    7    9
[2,]    2    4    6    8   10

B[1,]

[1] 1 3 5 7 9

B[,2:3]

     [,1] [,2]
[1,]    3    5
[2,]    4    6

Elementwise operations

B[,1:2]+B[,3:4]

     [,1] [,2]
[1,]    6   10
[2,]    8   12

B*B

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    9   25   49   81
[2,]    4   16   36   64  100

Matrix operations

B%*%t(B); tcrossprod(B,B) # faster

     [,1] [,2]
[1,]  165  190
[2,]  190  220

     [,1] [,2]
[1,]  165  190
[2,]  190  220

crossprod(B,B)

     [,1] [,2] [,3] [,4] [,5]
[1,]    5   11   17   23   29
[2,]   11   25   39   53   67
[3,]   17   39   61   83  105
[4,]   23   53   83  113  143
[5,]   29   67  105  143  181

Matrix operations

crossprod(B,B)

     [,1] [,2] [,3] [,4] [,5]
[1,]    5   11   17   23   29
[2,]   11   25   39   53   67
[3,]   17   39   61   83  105
[4,]   23   53   83  113  143
[5,]   29   67  105  143  181

outer(B[,1], B[,2])

     [,1] [,2]
[1,]    3    4
[2,]    6    8

Matrix operations

D <- crossprod(B)
resEigen <- eigen(D)
resEigen$values

[1]  3.844798e+02  5.201834e-01  2.697369e-15 -4.583117e-15 -1.657516e-14

all.equal(resEigen$vectors%*%
            diag(resEigen$values)%*%
            t(resEigen$vectors),
          D)

[1] TRUE

Matrix functions

ncol(B); nrow(B); dim(B)

[1] 5

[1] 2

[1] 2 5

mode(B); mode(C)

[1] "numeric"

[1] "character"

Matrix functions

rowSums(B); colSums(B)

[1] 25 30

[1]  3  7 11 15 19

colnames(B)

NULL

rownames(B) <- 1:nrow(B); rownames(B)

[1] "1" "2"

Functions on rows and columns

apply(B, 1, sum); apply(B, 2, sum)

 1  2 
25 30

[1]  3  7 11 15 19

apply(B, 2, as.character)

     [,1] [,2] [,3] [,4] [,5]
[1,] "1"  "3"  "5"  "7"  "9" 
[2,] "2"  "4"  "6"  "8"  "10"

Functions on rows and columns

sweep(B, 1, rep(2,nrow(B)), "+")

  [,1] [,2] [,3] [,4] [,5]
1    3    5    7    9   11
2    4    6    8   10   12

sweep(B, 1, rowSums(B), "/")

        [,1]      [,2] [,3]      [,4]      [,5]
1 0.04000000 0.1200000  0.2 0.2800000 0.3600000
2 0.06666667 0.1333333  0.2 0.2666667 0.3333333

Arrays

An array is similar to a matrix, but it can have more than 2 dimensions

E <- array(1:20, c(2,5,2)); E

, , 1

     [,1] [,2] [,3] [,4] [,5]
[1,]    1    3    5    7    9
[2,]    2    4    6    8   10

, , 2

     [,1] [,2] [,3] [,4] [,5]
[1,]   11   13   15   17   19
[2,]   12   14   16   18   20

Exercise 1

vect <- 1:6

From vect, obtain each of the following two matrices using a single command line:

     [,1] [,2] [,3]
[1,]    1    1    1
[2,]    2    2    2
[3,]    3    3    3
[4,]    4    4    4
[5,]    5    5    5
[6,]    6    6    6

     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    2    3    4    5    6
[2,]    1    2    3    4    5    6
[3,]    1    2    3    4    5    6

Exercise 2

data(Titanic)
cont <- t(apply(Titanic, c(1,4), sum)); cont

        Class
Survived 1st 2nd 3rd Crew
     No  122 167 528  673
     Yes 203 118 178  212

Compute the proportion of survivors in each class with a single command line.

Data frames

class data.frame
definition
subsetting
operators and vectors
basic statistics

Definition

data.frame objects are two-dimensional tables whose columns may have different modes. They are well suited to storing several observations (rows) of various variables (columns).

df <- data.frame(nom=c("Nathalie",
                       "Madalina",
                       "Nicolas"),
                 age=c(38,34,31))
df

       nom age
1 Nathalie  38
2 Madalina  34
3  Nicolas  31

Data frames and class

Unlike in a matrix, the columns of a data frame need not all have the same class:

class(df); sapply(df, class)

[1] "data.frame"

      nom       age 
 "factor" "numeric"

Subsetting

Columns can be extracted by number or name

names(df)

[1] "nom" "age"

df[,2]; df$age

[1] 38 34 31

[1] 38 34 31

Subsetting

df[1,]

       nom age
1 Nathalie  38

df[df$age>32,]; subset(df, age>32, c("nom"))

       nom age
1 Nathalie  38
2 Madalina  34

       nom
1 Nathalie
2 Madalina

Exploring a data frame

data(iris);
dim(iris); names(iris); sapply(iris,class)

[1] 150   5

[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
[5] "Species"

Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
   "numeric"    "numeric"    "numeric"    "numeric"     "factor"

Exploring a data frame

summary(iris[,3:5])

  Petal.Length    Petal.Width          Species  
 Min.   :1.000   Min.   :0.100   setosa    :50  
 1st Qu.:1.600   1st Qu.:0.300   versicolor:50  
 Median :4.350   Median :1.300   virginica :50  
 Mean   :3.758   Mean   :1.199                  
 3rd Qu.:5.100   3rd Qu.:1.800                  
 Max.   :6.900   Max.   :2.500

Basic numeric statistics

min(iris$Sepal.Width); max(iris$Sepal.Width)

[1] 2

[1] 4.4

mean(iris$Sepal.Width)

[1] 3.057333

median(iris$Sepal.Width)

[1] 3

Basic numeric statistics

range(iris$Sepal.Width)

[1] 2.0 4.4

sd(iris$Sepal.Width); var(iris$Sepal.Width)

[1] 0.4358663

[1] 0.1899794

quantile(iris$Sepal.Width, c(0,0.25,0.5,0.75,1))

  0%  25%  50%  75% 100% 
 2.0  2.8  3.0  3.3  4.4

Contingency tables

table(iris$Species)


    setosa versicolor  virginica 
        50         50         50

iris$Large <- factor(iris$Sepal.Width>3)
table(iris$Species,iris$Large)


             FALSE TRUE
  setosa         8   42
  versicolor    42    8
  virginica     33   17

Operations on rows and columns

apply(iris[,1:4], 2, mean)

Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333

s.iris <- sweep(iris[,1:4], 2,
                apply(iris[,1:4],2,mean),
                "-")
head(s.iris[,1:2], 3)

  Sepal.Length Sepal.Width
1   -0.7433333  0.44266667
2   -0.9433333 -0.05733333
3   -1.1433333  0.14266667

Operations on rows and columns

apply(s.iris, 2, mean)

 Sepal.Length   Sepal.Width  Petal.Length   Petal.Width 
-3.666902e-16  9.169893e-17 -3.158931e-17 -3.359437e-17

s2.iris <- scale(iris[,1:2])
apply(s2.iris,2,mean); apply(s2.iris,2,sd)

 Sepal.Length   Sepal.Width 
-4.484318e-16  2.034094e-16

Sepal.Length  Sepal.Width 
           1            1

Conditional statistics

by(iris$Sepal.Length, iris$Species, mean)

iris$Species: setosa
[1] 5.006
-------------------------------------------------------- 
iris$Species: versicolor
[1] 5.936
-------------------------------------------------------- 
iris$Species: virginica
[1] 6.588

Conditional statistics

tapply(iris$Sepal.Length, iris$Species, sd)

    setosa versicolor  virginica 
 0.3524897  0.5161711  0.6358796

head(tapply(iris$Sepal.Length, iris$Species,
            quantile, probs=c(0.25,0.75)), 2)

$setosa
25% 75% 
4.8 5.2 

$versicolor
25% 75% 
5.6 6.3

Exercise 3

data(airquality)

What is the dimension of the data?
What are the variables included in the data? What are their types?
How can you obtain a summary of the data?
How many days have a temperature above 80°F? What are the average Ozone rate and Wind speed for these days?
What is the average Ozone rate for each month?

Lists

class list
definition
subsetting
operators and vectors
basic statistics

Lists

A list is a collection of (possibly heterogeneous) objects:

a.list <- list("name"="a list", "matrix"=B,
               "dataframe"=df); a.list

$name
[1] "a list"

$matrix
  [,1] [,2] [,3] [,4] [,5]
1    1    3    5    7    9
2    2    4    6    8   10

$dataframe
       nom age
1 Nathalie  38
2 Madalina  34
3  Nicolas  31

Basic operations

class(a.list)

[1] "list"

length(a.list)

[1] 3

names(a.list)

[1] "name"      "matrix"    "dataframe"

Basic operations

lapply(a.list, class)

$name
[1] "character"

$matrix
[1] "matrix"

$dataframe
[1] "data.frame"

Basic operations

lapply(a.list, length)

$name
[1] 1

$matrix
[1] 10

$dataframe
[1] 2