Nathalie Villa-Vialaneix - http://www.nathalievialaneix.eu
September 14-16th, 2015
Master TIDE, Université Paris 1
a <- c(3, 4, 5); class(a)
[1] "numeric"
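For simple vectors, class is generally identical to mode (one exception: an integer vector such as 1:10 has class "integer" but mode "numeric"). More complicated objects have different classes, such as: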
- matrix and array
- data.frame
- list
matrix
array
matrix: a two-dimensional array whose entries all have the same mode
A <- matrix(0, ncol=3, nrow=2); A
[,1] [,2] [,3]
[1,] 0 0 0
[2,] 0 0 0
B <- matrix(1:10, ncol=5); B
[,1] [,2] [,3] [,4] [,5]
[1,] 1 3 5 7 9
[2,] 2 4 6 8 10
C <- matrix(c("A", 1:9), ncol=5, byrow=TRUE); C
[,1] [,2] [,3] [,4] [,5]
[1,] "A" "1" "2" "3" "4"
[2,] "5" "6" "7" "8" "9"
C[2,3] <- 1; C
[,1] [,2] [,3] [,4] [,5]
[1,] "A" "1" "2" "3" "4"
[2,] "5" "6" "1" "8" "9"
B
[,1] [,2] [,3] [,4] [,5]
[1,] 1 3 5 7 9
[2,] 2 4 6 8 10
B[1,]
[1] 1 3 5 7 9
B[,2:3]
[,1] [,2]
[1,] 3 5
[2,] 4 6
B[,1:2]+B[,3:4]
[,1] [,2]
[1,] 6 10
[2,] 8 12
B*B
[,1] [,2] [,3] [,4] [,5]
[1,] 1 9 25 49 81
[2,] 4 16 36 64 100
B%*%t(B); tcrossprod(B,B) # faster
[,1] [,2]
[1,] 165 190
[2,] 190 220
[,1] [,2]
[1,] 165 190
[2,] 190 220
crossprod(B,B)
[,1] [,2] [,3] [,4] [,5]
[1,] 5 11 17 23 29
[2,] 11 25 39 53 67
[3,] 17 39 61 83 105
[4,] 23 53 83 113 143
[5,] 29 67 105 143 181
crossprod(B,B)
[,1] [,2] [,3] [,4] [,5]
[1,] 5 11 17 23 29
[2,] 11 25 39 53 67
[3,] 17 39 61 83 105
[4,] 23 53 83 113 143
[5,] 29 67 105 143 181
outer(B[,1], B[,2])
[,1] [,2]
[1,] 3 4
[2,] 6 8
D <- crossprod(B)
resEigen <- eigen(D)
resEigen$values
[1] 3.844798e+02 5.201834e-01 2.697369e-15 -4.583117e-15 -1.657516e-14
all.equal(resEigen$vectors%*%
diag(resEigen$values)%*%
t(resEigen$vectors),
D)
[1] TRUE
ncol(B); nrow(B); dim(B)
[1] 5
[1] 2
[1] 2 5
mode(B); mode(C)
[1] "numeric"
[1] "character"
rowSums(B); colSums(B)
[1] 25 30
[1] 3 7 11 15 19
colnames(B)
NULL
rownames(B) <- 1:nrow(B); rownames(B)
[1] "1" "2"
apply(B, 1, sum); apply(B, 2, sum)
1 2
25 30
[1] 3 7 11 15 19
apply(B, 2, as.character)
[,1] [,2] [,3] [,4] [,5]
[1,] "1" "3" "5" "7" "9"
[2,] "2" "4" "6" "8" "10"
sweep(B, 1, rep(2,nrow(B)), "+")
[,1] [,2] [,3] [,4] [,5]
1 3 5 7 9 11
2 4 6 8 10 12
sweep(B, 1, rowSums(B), "/")
[,1] [,2] [,3] [,4] [,5]
1 0.04000000 0.1200000 0.2 0.2800000 0.3600000
2 0.06666667 0.1333333 0.2 0.2666667 0.3333333
array is similar to matrix but can have more than 2 dimensions
E <- array(1:20, c(2,5,2)); E
, , 1
[,1] [,2] [,3] [,4] [,5]
[1,] 1 3 5 7 9
[2,] 2 4 6 8 10
, , 2
[,1] [,2] [,3] [,4] [,5]
[1,] 11 13 15 17 19
[2,] 12 14 16 18 20
vect <- 1:6
From vect, obtain each of the following two matrices in one command line:
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 2 2 2
[3,] 3 3 3
[4,] 4 4 4
[5,] 5 5 5
[6,] 6 6 6
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1 2 3 4 5 6
[2,] 1 2 3 4 5 6
[3,] 1 2 3 4 5 6
data(Titanic)
cont <- t(apply(Titanic, c(1,4), sum)); cont
Class
Survived 1st 2nd 3rd Crew
No 122 167 528 673
Yes 203 118 178 212
Find the ratio of survivors in each class in one command line.
data.frame
data.frame objects are two-dimensional tables whose columns can have different modes. They are well suited to storing several observations (rows) of various variables (columns).
df <- data.frame(nom=c("Nathalie",
"Madalina",
"Nicolas"),
age=c(38,34,31))
df
nom age
1 Nathalie 38
2 Madalina 34
3 Nicolas 31
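Unlike matrices, the columns of a data.frame need not all have the same class. (The output below is from R < 4.0.0, where strings were converted to factors by default; since R 4.0.0, stringsAsFactors = FALSE is the default and nom is reported as "character".)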
class(df); sapply(df, class)
[1] "data.frame"
nom age
"factor" "numeric"
Columns can be extracted by number or name
names(df)
[1] "nom" "age"
df[,2]; df$age
[1] 38 34 31
[1] 38 34 31
df[1,]
nom age
1 Nathalie 38
df[df$age>32,]; subset(df, age>32, c("nom"))
nom age
1 Nathalie 38
2 Madalina 34
nom
1 Nathalie
2 Madalina
data(iris);
dim(iris); names(iris); sapply(iris,class)
[1] 150 5
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
[5] "Species"
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
"numeric" "numeric" "numeric" "numeric" "factor"
summary(iris[,3:5])
Petal.Length Petal.Width Species
Min. :1.000 Min. :0.100 setosa :50
1st Qu.:1.600 1st Qu.:0.300 versicolor:50
Median :4.350 Median :1.300 virginica :50
Mean :3.758 Mean :1.199
3rd Qu.:5.100 3rd Qu.:1.800
Max. :6.900 Max. :2.500
min(iris$Sepal.Width); max(iris$Sepal.Width)
[1] 2
[1] 4.4
mean(iris$Sepal.Width)
[1] 3.057333
median(iris$Sepal.Width)
[1] 3
range(iris$Sepal.Width)
[1] 2.0 4.4
sd(iris$Sepal.Width); var(iris$Sepal.Width)
[1] 0.4358663
[1] 0.1899794
quantile(iris$Sepal.Width, c(0,0.25,0.5,0.75,1))
0% 25% 50% 75% 100%
2.0 2.8 3.0 3.3 4.4
table(iris$Species)
setosa versicolor virginica
50 50 50
iris$Large <- factor(iris$Sepal.Width>3)
table(iris$Species,iris$Large)
FALSE TRUE
setosa 8 42
versicolor 42 8
virginica 33 17
apply(iris[,1:4], 2, mean)
Sepal.Length Sepal.Width Petal.Length Petal.Width
5.843333 3.057333 3.758000 1.199333
s.iris <- sweep(iris[,1:4], 2,
apply(iris[,1:4],2,mean),
"-")
head(s.iris[,1:2], 3)
Sepal.Length Sepal.Width
1 -0.7433333 0.44266667
2 -0.9433333 -0.05733333
3 -1.1433333 0.14266667
apply(s.iris, 2, mean)
Sepal.Length Sepal.Width Petal.Length Petal.Width
-3.666902e-16 9.169893e-17 -3.158931e-17 -3.359437e-17
s2.iris <- scale(iris[,1:2])
apply(s2.iris,2,mean); apply(s2.iris,2,sd)
Sepal.Length Sepal.Width
-4.484318e-16 2.034094e-16
Sepal.Length Sepal.Width
1 1
by(iris$Sepal.Length, iris$Species, mean)
iris$Species: setosa
[1] 5.006
--------------------------------------------------------
iris$Species: versicolor
[1] 5.936
--------------------------------------------------------
iris$Species: virginica
[1] 6.588
tapply(iris$Sepal.Length, iris$Species, sd)
setosa versicolor virginica
0.3524897 0.5161711 0.6358796
head(tapply(iris$Sepal.Length, iris$Species,
quantile, probs=c(0.25,0.75)), 2)
$setosa
25% 75%
4.8 5.2
$versicolor
25% 75%
5.6 6.3
data(airquality)
What are the dimensions of the data?
What are the variables included in the data? What are their types?
How can you obtain a summary of the data?
How many days have a temperature larger than 80°F? What are the average Ozone rate and Wind speed for these days?
What is the average Ozone rate for each month?
list
list is a collection of heterogeneous objects:
a.list <- list("name"="a list", "matrix"=B,
"dataframe"=df); a.list
$name
[1] "a list"
$matrix
[,1] [,2] [,3] [,4] [,5]
1 1 3 5 7 9
2 2 4 6 8 10
$dataframe
nom age
1 Nathalie 38
2 Madalina 34
3 Nicolas 31
class(a.list)
[1] "list"
length(a.list)
[1] 3
names(a.list)
[1] "name" "matrix" "dataframe"
lapply(a.list, class)
$name
[1] "character"
$matrix
[1] "matrix"
$dataframe
[1] "data.frame"
lapply(a.list, length)
$name
[1] 1
$matrix
[1] 10
$dataframe
[1] 2