This file gives the answer to the document “Practical statistical analysis of RNA-Seq data” using the R packages edgeR (version 3.6.8), limma (version 3.20.7), mixOmics (version 5.0-1), RColorBrewer(version 1.0-5) and HTSFilter (version 1.6.0).

library(edgeR)

## Loading required package: limma

library(limma)
library(RColorBrewer)
library(mixOmics)

## Loading required package: MASS
## Loading required package: lattice

library(HTSFilter)

## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

4.0. Starting from count table

Properly set the directory from which files are imported:

directory <- "RNAseq_data/count_table_files/"
dir(directory)

## [1] "count_table.tsv"    "pasilla_design.txt"

Exercise 3.1 Read the files:

rawCountTable <- read.table(paste0(directory,"count_table.tsv"), header=TRUE,
                            row.names=1)
sampleInfo <- read.table(paste0(directory,"pasilla_design.txt"), header=TRUE,
                         row.names=1)

Exercise 3.2 Have a look at the count data:

head(rawCountTable)

##             untreated1 untreated2 untreated3 untreated4 treated1 treated2
## FBgn0000003          0          0          0          0        0        0
## FBgn0000008         92        161         76         70      140       88
## FBgn0000014          5          1          0          0        4        0
## FBgn0000015          0          2          1          2        1        0
## FBgn0000017       4664       8714       3564       3150     6205     3072
## FBgn0000018        583        761        245        310      722      299
##             treated3
## FBgn0000003        1
## FBgn0000008       70
## FBgn0000014        0
## FBgn0000015        0
## FBgn0000017     3334
## FBgn0000018      308

nrow(rawCountTable)

## [1] 14599

14599 genes are included in this file.

Exercise 3.3 Have a look at the sample information and order the count table in the same way that the sample information:

sampleInfo

##                   type number.of.lanes total.number.of.reads exon.counts
## treated1   single-read               5              35158667    15679615
## treated2    paired-end               2         12242535 (x2)    15620018
## treated3    paired-end               2         12443664 (x2)    12733865
## untreated1 single-read               2              17812866    14924838
## untreated2 single-read               6              34284521    20764558
## untreated3  paired-end               2         10542625 (x2)    10283129
## untreated4  paired-end               2         12214974 (x2)    11653031

rawCountTable <- rawCountTable[,match(rownames(sampleInfo),
                                      colnames(rawCountTable))]
head(rawCountTable)

##             treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003        0        0        1          0          0          0
## FBgn0000008      140       88       70         92        161         76
## FBgn0000014        4        0        0          5          1          0
## FBgn0000015        1        0        0          0          2          1
## FBgn0000017     6205     3072     3334       4664       8714       3564
## FBgn0000018      722      299      308        583        761        245
##             untreated4
## FBgn0000003          0
## FBgn0000008         70
## FBgn0000014          0
## FBgn0000015          2
## FBgn0000017       3150
## FBgn0000018        310

Exercise 3.4 Create the ‘condition’ column

sampleInfo$condition <- substr(rownames(sampleInfo), 1,
                               nchar(rownames(sampleInfo))-1)
sampleInfo$condition[sampleInfo$condition=="untreated"] <- "control"
sampleInfo

##                   type number.of.lanes total.number.of.reads exon.counts
## treated1   single-read               5              35158667    15679615
## treated2    paired-end               2         12242535 (x2)    15620018
## treated3    paired-end               2         12443664 (x2)    12733865
## untreated1 single-read               2              17812866    14924838
## untreated2 single-read               6              34284521    20764558
## untreated3  paired-end               2         10542625 (x2)    10283129
## untreated4  paired-end               2         12214974 (x2)    11653031
##            condition
## treated1     treated
## treated2     treated
## treated3     treated
## untreated1   control
## untreated2   control
## untreated3   control
## untreated4   control

4.1. Starting from count table

Exercise 4.1 Create a DGEList data object

dgeFull <- DGEList(rawCountTable, group=sampleInfo$condition)
dgeFull

## An object of class "DGEList"
## $counts
##             treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003        0        0        1          0          0          0
## FBgn0000008      140       88       70         92        161         76
## FBgn0000014        4        0        0          5          1          0
## FBgn0000015        1        0        0          0          2          1
## FBgn0000017     6205     3072     3334       4664       8714       3564
##             untreated4
## FBgn0000003          0
## FBgn0000008         70
## FBgn0000014          0
## FBgn0000015          2
## FBgn0000017       3150
## 14594 more rows ...
## 
## $samples
##              group lib.size norm.factors
## treated1   treated 18670279            1
## treated2   treated  9571826            1
## treated3   treated 10343856            1
## untreated1 control 13972512            1
## untreated2 control 21911438            1
## untreated3 control  8358426            1
## untreated4 control  9841335            1

Exercise 4.2 Add the sample information object in the DGEList data

dgeFull$sampleInfo <- sampleInfo
dgeFull

## An object of class "DGEList"
## $counts
##             treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003        0        0        1          0          0          0
## FBgn0000008      140       88       70         92        161         76
## FBgn0000014        4        0        0          5          1          0
## FBgn0000015        1        0        0          0          2          1
## FBgn0000017     6205     3072     3334       4664       8714       3564
##             untreated4
## FBgn0000003          0
## FBgn0000008         70
## FBgn0000014          0
## FBgn0000015          2
## FBgn0000017       3150
## 14594 more rows ...
## 
## $samples
##              group lib.size norm.factors
## treated1   treated 18670279            1
## treated2   treated  9571826            1
## treated3   treated 10343856            1
## untreated1 control 13972512            1
## untreated2 control 21911438            1
## untreated3 control  8358426            1
## untreated4 control  9841335            1
## 
## $sampleInfo
##                   type number.of.lanes total.number.of.reads exon.counts
## treated1   single-read               5              35158667    15679615
## treated2    paired-end               2         12242535 (x2)    15620018
## treated3    paired-end               2         12443664 (x2)    12733865
## untreated1 single-read               2              17812866    14924838
## untreated2 single-read               6              34284521    20764558
## untreated3  paired-end               2         10542625 (x2)    10283129
## untreated4  paired-end               2         12214974 (x2)    11653031
##            condition
## treated1     treated
## treated2     treated
## treated3     treated
## untreated1   control
## untreated2   control
## untreated3   control
## untreated4   control

4.2. Starting from separate files

directory <- "RNAseq_data/separate_files/"
dir(directory)

## [1] "pasilla_design.txt" "treated1fb.txt"     "treated2fb.txt"    
## [4] "treated3fb.txt"     "untreated1fb.txt"   "untreated2fb.txt"  
## [7] "untreated3fb.txt"   "untreated4fb.txt"

Exercise 4.3 Reading the passilla design

fileInfo <- read.table(paste0(directory, "pasilla_design.txt"), header=TRUE)
fileInfo

##              files        type number.of.lanes total.number.of.reads
## 1   treated1fb.txt single-read               5              35158667
## 2   treated2fb.txt  paired-end               2         12242535 (x2)
## 3   treated3fb.txt  paired-end               2         12443664 (x2)
## 4 untreated1fb.txt single-read               2              17812866
## 5 untreated2fb.txt single-read               6              34284521
## 6 untreated3fb.txt  paired-end               2         10542625 (x2)
## 7 untreated4fb.txt  paired-end               2         12214974 (x2)
##   exon.counts
## 1    15679615
## 2    15620018
## 3    12733865
## 4    14924838
## 5    20764558
## 6    10283129
## 7    11653031

Exercise 4.4 Create an additional column for the groups:

fileInfo$group <- substr(rownames(fileInfo), 1, nchar(rownames(fileInfo))-7)
fileInfo$group[fileInfo$group=="untreated"] <- "control"
fileInfo

##              files        type number.of.lanes total.number.of.reads
## 1   treated1fb.txt single-read               5              35158667
## 2   treated2fb.txt  paired-end               2         12242535 (x2)
## 3   treated3fb.txt  paired-end               2         12443664 (x2)
## 4 untreated1fb.txt single-read               2              17812866
## 5 untreated2fb.txt single-read               6              34284521
## 6 untreated3fb.txt  paired-end               2         10542625 (x2)
## 7 untreated4fb.txt  paired-end               2         12214974 (x2)
##   exon.counts group
## 1    15679615      
## 2    15620018      
## 3    12733865      
## 4    14924838      
## 5    20764558      
## 6    10283129      
## 7    11653031

Exercise 4.5 Import data from separate files with readDGE:

dgeHTSeq <- readDGE(fileInfo, path=directory)
dgeHTSeq

## An object of class "DGEList"
## $samples
##              files        type number.of.lanes total.number.of.reads
## 1   treated1fb.txt single-read               5              35158667
## 2   treated2fb.txt  paired-end               2         12242535 (x2)
## 3   treated3fb.txt  paired-end               2         12443664 (x2)
## 4 untreated1fb.txt single-read               2              17812866
## 5 untreated2fb.txt single-read               6              34284521
## 6 untreated3fb.txt  paired-end               2         10542625 (x2)
## 7 untreated4fb.txt  paired-end               2         12214974 (x2)
##   exon.counts group lib.size norm.factors
## 1    15679615       88834542            1
## 2    15620018       22381538            1
## 3    12733865       22573930            1
## 4    14924838       37094861            1
## 5    20764558       66650218            1
## 6    10283129       19318565            1
## 7    11653031       20196315            1
## 
## $counts
##                 1 2 3 4 5 6 7
## FBgn0000008:001 0 0 0 0 0 0 0
## FBgn0000008:002 0 0 0 0 0 1 0
## FBgn0000008:003 0 1 0 1 1 1 0
## FBgn0000008:004 1 0 1 0 1 0 1
## FBgn0000008:005 4 1 1 2 2 0 1
## 70461 more rows ...

4.3 Preparing the data object for the analysis of interest

Exercise 4.6 Select the subset paired-end samples from degFull

dge <- DGEList(dgeFull$counts[,dgeFull$sampleInfo$type=="paired-end"],
               group=dgeFull$sampleInfo$condition[
                 dgeFull$sampleInfo$type=="paired-end"])
dge$sampleInfo <- dgeFull$sampleInfo[dgeFull$sampleInfo$type=="paired-end",]

4.4 Data exploration and quality assessment

Exercise 4.7 Extract pseudo-counts (ie \(\log_2(K+1)\))

pseudoCounts <- log2(dge$counts+1)
head(pseudoCounts)

##             treated2 treated3 untreated3 untreated4
## FBgn0000003    0.000    1.000      0.000      0.000
## FBgn0000008    6.476    6.150      6.267      6.150
## FBgn0000014    0.000    0.000      0.000      0.000
## FBgn0000015    0.000    0.000      1.000      1.585
## FBgn0000017   11.585   11.703     11.800     11.622
## FBgn0000018    8.229    8.271      7.943      8.281

Exercise 4.8 Histogram for pseudo-counts (sample treated2)

hist(pseudoCounts[,"treated2"])

plot of chunk histoPseudoCounts

Exercise 4.9 Boxplot for pseudo-counts

boxplot(pseudoCounts, col="gray")

plot of chunk boxplotPseudoCounts

Exercise 4.10 MA-plots between control or treated samples

par(mfrow=c(1,2))
## treated2 vs treated3
# A values
avalues <- (pseudoCounts[,1] + pseudoCounts[,2])/2
# M values
mvalues <- (pseudoCounts[,1] - pseudoCounts[,2])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="treated")
abline(h=0, col="red")
## untreated3 vs untreated4
# A values
avalues <- (pseudoCounts[,3] + pseudoCounts[,4])/2
# M values
mvalues <- (pseudoCounts[,3] - pseudoCounts[,4])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="control")
abline(h=0, col="red")

plot of chunk maPlotPseudoCounts

Exercise 4.11 MDS for pseudo-counts (using limma package)

plotMDS(pseudoCounts)

plot of chunk MDSPseudoCounts

Exercise 4.12 heatmap for pseudo-counts (using mixOmics package)

sampleDists <- as.matrix(dist(t(pseudoCounts)))
sampleDists

##            treated2 treated3 untreated3 untreated4
## treated2       0.00    60.43      75.96      70.65
## treated3      60.43     0.00      80.03      71.72
## untreated3    75.96    80.03       0.00      65.98
## untreated4    70.65    71.72      65.98       0.00

cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(16)
cim(sampleDists, col=cimColor, symkey=FALSE)

plot of chunk cimPseudoCounts

4.5 Differential expression analysis

Exercise 4.13 remove genes with zero counts for all samples

dge <- DGEList(dge$counts[apply(dge$counts, 1, sum) != 0, ],
               group=dge$sampleInfo$condition)
dge$sampleInfo <- dge$sampleInfo
head(dge$counts)

##             treated2 treated3 untreated3 untreated4
## FBgn0000003        0        1          0          0
## FBgn0000008       88       70         76         70
## FBgn0000015        0        0          1          2
## FBgn0000017     3072     3334       3564       3150
## FBgn0000018      299      308        245        310
## FBgn0000024        7        5          3          3

Exercise 4.14 estimate the normalization factors

dge <- calcNormFactors(dge)
dge$samples

##              group lib.size norm.factors
## treated2   treated  9571826       1.0081
## treated3   treated 10343856       1.0179
## untreated3 control  8358426       1.0041
## untreated4 control  9841335       0.9706

Exercise 4.15 estimate common and tagwise dispersion

dge <- estimateCommonDisp(dge)
dge <- estimateTagwiseDisp(dge)
dge

## An object of class "DGEList"
## $counts
##             treated2 treated3 untreated3 untreated4
## FBgn0000003        0        1          0          0
## FBgn0000008       88       70         76         70
## FBgn0000015        0        0          1          2
## FBgn0000017     3072     3334       3564       3150
## FBgn0000018      299      308        245        310
## 11496 more rows ...
## 
## $samples
##              group lib.size norm.factors
## treated2   treated  9571826       1.0081
## treated3   treated 10343856       1.0179
## untreated3 control  8358426       1.0041
## untreated4 control  9841335       0.9706
## 
## $common.dispersion
## [1] 0.004761
## 
## $pseudo.counts
##              treated2  treated3 untreated3 untreated4
## FBgn0000003 0.000e+00 9.154e-01  2.776e-17  2.776e-17
## FBgn0000008 8.671e+01 6.273e+01  8.564e+01  6.960e+01
## FBgn0000015 2.776e-17 2.776e-17  1.167e+00  1.990e+00
## FBgn0000017 3.025e+03 3.008e+03  4.032e+03  3.133e+03
## FBgn0000018 2.944e+02 2.777e+02  2.777e+02  3.083e+02
## 11496 more rows ...
## 
## $pseudo.lib.size
## [1] 9499789
## 
## $AveLogCPM
## [1] -2.083  3.036 -1.793  8.440  4.940
## 11496 more elements ...
## 
## $prior.n
## [1] 5
## 
## $tagwise.dispersion
## [1] 7.439e-05 8.684e-03 7.439e-05 6.891e-03 3.601e-03
## 11496 more elements ...

Exercise 4.16 perform an exact test for the difference in expression between the two conditions “treated” and “control”

dgeTest <- exactTest(dge)
dgeTest

## An object of class "DGEExact"
## $table
##                logFC logCPM  PValue
## FBgn0000003  2.25662 -2.083 1.00000
## FBgn0000008 -0.05492  3.036 0.82398
## FBgn0000015 -3.78099 -1.793 0.25001
## FBgn0000017 -0.24803  8.440 0.04289
## FBgn0000018 -0.03655  4.940 0.78928
## 11496 more rows ...
## 
## $comparison
## [1] "control" "treated"
## 
## $genes
## NULL

4.6 Independant filtering

Exercise 4.17 remove low expressed genes

filtData <- HTSFilter(dge)$filteredData

plot of chunk filter

dgeTestFilt <- exactTest(filtData)
dgeTestFilt

## An object of class "DGEExact"
## $table
##                 logFC logCPM    PValue
## FBgn0000008 -0.054919  3.036 8.240e-01
## FBgn0000017 -0.248031  8.440 4.289e-02
## FBgn0000018 -0.036550  4.940 7.893e-01
## FBgn0000032  0.004668  6.172 9.729e-01
## FBgn0000042  0.516376 12.711 6.202e-08
## 6614 more rows ...
## 
## $comparison
## [1] "control" "treated"
## 
## $genes
## NULL

4.7 Diagnostic plot for multiple testing

Exercise 4.18 plot a histogram of unadjusted p-values

hist(dgeTest$table[,"PValue"], breaks=50)

plot of chunk histogramPVal

Exercise 4.19 plot a histogram of unadjusted p-values after filtering

hist(dgeTestFilt$table[,"PValue"], breaks=50)

plot of chunk histogramFDR

4.8 Inspecting the results

Exercise 4.20 extract a summary of the differential expression statistics

resNoFilt <- topTags(dgeTest, n=nrow(dgeTest$table))
head(resNoFilt)

## Comparison of groups:  treated-control 
##              logFC logCPM     PValue        FDR
## FBgn0039155 -4.378  5.588 4.293e-184 4.937e-180
## FBgn0025111  2.943  7.159 2.758e-152 1.586e-148
## FBgn0003360 -2.961  8.059 1.939e-151 7.432e-148
## FBgn0039827 -4.129  4.281 5.594e-104 1.608e-100
## FBgn0026562 -2.447 11.903 2.260e-102  5.198e-99
## FBgn0035085 -2.499  5.542  1.241e-96  2.379e-93

resFilt <- topTags(dgeTestFilt, n=nrow(dgeTest$table))
head(resFilt)

## Comparison of groups:  treated-control 
##              logFC logCPM     PValue        FDR
## FBgn0039155 -4.378  5.588 4.293e-184 2.841e-180
## FBgn0025111  2.943  7.159 2.758e-152 9.128e-149
## FBgn0003360 -2.961  8.059 1.939e-151 4.277e-148
## FBgn0039827 -4.129  4.281 5.594e-104 9.257e-101
## FBgn0026562 -2.447 11.903 2.260e-102  2.992e-99
## FBgn0035085 -2.499  5.542  1.241e-96  1.369e-93

Exercise 4.21 compare the number of differentially expressed genes with and without filtering

# before independent filtering
sum(resNoFilt$table$FDR < 0.05)

## [1] 1347

# after independent filtering
sum(resFilt$table$FDR < 0.05)

## [1] 1363

Exercise 4.22 extract and sort differentially expressed genes

sigDownReg <- resFilt$table[resFilt$table$FDR<0.05,]
sigDownReg <- sigDownReg[order(sigDownReg$logFC),]
head(sigDownReg)

##              logFC logCPM     PValue        FDR
## FBgn0085359 -5.153  1.966  2.843e-26  3.764e-24
## FBgn0039155 -4.378  5.588 4.293e-184 2.841e-180
## FBgn0024288 -4.208  2.161  1.359e-33  2.645e-31
## FBgn0039827 -4.129  4.281 5.594e-104 9.257e-101
## FBgn0034434 -3.824  3.107  1.732e-52  7.644e-50
## FBgn0034736 -3.482  4.060  5.794e-68  3.196e-65

sigUpReg <- sigDownReg[order(sigDownReg$logFC, decreasing=TRUE),]
head(sigUpReg)

##             logFC logCPM     PValue        FDR
## FBgn0033764 3.268  2.612  3.224e-29  4.743e-27
## FBgn0035189 2.973  4.427  7.154e-48  2.492e-45
## FBgn0025111 2.943  7.159 2.758e-152 9.128e-149
## FBgn0037290 2.935  2.523  1.192e-25  1.461e-23
## FBgn0038198 2.670  2.587  4.163e-19  3.167e-17
## FBgn0000071 2.565  5.034  1.711e-78  1.416e-75

Exercise 4.24 write the results in csv files

write.csv(sigDownReg, file="sigDownReg.csv")
write.csv(sigUpReg, file="sigUpReg.csv")

4.9 Interpreting the DE analysis results

Exercise 4.25 create a MA plot with 1% differentially expressed genes

plotSmear(dgeTestFilt,
          de.tags = rownames(resFilt$table)[which(resFilt$table$FDR<0.01)])

plot of chunk MADEG

Exercise 4.26 create a Volcano plot

volcanoData <- cbind(resFilt$table$logFC, -log10(resFilt$table$FDR))
colnames(volcanoData) <- c("logFC", "negLogPval")
head(volcanoData)

##       logFC negLogPval
## [1,] -4.378     179.55
## [2,]  2.943     148.04
## [3,] -2.961     147.37
## [4,] -4.129     100.03
## [5,] -2.447      98.52
## [6,] -2.499      92.86

plot(volcanoData, pch=19)

plot of chunk volcanoPlot

Exercise 4.27 transform the normalized counts in log-counts-per-million

y <- cpm(dge, log=TRUE, prior.count = 1)
head(y)

##             treated2 treated3 untreated3 untreated4
## FBgn0000003  -3.2526  -2.3226     -3.253     -3.253
## FBgn0000008   3.2056   2.7556      3.195      2.894
## FBgn0000015  -3.2526  -3.2526     -2.158     -1.670
## FBgn0000017   8.3151   8.3073      8.730      8.366
## FBgn0000018   4.9585   4.8757      4.873      5.025
## FBgn0000024  -0.2681  -0.7863     -1.113     -1.255

Exercise 4.28 select 1% differentially expressed genes and produce a heatmap

selY <- y[rownames(resFilt$table)[resFilt$table$FDR<0.01 & 
                                    abs(resFilt$table$logFC)>1.5],]
head(selY)

##             treated2 treated3 untreated3 untreated4
## FBgn0039155   2.0155    2.335      6.466      6.608
## FBgn0025111   7.9476    7.996      5.059      5.006
## FBgn0003360   5.9800    5.878      8.802      8.970
## FBgn0039827   0.6377    1.516      5.252      5.212
## FBgn0026562  10.1798   10.247     12.544     12.769
## FBgn0035085   3.8172    3.843      6.279      6.363

cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)[255:1]
cim(t(selY), col=cimColor, symkey=FALSE)

plot of chunk selectDEHeatmap

Practical statistical analysis of RNA-Seq data - edgeR

Annick Moisan, Ignacio Gonzales, Nathalie Villa-Vialaneix

14/10/2014