voom
+ limma
)A recent study by McClain et al. analyzed the host response to SARS-CoV-2 infection through RNA sequencing of peripheral blood samples from subjects with COVID-19, comparing them to subjects with seasonal coronavirus, influenza, bacterial pneumonia, and healthy controls. Here, we will use the data generated by this study, publicly available at GEO under accession GSE161731, to perform a differential gene expression (DGE) analysis between conditions in R.
Make sure you have R (>=4.0.0
) and Rstudio installed. Then install the following packages:
install.packages("BiocManager") # From CRAN
BiocManager::install(version = "3.12", update = FALSE)
options(timeout = 600) # Allow large package download
BiocManager::install(c("edgeR", "topGO", "org.Hs.eg.db", # From Bioconductor
"factoextra", "pheatmap" # and CRAN
), update = FALSE)
Load them in R:
library(edgeR) # edgeR loads limma
library(factoextra)
library(pheatmap)
library(topGO)
library(org.Hs.eg.db)
And set your working directory:
# e.g. in Windows
setwd("C:/Users/User/Desktop") # Substitute 'User' by your actual username
files_to_download <- c("GSE161731_counts.csv.gz", # Gene expression (count matrix)
"GSE161731_counts_key.csv.gz") # Metadata
# Download them
for (f in files_to_download) {
url <- paste0("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE161nnn/GSE161731/suppl/", f)
download.file(url, destfile = f)
}
# Load both files in R
counts <- read.csv("GSE161731_counts.csv.gz",
header = TRUE, check.names = FALSE, row.names = 1)
metadata <- read.csv("GSE161731_counts_key.csv.gz",
header = TRUE, check.names = FALSE, row.names = 1)
Have a look at counts
:
counts[1:5, 1:5]
dim(counts)
94189 | DU09-03S0000604 | DU09-03S0000611 | 105920 | DU09-03S0000774 | |
---|---|---|---|---|---|
ENSG00000223972 | 0 | 0 | 3 | 49 | 6 |
ENSG00000227232 | 393 | 142 | 152 | 246 | 586 |
ENSG00000278267 | 0 | 22 | 29 | 44 | 104 |
ENSG00000243485 | 0 | 0 | 0 | 0 | 1 |
ENSG00000274890 | 0 | 0 | 0 | 0 | 0 |
## [1] 60675 201
Have a look at metadata
:
head(metadata)
dim(metadata)
subject_id | age | gender | race | cohort | time_since_onset | hospitalized | batch | |
---|---|---|---|---|---|---|---|---|
94189 | A1BD46 | 57 | Female | Black/African American | Bacterial | NA | NA | 1 |
DU09-03S0000604 | 44DF6B | 19 | Male | Black/African American | Influenza | NA | NA | 2 |
DU09-03S0000611 | 658A11 | 14 | Male | White | Influenza | NA | NA | 2 |
105920 | 61DE97 | 21 | Female | White | Influenza | NA | NA | 1 |
DU09-03S0000774 | 4D4F7C | 50 | Female | Black/African American | Influenza | NA | NA | 1 |
DU09-03S0000775 | C4D511 | 39 | Female | Black/African American | Influenza | NA | NA | 1 |
## [1] 198 8
Before doing any further analysis, we should clean the data up. Among others:
Remove duplicated measurements:
# Count samples per condition
table(metadata$cohort)
##
## Bacterial CoV other COVID-19 healthy Influenza
## 24 61 77 19 17
# Some individuals have replicated measurements
table(table(metadata$subject_id))
##
## 1 2 3 7
## 131 9 14 1
metadata <- metadata[!duplicated(metadata$subject_id), ] # remove them
table(metadata$cohort)
##
## Bacterial CoV other COVID-19 healthy Influenza
## 24 50 46 18 17
counts <- counts[, rownames(metadata)] # Subset count matrix accordingly
Ensure that the variables are of the appropriate class:
# Convert age to numeric
class(metadata$age)
## [1] "character"
metadata$age <- gsub(">89", "90", metadata$age)
metadata$age <- as.numeric(as.character(metadata$age))
class(metadata$age)
## [1] "numeric"
Metadata cleanup:
# Remove/modify some characters in the "cohort" and "race" columns
cols <- c("cohort", "race")
metadata[, cols] <- apply(metadata[, cols], 2, function(x){
y <- gsub("-", "", x) # Remove "-"
z <- gsub(" ", "_", y) # Change " " (space) by "_"
w <- gsub("/", "_", z) # Change "/" by "_"
return(w)
})
# Rename rows of 'metadata' (columns of 'counts') to individual id
identical(rownames(metadata), colnames(counts))
## [1] TRUE
rownames(metadata) <- metadata$subject_id
colnames(counts) <- metadata$subject_id
# Remove some columns from 'metadata'
metadata[, c("subject_id", "time_since_onset", "hospitalized")] <- NULL
Have a look (again) at counts
:
counts[1:5, 1:5]
dim(counts)
A1BD46 | 44DF6B | 658A11 | 61DE97 | 4D4F7C | |
---|---|---|---|---|---|
ENSG00000223972 | 0 | 0 | 3 | 49 | 6 |
ENSG00000227232 | 393 | 142 | 152 | 246 | 586 |
ENSG00000278267 | 0 | 22 | 29 | 44 | 104 |
ENSG00000243485 | 0 | 0 | 0 | 0 | 1 |
ENSG00000274890 | 0 | 0 | 0 | 0 | 0 |
## [1] 60675 155
Have a look (again) at metadata
:
head(metadata)
dim(metadata)
age | gender | race | cohort | batch | |
---|---|---|---|---|---|
A1BD46 | 57 | Female | Black_African_American | Bacterial | 1 |
44DF6B | 19 | Male | Black_African_American | Influenza | 2 |
658A11 | 14 | Male | White | Influenza | 2 |
61DE97 | 21 | Female | White | Influenza | 1 |
4D4F7C | 50 | Female | Black_African_American | Influenza | 1 |
C4D511 | 39 | Female | Black_African_American | Influenza | 1 |
## [1] 155 5
Subset the conditions of interest. As an example, here we will use the ‘Bacterial’ and ‘Influenza’ groups.
metadata <- subset(metadata, cohort %in% c("Bacterial", "Influenza"))
counts <- counts[, rownames(metadata)]
Prior to differential gene expression analysis, the count matrix should be normalized to take into account the library size (e.g. in an experiment A with a high number of mapped reads, a gene will get more reads than in an experiment B with a smaller number of mapped reads). Lowly expressed genes should also be removed.
dge <- DGEList(counts) # Create DGEList object
class(dge)
## [1] "DGEList"
## attr(,"package")
## [1] "edgeR"
dge$counts[1:5, 1:5]
## A1BD46 44DF6B 658A11 61DE97 4D4F7C
## ENSG00000223972 0 0 3 49 6
## ENSG00000227232 393 142 152 246 586
## ENSG00000278267 0 22 29 44 104
## ENSG00000243485 0 0 0 0 1
## ENSG00000274890 0 0 0 0 0
head(dge$samples)
## group lib.size norm.factors
## A1BD46 1 52431553 1
## 44DF6B 1 50351794 1
## 658A11 1 52172232 1
## 61DE97 1 53276313 1
## 4D4F7C 1 43639732 1
## C4D511 1 56439096 1
# CPM (Counts Per Million Reads) - corrects for library size
CPM <- cpm(dge)
class(CPM)
## [1] "matrix" "array"
CPM[1:5, 1:5]
## A1BD46 44DF6B 658A11 61DE97 4D4F7C
## ENSG00000223972 0.000000 0.0000000 0.05750185 0.9197333 0.1374894
## ENSG00000227232 7.495487 2.8201577 2.91342720 4.6174366 13.4281301
## ENSG00000278267 0.000000 0.4369258 0.55585124 0.8258830 2.3831494
## ENSG00000243485 0.000000 0.0000000 0.00000000 0.0000000 0.0229149
## ENSG00000274890 0.000000 0.0000000 0.00000000 0.0000000 0.0000000
# Remove lowly expressed genes (e.g median CPM < 0.2)
drop <- which(apply(CPM, 1, median) < 0.2)
dge <- dge[-drop, ]
CPM <- CPM[-drop, ]
dim(CPM) # Number of genes left = dim(dge)
## [1] 16415 41
PCA <- prcomp(t(CPM), scale = T)
fviz_pca_ind(PCA,
habillage = metadata$cohort,
# try others e.g. metadata$race, metadata$gender
# for numerical variables e.g. 'age', use instead: col.ind = metadata$age
pointsize = 3)
pheatmap(log2(CPM + 1),
show_rownames = F,
annotation_col = metadata)
Remove outlier individuals (896282
). Recompute metadata
, counts
, dge
and CPM
:
metadata <- metadata[-which(rownames(metadata) == "896282"), ]
counts <- counts[, -which(colnames(counts) == "896282")]
dge <- DGEList(counts)
CPM <- cpm(dge)
drop <- which(apply(CPM, 1, median) < 0.2)
dge <- dge[-drop, ]
CPM <- CPM[-drop, ]
voom
+ limma
)# Specify model (include age to correct)
age <- metadata$age
cohort <- metadata$cohort
mm <- model.matrix(~0 + age + cohort)
# Learn voom weigths
y <- voom(dge, mm, plot = T)
What is voom
doing? (more details here)
Counts are transformed to log2(CPM)
A linear model is fitted to the log2(CPM) for each gene, and the residuals are calculated.
A smoothed curve is fitted to the sqrt(residual standard deviation) by average expression (see red line in plot above).
The smoothed curve is used to obtain weights for each gene and sample that are passed into limma
along with the log2(CPM).
Now limma
fits a linear model using weighted least squares for each gene:
fit <- lmFit(y, mm)
# Comparisons between groups (log fold-changes) are obtained as contrasts of these fitted linear models
contr <- makeContrasts(cohortBacterial - cohortInfluenza, levels = colnames(coef(fit)))
# Estimate contrast for each gene
tmp <- contrasts.fit(fit, contr)
# Empirical Bayes smoothing of standard errors
# (shrinks standard errors that are much larger or smaller than those
# from other genes towards the average standard error,
# see https://www.degruyter.com/doi/10.2202/1544-6115.1027)
tmp <- eBayes(tmp)
results <- topTable(tmp, sort.by = "P", n = Inf) # Get all results, ordered by P-value
Have a look at results
:
head(results)
dim(results)
logFC | AveExpr | t | P.Value | adj.P.Val | B | |
---|---|---|---|---|---|---|
ENSG00000179299 | 3.646705 | 3.639779 | 10.904972 | 0 | 0 | 21.74357 |
ENSG00000112299 | 4.943435 | 5.962736 | 10.503483 | 0 | 0 | 20.80718 |
ENSG00000111335 | -4.768375 | 7.500121 | -10.369014 | 0 | 0 | 20.40415 |
ENSG00000123119 | 7.773127 | -1.829586 | 10.302889 | 0 | 0 | 16.74393 |
ENSG00000007237 | 2.340649 | 8.401871 | 9.889617 | 0 | 0 | 18.98974 |
ENSG00000090376 | 3.353896 | 7.644722 | 9.783039 | 0 | 0 | 18.66466 |
## [1] 16571 6
logFC
: log2 fold change of Bacterial/Influenza
logFC
is positive: a gene has higher expression in ‘Bacterial’ than in ‘Influenza’.logFC
is negative: has lower expression in ‘Bacterial’ than in ‘Influenza’.AveExpr
: Average expression across all samples, in log2(CPM)t
: logFC
divided by its standard errorP.Value
: Raw p-value (based on t
) from test that logFC
differs from 0adj.P.Val
: Benjamini-Hochberg false discovery rate (multiple testing correction, FDR) adjusted p-valueB
: log-odds that gene is DE (less useful than the other columns)Volcano plot:
ggplot(results, aes(x = logFC, -log10(adj.P.Val))) +
geom_point()
How many DE genes are there? (5% FDR)
results.sig5 <- subset(results, adj.P.Val < 0.05)
dim(results.sig5)
## [1] 4303 6
Volcano plot (coloring top genes):
results$sel <- (results$adj.P.Val < 0.05 & abs(results$logFC) > 5)
ggplot(results, aes(x = logFC, -log10(adj.P.Val), color = sel)) +
geom_point()
Boxplots for the top 1 gene:
df <- data.frame(metadata, expression = log2(CPM[rownames(results.sig5)[1], ] +1 ))
ggplot(df, aes(x = cohort, y = expression, fill = cohort)) +
geom_boxplot() +
ylab("log2(CPM+1)") +
ggtitle("Top 1 gene expression vs 'cohort'") +
theme(plot.title = element_text(hjust = 0.5))
Heatmap/clustering (only DE genes):
pheatmap(log2(CPM[rownames(results.sig5), ] + 1 ),
show_rownames = F,
annotation_col = metadata[, "cohort", drop = F])
You can find out more about the differentially expressed genes and their protein products (if any) at GeneCards, Ensembl, Uniprot or Pubmed.
# topGO requires all genes as a named list of P-values, plus a function to select significant genes
allGenes <- results$adj.P.Val
names(allGenes) <- rownames(results)
# Create topGO object
GOdata <- new("topGOdata",
ontology = "BP",
allGenes = allGenes,
geneSel = function(p){p < 0.05}, # Define function
annotationFun = annFUN.org,
mapping = "org.Hs.eg.db",
ID = "ensembl")
resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")
# Results
GOres <- GenTable(GOdata, classicFisher = resultFisher, topNodes = 10)
Have a look at GOres
:
head(GOres)
GO.ID | Term | Annotated | Significant | Expected | classicFisher |
---|---|---|---|---|---|
GO:0006955 | immune response | 1762 | 732 | 534.87 | 1.9e-27 |
GO:0002376 | immune system process | 2506 | 957 | 760.72 | 2.0e-21 |
GO:0044419 | interspecies interaction between organis… | 1721 | 671 | 522.43 | 8.6e-17 |
GO:0009607 | response to biotic stimulus | 1143 | 472 | 346.97 | 9.8e-17 |
GO:0043207 | response to external biotic stimulus | 1119 | 462 | 339.68 | 2.3e-16 |
GO:0051707 | response to other organism | 1119 | 462 | 339.68 | 2.3e-16 |
You can learn more about Gene Ontology here.