Introduction

A recent study by McClain et al. analyzed the host response to SARS-CoV-2 infection through RNA sequencing of peripheral blood samples from subjects with COVID-19, comparing them to subjects with seasonal coronavirus, influenza, bacterial pneumonia, and healthy controls. Here, we will use the data generated by this study, publicly available at GEO under accession GSE161731, to perform a differential gene expression (DGE) analysis between conditions in R.

Before starting: install and load required R packages

Make sure you have R (>=4.0.0) and Rstudio installed. Then install the following packages:

install.packages("BiocManager")  # From CRAN

BiocManager::install(version = "3.12", update = FALSE)

options(timeout = 600) # Allow large package download

BiocManager::install(c("edgeR", "topGO", "org.Hs.eg.db",  # From Bioconductor
                       "factoextra", "pheatmap"           # and CRAN
                       ), update = FALSE)

Load them in R:

library(edgeR)  # edgeR loads limma 
library(factoextra)
library(pheatmap)
library(topGO)
library(org.Hs.eg.db)

And set your working directory:

# e.g. in Windows
setwd("C:/Users/User/Desktop")  # Substitute 'User' by your actual username


Download RNA-seq data from GEO and load it into R

files_to_download <- c("GSE161731_counts.csv.gz",      # Gene expression (count matrix)
                       "GSE161731_counts_key.csv.gz")  # Metadata

# Download them
for (f in files_to_download) {
  url <- paste0("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE161nnn/GSE161731/suppl/", f)
  download.file(url, destfile = f)
}

# Load both files in R
counts <- read.csv("GSE161731_counts.csv.gz", 
                   header = TRUE, check.names = FALSE, row.names = 1)
metadata <- read.csv("GSE161731_counts_key.csv.gz", 
                     header = TRUE, check.names = FALSE, row.names = 1)

Have a look at counts:

counts[1:5, 1:5]
dim(counts)
94189 DU09-03S0000604 DU09-03S0000611 105920 DU09-03S0000774
ENSG00000223972 0 0 3 49 6
ENSG00000227232 393 142 152 246 586
ENSG00000278267 0 22 29 44 104
ENSG00000243485 0 0 0 0 1
ENSG00000274890 0 0 0 0 0
## [1] 60675   201

Have a look at metadata:

head(metadata)
dim(metadata)
subject_id age gender race cohort time_since_onset hospitalized batch
94189 A1BD46 57 Female Black/African American Bacterial NA NA 1
DU09-03S0000604 44DF6B 19 Male Black/African American Influenza NA NA 2
DU09-03S0000611 658A11 14 Male White Influenza NA NA 2
105920 61DE97 21 Female White Influenza NA NA 1
DU09-03S0000774 4D4F7C 50 Female Black/African American Influenza NA NA 1
DU09-03S0000775 C4D511 39 Female Black/African American Influenza NA NA 1
## [1] 198   8


Data preprocessing

Before doing any further analysis, we should clean the data up. Among others:

Remove duplicated measurements:

# Count samples per condition
table(metadata$cohort)
## 
## Bacterial CoV other  COVID-19   healthy Influenza 
##        24        61        77        19        17
# Some individuals have replicated measurements
table(table(metadata$subject_id)) 
## 
##   1   2   3   7 
## 131   9  14   1
metadata <- metadata[!duplicated(metadata$subject_id), ] # remove them
table(metadata$cohort)
## 
## Bacterial CoV other  COVID-19   healthy Influenza 
##        24        50        46        18        17
counts <- counts[, rownames(metadata)]  # Subset count matrix accordingly

Ensure that the variables are of the appropriate class:

# Convert age to numeric
class(metadata$age)
## [1] "character"
metadata$age <- gsub(">89", "90", metadata$age)
metadata$age <- as.numeric(as.character(metadata$age))
class(metadata$age)
## [1] "numeric"

Metadata cleanup:

# Remove/modify some characters in the "cohort" and "race" columns
cols <- c("cohort", "race")
metadata[, cols] <- apply(metadata[, cols], 2, function(x){
    y <- gsub("-", "", x)   # Remove "-" 
    z <- gsub(" ", "_", y)  # Change " " (space) by "_"
    w <- gsub("/", "_", z)  # Change "/" by "_"
    return(w)
})
  
# Rename rows of 'metadata' (columns of 'counts') to individual id
identical(rownames(metadata), colnames(counts))
## [1] TRUE
rownames(metadata) <- metadata$subject_id
colnames(counts) <- metadata$subject_id

# Remove some columns from 'metadata'
metadata[, c("subject_id", "time_since_onset", "hospitalized")]  <- NULL 

Have a look (again) at counts:

counts[1:5, 1:5]
dim(counts)
A1BD46 44DF6B 658A11 61DE97 4D4F7C
ENSG00000223972 0 0 3 49 6
ENSG00000227232 393 142 152 246 586
ENSG00000278267 0 22 29 44 104
ENSG00000243485 0 0 0 0 1
ENSG00000274890 0 0 0 0 0
## [1] 60675   155

Have a look (again) at metadata:

head(metadata)
dim(metadata)
age gender race cohort batch
A1BD46 57 Female Black_African_American Bacterial 1
44DF6B 19 Male Black_African_American Influenza 2
658A11 14 Male White Influenza 2
61DE97 21 Female White Influenza 1
4D4F7C 50 Female Black_African_American Influenza 1
C4D511 39 Female Black_African_American Influenza 1
## [1] 155   5

Subset the conditions of interest. As an example, here we will use the ‘Bacterial’ and ‘Influenza’ groups.

metadata <- subset(metadata, cohort %in% c("Bacterial", "Influenza"))
counts <- counts[, rownames(metadata)]


Gene expression normalization and removal of lowly expressed genes

Prior to differential gene expression analysis, the count matrix should be normalized to take into account the library size (e.g. in an experiment A with a high number of mapped reads, a gene will get more reads than in an experiment B with a smaller number of mapped reads). Lowly expressed genes should also be removed.

dge <- DGEList(counts)  # Create DGEList object
class(dge)
## [1] "DGEList"
## attr(,"package")
## [1] "edgeR"
dge$counts[1:5, 1:5]
##                 A1BD46 44DF6B 658A11 61DE97 4D4F7C
## ENSG00000223972      0      0      3     49      6
## ENSG00000227232    393    142    152    246    586
## ENSG00000278267      0     22     29     44    104
## ENSG00000243485      0      0      0      0      1
## ENSG00000274890      0      0      0      0      0
head(dge$samples)
##        group lib.size norm.factors
## A1BD46     1 52431553            1
## 44DF6B     1 50351794            1
## 658A11     1 52172232            1
## 61DE97     1 53276313            1
## 4D4F7C     1 43639732            1
## C4D511     1 56439096            1
# CPM (Counts Per Million Reads) - corrects for library size
CPM <- cpm(dge)
class(CPM)
## [1] "matrix" "array"
CPM[1:5, 1:5]
##                   A1BD46    44DF6B     658A11    61DE97     4D4F7C
## ENSG00000223972 0.000000 0.0000000 0.05750185 0.9197333  0.1374894
## ENSG00000227232 7.495487 2.8201577 2.91342720 4.6174366 13.4281301
## ENSG00000278267 0.000000 0.4369258 0.55585124 0.8258830  2.3831494
## ENSG00000243485 0.000000 0.0000000 0.00000000 0.0000000  0.0229149
## ENSG00000274890 0.000000 0.0000000 0.00000000 0.0000000  0.0000000
# Remove lowly expressed genes (e.g median CPM < 0.2)
drop <- which(apply(CPM, 1, median) < 0.2)
dge <- dge[-drop, ]
CPM <- CPM[-drop, ]
dim(CPM)  # Number of genes left = dim(dge)
## [1] 16415    41


Exploratory data analysis

Principal Component Analysis (PCA)

PCA <- prcomp(t(CPM), scale = T)
fviz_pca_ind(PCA, 
             habillage = metadata$cohort, 
             # try others e.g. metadata$race, metadata$gender
             # for numerical variables e.g. 'age', use instead: col.ind = metadata$age
             pointsize = 3) 

Clustering/Heatmap

pheatmap(log2(CPM + 1),
         show_rownames = F, 
         annotation_col = metadata)

Remove outlier individuals (896282). Recompute metadata, counts, dge and CPM:

metadata <- metadata[-which(rownames(metadata) == "896282"), ] 
counts <- counts[, -which(colnames(counts) == "896282")]
dge <- DGEList(counts)
CPM <- cpm(dge)
drop <- which(apply(CPM, 1, median) < 0.2)
dge <- dge[-drop, ]
CPM <- CPM[-drop, ]


Differential gene expression analysis (voom + limma)

# Specify model (include age to correct)
age <- metadata$age 
cohort <- metadata$cohort

mm <- model.matrix(~0 + age + cohort)

# Learn voom weigths
y <- voom(dge, mm, plot = T)

What is voom doing? (more details here)


Now limma fits a linear model using weighted least squares for each gene:

fit <- lmFit(y, mm)
  
# Comparisons between groups (log fold-changes) are obtained as contrasts of these fitted linear models
contr <- makeContrasts(cohortBacterial - cohortInfluenza, levels = colnames(coef(fit)))
  
# Estimate contrast for each gene
tmp <- contrasts.fit(fit, contr)
  
# Empirical Bayes smoothing of standard errors 
# (shrinks standard errors that are much larger or smaller than those 
# from other genes towards the average standard error,
# see https://www.degruyter.com/doi/10.2202/1544-6115.1027)
tmp <- eBayes(tmp)
  
results <- topTable(tmp, sort.by = "P", n = Inf) # Get all results, ordered by P-value

Have a look at results:

head(results)
dim(results)
logFC AveExpr t P.Value adj.P.Val B
ENSG00000179299 3.646705 3.639779 10.904972 0 0 21.74357
ENSG00000112299 4.943435 5.962736 10.503483 0 0 20.80718
ENSG00000111335 -4.768375 7.500121 -10.369014 0 0 20.40415
ENSG00000123119 7.773127 -1.829586 10.302889 0 0 16.74393
ENSG00000007237 2.340649 8.401871 9.889617 0 0 18.98974
ENSG00000090376 3.353896 7.644722 9.783039 0 0 18.66466
## [1] 16571     6

Volcano plot:

ggplot(results, aes(x = logFC, -log10(adj.P.Val))) + 
  geom_point() 

How many DE genes are there? (5% FDR)

results.sig5 <- subset(results, adj.P.Val < 0.05) 
dim(results.sig5)
## [1] 4303    6

Volcano plot (coloring top genes):

results$sel <- (results$adj.P.Val < 0.05 & abs(results$logFC) > 5)
ggplot(results, aes(x = logFC, -log10(adj.P.Val), color = sel)) + 
  geom_point() 

Boxplots for the top 1 gene:

df <- data.frame(metadata, expression = log2(CPM[rownames(results.sig5)[1], ] +1 ))
ggplot(df, aes(x = cohort, y = expression, fill = cohort)) +
  geom_boxplot() +
  ylab("log2(CPM+1)") +
  ggtitle("Top 1 gene expression vs 'cohort'") +
  theme(plot.title = element_text(hjust = 0.5))

Heatmap/clustering (only DE genes):

pheatmap(log2(CPM[rownames(results.sig5), ] + 1 ), 
         show_rownames = F, 
         annotation_col = metadata[, "cohort", drop = F])

You can find out more about the differentially expressed genes and their protein products (if any) at GeneCards, Ensembl, Uniprot or Pubmed.


Gene Ontology enrichment analysis

# topGO requires all genes as a named list of P-values, plus a function to select significant genes
allGenes <- results$adj.P.Val
names(allGenes) <- rownames(results)
  
# Create topGO object
GOdata <- new("topGOdata",
              ontology = "BP",
              allGenes = allGenes,
              geneSel = function(p){p < 0.05}, # Define function
              annotationFun = annFUN.org,
              mapping = "org.Hs.eg.db",
              ID = "ensembl")
  
resultFisher <- runTest(GOdata, algorithm = "classic", statistic = "fisher")

# Results
GOres <- GenTable(GOdata, classicFisher = resultFisher, topNodes = 10) 

Have a look at GOres:

head(GOres)
GO.ID Term Annotated Significant Expected classicFisher
GO:0006955 immune response 1762 732 534.87 1.9e-27
GO:0002376 immune system process 2506 957 760.72 2.0e-21
GO:0044419 interspecies interaction between organis… 1721 671 522.43 8.6e-17
GO:0009607 response to biotic stimulus 1143 472 346.97 9.8e-17
GO:0043207 response to external biotic stimulus 1119 462 339.68 2.3e-16
GO:0051707 response to other organism 1119 462 339.68 2.3e-16

You can learn more about Gene Ontology here.