Last updated: 2023-09-16
Prior to this analysis, reads were: 1. Trimmed using
2. Aligned to GRCm38/mm10
using STAR
3. Reads quantification performed with
Transcript QC, alignment, and quantification were performed by Dr Jimmy Breen
# working with data
# Visualisation:
# Set ggplot theme
# Publication theme shared by all QC figures.
#
# `theme_update()` invisibly returns the *previous* theme, so assigning its
# result to `pub` stored the old theme instead of the settings below; every
# later `plot + pub` (and the saved pub.rds) then carried the wrong elements.
# Build the theme object explicitly and merge it into the active theme, which
# is what `theme_update()` does, so the session-wide default is still updated.
pub <- theme(
  plot.title = element_text(color = "gray20", size = 12, angle = 0, hjust = 0.5, vjust = .5, face = "bold"),
  plot.subtitle = element_text(color = "gray20", size = 11, angle = 0, hjust = 0, vjust = .5, face = "plain"),
  legend.title = element_text(color = "gray20", size = 11, angle = 0, hjust = 0.5, vjust = .5, face = "plain"),
  legend.text = element_text(color = "gray20", size = 11, angle = 0, hjust = 0, vjust = .5, face = "plain"),
  axis.text.x = element_text(color = "gray20", size = 11, angle = 0, hjust = .5, vjust = 0, face = "plain"),
  axis.title.x = element_text(color = "gray20", size = 11, angle = 0, hjust = .5, vjust = 0, face = "plain"),
  axis.text.y = element_text(color = "gray20", size = 11, angle = 0, hjust = 1, vjust = 0.5, face = "plain"),
  axis.title.y = element_text(color = "gray20", size = 11, angle = 90, hjust = .5, vjust = .5, face = "plain")
)
theme_set(theme_get() + pub)
# Bioconductor packages:
Due to the unusual library size, control 3 was removed from the analysis
# Import the mergedOnly featureCounts table (provided by Dr Jimmy Breen on
# 24/09/21). Lines beginning with "#" are skipped by `comment = "#"`.
rawCount <- read_tsv(
  here::here("0_data/raw_data/allSamples_mergedOnly.featureCounts.txt"),
  col_names = TRUE,
  comment = "#"
) %>%
  # Replace the BAM-path column headers with short sample names.
  dplyr::rename(
    CONT1 = "../2_Hisat2_merged/CONT1_ATGTCA_merged.sorted.nodup.bam",
    CONT2 = "../2_Hisat2_merged/CONT2_CGATGT_merged.sorted.nodup.bam",
    CONT4 = "../2_Hisat2_merged/CONT4_ACTTGA_merged.sorted.nodup.bam",
    INT1 = "../2_Hisat2_merged/INT1_GTCCGC_merged.sorted.nodup.bam",
    INT2 = "../2_Hisat2_merged/INT2_ACAGTG_merged.sorted.nodup.bam",
    INT3 = "../2_Hisat2_merged/INT3_GATCAG_merged.sorted.nodup.bam",
    INT4 = "../2_Hisat2_merged/INT4_CTTGTA_merged.sorted.nodup.bam",
    SVX1 = "../2_Hisat2_merged/SVX1_GTTTCG_merged.sorted.nodup.bam",
    SVX2 = "../2_Hisat2_merged/SVX2_TAGCTT_merged.sorted.nodup.bam",
    SVX3 = "../2_Hisat2_merged/SVX3_ATCACG_merged.sorted.nodup.bam",
    SVX4 = "../2_Hisat2_merged/SVX4_GCCAAT_merged.sorted.nodup.bam",
    SVX_VAS1 = "../2_Hisat2_merged/SVX_VAS1_AGTCAA_merged.sorted.nodup.bam",
    SVX_VAS2 = "../2_Hisat2_merged/SVX_VAS2_AGTTCC_merged.sorted.nodup.bam",
    SVX_VAS3 = "../2_Hisat2_merged/SVX_VAS3_TGACCA_merged.sorted.nodup.bam",
    SVX_VAS4 = "../2_Hisat2_merged/SVX_VAS4_GGCTAC_merged.sorted.nodup.bam",
    VAS1 = "../2_Hisat2_merged/VAS1_CAGATC_merged.sorted.nodup.bam",
    VAS2 = "../2_Hisat2_merged/VAS2_GTGAAA_merged.sorted.nodup.bam",
    VAS3 = "../2_Hisat2_merged/VAS3_GTGGCC_merged.sorted.nodup.bam",
    VAS4 = "../2_Hisat2_merged/VAS4_CCGTCC_merged.sorted.nodup.bam"
  ) %>%
  # The pipeline must end here: the original trailing `%>%` ran into the
  # `rownames<-` assignment below, which is not valid R.
  column_to_rownames("Geneid")

# Drop everything from the first "." onward in each gene ID
# (presumably the Ensembl version suffix -- confirm).
rownames(rawCount) <- gsub("\\..+$", "", rownames(rawCount))

# Keep only the count columns, selected by position, removing the non-numerical
# metadata columns. Column 8 is skipped as well -- presumably the un-renamed
# control 3 sample; confirm against the file header.
# SVX_VAS1 may also be an outlier, it is number 18 (BTW).
rawCount <- rawCount[, c(6, 7, 9:25)]
#' Render a table as an interactive DataTable with export buttons.
#'
#' @param x Table-like object passed on to `DT::datatable()`.
#' @param caption Caption shown with the table.
#' @return The widget created by `DT::datatable()`.
DT <- function(x, caption) {
  DT::datatable(
    x,
    caption = caption,
    extensions = "Buttons",
    options = list(
      dom = "Blfrtip",
      scrollX = TRUE, # spelled out: `T` can be reassigned
      buttons = c("copy", "csv", "excel", "pdf", "print"),
      # First vector: page lengths (-1 shows every row); second: menu labels.
      # NOTE(review): the label vector and the closing brackets were missing
      # from the original and are reconstructed here -- confirm the labels.
      lengthMenu = list(c(10, 25, 50, -1), c(10, 25, 50, "All"))
    )
  )
}

# Store the helper so other documents can reuse it.
saveRDS(DT, here::here("0_data/RDS_objects/DT.rds"))
Two types of metadata are generally required for DGE analysis:
metadata about each sample
metadata about each gene
The sample metadata can be extracted from the logCPM
column names. These data include sample_id
, sample_type
The sample metadata will be manually generated and stored in the
# Read the sample sheet.
# The original statement ended in a dangling `%>%`; the lost step is assumed to
# be the `as.data.frame()` conversion -- NOTE(review): confirm.
samples <- read_tsv(here::here("0_data/raw_data/samples.tsv"),
                    col_names = TRUE) %>%
  as.data.frame()
Gene annotation is useful for DGE analysis because it provides additional information about each gene. The annotated genes of Mus musculus can be retrieved using AnnotationHub.
AnnotationHub also has a web service that can be accessed through
the display function. Retrieving the gene annotation can take a long
time, so after the initial run the annotated genes are saved to a
file. To save time, if gene_metadata.rds
is already present, don’t run the code chunk.
# Connect to AnnotationHub and list the mouse EnsDb records on offer.
ah <- AnnotationHub()
ah %>%
  subset(grepl("musculus", species)) %>%
  subset(rdataclass == "EnsDb")

# Viewing the web service for AnnotationHub:
# d <- display(ah)

# The AnnotationHub html site was used to identify the 'code' for the latest
# mouse genome from Ensembl.
ensDb <- ah[["AH95775"]]

# The original pipe was left dangling, so the `genes()` result was piped into
# the next statement. Convert it to a data frame instead, which the later
# `left_join()` on `gene_id` requires.
genes <- genes(ensDb) %>%
  as.data.frame()

# Save the annotated genes to an RDS object so later runs of setUp.Rmd can skip
# the slow download.
saveRDS(genes, here::here("0_data/RDS_objects/gene_metadata.rds"))
Load the annotated gene list obtained through AnnotationHub() into an
object called geneMetadata
. Keep only the genes that are
present in rawCount and display the number of genes for each gene_biotype
present in rawCount and geneMetadata
# Load the cached gene annotation.
geneMetadata <- read_rds(here::here("0_data/RDS_objects/gene_metadata.rds"))

# Build one annotation row per gene listed in the row names of `rawCount`,
# in the same order. The original `geneMetadata %>%,` was a syntax error; the
# lost step is restored as `as.data.frame()`.
geneMetadata <- data.frame(gene = rownames(rawCount)) %>%
  left_join(geneMetadata %>% as.data.frame(),
            by = c("gene" = "gene_id")) %>%
  dplyr::distinct(gene, .keep_all = TRUE)
rownames(geneMetadata) <- geneMetadata$gene

# Tabulate how many genes fall into each biotype. The table is turned into a
# two-column data frame so that `colnames<-` below applies; the original pipe
# dangled into that assignment.
genes <- geneMetadata$gene_biotype %>%
  table() %>%
  as.data.frame()
colnames(genes) <- c("Gene Biotype", "Frequency")

# kable(genes) %>% kable_styling(bootstrap_options = c("striped", "hover")) %>% scroll_box(height = "600px")
genes %>% DT(., caption = "Table: Gene biotype")
Digital Gene Expression List (DGEList) is an R object class often used for differential gene expression analysis as it simplifies plotting and interaction with data and metadata.
The DGEList object holds the three datasets that have been
imported/created, including the rawCount
data, the samples metadata,
and geneMetadata
To further save time and memory, genes that were not expressed in
any sample (i.e., 0
count across all columns) are all removed
# Create the DGEList from rawCount, the sample sheet and the gene annotation.
# `remove.zeros = TRUE` drops genes with 0 counts in every sample.
# The closing parenthesis was missing from the original call.
dge <- DGEList(
  counts = rawCount,
  samples = samples,
  genes = geneMetadata,
  remove.zeros = TRUE
)
Pre-processing steps increase the power of the downstream DGE analysis by eliminating the majority of unwanted variance that could obscure the true variance caused by the differences in sample conditions. There are several standard steps that are commonly followed to pre-process and QC raw read counts, including:
Checking Library Size
Removal of Undetectable Genes
QC through MDS/PCA
A simple pre-processing/QC step is checking the library
size (total number of mapped and quantified reads) of each sample.
This enables identification of potentially mis-labelled or outlying
samples. This is often visualised through ggplot
# Bar chart of library size per sample.
# The `ggplot(aes(`, `geom_vline(` and `labs(` openers were missing from the
# original, leaving bare argument lists; they are restored here.
libSize <- dge$samples %>%
  # lib.size on x, one bar per sample on y, bars filled by sample group
  ggplot(aes(
    x = lib.size,
    y = rownames(dge$samples),
    fill = dge$samples$group
  )) +
  geom_col() +
  # dashed vertical line at the mean lib.size, computed from the plot data
  geom_vline(
    aes(xintercept = lib.size),
    data = . %>% summarise_at(vars(lib.size), mean),
    linetype = 2
  ) +
  # plot labels
  labs(
    title = "Sample Library Size",
    x = "Library Size",
    y = "Samples",
    fill = "Sample Groups"
  ) +
  theme(legend.position = "none")
Sample library size. The dashed line represents the average library size
# Save the library-size plot as .svg.
# NOTE(review): the opening line of this call (`ggsave(` plus the file name)
# was missing. The file name below is a placeholder and the output directory
# is assumed to match the other QC figures -- confirm both.
ggsave(
  filename = "library_size.svg",
  path = here::here("2_plots/qc/"),
  plot = libSize + pub,
  width = 250,
  height = 166,
  units = "mm"
)
Filtering out low-expressed genes is a standard pre-processing step in DGE analysis as it can significantly increase the power to differentiate differentially expressed genes by eliminating the variance caused by genes that are lowly expressed in all samples.
The threshold of removal is arbitrary and is often determined after
visualisation of the count distribution. The count distribution can be
illustrated in a density plot through ggplot
. A common
metric used to display the count distribution is log Counts per
Million (logCPM)
# Density of logCPM per sample before low-expression filtering.
# The `ggplot(aes(` and `labs(` openers were missing from the original and are
# restored here.
beforeFiltering <- dge %>%
  # transform the raw counts to logCPM
  edgeR::cpm(log = TRUE) %>%
  # reshape the logCPM matrix to long format (Var1, Var2, value), similar to
  # pivot_longer(); Var2 is used below as the per-sample colour
  melt %>%
  # keep only rows where the logCPM (value) is finite
  dplyr::filter(is.finite(value)) %>%
  # one density curve per Var2 level
  ggplot(aes(
    x = value,
    colour = Var2
  )) +
  geom_density() +
  # remove the legend; `guides(colour = FALSE)` is deprecated in ggplot2
  guides(colour = "none") +
  # figure title, subtitle and axis labels
  ggtitle("Before Filtering Low-expressed Genes", subtitle = paste0(nrow(dge), " genes")) +
  labs(
    x = "logCPM",
    y = "Density",
    colour = "Sample Groups"
  )
# Save the pre-filtering density plot.
# NOTE(review): the opening line of this call (`ggsave(` plus the file name)
# was missing; the file name below is a placeholder -- confirm.
ggsave(
  filename = "counts_before_filtering.svg",
  plot = beforeFiltering + pub,
  width = 250,
  height = 166,
  units = "mm",
  path = here::here("2_plots/qc/")
)
Ideally, filtering the low-expressed genes should remove the
large peak with logCPM < 0
, i.e., remove any genes which
have less than one count per million.
A common guideline is to keep all genes that have > 1-2 CPM in the
smallest group of a treatment. In this case, the smallest group is 3 as
each treatment condition had three replicates. However, due to the high
variance of some groups, the filtering is increased to keep genes that
have more than 3 CPM in at least 3 samples.
Mathematically this would be identifying genes (rows) with CPM
> 3
; and identifying total row sum that is
>= 3
# Logical vector with one entry per gene: TRUE when the gene has > 3 CPM in at
# least 3 samples (the threshold in the code is 3, not 2).
keptGenes <- (rowSums(cpm(dge) > 3) >= 3)
# Density of logCPM per sample after low-expression filtering.
# The `ggplot(aes(` and `labs(` openers were missing from the original and are
# restored here.
afterFiltering <- dge %>%
  # transform the raw counts to logCPM
  edgeR::cpm(log = TRUE) %>%
  # keep only the rows (genes) flagged in keptGenes
  magrittr::extract(keptGenes, ) %>%
  # reshape the logCPM matrix to long format (Var1, Var2, value), similar to
  # pivot_longer(); Var2 is used below as the per-sample colour
  melt %>%
  # keep only rows where the logCPM (value) is finite
  dplyr::filter(is.finite(value)) %>%
  ggplot(aes(
    x = value,
    colour = Var2
  )) +
  geom_density() +
  # remove the legend; `guides(colour = FALSE)` is deprecated in ggplot2
  guides(colour = "none") +
  # Title, subtitle and labels. keptGenes is logical, so sum() counts the
  # retained genes; `table(keptGenes)[[2]]` fails when every gene is kept.
  ggtitle("After Filtering Low-expressed Genes", subtitle = paste0(sum(keptGenes), " genes")) +
  labs(
    x = "logCPM",
    y = "Density",
    colour = "Sample Groups"
  )
# Save the post-filtering density plot.
# NOTE(review): the opening line of this call (`ggsave(` plus the file name)
# was missing; the file name below is a placeholder -- confirm.
ggsave(
  filename = "counts_after_filtering.svg",
  plot = afterFiltering + pub,
  width = 250,
  height = 166,
  units = "mm",
  path = here::here("2_plots/qc/")
)
# Stand-alone display of the filtered plot is left disabled:
# afterFiltering
# Show the before and after density plots next to each other.
cowplot::plot_grid(
  plotlist = list(beforeFiltering + pub, afterFiltering + pub)
)
Before and after removal of lowly expressed genes
# Save the side-by-side before/after figure as .svg.
# No `plot` argument is given, so ggsave() writes the last plot displayed;
# this call must therefore run directly after the combined grid is shown.
ggsave(filename = "counts_before_after_filtering_3_3.svg",
path = here::here("2_plots/qc/"),
width = 320,
height = 180,
units = "mm")
Following the filtering of low-expressed genes < 3 CPM in at least 3 samples, out of the total 36300 genes left after the removal of genes with no expression, 19078 genes were removed, leaving only 17222 genes remaining for the downstream analysis
After filtering the low-expressed genes, the DGElist object is updated to eliminate the low-expressed genes from future analysis
# Restrict the DGEList to the genes flagged in keptGenes;
# keep.lib.sizes = FALSE makes the library sizes get recalculated.
dge <- dge[keptGenes, , keep.lib.sizes = FALSE]
Normalisation uses the TMM (trimmed mean of M values) method
through the edgeR
package. The TMM approach creates a
scaling factor as an offset to be supplied to the Negative Binomial model.
The calcNormFactors
function calculates the normalisation factors
and returns the adjusted norm.factors
to the dge object
# Compute TMM normalisation factors and store them back in the DGEList.
dge <- edgeR::calcNormFactors(dge, method = "TMM")

# Static-table alternative, kept for reference:
# knitr::kable(dge$samples, caption = "Normalised samples") %>%
# kable_styling(bootstrap_options = c("striped", "hover")) %>%
# scroll_box(height = "600px")

# Interactive table of the sample information after normalisation.
DT(dge$samples, caption = "Table: Normalised samples")
The following visualisation of the TMM normalisation is plotted using
the mean-difference (MD) plot. The MD plot visualises the library
size-adjusted logFC between two samples (the difference) against the
log-expression across all samples (the mean). In this instance,
sample 1
is compared against an artificial library
constructed from the average of all the other samples
# Mean-difference plot for the first column of the logCPM matrix.
limma::plotMD(
  cpm(dge, log = TRUE),
  column = 1
)
# Red dashed reference line at a log-ratio of 0.
abline(h = 0, col = "red", lty = 2, lwd = 2)
MA plot of TMM normalisation for control 1
Ideally, the bulk of gene expression following the TMM normalisation
should be centred around an expression log-ratio
of 0, which
indicates that library size bias between samples has been successfully
removed. This should be repeated with all the samples in the dge object
# Sample table with the row names moved into a `sampleName` column, used as
# the join key for the PCA scores.
samples <- rownames_to_column(dge$samples, "sampleName")
# Treat the replicate number as a factor so it can be mapped to point shapes.
samples$rep <- as.factor(samples$rep)
# Perform PCA analysis on the logCPM matrix, transposed so samples are rows.
pca_analysis <- prcomp(t(cpm(dge, log = TRUE)))

# Show the importance table for each component. The original pipe dangled into
# the assignment below; the lost step is assumed to be `as.data.frame()` --
# NOTE(review): confirm.
summary(pca_analysis)$importance %>%
  as.data.frame()

# Fixed colours for the CONT and INT groups.
group.colours <- c(CONT = "#F8766D", INT = "#A3A500")
# PCA plot of all samples: PC1 against PC2.
# The original had an empty step between two pipes and no `labs(` opener; the
# lost step is restored as `as.data.frame()`, which `rownames_to_column()`
# needs because `pca_analysis$x` is a matrix.
a <- pca_analysis$x %>%
  as.data.frame() %>%
  rownames_to_column("sampleName") %>%
  # explicit key; sampleName is the column added to both tables
  left_join(samples, by = "sampleName") %>%
  as_tibble() %>%
  ggplot(aes(x = PC1, y = PC2, colour = group, shape = rep)) +
  geom_point(size = 3, alpha = 0.5) +
  scale_shape_manual(values = c(15:18)) +
  # axis titles carry the proportion of variance of each component
  labs(
    x = paste0("PC1 (", percent(summary(pca_analysis)$importance["Proportion of Variance","PC1"]),")"),
    y = paste0("PC2 (", percent(summary(pca_analysis)$importance["Proportion of Variance","PC2"]),")"),
    colour = "Groups",
    shape = "Replicates"
  )
# PCA plot of all samples: PC2 against PC3.
# The original had an empty step between two pipes and no `labs(` opener; the
# lost step is restored as `as.data.frame()`, which `rownames_to_column()`
# needs because `pca_analysis$x` is a matrix.
b <- pca_analysis$x %>%
  as.data.frame() %>%
  rownames_to_column("sampleName") %>%
  # explicit key; sampleName is the column added to both tables
  left_join(samples, by = "sampleName") %>%
  as_tibble() %>%
  ggplot(aes(x = PC2, y = PC3, colour = group, shape = rep)) +
  geom_point(size = 3, alpha = 0.5) +
  scale_shape_manual(values = c(15:18)) +
  # axis titles carry the proportion of variance of each component
  labs(
    x = paste0("PC2 (", percent(summary(pca_analysis)$importance["Proportion of Variance","PC2"]),")"),
    y = paste0("PC3 (", percent(summary(pca_analysis)$importance["Proportion of Variance","PC3"]),")"),
    colour = "Groups",
    shape = "Replicates"
  )
# PCA plot (PC1 against PC2) restricted to the first seven rows of the score
# table, coloured with the fixed `group.colours` palette.
# The original had an empty step between two pipes and no `labs(` opener; the
# lost step is restored as `as.data.frame()`, which `rownames_to_column()`
# needs because `pca_analysis$x` is a matrix.
c <- pca_analysis$x %>%
  as.data.frame() %>%
  rownames_to_column("sampleName") %>%
  # explicit key; sampleName is the column added to both tables
  left_join(samples, by = "sampleName") %>%
  as_tibble() %>%
  # presumably the CONT and INT samples, matching group.colours -- confirm
  dplyr::slice(1:7) %>%
  ggplot(aes(x = PC1, y = PC2, colour = group, shape = rep)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_color_manual(values = group.colours) +
  scale_shape_manual(values = c(15:18)) +
  # axis titles carry the proportion of variance of each component
  labs(
    x = paste0("PC1 (", percent(summary(pca_analysis)$importance["Proportion of Variance","PC1"]),")"),
    y = paste0("PC2 (", percent(summary(pca_analysis)$importance["Proportion of Variance","PC2"]),")"),
    colour = "Groups",
    shape = "Replicates"
  )
PCA plot of all samples.
# Save PCA plot `c`.
# NOTE(review): the opening line of this call (`ggsave(` plus the file name)
# was missing; the file name below is a placeholder -- confirm.
ggsave(
  filename = "pca_plot.svg",
  plot = c + pub,
  path = here::here("2_plots/qc/"),
  width = 150,
  height = 100,
  units = "mm"
)
# Spearman correlation between the first three principal components and the
# measured sample variables, drawn as a lower-triangle correlation plot.
# NOTE(review): the `select(` and `corrplot(` openers, and the first selected
# columns, were missing from the original. PC1-PC3 and `Groups = group` are
# reconstructed from the figure caption and the `mutate()` below -- confirm
# the column list.
corr_plot <- pca_analysis$x %>%
  as.data.frame() %>%
  rownames_to_column("sampleName") %>%
  # explicit key; sampleName is the column added to both tables
  left_join(samples, by = "sampleName") %>%
  as_tibble() %>%
  dplyr::select(
    PC1, PC2, PC3,
    Groups = group,
    "Library size" = lib.size,
    "Normalisation Factor" = norm.factors
  ) %>%
  # cor() needs numeric input, so encode the group labels as integer codes
  mutate(Groups = as.numeric(as.factor(Groups))) %>%
  cor(method = "spearman") %>%
  corrplot::corrplot(
    type = "lower",
    diag = FALSE,
    addCoef.col = 1, addCoefasPercent = TRUE
  )
Correlation between the first three principal components and measured variables
# Store the filtered, normalised DGEList and the `pub` theme object as RDS
# files under 0_data/RDS_objects.
saveRDS(dge, file = here::here("0_data/RDS_objects/dge.rds"))
saveRDS(pub, file = here::here("0_data/RDS_objects/pub.rds"))
# saveRDS(object = gg_publish, file = here::here("0_data/RDS_objects/gg_publish.rds"))
