MAST - MattHuff/scRNASeq_011224 GitHub Wiki
Load Packages, Set Working Directory, and Load RDS
# Load Packages
library(Seurat)
library(dplyr)
library(Libra)
library(MAST)
library(openxlsx)
library(tidyverse)
library(ggrepel)
library(ggplot2)
library(ggpubr)
library(cowplot)
library(EnhancedVolcano)
library(patchwork)
# Set Working Directory
setwd("~/Downloads/scRNASeq_011224/5_Seurat/")
# Optional - Load RCS
h5_seurat_class <- readRDS("outputs/7_classified_integrated_norris.rds")
Add Co-Variates
There are not many co-variates in this analysis compared to the previous analysis, but it is still good practice to use what we can.
# Define Patients
[email protected] <- [email protected] |>
dplyr::mutate(patient = dplyr::case_when(sample %in% c("1", "2") ~ "LVF002",
sample %in% c("3", "4") ~ "LVF006",
.default = "NA"))
# Confirm cell counts
cell_counts <- [email protected] %>%
group_by(sample, manual_celltype) %>%
summarise(cell_count = n())
Run Find Markers with MAST
Prepare Output Directories
de_path <- "outputs/de_out"
if(!dir.exists(de_path)){dir.create(de_path,recursive = T)}
Set Up Loop
celltypes <- unique(h5_seurat_class$manual_celltype)
combined_df <- data.frame()
Command to Run MAST
for (i in 1:length(celltypes)) {
cur_celltype = celltypes[i](/MattHuff/scRNASeq_011224/wiki/i)
print(cur_celltype)
cur_seurat_obj <- subset(h5_seurat_class, manual_celltype == cur_celltype)
Idents(cur_seurat_obj) <- cur_seurat_obj$condition
print(unique(Idents(cur_seurat_obj)))
markers <- FindMarkers(object = cur_seurat_obj,
ident.1="Pap", ident.2="Septum",
only.pos = FALSE, min.pct = 0.10,
test.use = "MAST",
logfc.threshold = 0.10, assay='integrated',
latent.vars = c("patient"))
markers <- rownames_to_column(markers, "genes")
markers$diff <- markers$pct.1 - markers$pct.2
markers$celltype <- cur_celltype
combined_df <- rbind(combined_df, markers)
}
Write Results Files
All genes
## Write to Notebook
list <- split(combined_df,combined_df$celltype)
wb <- createWorkbook()
Map(function(data, nameofsheet){
addWorksheet(wb, nameofsheet)
writeData(wb, nameofsheet, data)
}, list, names(list))
saveWorkbook(wb, file = "outputs/de_out/Norris_Dge_FindMarkersMAST_FullStat.xlsx", overwrite = TRUE)
## Write to TSV
write.table(combined_df, "outputs/de_out/Norris_Dge_FindMarkersMAST_FullStat.tsv",
sep = "\t", quote = F,
row.names = FALSE, col.names = TRUE)
Significant Genes
## Filter for Significant DE Genes
combined_df_sign <- combined_df %>% filter(p_val_adj < 0.05, abs(avg_log2FC) > 0.3)
## Save to File
### Excel
openxlsx::write.xlsx(combined_df_sign,
file = "outputs/de_out/Norris_Dge_FindMarkersMAST_Sign_run2.xlsx",
colNames = TRUE,
rowNames = FALSE,
borders = "columns",
sheetName="Stats",
overwrite = TRUE)
### TSV
write.table(combined_df_sign,
file = "outputs/de_out/Norris_Dge_FindMarkersMAST_Sign_run2.txt",
sep = "\t", quote = F,
row.names = FALSE, col.names = TRUE)
#### Save individual TSVs for each cell type.
in_de_path <- "outputs/de_out/indiv_sign"
if(!dir.exists(in_de_path)){dir.create(in_de_path,recursive = T)}
for (i in 1:length(celltypes)) {
cur_celltype <- celltypes[i](/MattHuff/scRNASeq_011224/wiki/i)
print(cur_celltype)
current_df_sign <- subset(combined_df_sign, celltype == cur_celltype)
write.table(current_df_sign,
file = paste0("outputs/de_out/indiv_sign/Norris_Dge_FindMarkersMAST_", cur_celltype, "_Sign.txt"),
sep = "\t", quote = F,
row.names = FALSE, col.names = TRUE)
}
Data Visualization
Volcano Plot - All Cells
### Set Visualization Path
dviz_path <- "katy_samples/de_out/viz_de"
if(!dir.exists(dviz_path)){dir.create(dviz_path,recursive = T)}
EnhancedVolcano(df_celltype,
lab = df_celltype$genes,
x = "avg_log2FC",
y = "p_val_adj",
title = "Wild-Type vs Knock-Out",
pCutoff = 10e-32,
FCcutoff = 0.5,
pointSize = 3.0,
labSize = 6.0,
col=c('grey', 'black', 'blue', 'red3'),
colAlpha = 1,
legendLabels=c('NotSig','Log2FC','p-val',
'p-val&Log2FC'),
legendPosition = 'bottom',
legendLabSize = 16,
legendIconSize = 5.0) +
ylim(0,300) + xlim(-5,+5)
ggsave(filename = paste0(dviz_path, "/MAST_volcano_plot_enhanced.pdf"), height = 12, width = 12)
Volcano Plots - Per Cell Type
idviz_path <- "outputs/de_out/viz_de/per_celltype_plots"
if(!dir.exists(idviz_path)){dir.create(idviz_path,recursive = T)}
vol_path <- paste0(idviz_path, "/MAST_Volcano")
if(!dir.exists(vol_path)){dir.create(vol_path,recursive = T)}
for (cell in unique(df_celltype$celltype)) {
new_df <- subset(df_celltype, celltype == cell)
max_y <- max(-log10(filter(new_df, p_val_adj != 0)$p_val_adj))
EnhancedVolcano(new_df,
lab = new_df$genes,
x = "avg_log2FC",
y = "p_val_adj",
title = paste0("Wild-Type vs Knock-Out", cell),
pCutoff = 10e-32,
FCcutoff = 0.5,
pointSize = 3.0,
labSize = 6.0,
col=c('grey', 'black', 'blue', 'red3'),
colAlpha = 1,
legendLabels=c('NotSig','Log2FC','p-val',
'p-val&Log2FC'),
legendPosition = 'bottom',
legendLabSize = 16,
legendIconSize = 5.0) +
ylim(0,max_y+20) + xlim(-5,+5)
ggsave(filename = paste0(idviz_path, "/MAST_Volcano/MAST_volcano_plot_enhanced_",cell,".pdf"), height = 12, width = 12)
}
Violin Plots
combined_df_sign <- combined_df_sign %>%
mutate(Abs = abs(avg_log2FC))
for (cell in unique(combined_df_sign$celltype)){
top10 <- combined_df_sign %>%
filter(celltype == cell) %>%
arrange(-Abs) %>%
head(10)
if (nrow(top10) >= 3){
Idents(h5_seurat_class) <- "condition"
tmp <- subset(h5_seurat_class, manual_celltype == cell)
p <- VlnPlot(tmp, features=top10$genes[1:3], cols = c('#003f5c','#ffa600'), combine = FALSE)
(p[1](/MattHuff/scRNASeq_011224/wiki/1) + theme(legend.position = "none") +
p[2](/MattHuff/scRNASeq_011224/wiki/2) + theme(legend.position = "none")) +
#p[3](/MattHuff/scRNASeq_011224/wiki/3) + theme(legend.position = "none")) +
plot_annotation(title = paste0(cell, " - top 2 differentially expressed genes"),
caption = paste0("N = ", nrow(filter(combined_df_sign, celltype == cell))))
ggsave(filename = paste0(idviz_path, "/MAST_Violin/sub_Vln_", cell, ".pdf"), width = 12)
for(g in top10$genes[1:3]){
p <- FeaturePlot(h5_seurat_class, features = g, split.by = 'condition')
ggsave(stringr::str_glue("{idviz_path}/MAST_Violin/FtPlot_{cell}_{g}.pdf"), width = 12)
}
}
}
Dot Plots
Idents(h5_seurat_class) <- "Cell"
overall_top <- combined_df_sign %>%
arrange(-Abs) %>%
head(20)
for (rep in unique(h5_seurat_class["patient"](/MattHuff/scRNASeq_011224/wiki/"patient"))){
tmp <- subset(h5_seurat_class, patient == rep)
DotPlot(tmp, features = unique(overall_top$gene), assay = 'SCT', split.by = 'condition') + rotate_x_text(angle = 45) +
theme(legend.position = "right") + ggtitle(rep)
ggsave(paste0(idviz_path, "/dotplot_topfeats_", rep, ".pdf"), width = 8)
}
DotPlot(h5_seurat_class, features = unique(overall_top$gene), assay = 'SCT', split.by = 'condition') + rotate_x_text(angle = 45) +
theme(legend.position = "right") + ggtitle('Overall')
ggsave(paste0(dviz_path, "/dotplot_topfeats.pdf"), width = 8)
Bar Plots
library(viridis)
library(forcats)
DE_counts <- data.frame()
for (i in 1:length(cellTypes)) {
cur_celltype <- cellTypes[i](/MattHuff/scRNASeq_011224/wiki/i)
cur_genes <- subset(combined_df_sign, celltype == cur_celltype)
cur_count <- nrow(cur_genes)
DE_counts <- rbind(DE_counts, c(cur_celltype, cur_count))
}
colnames(DE_counts) <- c("Celltype", "DEG_Count")
DE_counts$DEG_Count <- as.numeric(DE_counts$DEG_Count)
DE_counts$Celltype <- c("Fibroblasts", "Endothelial Cells", "Myeloid", "Ventricular Cardiomyocytes", "Lymphoid", "Pericytes", "Smooth Muscle Cells")
b1 <- ggplot(DE_counts, aes(x = fct_rev(fct_reorder(Celltype, DEG_Count)), y = DEG_Count, fill = Celltype)) +
geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
scale_x_discrete(guide = guide_axis(angle = 45)) +
scale_y_continuous(limits = c(0, 1750)) +
scale_fill_viridis(discrete = TRUE) +
labs(x = "Celltype", y = "Number of DEGs (Padj <= 0)", title = "Number of Differentially Expressed Genes by Celltype", color = "Celltype") +
geom_text(aes(label = DEG_Count), vjust = -1, color = "black")
ggsave("DEG_BarChart.pdf")
Venn Diagrams of Gene Overlap
library(ggVennDiagram)
fibro_df <- subset(combined_df_sign, celltype == "Fibro")
fibro_DEGS <- fibro_df$genes
endo_df <- subset(combined_df_sign, celltype == "Endothelial")
endo_DEGS <- endo_df$genes
cardio_df <- subset(combined_df_sign, celltype == "Ventricular_Cardio")
cardio_DEGS <- cardio_df$genes
x <- list(fibro_DEGS, endo_DEGS, cardio_DEGS)
ggVennDiagram(x, category.names = c("Fibro",
"Endothelial",
"Ventricular_Cardio")) +
scale_fill_gradient(low = "#F4FAFE", high = "#4981BF")
ggsave("outputs/de_out/all_gene_overlap_LFC1.pdf")
## Overlap of Subset DEGs - Top 20, 50, and 100
top_counts <- c(20, 50, 100)
for (c in top_counts) {
print(c)
subset_fibro <- fibro_df$genes %>%
head(c)
subset_endo <- endo_df$genes %>%
head(c)
subset_cardio <- cardio_df$genes %>%
head(c)
x2 <- list(subset_fibro, subset_endo, subset_cardio)
ggVennDiagram(x2, category.names = c("Fibro",
"Endothelial",
"Ventricular_Cardio")) +
scale_fill_gradient(low = "#F4FAFE", high = "#4981BF")
ggsave(paste0("outputs/de_out/top", c, "_gene_overlap_LFC1.pdf"))
}