MAST - MattHuff/scRNASeq_011224 GitHub Wiki

Previous MAST Documentation

Load Packages, Set Working Directory, and Load RDS

# Load Packages
library(Seurat)
library(dplyr)
library(Libra)
library(MAST)
library(openxlsx)
library(tidyverse)
library(ggrepel)
library(ggplot2)
library(ggpubr)
library(cowplot)
library(EnhancedVolcano)
library(patchwork)

# Set Working Directory
setwd("~/Downloads/scRNASeq_011224/5_Seurat/")

# Optional - Load RCS
h5_seurat_class <- readRDS("outputs/7_classified_integrated_norris.rds")

Add Co-Variates

There are not many co-variates in this analysis compared to the previous analysis, but it is still good practice to use what we can.

# Define Patients
[email protected] <- [email protected] |>
  dplyr::mutate(patient = dplyr::case_when(sample %in% c("1", "2") ~ "LVF002",
                                             sample %in% c("3", "4") ~ "LVF006",
                                             .default = "NA"))

# Confirm cell counts
cell_counts <- [email protected] %>%
  group_by(sample, manual_celltype) %>%
  summarise(cell_count = n())

Run Find Markers with MAST

Prepare Output Directories

de_path <- "outputs/de_out"
if(!dir.exists(de_path)){dir.create(de_path,recursive = T)}

Set Up Loop

celltypes <- unique(h5_seurat_class$manual_celltype)
combined_df <- data.frame()

Command to Run MAST

for (i in 1:length(celltypes)) {
  cur_celltype = celltypes[i](/MattHuff/scRNASeq_011224/wiki/i)
  print(cur_celltype)
  
  cur_seurat_obj <- subset(h5_seurat_class, manual_celltype == cur_celltype)
  Idents(cur_seurat_obj) <- cur_seurat_obj$condition
  print(unique(Idents(cur_seurat_obj)))
  
  markers <- FindMarkers(object = cur_seurat_obj,
                         ident.1="Pap", ident.2="Septum",
                         only.pos = FALSE, min.pct = 0.10,
                         test.use = "MAST",
                         logfc.threshold = 0.10, assay='integrated',
                         latent.vars = c("patient"))
  
  markers <- rownames_to_column(markers, "genes")
  markers$diff <- markers$pct.1 - markers$pct.2
  markers$celltype <- cur_celltype
  
  combined_df <- rbind(combined_df, markers)
}

Write Results Files

All genes

## Write to Notebook
list <- split(combined_df,combined_df$celltype)
wb <- createWorkbook()
Map(function(data, nameofsheet){    
  addWorksheet(wb, nameofsheet)
  writeData(wb, nameofsheet, data)
}, list, names(list))

saveWorkbook(wb, file = "outputs/de_out/Norris_Dge_FindMarkersMAST_FullStat.xlsx", overwrite = TRUE)

## Write to TSV
write.table(combined_df, "outputs/de_out/Norris_Dge_FindMarkersMAST_FullStat.tsv", 
            sep = "\t", quote = F, 
            row.names = FALSE, col.names = TRUE)

Significant Genes

## Filter for Significant DE Genes
combined_df_sign <- combined_df %>% filter(p_val_adj < 0.05, abs(avg_log2FC) > 0.3)

## Save to File
### Excel
openxlsx::write.xlsx(combined_df_sign, 
                     file = "outputs/de_out/Norris_Dge_FindMarkersMAST_Sign_run2.xlsx",
                     colNames = TRUE,
                     rowNames = FALSE,
                     borders = "columns",
                     sheetName="Stats",
                     overwrite = TRUE)

### TSV
write.table(combined_df_sign, 
            file = "outputs/de_out/Norris_Dge_FindMarkersMAST_Sign_run2.txt", 
            sep = "\t", quote = F, 
            row.names = FALSE, col.names = TRUE)

#### Save individual TSVs for each cell type.
in_de_path <- "outputs/de_out/indiv_sign"
if(!dir.exists(in_de_path)){dir.create(in_de_path,recursive = T)}

for (i in 1:length(celltypes)) {
  cur_celltype <- celltypes[i](/MattHuff/scRNASeq_011224/wiki/i)
  print(cur_celltype)
  
  current_df_sign <- subset(combined_df_sign, celltype == cur_celltype)
  write.table(current_df_sign, 
              file = paste0("outputs/de_out/indiv_sign/Norris_Dge_FindMarkersMAST_", cur_celltype, "_Sign.txt"), 
              sep = "\t", quote = F, 
              row.names = FALSE, col.names = TRUE)
}

Data Visualization

Volcano Plot - All Cells

### Set Visualization Path
dviz_path <- "katy_samples/de_out/viz_de"
if(!dir.exists(dviz_path)){dir.create(dviz_path,recursive = T)}

EnhancedVolcano(df_celltype,
                lab = df_celltype$genes,
                x = "avg_log2FC", 
                y = "p_val_adj",
                title = "Wild-Type vs Knock-Out",
                pCutoff = 10e-32,
                FCcutoff = 0.5,
                pointSize = 3.0,
                labSize = 6.0,
                col=c('grey', 'black', 'blue', 'red3'),
                colAlpha = 1,
                legendLabels=c('NotSig','Log2FC','p-val',
                               'p-val&Log2FC'),
                legendPosition = 'bottom',
                legendLabSize = 16,
                legendIconSize = 5.0) +
  ylim(0,300) + xlim(-5,+5)
ggsave(filename = paste0(dviz_path, "/MAST_volcano_plot_enhanced.pdf"), height = 12, width = 12)

Volcano Plots - Per Cell Type

idviz_path <- "outputs/de_out/viz_de/per_celltype_plots"
if(!dir.exists(idviz_path)){dir.create(idviz_path,recursive = T)}

vol_path <- paste0(idviz_path, "/MAST_Volcano")
if(!dir.exists(vol_path)){dir.create(vol_path,recursive = T)}

for (cell in unique(df_celltype$celltype)) {
  new_df <- subset(df_celltype, celltype == cell)
  max_y <- max(-log10(filter(new_df, p_val_adj != 0)$p_val_adj))
  EnhancedVolcano(new_df,
                  lab = new_df$genes,
                  x = "avg_log2FC", 
                  y = "p_val_adj",
                  title = paste0("Wild-Type vs Knock-Out", cell),
                  pCutoff = 10e-32,
                  FCcutoff = 0.5,
                  pointSize = 3.0,
                  labSize = 6.0,
                  col=c('grey', 'black', 'blue', 'red3'),
                  colAlpha = 1,
                  legendLabels=c('NotSig','Log2FC','p-val',
                                 'p-val&Log2FC'),
                  legendPosition = 'bottom',
                  legendLabSize = 16,
                  legendIconSize = 5.0) +
    ylim(0,max_y+20) + xlim(-5,+5)
  ggsave(filename = paste0(idviz_path, "/MAST_Volcano/MAST_volcano_plot_enhanced_",cell,".pdf"), height = 12, width = 12)
}

Violin Plots

combined_df_sign <- combined_df_sign %>%
  mutate(Abs = abs(avg_log2FC))

for (cell in unique(combined_df_sign$celltype)){
  top10 <- combined_df_sign %>%
    filter(celltype == cell) %>%
    arrange(-Abs) %>%
    head(10)
  
  if (nrow(top10) >= 3){
    Idents(h5_seurat_class) <- "condition"
    tmp <- subset(h5_seurat_class, manual_celltype == cell)
    p <- VlnPlot(tmp, features=top10$genes[1:3], cols = c('#003f5c','#ffa600'), combine = FALSE)
    (p[1](/MattHuff/scRNASeq_011224/wiki/1) + theme(legend.position = "none") +
        p[2](/MattHuff/scRNASeq_011224/wiki/2) + theme(legend.position = "none")) +
      #p[3](/MattHuff/scRNASeq_011224/wiki/3) + theme(legend.position = "none")) + 
      plot_annotation(title = paste0(cell, " - top 2 differentially expressed genes"),
                      caption = paste0("N = ", nrow(filter(combined_df_sign, celltype == cell))))
    ggsave(filename = paste0(idviz_path, "/MAST_Violin/sub_Vln_", cell, ".pdf"), width = 12)
    
    for(g in top10$genes[1:3]){
      p <- FeaturePlot(h5_seurat_class, features = g, split.by = 'condition')
      ggsave(stringr::str_glue("{idviz_path}/MAST_Violin/FtPlot_{cell}_{g}.pdf"), width = 12)
    }
  }
}

Dot Plots

Idents(h5_seurat_class) <- "Cell"
overall_top <- combined_df_sign %>%
  arrange(-Abs) %>%
  head(20)

for (rep in unique(h5_seurat_class["patient"](/MattHuff/scRNASeq_011224/wiki/"patient"))){
  tmp <- subset(h5_seurat_class, patient == rep)
  DotPlot(tmp, features = unique(overall_top$gene), assay = 'SCT', split.by = 'condition') + rotate_x_text(angle = 45) +
    theme(legend.position = "right") + ggtitle(rep)
  ggsave(paste0(idviz_path, "/dotplot_topfeats_", rep, ".pdf"), width = 8)
}
DotPlot(h5_seurat_class, features = unique(overall_top$gene), assay = 'SCT', split.by = 'condition') + rotate_x_text(angle = 45) +
  theme(legend.position = "right") + ggtitle('Overall')
ggsave(paste0(dviz_path, "/dotplot_topfeats.pdf"), width = 8)

Bar Plots

library(viridis)
library(forcats)
DE_counts <- data.frame()
for (i in 1:length(cellTypes)) {
  cur_celltype <- cellTypes[i](/MattHuff/scRNASeq_011224/wiki/i)
  cur_genes <- subset(combined_df_sign, celltype == cur_celltype)
  cur_count <- nrow(cur_genes)
  DE_counts <- rbind(DE_counts, c(cur_celltype, cur_count))
}

colnames(DE_counts) <- c("Celltype", "DEG_Count")
DE_counts$DEG_Count <- as.numeric(DE_counts$DEG_Count)
DE_counts$Celltype <- c("Fibroblasts", "Endothelial Cells", "Myeloid", "Ventricular Cardiomyocytes", "Lymphoid", "Pericytes", "Smooth Muscle Cells")

b1 <- ggplot(DE_counts, aes(x = fct_rev(fct_reorder(Celltype, DEG_Count)), y = DEG_Count, fill = Celltype)) +
  geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  scale_y_continuous(limits = c(0, 1750)) +
  scale_fill_viridis(discrete = TRUE) +
  labs(x = "Celltype", y = "Number of DEGs (Padj <= 0)", title = "Number of Differentially Expressed Genes by Celltype", color = "Celltype") +
  geom_text(aes(label = DEG_Count), vjust = -1, color = "black")

ggsave("DEG_BarChart.pdf")

Venn Diagrams of Gene Overlap

library(ggVennDiagram)

fibro_df <- subset(combined_df_sign, celltype == "Fibro")
fibro_DEGS <- fibro_df$genes

endo_df <- subset(combined_df_sign, celltype == "Endothelial")
endo_DEGS <- endo_df$genes

cardio_df <- subset(combined_df_sign, celltype == "Ventricular_Cardio")
cardio_DEGS <- cardio_df$genes

x <- list(fibro_DEGS, endo_DEGS, cardio_DEGS)
ggVennDiagram(x, category.names = c("Fibro",
                                    "Endothelial",
                                    "Ventricular_Cardio")) +
  scale_fill_gradient(low = "#F4FAFE", high = "#4981BF")
ggsave("outputs/de_out/all_gene_overlap_LFC1.pdf")

## Overlap of Subset DEGs - Top 20, 50, and 100

top_counts <- c(20, 50, 100)

for (c in top_counts) {
  print(c)
  
  subset_fibro <- fibro_df$genes %>%
    head(c)
  subset_endo <- endo_df$genes %>%
    head(c)
  subset_cardio <- cardio_df$genes %>%
    head(c)
  
  x2 <- list(subset_fibro, subset_endo, subset_cardio)
  ggVennDiagram(x2, category.names = c("Fibro",
                                      "Endothelial",
                                      "Ventricular_Cardio")) +
    scale_fill_gradient(low = "#F4FAFE", high = "#4981BF")
  ggsave(paste0("outputs/de_out/top", c, "_gene_overlap_LFC1.pdf"))
}