Seurat - MattHuff/scRNASeq_011224 GitHub Wiki

Previous Seurat Documentation.

Load Packages and Set Working Directory

# Load Packages
library(Seurat)
library(SeuratData)
library(dplyr)
library(plyr)
library(hdf5r)
library(patchwork)
library(sctransform)
library(ggplot2)
library(cowplot)
library(stringr)
library(clustree)
library(harmony)

# Set Working Directory
setwd("~/Downloads/scRNASeq_011224/5_Seurat/")
dir.create("outputs", recursive = T)

Run Seurat

Obtain Seurat Objects for All Samples

file_names <- list.files(path = ".", pattern = "\\.h5", full.names = TRUE)
h5_read <- lapply(file_names, Read10X_h5)
h5_seurat <- lapply(h5_read, CreateSeuratObject)
h5_seurat <- merge(h5_seurat[[1]], y = h5_seurat[2:length(h5_seurat)], add.cell.ids = c("1", "2", "3", "4"), project = "jordan_011923")
h5_seurat <- JoinLayers(h5_seurat)

# Add sample column to metadata
sample <- str_split_i(colnames(h5_seurat), pattern = "_", 1)
h5_seurat$sample <- sample

Detect Mitochondrial Gene Percentages

h5_seurat[['pMito']] <- PercentageFeatureSet(h5_seurat, pattern = "^[Mm][Tt]-")
head(h5_seurat)

Save Unfiltered RDS Object

saveRDS(h5_seurat, file="./outputs/1_combined_unfiltered_seurat.rds")

If your analysis is interrupted, you can use an RDS file to save your progress up to this point. Load the RDS file with the following command:

h5_seurat <- readRDS(file="./katy_samples/combined_unfiltered_seurat.rds")

Data Visualization

## Create Directory to Store Visualizations
qcviz_path = "outputs/viz_qc"
if(!dir.exists(qcviz_path)){dir.create(qcviz_path, recursive = T)}

## QC Plots - Default
pdf(file = paste0(qcviz_path, '/01_qc_plot_rna.pdf'), width = 15)
VlnPlot(h5_seurat, features = c('nCount_RNA', 'nFeature_RNA', 'pMito'),
        group.by = 'sample')
invisible(dev.off())

## QC Plot - Dots Removed
pdf(file = paste0(qcviz_path, '/01a_qc_plot_rna.pdf'), width = 15)
VlnPlot(h5_seurat, features = c('nCount_RNA', 'nFeature_RNA', 'pMito'), pt.size = 0,
        group.by = 'sample') + theme(legend.position = 'none')
invisible(dev.off())

## Density Plot
pdf(file = paste0(qcviz_path, '/01b_density_plot_rna.pdf'))
[email protected] %>%
  ggplot(aes(color=sample, x=nCount_RNA, fill= sample)) +
  geom_density(alpha = 0.2) +
  scale_x_log10() +
  theme_classic() +
  ylab("Cell density")
invisible(dev.off())

## Feature Scatter Plots
### RNA against Genes
pdf(file = paste0(qcviz_path, '/02_qc_plot_umi-gene.pdf'))
FeatureScatter(h5_seurat, "nCount_RNA", "nFeature_RNA", group.by = "sample", pt.size = 0)
invisible(dev.off())

### RNA against % Mt
pdf(file = paste0(qcviz_path, '/03_qc_plot_numi-pmito.pdf'))
FeatureScatter(h5_seurat, "nCount_RNA", "pMito", group.by = "sample", pt.size = 0)
invisible(dev.off())

## Plot correlation of genes detected and RNA counts
scatter <- [email protected] %>% 
  ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=pMito)) + 
  geom_point() + 
  scale_colour_gradient(low = "gray90", high = "black") +
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = 500) +
  geom_hline(yintercept = 250) +
  facet_wrap(~sample)
ggsave(file = paste0(qcviz_path, "/05_qc_plot_correlation.pdf"))

Identify Samples to Filter Out

## Identify Data to be Filtered per Sample
for (samp in unique(h5_seurat$sample)){
  tmp <- as.data.frame([email protected]) %>%
    filter(sample == samp)
  
  ### percent Mito > 20
  tmp <- tmp %>% mutate(pmito_out = pMito > 20)
  filt <- as.data.frame(table(tmp$pmito_out))
  colnames(filt) <- c('Var1', paste0('percent_mito > ', 20, '%'))
  
  ### nCount RNA > 10,000
  tmp <- tmp %>% mutate(numi_out = tmp$nCount_RNA > 10000)
  u_filt <- as.data.frame(table(tmp$numi_out))
  colnames(u_filt) <- c('Var1', paste0('nCount_RNA > ', 10000))
  
  filtering <- merge(filt, u_filt, by='Var1')
  
  ### Find all cells to be filtered out
  tmp <- tmp %>% mutate(all_out = nCount_RNA > 10000 | pMito > 20)
  all_filt <- as.data.frame(table(tmp$all_out))
  colnames(all_filt) <- c('Var1', 'Total')
  
  filtering <- merge(filtering, all_filt, by='Var1')
  
  filtering$Var1 <- plyr::mapvalues(filtering$Var1,
                                    from = c('FALSE', 'TRUE'),
                                    to = c('Remaining', 'To be filtered out'))
  rownames(filtering) <- filtering$Var1
  filtering <- filtering[,-1]
  
  write.csv(filtering, file = paste0(qcviz_path, "/pm_numi_all_counts_", samp, ".csv"))
}

## Create Combined Table
### percent mito > 20
[email protected] <- [email protected] %>% mutate(mito_filter_out = pMito > 20)
filt <- as.data.frame(table([email protected]$mito_filter_out))
colnames(filt) <- c('Var1', paste0('percent_mito > ', 20, '%'))

### nUMI > 10,000
[email protected] <- [email protected] %>% mutate(numi_out = nCount_RNA > 10000)
u_filt <- as.data.frame(table([email protected]$numi_out))
colnames(u_filt) <- c('Var1', paste0('nCount_RNA > ', 10000))

filtering <- merge(filt, u_filt, by='Var1')

### All cells to be filtered out
[email protected] <- [email protected] %>%
  mutate(all_out = (nCount_RNA > 10000) | (pMito > 20))
all_filt <- as.data.frame(table([email protected]$all_out))
colnames(all_filt) <- c('Var1', 'Total')

filtering <- merge(filtering, all_filt, by="Var1")

filtering$Var1 <- mapvalues(filtering$Var1,
                            from = c('FALSE', 'TRUE'),
                            to = c('Remaining', 'To be filtered out'))
rownames(filtering) <- filtering$Var1
filtering <- filtering[,-1]

write.csv(filtering, file = paste0(qcviz_path, "/pm_numi_all_counts.csv"))

Filter Out Data

nUMI_thresh <- 10000
mito_thresh <- 20

mito_filtered <- LayerData(h5_seurat, assay = "RNA", layer = "counts")[str_subset(rownames(LayerData(h5_seurat, assay = "RNA", layer = "counts")),
                                                                                  pattern = "^[Mm][Tt]",
                                                                                  negate = TRUE),]

h5_seurat_final <- CreateSeuratObject(counts = mito_filtered, project = "katy_integrated")

h5_seurat_final <- AddMetaData(h5_seurat_final, metadata = as.data.frame([email protected]))

h5_seurat_filtered <- subset(h5_seurat_final,
                             subset = nCount_RNA < nUMI_thresh & pMito < mito_thresh)

rm(h5_seurat, h5_seurat_final, mito_filtered)

df <- [email protected] %>% as.data.frame()
tmp <- table(df$sample) %>% as.data.frame()

ggplot(tmp,aes(x=Var1, y=Freq, fill=Var1)) + 
  geom_bar(stat="identity") +
  geom_text(aes(label=Freq), vjust=-0.3, size=3.5) +
  ggtitle("Gene Expression by Sample") +
  xlab("Sample") +
  ylab("Gene Expression") +
  theme(legend.position="none")
ggsave(file = paste0(qcviz_path, "/06_qc_gex_bar_plot.pdf"))

This would be a good place to create another RDS file:

saveRDS(h5_seurat_filtered, file = "outputs//2_filtered_seurat_pre_int.rds")

Integrate Seurat Data

## Split Seurat objects
h5_seurat_split <- SplitObject(h5_seurat_filtered, split.by = "sample")

## Run SCTransform on each individual object
for (i in 1:length(h5_seurat_split)) {
  h5_seurat_split[[i]] <- SCTransform(h5_seurat_split[[i]],
                                      vars.to.regress = c("nCount_RNA", "pMito"),
                                      verbose = T)
}

## Find integration features
integ_features <- SelectIntegrationFeatures(object.list = h5_seurat_split, nFeatures = 4000)

## Run prep SCT integration
h5_seurat_split <- PrepSCTIntegration(object.list = h5_seurat_split,
                                      anchor.features = integ_features)

## Find integration anchors
integ_anchors <- FindIntegrationAnchors(object.list = h5_seurat_split,
                                        normalization.method = "SCT",
                                        anchor.features = integ_features)

## Integrate
h5_seurat_integrated <- IntegrateData(
  anchorset = integ_anchors,
  new.assay.name = "integrated",
  normalization.method = "SCT",
  dims = 1:30,
  k.weight = 100,
  sd.weight = 1,
  eps = 0.5,
  verbose = T
)

DefaultAssay(h5_seurat_integrated) <- 'integrated'

Add Condition Information

# Add Condition Information
[email protected] <- [email protected] |>
  dplyr::mutate(condition = dplyr::case_when(sample %in% c("1", "3") ~ "Pap",
                                             sample %in% c("2", "4") ~"Septum",
                                             .default = "no_condition"))

# Save Integrated RDS
saveRDS(h5_seurat_integrated, file = "outputs/3_jordan_seurat_integrated.rds")

Identify Sample Clusters

## Create Directory
clviz_path="outputs/viz_cluster"
if(!dir.exists(clviz_path)){dir.create(clviz_path,recursive = T)}

## Set Map Resolution
umap_resolution <- c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2)

## Set Default Assay
DefaultAssay(h5_seurat_integrated) <- 'integrated'

## Run PCA
h5_seurat_integrated <- RunPCA(h5_seurat_integrated,
                               features = NULL,
                               weight.by.var = TRUE,
                               ndims.print = 1:5,
                               nfeatures.print = 30,
                               npcs = 50,
                               reduction.name = "pca")

## Perform Batch Correction
h5_seurat_integrated <- RunHarmony(h5_seurat_integrated, "sample")

## Find Neighbors
h5_seurat_integrated <- FindNeighbors(object = h5_seurat_integrated, 
                                      reduction = "harmony", 
                                      dims = 1:50, 
                                      nn.eps = 0.5)

## Find Clusters using Resolution Object
h5_seurat_integrated <- FindClusters(h5_seurat_integrated,
                                     resolution = umap_resolution,
                                     algorithm = 1,
                                     n.iter = 1000)

## Run Clustree and Plot Clusters
clustree(h5_seurat_integrated, prefix = "integrated_snn_res.", node_colour = "sc3_stability")
ggsave(file = paste0(clviz_path, "/07_clustree.pdf"), height = 12, width = 8)

## Set Resolution - Typically 0.6, but this may vary.
Idents(h5_seurat_integrated) <- "integrated_snn_res.0.6"

## Run UMap
h5_seurat_integrated <- RunUMAP(h5_seurat_integrated, reduction = "harmony", dims = 1:50)

Visualize Clusters

### Plot with Labels
p1 <- DimPlot(object = h5_seurat_integrated, reduction = "umap", label = TRUE, pt.size = 0.5) + theme(legend.position="none")
ggsave(file = paste0(clviz_path, "/08_DIMPLOT_labels.pdf"))

### Plot with Samples
p2 <- DimPlot(object = h5_seurat_integrated, reduction = "umap", label = FALSE, pt.size = 0.5, group.by="sample")
ggsave(file = paste0(clviz_path, "/09_DIMPLOT_sample.pdf"))

Checkpoint: Save RDS

saveRDS(h5_seurat_integrated, file="outputs/4_processed_integrated_norris.rds")
⚠️ **GitHub.com Fallback** ⚠️