Seurat - MattHuff/SingleCellDocumentation_112023 GitHub Wiki
Seurat is a multi-functional R toolkit for use in single cell genomics.
First, make sure you have R installed on your machine. You may also install RStudio, an integrated development environment (IDE) that gives you extra options to improve visualization of results and ease of use.
Once both are installed, launch RStudio. In the command line, run the following command:
install.packages('Seurat')
This will install Seurat and all required dependencies. However, you cannot use installed packages in R without loading them first. The next step will load Seurat and other packages that will be used in this pipeline:
library(Seurat)
library(SeuratData)
library(dplyr)
library(plyr)
library(hdf5r)
library(patchwork)
library(sctransform)
library(ggplot2)
library(cowplot)
library(stringr)
If you run into issues installing the package SeuratData
, you will need to install the devtools
package and install it with this command:
install_github("SeuratData")
By default, RStudio will load in your "home" directory (~/
for short.) It is always a good idea to have a specific directory set aside for your input data and results, and you can use the setwd()
command to go to this directory:
setwd("~/Downloads/seurat_test/")
Within this directory, I have a subdirectory set aside to store all of Katy's data - katy_samples
- and my outputs. This can be created in R using the following command:
dir.create("katy_samples", recursive = T)
This analysis will likely fail if you run it on your own computer; Sample 1 did not properly filter the same way the other files did, so it is much larger, making analysis memory-intensive. Despite this, I can confirm that these steps work and can be run. Alternatively, skip Sample 1 and run this only on Samples 2-4.
Another point of concern is you available version of Seurat. Seurat v5 has changed how the Seurat Object stores "counts" objects, which affects certain commands in the pipeline. Most notably, the "RNA" assay no longer has an "data" layer, which results in some adjustments to the pipeline compared to what I had previously provided. This page will use the Seurat v5 instructions.
file_names <- list.files(path = "katy_samples", pattern = "\\.h5", full.names = TRUE)
h5_read <- lapply(file_names, Read10X_h5)
h5_seurat <- lapply(h5_read, CreateSeuratObject)
h5_seurat <- merge(h5_seurat[[1]], y = h5_seurat[2:length(h5_seurat)], add.cell.ids = c("2", "3", "4"), project = "katy_integrated")
h5_seurat <- JoinLayers(h5_seurat)
# Add sample column to metadata
sample <- str_split_i(colnames(h5_seurat), pattern = "_", 1)
h5_seurat$sample <- sample
h5_seurat[['pMito']] <- PercentageFeatureSet(h5_seurat, pattern = "^[Mm][Tt]-")
head(h5_seurat)
saveRDS(h5_seurat, file="./katy_samples/combined_unfiltered_seurat.rds")
If your analysis is interrupted, you can use this file to reload it up to this point without rerunning the previous steps:
h5_seurat <- readRDS(file="./katy_samples/combined_unfiltered_seurat.rds")
## Create Directory to Store Visualizations
qcviz_path = "katy_samples/viz_qc"
if(!dir.exists(qcviz_path)){dir.create(qcviz_path,recursive = T)}
## QC Plot - Default
pdf(file = paste0(qcviz_path, '/01_qc_plot_rna.pdf'), width = 15)
VlnPlot(h5_seurat, features = c('nCount_RNA', 'nFeature_RNA', 'pMito'),
group.by = 'sample')
invisible(dev.off())
## QC Plot - Dots Removed
pdf(file = paste0(qcviz_path, '/01a_qc_plot_rna.pdf'), width = 15)
VlnPlot(h5_seurat, features = c('nCount_RNA', 'nFeature_RNA', 'pMito'), pt.size = 0,
group.by = 'sample') + theme(legend.position = 'none')
invisible(dev.off())
## Density Plot
pdf(file = paste0(qcviz_path, '/01b_density_plot_rna.pdf'))
[email protected] %>%
ggplot(aes(color=sample, x=nCount_RNA, fill= sample)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density")
invisible(dev.off())
## Feature Scatter Plots
pdf(file = paste0(qcviz_path, '/02_qc_plot_umi-gene.pdf'))
FeatureScatter(h5_seurat, "nCount_RNA", "nFeature_RNA", group.by = "sample", pt.size = 0)
invisible(dev.off())
pdf(file = paste0(qcviz_path, '/03_qc_plot_numi-pmito.pdf'))
FeatureScatter(h5_seurat, "nCount_RNA", "pMito", group.by = "sample", pt.size = 0)
invisible(dev.off())
## Plot correlation of genes detected and RNA counts
scatter <- [email protected] %>%
ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=pMito)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
facet_wrap(~sample)
ggsave(file = paste0(qcviz_path, "/05_qc_plot_correlation.pdf"))
## Identify Data to be filtered per Sample
for (samp in unique(h5_seurat$sample)){
tmp <- as.data.frame([email protected]) %>%
filter(sample == samp)
# percent Mito > 20
tmp <- tmp %>% mutate(pmito_out = pMito > 20)
filt <- as.data.frame(table(tmp$pmito_out))
colnames(filt) <- c('Var1', paste0('percent_mito > ', 20, '%'))
# nCount RNA > 10,000
tmp <- tmp %>% mutate(numi_out = tmp$nCount_RNA > 10000)
u_filt <- as.data.frame(table(tmp$numi_out))
colnames(u_filt) <- c('Var1', paste0('nCount_RNA > ', 10000))
filtering <- merge(filt, u_filt, by='Var1')
# Find all cells to be filtered out
tmp <- tmp %>% mutate(all_out = nCount_RNA > 10000 | pMito > 20)
all_filt <- as.data.frame(table(tmp$all_out))
colnames(all_filt) <- c('Var1', 'Total')
filtering <- merge(filtering, all_filt, by='Var1')
filtering$Var1 <- plyr::mapvalues(filtering$Var1,
from = c('FALSE', 'TRUE'),
to = c('Remaining', 'To be filtered out'))
rownames(filtering) <- filtering$Var1
filtering <- filtering[,-1]
write.csv(filtering, file = paste0(qcviz_path, "/pm_numi_all_counts_", samp, ".csv"))
}
## Identify total data to be filtered out across all samples
# percent mito > 20
[email protected] <- [email protected] %>% mutate(mito_filter_out = pMito > 20)
filt <- as.data.frame(table([email protected]$mito_filter_out))
colnames(filt) <- c('Var1', paste0('percent_mito > ', 20, '%'))
# nUMI > 10,000
[email protected] <- [email protected] %>% mutate(numi_out = nCount_RNA > 10000)
u_filt <- as.data.frame(table([email protected]$numi_out))
colnames(u_filt) <- c('Var1', paste0('nCount_RNA > ', 10000))
filtering <- merge(filt, u_filt, by='Var1')
# All cells to be filtered out
[email protected] <- [email protected] %>%
mutate(all_out = (nCount_RNA > 10000) | (pMito > 20))
all_filt <- as.data.frame(table([email protected]$all_out))
colnames(all_filt) <- c('Var1', 'Total')
filtering <- merge(filtering, all_filt, by="Var1")
filtering$Var1 <- mapvalues(filtering$Var1,
from = c('FALSE', 'TRUE'),
to = c('Remaining', 'To be filtered out'))
rownames(filtering) <- filtering$Var1
filtering <- filtering[,-1]
write.csv(filtering, file = paste0(qcviz_path, "/pm_numi_all_counts.csv"))
nUMI_thresh <- 10000
mito_thresh <- 20
mito_filtered <- LayerData(h5_seurat, assay = "RNA", layer = "counts")[str_subset(rownames(LayerData(h5_seurat, assay = "RNA", layer = "counts")),
pattern = "^[Mm][Tt]",
negate = TRUE),]
h5_seurat_final <- CreateSeuratObject(counts = mito_filtered, project = "katy_integrated")
h5_seurat_final <- AddMetaData(h5_seurat_final, metadata = as.data.frame([email protected]))
h5_seurat_filtered <- subset(h5_seurat_final,
subset = nCount_RNA < nUMI_thresh & pMito < mito_thresh)
rm(h5_seurat, h5_seurat_final, mito_filtered)
df <- [email protected] %>% as.data.frame()
tmp <- table(df$sample) %>% as.data.frame()
ggplot(tmp,aes(x=Var1, y=Freq, fill=Var1)) +
geom_bar(stat="identity") +
geom_text(aes(label=Freq), vjust=-0.3, size=3.5) +
ggtitle("Gene Expression by Sample") +
xlab("Sample") +
ylab("Gene Expression") +
theme(legend.position="none")
ggsave(file = paste0(qcviz_path, "/06_qc_gex_bar_plot.pdf"))
With filtering done, I'd recommend saving the filtered Seurat object as an RDS file:
saveRDS(h5_seurat_filtered, file = "katy_samples/01_filtered_seurat_pre_int.rds")
# Split Seurat objects
h5_seurat_split <- SplitObject(h5_seurat_filtered, split.by = "sample")
# Run SCTransform on each individual object
for (i in 1:length(h5_seurat_split)) {
h5_seurat_split[[i]] <- SCTransform(h5_seurat_split[[i]],
vars.to.regress = c("nCount_RNA", "pMito"),
verbose = T)
}
# Find integration features
integ_features <- SelectIntegrationFeatures(object.list = h5_seurat_split, nFeatures = 4000)
# Run prep SCT integration
h5_seurat_split <- PrepSCTIntegration(object.list = h5_seurat_split,
anchor.features = integ_features)
# Find integration anchors
integ_anchors <- FindIntegrationAnchors(object.list = h5_seurat_split,
normalization.method = "SCT",
anchor.features = integ_features)
# Integrate
h5_seurat_integrated <- IntegrateData(
anchorset = integ_anchors,
new.assay.name = "integrated",
normalization.method = "SCT",
dims = 1:30,
k.weight = 100,
sd.weight = 1,
eps = 0.5,
verbose = T
)
DefaultAssay(h5_seurat_integrated) <- 'integrated'
[email protected] <- [email protected] |>
dplyr::mutate(condition = dplyr::case_when(sample %in% c("1", "2") ~ "wt",
sample %in% c("3", "4") ~"ko",
.default = "no_condition"))
Once again, save an RDS file of the integrated output:
saveRDS(h5_seurat_integrated, file = "katy_samples/02_katy_seurat_integrated.rds")
## Create Directory
clviz_path="katy_samples/viz_cluster"
if(!dir.exists(clviz_path)){dir.create(clviz_path,recursive = T)}
## Set Map Resolution
umap_resolution <- c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2)
## Set Default Assay
DefaultAssay(h5_seurat_integrated) <- 'integrated'
## Run PCA
h5_seurat_integrated <- RunPCA(h5_seurat_integrated,
features = NULL,
weight.by.var = TRUE,
ndims.print = 1:5,
nfeatures.print = 30,
npcs = 50,
reduction.name = "pca")
## Perform Batch Correction
h5_seurat_integrated <- RunHarmony(h5_seurat_integrated, "sample")
## Find Neighbors
h5_seurat_integrated <- FindNeighbors(object = h5_seurat_integrated,
reduction = "harmony",
dims = 1:50,
nn.eps = 0.5)
## Find Clusters using Resolution Object
h5_seurat_integrated <- FindClusters(h5_seurat_integrated,
resolution = umap_resolution,
algorithm = 1,
n.iter = 1000)
## Run Clustree and Plot Clusters
clustree(h5_seurat_integrated, prefix = "integrated_snn_res.", node_colour = "sc3_stability")
ggsave(file = paste0(clviz_path, "/07_clustree.pdf"), height = 12, width = 8)
## Set Resolution - Typically 0.6, but this may vary.
Idents(h5_seurat_integrated) <- "integrated_snn_res.0.6"
## Run UMap
h5_seurat_integrated <- RunUMAP(h5_seurat_integrated, reduction = "harmony", dims = 1:50)
### Plot with Labels
p1 <- DimPlot(object = h5_seurat_integrated, reduction = "umap", label = TRUE, pt.size = 0.5) + theme(legend.position="none")
ggsave(file = paste0(clviz_path, "/08_DIMPLOT_labels.pdf"))
### Plot with Samples
p2 <- DimPlot(object = h5_seurat_integrated, reduction = "umap", label = FALSE, pt.size = 0.5, group.by="sample")
ggsave(file = paste0(clviz_path, "/09_DIMPLOT_sample.pdf"))
saveRDS(h5_seurat_integrated, file="katy_samples/03_processed_integrated_norris.rds")