Single Cell Omics‐‐theory - igheyas/Bioinformatics GitHub Wiki
What is “omics”
"Omics" means studying everything in a system—like all the genes, or all the proteins, or all the molecules in a cell.
Here are a few types:
Genomics = study of all the genes (DNA)
Transcriptomics = study of all the RNA made from genes
Proteomics = study of all the proteins
Metabolomics = study of all the small chemicals in cells
So “omics” means looking at the big picture, not just one thing.
What is single-cell omics?
Most omics studies look at many cells together, like a tissue or blood sample. But...
👉 Single-cell omics looks at one cell at a time!
That’s super powerful because:
Different cells do different jobs.
Even in the same tissue, no two cells are exactly alike.
Diseases like cancer can start in just one weird cell.
🔍 Instead of mixing thousands of cells and seeing an average, we now look at each cell to learn more.
🎯 Main Goals / Outputs:
Cell Types
- What different kinds of cells are in a sample?
- Which cells are similar or different?
Gene Expression Patterns
- Which genes are turned on/off in each cell?
- What does that say about the cell’s job?
Cell States or Activities
- Is a cell dividing? Resting? Dying?
- What’s happening in the cell right now?
Rare Cells or Subgroups
- Can we find tiny groups that might be important—like early cancer cells or stem cells?
Changes Over Time or Condition
- How do cells change in disease, or during development?
So, the output is usually a map or groupings of cells, showing:
-
What kind of cell it is
-
What it’s doing
-
How it compares to others
So yes! Each type focuses on a different layer of the cell.
Want to try naming which layer you think gives the best clue about a cell’s current job?
What do you think the point is of using just the variable genes, instead of all 100?
library(Seurat)
set.seed(123)
# ---- parameters ----
n_genes <- 100
cells_per_cluster <- 25
clusters <- rep(c("A","B"), each = cells_per_cluster)
n_cells <- length(clusters)
# ---- simulate counts ----
# baseline expression ~ Poisson(5)
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
# bump up expression of genes 1:10 in cluster B
counts[1:10, clusters=="B"] <-
rpois(sum(clusters=="B") * 10, lambda = 20)
# give the genes & cells names
rownames(counts) <- paste0("Gene", sprintf("%03d", 1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d", 1:n_cells))
# metadata
meta <- data.frame(
row.names = colnames(counts),
cluster = clusters
)
# Inspect
head(counts)[,1:6]
#> Cell01 Cell02 Cell03 Cell04 Cell05 Cell06
#> Gene001 4 8 6 7 3 6
#> Gene002 4 5 7 6 2 5
#> Gene003 5 7 7 6 6 4
#> Gene004 3 6 5 5 4 8
#> Gene005 8 7 1 5 6 7
#> Gene006 10 3 9 2 11 6
head(meta)
#> cluster
#> Cell01 A
#> Cell02 A
#> Cell03 A
#> Cell04 A
#> Cell05 A
#> Cell06 A
library(Seurat)
set.seed(123)
➡ Makes random numbers reproducible — so every time you run it, you get the same data.
🧮 Parameters: how many genes and cells?
n_genes <- 100
cells_per_cluster <- 25
- You’ll make 100 genes, and 50 cells total — split into 2 groups of 25 each.
clusters <- rep(c("A","B"), each = cells_per_cluster)
- Makes a vector like: A, A, A, ..., B, B, B — 25 A's, 25 B's. Each letter is a "cluster label" for a cell.
n_cells <- length(clusters)
- Now n_cells is 50.
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
counts[1:10, clusters=="B"] <-
rpois(sum(clusters=="B") * 10, lambda = 20)
rownames(counts) <- paste0("Gene", sprintf("%03d", 1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d", 1:n_cells))
meta <- data.frame(
row.names = colnames(counts),
cluster = clusters
)
head(counts)[,1:6]
head(meta)
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
# 3.1 Create Seurat object
sc <- CreateSeuratObject(
counts = counts,
meta.data = meta,
project = "ToySC"
)
# 3.2 Quality control (skip mitochondrial for toy data)
sc <- subset(sc, subset = nFeature_RNA > 5)
# 3.3 Normalize
sc <- NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = 1e4)
# 3.4 Find highly variable features
sc <- FindVariableFeatures(
object = sc,
selection.method = "vst",
nfeatures = 20
)
# 3.5 Scale (regress out no covariates here)
sc <- ScaleData(
object = sc,
features = VariableFeatures(sc)
)
#3.6. Run PCA on just those genes, asking for e.g. 10 components,
# and use the full (lapack) SVD if you like (approx = FALSE).
sc <- RunPCA(
object = sc,
features = VariableFeatures(sc),
npcs = 10,
approx = FALSE, # forces a standard SVD instead of irlba
verbose = FALSE # suppresses progress messages
)
# Check your top PCs
ElbowPlot(sc, ndims = 10)
# 3.7 Neighbors & clustering
sc <- FindNeighbors(sc, dims = 1:10)
sc <- FindClusters(sc, resolution = 0.5)
# 3.8 UMAP embedding
sc <- RunUMAP(sc, dims = 1:10)
DimPlot(sc, label = TRUE)
Build a Seurat object & run the basic pipeline
sc <- CreateSeuratObject(
counts = counts,
meta.data = meta,
project = "ToySC"
)
sc <- subset(sc, subset = nFeature_RNA > 5)
sc <- NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = 1e4)
log1p( (gene count / total counts per cell) × 10,000 )
range(sc["RNA"](/igheyas/Bioinformatics/wiki/"RNA")@data)
sc <- FindVariableFeatures(
object = sc,
selection.method = "vst",
nfeatures = 20
)
sc <- ScaleData(
object = sc,
features = VariableFeatures(sc)
)
sc <- RunPCA(
object = sc,
features = VariableFeatures(sc),
npcs = 10,
approx = FALSE,
verbose = FALSE
)
ElbowPlot(sc, ndims = 10)
sc <- FindNeighbors(sc, dims = 1:10)
sc <- FindClusters(sc, resolution = 0.5)
sc <- RunUMAP(sc, dims = 1:10)
DimPlot(sc, label = TRUE)
Idents(sc) <- sc$cluster
table(Idents(sc))
# A B
# 25 25
de_A_vs_B <- FindMarkers(
object = sc,
ident.1 = "A", # first group
ident.2 = "B", # second group
min.pct = 0.1, # only test genes detected in ≥10% of cells
logfc.threshold = 0.25 # only report genes with log2FC ≥ 0.25
)
# Inspect the top hits
head(de_A_vs_B, n = 10)
# p_val avg_log2FC pct.1 pct.2 p_val_adj
# Gene003 1.2e-08 1.95 1.00 0.04 1.0e-07
# Gene008 3.5e-07 1.80 1.00 0.08 2.9e-06
# …etc…
de_A_vs_B <- FindMarkers(
object = sc,
ident.1 = "A", # first group
ident.2 = "B", # second group
min.pct = 0.1, # only test genes detected in ≥10% of cells
logfc.threshold = 0.25 # only report genes with log2FC ≥ 0.25
)
head(de_A_vs_B, 10)
Output:
top5 <- rownames(de_A_vs_B)[1:5]
# Violin plots
VlnPlot(sc, features = top5, group.by = "cluster", pt.size = 0.5)
# Feature (UMAP) plots
FeaturePlot(sc, features = top5)
# Heatmap
DoHeatmap(sc, features = top5, group.by = "cluster") + NoLegend()
Violin Plots
#UMap plots
# 1) Scale just the DE genes you want to plot
sc <- ScaleData(
object = sc,
features = top5
)
# 2) Now they live in sc["RNA"](/igheyas/Bioinformatics/wiki/"RNA")@scale.data, so DoHeatmap will find them
DoHeatmap(
object = sc,
features = top5,
group.by = "cluster"
) + NoLegend()
DoHeatmap(
object = sc,
features = top5,
group.by = "cluster",
slot = "data", # pull from NormalizeData() output
assay = "RNA"
) + NoLegend()
BiocManager::install("slingshot")
library(slingshot)
library(SingleCellExperiment)
####If you quit R using q() and then start R again, the previous data in the R console appears to be lost.
B.2 Convert Seurat → SingleCellExperiment
library(Seurat)
# Re‐simulate your toy data
set.seed(123)
n_genes <- 100; cells_per_cluster <- 25
clusters <- rep(c("A","B"), each = cells_per_cluster)
counts <- matrix(rpois(n_genes * length(clusters), 5), nrow=n_genes)
counts[1:10, clusters=="B"] <- rpois(sum(clusters=="B")*10, 20)
rownames(counts) <- paste0("Gene", sprintf("%03d",1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d",1:length(clusters)))
meta <- data.frame(cluster = clusters, row.names = colnames(counts))
# Re‐create Seurat object
sc <- CreateSeuratObject(counts = counts, meta.data = meta, project = "ToySC")
sc <- subset(sc, subset = nFeature_RNA > 5)
sc <- NormalizeData(sc)
sc <- FindVariableFeatures(sc, nfeatures = 20)
sc <- ScaleData(sc, features = VariableFeatures(sc))
sc <- RunPCA(sc, features = VariableFeatures(sc), npcs = 10, approx = FALSE)
sc <- FindNeighbors(sc, dims = 1:10)
sc <- FindClusters(sc, resolution = 0.5)
sc <- RunUMAP(sc, dims = 1:10)
Trajectory and Pseudotime Analysis
# ─────────────────────────────────────────────────────────────────────────────
# B.1 Install & load
# (only need to install once; skip if already done)
if (!requireNamespace("slingshot", quietly = TRUE)) {
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("slingshot")
}
# load packages
library(Seurat) # for as.SingleCellExperiment()
library(SingleCellExperiment) # for the SCE container
library(slingshot) # core trajectory function
library(scater) # for plotUMAP()
library(ggplot2) # for custom ggplot layers
# ─────────────────────────────────────────────────────────────────────────────
# B.2 Convert Seurat → SingleCellExperiment
sce <- as.SingleCellExperiment(sc)
# copy over UMAP coords and original clusters from `sc`
reducedDims(sce)$UMAP <- Embeddings(sc, "umap")
colData(sce)$cluster <- sc$cluster
# quick sanity-checks:
stopifnot("UMAP" %in% names(reducedDims(sce)))
stopifnot("cluster" %in% colnames(colData(sce)))
# ─────────────────────────────────────────────────────────────────────────────
# B.3 Run Slingshot
sce <- slingshot(
sce,
clusterLabels = "cluster",
reducedDim = "UMAP",
start.clus = "A",
end.clus = "B"
)
# ─────────────────────────────────────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────────────────────
# You’re all set!
library(Seurat) # for as.SingleCellExperiment()
library(SingleCellExperiment) # to use the SCE format
library(slingshot) # Slingshot = trajectory inference tool
library(scater) # for plotting (like plotUMAP)
library(ggplot2) # for customizing plots
sce <- as.SingleCellExperiment(sc)
sce <- as.SingleCellExperiment(sc)
reducedDims(sce)$UMAP <- Embeddings(sc, "umap")
reducedDims(sce)$UMAP <- Embeddings(sc, "umap")
reducedDims(sce)$UMAP
colData(sce)$cluster <- sc$cluster
library(slingshot)
# pull out your UMAP embedding
umap <- reducedDims(sce)$UMAP
# make a factor of your clusters
clust_fac <- factor(colData(sce)$cluster, levels = c("A","B"))
# now assign colours, e.g. red for A, blue for B
cols <- c("A"="red","B"="blue")[ as.character(clust_fac) ]
# plot the points
plot(umap,
col = cols,
pch = 16,
asp = 1,
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory, clusters")
# overlay the smooth Slingshot curves in black
lines(sce, lwd = 2, col = "black")
library(slingshot)
Loads the Slingshot package, which you already used to compute trajectories.
umap <- reducedDims(sce)$UMAP
clust_fac <- factor(colData(sce)$cluster, levels = c("A", "B"))
cols <- c("A"="red", "B"="blue")[as.character(clust_fac)]
cols <- c("A" = "red", "B" = "blue")[c("A", "A", ..., "B")]
plot(umap,
col = cols,
pch = 16,
asp = 1,
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory, clusters")
lines(sce, lwd = 2, col = "black")
Plot Pseudotime
# 1) Run Slingshot (you’ve done this)
sce <- slingshot(
sce,
clusterLabels = "cluster",
reducedDim = "UMAP",
start.clus = "A",
end.clus = "B"
)
# 2) Extract UMAP coords and pseudotime
umap <- reducedDims(sce)$UMAP # this creates the 'umap' matrix
pt <- slingPseudotime(sce)[,1] # your pseudotime vector
# 3) Plot cells colored by pseudotime
library(viridis) # for a nice continuous palette
cols <- viridis(100)[ cut(pt, breaks=100) ]
plot(umap,
col = cols,
pch = 16,
asp = 1,
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory (pseudotime)")
# 4) Overlay the inferred lineage curve
lines(sce, lwd = 2, col = "black")
umap <- reducedDims(sce)$UMAP
pt <- slingPseudotime(sce)[,1]
library(viridis)
cols <- viridis(100)[ cut(pt, breaks = 100) ]
viridis(100)[cut(pt, breaks = 100)]
plot(umap,
col = cols, # pseudotime-based color
pch = 16, # solid dots
asp = 1, # equal axes
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory (pseudotime)")
lines(sce, lwd = 2, col = "black") # lineage curve