Single Cell Omics - igheyas/Bioinformatics GitHub Wiki
If youβre looking to completely tear down the conda environment you just created (i.e. scenv), you can do:
# 1. Deactivate it (if you havenβt already)
conda deactivate
# 2. Remove the entire env (all packages, metadata, etc.)
conda env remove -n scenv
- Environment setup Open your Ubuntu shell and (optionally) create a fresh conda env:
cd /mnt/c/Users/IAGhe/OneDrive/Documents/Bioinformatics
conda create -n scenv -c conda-forge \
r-base=4.3 r-essentials \
r-seurat r-ggplot2 r-dplyr r-cowplot -y
conda activate scenv
Then launch R:
R
2. Simulate toy scRNA-seq data
Below we simulate:
100 genes
50 cells split into two βclustersβ of 25 each
Poisson counts with different means in the two groups
library(Seurat)
set.seed(123)
# ---- parameters ----
n_genes <- 100
cells_per_cluster <- 25
clusters <- rep(c("A","B"), each = cells_per_cluster)
n_cells <- length(clusters)
# ---- simulate counts ----
# baseline expression ~ Poisson(5)
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
# bump up expression of genes 1:10 in cluster B
counts[1:10, clusters=="B"] <-
rpois(sum(clusters=="B") * 10, lambda = 20)
# give the genes & cells names
rownames(counts) <- paste0("Gene", sprintf("%03d", 1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d", 1:n_cells))
# metadata
meta <- data.frame(
row.names = colnames(counts),
cluster = clusters
)
# Inspect
head(counts)[,1:6]
#> Cell01 Cell02 Cell03 Cell04 Cell05 Cell06
#> Gene001 4 8 6 7 3 6
#> Gene002 4 5 7 6 2 5
#> Gene003 5 7 7 6 6 4
#> Gene004 3 6 5 5 4 8
#> Gene005 8 7 1 5 6 7
#> Gene006 10 3 9 2 11 6
head(meta)
#> cluster
#> Cell01 A
#> Cell02 A
#> Cell03 A
#> Cell04 A
#> Cell05 A
#> Cell06 A
- Build a Seurat object & run the basic pipeline
# 3.1 Create Seurat object
sc <- CreateSeuratObject(
counts = counts,
meta.data = meta,
project = "ToySC"
)
# 3.2 Quality control (skip mitochondrial for toy data)
sc <- subset(sc, subset = nFeature_RNA > 5)
# 3.3 Normalize
sc <- NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = 1e4)
# 3.4 Find highly variable features
sc <- FindVariableFeatures(
object = sc,
selection.method = "vst",
nfeatures = 20
)
# 3.5 Scale (regress out no covariates here)
sc <- ScaleData(
object = sc,
features = VariableFeatures(sc)
)
#3.6. Run PCA on just those genes, asking for e.g. 10 components,
# and use the full (lapack) SVD if you like (approx = FALSE).
sc <- RunPCA(
object = sc,
features = VariableFeatures(sc),
npcs = 10,
approx = FALSE, # forces a standard SVD instead of irlba
verbose = FALSE # suppresses progress messages
)
# Check your top PCs
ElbowPlot(sc, ndims = 10)
# 3.7 Neighbors & clustering
sc <- FindNeighbors(sc, dims = 1:10)
sc <- FindClusters(sc, resolution = 0.5)
# 3.8 UMAP embedding
sc <- RunUMAP(sc, dims = 1:10)
DimPlot(sc, label = TRUE)
- Visualize and inspect
library(cowplot)
library(ggplot2)
# 1. True labels (from your original meta$cluster)
p1 <- DimPlot(sc, group.by="cluster", pt.size=2) +
labs(title="True labels")
#print(p1)
# 2. Seuratβinferred clusters (seurat_clusters)
p2 <- DimPlot(
object = sc,
label = TRUE,
pt.size = 2
) + labs(title = "Seurat clusters")
# then to draw it
#p2
# or
#print(p2)
# 3. Combine sideβbyβside
cowplot::plot_grid(p1, p2, ncol = 2)
What weβve learned Counts matrix: genesΓcells raw integer counts.
CreateSeuratObject: bundling counts + metadata.
QC / Filtering: remove lowβquality cells (here very simply by feature count).
Normalization: logβnormalize to make cells comparable.
Feature selection: pick highly variable genes.
Dimension reduction: PCA β UMAP to visualize.
Clustering: graphβbased to recover cell groups.
From here youβd move on to differential expression, cellβtype annotation, trajectory analysis, etc. But this tiny toy example shows the core workflow on your own laptop.
5. Differential expression between clusters
5.1. Make sure your identities are set to the βtrueβ clusters
By default, Seuratβs Idents(sc) is your graphβbased seurat_clusters (0/1). If youβd rather test on your original labels βAβ vs βBβ, run:
# set active identities to your simulated truth
Idents(sc) <- sc$cluster
You can check:
table(Idents(sc))
# A B
# 25 25
5.2. Run FindMarkers
Use Seuratβs builtβin Wilcoxon rankβsum test to find genes up in A vs B:
de_A_vs_B <- FindMarkers(
object = sc,
ident.1 = "A", # first group
ident.2 = "B", # second group
min.pct = 0.1, # only test genes detected in β₯10% of cells
logfc.threshold = 0.25 # only report genes with log2FC β₯ 0.25
)
# Inspect the top hits
head(de_A_vs_B, n = 10)
# p_val avg_log2FC pct.1 pct.2 p_val_adj
# Gene003 1.2e-08 1.95 1.00 0.04 1.0e-07
# Gene008 3.5e-07 1.80 1.00 0.08 2.9e-06
# β¦etcβ¦
Columns explained:
-
p_val: raw Wilcoxon p-value
-
avg_log2FC: mean(log2(Aβ+β1)) β mean(log2(Bβ+β1))
-
pct.1 / pct.2: fraction of cells in A/B where the gene is detected
-
p_val_adj: Bonferroniβadjusted p-value
5.3. Visualize top markers
Pick the top 5 genes by smallest adjusted p-value:
top5 <- rownames(de_A_vs_B)[1:5]
5.3.1. Violin plots
VlnPlot(
object = sc,
features = top5,
group.by = "cluster", # still has your original A/B in meta
pt.size = 0.5
)
5.3.2. Feature (UMAP) plots
FeaturePlot(
object = sc,
features = top5
)
5.3.3. Heatmap of top markers
# 1) Scale just the DE genes you want to plot
sc <- ScaleData(
object = sc,
features = top5
)
# 2) Now they live in sc["RNA"](/igheyas/Bioinformatics/wiki/"RNA")@scale.data, so DoHeatmap will find them
DoHeatmap(
object = sc,
features = top5,
group.by = "cluster"
) + NoLegend()
R
# if you havenβt yet installed BiocManager:
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
type q() to quit R
Install via conda (still at Bash)
(scenv) $ conda install -c bioconda bioconductor-slingshot -y
Trajectory and Pseudotime Analysis
create a new virtual environment:
conda deactivate
conda create -n sc44 \
-c conda-forge \
-c bioconda \
r-base=4.4 \
r-essentials \
r-seurat \
r-ggplot2 \
r-cairo \
r-ggrastr \
r-cowplot \
bioconductor-slingshot \
bioconductor-singlecellexperiment \
bioconductor-scater \
-y
cd /mnt/c/Users/IAGhe/OneDrive/Documents/Bioinformatics
conda activate sc44
R
Inside R you can then do:
library(Seurat)
library(slingshot)
library(SingleCellExperiment)
library(scater)
library(cowplot)
# β¦your conversion + slingshot + plotting codeβ¦
# In your R console (scenv active):
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
# Install the missing dependency
BiocManager::install("DelayedMatrixStats")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# B.1 Install & load
# (only need to install once; skip if already done)
if (!requireNamespace("slingshot", quietly = TRUE)) {
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("slingshot")
}
# load packages
library(Seurat) # for as.SingleCellExperiment()
library(SingleCellExperiment) # for the SCE container
library(slingshot) # core trajectory function
library(scater) # for plotUMAP()
library(ggplot2) # for custom ggplot layers
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# B.2 Convert Seurat β SingleCellExperiment
sce <- as.SingleCellExperiment(sc)
# copy over UMAP coords and original clusters from `sc`
reducedDims(sce)$UMAP <- Embeddings(sc, "umap")
colData(sce)$cluster <- sc$cluster
# quick sanity-checks:
stopifnot("UMAP" %in% names(reducedDims(sce)))
stopifnot("cluster" %in% colnames(colData(sce)))
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# B.3 Run Slingshot
sce <- slingshot(
sce,
clusterLabels = "cluster",
reducedDim = "UMAP",
start.clus = "A",
end.clus = "B"
)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Youβre all set!
The following will actually draw your 50 cells plus the inferred lineage curve..you can plot the lineages with base graphics:
library(slingshot)
# pull out your UMAP embedding
umap <- reducedDims(sce)$UMAP
# make a factor of your clusters
clust_fac <- factor(colData(sce)$cluster, levels = c("A","B"))
# now assign colours, e.g. red for A, blue for B
cols <- c("A"="red","B"="blue")[ as.character(clust_fac) ]
# plot the points
plot(umap,
col = cols,
pch = 16,
asp = 1,
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory, clusters")
# overlay the smooth Slingshot curves in black
lines(sce, lwd = 2, col = "black")
###If youβd rather plot pseudotime:
# 1) Run Slingshot (youβve done this)
sce <- slingshot(
sce,
clusterLabels = "cluster",
reducedDim = "UMAP",
start.clus = "A",
end.clus = "B"
)
# 2) Extract UMAP coords and pseudotime
umap <- reducedDims(sce)$UMAP # this creates the 'umap' matrix
pt <- slingPseudotime(sce)[,1] # your pseudotime vector
# 3) Plot cells colored by pseudotime
library(viridis) # for a nice continuous palette
cols <- viridis(100)[ cut(pt, breaks=100) ]
plot(umap,
col = cols,
pch = 16,
asp = 1,
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory (pseudotime)")
# 4) Overlay the inferred lineage curve
lines(sce, lwd = 2, col = "black")
That will produce the UMAP scatter of your 50 toy cells, colored by pseudotime, with the black Slingshot curve on top
conda activate scenv # or create a new py env
pip install scvelo scanpy
#C.2 Basic scVelo workflow #Toy data (R)
library(Seurat)
set.seed(123)
# ---- parameters ----
n_genes <- 100
cells_per_cluster <- 25
clusters <- rep(c("A","B"), each = cells_per_cluster)
n_cells <- length(clusters)
# ---- simulate counts ----
# baseline expression ~ Poisson(5)
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
# bump up expression of genes 1:10 in cluster B
counts[1:10, clusters=="B"] <-
rpois(sum(clusters=="B") * 10, lambda = 20)
# give the genes & cells names
rownames(counts) <- paste0("Gene", sprintf("%03d", 1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d", 1:n_cells))
# metadata
meta <- data.frame(
row.names = colnames(counts),
cluster = clusters
)
# Inspect
head(counts)[,1:6]
#> Cell01 Cell02 Cell03 Cell04 Cell05 Cell06
#> Gene001 4 8 6 7 3 6
#> Gene002 4 5 7 6 2 5
#> Gene003 5 7 7 6 6 4
#> Gene004 3 6 5 5 4 8
#> Gene005 8 7 1 5 6 7
#> Gene006 10 3 9 2 11 6
head(meta)
#> cluster
#> Cell01 A
#> Cell02 A
#> Cell03 A
#> Cell04 A
#> Cell05 A
#> Cell06 A
In R, dump your toy matrix and metadata to disk:
write.csv(counts, "counts.csv", quote=FALSE)
write.csv(meta, "meta.csv", quote=FALSE)
#Write out as a .py using IPython magic In a new notebook cell, put at the very top:
If youβre looking to completely tear down the conda environment you just created (i.e. scenv), you can do:
```bash
# 1. Deactivate it (if you havenβt already)
conda deactivate
# 2. Remove the entire env (all packages, metadata, etc.)
conda env remove -n scenv
- Environment setup Open your Ubuntu shell and (optionally) create a fresh conda env:
cd /mnt/c/Users/IAGhe/OneDrive/Documents/Bioinformatics
conda create -n scenv -c conda-forge \
r-base=4.3 r-essentials \
r-seurat r-ggplot2 r-dplyr r-cowplot -y
conda activate scenv
Then launch R:
R
2. Simulate toy scRNA-seq data
Below we simulate:
100 genes
50 cells split into two βclustersβ of 25 each
Poisson counts with different means in the two groups
library(Seurat)
set.seed(123)
# ---- parameters ----
n_genes <- 100
cells_per_cluster <- 25
clusters <- rep(c("A","B"), each = cells_per_cluster)
n_cells <- length(clusters)
# ---- simulate counts ----
# baseline expression ~ Poisson(5)
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
# bump up expression of genes 1:10 in cluster B
counts[1:10, clusters=="B"] <-
rpois(sum(clusters=="B") * 10, lambda = 20)
# give the genes & cells names
rownames(counts) <- paste0("Gene", sprintf("%03d", 1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d", 1:n_cells))
# metadata
meta <- data.frame(
row.names = colnames(counts),
cluster = clusters
)
# Inspect
head(counts)[,1:6]
#> Cell01 Cell02 Cell03 Cell04 Cell05 Cell06
#> Gene001 4 8 6 7 3 6
#> Gene002 4 5 7 6 2 5
#> Gene003 5 7 7 6 6 4
#> Gene004 3 6 5 5 4 8
#> Gene005 8 7 1 5 6 7
#> Gene006 10 3 9 2 11 6
head(meta)
#> cluster
#> Cell01 A
#> Cell02 A
#> Cell03 A
#> Cell04 A
#> Cell05 A
#> Cell06 A
- Build a Seurat object & run the basic pipeline
# 3.1 Create Seurat object
sc <- CreateSeuratObject(
counts = counts,
meta.data = meta,
project = "ToySC"
)
# 3.2 Quality control (skip mitochondrial for toy data)
sc <- subset(sc, subset = nFeature_RNA > 5)
# 3.3 Normalize
sc <- NormalizeData(sc, normalization.method = "LogNormalize", scale.factor = 1e4)
# 3.4 Find highly variable features
sc <- FindVariableFeatures(
object = sc,
selection.method = "vst",
nfeatures = 20
)
# 3.5 Scale (regress out no covariates here)
sc <- ScaleData(
object = sc,
features = VariableFeatures(sc)
)
#3.6. Run PCA on just those genes, asking for e.g. 10 components,
# and use the full (lapack) SVD if you like (approx = FALSE).
sc <- RunPCA(
object = sc,
features = VariableFeatures(sc),
npcs = 10,
approx = FALSE, # forces a standard SVD instead of irlba
verbose = FALSE # suppresses progress messages
)
# Check your top PCs
ElbowPlot(sc, ndims = 10)
# 3.7 Neighbors & clustering
sc <- FindNeighbors(sc, dims = 1:10)
sc <- FindClusters(sc, resolution = 0.5)
# 3.8 UMAP embedding
sc <- RunUMAP(sc, dims = 1:10)
DimPlot(sc, label = TRUE)
- Visualize and inspect
library(cowplot)
library(ggplot2)
# 1. True labels (from your original meta$cluster)
p1 <- DimPlot(sc, group.by="cluster", pt.size=2) +
labs(title="True labels")
#print(p1)
# 2. Seuratβinferred clusters (seurat_clusters)
p2 <- DimPlot(
object = sc,
label = TRUE,
pt.size = 2
) + labs(title = "Seurat clusters")
# then to draw it
#p2
# or
#print(p2)
# 3. Combine sideβbyβside
cowplot::plot_grid(p1, p2, ncol = 2)
What weβve learned Counts matrix: genesΓcells raw integer counts.
CreateSeuratObject: bundling counts + metadata.
QC / Filtering: remove lowβquality cells (here very simply by feature count).
Normalization: logβnormalize to make cells comparable.
Feature selection: pick highly variable genes.
Dimension reduction: PCA β UMAP to visualize.
Clustering: graphβbased to recover cell groups.
From here youβd move on to differential expression, cellβtype annotation, trajectory analysis, etc. But this tiny toy example shows the core workflow on your own laptop.
5. Differential expression between clusters
5.1. Make sure your identities are set to the βtrueβ clusters
By default, Seuratβs Idents(sc) is your graphβbased seurat_clusters (0/1). If youβd rather test on your original labels βAβ vs βBβ, run:
# set active identities to your simulated truth
Idents(sc) <- sc$cluster
You can check:
table(Idents(sc))
# A B
# 25 25
5.2. Run FindMarkers
Use Seuratβs builtβin Wilcoxon rankβsum test to find genes up in A vs B:
de_A_vs_B <- FindMarkers(
object = sc,
ident.1 = "A", # first group
ident.2 = "B", # second group
min.pct = 0.1, # only test genes detected in β₯10% of cells
logfc.threshold = 0.25 # only report genes with log2FC β₯ 0.25
)
# Inspect the top hits
head(de_A_vs_B, n = 10)
# p_val avg_log2FC pct.1 pct.2 p_val_adj
# Gene003 1.2e-08 1.95 1.00 0.04 1.0e-07
# Gene008 3.5e-07 1.80 1.00 0.08 2.9e-06
# β¦etcβ¦
Columns explained:
-
p_val: raw Wilcoxon p-value
-
avg_log2FC: mean(log2(Aβ+β1)) β mean(log2(Bβ+β1))
-
pct.1 / pct.2: fraction of cells in A/B where the gene is detected
-
p_val_adj: Bonferroniβadjusted p-value
5.3. Visualize top markers
Pick the top 5 genes by smallest adjusted p-value:
top5 <- rownames(de_A_vs_B)[1:5]
5.3.1. Violin plots
VlnPlot(
object = sc,
features = top5,
group.by = "cluster", # still has your original A/B in meta
pt.size = 0.5
)
5.3.2. Feature (UMAP) plots
FeaturePlot(
object = sc,
features = top5
)
5.3.3. Heatmap of top markers
# 1) Scale just the DE genes you want to plot
sc <- ScaleData(
object = sc,
features = top5
)
# 2) Now they live in sc["RNA"](/igheyas/Bioinformatics/wiki/"RNA")@scale.data, so DoHeatmap will find them
DoHeatmap(
object = sc,
features = top5,
group.by = "cluster"
) + NoLegend()
R
# if you havenβt yet installed BiocManager:
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
type q() to quit R
Install via conda (still at Bash)
(scenv) $ conda install -c bioconda bioconductor-slingshot -y
Trajectory and Pseudotime Analysis
create a new virtual environment:
conda deactivate
conda create -n sc44 \
-c conda-forge \
-c bioconda \
r-base=4.4 \
r-essentials \
r-seurat \
r-ggplot2 \
r-cairo \
r-ggrastr \
r-cowplot \
bioconductor-slingshot \
bioconductor-singlecellexperiment \
bioconductor-scater \
-y
cd /mnt/c/Users/IAGhe/OneDrive/Documents/Bioinformatics
conda activate sc44
R
Inside R you can then do:
library(Seurat)
library(slingshot)
library(SingleCellExperiment)
library(scater)
library(cowplot)
# β¦your conversion + slingshot + plotting codeβ¦
# In your R console (scenv active):
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
# Install the missing dependency
BiocManager::install("DelayedMatrixStats")
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# B.1 Install & load
# (only need to install once; skip if already done)
if (!requireNamespace("slingshot", quietly = TRUE)) {
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("slingshot")
}
# load packages
library(Seurat) # for as.SingleCellExperiment()
library(SingleCellExperiment) # for the SCE container
library(slingshot) # core trajectory function
library(scater) # for plotUMAP()
library(ggplot2) # for custom ggplot layers
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# B.2 Convert Seurat β SingleCellExperiment
sce <- as.SingleCellExperiment(sc)
# copy over UMAP coords and original clusters from `sc`
reducedDims(sce)$UMAP <- Embeddings(sc, "umap")
colData(sce)$cluster <- sc$cluster
# quick sanity-checks:
stopifnot("UMAP" %in% names(reducedDims(sce)))
stopifnot("cluster" %in% colnames(colData(sce)))
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# B.3 Run Slingshot
sce <- slingshot(
sce,
clusterLabels = "cluster",
reducedDim = "UMAP",
start.clus = "A",
end.clus = "B"
)
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Youβre all set!
The following will actually draw your 50 cells plus the inferred lineage curve..you can plot the lineages with base graphics:
library(slingshot)
# pull out your UMAP embedding
umap <- reducedDims(sce)$UMAP
# make a factor of your clusters
clust_fac <- factor(colData(sce)$cluster, levels = c("A","B"))
# now assign colours, e.g. red for A, blue for B
cols <- c("A"="red","B"="blue")[ as.character(clust_fac) ]
# plot the points
plot(umap,
col = cols,
pch = 16,
asp = 1,
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory, clusters")
# overlay the smooth Slingshot curves in black
lines(sce, lwd = 2, col = "black")
###If youβd rather plot pseudotime:
# 1) Run Slingshot (youβve done this)
sce <- slingshot(
sce,
clusterLabels = "cluster",
reducedDim = "UMAP",
start.clus = "A",
end.clus = "B"
)
# 2) Extract UMAP coords and pseudotime
umap <- reducedDims(sce)$UMAP # this creates the 'umap' matrix
pt <- slingPseudotime(sce)[,1] # your pseudotime vector
# 3) Plot cells colored by pseudotime
library(viridis) # for a nice continuous palette
cols <- viridis(100)[ cut(pt, breaks=100) ]
plot(umap,
col = cols,
pch = 16,
asp = 1,
xlab = "UMAP1",
ylab = "UMAP2",
main = "Slingshot Trajectory (pseudotime)")
# 4) Overlay the inferred lineage curve
lines(sce, lwd = 2, col = "black")
That will produce the UMAP scatter of your 50 toy cells, colored by pseudotime, with the black Slingshot curve on top
conda activate scenv # or create a new py env
pip install scvelo scanpy
#C.2 Basic scVelo workflow #Toy data (R)
library(Seurat)
set.seed(123)
# ---- parameters ----
n_genes <- 100
cells_per_cluster <- 25
clusters <- rep(c("A","B"), each = cells_per_cluster)
n_cells <- length(clusters)
# ---- simulate counts ----
# baseline expression ~ Poisson(5)
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
# bump up expression of genes 1:10 in cluster B
counts[1:10, clusters=="B"] <-
rpois(sum(clusters=="B") * 10, lambda = 20)
# give the genes & cells names
rownames(counts) <- paste0("Gene", sprintf("%03d", 1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d", 1:n_cells))
# metadata
meta <- data.frame(
row.names = colnames(counts),
cluster = clusters
)
# Inspect
head(counts)[,1:6]
#> Cell01 Cell02 Cell03 Cell04 Cell05 Cell06
#> Gene001 4 8 6 7 3 6
#> Gene002 4 5 7 6 2 5
#> Gene003 5 7 7 6 6 4
#> Gene004 3 6 5 5 4 8
#> Gene005 8 7 1 5 6 7
#> Gene006 10 3 9 2 11 6
head(meta)
#> cluster
#> Cell01 A
#> Cell02 A
#> Cell03 A
#> Cell04 A
#> Cell05 A
#> Cell06 A
In R, dump your toy matrix and metadata to disk:
write.csv(counts, "counts.csv", quote=FALSE)
write.csv(meta, "meta.csv", quote=FALSE)
#Write out as a .py using IPython magic In a new notebook cell, put at the very top:
#!/usr/bin/env python3
import pandas as pd
import scanpy as sc
import scvelo as scv
import numpy as np
# 0) scVelo style
scv.settings.verbosity = 3
scv.settings.set_figure_params('scvelo', figsize=(5,5))
# 1) Load toy data
counts = pd.read_csv("counts.csv", index_col=0)
meta = pd.read_csv("meta.csv", index_col=0)
# 2) Build AnnData (cells Γ genes)
adata = sc.AnnData(
X = counts.T.values,
obs = meta,
var = pd.DataFrame(index=counts.index)
)
# 3) Fake velocity layers with a little signal
np.random.seed(0)
adata.layers["spliced"] = adata.X.copy()
adata.layers["unspliced"] = adata.X * 0.8 + np.random.normal(scale=0.5, size=adata.X.shape)
# 4) Filter & normalize (note the PLURAL argument names)
scv.pp.filter_and_normalize(
adata,
min_shared_counts=5,
n_top_genes =50,
layers = ["spliced"],
layers_norm = ["unspliced"]
)
# 5) DimRed + neighbors
sc.tl.pca(adata, n_comps=10, svd_solver='arpack')
sc.pp.neighbors(adata, n_pcs=10, n_neighbors=15)
sc.tl.umap(adata)
# 6) Moments smoothing
scv.pp.moments(adata, n_pcs=10, n_neighbors=15)
# 7) Velocity & velocity graph
scv.tl.velocity(adata, mode='stochastic')
scv.tl.velocity_graph(adata)
# 8) Plot
sc.pl.umap(adata, color='cluster', legend_loc='right margin')
scv.pl.velocity_embedding_stream(
adata,
basis ='umap',
color ='cluster',
linewidth =1.5,
legend_loc='right margin',
title ='Toy scVelo velocity'
)
# either via conda-forge
conda install -c conda-forge scikit-misc -y
# β¦or via pip
pip install scikit-misc
Once the file exists you can execute it from bash:
python toy_velocity.py
You can put the following line at the top of a notebook cell and run the code, the script will be saved as .py file
%%writefile toy_velocity.py
Then in WSL you can run the .py file
python toy_velocity.py
#Create & activate a fresh Python 3.10 env
cd /mnt/c/Users/IAGhe/OneDrive/Documents/Bioinformatics
# 1. Create & activate
conda create -n scvelo-env -c conda-forge python=3.10 -y
conda activate scvelo-env
# 2. Install everything
conda install -c conda-forge \
scvelo scanpy scikit-misc ipywidgets tqdm matplotlib jupyterlab \
-y
# 3. (Optional) check imports
python - <<'PYCODE'
import scanpy as sc, scvelo as scv, skmisc
print(sc.__version__, scv.__version__, skmisc.__version__)
PYCODE
# 4. Launch Jupyter Lab
jupyter lab
# 5. Or run your script directly
python toy_velocity.py