single cell omics‐‐velocity - igheyas/Bioinformatics GitHub Wiki
1) Make a brand‐new conda env (Python 3.10)
Open your WSL Ubuntu shell and run:
# 1.1) Deactivate any old env
conda deactivate
cd /mnt/c/Users/IAGhe/OneDrive/Documents/Bioinformatics
# 1.2) Create a new env with Python 3.10
conda create -n scvelo-py310 python=3.10 -y
# 1.3) Activate it
conda activate scvelo-py310
# 1.4) Install everything via pip (these versions are known to work together)
pip install numpy pandas scipy matplotlib seaborn scanpy scvelo anndata jupyterlab
C.2 Basic scVelo workflow -Toy data (R) Toy Dataset
library(Seurat)
set.seed(123)
# ---- parameters ----
n_genes <- 100
cells_per_cluster <- 25
clusters <- rep(c("A","B"), each = cells_per_cluster)
n_cells <- length(clusters)
# ---- simulate counts ----
# baseline expression ~ Poisson(5)
counts <- matrix(
rpois(n_genes * n_cells, lambda = 5),
nrow = n_genes, ncol = n_cells
)
# bump up expression of genes 1:10 in cluster B
counts[1:10, clusters=="B"] <-
rpois(sum(clusters=="B") * 10, lambda = 20)
# give the genes & cells names
rownames(counts) <- paste0("Gene", sprintf("%03d", 1:n_genes))
colnames(counts) <- paste0("Cell", sprintf("%02d", 1:n_cells))
# metadata
meta <- data.frame(
row.names = colnames(counts),
cluster = clusters
)
# Inspect
head(counts)[,1:6]
#> Cell01 Cell02 Cell03 Cell04 Cell05 Cell06
#> Gene001 4 8 6 7 3 6
#> Gene002 4 5 7 6 2 5
#> Gene003 5 7 7 6 6 4
#> Gene004 3 6 5 5 4 8
#> Gene005 8 7 1 5 6 7
#> Gene006 10 3 9 2 11 6
head(meta)
#> cluster
#> Cell01 A
#> Cell02 A
#> Cell03 A
#> Cell04 A
#> Cell05 A
#> Cell06 A
In R, dump your toy matrix and metadata to disk:
write.csv(counts, "counts.csv", quote=FALSE)
write.csv(meta, "meta.csv", quote=FALSE)
conda activate scvelo-env
conda install -c conda-forge notebook -y
jupyter notebook --no-browser --ip=0.0.0.0 --port=8888
Put in a browser URL:
http://127.0.0.1:8888/tree?token=efe55241d07d4978267af586c6c6e65d7d33aee1e4181eb9
!pip install scikit-misc
Once that finishes, restart your notebook kernel (to pick up the new package).
Then in your notebook run the following cell:
%matplotlib inline
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
# 0) scVelo / plotting settings
scv.settings.verbosity = 3
scv.settings.set_figure_params('scvelo', figsize=(6,6), dpi=100)
# 1) Load toy data
counts = pd.read_csv("counts.csv", index_col=0) # 100 genes × 50 cells
meta = pd.read_csv("meta.csv", index_col=0) # 50 cells × 1 column "cluster"
# 2) Build AnnData: cells × genes
adata = sc.AnnData(
X = counts.T.values, # now 50 cells × 100 genes
obs = meta.copy(), # must have obs["cluster"]
var = pd.DataFrame(index=counts.index)
)
adata.obs_names = counts.columns
adata.var_names = counts.index
# 3) Fake velocity layers (unspliced is slightly shifted so the fit isn't degenerate)
np.random.seed(0)
adata.layers["spliced"] = adata.X.copy()
adata.layers["unspliced"] = adata.X * 0.8 + np.random.normal(
scale=0.5, size=adata.X.shape
)
# 4) Gene‐level filtering: keep genes seen ≥5× in **both** layers
mask_sp = np.sum(adata.layers["spliced"], axis=0) >= 5
mask_un = np.sum(adata.layers["unspliced"], axis=0) >= 5
mask = mask_sp & mask_un
adata = adata[:, mask]
# 5) HVG selection via Scanpy (top 50 by dispersion)
sc.pp.highly_variable_genes(adata, n_top_genes=50, flavor="seurat_v3")
adata = adata[:, adata.var["highly_variable"]]
# 6) Per‐cell normalization + log:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# 7) PCA / neighbors / UMAP (Scanpy)
sc.pp.pca(adata, n_comps=10, svd_solver="arpack")
sc.pp.neighbors(adata, n_pcs=10, n_neighbors=15)
sc.tl.umap(adata)
# 8) Compute velocity moments (smoothing)
scv.pp.moments(adata, n_pcs=10, n_neighbors=15)
# 9) Fit velocities & graph
scv.tl.velocity(adata, mode="stochastic")
scv.tl.velocity_graph(adata)
# 10) Plot clusters + streams
sc.pl.umap(
adata,
color="cluster",
legend_loc="right margin",
title="Toy scVelo clusters"
)
scv.pl.velocity_embedding_stream(
adata,
basis="umap",
color="cluster",
linewidth=1.5,
legend_loc="right margin",
title="Toy scVelo velocity"
)
Output:
%matplotlib inline
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
scv.settings.verbosity = 3
scv.settings.set_figure_params('scvelo', figsize=(6,6), dpi=100)
counts = pd.read_csv("counts.csv", index_col=0)
meta = pd.read_csv("meta.csv", index_col=0)
adata = sc.AnnData(
X = counts.T.values,
obs = meta.copy(),
var = pd.DataFrame(index=counts.index)
)
adata.obs_names = counts.columns
##Output
adata.obs_names = counts.columns
<img width="1588" height="824" alt="image" src="https://github.com/user-attachments/assets/93b63fb7-c6bc-4dda-9495-e5362d9b6a47" />
###Output:
adata.var_names = counts.index
# 3) Fake velocity layers (unspliced is slightly shifted so the fit isn't degenerate)
np.random.seed(0)
adata.layers["spliced"] = adata.X.copy()
adata.layers["unspliced"] = adata.X * 0.8 + np.random.normal(
scale=0.5, size=adata.X.shape
)
adata.layers
Output:
np.random.seed(0)
adata.layers["spliced"] = adata.X.copy()
adata.layers["unspliced"] = adata.X * 0.8 + np.random.normal(
scale=0.5, size=adata.X.shape
)
# 4) Gene‐level filtering: keep genes seen ≥5× in **both** layers
mask_sp = np.sum(adata.layers["spliced"], axis=0) >= 5
mask_un = np.sum(adata.layers["unspliced"], axis=0) >= 5
mask = mask_sp & mask_un
adata = adata[:, mask]
##Output:
mask_sp = np.sum(adata.layers["spliced"], axis=0) >= 5
mask_un = np.sum(adata.layers["unspliced"], axis=0) >= 5
mask = mask_sp & mask_un
adata = adata[:, mask]
sc.pp.highly_variable_genes(adata, n_top_genes=50, flavor="seurat_v3")
adata = adata[:, adata.var["highly_variable"]]
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.pca(adata, n_comps=10, svd_solver="arpack")
sc.pp.neighbors(adata, n_pcs=10, n_neighbors=15)
sc.tl.umap(adata)
scv.pp.moments(adata, n_pcs=10, n_neighbors=15)
adata.obsp.keys()
adata.uns.keys()
scv.tl.velocity(adata, mode="stochastic")
scv.tl.velocity_graph(adata)
adata.layers["velocity"]
Output:
adata.layers["velocity"].shape
scv.pl.velocity(adata, var_names=adata.var_names[:5], color="cluster")
scv.pl.velocity_embedding_stream(adata, basis="umap", color="cluster")
Final Outputs:
sc.pl.umap(
adata,
color="cluster",
legend_loc="right margin",
title="Toy scVelo clusters"
)
scv.pl.velocity_embedding_stream(
adata,
basis="umap",
color="cluster",
linewidth=1.5,
legend_loc="right margin",
title="Toy scVelo velocity"
)