Multi‐Omics and Batch Integration - iffatAGheyas/bioinformatics-tutorial-wiki GitHub Wiki

6.1.8 Multi-Omics & Batch Integration

Modern single-cell experiments often measure multiple modalities (e.g. RNA + surface proteins) or combine data from multiple batches. Properly integrating these datasets increases power and reduces technical artifacts.


A. CITE-seq (RNA + Protein)

Installation (Seurat v4)

# in R: bootstrap BiocManager if it is missing, then install Seurat through it
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("Seurat")

Load 10x CITE-seq output

library(Seurat)

# Point to a Cell Ranger "filtered_feature_bc_matrix" directory
data_dir <- "outs/filtered_feature_bc_matrix/"

# Read both RNA & ADT (antibody-derived tag) matrices.
# For multi-modal output, cite_data is a named list with one matrix per
# feature type: "Gene Expression", "Antibody Capture", etc.
cite_data <- Read10X(data.dir = data_dir)

# Fail early if either modality is missing (e.g. an RNA-only run, where
# Read10X() returns a single matrix rather than a list).
required_modalities <- c("Gene Expression", "Antibody Capture")
stopifnot(
  is.list(cite_data),
  all(required_modalities %in% names(cite_data))
)

# Create Seurat object with RNA.
# `[[` extracts the matrix itself; `[` would return a one-element list.
seurat_obj <- CreateSeuratObject(
  counts  = cite_data[["Gene Expression"]],
  project = "CITE",
  assay   = "RNA"
)

# Add ADT assay (same cell barcodes as the RNA matrix)
adt_assay <- CreateAssayObject(counts = cite_data[["Antibody Capture"]])
seurat_obj[["ADT"]] <- adt_assay

# Normalize RNA & protein separately.
# SCTransform() reads counts from "RNA" and stores its normalized/scaled
# output in a new "SCT" assay.
seurat_obj <- SCTransform(seurat_obj, assay = "RNA", verbose = FALSE)
seurat_obj <- NormalizeData(
  seurat_obj,
  assay = "ADT",
  normalization.method = "CLR"
)
seurat_obj <- ScaleData(seurat_obj, assay = "ADT")

# Joint dimensionality reduction.
# RNA PCA must run on the "SCT" assay: that is where SCTransform() put the
# scaled data and variable features; the "RNA" assay has neither.
seurat_obj <- RunPCA(
  seurat_obj,
  assay          = "SCT",
  reduction.name = "pca.rna",
  reduction.key  = "rnaPC_"
)

# The ADT assay has no variable features, so pass every antibody explicitly.
# Distinct keys keep the two PCA embeddings from clashing on "PC_".
adt_features <- rownames(seurat_obj[["ADT"]])
seurat_obj <- RunPCA(
  seurat_obj,
  assay          = "ADT",
  features       = adt_features,
  reduction.name = "pca.adt",
  reduction.key  = "adtPC_"
)

# Small antibody panels yield fewer than 18 PCs; cap the ADT dims accordingly.
n_adt_dims <- min(18, ncol(Embeddings(seurat_obj, reduction = "pca.adt")))

seurat_obj <- FindMultiModalNeighbors(
  seurat_obj,
  reduction.list       = list("pca.rna", "pca.adt"),
  dims.list            = list(1:30, seq_len(n_adt_dims)),
  modality.weight.name = "RNA.protein.weight"
)
seurat_obj <- RunUMAP(
  seurat_obj,
  nn.name        = "weighted.nn",
  reduction.name = "wnn.umap",
  reduction.key  = "wnnUMAP_"
)
seurat_obj <- FindClusters(
  seurat_obj,
  graph.name = "wsnn",
  algorithm  = 3,      # SLM (use 4 for Leiden, which needs the leidenalg module)
  resolution = 0.5
)

# Visualization
# ggtitle() belongs to ggplot2, which library(Seurat) does not attach, so it
# is called through its namespace.
DimPlot(seurat_obj, reduction = "wnn.umap", group.by = "seurat_clusters") +
  ggplot2::ggtitle("CITE-seq WNN Clustering")

B. Batch Correction
  1. Harmony (R)
# install harmony
BiocManager::install("harmony")
library(harmony)

# assume `seurat_obj` has a `batch` metadata column; stop early if it does not
stopifnot("batch" %in% colnames(seurat_obj[[]]))

# Harmony corrects an existing PCA embedding stored under the name "pca".
# The WNN workflow stores its PCAs as "pca.rna"/"pca.adt", so compute a
# standard PCA on the SCT assay when none is present.
if (!"pca" %in% Reductions(seurat_obj)) {
  seurat_obj <- RunPCA(seurat_obj, assay = "SCT", verbose = FALSE)
}

# The input-reduction argument was renamed across harmony releases
# (`reduction` -> `reduction.use`); its default is "pca" in both, so the
# default is relied on here instead of naming the argument.
# NOTE(review): confirm `assay.use` is still accepted by the installed
# harmony version.
seurat_obj <- RunHarmony(
  object        = seurat_obj,
  group.by.vars = "batch",
  assay.use     = "SCT"
)

# UMAP on Harmony embeddings
seurat_obj <- RunUMAP(
  seurat_obj,
  reduction = "harmony",
  dims      = 1:30,
  assay     = "SCT"
)
DimPlot(seurat_obj, reduction = "umap", split.by = "batch")

  2. Seurat IntegrateData (R)
# Split by batch and SCTransform each
seurat_list <- SplitObject(seurat_obj, split.by = "batch")

# lapply() runs SCTransform on every per-batch object and keeps the list
# names (the batch labels) intact.
seurat_list <- lapply(seurat_list, SCTransform, verbose = FALSE)

# Choose the features shared across batches that will drive integration
features <- SelectIntegrationFeatures(object.list = seurat_list, nfeatures = 3000)

# Get each per-batch object ready for SCT-based anchoring
seurat_list <- PrepSCTIntegration(
  object.list     = seurat_list,
  anchor.features = features
)

# Anchor the batches to one another, then merge them into one object
anchors <- FindIntegrationAnchors(
  object.list          = seurat_list,
  anchor.features      = features,
  normalization.method = "SCT"
)
seurat_int <- IntegrateData(anchorset = anchors, normalization.method = "SCT")

# Embed the integrated data (PCA, then UMAP) and colour cells by batch
seurat_int <- RunPCA(seurat_int, verbose = FALSE)
seurat_int <- RunUMAP(seurat_int, dims = 1:30)
DimPlot(seurat_int, group.by = "batch")

C. Multi-ome Integration

  1. MOFA2 (R)

# install MOFA2
BiocManager::install("MOFA2")
library(MOFA2)

# Prepare named list of modalities (e.g. RNA & protein).
# Each entry is a dense matrix taken from the assay's `counts` slot.
# NOTE(review): these are raw counts and as.matrix() densifies the full RNA
# matrix; consider normalized data restricted to variable features - confirm
# against the MOFA2 data-preparation guidance.
data_list <- list(
  RNA = as.matrix(seurat_obj@assays$RNA@counts),
  ADT = as.matrix(seurat_obj@assays$ADT@counts)
)

# Create and run MOFA model.
# prepare_mofa() fills in the data/model/training options (defaults here);
# run_mofa() refuses to train an object that has not been prepared.
mofa_obj <- create_mofa(data_list)
mofa_obj <- prepare_mofa(mofa_obj)
mofa_obj <- run_mofa(mofa_obj)

# Extract factors
factors <- get_factors(mofa_obj, factors = "all")
plot_factors(mofa_obj, factors = 1:2, color_by = seurat_obj$batch)

  2. Seurat WNN (R) (see CITE-seq section above—WNN is Seurat’s built-in multi-omic integration)
# Once FindMultiModalNeighbors has been run, cluster on the "wsnn" graph
# (weighted shared nearest neighbors) it produced.
seurat_obj <- FindClusters(
  seurat_obj,
  resolution = 0.5,
  algorithm  = 3,
  graph.name = "wsnn"
)

# Show the labelled clusters on the WNN UMAP
DimPlot(seurat_obj, label = TRUE, reduction = "wnn.umap")

Key points:

  • CITE-seq leverages ADT counts to refine clusters beyond RNA alone.

  • Harmony and IntegrateData both correct batch effects—Harmony works “in‐place” on embeddings, while Seurat’s integration creates a new assay.

  • MOFA2 uncovers shared latent factors across modalities; Seurat WNN uses a graph‐based approach for joint clustering.

  • Always inspect batch mixing (e.g. UMAP colored by batch) and modality contributions to ensure successful integration.