Tutorial 1: 10X Visium (DLPFC dataset) - cmzuo11/stClinic GitHub Wiki
Here, we present our re-analysis of four slices (151673-151676) from the human dorsolateral prefrontal cortex (DLPFC) dataset. Maynard et al. manually annotated the DLPFC layers and white matter (WM) based on morphological features and gene markers. The processed data can be accessed at https://github.com/cmzuo11/stClinic/tree/main/Datasets/DLPFC, where each slice, annotated with clusters, is included in its respective folder.
This tutorial demonstrates how to integrate and identify spatial domains in 10X Visium data using our unsupervised stClinic model. If you are interested in using the trained models presented in our paper, please visit the following link: https://github.com/cmzuo11/stClinic/blob/main/Datasets/DLPFC/integrated_adata_151673_151674_151675_151676.h5ad.
Preparation
import os
import anndata
import scanpy as sc
import random
import torch
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import adjusted_rand_score as ari_score
from sklearn.metrics import normalized_mutual_info_score as nmi_score
from sklearn.metrics import silhouette_score as s_score
import stClinic as stClinic
warnings.filterwarnings("ignore")
# Set R environment
os.environ['R_HOME'] = '/sibcb/program/install/r-4.0/lib64/R'
os.environ['R_USER'] = '/sibcb1/chenluonanlab8/zuochunman/anaconda3/envs/stClinic-test/lib/python3.8/site-packages/rpy2'
#Set seed
seed = 666
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
#Set parameters
used_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
parser = stClinic.parameter_setting()
args = parser.parse_args()
args.out_dir = args.input_dir + 'stClinic/'
Path(args.out_dir).mkdir(parents=True, exist_ok=True)
Load data
section_ids = ['151673','151674','151675','151676']
Batch_list = []
adj_list = []
for idx, section_id in enumerate(section_ids):
# Read h5 file
input_dir = os.path.join(args.input_dir, section_id)
adata = sc.read_visium(path=input_dir, count_file='filtered_feature_bc_matrix.h5', load_images=True)
adata.var_names_make_unique(join="++")
# Read corresponding annotation file
Ann_df = pd.read_csv(os.path.join(input_dir, section_id + '_annotation.txt'), sep='\t', header=0, index_col=0)
Ann_df.loc[Ann_df['Layer'].isna(),'Layer'] = "unknown"
adata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Layer'].astype('category')
adata.obs['batch_name_idx'] = idx
# Make spot name unique
adata.obs_names = [x+'_'+section_id for x in adata.obs_names]
# Construct intra-edges
stClinic.Cal_Spatial_Net(adata, rad_cutoff=args.rad_cutoff)
# Normalization
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=args.n_top_genes)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata = adata[:, adata.var['highly_variable']]
sc.tl.pca(adata, n_comps=10, random_state=seed)
adj_list.append(adata.uns['adj'])
Batch_list.append(adata)
# Concat scanpy objects
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
adata_concat.obs['Ground Truth'] = adata_concat.obs['Ground Truth'].astype('category')
adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
Construct unified graph
# mnn_dict = stClinic.create_dictionary_mnn(adata_concat, use_rep='X_pca', batch_name='batch_name', k=1) # k=0
adj_concat = stClinic.inter_linked_graph(adj_list, section_ids, mnn_dict=None)
adata_concat.uns['adj'] = adj_concat
adata_concat.uns['edgeList'] = np.nonzero(adj_concat)
Run stClinic
centroids_num = 7
adata_concat = stClinic.train_stClinic_model(adata_concat, n_centroids=centroids_num, lr=args.lr_integration, device=used_device)
Note that centroids_num represents the number of GMM components, which we have defined as 7. If this value is unknown, it can be calculated using the function: centroids_num = estimate_k(adata_concat.X.T)
adata_concat.obsm['stClinic'] contains the spot/cell embeddings learned using the unsupervised stClinic method.
Clustering and visualization
#Clustering by mclust algorithm
stClinic.mclust_R(adata_concat, num_cluster=len(np.unique(adata_concat.obs[adata_concat.obs['Ground Truth']!='unknown']['Ground Truth'])), used_obsm='stClinic')
adata_concat = adata_concat[adata_concat.obs['Ground Truth']!='unknown']
colors = ["#6D1A9B", "#CB79A6", "#7494D2", "#59BD85", "#56B3E8", "#FDB815", "#F46867"]
Batch_list1 = []
for section_id in section_ids:
Batch_list1.append(adata_concat[adata_concat.obs['batch_name'] == section_id])
spot_size = 200
title_size = 12
ARI_list = []
for bb in range(4):
ARI_list.append(round(ari_score(Batch_list1[bb].obs['Ground Truth'], Batch_list1[bb].obs['mclust']), 2))
fig, ax = plt.subplots(1, 4, figsize=(10, 5), gridspec_kw={'wspace': 0.05, 'hspace': 0.1})
_sc_0 = sc.pl.spatial(Batch_list1[0], img_key=None, color=['mclust'], title=[''],spot_size=200, legend_loc=None,
legend_fontsize=12, show=False, ax=ax[0], frameon=False, palette=colors)
_sc_0[0].set_title("ARI=" + str(ARI_list[0]), size=12)
_sc_1 = sc.pl.spatial(Batch_list1[1], img_key=None, color=['mclust'], title=[''],spot_size=200, legend_loc=None,
legend_fontsize=12, show=False, ax=ax[1], frameon=False, palette=colors)
_sc_1[0].set_title("ARI=" + str(ARI_list[1]), size=12)
_sc_2 = sc.pl.spatial(Batch_list1[2], img_key=None, color=['mclust'], title=[''],spot_size=200, legend_loc=None,
legend_fontsize=12, show=False, ax=ax[2], frameon=False, palette=colors)
_sc_2[0].set_title("ARI=" + str(ARI_list[2]), size=12)
_sc_3 = sc.pl.spatial(Batch_list1[3], img_key=None, color=['mclust'], title=[''],spot_size=200, legend_loc=None,
legend_fontsize=12, show=False, ax=ax[3], frameon=False, palette=colors)
_sc_3[0].set_title("ARI=" + str(ARI_list[3]), size=12)
adata_concat.obs['mclust'] includes the cluster lables predicted by mclust algorithm
UMAP reduction and visualization
sc.pp.neighbors(adata_concat, use_rep='stClinic', random_state=seed)
sc.tl.umap(adata_concat, random_state=seed)
slice_color = ["#FF0000", "#00FF00", "#0000FF", "#FFFF00"]
with rc_context({'figure.figsize': (5, 5)}):
sc.pl.umap(adata_concat, color='batch_name',
palette=slice_color,
add_outline=True, s=15,
legend_fontsize=12,
frameon=False)
sc.pl.umap(adata_concat, color='Ground Truth',
palette=colors,
legend_loc='on data', legend_fontoutline=2,
add_outline=True, s=15,
outline_width=(0.8, 0.05),
legend_fontsize=12,
frameon=False)
sc.pl.umap(adata_concat, color='mclust',
palette=colors,
legend_loc='on data', legend_fontoutline=2,
add_outline=True, s=15,
outline_width=(0.8, 0.05),
legend_fontsize=12,
frameon=False)