Tutorial 4: Application on Zebrafish Melanoma Dataset - yihe-csu/SAGE GitHub Wiki
This tutorial showcases the application of SAGE to a zebrafish melanoma spatial transcriptomics dataset, aiming to assess the cross-species generalizability and spatial domain resolution capabilities of our unsupervised model. We utilize three zebrafish melanoma samples (A, B, and C, Hunter et al., 2021), each accompanied by H&E-stained tissue images and expert-curated spatial annotations, including tumor tissue, normal tissue, and a transitional tumorβmuscle interface. For this tutorial, we focus on Sample B as a representative example to demonstrate SAGEβs ability to delineate biologically meaningful spatial domains, particularly in the interface regionβa biologically relevant boundary that marks the invasive front of the tumor.
The dataset and annotations used here are publicly available from the original study (Hunter et al., 2021). Due to the large size of the processed dataset, we have uploaded it to Figshare: π https://figshare.com/articles/dataset/Zebrafish_melanomas/29493044. Please download the dataset and place it in the appropriate directory before running this tutorial.
Identify HSGs
1. Load SAGE and its dependent packages
import os
import sys
import numpy as np
import pandas as pd
import torch
from sklearn import metrics
import matplotlib.pyplot as plt
import scanpy as sc
import importlib
sys.path.append(os.path.abspath("C://Users//heyi//Desktop/code_iteration/SAGE-main/"))
import SAGE
print(SAGE.__version__)
print(SAGE.__author__)
print(SAGE.__email__)
1.1.8
Yi He
[email protected]
2. Set up the working environment and import data
# Run device, by default, the package is implemented on 'cpu'. We recommend using GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# the location of R, which is necessary for mclust algorithm. Please replace the path below with local R installation path
os.environ['R_HOME'] = 'E:\\R-4.4.1'
base_path = "C:/Users/heyi/Desktop/code_iteration/SAGE-main/"
Dataset = "Zebrafish_melanomas/Sample_B"
file_fold = f'{base_path}/Dataset/{Dataset}/'
# Set directory (If you want to use additional data, please change the file path)
output_dir=f"{base_path}/Result/{Dataset}"
output_process_dir = output_dir + "/process_data"
output_result_dir = output_dir + "/result_data"
if not os.path.exists(output_dir):
os.makedirs(output_dir)
if not os.path.exists(output_process_dir):
os.makedirs(output_process_dir)
if not os.path.exists(output_result_dir):
os.makedirs(output_result_dir)
feature_bc_matrix_path = file_fold + f"\\filtered_feature_bc_matrix.h5"
pos_path = file_fold + f"\\tissue_positions_list.csv.gz"
img_path = file_fold + f"\\image.tif.gz"
adata = sc.read_10x_h5(feature_bc_matrix_path)
positions = pd.read_csv(pos_path, header=None)
positions.columns = ["barcode", "in_tissue", "array_x", "array_y", "image_x", "image_y"]
positions.set_index("barcode", inplace=True)
max_x = positions['image_x'].max()
positions["image_x"] = max_x - positions["image_x"]
positions = positions.loc[adata.obs.index]
adata.obs = adata.obs.join(positions)
adata.obsm["spatial"] = adata.obs["image_x", "image_y"](/yihe-csu/SAGE/wiki/"image_x",-"image_y").values
adata.var_names_make_unique()
adata
AnnData object with n_obs Γ n_vars = 2677 Γ 32268
obs: 'in_tissue', 'array_x', 'array_y', 'image_x', 'image_y'
var: 'gene_ids', 'feature_types', 'genome'
obsm: 'spatial'
3. Expression data preprocessing
adata = adata.copy()
sc.pp.filter_genes(adata, min_cells=5)
sc.pp.filter_genes(adata, min_counts=5)
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=3000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.pca(adata, n_comps=50, mask_var="highly_variable", svd_solver='arpack')
4. Multi-resolution Consensus Clustering
con_method = "leiden" #["leiden","mclust"]
con_range = ( 0.1, 3 , 0.1)
con_use_rep = 'X_pca'
n_neig_coord=6
n_neig_feat =6
con_dim =25
con_radius=20
con_refine=True
SAGE.consensus_clustering(adata,
method=con_method,
resolution_range=con_range,
n_neighbors=n_neig_feat,
use_rep=con_use_rep,
dims=con_dim,
radius=con_radius,
refinement=con_refine)
>>> Starting leiden clustering...
Running leiden clustering: 100%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββ| 29/29 [00:03<00:00, 8.06it/s]
>>> leiden clustering finished. Time elapsed: 10.45 seconds
>>> Computing consensus matrix...
>>> Consensus matrix computed. Time elapsed: 0.63 seconds
>>> Performing final Leiden clustering on consensus matrix...
>>> Consensus clustering completed.
>>> adata.obs['pre_domain'] generated!
>>> adata.obsm['consensus_freq'] generated!
>>> adata.obsm['clusters_results'] generated!
5.Topic Selection via Supervised Learning
SAGE.preprocess.topics_selection(adata, n_topics=30)
>>> Start the genes_mri calculation ...
>>> Finish genes_mri calculation! Time taken: 2.07 seconds
>>> adata.var['genes_mri'] generated!
>>> Step1: Starting NMF calculation...
>>> NMF completed. 30 topics identified. Time taken: 25.18 seconds
>>> Step2: Filtering based on Topics Moran's I values...
>>> Stpe3: Filtering based on Random Forest importances...
>>> 27 topics selected !
>>> adata.obsm['W_nmf'], adata.varm['H_nmf'] generate!
>>> adata.uns['topic_mri_list'] generate!
>>> adata.uns['sorted_indices_imp'] generate!
>>> adata.uns['final_topics'] generate!
6. Draw Topics detected by RF
SAGE.plot.plot_topics(adata,
uns_key="final_topics",
img_key=None,
spot_size=500,
ncols=9,
figsize=(4, 4),
fontsize=20,
frameon=False,
legend_loc=False,
colorbar_loc=None,
show=False)
7. Construction of high-specificity genes (HSGs)
SAGE.preprocess.genes_selection(adata, n_genes=3000, lower_p=10, upper_p=99)
percentile_genes
Topic gene_lower rank_lower gene_upper rank_upper
0 0 rps25 15 tcta 1110
1 1 COX5B (1 of many)-1 84 ppp2r5ea 2922
2 2 rgcc 24 dnajc28 2779
3 3 cox7a2a 90 clptm1l 2926
4 4 AC024175.4 50 cecr2 2900
5 5 frzb 38 filip1b 2888
6 6 ba1-1 30 trove2 2816
7 7 rpl39 47 nosip 2892
8 8 calm2b 59 sox9b 2906
9 9 mt-nd2 31 si:ch211-213a13.1 2815
10 10 atp5e 73 dlx5a 2902
11 11 rplp1 28 u2af1 2826
12 12 si:ch211-251b21.1 81 scocb 2918
13 13 ldb3b 39 slc39a13 2870
14 14 paics 38 MRPL49 2906
15 15 calm3b 71 capn1b 2921
16 16 fth1a 12 mmp2 2836
17 17 CABZ01073795.1 57 tnrc6b 2905
18 18 ak1 21 ap1b1 2876
19 19 rpl22 35 dph6 2876
20 20 rps12 34 mtfmt 2854
21 21 chac1 30 stx11a 2838
22 22 rps16 49 dlx2a 2892
23 23 ddx5 39 syne1a 2869
24 24 si:dkey-6n3.3 72 TPGS1 2915
25 25 anxa2a 45 gpr137 2910
26 26 rpl13a 60 safb 2911
number_svgs:649
number_no_svgs:694
>>> HSGs selected!
>>> adata.uns['HSG'] generate!
>>> adata.varm['gene_topic_corr'] generate! (method='pearson')
>>> adata.uns['marker_genes_all_dict'] generate! (corr_threshold=0.2)
Clustering
1. Consensus-driven Graph Construction
SAGE.preprocess.optimize_graph_topology(
adata,
n_neig_coord=6,
cut_side_thr=0.3,
min_neighbor=2,
n_neig_feat=15,
)
>>> Graph_coord constructed!
>>> adata.obsm['adj_opt'] generate!
>>> Graph_feat constructed!
>>> adata.obsm['adj_feat'] generate!
2. Draw SAG edge
SAGE.plot.plot_neighbors_cut(adata,
img_key=None,
spot_size=500,
figsize=(5, 5),
frameon=False,
legend_loc=None,
colorbar_loc=None,
show=False)
3. Run SAGE
# define model
model = SAGE.SAGE(adata, device=device, epochs=800)
adata = model.train()
>>> SAGE model construction completed, training begins.
>>> Optimization finished for ST data.
4. Domain clustering
n_clusters=20
SAGE.utils.clustering(adata,
data= adata.obsm["emb_latent_att"],
method='mclust',
n_clusters=n_clusters,
res = 0.3,
radius=30,
refinement=False)
R[write to console]: __ __
____ ___ _____/ /_ _______/ /_
/ __ `__ \/ ___/ / / / / ___/ __/
/ / / / / / /__/ / /_/ (__ ) /_
/_/ /_/ /_/\___/_/\__,_/____/\__/ version 6.1.1
Type 'citation("mclust")' for citing this R package in publications.
fitting ...
|======================================================================| 100%
>>> Clustering completed using mclust with 20 clusters.
>>> adata.obsm['domain'] & ['mclust'] generate!
5. Save result
adata.write_h5ad(output_result_dir+"/result.h5")
clusters = adata.obs["domain"]
clusters.to_csv(output_result_dir+f"/clusters.csv",header = False)
embedding = adata.obsm["emb_latent_att"]
np.savetxt(output_result_dir+"/embedding.txt",embedding)
SAGE.utils.export_H_zscore_to_csv(adata, out_dir=output_process_dir)
SAGE.utils.export_Corr_to_csv(adata, out_dir=output_process_dir)
6. Plot clustering results
# Read data
adata = sc.read(output_result_dir+'/result.h5')
# Set colors
custom_palette = [
'#F4E8B8', '#EEC30E', '#8F0100', '#058187', '#0C4DA7',
'#B44541', '#632621', '#92C7A3', '#D98882', '#6A93CB',
'#F0C94A', '#AD6448', '#4F6A9C', '#CCB9A1', '#0B3434',
'#3C4F76', '#C1D354', '#7D5BA6', '#F28522', '#4A9586',
'#FF6F61', '#D32F2F', '#1976D2', '#388E3C', '#FBC02D',
'#8E24AA', '#0288D1', '#7B1FA2', '#F57C00', '#C2185B'
]
cluster_palette = {int(cluster): custom_palette[i % len(custom_palette)]
for i, cluster in enumerate(adata.obs['domain'].unique())}
plt.rcParams["figure.figsize"] = (4,4)
plt.rcParams['font.size'] = 10
labels_pred = adata.obs["domain"]
X = adata.obsm["emb_latent_att"]
SC = metrics.silhouette_score(X, labels_pred)
DB = metrics.davies_bouldin_score(X, labels_pred)
sc.pl.spatial(adata,
img_key = None,
spot_size = 500,
palette=cluster_palette,
color = ["domain"],
title = [f'(SC={SC:.3f} DB={DB:.3f})'],
na_in_legend = False,
frameon=False,
show= False)
plt.tight_layout()
plt.show()
7. Marker genes detection
SAGE.utils.run_domain_gene_mapping_pipeline(
adata,
useref="domain",
topic_key="W_nmf",
corr_threshold=0.2,
topic_gene_corr_key="gene_topic_corr",
domain_mapping_topic_key="domain_mapping_topic",
marker_genes_dict_key="marker_genes_multi_dict",
domain_to_genes_key="domain_to_genes"
)
>>> Step 1: Generating marker genes for topics...
>>> Step 2: Matching domains to best topics...
>>> Step 3: Generating domain β genes dictionary...
>>> generate adata.uns['domain_to_genes']οΌcontain 20 domainγ
>>> Pipeline completed.
SAGE.plot.plot_marker_genes(
adata,
userep="domain",
domain='13',
n_genes=4,
ncols=4,
spot_size=500,
out_dir=None,
figsize=(4, 4),
fontsize=10,
frameon=None,
legend_loc='on data',
colorbar_loc="right",
show_title=True,
palette_dict=cluster_palette
)
8. Plot UMAP and PAGA graph
adata = sc.read(output_result_dir+'/result.h5')
adata.var_names_make_unique()
SAGE_embed = pd.read_csv(output_result_dir+"/embedding.txt",header = None, delim_whitespace=True)
labels_true = adata.obs["domain"].copy()
SAGE_embed.index = labels_true.index
adata.obsm["SAGE"] = SAGE_embed
adata.obsm["SAGE"].columns = adata.obsm["SAGE"].columns.astype(str)
# Plot
plt.rcParams["figure.figsize"] = (6,5)
sc.pp.neighbors(adata, use_rep="emb_latent_att")
sc.tl.umap(adata)
sc.tl.paga(adata,groups='domain')
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sc.pl.umap(adata,
color='domain',
palette=cluster_palette,
legend_loc='on data',
legend_fontoutline=3,
add_outline=True, s=30,
outline_width=(0.3, 0.05),
legend_fontsize=10,
frameon=False,
ax=axes[0],
show=False)
axes[0].set_title('UMAP', fontsize=20)
umap_coords = pd.DataFrame(adata.obsm['X_umap'],
index=adata.obs.index,
columns=['UMAP1', 'UMAP2'])
clusters = adata.obs['domain'].astype(int)
cluster_centers = umap_coords.groupby(clusters).mean()
sc.pl.paga(adata,
color='domain',
pos=cluster_centers.values,
node_size_scale=10,
fontoutline=3,
frameon=False,
edge_width_scale=1,
fontsize=10,
# fontweight='bold',
ax=axes[1],
show=False)
axes[1].set_title('PAGA', fontsize=20)
9. Plot Topic-gene UMAP
topics_list = ['1','2','3','6','18','26']
marker_genes_umap_df, selected_order = SAGE.utils.analyze_topics_and_umap(
adata,
method="pearson",
corr_threshold=0.2,
n_pca_components=50,
n_neighbors=15,
embedding_method="umap",
min_dist=0.1,
random_state=42,
n_genes=20,
topics_list=topics_list
)
SAGE.plot.plot_umap(marker_genes_umap_df)