Tutorial 4: Application on Zebrafish Melanoma Dataset - yihe-csu/SAGE GitHub Wiki

This tutorial showcases the application of SAGE to a zebrafish melanoma spatial transcriptomics dataset, aiming to assess the cross-species generalizability and spatial domain resolution capabilities of our unsupervised model. We utilize three zebrafish melanoma samples (A, B, and C, Hunter et al., 2021), each accompanied by H&E-stained tissue images and expert-curated spatial annotations, including tumor tissue, normal tissue, and a transitional tumor–muscle interface. For this tutorial, we focus on Sample B as a representative example to demonstrate SAGE’s ability to delineate biologically meaningful spatial domains, particularly in the interface region—a biologically relevant boundary that marks the invasive front of the tumor.

The dataset and annotations used here are publicly available from the original study (Hunter et al., 2021). Due to the large size of the processed dataset, we have uploaded it to Figshare: 🔗 https://figshare.com/articles/dataset/Zebrafish_melanomas/29493044. Please download the dataset and place it in the appropriate directory before running this tutorial.

Identify HSGs

1. Load SAGE and its dependent packages

import os
import sys
import numpy as np
import pandas as pd
import torch
from sklearn import metrics
import matplotlib.pyplot as plt
import scanpy as sc
import importlib
sys.path.append(os.path.abspath("C://Users//heyi//Desktop/code_iteration/SAGE-main/"))

import SAGE
print(SAGE.__version__)
print(SAGE.__author__)
print(SAGE.__email__)

1.1.8 
Yi He
[email protected]

2. Set up the working environment and import data

# Run device, by default, the package is implemented on 'cpu'. We recommend using GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# the location of R, which is necessary for mclust algorithm. Please replace the path below with local R installation path
os.environ['R_HOME'] = 'E:\\R-4.4.1'

base_path = "C:/Users/heyi/Desktop/code_iteration/SAGE-main/"
Dataset = "Zebrafish_melanomas/Sample_B"

file_fold = f'{base_path}/Dataset/{Dataset}/'
# Set directory (If you want to use additional data, please change the file path)
output_dir=f"{base_path}/Result/{Dataset}"
output_process_dir = output_dir + "/process_data"
output_result_dir = output_dir + "/result_data"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
if not os.path.exists(output_process_dir):
    os.makedirs(output_process_dir)
if not os.path.exists(output_result_dir):
    os.makedirs(output_result_dir)
    
feature_bc_matrix_path = file_fold + f"\\filtered_feature_bc_matrix.h5"
pos_path = file_fold + f"\\tissue_positions_list.csv.gz"
img_path =  file_fold + f"\\image.tif.gz"

adata = sc.read_10x_h5(feature_bc_matrix_path)
positions = pd.read_csv(pos_path, header=None)
positions.columns = ["barcode", "in_tissue", "array_x", "array_y", "image_x", "image_y"]
positions.set_index("barcode", inplace=True)
max_x = positions['image_x'].max()
positions["image_x"] = max_x - positions["image_x"]
positions = positions.loc[adata.obs.index]
adata.obs = adata.obs.join(positions)
adata.obsm["spatial"] = adata.obs["image_x", "image_y"](/yihe-csu/SAGE/wiki/"image_x",-"image_y").values
adata.var_names_make_unique()
adata

AnnData object with n_obs × n_vars = 2677 × 32268
    obs: 'in_tissue', 'array_x', 'array_y', 'image_x', 'image_y'
    var: 'gene_ids', 'feature_types', 'genome'
    obsm: 'spatial'

3. Expression data preprocessing

adata = adata.copy()
sc.pp.filter_genes(adata, min_cells=5)
sc.pp.filter_genes(adata, min_counts=5)
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=3000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.pca(adata, n_comps=50, mask_var="highly_variable", svd_solver='arpack')

4. Multi-resolution Consensus Clustering

con_method = "leiden"  #["leiden","mclust"]
con_range = ( 0.1, 3 , 0.1)
con_use_rep = 'X_pca'
n_neig_coord=6
n_neig_feat =6
con_dim =25
con_radius=20
con_refine=True

SAGE.consensus_clustering(adata, 
                          method=con_method, 
                          resolution_range=con_range, 
                          n_neighbors=n_neig_feat, 
                          use_rep=con_use_rep, 
                          dims=con_dim, 
                          radius=con_radius, 
                          refinement=con_refine)

>>> Starting leiden clustering...
Running leiden clustering: 100%|███████████████████████████████████████████████████████| 29/29 [00:03<00:00,  8.06it/s]
>>> leiden clustering finished. Time elapsed: 10.45 seconds
>>> Computing consensus matrix...
>>> Consensus matrix computed. Time elapsed: 0.63 seconds
>>> Performing final Leiden clustering on consensus matrix...
>>> Consensus clustering completed.
>>> adata.obs['pre_domain'] generated!
>>> adata.obsm['consensus_freq'] generated!
>>> adata.obsm['clusters_results'] generated!

5.Topic Selection via Supervised Learning

SAGE.preprocess.topics_selection(adata, n_topics=30)

>>> Start the genes_mri calculation ...
>>> Finish genes_mri calculation! Time taken: 2.07 seconds
>>> adata.var['genes_mri'] generated!
>>> Step1: Starting NMF calculation...
>>> NMF completed. 30 topics identified. Time taken: 25.18 seconds
>>> Step2: Filtering based on Topics Moran's I values...
>>> Stpe3: Filtering based on Random Forest importances...
>>> 27 topics selected !
>>> adata.obsm['W_nmf'], adata.varm['H_nmf'] generate!
>>> adata.uns['topic_mri_list'] generate!
>>> adata.uns['sorted_indices_imp'] generate!
>>> adata.uns['final_topics'] generate!

6. Draw Topics detected by RF

SAGE.plot.plot_topics(adata, 
                    uns_key="final_topics", 
                    img_key=None, 
                    spot_size=500, 
                    ncols=9,
                    figsize=(4, 4),
                    fontsize=20,
                    frameon=False, 
                    legend_loc=False, 
                    colorbar_loc=None,
                    show=False)

7. Construction of high-specificity genes (HSGs)

SAGE.preprocess.genes_selection(adata, n_genes=3000, lower_p=10, upper_p=99)

percentile_genes
    Topic           gene_lower  rank_lower         gene_upper  rank_upper
0       0                rps25          15               tcta        1110
1       1  COX5B (1 of many)-1          84           ppp2r5ea        2922
2       2                 rgcc          24            dnajc28        2779
3       3              cox7a2a          90            clptm1l        2926
4       4           AC024175.4          50              cecr2        2900
5       5                 frzb          38            filip1b        2888
6       6                ba1-1          30             trove2        2816
7       7                rpl39          47              nosip        2892
8       8               calm2b          59              sox9b        2906
9       9               mt-nd2          31  si:ch211-213a13.1        2815
10     10                atp5e          73              dlx5a        2902
11     11                rplp1          28              u2af1        2826
12     12    si:ch211-251b21.1          81              scocb        2918
13     13                ldb3b          39           slc39a13        2870
14     14                paics          38             MRPL49        2906
15     15               calm3b          71             capn1b        2921
16     16                fth1a          12               mmp2        2836
17     17       CABZ01073795.1          57             tnrc6b        2905
18     18                  ak1          21              ap1b1        2876
19     19                rpl22          35               dph6        2876
20     20                rps12          34              mtfmt        2854
21     21                chac1          30             stx11a        2838
22     22                rps16          49              dlx2a        2892
23     23                 ddx5          39             syne1a        2869
24     24        si:dkey-6n3.3          72              TPGS1        2915
25     25               anxa2a          45             gpr137        2910
26     26               rpl13a          60               safb        2911
number_svgs:649
number_no_svgs:694
>>> HSGs selected!
>>> adata.uns['HSG'] generate!
>>> adata.varm['gene_topic_corr'] generate! (method='pearson')
>>> adata.uns['marker_genes_all_dict'] generate! (corr_threshold=0.2)

Clustering

1. Consensus-driven Graph Construction

SAGE.preprocess.optimize_graph_topology(
    adata,
    n_neig_coord=6,
    cut_side_thr=0.3,
    min_neighbor=2,
    n_neig_feat=15,
)

>>> Graph_coord constructed!
>>> adata.obsm['adj_opt'] generate!
>>> Graph_feat constructed!
>>> adata.obsm['adj_feat'] generate!

2. Draw SAG edge

SAGE.plot.plot_neighbors_cut(adata,
                   img_key=None,
                   spot_size=500,
                   figsize=(5, 5),
                   frameon=False,
                   legend_loc=None,
                   colorbar_loc=None,
                   show=False)

3. Run SAGE

# define model
model = SAGE.SAGE(adata, device=device, epochs=800)
adata = model.train()

>>> SAGE model construction completed, training begins.
>>> Optimization finished for ST data.

4. Domain clustering

n_clusters=20

SAGE.utils.clustering(adata, 
                     data= adata.obsm["emb_latent_att"], 
                     method='mclust', 
                     n_clusters=n_clusters, 
                     res = 0.3, 
                     radius=30,  
                     refinement=False)

R[write to console]:                    __           __ 
   ____ ___  _____/ /_  _______/ /_
  / __ `__ \/ ___/ / / / / ___/ __/
 / / / / / / /__/ / /_/ (__  ) /_  
/_/ /_/ /_/\___/_/\__,_/____/\__/   version 6.1.1
Type 'citation("mclust")' for citing this R package in publications.

fitting ...
  |======================================================================| 100%
>>> Clustering completed using mclust with 20 clusters.
>>> adata.obsm['domain'] & ['mclust'] generate!

5. Save result

adata.write_h5ad(output_result_dir+"/result.h5")
clusters = adata.obs["domain"] 
clusters.to_csv(output_result_dir+f"/clusters.csv",header = False)
embedding = adata.obsm["emb_latent_att"]
np.savetxt(output_result_dir+"/embedding.txt",embedding)

SAGE.utils.export_H_zscore_to_csv(adata, out_dir=output_process_dir)
SAGE.utils.export_Corr_to_csv(adata, out_dir=output_process_dir)

6. Plot clustering results

# Read data
adata = sc.read(output_result_dir+'/result.h5')


# Set colors
custom_palette = [
    '#F4E8B8',  '#EEC30E',  '#8F0100',  '#058187',  '#0C4DA7',  
    '#B44541',  '#632621',  '#92C7A3',  '#D98882',  '#6A93CB',  
    '#F0C94A',  '#AD6448',  '#4F6A9C',  '#CCB9A1',  '#0B3434',  
    '#3C4F76',  '#C1D354',  '#7D5BA6',  '#F28522',  '#4A9586',
    '#FF6F61',  '#D32F2F',  '#1976D2',  '#388E3C',  '#FBC02D', 
    '#8E24AA',  '#0288D1',  '#7B1FA2',  '#F57C00',  '#C2185B'
]
cluster_palette = {int(cluster): custom_palette[i % len(custom_palette)] 
                   for i, cluster in enumerate(adata.obs['domain'].unique())}

plt.rcParams["figure.figsize"] = (4,4)
plt.rcParams['font.size'] = 10
labels_pred = adata.obs["domain"]
X = adata.obsm["emb_latent_att"]
SC = metrics.silhouette_score(X, labels_pred)
DB = metrics.davies_bouldin_score(X, labels_pred)

sc.pl.spatial(adata, 
            img_key = None, 
            spot_size = 500,
            palette=cluster_palette,
            color = ["domain"],
            title = [f'(SC={SC:.3f}  DB={DB:.3f})'], 
            na_in_legend = False,
            frameon=False,
            show= False)
plt.tight_layout()
plt.show()

7. Marker genes detection

SAGE.utils.run_domain_gene_mapping_pipeline(
    adata,
    useref="domain",
    topic_key="W_nmf",
    corr_threshold=0.2,
    topic_gene_corr_key="gene_topic_corr",
    domain_mapping_topic_key="domain_mapping_topic",
    marker_genes_dict_key="marker_genes_multi_dict",
    domain_to_genes_key="domain_to_genes"
)

>>> Step 1: Generating marker genes for topics...
>>> Step 2: Matching domains to best topics...
>>> Step 3: Generating domain → genes dictionary...
>>> generate adata.uns['domain_to_genes']，contain 20 domain。 
>>> Pipeline completed.

SAGE.plot.plot_marker_genes(
    adata,
    userep="domain",
    domain='13',
    n_genes=4, 
    ncols=4,
    spot_size=500, 
    out_dir=None,
    figsize=(4, 4), 
    fontsize=10,
    frameon=None, 
    legend_loc='on data', 
    colorbar_loc="right",
    show_title=True,
    palette_dict=cluster_palette
)

8. Plot UMAP and PAGA graph

adata = sc.read(output_result_dir+'/result.h5')
adata.var_names_make_unique()

SAGE_embed = pd.read_csv(output_result_dir+"/embedding.txt",header = None, delim_whitespace=True)
labels_true = adata.obs["domain"].copy()
SAGE_embed.index = labels_true.index
adata.obsm["SAGE"] = SAGE_embed
adata.obsm["SAGE"].columns = adata.obsm["SAGE"].columns.astype(str)

# Plot
plt.rcParams["figure.figsize"] = (6,5)
sc.pp.neighbors(adata, use_rep="emb_latent_att")
sc.tl.umap(adata)
sc.tl.paga(adata,groups='domain')

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

sc.pl.umap(adata,
           color='domain',
           palette=cluster_palette,
           legend_loc='on data',
           legend_fontoutline=3,
           add_outline=True, s=30,
           outline_width=(0.3, 0.05),
           legend_fontsize=10,
           frameon=False,
           ax=axes[0],   
           show=False)  

axes[0].set_title('UMAP', fontsize=20)

umap_coords = pd.DataFrame(adata.obsm['X_umap'],
                           index=adata.obs.index,
                           columns=['UMAP1', 'UMAP2'])

clusters = adata.obs['domain'].astype(int)
cluster_centers = umap_coords.groupby(clusters).mean()

sc.pl.paga(adata,
           color='domain',
           pos=cluster_centers.values,
           node_size_scale=10,
           fontoutline=3,
           frameon=False,
           edge_width_scale=1,
           fontsize=10,
           # fontweight='bold',
           ax=axes[1], 
           show=False)

axes[1].set_title('PAGA', fontsize=20)

9. Plot Topic-gene UMAP

topics_list = ['1','2','3','6','18','26']
marker_genes_umap_df, selected_order = SAGE.utils.analyze_topics_and_umap(
                        adata, 
                        method="pearson",  
                        corr_threshold=0.2,   
                        n_pca_components=50, 
                        n_neighbors=15,
                        embedding_method="umap",  
                        min_dist=0.1, 
                        random_state=42, 
                        n_genes=20,  
                        topics_list=topics_list 
                    )
SAGE.plot.plot_umap(marker_genes_umap_df)