Tutorial 2: Application on Human Breast Cancer (HBC) dataset. - yihe-csu/SAGE GitHub Wiki

This tutorial demonstrates the analysis of a representative sample from the Human Breast Cancer (HBC) spatial transcriptomics dataset. The dataset includes expert-annotated tissue domains such as invasive ductal carcinoma (IDC), ductal/lobular carcinoma in situ (DCIS/LCIS), tumor edge, and healthy tissue. We apply our unsupervised model, SAGE, to identify spatial domains by integrating gene expression and spatial context. The processed data can be accessed at the following link: πŸ”— https://github.com/yihe-csu/SAGE/tree/main/Dataset/Human_Breast_Cancer.

Identify HSGs

1. Load SAGE and its dependent packages

import os
import sys
import numpy as np
import pandas as pd
import torch
from sklearn import metrics
import matplotlib.pyplot as plt
import scanpy as sc
import importlib
sys.path.append(os.path.abspath("C://Users//heyi//Desktop/code_iteration/SAGE-main/"))
import SAGE
print(SAGE.__version__)
print(SAGE.__author__)
print(SAGE.__email__)
1.1.8 
Yi He
[email protected]

2. Set up the working environment and import data

# Run device, by default, the package is implemented on 'cpu'. We recommend using GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# the location of R, which is necessary for mclust algorithm. Please replace the path below with local R installation path
os.environ['R_HOME'] = 'E:\\R-4.4.1'

base_path = "C:/Users/heyi/Desktop/code_iteration/SAGE-main"
Dataset = "Human_Breast_Cancer"

file_fold = f'{base_path}/Dataset/{Dataset}/'
# Set directory (If you want to use additional data, please change the file path)
output_dir=f"{base_path}/Result/{Dataset}"
output_process_dir = output_dir + "/process_data"
output_result_dir = output_dir + "/result_data"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
if not os.path.exists(output_process_dir):
    os.makedirs(output_process_dir)
if not os.path.exists(output_result_dir):
    os.makedirs(output_result_dir)


# Read data from input_dir
adata = sc.read_visium(file_fold, count_file='filtered_feature_bc_matrix.h5', load_images=True)
adata.var_names_make_unique()
adata.raw =adata.copy()

# add ground_truth
df_meta = pd.read_csv(file_fold + '/metadata.tsv', sep='\t')
df_meta_layer = df_meta['ground_truth']
adata.obs['ground_truth'] = df_meta_layer.values
# filter out NA nodes
adata = adata[~pd.isnull(adata.obs['ground_truth'])]
adata
View of AnnData object with n_obs Γ— n_vars = 3798 Γ— 36601
    obs: 'in_tissue', 'array_row', 'array_col', 'ground_truth'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'spatial'
    obsm: 'spatial'

3. Expression data preprocessing

adata = adata.copy()
sc.pp.filter_genes(adata, min_cells=5)
sc.pp.filter_genes(adata, min_counts=5)
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=3000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.pca(adata, n_comps=50, mask_var="highly_variable", svd_solver='arpack')

4. Multi-resolution Consensus Clustering

con_method = "leiden"  #["leiden","mclust"]
con_range = ( 0.1, 2 , 0.1)
con_use_rep = 'X_pca'
n_neig_coord=6
n_neig_feat =6
con_dim =25
con_radius=20
con_refine=True

SAGE.consensus_clustering(adata, 
                          method=con_method, 
                          resolution_range=con_range, 
                          n_neighbors=n_neig_feat, 
                          use_rep=con_use_rep, 
                          dims=con_dim, 
                          radius=con_radius, 
                          refinement=con_refine)
>>> Starting leiden clustering...
Running leiden clustering:   0%|                                                                | 0/19 [00:00<?, ?it/s]C:\Users\heyi\Desktop\code_iteration\SAGE-main\SAGE\preprocess.py:422: FutureWarning: In the future, the default backend for leiden will be igraph instead of leidenalg.

 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(adata, resolution=param, random_state=0)
Running leiden clustering: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 19/19 [00:04<00:00,  4.33it/s]
>>> leiden clustering finished. Time elapsed: 11.38 seconds
>>> Computing consensus matrix...
>>> Consensus matrix computed. Time elapsed: 0.86 seconds
>>> Performing final Leiden clustering on consensus matrix...
>>> Consensus clustering completed.
>>> adata.obs['pre_domain'] generated!
>>> adata.obsm['consensus_freq'] generated!
>>> adata.obsm['clusters_results'] generated!

5.Topic Selection via Supervised Learning

SAGE.preprocess.topics_selection(adata, n_topics=30)
>>> Start the genes_mri calculation ...
>>> Finish genes_mri calculation! Time taken: 3.53 seconds
>>> adata.var['genes_mri'] generated!
>>> Step1: Starting NMF calculation...
>>> NMF completed. 30 topics identified. Time taken: 39.70 seconds
>>> Step2: Filtering based on Topics Moran's I values...
>>> Stpe3: Filtering based on Random Forest importances...
>>> 20 topics selected !
>>> adata.obsm['W_nmf'], adata.varm['H_nmf'] generate!
>>> adata.uns['topic_mri_list'] generate!
>>> adata.uns['sorted_indices_imp'] generate!
>>> adata.uns['final_topics'] generate!

6. Draw Topics detected by RF

SAGE.plot.plot_topics(adata, 
                    uns_key="final_topics", 
                    img_key=None, 
                    spot_size=400, 
                    ncols=6,
                    figsize=(4, 4),
                    fontsize=20,
                    frameon=False, 
                    legend_loc=False, 
                    colorbar_loc=None,
                    show=False)

7. Construction of high-specificity genes (HSGs)

SAGE.preprocess.genes_selection(adata, n_genes=3000)
percentile_genes
    Topic gene_lower  rank_lower gene_upper  rank_upper
0       0      RACK1          81     EIF2B5        2900
1       1      RPL30         189       ARNT        2944
2       2       PIGT         194      CYB5A        2945
3       3     CYB5R1         173     YTHDC1        2939
4       4       SIX1         176     DNAJA3        2945
5       5        HK2         140       PLS3        2939
6       6       RPL5         182     TIMM8B        2945
7       7        DSP         154     FAM8A1        2938
8       8      RPL17          94      MAP1B        2917
9       9       OPTN          92        RPE        2931
10     10   SERPINA3          65       PNKP        2923
11     11       MYL6          73      AKAP1        2895
12     12       CTSB         148     METTL9        2938
13     13    COL16A1          73    FILIP1L        2933
14     14      RPL36          89    SLC50A1        2918
15     15    S100A13         117      PNRC1        2905
16     16      RPL27         136   TRNAU1AP        2940
17     17       CD74          94      ITPR3        2907
18     18      RPL37          92      MYH14        2909
19     19     RNASE1         115      LRCH3        2921
number_svgs:925
number_no_svgs:1289
>>> HSGs selected!
>>> adata.uns['HSG'] generate!
>>> adata.varm['gene_topic_corr'] generate! (method='pearson')
>>> adata.uns['marker_genes_all_dict'] generate! (corr_threshold=0.2)

Clustering

1. Consensus-driven Graph Construction

SAGE.preprocess.optimize_graph_topology(
    adata,
    n_neig_coord=6,
    cut_side_thr=0.4,
    min_neighbor=2,
    n_neig_feat=15,
)
>>> Graph_coord constructed!
>>> adata.obsm['adj_opt'] generate!
>>> Graph_feat constructed!
>>> adata.obsm['adj_feat'] generate!

2. Draw SAG edge

SAGE.plot.plot_neighbors_cut(adata,
                   img_key=None,
                   spot_size=400,
                   figsize=(3, 3),
                   frameon=False,
                   legend_loc=None,
                   colorbar_loc=None,
                   show=False)

3. Run SAGE

# define model
model = SAGE.SAGE(adata, device=device, epochs=800)
adata = model.train()
>>> SAGE model construction completed, training begins.
>>> Optimization finished for ST data.

4. Domain clustering

n_clusters = len(adata.obs["ground_truth"].unique())
SAGE.utils.clustering(adata, 
                     data= adata.obsm["emb_latent_att"], 
                     method='mclust', 
                     n_clusters=n_clusters, 
                     res = 0.3, 
                     radius=30,  
                     refinement=True)
R[write to console]:                    __           __ 
   ____ ___  _____/ /_  _______/ /_
  / __ `__ \/ ___/ / / / / ___/ __/
 / / / / / / /__/ / /_/ (__  ) /_  
/_/ /_/ /_/\___/_/\__,_/____/\__/   version 6.1.1
Type 'citation("mclust")' for citing this R package in publications.

fitting ...
  |======================================================================| 100%
>>> Clustering completed using mclust with 20 clusters.
>>> adata.obsm['domain'] & ['mclust'] generate!

5. Save result

adata.write_h5ad(output_result_dir+"/result.h5")
clusters = adata.obs["domain"] 
clusters.to_csv(output_result_dir+f"/clusters.csv",header = False)
embedding = adata.obsm["emb_latent_att"]
np.savetxt(output_result_dir+"/embedding.txt",embedding)

SAGE.utils.export_H_zscore_to_csv(adata, out_dir=output_process_dir)
SAGE.utils.export_Corr_to_csv(adata, out_dir=output_process_dir)

6. Plot clustering results

# # Read data
# adata = sc.read(output_result_dir+'/result.h5')

# Plot clustering results
combine_palette ={
    '1': '#632621',            '6': '#92C7A3',           '11': '#EEC30E',           '16': '#CCB9A1',
    '2': '#0B3434',            '7': '#F0C94A',           '12': '#B44541',           '17': '#3C4F76',
    '3': '#0C4DA7',            '8': '#4F6A9C',           '13': '#AD6448',           '18': '#7D5BA6',
    '4': '#058187',            '9': '#8F0100',           '14': '#F4E8B8',           '19': '#F28522',
    '5': '#C1D354',           '10': '#D98882',           '15': '#6A93CB',           '20': '#4A9586',
    
    'DCIS/LCIS_1': '#C1D354',  'IDC_2': '#0C4DA7',       'Tumor_edge_2': '#92C7A3', 'IDC_7': '#0B3434',
    'DCIS/LCIS_2': '#7D5BA6',  'IDC_3': '#058187',       'Tumor_edge_3': '#B44541', 'IDC_8': '#632621',
    'DCIS/LCIS_4': '#CCB9A1',  'IDC_4': '#EEC30E',       'Tumor_edge_4': '#F28522', 'Tumor_edge_6': '#F0C94A',
    'DCIS/LCIS_5': '#D98882',  'IDC_5': '#4F6A9C',       'Tumor_edge_5': '#F4E8B8', 'IDC_6': '#6A93CB',
    'Healthy_1': '#8F0100',    'Healthy_2': '#AD6448',   'IDC_1': '#4A9586',        'Tumor_edge_1': '#3C4F76'}



plt.rcParams["figure.figsize"] = (4,4)
plt.rcParams['font.size'] = 10
labels_true = adata.obs["ground_truth"]
labels_pred = adata.obs["domain"]
NMI = metrics.normalized_mutual_info_score(labels_true, labels_pred)
ARI = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
sc.pl.spatial(adata, 
            img_key = None, 
            spot_size = 400,
            color = ["ground_truth"],
            title = ["Manual annotation"], 
            palette=combine_palette,  
            na_in_legend = False,
            frameon=False,
            ncols = 2,
            size = 1,
            ax = axes[0],
            show= False)
sc.pl.spatial(adata, 
            img_key = None, 
            spot_size = 400,
            color = ["domain"],
            title = [f'(ARI={ARI:.3f} NMI={NMI:.3f})'], 
            palette=combine_palette,  
            na_in_legend = False,
            frameon=False,
            ncols = 2,
            size = 1,
            ax = axes[1],
            show= False)
plt.tight_layout()
plt.show()

7. Marker genes detection

SAGE.utils.run_domain_gene_mapping_pipeline(
    adata,
    useref="domain",
    topic_key="W_nmf",
    corr_threshold=0.2,
    topic_gene_corr_key="gene_topic_corr",
    domain_mapping_topic_key="domain_mapping_topic",
    marker_genes_dict_key="marker_genes_multi_dict",
    domain_to_genes_key="domain_to_genes"
)
>>> Step 1: Generating marker genes for topics...
>>> Step 2: Matching domains to best topics...
>>> Step 3: Generating domain β†’ genes dictionary...
>>> generate adata.uns['domain_to_genes'],contain 20 domain。 
>>> Pipeline completed.
SAGE.plot.plot_marker_genes(
    adata,
    userep="domain",
    domain='4',
    n_genes=4, 
    ncols=4,
    spot_size=400, 
    out_dir=None,
    figsize=(4, 4), 
    fontsize=10,
    frameon=None, 
    legend_loc='on data', 
    colorbar_loc="right",
    show_title=True,
    palette_dict=combine_palette,
)