Embeddings on H200 - danifilho/Evo2_BASF GitHub Wiki
First, I split chr4 into chunks of the desired length with:
module load SAMtools/1.19.2-GCC-13.2.0
samtools faidx GCF_000001735.4_TAIR10.1_genomic.fna NC_003075.7 > chr4_TAIR10.fna
module load BEDTools/2.31.0-GCC-12.3.0
generate_windows_chr4.sh 8192   (8192-bp windows in this case)
#!/usr/bin/env bash
# generate_windows_chr4.sh <window_size_bp>
#
# Tiles every sequence listed in the genome's .fai index with half-overlapping
# windows (step = window_size / 2) and writes each window to its own file.
#
# Creates: windows_<SIZE>_full/<seqid>_<start>_<end>.fa  (one file per window;
#              only the sequence line is written, the ">" header is dropped)
#          windows_<SIZE>_full/tair10_<SIZE>.bed         (window coordinates)
#
# Requires: bedtools on PATH and "<genome>.fai" next to the genome FASTA
#           (create it with: samtools faidx <genome>).
set -euo pipefail

# 1) Setting paths
# NOTE: the variable keeps its historical name, but it points at the whole
# TAIR10 genome (chr4 is NC_003075.7), so windows are built for every sequence.
CHR4_FASTA="/mnt/gs21/scratch/dasilvaf/evo2_arabidopsis/GCF_000001735.4_TAIR10.1_genomic.fna" # NC_003075.7
if ! BEDTOOLS=$(command -v bedtools); then
    echo "ERROR: bedtools not found on PATH (load the BEDTools module first)" >&2
    exit 1
fi

# 2) Parsing args
if [[ $# -ne 1 ]]; then
    echo "Usage: $0 <window_size_bp>" >&2
    exit 1
fi
WIN=$1
# The step is WIN / 2, so anything below 2 would give bedtools a zero step.
if ! [[ $WIN =~ ^[1-9][0-9]*$ ]] || (( WIN < 2 )); then
    echo "ERROR: window size must be an integer >= 2, got '$WIN'" >&2
    exit 1
fi
if [[ ! -s "${CHR4_FASTA}.fai" ]]; then
    echo "ERROR: missing index ${CHR4_FASTA}.fai (run: samtools faidx ${CHR4_FASTA})" >&2
    exit 1
fi

OUTDIR="windows_${WIN}_full"
BEDFILE="${OUTDIR}/tair10_${WIN}.bed"
STEP=$(( WIN / 2 ))
mkdir -p "$OUTDIR"

# 3) Building BED of half-overlapping windows (whole genome, not only chr4).
#    Column 4 is the window name "<seqid>_<start>_<end>", reused as file name.
echo "Creating BED for ${WIN}-bp windows …"
"$BEDTOOLS" makewindows -g "${CHR4_FASTA}.fai" -w "$WIN" -s "$STEP" \
    | awk -v OFS="\t" '{print $1,$2,$3, $1"_"$2"_"$3}' > "$BEDFILE"
# Previous chr4-only, non-overlapping variant, kept for reference:
#$BEDTOOLS makewindows -g /mnt/gs21/scratch/dasilvaf/evo2_arabidopsis/GCF_000001735.4_TAIR10.1_genomic.fna.fai -w "$WIN" -s "$WIN" \
# | awk -v OFS="\t" '{print $1,$2,$3, "NC_003075.7_"$2"_"$3}' > "$BEDFILE"

# 4) Extracting multi-FASTA, then splitting into individual files
echo "Extracting FASTA …"
MULTIFASTA="${OUTDIR}/chr4_${WIN}.fa"
"$BEDTOOLS" getfasta -fi "$CHR4_FASTA" -bed "$BEDFILE" -name -fo "$MULTIFASTA"

echo "Splitting multi-FASTA into individual files …"
# Each header starts a new output file. The previous file is closed first so
# that awk never holds more than one descriptor open, however many windows
# there are.
awk -v outdir="$OUTDIR" '
    /^>/ {
        if (f != "") close(f)
        split($0, a, "::")               # a[1] = ">NC_003075.7_0_8192"
        fname = substr(a[1], 2) ".fa"    # drop leading ">"
        f = outdir "/" fname
        next
    }
    f != "" { print > f }
' "$MULTIFASTA"
rm "$MULTIFASTA"

echo "DONE — FASTA windows in $OUTDIR/"
This is the script that creates the embeddings.
I also log in to an H200 node interactively with salloc:
salloc --gpus=h200:1 --cpus-per-gpu=16 --mem=64G --time=2:00:00
#!/usr/bin/env python3
"""
embed_evo2.py — create Evo2 embeddings for *all* FASTA windows in a folder
"""
import argparse
import pathlib
import sys
import time
import torch
from evo2 import Evo2
from tqdm import tqdm # tiny progress bar; pip install tqdm if needed
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options of the embedding script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets
            ``argparse`` read ``sys.argv[1:]``, which is what the script
            entry point relies on; an explicit list allows programmatic use.

    Returns:
        Namespace with ``indir`` and ``outdir`` as ``pathlib.Path``,
        ``layer`` and ``model`` as ``str`` and ``recursive`` as ``bool``.
    """
    parser = argparse.ArgumentParser(
        description="Embed every FASTA window in a directory with Evo2."
    )
    parser.add_argument(
        "--indir",
        required=True,
        type=pathlib.Path,
        help="Directory containing *.fa / *.fasta windows.",
    )
    parser.add_argument(
        "--outdir",
        default=pathlib.Path("embeddings_full"),
        type=pathlib.Path,
        help="Where .pt tensors will be written (created if missing).",
    )
    parser.add_argument(
        "--layer",
        default="blocks.28.mlp.l3",
        help="Evo2 layer name for embeddings (paper notes blocks.27.mlp.l3).",
    )
    parser.add_argument(
        "--model",
        default="evo2_7b",
        help="Which Evo2 checkpoint to load (evo2_7b, evo2_15b-fp16, …).",
    )
    parser.add_argument(
        "--recursive",
        action="store_true",
        help="Recurse into sub-directories of --indir.",
    )
    return parser.parse_args(argv)
def get_fasta_paths(indir: pathlib.Path, recursive: bool) -> list[pathlib.Path]:
    """Return the FASTA window files found under *indir*, sorted by path.

    Args:
        indir: Directory to search.
        recursive: When true, sub-directories are searched as well.

    Returns:
        Sorted list of regular files whose name matches ``*.fa*``
        (``.fa``, ``.fasta``, ...). The list is empty when nothing matches.
    """
    pattern = "**/*.fa*" if recursive else "*.fa*"
    # "*.fa*" also matches ".fai" index files (samtools / bedtools create them
    # next to FASTA files) and any directory with such a name; neither is a
    # sequence file, so both are filtered out here.
    return sorted(
        candidate
        for candidate in indir.glob(pattern)
        if candidate.is_file() and candidate.suffix != ".fai"
    )
def load_sequence(path: pathlib.Path) -> str:
    """Return the upper-cased sequence stored in the FASTA window *path*.

    Works both for bare, header-less files holding only the sequence and for
    regular FASTA files: lines starting with ``>`` are skipped and the
    remaining lines are concatenated, so a wrapped (multi-line) sequence is
    read completely.

    Args:
        path: Text file holding one sequence.

    Returns:
        The sequence with surrounding whitespace removed, in upper case.

    Raises:
        ValueError: If the file contains no sequence characters.
    """
    residue_lines = [
        line.strip()
        for line in path.read_text().splitlines()
        if not line.startswith(">")
    ]
    sequence = "".join(residue_lines).upper()
    if not sequence:
        raise ValueError(f"No sequence data found in {path}")
    return sequence
def main() -> None:
    """Embed every FASTA window of ``--indir`` and save one tensor per file.

    For each input file ``<stem>.fa`` the activations of the layer selected
    with ``--layer`` are written to ``<outdir>/<stem>.pt``. Files whose
    output already exists are skipped, so an interrupted run can simply be
    restarted. Exits with an error message when no FASTA file is found.
    """
    args = parse_args()

    # 1. Model (the timer started here therefore includes checkpoint loading)
    t0 = time.time()
    evo2_model = Evo2(args.model)
    tok = evo2_model.tokenizer

    # 2. File list
    fasta_paths = get_fasta_paths(args.indir, args.recursive)
    if not fasta_paths:
        sys.exit(f"No FASTA files found in {args.indir}")
    args.outdir.mkdir(exist_ok=True, parents=True)

    # 3. Iterate
    for fa_path in tqdm(fasta_paths, desc="Embeddings", unit="file"):
        out_file = args.outdir / (fa_path.stem + ".pt")
        if out_file.exists():
            # Optional: skip to avoid recomputing
            tqdm.write(f"✓ {out_file.name} already exists; skipping.")
            continue

        sequence = load_sequence(fa_path)
        # [None] adds a leading axis, giving a batch of one sequence.
        input_ids = torch.tensor(tok.tokenize(sequence), dtype=torch.int)[None]
        if torch.cuda.is_available():
            input_ids = input_ids.cuda()

        with torch.inference_mode():
            # The first return value is not needed; `embeddings` is indexed
            # by layer name below.
            _, embeddings = evo2_model(
                input_ids, return_embeddings=True, layer_names=[args.layer]
            )

        # Write to a temporary name and rename afterwards: if the job is
        # killed mid-write (e.g. at the SLURM time limit) no truncated
        # "<stem>.pt" is left behind for the exists() check above to skip on
        # the next run.
        tmp_file = out_file.with_name(out_file.name + ".tmp")
        torch.save(embeddings[args.layer].squeeze(0).cpu(), tmp_file)
        tmp_file.replace(out_file)
        tqdm.write(f"✓ saved {out_file.name}")

    print(
        f"\nDone — processed {len(fasta_paths)} files in {time.time() - t0:.1f}s, "
        f"wrote tensors to {args.outdir.resolve()}"
    )


if __name__ == "__main__":
    main()
And this is the SLURM script:
#!/bin/bash --login
#SBATCH --job-name=evo2_20000_2_full
#SBATCH -N 1
#SBATCH --gpus-per-node=h200:1 # 1 × NVIDIA H200
#SBATCH --cpus-per-gpu=40
#SBATCH --mem=64G
#SBATCH --time=4:00:00 # queue limit on “short”
#SBATCH -o "/mnt/gs21/scratch/dasilvaf/evo2/logs/stdout.%x.%j.%N"
#SBATCH -e "/mnt/gs21/scratch/dasilvaf/evo2/logs/stderr.%x.%j.%N"
#
# Runs embed_evo2.py inside the Evo2 Singularity image on one H200 GPU and
# prints a resource-usage summary afterwards. The job exits with the status
# of the embedding run, so a failed run is reported as failed by SLURM.
# NOTE: the logs/ directory used by -o / -e above must exist before
# submission (sbatch is not expected to create it; confirm on your cluster).

# 1) Setting user adjustable paths
WORKDIR=$PWD                            # directory where you launch sbatch
SIF=$WORKDIR/evo2_latest.sif            # container image
SCRIPT=$WORKDIR/embed_evo2.py           # updated wrapper
HFCACHE=$WORKDIR/huggingface            # holds HF auth token / model cache
INDIR=$WORKDIR/windows_20000_full       # input FASTA windows (bind mount)
OUTDIR=$WORKDIR/embeddings_20000_full   # will receive .pt tensors

echo "============================================================"
echo "JOB START : $(date --iso-8601=seconds)"
echo "NODE : $(hostname)"
echo "SLURM ID : $SLURM_JOB_ID"
echo "============================================================"

# 2) Pre-flight checks: every bind-mount source has to exist on the host,
#    otherwise the container does not start.
for required in "$SIF" "$SCRIPT" "$INDIR"; do
    if [[ ! -e "$required" ]]; then
        echo "ERROR: required path not found: $required" >&2
        exit 1
    fi
done
mkdir -p "$HFCACHE" "$OUTDIR"

# 3) Main work
singularity exec --nv --writable-tmpfs --pwd /workspace \
    -B "$HFCACHE:/root/.cache/huggingface" \
    -B "$INDIR:/workspace/windows_20000_full" \
    -B "$OUTDIR:/workspace/embeddings_20000_full" \
    -B "$SCRIPT:/workspace/embed_evo2.py" \
    "$SIF" \
    python3 /workspace/embed_evo2.py \
        --indir /workspace/windows_20000_full \
        --outdir /workspace/embeddings_20000_full
STATUS=$?   # captured immediately, before any other command overwrites $?

# 4) Resource-usage summary
echo "RESOURCE SUMMARY (sacct)"
sacct -j "$SLURM_JOB_ID" \
    --units=M \
    --format=JobIDRaw,Elapsed,MaxRSS,MaxVMSize,AveRSS,AveCPU,MaxRSSNode,AllocTRES,ReqTRES%30 \
    --parsable2 2>/dev/null \
    || {
        echo "sacct data not yet available — falling back to sstat"
        sstat -j "${SLURM_JOB_ID}.batch" \
            --format=MaxRSS,MaxVMSize,AveRSS,AveCPU,MaxDiskRead,MaxDiskWrite \
            --units=M
    }

echo "EXIT CODE : $STATUS"
echo "JOB END : $(date --iso-8601=seconds)"

# Without this the script would end with the status of the last echo (0) and
# the job would be recorded as successful even when the embedding run failed.
exit "$STATUS"