Embeddings on H200

First, I split chr4 into chunks of the desired length:

module load SAMtools/1.19.2-GCC-13.2.0
samtools faidx GCF_000001735.4_TAIR10.1_genomic.fna NC_003075.7 > chr4_TAIR10.fna

module load BEDTools/2.31.0-GCC-12.3.0
./generate_windows_chr4.sh 8192    # window size in bp (8192 in this case)

#!/usr/bin/env bash
# generate_windows_chr4.sh  <window_size_bp>
# Creates: windows_<SIZE>_full/NC_003075.7_<start>_<end>.fa  (one file per window)

set -euo pipefail


# 1) Setting paths

CHR4_FASTA="/mnt/gs21/scratch/dasilvaf/evo2_arabidopsis/GCF_000001735.4_TAIR10.1_genomic.fna"   # NC_003075.7
BEDTOOLS=$(command -v bedtools)

# 2) Parsing args
[[ $# -ne 1 ]] && { echo "Usage: $0 <window_size_bp>"; exit 1; }
WIN=$1
OUTDIR="windows_${WIN}_full"
BEDFILE="${OUTDIR}/tair10_${WIN}.bed"
STEP=$(( WIN / 2 ))

mkdir -p "$OUTDIR"


# 3) Building BED of half-overlapping windows (step = WIN/2) across the
#    whole genome (the script originally covered chr4 only)

echo "Creating BED for ${WIN}-bp windows …"
$BEDTOOLS makewindows -g "$CHR4_FASTA.fai" -w "$WIN" -s "$STEP" \
  | awk -v OFS="\t" '{print $1,$2,$3, $1"_"$2"_"$3}' > "$BEDFILE"


# 4) Extracting multi-FASTA, then splitting into individual files
echo "Extracting FASTA …"
MULTIFASTA="${OUTDIR}/chr4_${WIN}.fa"
$BEDTOOLS getfasta -fi "$CHR4_FASTA" -bed "$BEDFILE" -name -fo "$MULTIFASTA"

echo "Splitting multi-FASTA into individual files …"
awk -v outdir="$OUTDIR" '
    /^>/ {
        if (f) close(f)                   # avoid hitting the open-file limit
        split($0, a, "::")                # a[1] = ">NC_003075.7_0_8192"
        fname = substr(a[1], 2) ".fa"     # drop leading ">"
        f = outdir "/" fname
        next                              # the header line itself is not written
    }
    { print > f }                         # each file holds the bare sequence
' "$MULTIFASTA"

rm "$MULTIFASTA"
echo "DONE – FASTA windows in $OUTDIR/"

Before running the embedding script, I log on to an H200 node interactively with salloc:

salloc --gpus=h200:1 --cpus-per-gpu=16 --mem=64G --time=2:00:00
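
Once the allocation starts, it is worth confirming that PyTorch actually sees the H200 (a minimal check, not part of the pipeline):

import torch

print(torch.cuda.is_available())                       # should print True on the GPU node
print(torch.cuda.get_device_name(0))                   # e.g. "NVIDIA H200"
print(torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")

This is the script that creates the embeddings: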

#!/usr/bin/env python3
"""
embed_evo2.py – create Evo2 embeddings for *all* FASTA windows in a folder
"""

import argparse
import pathlib
import sys
import time
import torch
from evo2 import Evo2
from tqdm import tqdm   # tiny progress bar; pip install tqdm if needed


def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(
        description="Embed every FASTA window in a directory with Evo2."
    )
    p.add_argument(
        "--indir",
        required=True,
        type=pathlib.Path,
        help="Directory containing *.fa / *.fasta windows.",
    )
    p.add_argument(
        "--outdir",
        default=pathlib.Path("embeddings_full"),
        type=pathlib.Path,
        help="Where .pt tensors will be written (created if missing).",
    )
    p.add_argument(
        "--layer",
        default="blocks.28.mlp.l3",
        help="Evo2 layer name for embeddings (paper notes blocks.27.mlp.l3).",
    )
    p.add_argument(
        "--model",
        default="evo2_7b",
        help="Which Evo2 checkpoint to load (evo2_7b, evo2_15b-fp16, …).",
    )
    p.add_argument(
        "--recursive",
        action="store_true",
        help="Recurse into sub-directories of --indir.",
    )
    return p.parse_args()


def get_fasta_paths(indir: pathlib.Path, recursive: bool) -> list[pathlib.Path]:
    # Filter on exact suffixes: a bare "*.fa*" glob would also match stray
    # .fai index files.
    pattern = "**/*" if recursive else "*"
    return sorted(p for p in indir.glob(pattern) if p.suffix in {".fa", ".fasta"})


def load_sequence(path: pathlib.Path) -> str:
    # The splitter writes each window as a single bare sequence line with no
    # ">" header, so the first line is the whole sequence.
    return path.read_text().splitlines()[0].strip().upper()


def main() -> None:
    args = parse_args()

    # 1. Model
    t0 = time.time()
    evo2_model = Evo2(args.model)
    tok = evo2_model.tokenizer

    # 2. File list
    fasta_paths = get_fasta_paths(args.indir, args.recursive)
    if not fasta_paths:
        sys.exit(f"No FASTA files found in {args.indir}")

    args.outdir.mkdir(exist_ok=True, parents=True)

    # 3. Iterate
    for fa_path in tqdm(fasta_paths, desc="Embeddings", unit="file"):
        out_file = args.outdir / (fa_path.stem + ".pt")
        if out_file.exists():
            # Optional: skip to avoid recomputing
            tqdm.write(f"β†’ {out_file.name} already exists; skipping.")
            continue

        sequence = load_sequence(fa_path)
        input_ids = torch.tensor(tok.tokenize(sequence), dtype=torch.int)[None]
        if torch.cuda.is_available():
            input_ids = input_ids.cuda()

        with torch.inference_mode():
            _, embeddings = evo2_model(
                input_ids, return_embeddings=True, layer_names=[args.layer]
            )

        torch.save(embeddings[args.layer].squeeze(0).cpu(), out_file)
        tqdm.write(f"βœ“ saved {out_file.name}")

    print(
        f"\nDone – processed {len(fasta_paths)} files in {time.time() - t0:.1f}s, "
        f"wrote tensors to {args.outdir.resolve()}"
    )


if __name__ == "__main__":
    main()
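
Each saved tensor has shape (sequence_length, hidden_dim) for the chosen layer. A minimal sketch of loading one back and mean-pooling it into a fixed-size per-window vector (the window name is hypothetical, and the 4096-wide hidden size is what I expect for evo2_7b; the .float() cast is there because the embeddings may come back in bfloat16):

import torch

emb = torch.load("embeddings_full/NC_003075.7_0_8192.pt")   # hypothetical window name
print(emb.shape, emb.dtype)        # e.g. torch.Size([8192, 4096])

# Mean-pool over positions to get one vector per window,
# e.g. as input for clustering or a downstream classifier.
vec = emb.float().mean(dim=0)
print(vec.shape)                   # torch.Size([4096])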

And this is the Slurm script:

#!/bin/bash --login
#SBATCH --job-name=evo2_20000_2_full         
#SBATCH -N 1
#SBATCH --gpus-per-node=h200:1            # 1 Γ— NVIDIA H200
#SBATCH --cpus-per-gpu=40                 
#SBATCH --mem=64G                         
#SBATCH --time=4:00:00                    # queue limit on β€œshort”
#SBATCH -o "/mnt/gs21/scratch/dasilvaf/evo2/logs/stdout.%x.%j.%N"
#SBATCH -e "/mnt/gs21/scratch/dasilvaf/evo2/logs/stderr.%x.%j.%N"


# 1) Setting user-adjustable paths

WORKDIR=$PWD                               # directory where you launch sbatch
SIF=$WORKDIR/evo2_latest.sif               # container image
SCRIPT=$WORKDIR/embed_evo2.py              # updated wrapper
HFCACHE=$WORKDIR/huggingface               # holds HF auth token / model cache
INDIR=$WORKDIR/windows_20000_full           # input FASTA windows (bind mount)
OUTDIR=$WORKDIR/embeddings_20000_full       # will receive .pt tensors

echo "============================================================"
echo "JOB  START : $(date --iso-8601=seconds)"
echo "NODE       : $(hostname)"
echo "SLURM ID   : $SLURM_JOB_ID"
echo "============================================================"

# 2) Main work
singularity exec --nv --writable-tmpfs --pwd /workspace \
  -B "$HFCACHE:/root/.cache/huggingface" \
  -B "$INDIR:/workspace/windows_20000_full" \
  -B "$OUTDIR:/workspace/embeddings_20000_full" \
  -B "$SCRIPT:/workspace/embed_evo2.py" \
  "$SIF" \
  python3 /workspace/embed_evo2.py \
        --indir  /workspace/windows_20000_full \
        --outdir /workspace/embeddings_20000_full
STATUS=$?


# 3) Resource-usage summary


echo "RESOURCE SUMMARY (sacct)"
sacct -j "$SLURM_JOB_ID" \
      --units=M \
      --format=JobIDRaw,Elapsed,MaxRSS,MaxVMSize,AveRSS,AveCPU,MaxRSSNode,AllocTRES,ReqTRES%30 \
      --parsable2 2>/dev/null \
  || {
        echo "sacct data not yet available – falling back to sstat"
        sstat -j "${SLURM_JOB_ID}.batch" \
              --format=MaxRSS,MaxVMSize,AveRSS,AveCPU,MaxDiskRead,MaxDiskWrite \
              --units=M
     }

echo "EXIT CODE  : $STATUS"
echo "JOB  END   : $(date --iso-8601=seconds)"