Install and test SV callers - GooglingTheCancerGenome/sv-callers GitHub Wiki

Install SV callers

wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh # python 3.6
# wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh # python 2.7
bash Miniconda3-latest-Linux-x86_64.sh
# add conda to the PATH & source the env
source ~/.bashrc
conda update -y conda
conda config --add channels conda-forge
conda config --add channels defaults
conda config --add channels r
conda config --add channels bioconda
  • create a new environment e.g. sv-callers and install the tools (latest versions)
conda create -n sv-callers manta delly lumpy-sv gridss sambamba samblaster bwa R
source activate sv-callers

Note: To install the latest samtools (v1.5) with the correct bzip2 distribution use -c conda-forge -c bioconda options.

or install the tools using env.yaml:

channels:
  - conda-forge
  - bioconda

dependencies:
  - manta=1.1.0
  - delly=0.7.7
  - lumpy-sv=0.2.13
  - samblaster=0.1.24
  - gridss=1.3.4
  - bwa=0.7.15
  - R
conda-env create -n sv-callers -f env.yaml

Test runs

Compute infra: HPC/UMC Utrecht (CentOS Linux rel.7.3)

Installed software

conda env export -n sv-callers

name: sv-callers
channels:
- bioconda
- r
- defaults
- conda-forge
dependencies:
- bcftools=1.4.1=0
- bwa=0.7.15=1
- delly=0.7.7=boost1.61_1
- gawk=4.1.3=1
- gridss=1.3.4=py27_0
- htslib=1.4.1=0
- lumpy-sv=0.2.13=py27_0
- manta=1.1.0=py27_0
- pysam=0.11.2.2=py27_0
- sambamba=0.6.6=0
- samblaster=0.1.24=0
- samtools=1.4.1=0
- boost=1.61.0=py27_0
- bzip2=1.0.6=3
- cairo=1.14.8=0
- curl=7.52.1=0
- fontconfig=2.12.1=3
- freetype=2.5.5=2
- glib=2.50.2=1
- gmp=6.1.0=0
- gsl=2.2.1=0
- harfbuzz=0.9.39=2
- icu=54.1=0
- jbig=2.1=0
- jpeg=9b=0
- libffi=3.2.1=1
- libgcc=5.2.0=0
- libiconv=1.14=0
- libpng=1.6.27=0
- libtiff=4.0.6=3
- libxml2=2.9.4=0
- mkl=2017.0.1=0
- mpfr=3.1.5=0
- ncurses=5.9=10
- numpy=1.13.0=py27_0
- openjdk=8.0.121=1
- openssl=1.0.2l=0
- pango=1.40.3=1
- pcre=8.39=1
- pip=9.0.1=py27_1
- pixman=0.34.0=0
- python=2.7.13=0
- readline=6.2=2
- setuptools=27.2.0=py27_0
- sqlite=3.13.0=0
- tk=8.5.18=0
- wheel=0.29.0=py27_0
- xz=5.2.2=1
- zlib=1.2.8=3
- r=3.3.2=r3.3.2_0
- r-base=3.3.2=1
- r-boot=1.3_18=r3.3.2_0
- r-class=7.3_14=r3.3.2_0
- r-cluster=2.0.5=r3.3.2_0
- r-codetools=0.2_15=r3.3.2_0
- r-foreign=0.8_67=r3.3.2_0
- r-kernsmooth=2.23_15=r3.3.2_0
- r-lattice=0.20_34=r3.3.2_0
- r-mass=7.3_45=r3.3.2_0
- r-matrix=1.2_7.1=r3.3.2_0
- r-mgcv=1.8_16=r3.3.2_0
- r-nlme=3.1_128=r3.3.2_0
- r-nnet=7.3_12=r3.3.2_0
- r-recommended=3.3.2=r3.3.2_0
- r-rpart=4.1_10=r3.3.2_0
- r-spatial=7.3_11=r3.3.2_0
- r-survival=2.40_1=r3.3.2_0
prefix: /home/cog/akuzniar/miniconda2/envs/sv-caller
sv caller implementation parallelism I/O file formats
Manta C++/Python pyFlow tasks, SIMD BAM,CRAM / VCF
DELLY C++ OpenMP threads BAM / BCF
LUMPY C/C++/Python - BAM / VCF
GRIDSS Java/R/Python threads, SIMD BAM,CRAM / VCF,BCF

Data sets

  1. Prostate cancer sample

    • based dir: /hpc/cog_bioinf/ridder/users/akuzniar
    • input files: VCAP_dedup.realigned.bam(79GB)->chr21_pairs.bam|cram(1.6GB|0.7GB) or chr1_pairs.bam|cram(5.8GB|...), Homo_sapiens.GRCh37.GATK.illumina.fasta(3GB)
  2. Tumor-Normal (blood) samples:

    • base dir: /hpc/cog_bioinf/ridder/users/akuzniar
    • input files: Z424-10B00A0_dedup.realigned.bam(63GB; T) and Z424-00A00A0_dedup.realigned.bam(61GB; N)
export DATA_DIR=/hpc/cog_bioinf/ridder/users/akuzniar
cd $DATA_DIR

Use BAM or CRAM

samtools view -bh -@ 8 -f 2 -o chr21_pairs.bam VCAP_dedup.realigned.bam 21
samtools index -@ 8 chr21_pairs.bam # create index file (.bai)

or

samtools view -C -@ 8 -f 2 -C -T Homo_sapiens.GRCh37.GATK.illumina.fasta -o chr21_pairs.cram chr21_pairs.bam # 
samtools index -@ 8 chr21_pairs.cram # create index file (.crai)

Alternatively, use sambamba (with BAM/CRAM) instead of samtools.

sambamba view -o chr21.bam -f bam -F "proper_pair" -t 8 VCAP_dedup.realigned.bam 21 # or with -f cram

N.B. To work on a subset of the human genome (e.g. chr.21) select proper read pairs.

Check read pairs statistics

samtools flagstat chr21_pairs.bam

20672736 + 0 in total (QC-passed reads + QC-failed reads)
37338 + 0 secondary
0 + 0 supplementary
1637676 + 0 duplicates
20672736 + 0 mapped (100.00% : N/A)
20635398 + 0 paired in sequencing
10317699 + 0 read1
10317699 + 0 read2
20635398 + 0 properly paired (100.00% : N/A)
20635398 + 0 with itself and mate mapped
0 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)

samtools flagstat chr1_pairs.bam

80671804 + 0 in total (QC-passed reads + QC-failed reads)
150516 + 0 secondary
0 + 0 supplementary
9041508 + 0 duplicates
80671804 + 0 mapped (100.00% : N/A)
80521288 + 0 paired in sequencing
40260644 + 0 read1
40260644 + 0 read2
80521288 + 0 properly paired (100.00% : N/A)
80521288 + 0 with itself and mate mapped
0 + 0 singletons (0.00% : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)

Run Manta

  • tumor sample (chr.21)
configManta.py --tumorBam=chr21_pairs.bam --reference=Homo_sapiens.GRCh37.GATK.illumina.fasta --runDir=. # or with *.cram
THREADS=1
echo ". activate sv-callers; ./runWorkflow.py --quiet -m local -j ${THREADS}" | qsub -N manta-${THREADS} -cwd -V -l h_rt=00:30:00 -l h_vmem=1G -pe threaded ${THREADS}
# ./runWorkflow.py -m sge -j $THREADS # submit directly to SGE cluster as pyflowTask(s)
  • tumor-normal samples
configManta.py --tumorBam=Z424-10B00A0_dedup.realigned.bam --normalBam=Z424-00A00A0_dedup.realigned.bam --reference=Homo_sapiens.GRCh37.GATK.illumina.fasta --runDir=. 
echo ". activate sv-callers; ./runWorkflow.py --quiet -m local -j ${THREADS}" | qsub -N manta-${THREADS} -cwd -V -l h_rt=07:00:00 -l h_vmem=2G -pe threaded ${THREADS}

Run DELLY

  • tumor sample (chr.21)
# enable multi-threading at sample-level, requires more than one BAM file
# export OMP_NUM_THREADS=1
# export OMP_PROC_BIND=true # for >1 OMP_NUM_THREADS
# qsub ... -pe threaded $OMP_NUM_THREADS
SV_TYPES=(DEL DUP INV BND INS)
for t in "${SV_TYPES[@]}"; do
   echo ". activate sv-callers; delly call -t ${t} -g Homo_sapiens.GRCh37.GATK.illumina.fasta -o delly_chr21_${t}.bcf -x chr21.excl.tsv chr21_pairs.bam" | qsub -N delly-${t}-${OMP_NUM_THREADS} -cwd -V -l h_rt=01:00:00 -l h_vmem=1G
done
  • tumor-normal samples
for t in "${SV_TYPES[@]}"
do
   echo ". activate sv-callers; delly call -t ${t} -g Homo_sapiens.GRCh37.GATK.illumina.fasta -o delly-${t}_Z424.bcf Z424-10B00A0_dedup.realigned.bam Z424-00A00A0_dedup.realigned.bam" | qsub -N delly-${t}-${OMP_NUM_THREADS} -cwd -V -l h_rt=24:00:00 -l h_vmem=8G -pe threaded ${OMP_NUM_THREADS}
done

Note: DELLY must be called for each SV type separately. DELLY's runtime can be further improved by parallel execution per chromosome (for INS, INV, DUP and DEL) and per chromosome pairs (BND = translocation).

Run LUMPY

  • tumor sample (chr.21)
echo ". activate sv-callers; lumpyexpress -B chr21_pairs.bam -o lumpy_chr21_pairs.vcf" | qsub -N lumpy -cwd -V -l h_rt=00:30:00 -l h_vmem=1G
  • tumor-normal samples
echo ". activate sv-callers; lumpyexpress -B Z424-10B00A0_dedup.realigned.bam,Z424-00A00A0_dedup.realigned.bam -o lumpy-Z424.vcf" | qsub -N lumpy -cwd -V -l h_rt=10:00:00 -l h_vmem=8G

TODO: To (potentially) speed-up the execution, extract split and discordant reads using SAMBAMBA instead of SAMBLASTER prior running LUMPY(express). For this, use -S and -D options of LUMPY.

Run GRIDSS

  • tumor sample (chr.21)
THREADS=1
# rm -fr *gridss* *.dict snappy* # clean-up tmp files & dirs
echo ". activate sv-callers; gridss -Xmx31g gridss.CallVariants WORKER_THREADS=${THREADS} TMP_DIR=. WORKING_DIR=. REFERENCE_SEQUENCE=Homo_sapiens.GRCh37.GATK.illumina.fasta INPUT=chr21_pairs.bam OUTPUT=gridss_chr21.vcf ASSEMBLY=gridss_assembly.bam" | qsub -N gridss-${THREADS} -cwd -V -l h_vmem=64G -l h_rt=02:00:00 -pe threaded ${THREADS}
  • tumor-normal samples
echo ". activate sv-callers; gridss -Xmx31g gridss.CallVariants WORKER_THREADS=${THREADS} TMP_DIR=. WORKING_DIR=. REFERENCE_SEQUENCE=Homo_sapiens.GRCh37.GATK.illumina.fasta INPUT=Z424-00A00A0_dedup.realigned.bam INPUT=Z424-10B00A0_dedup.realigned.bam OUTPUT=gridss-${THREADS}_Z424.vcf ASSEMBLY=gridss-${THREADS}_Z424_assembly.bam" | qsub -N gridss-${THREADS} -cwd -V -l h_vmem=64G -l h_rt=48:00:00 -pe threaded ${THREADS}

Summary

chromosome runtime (s) memory use (M)
1 3158/487/317 21/29/38
2 3377/486/217 21/30/49
3 2710/467/183 21/29/48
4 2429/396/167 21/30/49
5 2547/487/184 21/29/49
6 2243/487/136 21/29/38
7 2763/429/182 21/29/49
8 2866/487/190 21/29/38
9 2244/294/145 21/29/38
10 1976/277/145 21/29/39
11 2073/487/142 21/29/49
12 2284/487/149 21/29/38
13 1211/487/76 21/29/38
14 1186/136/74 21/29/38
15 1439/199/106 21/29/38
16 863/97/67 21/29/38
17 1182/135/82 21/29/38
18 1167/140/72 19/29/47
19 902/100/63 21/29/38
20 1141/134/80 21/29/38
21 871/101/62 21/29/47
22 531/62/46 21/29/38
MT 96/15/7 21/29/37
X 1434/157/92 21/29/38
Y 468/66/42 21/34/37

Note: Samtools segfaults when requesting less than 1G h_vmem for 16 threads.

  • chromosome 21 (tumor sample)
sv caller # threads runtime memory use
Manta 1/8/16 18m 58s/2m 38s/1m 53s 85.5/328.9/496.5M
DELLY 1 DEL: 20m 19s DUP: 1m 29s INV: 1m 4s INS : 1h 3m BND: 56s DEL: 57 DUP: 58.4 INV: 48.5 INS: 55 BND: 48.5M
LUMPY 1 9m 6s 240M
GRIDSS 1/8/16 1h 42m/1h 7m/1h 2m 12.8/10.4/14.7G
  • chromosome 1 (tumor sample)
sv caller # threads runtime memory use
Manta 1/8/16 23m 54s/3m 54s/2m 38s 70/260.2/440.9M
DELLY 1 DEL: 1h 12m DUP: 3m 21s INV: 2m 54s INS: 1h 23m BND: 2m 23s DEL: 252 DUP: 272 INV: 49.6 INS: 250 BND: 49.6M
LUMPY 1 38m 21s 1G
GRIDSS 1/8/16 2h 54m/1h 2m/58m 40s 18.3/18/17.8G
  • chromosome pairs (tumor-normal samples)
sv caller # threads runtime memory use
Manta 24 1m 21s 772.7M
DELLY 2 DEL: 13m 5s DUP: 2m 37s INV: 2m INS: 12m 27s BND: 1m 54s DEL: 113.1 DUP: 113 INV: 113.1 INS: 113.1 BND: 113.1M
LUMPY 1 15m 11s 169.2M
GRIDSS 24 47m 10s 25.8G

Note: LUMPY failed due to an issue with tmpspace.

  • all chromosomes (tumor-normal samples)
sv caller # threads runtime memory use
Manta 1/8/16 7h/1h 27m/34m 57s 248M/1.6/3.1G
DELLY 1/2 DEL: 24h 3m/17h 29m DUP: 5h 46m/3h 37m INV: 5h 49m/3h 58m INS: 24h 4m/19h 30m BND: >48h/>20h DEL: 432.2/514.8M DUP: 409.7/465.8M INV: 417.5/468.5M INS: 318/324.5M BND: 1.8/2.6G
LUMPY 1 10h 45m 9.4G
GRIDSS 1/8/16 >48h/22h 20m/14h 33m >18.9/40.5/42G

Note: In DELLY the multi-threaded parallelism works only if multiple BAM files are used. Its runtime performance can be further improved (>15%) using core binding (data locality). runtime=ru_wallclock and memory use=maxvmem

Monitor jobs: watch --interval 10 qstat

Show job accounting log: qacct -j [job_id|job_name]