GATK scripts from Jack - melisakman/Helianthus GitHub Wiki
#Trim and Filter, Done Local
bbmap/bbduk.sh in1=BCIS21_S74_L007_R1_001.fastq.gz in2=BCIS21_S74_L007_R2_001.fastq.gz out1=2BCIS21_1.fq out2=2BCIS21_2.fq ref=bbmap/resources/adapters.fa ktrim=r tbo tpe overwrite=T qtrim=rl trimq=10 -Xmx4g
#Map Reads number arrays equal number samples
#!/bin/bash
#SBATCH --job-name=IMSWCbatch3
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=fc_blackman
#SBATCH --partition=savio2
#SBATCH --qos=savio_normal
#SBATCH --nodes=1
#SBATCH --cpus-per-task=20
#SBATCH --mem=32000
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILE3.err
#SBATCH --output=YOUR_STD_OUTFILE3.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#SBATCH --array=1-8
#load any modules you'll need module load bwa/0.7.15 module load samtools/1.3.1
srun bwa mem -t 16 Mguttatus_256_v2.0.fa 2BCIS21_1.fq 2BCIS21_2.fq | samtools view -b -S -o bci21.bam
#Sort index and assign read groups
#!/bin/bash
#SBATCH --job-name=Picard31x
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio
#SBATCH --qos=rosalind_savio_normal
#SBATCH --mem-per-cpu=32G
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILEx.err
#SBATCH --output=YOUR_STD_OUTFILEx.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#SBATCH --array=0-7
#load any modules you'll need module load picard/2.4.1
INPUT=(bcis51.bam bcis52.bam bcis53.bam bcis54.bam bcis55.bam bcis56.bam bcis57.bam bcis58.bam) OUTPUT=(sbcis51.rg.bam sbcis52.rg.bam sbcis53.rg.bam sbcis54.rg.bam sbcis55.rg.bam sbcis56.rg.bam sbcis57.rg.bam sbcis58.rg.bam) RGSM=(IM51 IM52 IM53 IM54 IM55 IM56 IM57 IM58)
java -Xmx100G -jar /clusterfs/vector/home/groups/software/sl-6.x86_64/modules/picard/2.4.1/picard.jar AddOrReplaceReadGroups
INPUT=${INPUT[$SLURM_ARRAY_TASK_ID]}
OUTPUT=${OUTPUT[$SLURM_ARRAY_TASK_ID]}
RGID=H0164.1
RGLB=lib1
RGPL=illumina
RGPU=H0164ALXX140820.1
RGSM=${RGSM[$SLURM_ARRAY_TASK_ID]}
SORT_ORDER=coordinate
CREATE_INDEX=true
#Mark Duplicates
#!/bin/bash
#SBATCH --job-name=Picard31x
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILEx.err
#SBATCH --output=YOUR_STD_OUTFILEx.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#SBATCH --array=0-3
#load any modules you'll need module load picard/2.4.1
INPUT=(bcis42.rg.bam sbcis43.rg.bam sbcis44.rg.bam sbcis45.rg.bam) OUTPUT=(sbcis42.mdup.rg.bam sbcis43.mdup.rg.bam sbcis44.mdup.rg.bam sbcis45.mdup.rg.bam) METRICS=(42.metrics 43.metrics 44.metrics 45.metrics)
java -Xmx100G -jar /clusterfs/vector/home/groups/software/sl-6.x86_64/modules/picard/2.4.1/picard.jar MarkDuplicates
INPUT=${INPUT[$SLURM_ARRAY_TASK_ID]}
OUTPUT=${OUTPUT[$SLURM_ARRAY_TASK_ID]}
METRICS_FILE=${METRICS[$SLURM_ARRAY_TASK_ID]}
CREATE_INDEX=true
#Use GATK to creat .vcf
#!/bin/bash
#SBATCH --job-name=GATKIMSWC24_2
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=4-12:00:00
#SBATCH --error=Scaf2.err
#SBATCH --output=Scaf2.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#load any modules you'll need module load GATK/3.6
java -Xmx108G -jar /clusterfs/vector/home/groups/software/sl-6.x86_64/modules/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar
-R Mguttatus_256_v2.0.fa
-T HaplotypeCaller
-L scaffold_2
-I sbcis11.mdup.rg.bam \ -I sbcis14.mdup.rg.bam \ -I sbcis15.mdup.rg.bam \ -I sbcis16.mdup.rg.bam \ -I sbcis17.mdup.rg.bam \ -I sbcis18.mdup.rg.bam
-I sbcis21.mdup.rg.bam \ -I sbcis22.mdup.rg.bam \ -I sbcis23.mdup.rg.bam \ -I sbcis24.mdup.rg.bam \ -I sbcis25.mdup.rg.bam \ -I sbcis27.mdup.rg.bam \ -I sbcis28.mdup.rg.bam
-I sbcis31.rg.mdup.bam \ -I sbcis32.mdup.rg.bam \ -I sbcis33.mdup.rg.bam \ -I sbcis34.mdup.rg.bam \ -I sbcis35.mdup.rg.bam \ -I sbcis36.mdup.rg.bam \ -I sbcis37.mdup.rg.bam \ -I sbcis38.mdup.rg.bam
-I sbcis41.mdup.rg.bam \ -I sbcis42.mdup.rg.bam \ -I sbcis43.mdup.rg.bam \ -I sbcis44.mdup.rg.bam \ -I sbcis45.mdup.rg.bam \ -I sbcis46.mdup.rg.bam \ -I sbcis47.mdup.rg.bam \ -I sbcis48.mdup.rg.bam
-I sbcis52.mdup.rg.bam \ -I sbcis53.mdup.rg.bam \ -I sbcis54.mdup.rg.bam \ -I sbcis55.mdup.rg.bam \ -I sbcis56.mdup.rg.bam \ -I sbcis57.mdup.rg.bam
-ploidy 2
-o scaf2test.vcf
-rf DuplicateRead
--min_base_quality_score 25
-maxAltAlleles 2
-mmq 24
--heterozygosity 0.01
#Seperate SNPs
#!/bin/bash
#SBATCH --job-name=GatTesta
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILE2a.err
#SBATCH --output=YOUR_STD_OUTFILE2a.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#load any modules you'll need module load GATK/3.6
java -Xmx108G -jar /clusterfs/vector/home/groups/software/sl-6.x86_64/modules/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar
-T SelectVariants
-R Mguttatus_256_v2.0.fa
-V scaf2test.vcf
-selectType SNP
-o scaf2_SNP.vcf
#Standard Filtering
#!/bin/bash
#SBATCH --job-name=GatTesta
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILE2a.err
#SBATCH --output=YOUR_STD_OUTFILE2a.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#load any modules you'll need module load GATK/3.6
java -Xmx108G -jar /clusterfs/vector/home/groups/software/sl-6.x86_64/modules/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar
-T VariantFiltration
-R Mguttatus_256_v2.0.fa
-V scaf2_snps.vcf
--filterExpression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0"
--filterName "my_snp_filter"
-o scaf2_filteredsnps.vcf
#Seperate Indels
#!/bin/bash
#SBATCH --job-name=GatTesta
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILE2a.err
#SBATCH --output=YOUR_STD_OUTFILE2a.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#load any modules you'll need module load GATK/3.6
java -Xmx108G -jar /clusterfs/vector/home/groups/software/sl-6.x86_64/modules/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar
-T SelectVariants
-R Mguttatus_256_v2.0.fa
-V scaf2test.vcf
-selectType INDEL
-o scaf2_Indels.vcf
#Standard Filtering
#!/bin/bash
#SBATCH --job-name=GatTesta
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILE2a.err
#SBATCH --output=YOUR_STD_OUTFILE2a.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
#load any modules you'll need module load GATK/3.6
java -Xmx108G -jar /clusterfs/vector/home/groups/software/sl-6.x86_64/modules/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar
-T VariantFiltration
-R Mguttatus_256_v2.0.fa
-V scaf2_Indels.vcf
--filterExpression "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0"
--filterName "my_indel_filter"
-o scaf2_Indelsfiltered.vcf
#Identify Biallelic sites, MinQ over 500, sites that didn't pass filter, or over 25% missing data
#!/bin/bash
#SBATCH --job-name=GatTesta
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILE2a.err
#SBATCH --output=YOUR_STD_OUTFILE2a.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
module load vcftools/0.1.13
vcftools --vcf scaf2_filteredsnps.vcf --max-missing 0.75 --remove-filtered-all --minQ 500 --recode --recode-INFO-all --min-alleles 2 --max-alleles 2 --out scaf2_filteredsnps2.vcf
#Identify Biallelic sites, MinQ over 500, sites that didn't pass filter, or over 25% missing data
#!/bin/bash
#SBATCH --job-name=GatTesta
#SBATCH --workdir=/global/scratch/jmcbulls/IMSWC
#SBACTH --verbose
#SBATCH --account=co_rosalind
#SBATCH --partition=savio2_htc
#SBATCH --qos=rosalind_htc2_normal
#SBATCH --mem-per-cpu=110G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --time=24:00:00
#SBATCH --error=YOUR_ERR_FILE2a.err
#SBATCH --output=YOUR_STD_OUTFILE2a.out
#SBATCH --mail-user=[email protected]
#SBATCH --mail-type=All
module load vcftools/0.1.13
vcftools --vcf scaf2_filteredIndels.vcf --max-missing 0.75 --remove-filtered-all --minQ 500 --recode --recode-INFO-all --min-alleles 2 --max-alleles 2 --out scaf2_filteredIndels2.vcf