MaSuRCA - NBISweden/workshop-genome_assembly GitHub Wiki
Notes:
Write config (Illumina PE data):
#!/usr/bin/env bash
# Write a MaSuRCA config that lists every *_R1/_R2 FASTQ pair found in
# DATA_DIR as a paired-end (PE) library. The config is written to
# <PREFIX>-masurca_assembly/<PREFIX>_masurca.cfg under the current directory,
# where PREFIX is the last path component of DATA_DIR.
set -euo pipefail

# Directory holding the reads. Keep this an absolute path: the read paths are
# copied verbatim into the config, which is later used from inside the
# assembly directory.
DATA_DIR=/path/to/reads
FILES=( "$DATA_DIR"/*_R1.fastq.gz )

# An unmatched glob stays as the literal pattern; stop here instead of
# writing a config that points at a file named "*_R1.fastq.gz".
if [[ ! -e "${FILES[0]}" ]]; then
  printf 'No *_R1.fastq.gz files found in %s\n' "$DATA_DIR" >&2
  exit 1
fi

PREFIX=$(basename "$DATA_DIR")
OUT_DIR="${PREFIX}-masurca_assembly"
mkdir -p "$OUT_DIR"

# Build one "PE = <id> 500 50 <R1> <R2>" line per read pair. The id starts at
# "aa" and is advanced with Perl's string increment (aa, ab, ac, ...), and the
# R2 name is derived from the R1 name. Done outside the heredoc so that a perl
# failure stops the script under `set -e`.
PE_LINES=$( perl -e '$id="aa"; for(@ARGV){ $read1 = $_ ; chomp $read1 ;($read2 = $read1 ) =~ s/_R1\./_R2./ ; print "PE = $id 500 50 $read1 $read2\n"; $id++} ' "${FILES[@]}" )

# Written by path rather than after a `cd`, so the working directory is never
# changed and there is nothing to undo afterwards.
cat <<EOF > "${OUT_DIR}/${PREFIX}_masurca.cfg"
DATA
$PE_LINES
END
PARAMETERS
GRAPH_KMER_SIZE = auto
END
EOF
Slurm script (runs MaSuRCA on the config written above):
#!/usr/bin/env bash
# Run MaSuRCA on the config found in <PREFIX>-masurca_assembly, where PREFIX
# is the last path component of DATA_DIR. `masurca` is run on the config and
# then the assemble.sh found in that directory is executed.
set -ueo pipefail
module load bioinfo-tools MaSuRCA
# Core count granted by Slurm (16 when SLURM_NPROCS is unset). Not referenced
# by the commands below; kept so the value is at hand when editing the script.
CPUS="${SLURM_NPROCS:-16}"
# NOTE: no SLURM_ARRAY_TASK_ID lookup here. This script does a single
# assembly, and reading that variable under `set -u` aborts the job whenever
# it is not submitted as a job array.
DATA_DIR=/path/to/reads
PREFIX=$(basename "$DATA_DIR")
# Under `set -e` a failing cd, masurca, or assemble.sh stops the job here.
cd "${PREFIX}-masurca_assembly"
masurca "${PREFIX}_masurca.cfg"
bash assemble.sh
cd ..
Command (Illumina PE data, one assembly per read pair, run as a Slurm job array):
#!/usr/bin/env bash
# Assemble one Illumina paired-end sample per Slurm array task: each task
# picks one *_R1.fastq.gz file from DATA_DIR by its array index.
set -euo pipefail
module load bioinfo-tools MaSuRCA
# Core count granted by Slurm (16 when SLURM_NPROCS is unset).
CPUS="${SLURM_NPROCS:-16}"
# Index of the read pair to assemble. It is used directly as a subscript of
# FILES, which is zero-based, so the array should be submitted as 0..N-1.
# Fail with a clear message when the script is not run as an array task.
JOB=${SLURM_ARRAY_TASK_ID:?this script must be run as a Slurm job array task}
DATA_DIR=/path/to/reads
FILES=( "$DATA_DIR"/*_R1.fastq.gz )
# An unmatched glob stays as the literal pattern; treat that as "no input".
if [[ ! -e "${FILES[0]}" ]]; then
  printf 'No *_R1.fastq.gz files found in %s\n' "$DATA_DIR" >&2
  exit 1
fi
#######################################
# Assemble one paired-end library with MaSuRCA.
#
# Creates <prefix>-masurca_assembly in the current directory, writes
# <prefix>_masurca.cfg into it, runs `masurca` on that config and then runs
# the assemble.sh found in that directory. <prefix> is the base name of the
# first read file with everything from the last "_R1" onwards removed.
#
# Arguments:
#   $1 - FASTQ file with the first reads of each pair
#   $2 - FASTQ file with the second reads of each pair
# Returns:
#   0 on success; 2 when called with fewer than two arguments; otherwise the
#   status of the first step that failed. The caller's working directory is
#   left unchanged in every case.
#######################################
apply_masurca () {
  if (( $# < 2 )); then
    printf 'usage: apply_masurca <read1.fastq.gz> <read2.fastq.gz>\n' >&2
    return 2
  fi

  local read1="$1"
  local read2="$2"
  local prefix workdir

  # The config is consumed from inside the assembly directory, so a relative
  # path would stop resolving there. Anchor it to the current directory.
  [[ "$read1" == /* ]] || read1="$PWD/$read1"
  [[ "$read2" == /* ]] || read2="$PWD/$read2"

  prefix=$(basename -- "${read1%_R1*}") || return
  workdir="${prefix}-masurca_assembly"
  mkdir -p -- "$workdir" || return

  # Run in a subshell: the directory change stays local, and a failing step
  # ends the subshell immediately instead of letting later steps run in the
  # wrong place or on stale files.
  (
    cd -- "$workdir" || exit
    printf '%s\n' \
      'DATA' \
      "PE = pe 500 50 $read1 $read2" \
      'END' \
      'PARAMETERS' \
      'GRAPH_KMER_SIZE = auto' \
      'END' > "${prefix}_masurca.cfg" || exit
    masurca "${prefix}_masurca.cfg" || exit
    bash assemble.sh
  )
}
# Assemble the read pair selected by this array task. JOB is used as a
# zero-based subscript of FILES, so reject anything that is not a number in
# 0..N-1 instead of calling apply_masurca with empty file names.
if [[ ! "$JOB" =~ ^[0-9]+$ ]] || (( JOB >= ${#FILES[@]} )); then
  printf 'Array task index "%s" does not select any of the %d read pairs\n' \
    "$JOB" "${#FILES[@]}" >&2
  exit 1
fi
FASTQ="${FILES[$JOB]}"
# Derive the mate file by swapping only the trailing _R1.fastq.gz suffix, so
# an "_R1." that happens to occur earlier in the path is left alone.
apply_masurca "$FASTQ" "${FASTQ%_R1.fastq.gz}_R2.fastq.gz"