MaSuRCA - NBISweden/workshop-genome_assembly GitHub Wiki

MaSuRCA: De novo genome assembler

Notes:

All read pairs belong to one sample

Write config (Illumina PE data):

#!/usr/bin/env bash
#
# Write a MaSuRCA config file that lists every *_R1/_R2 read pair found in
# DATA_DIR as a separate paired-end library of the same sample.
# Output: <PREFIX>-masurca_assembly/<PREFIX>_masurca.cfg, where PREFIX is the
# name of the read directory.

set -euo pipefail
shopt -s nullglob	# an unmatched glob must expand to nothing, not to itself

DATA_DIR=/path/to/reads
# Use an absolute path: masurca is later run from inside the assembly
# directory, so relative read paths in the config would no longer resolve.
DATA_DIR=$(cd "$DATA_DIR" && pwd)
FILES=( "$DATA_DIR"/*_R1.fastq.gz )
if (( ${#FILES[@]} == 0 )); then
	printf 'No *_R1.fastq.gz files found in %s\n' "$DATA_DIR" >&2
	exit 1
fi

PREFIX=$(basename "$DATA_DIR")
OUT_DIR="${PREFIX}-masurca_assembly"
mkdir -p "$OUT_DIR"

# The commands are grouped (rather than embedded in a here-document) so that a
# failure of the perl step aborts the script instead of being silently ignored.
{
	printf 'DATA\n'
	# One "PE = <id> 500 50 <R1> <R2>" line per read pair. The library id
	# starts at "aa" and is advanced with perl's string increment (aa, ab, ...).
	# Only the trailing _R1.fastq.gz is rewritten to find the mate file.
	perl -e '
		my $id = "aa";
		for my $r1 (@ARGV) {
			(my $r2 = $r1) =~ s/_R1(\.fastq\.gz)\z/_R2$1/;
			-e $r2 or die "Missing mate file $r2\n";
			print "PE = $id 500 50 $r1 $r2\n";
			$id++;
		}
	' "${FILES[@]}"
	printf 'END\n\nPARAMETERS\nGRAPH_KMER_SIZE = auto\nEND\n'
} > "$OUT_DIR/${PREFIX}_masurca.cfg"

Slurm script:

#!/usr/bin/env bash
#
# Slurm job for the single-sample assembly: turn the previously written config
# into assemble.sh with masurca, then run the assembly.

set -euo pipefail

module load bioinfo-tools MaSuRCA
# NOTE(review): CPUS is not referenced below; MaSuRCA presumably takes its
# thread count from NUM_THREADS in the config -- confirm whether it should be
# written there.
CPUS="${SLURM_NPROCS:-16}"
# SLURM_ARRAY_TASK_ID is deliberately not read here: this is not an array job,
# so the variable is unset and referencing it would abort under `set -u`.

DATA_DIR=/path/to/reads

PREFIX=$(basename "$DATA_DIR")
cd "${PREFIX}-masurca_assembly"

masurca "${PREFIX}_masurca.cfg"
bash assemble.sh
cd ..

Each read pair is a sample

Command (Illumina PE data):

#!/usr/bin/env bash
#
# Slurm array job: each array task assembles one read pair (one sample).
# The array task ID is used as a zero-based index into the list of R1 files.

set -euo pipefail
shopt -s nullglob	# an unmatched glob must expand to nothing, not to itself

module load bioinfo-tools MaSuRCA
CPUS="${SLURM_NPROCS:-16}"
# Fail early with a clear message when not submitted as an array job.
JOB=${SLURM_ARRAY_TASK_ID:?this script must be submitted as a Slurm array job}

DATA_DIR=/path/to/reads
FILES=( "$DATA_DIR"/*_R1.fastq.gz )
if (( ${#FILES[@]} == 0 )); then
	printf 'No *_R1.fastq.gz files found in %s\n' "$DATA_DIR" >&2
	exit 1
fi

#######################################
# Assemble one paired-end sample with MaSuRCA.
# Creates <prefix>-masurca_assembly/ in the current directory, writes
# <prefix>_masurca.cfg into it, then runs masurca and the generated
# assemble.sh from inside that directory. <prefix> is the basename of the
# first read file with everything from the last "_R1" onwards removed.
# Globals:   CPUS (read) - thread count written as NUM_THREADS; 16 if unset
# Arguments: $1 - first-in-pair (R1) read file
#            $2 - second-in-pair (R2) read file
# Returns:   0 on success; non-zero if a read file is unreadable or any
#            step fails. The caller's working directory is left unchanged.
#######################################
apply_masurca () {
	local read1=$1 read2=$2
	local prefix workdir cfg path

	# The assembly runs inside a sub-directory, so relative read paths must be
	# made absolute before they are written to the config.
	[[ $read1 == /* ]] || read1="$PWD/$read1"
	[[ $read2 == /* ]] || read2="$PWD/$read2"

	for path in "$read1" "$read2"; do
		if [[ ! -r $path ]]; then
			printf 'apply_masurca: cannot read %s\n' "$path" >&2
			return 1
		fi
	done

	prefix=$(basename "${read1%_R1*}")
	workdir="${prefix}-masurca_assembly"
	cfg="${prefix}_masurca.cfg"
	mkdir -p "$workdir" || return

	# printf instead of a `<<-` here-document: `<<-` only strips literal tabs,
	# so the script would break if its indentation were converted to spaces.
	printf '%s\n' \
		'DATA' \
		"PE = pe 500 50 $read1 $read2" \
		'END' \
		'' \
		'PARAMETERS' \
		'GRAPH_KMER_SIZE = auto' \
		"NUM_THREADS = ${CPUS:-16}" \
		'END' > "$workdir/$cfg" || return

	# Run in a subshell so the caller's working directory is restored even when
	# a step fails; assemble.sh is only run if masurca succeeded.
	(
		cd "$workdir" || exit
		masurca "$cfg" || exit
		bash assemble.sh
	)
}

# Select the R1 file for this array task. JOB is used as a zero-based index
# into FILES, so the job array should span 0 .. (number of R1 files - 1).
if [[ ! ${JOB:-} =~ ^[0-9]+$ ]] || (( JOB >= ${#FILES[@]} )); then
	printf 'Array task ID "%s" does not index any of the %d R1 files\n' \
		"${JOB:-}" "${#FILES[@]}" >&2
	exit 1
fi
FASTQ="${FILES[JOB]}"
# Swap only the trailing _R1.fastq.gz for _R2.fastq.gz, so that an "_R1."
# elsewhere in the path (e.g. in a directory name) is left untouched.
apply_masurca "$FASTQ" "${FASTQ%_R1.fastq.gz}_R2.fastq.gz"
⚠️ **GitHub.com Fallback** ⚠️