Kmer Analysis Toolkit - NBISweden/workshop-genome_assembly GitHub Wiki

KAT: Data sequence k-mer histogram

Notes:

  • A jagged histogram indicates overlapping reads.
  • No peak indicates insufficient data.
  • Distance between peaks gives an estimate of ploidy.
  • Estimates only core genome size - i.e. repetitive content is often not included in the size estimate.

Command:

#!/usr/bin/env bash

module load bioinfo-tools KAT

CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID

DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )

apply_kathist () {
	READ1="$1"      # Read 1 of the read pair to be screened
	READ2="$2"      # Read 2 of the read pair to be screened
	if [ "$READ1" == "$READ2" ]; then
		>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
		exit 2
	fi
	PREFIX=$(basename "${READ1%_R1*}")
	TMP_FASTQ=$(mktemp -u --suffix ".fastq")
	mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &
	sleep 5
	kat hist -H 800000000 -t "$CPUS" -o "${PREFIX}-hist" "${TMP_FASTQ}"
	rm "${TMP_FASTQ}"
}

FASTQ="${FILES[$JOB]}"
apply_kathist "$FASTQ" "${FASTQ/_R1./_R2.}"

KAT: Data sequence k-mer gc histogram

Notes:

  • Useful for detecting contamination.
  • Often needs a scale change

Command:

#!/usr/bin/env bash

module load bioinfo-tools KAT

CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID

DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )

apply_katgcp () {
	READ1="$1"	# Read 1 of the read pair to be screened
	READ2="$2"	# Read 2 of the read pair to be screened
	if [ "$READ1" == "$READ2" ]; then
		>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
		exit 2
	fi
	PREFIX=$(basename "${READ1%_R1*}")
	TMP_FASTQ=$(mktemp -u --suffix ".fastq")
	mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &
	sleep 5
	kat gcp -H 800000000 -t "$CPUS" -o "${PREFIX}-gcp" "${TMP_FASTQ}"
	rm "${TMP_FASTQ}"
}

FASTQ="${FILES[$JOB]}"
apply_katgcp "$FASTQ" "${FASTQ/_R1./_R2.}"

KAT: Data sequence Run comparison

Notes:

  • Useful for comparing sequence content.

Command: R1 vs R2

#!/usr/bin/env bash

module load bioinfo-tools KAT

CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID

DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )

apply_katcompreads () {
	READ1="$1"	# Read 1 of the read pair to be screened
	READ2="$2"	# Read 2 of the read pair to be screened
	if [ "$READ1" == "$READ2" ]; then
		>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
		exit 2
	fi
	PREFIX=$(basename "${READ1%_R1*}")
	TMP_FASTQ1=$(mktemp -u --suffix "_R1.fastq")
	TMP_FASTQ2=$(mktemp -u --suffix "_R2.fastq")
	mkfifo "${TMP_FASTQ1}" && zcat "$READ1" > "${TMP_FASTQ1}" &
	mkfifo "${TMP_FASTQ2}" && zcat "$READ2" > "${TMP_FASTQ2}" &
	sleep 5
	kat comp -n -H 800000000 -I 80000000 -t "$CPUS" -o "${PREFIX}-r1vsr2" "${TMP_FASTQ1}" "${TMP_FASTQ2}"
	kat plot spectra-mx -i -o "${PREFIX}-r1vsr2-main.mx.spectra-mx.png" "${PREFIX}-r1vsr2-main.mx"
	rm "${TMP_FASTQ1}" "${TMP_FASTQ2}"
}

FASTQ="${FILES[$JOB]}"
apply_katcompreads "$FASTQ" "${FASTQ/_R1./_R2.}"

Command: LIB_A_R{1,2} vs LIB_B_R{1,2}

#!/usr/bin/env bash

module load bioinfo-tools KAT

CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID

DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )

apply_katcomplibs () {
	READ_A1="$1"	# Read 1 of the Sample A read pair to be screened
	READ_A2="$2"	# Read 2 of the Sample A read pair to be screened
	READ_B1="$3"	# Read 1 of the Sample B read pair to be screened
	READ_B2="$4"	# Read 2 of the Sample B read pair to be screened
	PREFIX_A=$(basename "${READ_A1%_R1*}")
	PREFIX_B=$(basename "${READ_B1%_R1*}")
	TMP_FASTQ1=$(mktemp -u --suffix "_A.fastq")
	TMP_FASTQ2=$(mktemp -u --suffix "_B.fastq")
	mkfifo "${TMP_FASTQ1}" && zcat "$READ_A1" "$READ_A2" > "${TMP_FASTQ1}" &
	mkfifo "${TMP_FASTQ2}" && zcat "$READ_B1" "$READ_B2" > "${TMP_FASTQ2}" &
	sleep 5
	kat comp -H 800000000 -I 80000000 -t "$CPUS" -o "${PREFIX_A}vs${PREFIX_B}-comp" "${TMP_FASTQ1}" "${TMP_FASTQ2}"
	rm "${TMP_FASTQ1}" "${TMP_FASTQ2}"
}

# Generate file pairing for all files in $FILES[@] excluding self.
PAR1=()
PAR2=()
set -- "${FILES[@]}"
for FILE_A; do
	shift;
	for FILE_B; do
		PAR1+=("$FILE_A")
		PAR2+=("$FILE_B")
	done
done

if [ ! -z "${PAR1[$JOB]}" ]; then
	FASTQ_LIBA="${PAR1[$JOB]}"
	FASTQ_LIBB="${PAR2[$JOB]}"
	apply_katcomplibs "$FASTQ_LIBA" "${FASTQ_LIBA/_R1./_R2.}" "$FASTQ_LIBB" "${FASTQ_LIBB/_R1./_R2.}"
else
	echo "No file pairing exists for array job $JOB" && exit 1
fi

Reference Coverage: Reference k-mer coverage screen

Notes:

  • Dependencies: KAT, Rscript, R modules:Gviz
  • Shorter execution time and can handle high coverage and wider intervals than making BAM files.

Command:

#!/usr/bin/env bash

module load bioinfo-tools KAT R_packages

CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID

REF=/path/to/reference_assembly
DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )
TOOLS=/path/to/r_script

apply_katsect () {
	READ1="$1"		# Read 1 of the read pair to be screened
	READ2="$2"		# Read 2 of the read pair to be screened
	if [ "$READ1" == "$READ2" ]; then
		>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
		exit 2
	fi
	REFERENCE="$3"	# Reference to be screened against
	# REFERENCE=${REFERENCE_DIR}/reference/Reference.fasta
	PREFIX="$(basename "${READ1%_R1*}")-kat_$(basename "$REFERENCE" .fastq.gz )"
	TMP_FASTQ=$(mktemp -u --suffix ".fastq")
	mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &
	sleep 5
	kat sect -H 800000000 -t "$CPUS" -o "${PREFIX}-sect" "$REFERENCE" "${TMP_FASTQ}"
	rm "${TMP_FASTQ}"
	Rscript $TOOLS/plot_kmer_coverage.R "${PREFIX}-sect-counts.cvg"
}

FASTQ="${FILES[$JOB]}"
apply_katsect "$FASTQ" "${FASTQ/_R1./_R2.}"

plot_kmer_coverage.R

library(Gviz)
args <- commandArgs(trailingOnly=TRUE)
options(ucscChromosomeNames=FALSE)
plotCoverage <- function(cvg) {
	gtrack <- GenomeAxisTrack()
	cov_data <- read.table(pipe(paste('tr " " "\n" < ',cvg,' | tr -d ">"',sep="")),header=TRUE)
	start <- seq(1,length(cov_data[,1]))
	end <- start
	bedgraph_data <- data.frame(start,end,cov_data[,1])
	coverage_track <- DataTrack(bedgraph_data,type="histogram",name=paste("Coverage (Median:",median(cov_data[,1]),")"),genome="REFERENCE",chromosome="REFERENCE")
	png(paste(cvg,".png",sep=""),width=800,height=400)
	plotTracks(list(gtrack,coverage_track),from=0,to=100000)
	dev.off()
}
lapply(args,plotCoverage)

KAT: Reads to Assembly k-mer comparison

Notes: *

Command:

#!/usr/bin/env bash

module load bioinfo-tools KAT

CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID

SAMPLE_PREFIX=SampleA_trimmed_no_human_normalised
DATA_DIR=/path/to/reads
FASTA_DIR=/path/to/assemblies
FILES=( $FASTA_DIR/*.fasta )

apply_katcomp () {
	ASSEMBLY="$1"		# The assembly is the first parameter to this function
	READ1="$2" 		# The first read pair is the second parameter to this function
	READ2="$3" 		# The second read pair is the third parameter to this function
	PREFIX=$( basename "${ASSEMBLY}" .fasta)
	TMP_FASTQ=$(mktemp -u --suffix ".fastq")
	mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &		# Make a named pipe and combine reads
	sleep 5																				# Give a little time for the pipe to be made
	kat comp -H 800000000 -t "$CPUS" -o "${PREFIX}_vs_reads.cmp" "${TMP_FASTQ}" "$ASSEMBLY" 	# Compare Reads to Assembly
	rm "${TMP_FASTQ}"
}

FASTA="${FILES[$JOB]}"
apply_katcomp "$FASTA" "$DATA_DIR/${SAMPLE_PREFIX}_R"{1,2}.fastq.gz

KAT: Reads vs Assembly - k-mer coverage vs GC comparison blobtools style.

Notes: *

Command:

#!/usr/bin/env bash

module load bioinfo-tools KAT

CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID

SAMPLE_PREFIX=SampleA_trimmed_no_human_normalised
DATA_DIR=/path/to/reads
FASTA_DIR=/path/to/assemblies
FILES=( $FASTA_DIR/*.fasta )

apply_katcold () {
	ASSEMBLY="$1"		# The assembly is the first parameter to this function
	READ1="$2" 		# The first read pair is the second parameter to this function
	READ2="$3" 		# The second read pair is the third parameter to this function
	PREFIX=$( basename "${ASSEMBLY}" .fasta)
	TMP_FASTQ=$(mktemp -u --suffix ".fastq")
	mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &		# Make a named pipe and combine reads
	sleep 5												# Give a little time for the pipe to be made
	kat cold -H 8000000000 -t "$CPUS" -o "${PREFIX}_vs_reads.cold" "$ASSEMBLY" "${TMP_FASTQ}" 	# Compare Reads to Assembly
	rm "${TMP_FASTQ}"
}

FASTA="${FILES[$JOB]}"
apply_katcold "$FASTA" "$DATA_DIR/${SAMPLE_PREFIX}_R"{1,2}.fastq.gz