Kmer Analysis Toolkit - NBISweden/workshop-genome_assembly GitHub Wiki
KAT: Data sequence k-mer histogram
Notes:
- A jagged histogram indicates overlapping reads.
- No peak indicates insufficient data.
- Distance between peaks gives an estimate of ploidy.
- Estimates only core genome size - i.e. repetitive content is often not included in the size estimate.
Command:
#!/usr/bin/env bash
module load bioinfo-tools KAT
CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID
DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )
apply_kathist () {
READ1="$1" # Read 1 of the read pair to be screened
READ2="$2" # Read 2 of the read pair to be screened
if [ "$READ1" == "$READ2" ]; then
>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
exit 2
fi
PREFIX=$(basename "${READ1%_R1*}")
TMP_FASTQ=$(mktemp -u --suffix ".fastq")
mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &
sleep 5
kat hist -H 800000000 -t "$CPUS" -o "${PREFIX}-hist" "${TMP_FASTQ}"
rm "${TMP_FASTQ}"
}
FASTQ="${FILES[$JOB]}"
apply_kathist "$FASTQ" "${FASTQ/_R1./_R2.}"
KAT: Data sequence k-mer gc histogram
Notes:
- Useful for detecting contamination.
- Often needs a scale change
Command:
#!/usr/bin/env bash
module load bioinfo-tools KAT
CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID
DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )
apply_katgcp () {
READ1="$1" # Read 1 of the read pair to be screened
READ2="$2" # Read 2 of the read pair to be screened
if [ "$READ1" == "$READ2" ]; then
>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
exit 2
fi
PREFIX=$(basename "${READ1%_R1*}")
TMP_FASTQ=$(mktemp -u --suffix ".fastq")
mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &
sleep 5
kat gcp -H 800000000 -t "$CPUS" -o "${PREFIX}-gcp" "${TMP_FASTQ}"
rm "${TMP_FASTQ}"
}
FASTQ="${FILES[$JOB]}"
apply_katgcp "$FASTQ" "${FASTQ/_R1./_R2.}"
KAT: Data sequence Run comparison
Notes:
- Useful for comparing sequence content.
Command: R1 vs R2
#!/usr/bin/env bash
module load bioinfo-tools KAT
CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID
DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )
apply_katcompreads () {
READ1="$1" # Read 1 of the read pair to be screened
READ2="$2" # Read 2 of the read pair to be screened
if [ "$READ1" == "$READ2" ]; then
>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
exit 2
fi
PREFIX=$(basename "${READ1%_R1*}")
TMP_FASTQ1=$(mktemp -u --suffix "_R1.fastq")
TMP_FASTQ2=$(mktemp -u --suffix "_R2.fastq")
mkfifo "${TMP_FASTQ1}" && zcat "$READ1" > "${TMP_FASTQ1}" &
mkfifo "${TMP_FASTQ2}" && zcat "$READ2" > "${TMP_FASTQ2}" &
sleep 5
kat comp -n -H 800000000 -I 80000000 -t "$CPUS" -o "${PREFIX}-r1vsr2" "${TMP_FASTQ1}" "${TMP_FASTQ2}"
kat plot spectra-mx -i -o "${PREFIX}-r1vsr2-main.mx.spectra-mx.png" "${PREFIX}-r1vsr2-main.mx"
rm "${TMP_FASTQ1}" "${TMP_FASTQ2}"
}
FASTQ="${FILES[$JOB]}"
apply_katcompreads "$FASTQ" "${FASTQ/_R1./_R2.}"
Command: LIB_A_R{1,2} vs LIB_B_R{1,2}
#!/usr/bin/env bash
module load bioinfo-tools KAT
CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID
DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )
apply_katcomplibs () {
READ_A1="$1" # Read 1 of the Sample A read pair to be screened
READ_A2="$2" # Read 2 of the Sample A read pair to be screened
READ_B1="$3" # Read 1 of the Sample B read pair to be screened
READ_B2="$4" # Read 2 of the Sample B read pair to be screened
PREFIX_A=$(basename "${READ_A1%_R1*}")
PREFIX_B=$(basename "${READ_B1%_R1*}")
TMP_FASTQ1=$(mktemp -u --suffix "_A.fastq")
TMP_FASTQ2=$(mktemp -u --suffix "_B.fastq")
mkfifo "${TMP_FASTQ1}" && zcat "$READ_A1" "$READ_A2" > "${TMP_FASTQ1}" &
mkfifo "${TMP_FASTQ2}" && zcat "$READ_B1" "$READ_B2" > "${TMP_FASTQ2}" &
sleep 5
kat comp -H 800000000 -I 80000000 -t "$CPUS" -o "${PREFIX_A}vs${PREFIX_B}-comp" "${TMP_FASTQ1}" "${TMP_FASTQ2}"
rm "${TMP_FASTQ1}" "${TMP_FASTQ2}"
}
# Generate file pairing for all files in $FILES[@] excluding self.
PAR1=()
PAR2=()
set -- "${FILES[@]}"
for FILE_A; do
shift;
for FILE_B; do
PAR1+=("$FILE_A")
PAR2+=("$FILE_B")
done
done
if [ ! -z "${PAR1[$JOB]}" ]; then
FASTQ_LIBA="${PAR1[$JOB]}"
FASTQ_LIBB="${PAR2[$JOB]}"
apply_katcomplibs "$FASTQ_LIBA" "${FASTQ_LIBA/_R1./_R2.}" "$FASTQ_LIBB" "${FASTQ_LIBB/_R1./_R2.}"
else
echo "No file pairing exists for array job $JOB" && exit 1
fi
Reference Coverage: Reference k-mer coverage screen
Notes:
- Dependencies: KAT, Rscript, R modules:Gviz
- Shorter execution time and can handle high coverage and wider intervals than making BAM files.
Command:
#!/usr/bin/env bash
module load bioinfo-tools KAT R_packages
CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID
REF=/path/to/reference_assembly
DATA_DIR=/path/to/reads
FILES=( $DATA_DIR/*_R1.fastq.gz )
TOOLS=/path/to/r_script
apply_katsect () {
READ1="$1" # Read 1 of the read pair to be screened
READ2="$2" # Read 2 of the read pair to be screened
if [ "$READ1" == "$READ2" ]; then
>&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
exit 2
fi
REFERENCE="$3" # Reference to be screened against
# REFERENCE=${REFERENCE_DIR}/reference/Reference.fasta
PREFIX="$(basename "${READ1%_R1*}")-kat_$(basename "$REFERENCE" .fastq.gz )"
TMP_FASTQ=$(mktemp -u --suffix ".fastq")
mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" &
sleep 5
kat sect -H 800000000 -t "$CPUS" -o "${PREFIX}-sect" "$REFERENCE" "${TMP_FASTQ}"
rm "${TMP_FASTQ}"
Rscript $TOOLS/plot_kmer_coverage.R "${PREFIX}-sect-counts.cvg"
}
FASTQ="${FILES[$JOB]}"
apply_katsect "$FASTQ" "${FASTQ/_R1./_R2.}"
plot_kmer_coverage.R
library(Gviz)
args <- commandArgs(trailingOnly=TRUE)
options(ucscChromosomeNames=FALSE)
plotCoverage <- function(cvg) {
gtrack <- GenomeAxisTrack()
cov_data <- read.table(pipe(paste('tr " " "\n" < ',cvg,' | tr -d ">"',sep="")),header=TRUE)
start <- seq(1,length(cov_data[,1]))
end <- start
bedgraph_data <- data.frame(start,end,cov_data[,1])
coverage_track <- DataTrack(bedgraph_data,type="histogram",name=paste("Coverage (Median:",median(cov_data[,1]),")"),genome="REFERENCE",chromosome="REFERENCE")
png(paste(cvg,".png",sep=""),width=800,height=400)
plotTracks(list(gtrack,coverage_track),from=0,to=100000)
dev.off()
}
lapply(args,plotCoverage)
KAT: Reads to Assembly k-mer comparison
Notes: *
Command:
#!/usr/bin/env bash
module load bioinfo-tools KAT
CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID
SAMPLE_PREFIX=SampleA_trimmed_no_human_normalised
DATA_DIR=/path/to/reads
FASTA_DIR=/path/to/assemblies
FILES=( $FASTA_DIR/*.fasta )
apply_katcomp () {
ASSEMBLY="$1" # The assembly is the first parameter to this function
READ1="$2" # The first read pair is the second parameter to this function
READ2="$3" # The second read pair is the third parameter to this function
PREFIX=$( basename "${ASSEMBLY}" .fasta)
TMP_FASTQ=$(mktemp -u --suffix ".fastq")
mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" & # Make a named pipe and combine reads
sleep 5 # Give a little time for the pipe to be made
kat comp -H 800000000 -t "$CPUS" -o "${PREFIX}_vs_reads.cmp" "${TMP_FASTQ}" "$ASSEMBLY" # Compare Reads to Assembly
rm "${TMP_FASTQ}"
}
FASTA="${FILES[$JOB]}"
apply_katcomp "$FASTA" "$DATA_DIR/${SAMPLE_PREFIX}_R"{1,2}.fastq.gz
KAT: Reads vs Assembly - k-mer coverage vs GC comparison blobtools style.
Notes: *
Command:
#!/usr/bin/env bash
module load bioinfo-tools KAT
CPUS="${SLURM_NPROCS:-8}"
JOB=$SLURM_ARRAY_TASK_ID
SAMPLE_PREFIX=SampleA_trimmed_no_human_normalised
DATA_DIR=/path/to/reads
FASTA_DIR=/path/to/assemblies
FILES=( $FASTA_DIR/*.fasta )
apply_katcold () {
ASSEMBLY="$1" # The assembly is the first parameter to this function
READ1="$2" # The first read pair is the second parameter to this function
READ2="$3" # The second read pair is the third parameter to this function
PREFIX=$( basename "${ASSEMBLY}" .fasta)
TMP_FASTQ=$(mktemp -u --suffix ".fastq")
mkfifo "${TMP_FASTQ}" && zcat "$READ1" "$READ2" > "${TMP_FASTQ}" & # Make a named pipe and combine reads
sleep 5 # Give a little time for the pipe to be made
kat cold -H 8000000000 -t "$CPUS" -o "${PREFIX}_vs_reads.cold" "$ASSEMBLY" "${TMP_FASTQ}" # Compare Reads to Assembly
rm "${TMP_FASTQ}"
}
FASTA="${FILES[$JOB]}"
apply_katcold "$FASTA" "$DATA_DIR/${SAMPLE_PREFIX}_R"{1,2}.fastq.gz