Kraken - NBISweden/workshop-genome_assembly GitHub Wiki

Kraken: Taxonomic classification of sequences

Kraken2: Screening paired-end reads

Notes:

  • Dependencies: Kraken2, Krona

Command:

#!/usr/bin/env bash
# Screen one paired-end read set per Slurm array task with Kraken2.
# Required environment: SLURM_ARRAY_TASK_ID (index into FILES),
#                       SNIC_TMP (directory that receives the database copy).
# Optional environment: SLURM_NPROCS (thread count, default 8),
#                       KRAKEN_DB (directory the database is copied from).

module load bioinfo-tools Kraken2 Krona

CPUS="${SLURM_NPROCS:-8}"
# Fail early with a clear message instead of indexing FILES with an empty subscript later.
JOB="${SLURM_ARRAY_TASK_ID:?must be set; submit this script as a Slurm array job}"

DATA_DIR=/path/to/reads
# The directory is quoted so a path containing spaces still globs correctly.
FILES=( "$DATA_DIR"/*_R1.fastq.gz )

# Copy the database under SNIC_TMP; an empty SNIC_TMP would otherwise target /kraken_db.
# NOTE(review): the fallback source is the Kraken (v1) data path - confirm it holds a
# Kraken2-format database before relying on the default.
KRAKEN2DB="${SNIC_TMP:?must be set to a writable scratch directory}/kraken_db"
mkdir -p "$KRAKEN2DB"
rsync -a "${KRAKEN_DB:-/sw/data/uppnex/Kraken/latest}"/* "$KRAKEN2DB" || {
   >&2 echo "Copying the Kraken2 database to $KRAKEN2DB failed"
   exit 1
}

# Classify one gzip-compressed paired-end read set with Kraken2 and draw a Krona chart.
# Globals:   CPUS (read), KRAKEN2DB (read)
# Arguments: $1 - read 1 of the pair (fastq.gz)
#            $2 - read 2 of the pair (fastq.gz)
# Outputs:   <prefix>_kraken.rpt, <prefix>_kraken.tsv and <prefix>_kraken_krona.html
#            in the current directory, where <prefix> is the basename of $1 with
#            everything from the last "_R1" removed.
# Exits with status 2 when both arguments name the same file.
apply_krakenrp () {
   local READ1="$1"   # Read 1 of the read pair to be screened
   local READ2="$2"   # Read 2 of the read pair to be screened
   # Identical names mean the caller's _R1 -> _R2 substitution did not match anything.
   if [ "$READ1" == "$READ2" ]; then
      >&2 echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down"
      exit 2
   fi
   local PREFIX
   PREFIX=$( basename "${READ1%_R1*}" )
   echo "Running Kraken2: $READ1 $READ2"
   kraken2 --threads "$CPUS" --db "$KRAKEN2DB" --report "${PREFIX}_kraken.rpt" --gzip-compressed --paired "$READ1" "$READ2" > "${PREFIX}_kraken.tsv"
   # Columns 2 and 3 of the classification table are handed to Krona.
   ktImportTaxonomy <( cut -f2,3 "${PREFIX}_kraken.tsv" ) -o "${PREFIX}_kraken_krona.html"
}

# Select this array task's R1 file. Stop when the index is out of range or the
# glob matched nothing (the array then holds the literal pattern, not a file).
FASTQ="${FILES[$JOB]}"
if [[ ! -f "$FASTQ" ]]; then
   >&2 echo "No R1 read file found for array task '$JOB': '$FASTQ'"
   exit 1
fi
# The R2 mate is derived by replacing the first "_R1." in the path with "_R2.".
apply_krakenrp "$FASTQ" "${FASTQ/_R1./_R2.}"

Kraken2: Screening assemblies

Notes:

  • Dependencies: Kraken2, Krona

Command:

#!/usr/bin/env bash
# Screen one assembly per Slurm array task with Kraken2.
# Required environment: SLURM_ARRAY_TASK_ID (index into FILES),
#                       SNIC_TMP (directory that receives the database copy).
# Optional environment: SLURM_NPROCS (thread count, default 8),
#                       KRAKEN_DB (directory the database is copied from).

module load bioinfo-tools Kraken2 Krona

CPUS="${SLURM_NPROCS:-8}"
# Fail early with a clear message instead of indexing FILES with an empty subscript later.
JOB="${SLURM_ARRAY_TASK_ID:?must be set; submit this script as a Slurm array job}"

FASTA_DIR=/path/to/assemblies
# The directory is quoted so a path containing spaces still globs correctly.
FILES=( "$FASTA_DIR"/*.fasta )

# Copy the database under SNIC_TMP; an empty SNIC_TMP would otherwise target /kraken_db.
# NOTE(review): the fallback source is the Kraken (v1) data path - confirm it holds a
# Kraken2-format database before relying on the default.
KRAKEN2DB="${SNIC_TMP:?must be set to a writable scratch directory}/kraken_db"
mkdir -p "$KRAKEN2DB"
rsync -a "${KRAKEN_DB:-/sw/data/uppnex/Kraken/latest}"/* "$KRAKEN2DB" || {
   >&2 echo "Copying the Kraken2 database to $KRAKEN2DB failed"
   exit 1
}

# Classify one assembly with Kraken2 and draw a Krona chart of the result.
# Globals:   CPUS, KRAKEN2DB (read); ASSEMBLY, PREFIX (written)
# Arguments: $1 - assembly file; its name must end in .fasta
# Outputs:   <prefix>_kraken.rpt, <prefix>_kraken.tsv and <prefix>_kraken_krona.html
#            in the current directory, where <prefix> is the basename of $1
#            without the .fasta suffix.
apply_krakenasm () {
   ASSEMBLY="$1"
   PREFIX=$( basename "$ASSEMBLY" .fasta )
   local stem="${PREFIX}_kraken"   # shared stem of all three output files

   printf 'Running Kraken2: %s\n' "$ASSEMBLY"
   kraken2 \
      --threads "$CPUS" \
      --db "$KRAKEN2DB" \
      --report "${stem}.rpt" \
      "$ASSEMBLY" \
      > "${stem}.tsv"
   # Columns 2 and 3 of the classification table are handed to Krona.
   ktImportTaxonomy <( cut -f2,3 "${stem}.tsv" ) -o "${stem}_krona.html"
}

# Select this array task's assembly. Stop when the index is out of range or the
# glob matched nothing (the array then holds the literal pattern, not a file).
FASTA="${FILES[$JOB]}"
if [[ ! -f "$FASTA" ]]; then
   >&2 echo "No assembly file found for array task '$JOB': '$FASTA'"
   exit 1
fi
apply_krakenasm "$FASTA"

Kraken v1: Screening paired-end reads

Notes:

  • Users will need 175 GB of RAM for full database
  • Dependencies: Kraken, Krona

Command:

#!/usr/bin/env bash
# Screen one paired-end read set per Slurm array task with Kraken (v1).
# Required environment: SLURM_ARRAY_TASK_ID (index into FILES),
#                       SNIC_TMP (directory that receives the database copy).
# Optional environment: SLURM_NPROCS (thread count, default 8),
#                       KRAKEN_DB (directory the database is copied from).

module load bioinfo-tools Kraken Krona

CPUS="${SLURM_NPROCS:-8}"
# Fail early with a clear message instead of indexing FILES with an empty subscript later.
JOB="${SLURM_ARRAY_TASK_ID:?must be set; submit this script as a Slurm array job}"

DATA_DIR=/path/to/reads
# The directory is quoted so a path containing spaces still globs correctly.
FILES=( "$DATA_DIR"/*_R1.fastq.gz )

# Copy the database under SNIC_TMP; an empty SNIC_TMP would otherwise target /kraken_db.
KRAKENDATABASE="${SNIC_TMP:?must be set to a writable scratch directory}/kraken_db"
mkdir -p "$KRAKENDATABASE"
rsync -a "${KRAKEN_DB:-/sw/data/uppnex/Kraken/latest}"/* "$KRAKENDATABASE" || {
	>&2 echo "Copying the Kraken database to $KRAKENDATABASE failed"
	exit 1
}

# Screen one gzip-compressed read pair with Kraken (v1), then build the
# kraken-report summary and a Krona chart from the classification table.
# Globals:   CPUS, KRAKENDATABASE (read); READ1, READ2, PREFIX (written)
# Arguments: $1 - read 1 of the pair (fastq.gz)
#            $2 - read 2 of the pair (fastq.gz)
# Outputs:   <prefix>-kraken_classification.tsv, .rpt and .html in the current
#            directory, where <prefix> is the basename of $1 with everything
#            from the last "_R1" removed.
# Exits with status 2 when both arguments name the same file.
apply_krakenrp () {
	READ1="$1"
	READ2="$2"
	# Identical names mean the caller's _R1 -> _R2 substitution did not match anything.
	[[ "$READ1" != "$READ2" ]] || {
		echo "READ1 and READ2 are the same file. R2 Pattern replacement failed. Please check string substitution pattern lower down" >&2
		exit 2
	}

	PREFIX=$(basename "${READ1%_R1*}")
	local classified="${PREFIX}-kraken_classification"   # shared stem of all outputs

	printf 'Running Kraken: %s %s\n' "$READ1" "$READ2"
	kraken \
		--threads "$CPUS" \
		--db "$KRAKENDATABASE" \
		--fastq-input --gzip-compressed \
		--paired "$READ1" "$READ2" \
		> "${classified}.tsv"
	kraken-report --db "$KRAKENDATABASE" "${classified}.tsv" > "${classified}.rpt"
	# Columns 2 and 3 of the classification table are handed to Krona.
	ktImportTaxonomy <( cut -f2,3 "${classified}.tsv" ) -o "${classified}.html"
}

# Select this array task's R1 file. Stop when the index is out of range or the
# glob matched nothing (the array then holds the literal pattern, not a file).
FASTQ="${FILES[$JOB]}"
if [[ ! -f "$FASTQ" ]]; then
	>&2 echo "No R1 read file found for array task '$JOB': '$FASTQ'"
	exit 1
fi
# The R2 mate is derived by replacing the first "_R1." in the path with "_R2.".
apply_krakenrp "$FASTQ" "${FASTQ/_R1./_R2.}"

Kraken v1: Taxonomic classification of assembled sequences

Notes:

  • Users will need 175 GB of RAM for full database
  • Dependencies: Kraken, Krona

Command:

#!/usr/bin/env bash
# Screen one assembly per Slurm array task with Kraken (v1).
# Required environment: SLURM_ARRAY_TASK_ID (index into FILES),
#                       SNIC_TMP (directory that receives the database copy).
# Optional environment: SLURM_NPROCS (thread count, default 8),
#                       KRAKEN_DB (directory the database is copied from).

module load bioinfo-tools Kraken Krona

CPUS="${SLURM_NPROCS:-8}"
# Fail early with a clear message instead of indexing FILES with an empty subscript later.
JOB="${SLURM_ARRAY_TASK_ID:?must be set; submit this script as a Slurm array job}"

FASTA_DIR=/path/to/assemblies
# The directory is quoted so a path containing spaces still globs correctly.
FILES=( "$FASTA_DIR"/*.fasta )

# Copy the database under SNIC_TMP; an empty SNIC_TMP would otherwise target /kraken_db.
KRAKENDATABASE="${SNIC_TMP:?must be set to a writable scratch directory}/kraken_db"
mkdir -p "$KRAKENDATABASE"
rsync -a "${KRAKEN_DB:-/sw/data/uppnex/Kraken/latest}"/* "$KRAKENDATABASE" || {
	>&2 echo "Copying the Kraken database to $KRAKENDATABASE failed"
	exit 1
}

# Classify one assembly with Kraken (v1), then build the kraken-report summary
# and a Krona chart from the classification table.
# Globals:   CPUS, KRAKENDATABASE (read); ASSEMBLY, PREFIX (written)
# Arguments: $1 - assembly file; a trailing .fasta is stripped to form <prefix>
# Outputs:   <prefix>-kraken_classification.tsv, .rpt and .html in the current
#            directory.
apply_krakenasm () {
	ASSEMBLY="$1"
	PREFIX=$(basename "$ASSEMBLY" .fasta )
	local classified="${PREFIX}-kraken_classification"   # shared stem of all outputs

	printf 'Running Kraken: %s\n' "$ASSEMBLY"
	kraken \
		--threads "$CPUS" \
		--db "$KRAKENDATABASE" \
		--fasta-input "$ASSEMBLY" \
		> "${classified}.tsv"
	kraken-report --db "$KRAKENDATABASE" "${classified}.tsv" > "${classified}.rpt"
	# Columns 2 and 3 of the classification table are handed to Krona.
	ktImportTaxonomy <( cut -f2,3 "${classified}.tsv" ) -o "${classified}.html"
}

# Select this array task's assembly. Stop when the index is out of range or the
# glob matched nothing (the array then holds the literal pattern, not a file).
FASTA="${FILES[$JOB]}"
if [[ ! -f "$FASTA" ]]; then
	>&2 echo "No assembly file found for array task '$JOB': '$FASTA'"
	exit 1
fi
apply_krakenasm "$FASTA"