Blast - ACHG2018/metagenomics-classification-tools GitHub Wiki
The scripts for blast:
Installation
sudo apt install ncbi-blast+
sudo apt-get install seqtk
sudo apt-get install fastx-toolkit
Concatenate all reference files into a single file and build a BLAST database from it
cd <reference_folder>
cat *.fasta > precisionFDA.fasta
makeblastdb -in precisionFDA.fasta -dbtype nucl
Blast the challenge data against the reference
cd <Challenge_data_folder>
cp <reference_folder>/precisionFDA* ./
alternatively: mv <reference_folder>/precisionFDA* ./
ls *R{1,2}.fastq | xargs -n1 sh -c 'fastx_clipper -Q33 -l 1 -i $0 -o ./tmp_folder/$0.filtered'
ls ./tmp_folder/*.filtered | xargs -n1 sh -c 'seqtk seq -a $0 > $0.fa'
ls ./tmp_folder/*.fa | xargs -n1 sh -c 'time blastn -db precisionFDA.fasta -query $0 -max_target_seqs 1 -max_hsps 1 -outfmt 6 -out $0.output'
rename 's/.fastq.filtered//' ./tmp_folder/*.output
Checking the results
ls *.output | xargs -n1 sh -c 'grep "CR_[0-9]*" $0 | cut -f 2 | cut -d \| -f 3 | sort | uniq -c | sort -n -r > ./result/$0.result'
cut -f 2,3 refGenomeTaxMapping.tsv | cut -d ' ' -f 1,2 > ./result/taxid_sp_ref
./counter.pl <result_file> |sort -n -r
ls *.result | xargs -n1 sh -c './counter.pl $0 |sort -n -r >$0.stat'
rename fq.filtered.fa.output.result.stat stat *.stat
For merging purposes:
ls *.output | xargs -n1 sh -c 'cut -f 1,2 $0 | sed "s/\//|/g" | cut -d "|" -f 1,4 | sed "s/|/\t/" > ./result_merge_purpose/$0.merge'
ls *.output | xargs -n1 sh -c 'cut -f 1,2 $0 | sed "s/|/\t/g" | cut -f 1,4 > ./result_merge_purpose/$0.merge'
(for C10 to C21)
To generate the specific format required for submission:
ls *.output | xargs -n1 sh -c 'cut -f 2 $0|cut -d "_" -f 1,2| sort | uniq -c | sort -n -r> tmp3/$0.abundance'
ls *.abundance | xargs -n1 sh -c './counter3.pl $0 | sort -k1,1V > abundance/$0.count'