Figures prep - eolesin/AMOR_Indiv_Assembly_Protocol GitHub Wiki
The CAT taxonomy table must contain only contigs that received a taxonomic assignment.
# Remove the header line from each of the individual sample files (Bash).
# Sample IDs are read from AMOR_2020_Good, separated by whitespace/newlines.
# `read -a` splits on whitespace like an unquoted `cat` expansion would, but
# never glob-expands an ID, and every path is quoted.
while read -r -a sample_ids || (( ${#sample_ids[@]} > 0 )); do
  for sample in "${sample_ids[@]}"; do
    src="${sample}_CAT_.contig2classification.fullnames.txt"
    if [[ ! -r "$src" ]]; then
      # Skip rather than leave behind an empty *new.txt for a missing sample.
      printf 'warning: %s not found, skipping\n' "$src" >&2
      continue
    fi
    # tail -n +2 prints from the second line onward, i.e. drops the header.
    tail -n +2 -- "$src" > "${sample}_CAT_.contig2classification.fullnamesnew.txt"
  done
done < AMOR_2020_Good
# Concatenate the header-less CAT result files of all samples into one table.
# The output is named ALL_CAT_contigfullnames.txt because that is the file the
# filtering step that follows reads.
cat -- *_CAT_.contig2classification.fullnamesnew.txt > ALL_CAT_contigfullnames.txt
# Keep only contigs with a positive taxonomy assignment: every row whose second
# tab-separated column reads "no taxid assigned" is dropped. No unknowns allowed.
awk 'BEGIN { FS = "\t" } $2 != "no taxid assigned"' \
  ALL_CAT_contigfullnames.txt > ALL_CAT_contigfullnames_forR.txt
# Tally how many rows have each tab-separated field count and print one
# "<field count> <number of rows>" pair per distinct count.
# The field count should be 12; anything else is a failed database hit.
awk 'BEGIN { FS = "\t" }
     { rows_with[NF]++ }
     END { for (width in rows_with) print width, rows_with[width] }' \
  ALL_CAT_contigfullnames_forR.txt
# Clean up the taxonomy by removing the support scores.
# In each cleaned column everything from the first colon onward is deleted,
# e.g. "Bacteria: 1.00" becomes just "Bacteria".
# Columns 6-12 are cleaned in a single pass straight from the filtered table,
# so no numbered intermediate files are needed; the result is still 7.txt.
# NOTE(review): assumes columns 6-12 are the rank columns of the 12-column
# CAT table -- confirm against the CAT output in use.
awk 'BEGIN { FS = OFS = "\t" }
     { for (col = 6; col <= 12; col++) sub(/:.*/, "", $col) }
     1' ALL_CAT_contigfullnames_forR.txt > 7.txt