5. Homology based scaffolding workflow - USDA-ARS-GBRU/Pepper_TrioBinning GitHub Wiki

Order chromosomes and scaffolds. Then give them pretty names.

#!/bin/bash
#SBATCH --job-name="reorder-rename_YakHifi_HDA149"
#SBATCH --output="%x_%j.o" # job standard output file (%j replaced by job id)
#SBATCH --error="%x_%j.e" # job standard error file (%j replaced by job id)

# Abort at the first failing command or failing pipeline stage. Without this a
# failed samtools call is ignored and the later steps run on missing or empty
# files, then the cleanup at the end deletes the intermediate evidence.
set -eo pipefail

module load samtools

# Treat unset variables as errors from here on.
# NOTE(review): enabled only after `module load` because some environment
# module implementations are reported to reference unset variables; move it
# up next to `set -eo pipefail` if that is not the case on this cluster.
set -u

# Sample label; prefix of every intermediate and output file written below.
samp='YakHifi_HDA149_B5'
# Input assembly (FASTA) whose sequences are reordered and renamed.
in='../STEP_5/YakHifi_HDA149_B5/ragtag.scaffold.fasta'

################################################################
# Reorder
################################################################

# Index the input assembly. This writes ${in}.fai, whose first two
# tab-separated columns are sequence name and sequence length.
samtools faidx "${in}"

# Ordering table: one .fai row per sequence, longest first.
#
# MANUAL STEP: the first 12 rows of this table must be the chromosomes in
# order 1-12 (the Rename section names sequences by row number), which
# requires editing the table by hand. So that such an edit is not overwritten
# when the script is (re)run, a non-empty table that already exists is reused
# as-is; it is only generated when missing.
order_table="${samp}_temp-1.txt"
if [[ -s "${order_table}" ]]; then
  echo "Reusing existing ${order_table}; delete it to regenerate the length-sorted order." >&2
else
  sort -t$'\t' -k2,2nr "${in}.fai" > "${order_table}"
  echo "Wrote ${order_table} sorted by length only; chromosome order has NOT been curated by hand." >&2
fi

# Keep just column 1 (sequence names); this list drives the reordering.
awk '{print $1}' "${order_table}" > "${samp}_temp-2.txt"

# Extract the sequences in the listed order. The names are passed as a region
# file rather than as command-line arguments, so a large number of scaffolds
# cannot hit the argument-length limit and names are never word-split or
# glob-expanded by the shell.
samtools faidx "${in}" --region-file "${samp}_temp-2.txt" > "${samp}_reordered.fasta"

# Index the reordered assembly; inspect ${samp}_reordered.fasta.fai to confirm
# that the sequence order and lengths are as intended.
samtools faidx "${samp}_reordered.fasta"

###################################################################
# Rename
##################################################################

# Number of leading rows in ${samp}_temp-2.txt that are chromosomes; every
# later row is treated as an unplaced scaffold. Defaults to 12 and can be
# overridden by setting n_chr in the environment.
n_chr="${n_chr:-12}"

# The ordered name list must exist and be non-empty: with an empty name list
# the two-file awk below would mistake the FASTA for the name list and write
# an empty final assembly.
if [[ ! -s "${samp}_temp-2.txt" ]]; then
  echo "ERROR: ${samp}_temp-2.txt is missing or empty; run the Reorder section first." >&2
  exit 1
fi

# Build the new names, one per sequence, in the same order as the list:
#   rows 1..n_chr   -> chr1, chr2, ...        (order was curated manually)
#   remaining rows  -> scaffold1, scaffold2, ... (ordered by length)
awk -v n="${n_chr}" '
  NR <= n { print "chr" NR; next }
          { print "scaffold" (NR - n) }
' "${samp}_temp-2.txt" > "${samp}_renames.txt"

# Replace the header lines of the reordered assembly, in order, with the names
# listed in ${samp}_renames.txt. First file: load the names. Second file: each
# line starting with ">" takes the next unused name; all other lines (and any
# headers beyond the last name) pass through unchanged.
awk '
  NR == FNR                 { names[total++] = $0; next }
  /^>/ && used < total      { $0 = ">" names[used++] }
                            { print }
' "${samp}_renames.txt" "${samp}_reordered.fasta" > "${samp}_FINAL.fasta"

# Index the final assembly (writes ${samp}_FINAL.fasta.fai).
samtools faidx "${samp}_FINAL.fasta"

################
# Clean up
###############

# Remove the intermediate name/order lists. -f: do not fail if one is already
# gone. ${samp}_renames.txt and ${samp}_reordered.fasta are kept.
rm -f -- "${samp}_temp-1.txt" "${samp}_temp-2.txt"