Sequence capture data processing assembly with secapr - barrettlab/2021-Genomics-bootcamp GitHub Wiki

# make a practice dir
cd /data/samski/2021_02_12_Sam_Zinniinae_capture/secapr_practice$
mkdir reads

# go up one, then make executable
cd ..
sudo chmod 777 -R secapr_practice


# Get some read pairs
sudo cp reads/Heliopsis_a* secapr_practice/reads
sudo cp reads/Heliopsis_b* secapr_practice/reads
sudo cp reads/Heliopsis_c* secapr_practice/reads
sudo cp reads/Heliopsis_p* secapr_practice/reads
sudo cp reads/Heliopsis_r* secapr_practice/reads

# unzip files

cd reads
pigz -d *.fastq.gz

# rename files to remove trailing '001'
rename 's/_001.fastq/.fastq/' *.fastq

# make a config file called 'adapter_info.txt'

[adapters]
i7:CAAGCAGAAGACGGCATACGAGAT*GTGACTGGAGTTCAG
i5:AATGATACGGCGACCACCGAGATCTACAC*ACACTCTTTCCCTAC

[names]
Heliopsis_annua_S36_L001:_
Heliopsis_buphthalmoides_S38_L001:_
Heliopsis_canescens_S39_L001:_
Heliopsis_purpurea_S50_L001:_
Heliopsis_rubra_S51_L001:_

[barcodes]
i7-Heliopsis_annua_S36_L001:TCTGAGAG
i5-Heliopsis_annua_S36_L001:CTCGTTCT
i7-Heliopsis_buphthalmoides_S38_L001:ACCGCATA
i5-Heliopsis_buphthalmoides_S38_L001:CTCGTTCT
i7-Heliopsis_canescens_S39_L001:TCTCTAGG
i5-Heliopsis_canescens_S39_L001:AAGACGAG
i7-Heliopsis_purpurea_S50_L001:TCTCTAGG
i5-Heliopsis_purpurea_S50_L001:TGTGGCTT
i7-Heliopsis_rubra_S51_L001:CCAAGCAA
i5-Heliopsis_rubra_S51_L001:TGTGGCTT

# activate the secapr conda environment, your promps should change to (secapr_env). To exit this, type 'conda deactivate'

conda activate /home/bsinn/anaconda3/envs/secapr_env

# 1. check quality of raw reads
secapr quality_check --input reads --output raw_out

# 2. trim the reads
secapr clean_reads --input reads --config adapter_info.txt --output trimmed_reads --index double  --read_min 10000 &

# 3. Check quality of trimmed reads
secapr quality_check --input trimmed_reads --output clean_out

# 4. Assemble reads into contigs, this may take a while!
secapr assemble_reads --input trimmed_reads --output contigs --assembler abyss --kmer 35 --cores 28 --max_memory 200 &

# 5. Extract target contigs. This may take a while! Not sure, but may want to try '--cores 28' to speed up

secapr find_target_contigs --contigs contigs --reference compositae_probes.fasta --output target_contigs &

# 6. Check the results
secapr plot_sequence_yield --extracted_contigs target_contigs --output plots

# 7. Align loci
secapr align_sequences --sequences target_contigs/extracted_target_contigs_all_samples.fasta --outdir alignments/contig_alignments/ --aligner mafft --no_trim --cores 28 &

# 8. Check alignments
secapr plot_sequence_yield --extracted_contigs target_contigs --alignments alignments/contig_alignments/ --output plots

# 9. There is also a helpful little function that adds dummy sequences to your alignments that are missing one or several of your samples. This helps preparing your data for some phylogenetic methods that require each alignment containing the same samples, such as e.g. BEAST. This function will add strings of NNNNNN as the sequence for these samples.

secapr add_missing_sequences --input alignments/contig_alignments/ --output alignments/contig_alignments_complete