AB TrimAl - mendessoares/BuddySuite GitHub Wiki
--trimal, -trm
Description
This essentially reimplements portions of the excellent software trimAl, allowing the removal of poorly aligned regions to increase overall alignment quality.
Note: If working from annotated GenBank files, any feature spanning removed columns will be truncated as appropriate, and will be deleted completely if all of its columns are removed.
Arguments
Mode ( { float, int, mode_str } )
The trimal function contains four distinct modes: 'clean', 'all' (or 'no_gaps'), 'gappyout', and a threshold mode controlled by supplying a number.
clean: Removes any columns that are 100% gaps (example 1).
all: Removes every column that contains at least one gap (example 2).
gappyout: A more involved algorithm that takes into account the distribution of alignment gaps to automatically select the most appropriate threshold (example 3).
Threshold ( int ): Specifically set the maximum number of gaps a column may contain without being removed (example 4).
Threshold ( float ): Use a fraction less than 1 (0.05, 0.65, 0.99, etc.) to set the maximum proportion of gaps a column may contain without being removed (example 5).
Examples
Input file: Mnemiopsis.nex
#NEXUS
begin data;
dimensions ntax=7 nchar=216;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 --ML-ILSKF--KGVTPFKGR-QYTGSVIS--GFKKFG--STFAEDY-VRLLHN--PVES-DQELFSIGD--FKTYGSEW----IKKLKL-EDNLATEA-FLRDDSA---IKHMYFQRKL-RGNGKAL----------------------------------------------------------------------------------------
Mle_Panxα7A -MGV-ILFPI--RATAP-KSRQQLGGAYIA--GFSRDEEYERFAEEW-VAMLQN--PVED-GFEKFKFGS--FAEYGLLW----DTRNP--LNNVQSLS-SLDRSPAVGVISKLYLKDESPRAEVLKRRSKKVKVPSPRKPKLLFHEEIKKKLIKRTERKDDNLTKYAQEDVLDSEYVVVEQSVPETMTEQESVEESVPEISKAEQEGGSSDHIDV
Mle_Panxα8 -MVL-V-ALF--PRLAPFKVR-SYTGSVIE--GFLKVP--VEFAKDY-ISMLHN--PVET-AEELFHIGS--FFSYGVDW----AAEEPEPGEDYKTLA-FLRDEPA---YSEIWRVGGE-----------------------------VRCLVERC-------------FSLNRPHI--------------------PHLVK-------------
Mle_Panxα5 -MIYWV-AVF--KRMAPFKVG-IIA-STIK--GFEDSFRSEAFVDEY-INLLHN--PLTRDEEELFKIGN--FVTYGFEW----VGSQVP-NGSTYTLT-FWND-WN---LRHLYWNEYV-QYNGHLKTS--------------------------------------------------------------------------------------
Mle_Panxα4 -MVIEL-AGY--KGLSPFKDR-QYS-TLIA--GFTKFH--PQFAEDY-IKLLCL--ASGSPEQQMFEYGSNTWYRYGADWYGTRFSSYHE-TNNSITLG-MLPDKPS---YRYVFMTTSK-NADVIENLS-----------------------------------------KLDSSVI----------------------------ELGSKDSI--
Mle_Panxα3 MLLLGS-GTI--KNLS-FKDS-QYT-KNIS--GFTKFG--EDFSQDY-LKIMSDYYHCTT-ETPSFQVGD--FKTYGIEW----LRQFPN-PENYSTSGHFSHKHPG---WKFMYYKHLR-IGHVPGE-------------------------------------------YLTDPA---------------------------------------
Mle_Panxα6 -MLLEI-ANF--KGAT-FKER-QYT-GIIA--GLTKFS--AAFAEDYILAMLHN--PVDG-DDVTFESGR--YLTYGSEW----FASLDK-QSNYTSFC-FLKENSK---LKFIYFEEKN-KQHLKGSKD-----------------------------------------ILV------------------------------------------
;
end;
Usage example 1
$: alb Mnemiopsis.nex -trm clean
Output
#NEXUS
begin data;
dimensions ntax=7 nchar=212;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 --ML-ILSKFKGVTPFKGR-QYTGSVISGFKKFG--STFAEDY-VRLLHN--PVES-DQELFSIGD--FKTYGSEW----IKKLKL-EDNLATEA-FLRDDSA---IKHMYFQRKL-RGNGKAL----------------------------------------------------------------------------------------
Mle_Panxα7A -MGV-ILFPIRATAP-KSRQQLGGAYIAGFSRDEEYERFAEEW-VAMLQN--PVED-GFEKFKFGS--FAEYGLLW----DTRNP--LNNVQSLS-SLDRSPAVGVISKLYLKDESPRAEVLKRRSKKVKVPSPRKPKLLFHEEIKKKLIKRTERKDDNLTKYAQEDVLDSEYVVVEQSVPETMTEQESVEESVPEISKAEQEGGSSDHIDV
Mle_Panxα8 -MVL-V-ALFPRLAPFKVR-SYTGSVIEGFLKVP--VEFAKDY-ISMLHN--PVET-AEELFHIGS--FFSYGVDW----AAEEPEPGEDYKTLA-FLRDEPA---YSEIWRVGGE-----------------------------VRCLVERC-------------FSLNRPHI--------------------PHLVK-------------
Mle_Panxα5 -MIYWV-AVFKRMAPFKVG-IIA-STIKGFEDSFRSEAFVDEY-INLLHN--PLTRDEEELFKIGN--FVTYGFEW----VGSQVP-NGSTYTLT-FWND-WN---LRHLYWNEYV-QYNGHLKTS--------------------------------------------------------------------------------------
Mle_Panxα4 -MVIEL-AGYKGLSPFKDR-QYS-TLIAGFTKFH--PQFAEDY-IKLLCL--ASGSPEQQMFEYGSNTWYRYGADWYGTRFSSYHE-TNNSITLG-MLPDKPS---YRYVFMTTSK-NADVIENLS-----------------------------------------KLDSSVI----------------------------ELGSKDSI--
Mle_Panxα3 MLLLGS-GTIKNLS-FKDS-QYT-KNISGFTKFG--EDFSQDY-LKIMSDYYHCTT-ETPSFQVGD--FKTYGIEW----LRQFPN-PENYSTSGHFSHKHPG---WKFMYYKHLR-IGHVPGE-------------------------------------------YLTDPA---------------------------------------
Mle_Panxα6 -MLLEI-ANFKGAT-FKER-QYT-GIIAGLTKFS--AAFAEDYILAMLHN--PVDG-DDVTFESGR--YLTYGSEW----FASLDK-QSNYTSFC-FLKENSK---LKFIYFEEKN-KQHLKGSKD-----------------------------------------ILV------------------------------------------
;
end;
Usage example 2
$: alb Mnemiopsis.nex -trm all
Output
#NEXUS
begin data;
dimensions ntax=7 nchar=89;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 MLISKFKGVTKGRQYTSVISGFKKFGSTFAEDYVRLLHNPVESDQELFSIGDFKTYGSEWIKKLKEDNLATEAFLRDSAIKHMYFQRKL
Mle_Panxα7A GVIFPIRATAKSRQLGAYIAGFSRDEERFAEEWVAMLQNPVEDGFEKFKFGSFAEYGLLWDTRNPLNNVQSLSSLDRPAISKLYLKDES
Mle_Panxα8 VLVALFPRLAKVRSYTSVIEGFLKVPVEFAKDYISMLHNPVETAEELFHIGSFFSYGVDWAAEEPGEDYKTLAFLRDPAYSEIWRVGGE
Mle_Panxα5 IYVAVFKRMAKVGIIASTIKGFEDSFEAFVDEYINLLHNPLTREEELFKIGNFVTYGFEWVGSQVNGSTYTLTFWNDWNLRHLYWNEYV
Mle_Panxα4 VILAGYKGLSKDRQYSTLIAGFTKFHPQFAEDYIKLLCLASGSEQQMFEYGSWYRYGADWFSSYHTNNSITLGMLPDPSYRYVFMTTSK
Mle_Panxα3 LLSGTIKNLSKDSQYTKNISGFTKFGEDFSQDYLKIMSDHCTTETPSFQVGDFKTYGIEWLRQFPPENYSTSGFSHKPGWKFMYYKHLR
Mle_Panxα6 LLIANFKGATKERQYTGIIAGLTKFSAAFAEDYLAMLHNPVDGDDVTFESGRYLTYGSEWFASLDQSNYTSFCFLKESKLKFIYFEEKN
;
end;
Usage example 3
$: alb Mnemiopsis.nex -trm gappyout
Output
#NEXUS
begin data;
dimensions ntax=7 nchar=104;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 -MLISKFKGVTPFKGRQYTSVISGFKKFGSTFAEDYVRLLHNPVESDQELFSIGDFKTYGSEWIKKLKLEDNLATEAFLRDDSAIKHMYFQRKLRGNGKAL---
Mle_Panxα7A MGVIFPIRATAP-KSRQLGAYIAGFSRDEERFAEEWVAMLQNPVEDGFEKFKFGSFAEYGLLWDTRNP-LNNVQSLSSLDRSPAISKLYLKDESRAEVLKRVLD
Mle_Panxα8 MVLVALFPRLAPFKVRSYTSVIEGFLKVPVEFAKDYISMLHNPVETAEELFHIGSFFSYGVDWAAEEPEGEDYKTLAFLRDEPAYSEIWRVGGE-------SLN
Mle_Panxα5 MIYVAVFKRMAPFKVGIIASTIKGFEDSFEAFVDEYINLLHNPLTREEELFKIGNFVTYGFEWVGSQVPNGSTYTLTFWND-WNLRHLYWNEYVQYNGHLK---
Mle_Panxα4 MVILAGYKGLSPFKDRQYSTLIAGFTKFHPQFAEDYIKLLCLASGSEQQMFEYGSWYRYGADWFSSYHETNNSITLGMLPDKPSYRYVFMTTSKNADVIENKLD
Mle_Panxα3 LLLSGTIKNLS-FKDSQYTKNISGFTKFGEDFSQDYLKIMSDHCTTETPSFQVGDFKTYGIEWLRQFPNPENYSTSGFSHKHPGWKFMYYKHLRIGHVPGEYLT
Mle_Panxα6 MLLIANFKGAT-FKERQYTGIIAGLTKFSAAFAEDYLAMLHNPVDGDDVTFESGRYLTYGSEWFASLDKQSNYTSFCFLKENSKLKFIYFEEKNKQHLKGSILV
;
end;
Usage example 4
$: alb Mnemiopsis.nex -trm 3
Output
#NEXUS
begin data;
dimensions ntax=7 nchar=110;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 -ML-ISKFKGVTPFKGRQYTSVISGFKKFGSTFAEDYVRLLHNPVESDQELFSIGDFKTYGSEWIKKLKLEDNLATEAFLRDDSAIKHMYFQRKLRGNGKAL--------
Mle_Panxα7A MGV-IFPIRATAP-KSRQLGAYIAGFSRDEERFAEEWVAMLQNPVEDGFEKFKFGSFAEYGLLWDTRNP-LNNVQSLSSLDRSPAISKLYLKDESRAEVLKRRSVLDSEY
Mle_Panxα8 MVL-VALFPRLAPFKVRSYTSVIEGFLKVPVEFAKDYISMLHNPVETAEELFHIGSFFSYGVDWAAEEPEGEDYKTLAFLRDEPAYSEIWRVGGE---------SLNRPH
Mle_Panxα5 MIYWVAVFKRMAPFKVGIIASTIKGFEDSFEAFVDEYINLLHNPLTREEELFKIGNFVTYGFEWVGSQVPNGSTYTLTFWND-WNLRHLYWNEYVQYNGHLKTS------
Mle_Panxα4 MVIELAGYKGLSPFKDRQYSTLIAGFTKFHPQFAEDYIKLLCLASGSEQQMFEYGSWYRYGADWFSSYHETNNSITLGMLPDKPSYRYVFMTTSKNADVIENLSKLDSSV
Mle_Panxα3 LLLGSGTIKNLS-FKDSQYTKNISGFTKFGEDFSQDYLKIMSDHCTTETPSFQVGDFKTYGIEWLRQFPNPENYSTSGFSHKHPGWKFMYYKHLRIGHVPGE--YLTDPA
Mle_Panxα6 MLLEIANFKGAT-FKERQYTGIIAGLTKFSAAFAEDYLAMLHNPVDGDDVTFESGRYLTYGSEWFASLDKQSNYTSFCFLKENSKLKFIYFEEKNKQHLKGSKDILV---
;
end;
Usage example 5
$: alb Mnemiopsis.nex -trm 0.75
Output
#NEXUS
begin data;
dimensions ntax=7 nchar=138;
format datatype=protein missing=? gap=-;
matrix
Mle_Panxα9 -ML-ILSKFKGVTPFKGRQYTGSVISGFKKFG--STFAEDYVRLLHNPVES-DQELFSIGDFKTYGSEWIKKLKLEDNLATEAFLRDDSAIKHMYFQRKLRGNGKAL-------------------------------
Mle_Panxα7A MGV-ILFPIRATAP-KSRQLGGAYIAGFSRDEEYERFAEEWVAMLQNPVED-GFEKFKFGSFAEYGLLWDTRNP-LNNVQSLSSLDRSPAISKLYLKDESRAEVLKRRSKKKLIKRTDVLDSEYVPEISKEGGSSDHI
Mle_Panxα8 MVL-V-ALFPRLAPFKVRSYTGSVIEGFLKVP--VEFAKDYISMLHNPVET-AEELFHIGSFFSYGVDWAAEEPEGEDYKTLAFLRDEPAYSEIWRVGGE---------VRCLVERCFSLNRPHIPHLVK--------
Mle_Panxα5 MIYWV-AVFKRMAPFKVGIIA-STIKGFEDSFRSEAFVDEYINLLHNPLTRDEEELFKIGNFVTYGFEWVGSQVPNGSTYTLTFWND-WNLRHLYWNEYVQYNGHLKTS-----------------------------
Mle_Panxα4 MVIEL-AGYKGLSPFKDRQYS-TLIAGFTKFH--PQFAEDYIKLLCLASGSPEQQMFEYGSWYRYGADWFSSYHETNNSITLGMLPDKPSYRYVFMTTSKNADVIENLS---------KLDSSVI-----ELGSKDSI
Mle_Panxα3 LLLGS-GTIKNLS-FKDSQYT-KNISGFTKFG--EDFSQDYLKIMSDHCTT-ETPSFQVGDFKTYGIEWLRQFPNPENYSTSGFSHKHPGWKFMYYKHLRIGHVPGE-----------YLTDPA--------------
Mle_Panxα6 MLLEI-ANFKGAT-FKERQYT-GIIAGLTKFS--AAFAEDYLAMLHNPVDG-DDVTFESGRYLTYGSEWFASLDKQSNYTSFCFLKENSKLKFIYFEEKNKQHLKGSKD---------ILV-----------------
;
end;