Multiple sequence alignment - igheyas/Bioinformatics GitHub Wiki
# 0) Install packages if needed ----------------------------------------------
# BiocManager is the installer used below for the Bioconductor packages, so
# make sure it is available first.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Packages this script loads; each one is installed only when it is missing.
bioc_deps <- c("Biostrings", "msa")
for (dep in bioc_deps) {
  already_present <- requireNamespace(dep, quietly = TRUE)
  if (!already_present) {
    BiocManager::install(dep)
  }
}
# 1) Load libraries ---------------------------------------------------------
library(Biostrings)
library(msa)
# 2) Read the original FASTA -----------------------------------------------
# Location of the unaligned input sequences; edit this to use your own file.
fasta_path <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio/toy.fasta"
# Fail early with a readable message rather than a low-level reader error.
if (!file.exists(fasta_path)) {
  stop("FASTA file not found: ", fasta_path, call. = FALSE)
}
# Each FASTA record becomes one element of the resulting DNAStringSet.
seqs <- readDNAStringSet(fasta_path)
# A multiple alignment needs something to align against, so stop here if the
# file holds fewer than two records.
if (length(seqs) < 2L) {
  stop(
    "Need at least two sequences to align; found ", length(seqs),
    " in ", fasta_path,
    call. = FALSE
  )
}
# 3) Perform multiple sequence alignment ----------------------------------
# The aligner is selected via `method`; "Muscle" is used here, with
# "ClustalW" and "ClustalOmega" as the alternatives. order = "input" asks for
# the rows of the result to follow the order of `seqs`.
alignment <- msa(inputSeqs = seqs, method = "Muscle", order = "input")
# Plain character matrix view of the alignment, kept for later inspection.
aligned_mat <- as.matrix(alignment)
# 4) Inspect the result ----------------------------------------------------
# First the alignment object's own printout ...
print(alignment)
# ... then the simple character matrix of the aligned sequences.
print(aligned_mat)
# 5) Write the aligned sequences back to FASTA -----------------------------
# Derive the output name from the input path ("<input>_aligned.fasta") so the
# two cannot drift apart; for the toy input this resolves to
# ".../bio/toy_aligned.fasta", the same file as before.
aligned_fasta_path <- paste0(
  tools::file_path_sans_ext(fasta_path),
  "_aligned.fasta"
)
# Coerce alignment to a DNAStringSet (with gaps) and write
aligned_seqs <- as(alignment, "DNAStringSet")
writeXStringSet(aligned_seqs, aligned_fasta_path)
# sep = "" stops cat() from padding the path with a leading and trailing space.
cat("Aligned FASTA written to:\n", aligned_fasta_path, "\n", sep = "")
Output:
>seq1
ATGCGTACGTTAG
>seq2
GGGAAACCCGGGTTT
>seq3
TTATTAGCCG