Multiple sequence alignment - igheyas/Bioinformatics GitHub Wiki

# Multiple sequence alignment of a DNA FASTA file: read the sequences, align
# them with the `msa` package, inspect the result, and write the gapped
# sequences back out as FASTA.

# 0) Install packages if needed ----------------------------------------------
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
bioc_pkgs    <- c("Biostrings", "msa")
is_installed <- vapply(bioc_pkgs, requireNamespace, logical(1), quietly = TRUE)
if (!all(is_installed)) {
  # Install everything that is missing in a single call.
  BiocManager::install(bioc_pkgs[!is_installed])
}

# 1) Load libraries ----------------------------------------------------------
library(Biostrings)
library(msa)

# 2) Read the original FASTA -------------------------------------------------
#    Input and output live in the same folder; edit `data_dir` to point at
#    your own data.
data_dir   <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio"
fasta_path <- file.path(data_dir, "toy.fasta")

# Fail early with a readable message instead of an error from deep inside
# the reader or the aligner.
if (!file.exists(fasta_path)) {
  stop("Input FASTA not found: ", fasta_path, call. = FALSE)
}
seqs <- readDNAStringSet(fasta_path)
if (length(seqs) < 2L) {
  stop(
    "Need at least two sequences to align; found ", length(seqs),
    " in ", fasta_path,
    call. = FALSE
  )
}

# 3) Perform multiple sequence alignment -------------------------------------
#    method = "Muscle" (you can also choose "ClustalW" or "ClustalOmega").
#    order = "input" requests the aligned rows in the same order as `seqs`.
alignment <- msa(seqs, method = "Muscle", order = "input")

# 4) Inspect the result ------------------------------------------------------
#    Prints a summary and the aligned blocks.
print(alignment)

#    If you want a simple character matrix of the aligned sequences:
aligned_mat <- as.matrix(alignment)
print(aligned_mat)

# 5) Write the aligned sequences back to FASTA -------------------------------
aligned_fasta_path <- file.path(data_dir, "toy_aligned.fasta")
#    Coerce the alignment to a DNAStringSet (with gaps) and write it.
writeXStringSet(as(alignment, "DNAStringSet"), aligned_fasta_path)

# sep = "" keeps cat() from padding the path with a leading and trailing space.
cat("Aligned FASTA written to:\n", aligned_fasta_path, "\n", sep = "")

Example sequences in FASTA format (unaligned: the lengths differ and there are no gap characters):

>seq1
ATGCGTACGTTAG
>seq2
GGGAAACCCGGGTTT
>seq3
TTATTAGCCG