Intro to Biostring - igheyas/Bioinformatics GitHub Wiki
Next steps Go ahead and run your toy‐FASTA code now that the package is loaded. For example, in your R script:
library(Biostrings)
# 1. Build a DNAStringSet
seqs_set <- DNAStringSet(c("ATGCGTACGTTAG",
"GGGAAACCCGGGTTT",
"TTATTAGCCG"))
names(seqs_set) <- c("seq1","seq2","seq3")
# 2. Write it to your target directory
out_path <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio/toy.fasta"
writeXStringSet(seqs_set, out_path)
# 3. Confirm it wrote correctly
cat(readLines(out_path), sep = "\n")
# 1. Load Biostrings
library(Biostrings)
# 2. Read your FASTA file
fasta_path <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio/toy.fasta"
seqs <- readDNAStringSet(fasta_path)
# 3. Compute GC counts and sequence lengths
# letterFrequency(seqs, letters, as.prob=FALSE) gives raw counts
gc_counts <- letterFrequency(seqs, letters = c("G","C"), as.prob = FALSE)
total_bases <- width(seqs) # vector of sequence lengths
# 4. Compute GC percentage
gc_percent <- rowSums(gc_counts) / total_bases * 100
# 5. Build a nice summary table
df_gc <- data.frame(
Name = names(seqs),
Length = total_bases,
G_count = gc_counts[, "G"],
C_count = gc_counts[, "C"],
GC_count = rowSums(gc_counts),
GC_percent = round(gc_percent, 2)
)
print(df_gc)
Output:
# Load Biostrings
library(Biostrings)
# 1) Read in your toy FASTA
fasta_path <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio/toy.fasta"
seqs <- readDNAStringSet(fasta_path)
# 2) Precompute each sequence’s length
seq_lens <- width(seqs)
# 3) Sliding‐window parameters
window <- 5 # window size in bp
step <- 2 # step size in bp
# 4) Compute GC profiles
gc_profiles <- lapply(seq_along(seqs), function(i) {
seq_name <- names(seqs)[i]
seq_i <- seqs[i](/igheyas/Bioinformatics/wiki/i) # a DNAString
len_i <- seq_lens[i] # its length
# valid window start positions
starts <- seq(1, len_i - window + 1, by = step)
# extract each window as a Views object
vs <- Views(seq_i, start = starts, width = window)
# count G and C in each window
gc_cnts <- letterFrequency(vs, letters = c("G","C"), as.prob = FALSE)
# count A, T, G, C to detect ambiguous bases
atgc_cnts <- letterFrequency(vs, letters = c("A","T","G","C"), as.prob = FALSE)
nonambig <- rowSums(atgc_cnts) # number of A/T/G/C in each window
# assemble per-window stats
data.frame(
Sequence = seq_name,
Start = starts,
End = starts + window - 1,
G_count = gc_cnts[ , "G"],
C_count = gc_cnts[ , "C"],
GC_count = rowSums(gc_cnts),
GC_percent = round(rowSums(gc_cnts) / nonambig * 100, 2),
ambiguous = window - nonambig,
stringsAsFactors = FALSE
)
})
# 5) Combine and display
df_gc_profile <- do.call(rbind, gc_profiles)
print(df_gc_profile)