Intro to Biostring - igheyas/Bioinformatics GitHub Wiki

Next steps Go ahead and run your toy‐FASTA code now that the package is loaded. For example, in your R script:

library(Biostrings)

# 1. Build a DNAStringSet
seqs_set <- DNAStringSet(c("ATGCGTACGTTAG",
                           "GGGAAACCCGGGTTT",
                           "TTATTAGCCG"))
names(seqs_set) <- c("seq1","seq2","seq3")

# 2. Write it to your target directory
out_path <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio/toy.fasta"
writeXStringSet(seqs_set, out_path)

# 3. Confirm it wrote correctly
cat(readLines(out_path), sep = "\n")

# 1. Load Biostrings
library(Biostrings)

# 2. Read your FASTA file
fasta_path <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio/toy.fasta"
seqs <- readDNAStringSet(fasta_path)

# 3. Compute GC counts and sequence lengths
#    letterFrequency(seqs, letters, as.prob=FALSE) gives raw counts
gc_counts    <- letterFrequency(seqs, letters = c("G","C"), as.prob = FALSE)
total_bases  <- width(seqs)   # vector of sequence lengths

# 4. Compute GC percentage
gc_percent <- rowSums(gc_counts) / total_bases * 100

# 5. Build a nice summary table
df_gc <- data.frame(
  Name       = names(seqs),
  Length     = total_bases,
  G_count    = gc_counts[, "G"],
  C_count    = gc_counts[, "C"],
  GC_count   = rowSums(gc_counts),
  GC_percent = round(gc_percent, 2)
)

print(df_gc)

Output:

# Load Biostrings
library(Biostrings)

# 1) Read in your toy FASTA
fasta_path <- "C:/Users/IAGhe/OneDrive/Documents/Learning/bio/toy.fasta"
seqs       <- readDNAStringSet(fasta_path)

# 2) Precompute each sequence’s length
seq_lens <- width(seqs)

# 3) Sliding‐window parameters
window <- 5   # window size in bp
step   <- 2   # step size in bp

# 4) Compute GC profiles
gc_profiles <- lapply(seq_along(seqs), function(i) {
  seq_name <- names(seqs)[i]
  seq_i    <- seqs[i](/igheyas/Bioinformatics/wiki/i)           # a DNAString
  len_i    <- seq_lens[i]         # its length
  
  # valid window start positions
  starts <- seq(1, len_i - window + 1, by = step)
  
  # extract each window as a Views object
  vs <- Views(seq_i, start = starts, width = window)
  
  # count G and C in each window
  gc_cnts   <- letterFrequency(vs, letters = c("G","C"), as.prob = FALSE)
  # count A, T, G, C to detect ambiguous bases
  atgc_cnts <- letterFrequency(vs, letters = c("A","T","G","C"), as.prob = FALSE)
  nonambig  <- rowSums(atgc_cnts)         # number of A/T/G/C in each window
  
  # assemble per-window stats
  data.frame(
    Sequence   = seq_name,
    Start      = starts,
    End        = starts + window - 1,
    G_count    = gc_cnts[ , "G"],
    C_count    = gc_cnts[ , "C"],
    GC_count   = rowSums(gc_cnts),
    GC_percent = round(rowSums(gc_cnts) / nonambig * 100, 2),
    ambiguous  = window - nonambig,
    stringsAsFactors = FALSE
  )
})

# 5) Combine and display
df_gc_profile <- do.call(rbind, gc_profiles)
print(df_gc_profile)