DNA_RNA_Protein - igheyas/Bioinformatics GitHub Wiki
from Bio.Seq import Seq
import random
from collections import Counter
import pandas as pd
# Generate a random 300-nt DNA sequence
bases = ["A", "C", "G", "T"]
dna = Seq("".join(random.choices(bases, k=300)))

# Count each base
counts = Counter(dna)
total = sum(counts.values())

# Build a DataFrame of counts and percentages.  Iterating over `bases`
# (rather than over the Counter) keeps the rows in A, C, G, T order and
# still reports a row (count 0) for a base that is absent from the
# random sequence.
comp = {
    base: {"Count": counts[base], "Percentage": counts[base] / total * 100}
    for base in bases
}
df_comp = pd.DataFrame.from_dict(comp, orient="index")
df_comp  # bare expression: shows the table when run as a notebook cell
Simple Python example
from collections import Counter
def gc_content(seq: str) -> float:
    """Return the GC percentage of a nucleotide sequence (case-insensitive).

    Only the unambiguous bases A, C, G, T and U (RNA) count towards the
    denominator; any other character (N, gaps, whitespace, ...) is ignored.

    Args:
        seq: DNA or RNA sequence as a string.

    Returns:
        GC content as a percentage in the range 0-100, or 0.0 when the
        sequence contains no countable bases.
    """
    counts = Counter(seq.upper())
    # Counter returns 0 for missing keys, so absent bases need no special case.
    gc = counts["G"] + counts["C"]
    total = gc + counts["A"] + counts["T"] + counts["U"]
    if total == 0:
        return 0.0
    return gc / total * 100
# Example: GC content of a short test sequence
sequence = "ATGCGCGATTACCGGTT"
gc_pct = gc_content(sequence)
print(f"GC% = {gc_pct:.1f}%")
Output:
GC% = 52.9%
def sliding_gc(seq: str, window: int = 100, step: int = 10):
    """Yield (start, GC%) for each full window along seq.

    Windows start at 0, step, 2*step, ... and only windows that fit
    entirely inside the sequence are produced, so a sequence shorter than
    `window` yields nothing.

    Args:
        seq: Nucleotide sequence as a string.
        window: Window length in bases; must be positive.
        step: Distance between consecutive window starts; must be positive.

    Yields:
        Tuples of (0-based window start, GC percentage of that window).

    Raises:
        ValueError: If `window` or `step` is not positive.  Because this is
            a generator, the error surfaces when iteration begins.
    """
    if window <= 0 or step <= 0:
        raise ValueError("window and step must be positive integers")
    seq = seq.upper()
    for start in range(0, len(seq) - window + 1, step):
        yield start, gc_content(seq[start:start + window])
# Example usage
window = 5  # single source of truth for both the call and the printed range
for pos, gc in sliding_gc(sequence, window=window, step=2):
    print(f"{pos}-{pos + window}: {gc:.1f}%")
Gene Vs Genome
Codon
from Bio.Seq import Seq

# A sample mRNA (38 nt, so the last two bases do not form a complete codon)
mrna = Seq("AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCG")

# Drop the trailing partial codon: Biopython warns when the length is not a
# multiple of three and asks for the sequence to be trimmed explicitly.
coding = mrna[: len(mrna) - len(mrna) % 3]

# Translate until the first stop codon (this sequence has no in-frame stop,
# so every complete codon is translated)
protein = coding.translate(to_stop=True)
print(protein)  # MAMAPRTEINST
from Bio import SeqIO
from Bio.Seq import Seq

# 1. Transcription (DNA → RNA)
dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
mrna = dna.transcribe()  # replaces T → U
print("mRNA:", mrna)
# Output: mRNA: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG

# 2. Translation (RNA → Protein)
# to_stop=True ends translation at the first in-frame stop codon (UGA here)
# and does not include the "*" stop symbol in the result.
protein = mrna.translate(to_stop=True)
print("Protein:", protein)
# Output: Protein: MAIVMGR
Amino Acid Vs Protein
from Bio.Seq import Seq

# Example mRNA → protein translation
mrna = Seq("AUGGCCUUUGCU")  # AUG GCC UUU GCU
protein = mrna.translate()  # uses standard genetic code
print(protein)  # MAFA  (Met-Ala-Phe-Ala; GCU codes for alanine)
Motif
def find_motif_positions(seq, motif):
    """Return the 0-based start index of every exact occurrence of motif in seq.

    Overlapping occurrences are all reported, in ascending order.  An empty
    list is returned when the motif does not occur (or is longer than seq).
    """
    last_start = len(seq) - len(motif)
    return [i for i in range(last_start + 1) if seq.startswith(motif, i)]


seq_dna = "ACGT…"  # your 300-nt DNA string with TATAAT at pos 50-55
motif = "TATAAT"

# Find all exact matches (0-based indexing)
positions = find_motif_positions(seq_dna, motif)
print("Found motif at DNA positions:", positions)

# For the RNA, replace T → U in BOTH the sequence and the motif: searching
# the RNA for the DNA spelling "TATAAT" can never match, because the
# converted sequence no longer contains any T.
seq_rna = seq_dna.replace("T", "U")
motif_rna = motif.replace("T", "U")
positions_rna = find_motif_positions(seq_rna, motif_rna)
print("Found motif at RNA positions:", positions_rna)