DNA_RNA_Protein - igheyas/Bioinformatics GitHub Wiki

from Bio.Seq import Seq
import random
from collections import Counter
import pandas as pd

# Generate a random 300-nt DNA sequence (uniform draw over the four bases)
bases = ["A", "C", "G", "T"]
dna = Seq("".join(random.choices(bases, k=300)))

# Count each base; a Counter reports 0 for a base that never occurs
counts = Counter(dna)
total = sum(counts.values())

# Build a DataFrame of counts and percentages.
# Iterating over `bases` (rather than `counts`) keeps the rows in a fixed
# A, C, G, T order on every run and still lists a base that was never drawn.
comp = {
    base: {"Count": counts[base], "Percentage": counts[base] / total * 100}
    for base in bases
}
df_comp = pd.DataFrame.from_dict(comp, orient="index")
df_comp  # bare expression: displays the table as the last line of a notebook cell

# Recount the bases of the current `dna`
counts = Counter(dna)
total = sum(counts.values())

# Rebuild `comp` from the fresh counts so the table reflects this recount
# instead of whatever `comp` held from an earlier cell. Sorting the keys
# gives a stable alphabetical row order.
comp = {
    base: {"Count": counts[base], "Percentage": counts[base] / total * 100}
    for base in sorted(counts)
}
df_comp = pd.DataFrame.from_dict(comp, orient="index")
df_comp

Simple Python example

from collections import Counter

def gc_content(seq: str) -> float:
    """Compute the GC percentage of a DNA sequence, ignoring letter case.

    Only the four bases A, T, G and C enter the denominator, so any other
    symbol (for example ``N``) does not influence the result.

    Returns:
        The G+C share in percent, or ``0.0`` when the sequence contains no
        A/T/G/C characters at all.
    """
    normalized = seq.upper()
    gc = normalized.count("G") + normalized.count("C")
    at = normalized.count("A") + normalized.count("T")
    acgt = at + gc
    if not acgt:
        # Nothing countable: avoid dividing by zero
        return 0.0
    return gc / acgt * 100

# Example: a 17-nt toy sequence in which 9 bases are G or C
sequence = "ATGCGCGATTACCGGTT"
print(f"GC% = {gc_content(sequence):.1f}%")  # percentage shown with one decimal place

Output:

GC% = 52.9%

def sliding_gc(seq: str, window: int = 100, step: int = 10):
    """Yield ``(start, GC%)`` for each full window along *seq*.

    Windows start at 0, ``step``, ``2 * step``, ... and are each exactly
    ``window`` characters long. A trailing stretch shorter than ``window`` is
    not reported, so a sequence shorter than ``window`` yields nothing.

    Args:
        seq: Sequence to scan (case-insensitive).
        window: Window length in characters; must be at least 1.
        step: Distance between consecutive window starts; must be at least 1.

    Yields:
        Tuples of the 0-based window start and that window's GC percentage
        as computed by ``gc_content``.

    Raises:
        ValueError: If ``window`` or ``step`` is smaller than 1. Because this
            is a generator, the error surfaces on the first iteration.
    """
    # Reject sizes that would produce empty/negative slices or no progress
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")
    seq = seq.upper()
    # The "+ 1" makes the last start the one whose window ends at len(seq)
    for i in range(0, len(seq) - window + 1, step):
        yield i, gc_content(seq[i:i + window])

# Example usage: 5-nt windows advancing 2 nt at a time
win_size, win_step = 5, 2
for pos, gc in sliding_gc(sequence, window=win_size, step=win_step):
    # Label each window by its start-end span; the end is derived from
    # win_size so the label cannot drift from the window actually scanned
    print(f"{pos}-{pos + win_size}: {gc:.1f}%")

Gene Vs Genome

Codon

from Bio.Seq import Seq

# A sample mRNA (38 nt: twelve complete codons followed by a trailing "CG")
mrna = Seq("AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCG")

# Translate until the first stop codon. This sequence has no in-frame stop,
# so all twelve complete codons are translated.
# NOTE(review): the length is not a multiple of three, so Biopython is
# expected to warn about the partial codon; confirm with the installed version.
protein = mrna.translate(to_stop=True)
print(protein)  # MAMAPRTEINST

from Bio import SeqIO
from Bio.Seq import Seq

# 1. Transcription (DNA → RNA)
dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
mrna = dna.transcribe()      # replaces T → U
print("mRNA:", mrna)
# Output: mRNA: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG

# 2. Translation (RNA → Protein)
# to_stop=True ends translation at the first in-frame stop codon (UGA, the
# eighth codon here) and leaves the stop symbol "*" out of the result.
protein = mrna.translate(to_stop=True)
print("Protein:", protein)
# Output: Protein: MAIVMGR

Amino Acid Vs Protein

from Bio.Seq import Seq

# Example mRNA → protein translation
mrna = Seq("AUGGCCUUUGCU")     # AUG GCC UUU GCU
protein = mrna.translate()     # uses standard genetic code
print(protein)  # "MAFA" (Met-Ala-Phe-Ala)

Motif

def find_motif(seq: str, motif: str) -> list:
    """Return the 0-based start index of every exact match of *motif* in *seq*.

    Overlapping occurrences are all reported. The comparison is
    case-sensitive and uses no wildcards.
    """
    last_start = len(seq) - len(motif)
    return [i for i in range(last_start + 1) if seq.startswith(motif, i)]


seq_dna = "ACGT…"  # your 300‐nt DNA string with TATAAT at pos 50–55
motif = "TATAAT"

# Find all exact matches (0‐based indexing)
positions = find_motif(seq_dna, motif)
print("Found motif at DNA positions:", positions)

# For the RNA, replace T → U in BOTH the sequence and the motif: the RNA
# string contains no "T", so searching it for the DNA motif can never match.
seq_rna = seq_dna.replace("T", "U")
motif_rna = motif.replace("T", "U")
positions_rna = find_motif(seq_rna, motif_rna)
print("Found motif at RNA positions:", positions_rna)

DNA_RNA_Protein - igheyas/Bioinformatics GitHub Wiki

Gene Vs Genome

Codon

Amino Acid Vs Protein

Motif

bp vs aa