DNA_RNA_Protein - igheyas/Bioinformatics GitHub Wiki

image image image image image image image image image image image image image image image image image image image image image image image image

from Bio.Seq import Seq
import random
from collections import Counter
import pandas as pd

# Build a random 300-nt DNA sequence from the four canonical bases
bases = ["A", "C", "G", "T"]
random_bases = random.choices(bases, k=300)
dna = Seq("".join(random_bases))

# Tally every base and the overall number of bases counted
counts = Counter(dna)
total = sum(counts.values())

# One entry per base: its absolute count and its share of the sequence in percent
comp = {
    nt: {"Count": n, "Percentage": n / total * 100}
    for nt, n in counts.items()
}

# Dict keys (the bases) become the row index; "Count"/"Percentage" become columns
df_comp = pd.DataFrame.from_dict(comp, orient="index")
df_comp

image

# Tally how many times each character occurs in dna, keyed by the character
counts = Counter(dna)
# Sum of all tallies, i.e. the total number of characters counted
total  = sum(counts.values())

image image image

# Turn the nested dict into a table: orient="index" makes each outer key
# (a base) a row and each inner key ("Count", "Percentage") a column
df_comp = pd.DataFrame.from_dict(comp, orient="index")
# Bare expression — presumably meant to display the table as the last line of a notebook cell
df_comp

image image image image image image image image image image image image image image image image image image image image image image

image

image

Simple Python example

from collections import Counter

def gc_content(seq: str) -> float:
    """Return the GC percentage of a nucleotide sequence (case-insensitive).

    Works for DNA and RNA alike: the denominator is the number of
    unambiguous bases (A, C, G, T and U). Every other character
    (N, gaps, whitespace, ...) is ignored.

    Args:
        seq: Nucleotide sequence; upper and lower case are treated the same.

    Returns:
        The share of G and C among the counted bases, in percent (0-100).
        Returns 0.0 when the sequence contains no countable base.
    """
    counts = Counter(seq.upper())
    # Counter yields 0 for absent keys, so no explicit defaults are needed.
    gc = counts["G"] + counts["C"]
    # U is counted too; otherwise RNA input would have a too-small
    # denominator and report an inflated GC percentage.
    total = gc + counts["A"] + counts["T"] + counts["U"]
    if total == 0:
        return 0.0
    return gc / total * 100

# Example: 17 bases, 9 of which are G or C (5 G + 4 C)
sequence = "ATGCGCGATTACCGGTT"
print(f"GC% = {gc_content(sequence):.1f}%")  # prints "GC% = 52.9%"

Output:

GC% = 52.9%

image

def sliding_gc(seq: str, window: int = 100, step: int = 10):
    """Generate (start, GC%) pairs for successive windows over seq.

    Windows are `window` characters long and their start offsets advance by
    `step`. Only full-length windows are produced, so a sequence shorter
    than `window` yields nothing.
    """
    upper_seq = seq.upper()
    # Largest offset at which a complete window still fits
    last_start = len(upper_seq) - window
    for start in range(0, last_start + 1, step):
        yield start, gc_content(upper_seq[start:start + window])

# Example usage
# The window length is named once so the printed end offset cannot drift
# from the window actually passed to sliding_gc.
window_size = 5
for pos, gc in sliding_gc(sequence, window=window_size, step=2):
    # Label each window as "start-end" (end exclusive) followed by its GC%
    print(f"{pos}-{pos + window_size}: {gc:.1f}%")


image

Gene Vs Genome

image image image image

Codon

image image image image

image

from Bio.Seq import Seq

# A sample mRNA (38 nt: 12 complete codons plus 2 trailing bases, "CG")
mrna = Seq("AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCG")

# Translate until the first in-frame stop codon. This sequence has no
# in-frame stop, so all 12 complete codons are translated.
# NOTE(review): Biopython is expected to warn about the partial trailing
# codon — confirm against the installed version.
protein = mrna.translate(to_stop=True)
print(protein)  # MAMAPRTEINST

image image image image image image image

from Bio import SeqIO
from Bio.Seq import Seq

# 1. Transcription (DNA → RNA)
dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
mrna = dna.transcribe()      # replaces T → U
print("mRNA:", mrna)
# Output: mRNA: AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG

# 2. Translation (RNA → Protein)
# to_stop=True ends at the first in-frame stop codon (UGA, the 8th codon)
# and does not append the stop symbol "*" to the result.
protein = mrna.translate(to_stop=True)
print("Protein:", protein)
# Output: Protein: MAIVMGR

image

Amino Acid Vs Protein

image image image image image

from Bio.Seq import Seq

# Example mRNA → protein translation
mrna = Seq("AUGGCCUUUGCU")     # AUG GCC UUU GCU  →  Met Ala Phe Ala
protein = mrna.translate()     # uses standard genetic code
print(protein)  # MAFA

image

Motif

image image

def find_motif(seq, pattern):
    """Return the 0-based start index of every exact match of pattern in seq.

    Overlapping matches are all reported. An empty pattern yields no matches.

    Args:
        seq: Sequence string to scan.
        pattern: Motif to look for; compared character by character.

    Returns:
        List of start offsets in ascending order (empty if there is no match).
    """
    if not pattern:
        return []
    width = len(pattern)
    return [i for i in range(len(seq) - width + 1)
            if seq[i:i + width] == pattern]


seq_dna = "ACGT…"  # your 300-nt DNA string with TATAAT at pos 50-55
motif = "TATAAT"

# Find all exact matches (0-based indexing)
positions = find_motif(seq_dna, motif)
print("Found motif at DNA positions:", positions)

# For the RNA, replace T → U in BOTH the sequence and the motif: the
# converted sequence contains no T, so the DNA motif could never match it.
seq_rna = seq_dna.replace("T", "U")
motif_rna = motif.replace("T", "U")
positions_rna = find_motif(seq_rna, motif_rna)
print("Found motif at RNA positions:", positions_rna)

image

bp vs aa

image