Metagenomics‐theory - igheyas/Bioinformatics GitHub Wiki
Step1:
cd /mnt/c/Users/IAGhe/OneDrive/Documents/Bioinformatics
conda activate metagenomics_toy
#Toy data: Toy genomes
#!/usr/bin/env python3
import random
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def random_seq(n):
return ''.join(random.choices("ACGT", k=n))
records = []
for i in range(1,4):
seq = random_seq(1000)
rec = SeqRecord(Seq(seq),
id=f"genome{i}",
description="toy genome")
records.append(rec)
with open("data/genomes.fasta", "w") as out:
SeqIO.write(records, out, "fasta")
print("Wrote 3 toy genomes to data/genomes.fasta")
# make directory and run
mkdir -p data
chmod +x generate_toy_genomes.py
./generate_toy_genomes.py
Open genome file:
The file contain s the following:
Step 2: Simulate paired-end Illumina reads
We’ll use ART (HiSeq2500 profile, 100 bp reads, insert 200±10 bp):
mkdir -p data/simulated
art_illumina \
-ss HS25 \
-i data/genomes.fasta \
-p \
-l 100 \
-f 10 \
-m 200 \
-s 10 \
-o data/simulated/toy
Output:
Output:
toy1.ALN
toy1.fq
toy2.ALN
toy2.fq
4. Quality control with Fastp
mkdir -p qc
fastp \
-i data/simulated/toy1.fq \
-I data/simulated/toy2.fq \
-o qc/clean_1.fq \
-O qc/clean_2.fq \
-h qc/fastp.html \
-j qc/fastp.json
- Open qc/fastp.html in your browser to inspect per-base quality, adapter trimming, etc.
Output
#fastap.html
Step: 5a
mkdir -p kraken_db
kraken2-build --download-taxonomy --skip-maps --db kraken_db
sed -e 's/^>genome1/>genome1|kraken:taxid|562|/' \
-e 's/^>genome2/>genome2|kraken:taxid|561|/' \
-e 's/^>genome3/>genome3|kraken:taxid|562|/' \
data/genomes.fasta > data/genomes_taxid.fasta
kraken2-build --build --db kraken_db
Step 5b. Classify your QC’d reads
# from your project root
mkdir -p kraken_out
kraken2 \
--db kraken_db \
--paired qc/clean_1.fq qc/clean_2.fq \
--report kraken_out/report.txt \
--output kraken_out/output.txt
Output
6. Assemble with MEGAHIT
# (make sure you’re in your project root)
megahit \
-1 qc/clean_1.fq \
-2 qc/clean_2.fq \
-o assembly
Output
this is snapshot of final.configs.fa file:
7. Annotate contigs with Prokka
prokka \
--outdir prokka \
--prefix toy \
--cpus 4 \
assembly/final.contigs.fa
Output:
#File toy.gff: ##gff-version 3 ##sequence-region k21_2 1 978 ##sequence-region k21_1 1 893 ##sequence-region k21_0 1 933 ##FASTA
k21_2 TTGTGAAATCAGTATCGAGGATGCGTTGGTTGGCCTCCAGCGAGCATGGGAGAATCATCC GAGTCCCTTCCAGAATCCACAAGTCCTCACTGATAAGTGACGGTTTGGAAAAGTTCTGGT TCCGTATGCTTCCTACCGCCCCTTGTTCTGTGTCAGACCGCCGCATAAAGTCTATCCACT TCGCACATAGAGGACTCCTAGCCACATCCCTTCGCGGGCATGCCTGCAGGACCCACCTTT CACAGCTTCCTTGCGGGCCGGGCTGTCCGGGGCCTACAGTTGGCGACGAGTGCGGTACCG GGGAAGGCGCCTGGCATACCTTTGACTGAGCGCGGAATTAGGTAGATTAACACTCGCGGC TGAGATTTAGTTGTCTGAGGTACTATTTCTAACTTTTGACCTTTAGCGCAGGCGAACGCG CACAGCTCCGCTTAAATGCGTTACGAAGGGCGATCTATTCCGGTTACACACGTCCATACT ACAGGAAACTTGCACCGCCTCGCGCCGAGTACCGGGAATGTATCTATGCTAACTACTGGT GATAACCGTAAGGAGGCGTCTGAGATAACAGGGACAGGAGCCTCACCAGACGTAACAGAC TCTTCAAGCATTTTCCAACGTCCGGAACGGTTCCAGTACAACAGCGACAATCCTGGGATG GTTCGTACCAAGTTCTCATTTAAGCAACACAACCAATAGTTCCACGCAATACGCGTAGGC CGGTCCTGGCTAGTGTTCGAGTCAGCTCCCGCTGAGCTCAGTCGACGCTATACGAACAAG GCGTAGCATCAGTTTCCCCGCCGGGGGCCTCTCTGAACCGCGGTGAAGTGCCGCATACAC AATTTTTCGCAATAAATGATTACTTGGTACACAGTGGTTGGTCAAATAAGGCCCCGAGAG CCAACTATTGGTGAGTTAGTATATGGTGAGCGAAATAGGAGATCTCGTTTTCGCCGGCGC GAAGCGCGAATAAGTTAT k21_1 AAATTCGCACTGTTCCGAACCTCCGCGCCGTAGAAGGGACTCTAGCGTGGGCTTGTGGTC AGCTTGACCATACTGTCTACGCGCAAGGCGCACCACATGGGGGCGAGGATAATGCATAGC CAAAGACGTTCATACAACGGATTCATAATCCACGCATTAGGCAGTTCGCTTTTTAGTGAC GTAGGCTAGTACACAACTGCCATCATCGGGGTCCTACAGGGGGGCAAATCCCTCGTTGGA GGCGTGGAAGGGATAAGATTGTTGCTTCTGTAATAACTCCAATCCTAAATTTTAAGGAGT TCGTGATATTCTCAATTCGTCCTACGGCTTGGACACACGACTTTCCTCTCGGGTTCCAAG ATGGTATTGTTTATGACGTGGTAATTAAACTCCTTGCTAGCGAACCACTCTTAAAGCTCA CACGATCGGAAGGTTCAGTTTCCTTCCACAGGGATTGCTTCCTGAGCATCAATTATACGA CGCGTCATCAAAAACACGAATATAGCATCACCGAAACCGGTAACACCCTGTGACGTCCGC ACCTGCGTACGTGTACTCCAAGTCTATAAAGCGTTGACTCCTGTGCTAACGGTGCTAAGA CTACACGGGATGAAGGGGGTGGTCATTCCTCGGATGAAGTAGACGTCCGTGCACCACTGA GTAAAGAGACCACTTGCAACTTACTGAGGGCCGCATTGGACTACAGCAAGGGGATTCGAT GGCTCGATATGGGGTTACCTCCTAGAAAAGCTACAACGATTTAGCTTCGAAGAAAATATG TTACCTAAACCCTCGATGGTTGTGGACCCCAGCCTTGTTATGTGCATCCGCATAGCATTG GACCGCCGATGCTGGAAACGCCGAGGCCGATGACGCGTCTTGGCAGGCACATA k21_0 CAGCGGATCGAAATCCTACGCGTAGCACATTACGAGGCCCCAGAACTGAACTTTGTAGAA TGCGCTAATACGTAACGTGGTGGTAACTTTTAACTTTTGGTGTGGGAGGAGCGATCTTCT TGCGACCACACTATAGTAGAGCCCGCCTAGCTCCCAATCGACTCCTTGGAACGGCTGTCT TTGTTACACTTTATAAACAGGAGCGGATGACTAAGAAAAAGGACCCCCCGTTCCCCTTTC GTTATATATTGGTAATCCCTCTACATGATAGACTTATCACAGGCATTGGTTTCGTCCCGT GTCTTCGCGTTCAAGGTGTCCATTAGCACTACAGCGCGAACAGTCTTTCAGCTATACAAC TGTATTCTCACTGTCCCGGGAAGTAGGCTGTAGATGGTAAGCCTGCGACCTACCATATAA TAGGTGCGGGAACCCCGCCCAGAGGATCAACGTCGCGTCAGGGTGTAGCTCAATAGCCTA CTACGTCGCCTGTATCGTCATCCGTGTTATATTAAATGGTAACACCCTCACTATTTTCGC CGTAGTTTTTTTCCTCACCTTTGTTGAAGACCGAGTTGCTTCGATAAGCGCAATGGCTAA TCGCTGAGTTGGCTGCGAGCTATTTCGATTCCATTTCTTCGAGGGCGTAACACTCTTCGG CAGACGCCTGGTAAGGGCAACGGGGCTATGACCAGCCTCCGCGACTCAGCCGACACCATG ATGGCGGGTGATTTCTATAGAATCCGGTCGCATGGGTGTAGCACACGAAACTTACCCTAG CCCGAGATAGTCCGTACGAGAGCGGATGGTTATCTGTCGAGCCTCGCTACAAAGCTGCCA CGAAAGGGTAAGGATCGGCCATCGAGGCCTGCTACGACGATAAGAACCATCCAAGATATT GAGCGTCATGATATCCAAGTTATTCATGGCCCC
toy.faa file is empty
#toy.ffn is also empty
8. Parse & visualize in Python
%%writefile analysis.py
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
# 1) Taxonomic report
rep = pd.read_csv(
"kraken_out/report.txt",
sep="\t",
header=None,
names=["pct","clade_reads","taxon_reads","rank","taxid","name"]
)
top = rep.sort_values("taxon_reads", ascending=False).head(5)
print("\nTop 5 taxa by read count:")
print(top["name","taxon_reads"](/igheyas/Bioinformatics/wiki/"name","taxon_reads"))
plt.figure(figsize=(6,4))
sns.barplot(data=top, y="name", x="taxon_reads", palette="mako")
plt.xlabel("Reads assigned")
plt.ylabel("")
plt.tight_layout()
plt.savefig("kraken_out/top5_taxa.png")
plt.close()
# 2) Assembly stats
contigs = list(SeqIO.parse("assembly/final.contigs.fa", "fasta"))
lengths = np.array([len(c.seq) for c in contigs])
print(f"\nAssembly: {len(lengths)} contigs, {lengths.sum()} bp total")
plt.figure(figsize=(6,4))
sns.histplot(lengths, bins=min(20, len(lengths)), color="steelblue")
plt.xlabel("Contig length (bp)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("assembly/contig_length_hist.png")
plt.close()
# 3) Prokka features (robust parsing)
gff = "prokka/toy.gff"
cds = rrna = trna = 0
with open(gff) as fh:
for line in fh:
line = line.strip()
# skip blank lines, comments, FASTA headers
if not line or line.startswith("#") or line.startswith(">"):
continue
cols = line.split("\t")
if len(cols) < 3:
continue
feat = cols[2]
if feat == "CDS":
cds += 1
elif feat == "rRNA":
rrna += 1
elif feat == "tRNA":
trna += 1
print(f"\nProkka found: {cds} CDS, {rrna} rRNA, {trna} tRNA")