III. Granges and Biostrings - abishpius/R-for-Computational-Biology GitHub Wiki

IRanges and GRanges

Introduction to Genomic Ranges

# install ERBS
install_github("genomicsclass/ERBS")

# load GM12878 and HepG2 objects from ERBS package
library(ERBS)
data(GM12878)
data(HepG2)

# inspect HepG2 GRanges object
class(HepG2)
HepG2
values(HepG2)

# seqnames extracts chromosome names
seqnames(HepG2)     # stored as type Rle
chr = seqnames(HepG2)
as.character(chr)    # view as character type

# make a table of numbers of sequences on each chromosome
table(chr)
table(chr)[1:24]    # restrict to autosomes, X and Y

# GRanges can be subsetted and ordered
HepG2[chr=="chr20",]
x = HepG2[order(HepG2),]
seqnames(x)     # demonstrate usefulness of Rle type
as.character(seqnames(x))

Interval ranges: IRanges

library(IRanges)
ir <- IRanges(5,10) #Create IRanges object 
start(ir)
end(ir)
width(ir)
?IRanges #Documentation

## Intra-Range Functions
shift(ir, -2) #right shift
narrow(ir, start = 2) #relative starts 
narrow(ir, end = 5) #relative end
flank(ir, width=3, start= TRUE, both = FALSE)
flank(ir, width=3, start= FALSE, both = FALSE)
flank(ir, width=3, start= TRUE, both = TRUE)

#Plot IRanges
plotir(ir,1)
polygon(c(start(ir)-.5,start(ir)-.5, end(ir)+.5, end(ir)+.5), c(-1,9,9,-1), col = rgb(1,0,0,.2), border = NA)
plotir(shift(ir,-2),2)
plotir(narrow(r,start=2),3)
plotir(flank(ir, width=3, start= TRUE, both = FALSE),5)

#Inter-Range Methods
ir <- IRanges(start = c(3,5,17), end = c(10,0,20))
range(ir)
reduce(ir)
gaps(ir)
disjoin(ir)

Genomic Ranges: GRanges

# Create G Range Object
library(GenomicRanges)
gr <- Granges('chr2', IRanges(start = c(5,10), end = c(35,45)), strangd = "+", seqlengths = c(chrZ-100L))
gr
shift(gr,10)
shift(gr,80) #overshoots range
trim(shift(gr,80))
mcols(gr)
mcols(gr)$value <- c(-1,4) #adjust values in gr
gr

# Granges Lists
gr2<- GRanges("chr2", IRanges(11:13,51:53))
mcols(gr)$value <- NULL
grl <- GrangesList(gr,gr2)
grl
length(grl)
grl[1](/abishpius/R-for-Computational-Biology/wiki/1)
mcols(grl)$value<- c(5,7)
grl

# Find Overlaps and %over%
gr1 <- GRanges("chr2", IRanges ( c(1,11,21,31,41), width =5))
gr2 <- GRanges("chr2", IRanges(c(19,33),c(38,35)))
fo <- findOverlaps(gr1,gr2)
fo
queryHits(fo)
subjectHits(fo)

gr1 %over% gr2 # finds overlaps
gr1[gr1 %over% gr2]
gr1 <- GRanges("chrZ",IRanges(1,10),strand="+")
gr2 <- GRanges("chrZ",IRanges(1,10),strand="-")
gr1 %over% gr2

#Rle and views classes
r <- Rle(c(1,1,1,0,0,-2,-2,-2,rep(-1,20)))
str(r)
as.numeric(r)

v <- Views(r, start=c(4,2), end=c(7,6))
str(v)

Operating on GRanges

  • reduce(x) produces a set of nonoverlapping ranges that cover all positions covered by x. This can be used to reduce complexity of a gene model with many transcripts, where we may just want the addresses of intervals known to be transcribed, regardless of transcript of residence.

  • disjoin(x) produces a set of ranges that cover all positions covered by x, such that none of the ranges in the disjoin output overlaps any end points of intervals in x. This gives us the largest possible collection of contiguous intervals that are separated wherever the original set of intervals had an endpoint.

  • gaps(x) produces a set of ranges covering the positions in [start(x), end(x)] that are not covered by any range in x. Given coding sequence addresses and exon intervals, this can be used to enumerate introns.

Code: Operating on Simple Ranges

ir <- IRanges(c(3, 8, 14, 15, 19, 34, 40),
  width = c(12, 6, 6, 15, 6, 2, 7))

#Let’s visualize ir and several intra-range operations.
par(mfrow=c(4,1), mar=c(4,2,2,2))
plotRanges(ir, xlim=c(0,60))
plotRanges(reduce(ir), xlim=c(0,60))
plotRanges(disjoin(ir), xlim=c(0,60))
plotRanges(gaps(ir), xlim=c(0,60))

Code: Extension to GRanges

library(GenomicRanges)
gir = GRanges(seqnames="chr1", ir, strand=c(rep("+", 4), rep("-",3)))

par(mfrow=c(4,1), mar=c(4,2,2,2))
plotGRanges(gir, xlim=c(0,60))
plotGRanges(resize(gir,1), xlim=c(0,60),col="green")
plotGRanges(flank(gir,3), xlim=c(0,60), col="purple")
plotGRanges(flank(gir,2,start=FALSE), xlim=c(0,60), col="brown")

Using GRanges for Genomic Analysis

Finding Overlaps

# load packages
library(GenomicFeatures)
library(GenomicRanges)
library(IRanges)
library(ERBS)

# load ESRRA ChIP data
data(HepG2)
data(GM12878)

browseVignettes("GenomicRanges")

# find binding sites common to both HepG2 and GM12878
?findOverlaps
# for each row in query, return overlapping row in subject
res = findOverlaps(HepG2, GM12878)
class(res)
res

# ranges from the query for which we found a hit in the subject
index = queryHits(res)
erbs = HepG2[index,]
erbs

# extract only the ranges
granges(erbs)
erbs

Genes as GRanges

## before video, run code from previous video AND these commands:
res = findOverlaps(HepG2,GM12878)
erbs = HepG2[queryHits(res)]
erbs = granges(erbs)

# define human genes
library(Homo.sapiens)
ghs = genes(Homo.sapiens)
ghs

# learn about the precede function (and related functions like follow)
?precede

# for each range in erbs, find the closest preceding range in ghs
index = precede(erbs, ghs)
ghs[index[1:3]]
erbs[1:3]    # note result is strand-aware

Finding the Nearest Gene

# distance between binding sites and nearest preceding genes
distance(erbs, ghs[index])

# find transcription start site nearest to each binding site
tssgr = resize(ghs, 1)
tssgr

# distance between binding site and nearest TSS
d = distanceToNearest(erbs, tssgr)
queryHits(d)
dists = values(d)$distance
hist(dists, nc=100, xlim=c(0,100000))

index = subjectHits(d)[dists < 1000]
index

Annotating Genes

# use select function to query Homo sapiens database
tssgr[index,]
keytypes(Homo.sapiens)
keys = as.character(values(tssgr[index])$GENEID)
columns(Homo.sapiens)
res = select(Homo.sapiens, keys = keys,
             columns = c("SYMBOL", "GENENAME"), keytype="GENEID")
res[1:2,]

Biostrings

Simple and efficient handling of DNA, RNA and amino acid sequences in Bioconductor is enabled by the Biostrings package. The package defines several classes for representing individual molecular sequences as well as sets of sequences. The package also provides highly optimized functions for performing common operations on sequences and sequence sets.

DNAStringSet Objects

# basics of DNAStringSets
set1 <- DNAStringSet(c("TCA", "AAATCG", "ACGTGCCTA", "CGCGCA", "GTT", "TCA"))    # define a DNAStringSet
set1
set1[2:3]    # extract subset of sequences
set1[4](/abishpius/R-for-Computational-Biology/wiki/4)    # extract one sequence as a single DNAString
length(set1)    # number of DNAstrings in set
width(set1)    # size of each DNAString
duplicated(set1)    # detect which sequences are duplicated
unique(set1)    # keep only unique sequences
sort(set1)

Operations on DNAStrings

dna_seq <- DNAString("ATCGCGCGCGGCTCTTTTAAAAAAACGCTACTACCATGTGTGTCTATC")

# analyze DNAStrings
letterFrequency(dna_seq, "A")    # count A in sequence
letterFrequency(dna_seq, "GC")    # count G or C in sequence
dinucleotideFrequency(dna_seq)    # frequencies of all dinucleotides
trinucleotideFrequency(dna_seq)    # frequencies of all trinucleotides

# convert DNAStrings
reverseComplement(dna_seq)    # find reverse complement
translate(dna_seq)    # amino acid translation

Matching and Counting with Biostrings

# count and match on individual Biostrings
dna_seq <- DNAString("ATCGCGCGCGGCTCTTTTAAAAAAACGCTACTACCATGTGTGTCTATC")
dna_seq
countPattern("CG", dna_seq)    # pattern "CG" occurs 5 times
matchPattern("CG", dna_seq)    # locations of pattern "CG"
start(matchPattern("CG", dna_seq))    # start locations of the pattern
matchPattern("CTCTTTTAAAAAAACGCTACTACCATGTGT", dna_seq)    # match patterns of any length

# check for pattern and its reverse complement
countPattern("TAG", dna_seq)
countPattern(reverseComplement(DNAString("TAG")), dna_seq)

# count and match on sets of Biostrings
set2 <- DNAStringSet(c("AACCGGTTTCGA", "CATGCTGCTACA", "CGATCGCGCCGG", "TACAACCGTACA"))
set2
vcountPattern("CG", set2)    # CG counts for entire DNAStringSet
vmatchPattern("CG", set2)
vmatchPattern("CG", set2)[1](/abishpius/R-for-Computational-Biology/wiki/1)    # access matches for the first element of the DNAStringSet

Getting the Sequence of Regions

library(ERBS)
data(HepG2)
HepG2

# load and inspect human reference genome
library(BSgenome.Hsapiens.UCSC.hg19)
Hsapiens

# extract chromosome 17 sequence
c17 = Hsapiens$chr17
c17

?getSeq
class(Hsapiens)
showMethods("getSeq")

# collection of DNA strings with ChIP-seq binding peaks
hepseq = getSeq(Hsapiens, HepG2)
length(HepG2)    # same number of sequences
width(HepG2)[1:5]    # widths match

# collection of shifted DNA strings with no relationship to binding sequences - essentially random
rhepseq = getSeq(Hsapiens, shift(hepG2, 2500))

# count occurrences of a motif in DNA sequences
mot = "TCAAGGTCA"
?vmatchPattern
vcountPattern(mot, hepseq)

# consider both forward matches and reverse complement matches 
sum(vcountPattern(mot, hepseq))    # forward pattern match
sum(vcountPattern(mot, reverseComplement(hepseq)))    # reverse pattern match

## compare motif occurrence in binding peak to random upstream sequences
# count of motifs in binding peaks
sum(vcountPattern(mot, hepseq)) +
    sum(vcountPattern(mot, reverseComplement(hepseq)))
# count of motifs in randomly selected regions of equal length
sum(vcountPattern(mot, rhepseq)) +
    sum(vcountPattern(mot, reverseComplement(rhepseq)))

# for real analysis, use MotifDb package, probabilistic binding packages like MEME and FIMO