Produce awesome plots using ggplot2 - rochevin/formation_ggplot2_2023 GitHub Wiki

Produce awesome plots using ggplot2

Testing different geom_*

Heatmap using geom_raster

p_raster <- iris.tbl %>% ggplot(aes(y=Part,x=Individual,fill=obs)) + geom_raster() + facet_wrap(~Metric) + scale_fill_viridis_c()
p_raster

pos_end_species <- iris.tbl %>% group_by(Species) %>% arrange(Individual) %>% slice(1)
p_raster + scale_x_continuous(breaks = pos_end_species$Individual,labels = pos_end_species$Species)

p_raster_2 <- iris.tbl %>%
  unite(Feature,Part,Metric)%>%  #Remerge into one column
  group_by(Species) %>% mutate(Ind_spe = 1:dplyr::n()) %>%  # Create Ind label but by species 
  ggplot(aes(y=Species,x=Ind_spe,fill=obs)) + geom_raster() + facet_wrap(~Feature,scales="free_x") + scale_fill_viridis_c()
p_raster_2

Distribution with geom_histogram and geom_density

iris.tbl %>% ggplot(aes(x=obs,y=..density..,col=Species)) + geom_histogram(bins=50) + geom_density() + facet_wrap(Part~Metric)

Statistics using stat_summary

By calling within a geom_*:

p1_bar <- iris.tbl %>% unite(Feature,Part,Metric) %>% ggplot(aes(x=Species,y=obs,fill=Feature)) +
  geom_bar(col="black",position="dodge",stat = "summary", fun = "mean")+ scale_fill_viridis_d(option="plasma") +
  coord_cartesian(expand = F)
p1_bar

Or by using it directly:

p2_bar <- iris.tbl %>% unite(Feature,Part,Metric) %>% ggplot(aes(x=Species,y=obs,fill=Feature)) + 
  stat_summary(geom = "bar",position = "dodge",fun.y = "mean",col="black") + scale_fill_viridis_d(option="plasma") +
  coord_cartesian(expand = F)
## Warning: `fun.y` is deprecated. Use `fun` instead.
p2_bar

You can also do it before plotting using stat="identity"

p3_bar <- iris.tbl %>% unite(Feature,Part,Metric) %>% group_by(Species,Feature) %>% summarise(mean=mean(obs)) %>% 
  ggplot(aes(x=Species,y=mean,fill=Feature)) + geom_bar(col="black",stat="identity",position="dodge") + scale_fill_viridis_d(option="plasma")
## `summarise()` has grouped output by 'Species'. You can override using the
## `.groups` argument.
p3_bar

You can also add standard deviation:

p4_bar <- iris.tbl %>% unite(Feature,Part,Metric) %>% ggplot(aes(x=Species,y=obs,fill=Feature)) + 
  stat_summary(geom = "bar",position = "dodge",fun = "mean",col="black") + scale_fill_viridis_d(option="plasma") +
  stat_summary(fun.data=mean_sdl,geom = "errorbar",position="dodge") +
  coord_cartesian(expand = F)
p4_bar

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width)) + geom_point()

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width)) + geom_point(aes(col=Species))

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width))  +
  geom_point(stat = "summary",fun = "mean",col="black",size = 4) +
  stat_summary(fun.data=mean_sdl,geom = "errorbar") +
  geom_point(aes(col=Species))

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width))  +
  geom_point(stat = "summary",fun = "mean",col="black",size = 4) +
  stat_summary(fun.data=mean_sdl,geom = "errorbar") +
  geom_line(stat = "summary",fun = "mean",col="black") +
  geom_point(aes(col=Species))

If you have continous value, you can plot a line with geom_line and plotting the standard deviation using geom_ribbon

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width))  +
  stat_summary(fun.data=mean_sdl,geom = "ribbon",alpha=0.4) +
  geom_line(stat = "summary",fun = "mean",col="black") +
  geom_point(aes(col=Species))

Add a regression line with geom_smooth

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width))  +
  geom_point()+
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width))  +
  geom_point()+
  geom_smooth(method = "loess")
## `geom_smooth()` using formula 'y ~ x'

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width))  +
  geom_point()+
  geom_smooth(method = "lm",formula = y ~ log(x))

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width,col=Species))  +
  geom_point()+
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

iris %>% ggplot(aes(x=Petal.Length,y=Petal.Width))  +
  geom_point(aes(col=Species))+
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Using extensions to enhance your plots

A PCA for iris with ggrepel and ggforce

A PCA with prcomp

pca.iris <- prcomp(dplyr::select(iris,-Species),scale=TRUE,center=TRUE)
First, see how much variance are explained by my component
percent_var_explained <- tibble(
  PC=1:ncol(pca.iris$x),
  percent_Var=(pca.iris$sdev^2 / sum(pca.iris$sdev^2)),
  cumsum_Var = cumsum(percent_Var)
)

pca.p1 <- percent_var_explained %>% 
  ggplot(aes(x=PC,y=percent_Var)) +
  geom_bar(stat="identity") +
  geom_line(aes(y=cumsum_Var),group=1) +
  geom_point(aes(y=cumsum_Var),col="red") +
  ggrepel::geom_label_repel(aes(y=cumsum_Var,label = scales::percent(cumsum_Var)),direction="y") +
  labs(
    x = "**Principal Component** <br>(PC)", 
    y = "**Contribution** <br>(Variance explained in %)"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme(
    axis.title.x = ggtext::element_markdown(),
    axis.title.y = ggtext::element_markdown()
    )
pca.p1

The first two axis are enough to explain 96% of the data
pca.iris.tibble <- bind_cols(
  iris %>%  dplyr::select(Species),
  pca.iris$x %>% setNames(glue::glue("PC{1:4}"))
)

pca.p2 <- pca.iris.tibble %>% ggplot(aes(x=PC1,y=PC2,fill=Species,label=Species)) +
  geom_point() + ggforce::geom_mark_hull()
pca.p2

require(patchwork)
## Le chargement a nΓ©cessitΓ© le package : patchwork
pca.p1 + pca.p2 + plot_layout(width=c(0.4,0.6))

Stack geom with extensions

A first example using ggdist and gghalves

iris.tbl  %>% 
  ggplot(aes(x = Species, y = obs,col=Species,fill = after_scale(colorspace::lighten(color, .5)))) + 
  ggdist::stat_halfeye(
    adjust = .5, 
    width = .6, 
    .width = 0, 
    justification = -.2, 
    point_colour = NA
  ) + 
  geom_boxplot(
    width = .15, 
    outlier.shape = NA
  ) +
  ## add justified jitter from the {gghalves} package
  gghalves::geom_half_point(
    ## draw jitter on the left
    side = "l", 
    ## control range of jitter
    range_scale = .4, 
    ## add some transparency
    alpha = .3
  ) + facet_grid(Part~Metric)

A second example using ggdist and ggridges

iris_ratio <- iris %>%
  mutate(Ind = 1:dplyr::n()) %>% 
  gather(key="Feature",value ="value",-Species,-Ind) %>%
  separate(Feature,into = c("Feature","Measure"),sep = "\\.") %>% 
  spread(key = Measure,value=value) %>% mutate(ratio = log2(Length/Width)) %>% 
  group_by(Species,Feature) %>% 
  mutate(
    n = dplyr::n(),
    median = median(ratio),
    max = max(ratio)
  ) %>% 
  ungroup() %>% 
  mutate(species_num = as.numeric(Species))



p_stat <- iris_ratio %>% 
  ggplot(aes(y=Species,x=ratio,color=Species)) +
  stat_summary(
    geom = "linerange",
    fun.min = function(x) -Inf,
    fun.max = function(x) median(x, na.rm = TRUE),
    linetype = "dotted",
    orientation = "y",
    size = .7
  ) +
  geom_point(
    aes(y = species_num - .12),
    shape = "|",
    size = 2,
    alpha = .33
  )  +
  ggridges::geom_density_ridges(aes(fill = Species),scale=0.9,alpha=0.7) +
  ggdist::stat_pointinterval(
    aes(
      y = species_num,
      color = Species,
      fill = after_scale(colorspace::lighten(color, .5))
    ),
    shape = 18,
    point_size = 3,
    interval_size = 1.8,
    adjust = .5,
    .width = c(0, 1)
  ) +
  geom_text(
    aes(x = median, label = format(round(median, 2), nsmall = 2)),
    stat = "unique",
    color = "white",
    family = "Open Sans",
    fontface = "bold",
    size = 3.4,
    nudge_y = .15
  ) +
  geom_label(
    aes(x = max, label = glue::glue("n = {n}")),
    stat = "unique",
    family = "Open Sans",
    fontface = "bold",
    size = 3.5,
    hjust = 0,
    nudge_x = .05,
    nudge_y = .02
  ) +
  facet_wrap(~Feature) +
  coord_cartesian(clip="off")

p_stat
## Picking joint bandwidth of 0.116

## Picking joint bandwidth of 0.0502

Solution for Differential Expression analysis

Read the data

dataset <- "DE_result_edgeR_1_0.05_CvsC_OHT_table.tsv"
dataset <- dataset %>% read_tsv()
## Rows: 12382 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): rowname, Gene.name
## dbl (7): logFC, logCPM, LR, PValue, p.adj, FILTER.FC, FILTER.P
## 
## β„Ή Use `spec()` to retrieve the full column specification for this data.
## β„Ή Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 12,382 Γ— 9
##    rowname        logFC logCPM      LR PValue p.adj FILTER.FC FILTER.P Gene.name
##    <chr>          <dbl>  <dbl>   <dbl>  <dbl> <dbl>     <dbl>    <dbl> <chr>    
##  1 ENSG000000… -0.00411   4.65 0.00101  0.975 1.00          0        0 TSPAN6   
##  2 ENSG000000…  0.0447    5.30 0.159    0.690 1.00          0        0 DPM1     
##  3 ENSG000000… -0.0252    2.78 0.0215   0.883 1.00          0        0 SCYL3    
##  4 ENSG000000… -0.0467    4.78 0.0998   0.752 1.00          0        0 C1orf112 
##  5 ENSG000000…  0.351     1.22 1.04     0.308 0.939         0        0 CFH      
##  6 ENSG000000… -0.0307    5.56 0.0581   0.810 1.00          0        0 FUCA2    
##  7 ENSG000000… -0.0262    5.27 0.0509   0.822 1.00          0        0 GCLC     
##  8 ENSG000000…  0.0491    5.67 0.188    0.665 1.00          0        0 NFYA     
##  9 ENSG000000… -0.254     2.78 2.23     0.135 0.736         0        0 STPG1    
## 10 ENSG000000… -0.178     5.05 1.89     0.169 0.794         0        0 NIPAL3   
## # … with 12,372 more rows
  • rowname: gene ID (Ensembl).
  • logFC: log2 ratio between expression of the treatment and the control condition.
  • logCPM: average expression for this gene.
  • p.adj: is the logFC significant ?
  • Gene.name: Name of the gene.

A first plot

dataset %>% ggplot(aes(x=logFC,y=-log10(p.adj))) +
  geom_point() +
  coord_cartesian(xlim=c(-2,2)) +
  geom_hline(yintercept = -log10(0.05),linetype="dashed") +
  geom_vline(xintercept = c(-1,1),linetype="dashed")

Gene classification using the fold change and the p-value

dataset <- dataset %>% mutate(Type = case_when(
  p.adj< 0.05 & logFC > 1 ~ "Upregulated",
  p.adj< 0.05 & logFC < -1 ~ "Downregulated",
  TRUE ~ "None"
))
dataset %>% count(Type)
## # A tibble: 3 Γ— 2
##   Type              n
##   <chr>         <int>
## 1 Downregulated    12
## 2 None          12314
## 3 Upregulated      56
dataset %>% ggplot(aes(x=logFC,y=-log10(p.adj))) +
  geom_point(aes(col=Type)) +
  coord_cartesian(xlim=c(-2,2)) +
  geom_hline(yintercept = -log10(0.05),linetype="dashed") +
  geom_vline(xintercept = c(-1,1),linetype="dashed")

Add the number of genes by category to the annotation

require(glue)
## Le chargement a nΓ©cessitΓ© le package : glue
dataset <- dataset %>% group_by(Type) %>% mutate(number_Type = dplyr::n()) %>% 
  mutate(Type_nb = glue("{Type} ({number_Type})"))

Build specific color by gene annotation

require(ggtext)
## Le chargement a nΓ©cessitΓ© le package : ggtext
dataset <- dataset %>% mutate(color = case_when(
  Type == "Upregulated" ~ "#c0392b",
  Type == "Downregulated" ~ "#2980b9",
  TRUE ~ "#535c68"
)) %>% 
  mutate(Type_nb = as.factor(glue("<i style='color:{color}'>{Type}</i> ({number_Type})"))) 
p1 <- dataset %>% ggplot(aes(x=logFC,y=-log10(p.adj))) +
  geom_point(aes(col=color)) +
  scale_color_identity(labels=levels(dataset$Type_nb),guide="legend") +
  geom_hline(yintercept = -log10(0.05),linetype="dashed") +
  geom_vline(xintercept = c(-1,1),linetype="dashed") +
  theme(legend.text = element_markdown())
p1

Label gene top of their respective category (need ggrepel package)

Here we need the top 15 of each category

require(ggrepel)
## Le chargement a nΓ©cessitΓ© le package : ggrepel
dataset_label <- dataset %>% filter(Type != "None") %>% 
  group_by(Type) %>% arrange(desc(abs(logFC))) %>%  slice(1:15)


p2 <- p1 + geom_label_repel(data=dataset_label,aes(label=Gene.name,col=color),fill=NA,box.padding = 1,max.overlaps = Inf,show.legend = F)

We can enhance the plot using a custom theme, and with adding title/subtitle/ect …

theme_set(theme_classic(base_size=12,base_family = "Open Sans"))
theme_update(legend.text = element_markdown(),
             legend.position ="bottom",
             plot.title.position = "plot",
             plot.caption.position = "plot",
             plot.title = ggtext::element_markdown(),
             plot.subtitle = ggtext::element_markdown(),
             plot.caption = ggtext::element_markdown(),
             legend.title = ggtext::element_markdown(),
             axis.title.x = ggtext::element_markdown(),
             axis.title.y = ggtext::element_markdown()
             )
p2 + labs(
  title = "Differential expression analysis after **DNA double-strand break damage induction**",
  subtitle = 'from RNA-seq data between two conditions : *OHT* (DSB damage induction) and *untreated*.',
  caption = "Data accession **E-MTAB-6318**",
  x = "**Fold-Change** after damage induction <br>(Log2)", 
  y = "**Adjusted P-value** <br>(-log10)",
  color = "Fold-change <br>(categorical)"
)
## Warning: Removed 1 rows containing missing values (geom_label_repel).

⚠️ **GitHub.com Fallback** ⚠️