Data manipulation with Tidyverse - rochevin/formation_ggplot2_2023 GitHub Wiki

Data manipulation with Tidyverse

Read/write tibbles: readr::read_* / readr::write_*

Read/write tibbles: :

  • readr::read_*:

    • read_csv: comma delimited files.
    • read_csv2: semi-colon delimited files.
    • read_tsv: tab delimited files.
    • read_delim: any delimiter.
  • readr::write_*:

    • write_csv: comma delimited files.
    • write_csv2: semi-colon delimited files.
    • write_tsv: tab delimited files.
    • write_delim: any delimiter.

Example

iris <- read_tsv("iris.tsv",
         col_types = cols(
            Sepal.Length = col_double(),Sepal.Width = col_double(),
            Petal.Length = col_double(),Petal.Width = col_double(),
            Species = col_character()
         )
    )
iris
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <chr>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 140 more rows

Using tidyr to reshape data

Use tidyr::pivot_longer() and tidyr::pivot_wider() to reorganize the value.

iris.tbl <- iris %>% 
    dplyr::mutate(Individual = 1:dplyr::n()) %>%
    tidyr::pivot_longer(-Species:-Individual,names_to = "Type",values_to = "obs")
iris.tbl
# A tibble: 600 × 4
   Species Individual Type           obs
   <fct>        <int> <chr>        <dbl>
 1 setosa           1 Sepal.Length   5.1
 2 setosa           1 Sepal.Width    3.5
 3 setosa           1 Petal.Length   1.4
 4 setosa           1 Petal.Width    0.2
 5 setosa           2 Sepal.Length   4.9
 6 setosa           2 Sepal.Width    3  
 7 setosa           2 Petal.Length   1.4
 8 setosa           2 Petal.Width    0.2
 9 setosa           3 Sepal.Length   4.7
10 setosa           3 Sepal.Width    3.2
iris.tbl %>% pivot_wider(names_from = Type,values_from = obs)
# A tibble: 150 × 6
   Species Individual Sepal.Length Sepal.Width Petal.Length Petal.Width
   <fct>        <int>        <dbl>       <dbl>        <dbl>       <dbl>
 1 setosa           1          5.1         3.5          1.4         0.2
 2 setosa           2          4.9         3            1.4         0.2
 3 setosa           3          4.7         3.2          1.3         0.2
 4 setosa           4          4.6         3.1          1.5         0.2
 5 setosa           5          5           3.6          1.4         0.2
 6 setosa           6          5.4         3.9          1.7         0.4
 7 setosa           7          4.6         3.4          1.4         0.3
 8 setosa           8          5           3.4          1.5         0.2
 9 setosa           9          4.4         2.9          1.4         0.2
10 setosa          10          4.9         3.1          1.5         0.1

`

Use separate to split one column into multiples columns

iris.tbl <- iris.tbl %>% tidyr::separate(Type,into = c("Part","Metric"),se="\\.")
iris.tbl
# A tibble: 600 × 5
   Species Individual Part  Metric   obs
   <fct>        <int> <chr> <chr>  <dbl>
 1 setosa           1 Sepal Length   5.1
 2 setosa           1 Sepal Width    3.5
 3 setosa           1 Petal Length   1.4
 4 setosa           1 Petal Width    0.2
 5 setosa           2 Sepal Length   4.9
 6 setosa           2 Sepal Width    3  
 7 setosa           2 Petal Length   1.4
 8 setosa           2 Petal Width    0.2
 9 setosa           3 Sepal Length   4.7
10 setosa           3 Sepal Width    3.2
# … with 590 more rows

Manipulate a dataset using dplyr

Manipulate variables using dplyr::select()

iris.tbl %>% select(1:3)
iris.tbl %>% select(Part,obs)
iris.tbl %>% select(Part:obs)
iris.tbl %>% select(-Part,-Metric)
iris %>% select(starts_with("Petal"))

Manipulate variables using dplyr::pull()

iris %>% pull(Sepal.Length)
  [1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7
 [20] 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9
 [39] 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3
 [58] 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6
 [77] 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6
 [96] 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7
[115] 5.8 6.4 6.5 7.7 7.7 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4
[134] 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9

Modify variables using dplyr::mutate

iris %>%
    mutate(Petal.Length = 
               scales::percent(Petal.Length/max(Petal.Length))
           )
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl> <chr>              <dbl> <fct>  
##  1          5.1         3.5 20.3%                0.2 setosa 
##  2          4.9         3   20.3%                0.2 setosa 
##  3          4.7         3.2 18.8%                0.2 setosa 
##  4          4.6         3.1 21.7%                0.2 setosa 
##  5          5           3.6 20.3%                0.2 setosa 
##  6          5.4         3.9 24.6%                0.4 setosa 
##  7          4.6         3.4 20.3%                0.3 setosa 
##  8          5           3.4 21.7%                0.2 setosa 
##  9          4.4         2.9 20.3%                0.2 setosa 
## 10          4.9         3.1 21.7%                0.1 setosa 
## # … with 140 more rows

Create variables using dplyr::mutate

iris %>%
    #Case 1
    mutate(Size = dplyr::case_when(
            Petal.Length >= 4 & Petal.Width >= 1.3 ~ "Big",
            Petal.Length < 4 & Petal.Width < 1.3 ~ "Small"
        )
    ) %>%
    #Case 2
    mutate(Ratio_Petal_Sepal_width = Petal.Width/Sepal.Width)
# A tibble: 150 × 7
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Size  Ratio_Petal_Sepa…
          <dbl>       <dbl>        <dbl>       <dbl> <fct>   <chr>             <dbl>
 1          5.1         3.5          1.4         0.2 setosa  Small            0.0571
 2          4.9         3            1.4         0.2 setosa  Small            0.0667
 3          4.7         3.2          1.3         0.2 setosa  Small            0.0625
 4          4.6         3.1          1.5         0.2 setosa  Small            0.0645
 5          5           3.6          1.4         0.2 setosa  Small            0.0556
 6          5.4         3.9          1.7         0.4 setosa  Small            0.103 
 7          4.6         3.4          1.4         0.3 setosa  Small            0.0882
 8          5           3.4          1.5         0.2 setosa  Small            0.0588
 9          4.4         2.9          1.4         0.2 setosa  Small            0.0690
10          4.9         3.1          1.5         0.1 setosa  Small            0.0323
# … with 140 more rows

other functions:

  • dplyr::mutate_all(): Apply a function to every columns.
  • dplyr::mutate_at(): Apply a function to specific column.

Arrange dataset using dplyr::arrange()

iris %>% arrange(Petal.Length) %>% slice(1:5)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          4.6         3.6          1.0         0.2  setosa
2          4.3         3.0          1.1         0.1  setosa
3          5.8         4.0          1.2         0.2  setosa
4          5.0         3.2          1.2         0.2  setosa
5          4.7         3.2          1.3         0.2  setosa

Use desc() to order by high to low.

Manipulate observations using dplyr::filter()

iris %>% filter(Sepal.Length < 7) %>% slice(1:5)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
iris %>% filter(Species ==  "versicolor") %>% slice(1:5)
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1          7.0         3.2          4.7         1.4 versicolor
2          6.4         3.2          4.5         1.5 versicolor
3          6.9         3.1          4.9         1.5 versicolor
4          5.5         2.3          4.0         1.3 versicolor
5          6.5         2.8          4.6         1.5 versicolor

Manipulate observations using dplyr::filter()

iris %>%
    filter(Sepal.Length < 7) %>%
    filter(Species ==  "setosa") %>% slice(1:5)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa

Summarise observations using dplyr::summarise()

iris %>% summarise(mean = mean(Petal.Length), sd = sd(Petal.Length))
   mean       sd
1 3.758 1.765298
iris %>% select(-Species) %>%summarise_all(mean)
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1     5.843333    3.057333        3.758    1.199333
iris %>% summarise(n = n())
    n
1 150

Summarise observations using dplyr::group_by()

iris %>% group_by(Species) %>% summarise(mean = mean(Petal.Length), sd = sd(Petal.Length))
# A tibble: 3 × 3
  Species     mean    sd
  <fct>      <dbl> <dbl>
1 setosa      1.46 0.174
2 versicolor  4.26 0.470
3 virginica   5.55 0.552
iris %>% group_by(Species) %>% summarise(n = n())
# A tibble: 3 × 2
  Species        n
  <fct>      <int>
1 setosa        50
2 versicolor    50
3 virginica     50
iris %>% group_by(Species) %>% tally()
iris %>% count(Species)
# A tibble: 3 × 2
  Species        n
  <fct>      <int>
1 setosa        50
2 versicolor    50
3 virginica     50

Summarise observations using dplyr::group_by()

iris %>% group_by(Species) %>% filter(Petal.Length >= max(Petal.Length))
# A tibble: 4 × 5
# Groups:   Species [3]
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
         <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          4.8         3.4          1.9         0.2 setosa    
2          5.1         3.8          1.9         0.4 setosa    
3          6           2.7          5.1         1.6 versicolor
4          7.7         2.6          6.9         2.3 virginica 
iris %>% group_by(Species) %>% top_n(1,Petal.Length)
# A tibble: 4 × 5
# Groups:   Species [3]
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
         <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          4.8         3.4          1.9         0.2 setosa    
2          5.1         3.8          1.9         0.4 setosa    
3          6           2.7          5.1         1.6 versicolor
4          7.7         2.6          6.9         2.3 virginica 

Mutating joins using dplyr::*_join()

iris.meta <- tibble(
    Species = factor(c("setosa","versicolor","virginica")),
    Colony = c("A","A","B") ,
    Ploidy = c("diploid","hexaploid","tetraploid"),
    `Common name` = c("Beachhead iris","Harlequin blueflag","Virginia iris")
)

Inner join

Inner join: return all rows from x where there are matching values in y, and all columns from x and y. If there are multiple matches between x and y, all combination of the matches are returned.

iris.meta.nosetosa <- iris.meta %>% filter(Species !="setosa")
iris %>% inner_join(iris.meta.nosetosa,by = "Species")
# A tibble: 100 × 8
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species    Colony Ploidy   
          <dbl>       <dbl>        <dbl>       <dbl> <fct>      <chr>  <chr>    
 1          7           3.2          4.7         1.4 versicolor A      hexaploid
 2          6.4         3.2          4.5         1.5 versicolor A      hexaploid
 3          6.9         3.1          4.9         1.5 versicolor A      hexaploid
 4          5.5         2.3          4           1.3 versicolor A      hexaploid
 5          6.5         2.8          4.6         1.5 versicolor A      hexaploid
 6          5.7         2.8          4.5         1.3 versicolor A      hexaploid
 7          6.3         3.3          4.7         1.6 versicolor A      hexaploid
 8          4.9         2.4          3.3         1   versicolor A      hexaploid
 9          6.6         2.9          4.6         1.3 versicolor A      hexaploid
10          5.2         2.7          3.9         1.4 versicolor A      hexaploid

Outer joins

  • A left join keeps all observations in x.
  • A right join keeps all observations in y.
  • A full join keeps all observations in x and y.

Left join

left join:

iris.meta.nosetosa <- iris.meta %>% filter(Species !="setosa")
iris %>% left_join(iris.meta.nosetosa,by = "Species")
# A tibble: 150 × 8
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Colony Ploidy
          <dbl>       <dbl>        <dbl>       <dbl> <fct>   <chr>  <chr> 
 1          5.1         3.5          1.4         0.2 setosa  NA     NA    
 2          4.9         3            1.4         0.2 setosa  NA     NA    
 3          4.7         3.2          1.3         0.2 setosa  NA     NA    
 4          4.6         3.1          1.5         0.2 setosa  NA     NA    
 5          5           3.6          1.4         0.2 setosa  NA     NA    
 6          5.4         3.9          1.7         0.4 setosa  NA     NA    
 7          4.6         3.4          1.4         0.3 setosa  NA     NA    
 8          5           3.4          1.5         0.2 setosa  NA     NA    
 9          4.4         2.9          1.4         0.2 setosa  NA     NA    
10          4.9         3.1          1.5         0.1 setosa  NA     NA    
# … with 140 more rows, and 1 more variable: `Common name` <chr>

Right join

right join:

iris.nosetosa <- iris %>% filter(Species !="setosa")
iris.nosetosa %>% right_join(iris.meta,by = "Species")%>% tail()
# A tibble: 6 × 8
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   Colony Ploidy    
         <dbl>       <dbl>        <dbl>       <dbl> <fct>     <chr>  <chr>     
1          6.7         3            5.2         2.3 virginica B      tetraploid
2          6.3         2.5          5           1.9 virginica B      tetraploid
3          6.5         3            5.2         2   virginica B      tetraploid
4          6.2         3.4          5.4         2.3 virginica B      tetraploid
5          5.9         3            5.1         1.8 virginica B      tetraploid
6         NA          NA           NA          NA   setosa    A      diploid   
# … with 1 more variable: `Common name` <chr>

Outer joins

full join:

iris.meta.nosetosa <- iris.meta %>% filter(Species !="setosa")
iris %>% filter(Species != "virginica") %>% full_join(iris.meta.nosetosa,by = "Species") %>% group_by(Species) %>% slice(1:5)
# A tibble: 11 × 8
# Groups:   Species [3]
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species    Colony Ploidy    
          <dbl>       <dbl>        <dbl>       <dbl> <fct>      <chr>  <chr>     
 1          5.1         3.5          1.4         0.2 setosa     NA     NA        
 2          4.9         3            1.4         0.2 setosa     NA     NA        
 3          4.7         3.2          1.3         0.2 setosa     NA     NA        
 4          4.6         3.1          1.5         0.2 setosa     NA     NA        
 5          5           3.6          1.4         0.2 setosa     NA     NA        
 6          7           3.2          4.7         1.4 versicolor A      hexaploid 
 7          6.4         3.2          4.5         1.5 versicolor A      hexaploid 
 8          6.9         3.1          4.9         1.5 versicolor A      hexaploid 
 9          5.5         2.3          4           1.3 versicolor A      hexaploid 
10          6.5         2.8          4.6         1.5 versicolor A      hexaploid 
11         NA          NA           NA          NA   virginica  B      tetraploid
# … with 1 more variable: `Common name` <chr>

Filtering joins

  • semi_join(x, y) keeps all observations in x that have a match in y.

  • anti_join(x, y) drops all observations in x that have a match in y.

Filtering joins

treatments <- 
    tibble(
        treatment_a = sample(10:25,size = 3,replace = T),
        treatment_b = sample(10:25,size = 3,replace = T),
        treatment_c = sample(10:25,size = 3,replace = T),
        person = c("John Smith","Jane Doe","Mary Johnson")
    ) %>% pivot_longer(-person,names_to = "treatment",values_to = "result")


treatment.meta <- tibble(
    First_Name = c("John","Jane"),
    Last_Name = c("Smith","Doe"),
    age = sample(20:80,size = 2),
    sex = c("M","F"),
    adress = c("221B Baker Street","57 Rue de Varenne")
)

treatments %>%
    separate(person,into = c("First_Name","Last_Name"),sep=" ") %>%
    semi_join(treatment.meta,by = c("First_Name","Last_Name"))
# A tibble: 6 × 4
  First_Name Last_Name treatment   result
  <chr>      <chr>     <chr>        <int>
1 John       Smith     treatment_a     20
2 John       Smith     treatment_b     21
3 John       Smith     treatment_c     14
4 Jane       Doe       treatment_a     19
5 Jane       Doe       treatment_b     13
6 Jane       Doe       treatment_c     21
⚠️ **GitHub.com Fallback** ⚠️