Data manipulation with Tidyverse - rochevin/formation_ggplot2 GitHub Wiki
Read/write tibbles: :
-
readr::read_*
:-
read_csv
: comma delimited files. -
read_csv2
: semi-colon delimited files. -
read_tsv
: tab delimited files. -
read_delim
: any delimiter.
-
-
readr::write_*
:-
write_csv
: comma delimited files. -
write_csv2
: semi-colon delimited files. -
write_tsv
: tab delimited files. -
write_delim
: any delimiter.
-
iris <- read_tsv("iris.tsv",
col_types = cols(
Sepal.Length = col_double(),Sepal.Width = col_double(),
Petal.Length = col_double(),Petal.Width = col_double(),
Species = col_character()
)
)
iris
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
iris.tbl <- iris %>%
dplyr::mutate(Individual = 1:dplyr::n()) %>%
tidyr::pivot_longer(-Species:-Individual,names_to = "Type",values_to = "obs")
iris.tbl
# A tibble: 600 × 4
Species Individual Type obs
<fct> <int> <chr> <dbl>
1 setosa 1 Sepal.Length 5.1
2 setosa 1 Sepal.Width 3.5
3 setosa 1 Petal.Length 1.4
4 setosa 1 Petal.Width 0.2
5 setosa 2 Sepal.Length 4.9
6 setosa 2 Sepal.Width 3
7 setosa 2 Petal.Length 1.4
8 setosa 2 Petal.Width 0.2
9 setosa 3 Sepal.Length 4.7
10 setosa 3 Sepal.Width 3.2
iris.tbl %>% pivot_wider(names_from = Type,values_from = obs)
# A tibble: 150 × 6
Species Individual Sepal.Length Sepal.Width Petal.Length Petal.Width
<fct> <int> <dbl> <dbl> <dbl> <dbl>
1 setosa 1 5.1 3.5 1.4 0.2
2 setosa 2 4.9 3 1.4 0.2
3 setosa 3 4.7 3.2 1.3 0.2
4 setosa 4 4.6 3.1 1.5 0.2
5 setosa 5 5 3.6 1.4 0.2
6 setosa 6 5.4 3.9 1.7 0.4
7 setosa 7 4.6 3.4 1.4 0.3
8 setosa 8 5 3.4 1.5 0.2
9 setosa 9 4.4 2.9 1.4 0.2
10 setosa 10 4.9 3.1 1.5 0.1
`
iris.tbl <- iris.tbl %>% tidyr::separate(Type,into = c("Part","Metric"),se="\\.")
iris.tbl
# A tibble: 600 × 5
Species Individual Part Metric obs
<fct> <int> <chr> <chr> <dbl>
1 setosa 1 Sepal Length 5.1
2 setosa 1 Sepal Width 3.5
3 setosa 1 Petal Length 1.4
4 setosa 1 Petal Width 0.2
5 setosa 2 Sepal Length 4.9
6 setosa 2 Sepal Width 3
7 setosa 2 Petal Length 1.4
8 setosa 2 Petal Width 0.2
9 setosa 3 Sepal Length 4.7
10 setosa 3 Sepal Width 3.2
# … with 590 more rows
iris.tbl %>% select(1:3)
iris.tbl %>% select(Part,obs)
iris.tbl %>% select(Part:obs)
iris.tbl %>% select(-Part,-Metric)
iris %>% select(starts_with("Petal"))
iris %>% pull(Sepal.Length)
[1] 5.1 4.9 4.7 4.6 5.0 5.4 4.6 5.0 4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1 5.7
[20] 5.1 5.4 5.1 4.6 5.1 4.8 5.0 5.0 5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.0 5.5 4.9
[39] 4.4 5.1 5.0 4.5 4.4 5.0 5.1 4.8 5.1 4.6 5.3 5.0 7.0 6.4 6.9 5.5 6.5 5.7 6.3
[58] 4.9 6.6 5.2 5.0 5.9 6.0 6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1 6.3 6.1 6.4 6.6
[77] 6.8 6.7 6.0 5.7 5.5 5.5 5.8 6.0 5.4 6.0 6.7 6.3 5.6 5.5 5.5 6.1 5.8 5.0 5.6
[96] 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3 6.7 7.2 6.5 6.4 6.8 5.7
[115] 5.8 6.4 6.5 7.7 7.7 6.0 6.9 5.6 7.7 6.3 6.7 7.2 6.2 6.1 6.4 7.2 7.4 7.9 6.4
[134] 6.3 6.1 7.7 6.3 6.4 6.0 6.9 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
iris %>%
mutate(Petal.Length =
scales::percent(Petal.Length/max(Petal.Length))
)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <chr> <dbl> <fct>
## 1 5.1 3.5 20.3% 0.2 setosa
## 2 4.9 3 20.3% 0.2 setosa
## 3 4.7 3.2 18.8% 0.2 setosa
## 4 4.6 3.1 21.7% 0.2 setosa
## 5 5 3.6 20.3% 0.2 setosa
## 6 5.4 3.9 24.6% 0.4 setosa
## 7 4.6 3.4 20.3% 0.3 setosa
## 8 5 3.4 21.7% 0.2 setosa
## 9 4.4 2.9 20.3% 0.2 setosa
## 10 4.9 3.1 21.7% 0.1 setosa
## # … with 140 more rows
iris %>%
#Case 1
mutate(Size = dplyr::case_when(
Petal.Length >= 4 & Petal.Width >= 1.3 ~ "Big",
Petal.Length < 4 & Petal.Width < 1.3 ~ "Small"
)
) %>%
#Case 2
mutate(Ratio_Petal_Sepal_width = Petal.Width/Sepal.Width)
# A tibble: 150 × 7
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Size Ratio_Petal_Sepa…
<dbl> <dbl> <dbl> <dbl> <fct> <chr> <dbl>
1 5.1 3.5 1.4 0.2 setosa Small 0.0571
2 4.9 3 1.4 0.2 setosa Small 0.0667
3 4.7 3.2 1.3 0.2 setosa Small 0.0625
4 4.6 3.1 1.5 0.2 setosa Small 0.0645
5 5 3.6 1.4 0.2 setosa Small 0.0556
6 5.4 3.9 1.7 0.4 setosa Small 0.103
7 4.6 3.4 1.4 0.3 setosa Small 0.0882
8 5 3.4 1.5 0.2 setosa Small 0.0588
9 4.4 2.9 1.4 0.2 setosa Small 0.0690
10 4.9 3.1 1.5 0.1 setosa Small 0.0323
# … with 140 more rows
other functions:
-
dplyr::mutate_all()
: Apply a function to every columns. -
dplyr::mutate_at()
: Apply a function to specific column.
iris %>% arrange(Petal.Length) %>% slice(1:5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 4.6 3.6 1.0 0.2 setosa
2 4.3 3.0 1.1 0.1 setosa
3 5.8 4.0 1.2 0.2 setosa
4 5.0 3.2 1.2 0.2 setosa
5 4.7 3.2 1.3 0.2 setosa
Use desc()
to order by high to low.
iris %>% filter(Sepal.Length < 7) %>% slice(1:5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
iris %>% filter(Species == "versicolor") %>% slice(1:5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 7.0 3.2 4.7 1.4 versicolor
2 6.4 3.2 4.5 1.5 versicolor
3 6.9 3.1 4.9 1.5 versicolor
4 5.5 2.3 4.0 1.3 versicolor
5 6.5 2.8 4.6 1.5 versicolor
iris %>%
filter(Sepal.Length < 7) %>%
filter(Species == "setosa") %>% slice(1:5)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
iris %>% summarise(mean = mean(Petal.Length), sd = sd(Petal.Length))
mean sd
1 3.758 1.765298
iris %>% select(-Species) %>%summarise_all(mean)
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.843333 3.057333 3.758 1.199333
iris %>% summarise(n = n())
n
1 150
iris %>% group_by(Species) %>% summarise(mean = mean(Petal.Length), sd = sd(Petal.Length))
# A tibble: 3 × 3
Species mean sd
<fct> <dbl> <dbl>
1 setosa 1.46 0.174
2 versicolor 4.26 0.470
3 virginica 5.55 0.552
iris %>% group_by(Species) %>% summarise(n = n())
# A tibble: 3 × 2
Species n
<fct> <int>
1 setosa 50
2 versicolor 50
3 virginica 50
iris %>% group_by(Species) %>% tally()
iris %>% count(Species)
# A tibble: 3 × 2
Species n
<fct> <int>
1 setosa 50
2 versicolor 50
3 virginica 50
iris %>% group_by(Species) %>% filter(Petal.Length >= max(Petal.Length))
# A tibble: 4 × 5
# Groups: Species [3]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 4.8 3.4 1.9 0.2 setosa
2 5.1 3.8 1.9 0.4 setosa
3 6 2.7 5.1 1.6 versicolor
4 7.7 2.6 6.9 2.3 virginica
iris %>% group_by(Species) %>% top_n(1,Petal.Length)
# A tibble: 4 × 5
# Groups: Species [3]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 4.8 3.4 1.9 0.2 setosa
2 5.1 3.8 1.9 0.4 setosa
3 6 2.7 5.1 1.6 versicolor
4 7.7 2.6 6.9 2.3 virginica
iris.meta <- tibble(
Species = factor(c("setosa","versicolor","virginica")),
Colony = c("A","A","B") ,
Ploidy = c("diploid","hexaploid","tetraploid"),
`Common name` = c("Beachhead iris","Harlequin blueflag","Virginia iris")
)
Inner join: return all rows from x
where there are matching values in y
, and all columns from x
and y
. If there are multiple matches between x
and y
, all combination of the matches are returned.
iris.meta.nosetosa <- iris.meta %>% filter(Species !="setosa")
iris %>% inner_join(iris.meta.nosetosa,by = "Species")
# A tibble: 100 × 8
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Colony Ploidy
<dbl> <dbl> <dbl> <dbl> <fct> <chr> <chr>
1 7 3.2 4.7 1.4 versicolor A hexaploid
2 6.4 3.2 4.5 1.5 versicolor A hexaploid
3 6.9 3.1 4.9 1.5 versicolor A hexaploid
4 5.5 2.3 4 1.3 versicolor A hexaploid
5 6.5 2.8 4.6 1.5 versicolor A hexaploid
6 5.7 2.8 4.5 1.3 versicolor A hexaploid
7 6.3 3.3 4.7 1.6 versicolor A hexaploid
8 4.9 2.4 3.3 1 versicolor A hexaploid
9 6.6 2.9 4.6 1.3 versicolor A hexaploid
10 5.2 2.7 3.9 1.4 versicolor A hexaploid
- A left join keeps all observations in
x
. - A right join keeps all observations in
y
. - A full join keeps all observations in
x
andy
.
left join:
iris.meta.nosetosa <- iris.meta %>% filter(Species !="setosa")
iris %>% left_join(iris.meta.nosetosa,by = "Species")
# A tibble: 150 × 8
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Colony Ploidy
<dbl> <dbl> <dbl> <dbl> <fct> <chr> <chr>
1 5.1 3.5 1.4 0.2 setosa NA NA
2 4.9 3 1.4 0.2 setosa NA NA
3 4.7 3.2 1.3 0.2 setosa NA NA
4 4.6 3.1 1.5 0.2 setosa NA NA
5 5 3.6 1.4 0.2 setosa NA NA
6 5.4 3.9 1.7 0.4 setosa NA NA
7 4.6 3.4 1.4 0.3 setosa NA NA
8 5 3.4 1.5 0.2 setosa NA NA
9 4.4 2.9 1.4 0.2 setosa NA NA
10 4.9 3.1 1.5 0.1 setosa NA NA
# … with 140 more rows, and 1 more variable: `Common name` <chr>
right join:
iris.nosetosa <- iris %>% filter(Species !="setosa")
iris.nosetosa %>% right_join(iris.meta,by = "Species")%>% tail()
# A tibble: 6 × 8
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Colony Ploidy
<dbl> <dbl> <dbl> <dbl> <fct> <chr> <chr>
1 6.7 3 5.2 2.3 virginica B tetraploid
2 6.3 2.5 5 1.9 virginica B tetraploid
3 6.5 3 5.2 2 virginica B tetraploid
4 6.2 3.4 5.4 2.3 virginica B tetraploid
5 5.9 3 5.1 1.8 virginica B tetraploid
6 NA NA NA NA setosa A diploid
# … with 1 more variable: `Common name` <chr>
full join:
iris.meta.nosetosa <- iris.meta %>% filter(Species !="setosa")
iris %>% filter(Species != "virginica") %>% full_join(iris.meta.nosetosa,by = "Species") %>% group_by(Species) %>% slice(1:5)
# A tibble: 11 × 8
# Groups: Species [3]
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Colony Ploidy
<dbl> <dbl> <dbl> <dbl> <fct> <chr> <chr>
1 5.1 3.5 1.4 0.2 setosa NA NA
2 4.9 3 1.4 0.2 setosa NA NA
3 4.7 3.2 1.3 0.2 setosa NA NA
4 4.6 3.1 1.5 0.2 setosa NA NA
5 5 3.6 1.4 0.2 setosa NA NA
6 7 3.2 4.7 1.4 versicolor A hexaploid
7 6.4 3.2 4.5 1.5 versicolor A hexaploid
8 6.9 3.1 4.9 1.5 versicolor A hexaploid
9 5.5 2.3 4 1.3 versicolor A hexaploid
10 6.5 2.8 4.6 1.5 versicolor A hexaploid
11 NA NA NA NA virginica B tetraploid
# … with 1 more variable: `Common name` <chr>
-
semi_join(x, y)
keeps all observations inx
that have a match iny
.
-
anti_join(x, y)
drops all observations inx
that have a match iny
.
treatments <-
tibble(
treatment_a = sample(10:25,size = 3,replace = T),
treatment_b = sample(10:25,size = 3,replace = T),
treatment_c = sample(10:25,size = 3,replace = T),
person = c("John Smith","Jane Doe","Mary Johnson")
) %>% pivot_longer(-person,names_to = "treatment",values_to = "result")
treatment.meta <- tibble(
First_Name = c("John","Jane"),
Last_Name = c("Smith","Doe"),
age = sample(20:80,size = 2),
sex = c("M","F"),
adress = c("221B Baker Street","57 Rue de Varenne")
)
treatments %>%
separate(person,into = c("First_Name","Last_Name"),sep=" ") %>%
semi_join(treatment.meta,by = c("First_Name","Last_Name"))
# A tibble: 6 × 4
First_Name Last_Name treatment result
<chr> <chr> <chr> <int>
1 John Smith treatment_a 20
2 John Smith treatment_b 21
3 John Smith treatment_c 14
4 Jane Doe treatment_a 19
5 Jane Doe treatment_b 13
6 Jane Doe treatment_c 21