Normalization

var: 1, mean: 0

scale(x, center = min(x), scale = max(x)-min(x))

btw 0-1

scale(x, center = min(x), scale = max(x)-min(x))

Visualization

Template

ggplot(data = <DATA>) + 
  <GEOM_FUNCTION>( 
    mapping = aes(<MAPPINGS>), 
    stat = <STAT>, 
    position = <POSITION> 
  ) + 
  <COORDINATE_FUNCTION> + 
  <FACET_FUNCTION>

Plot

ggplot( data = < DATA >) + < GEOM_FUNCTION >( mapping = aes( < MAPPINGS >))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, color = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, size = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, alpha = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, shape = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy), color = "blue")

Facet

ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy)) + facet_wrap( ~ class, nrow = 2)
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy)) + facet_grid( .~ class)
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy)) + facet_grid( drv ~ cyl)

Smooth

ggplot( data = mpg) + geom_smooth( mapping = aes( x = displ, y = hwy, linetype = drv))
ggplot( data = mpg, mapping = aes( x = displ, y = hwy)) + geom_point( mapping = aes( color = class)) + geom_smooth()

Bar chart

ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut))
ggplot( data = demo) + geom_bar( mapping = aes( x = a, y = b), stat = "identity" )
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, y = ..prop.., group = 1) )
ggplot( data = diamonds) + stat_summary( mapping = aes( x = cut, y = depth), fun.ymin = min, fun.ymax = max, fun.y = median )
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, color = cut))
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = clarity))
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = clarity), position = "dodge" )
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = clarity), position = "fill" )
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy), position = "jitter" )

Freqpoly

ggplot(data=diamonds) + geom_freqpoly(mapping = aes( x = carat, color = cut), binwidth = 0.1)

Coord flip

ggplot( data = mpg, mapping = aes( x = class, y = hwy)) + geom_boxplot()
ggplot( data = mpg, mapping = aes( x = class, y = hwy)) + geom_boxplot() + coord_flip()

bar <- ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = cut), show.legend = FALSE, width = 1 ) + theme( aspect.ratio = 1) + labs( x = NULL, y = NULL) 
bar + coord_flip() 
bar + coord_polar()

Box plot

ggplot( data = mpg) + 
  geom_boxplot( mapping = aes(
    x = reorder(class, hwy, FUN = median), 
    y = hwy 
  )
)

Tile

diamonds %>% 
  count( color, cut) %>% 
  ggplot( mapping = aes( x = color, y = cut)) + 
    geom_tile( mapping = aes( fill = n))

Data transformation

Setup

library(nycflights13)
library(tidyverse)

Types

int stands for integers.

dbl stands for doubles, or real numbers.

chr stands for character vectors, or strings.

dttm stands for date-times (a date + a time).

lgl stands for logical, vectors that contain only TRUE or FALSE.

fctr stands for factors, which R uses to represent categorical variables with fixed possible values.

date stands for dates.

Data manipulator

filter() Pick observations by their values.

filter( flights, month %in% c( 11, 12))

filter( flights, !( arr_delay > 120 | dep_delay > 120))

filter() only includes rows where the condition is TRUE; it excludes both FALSE and NA values. If you want to preserve missing values, ask for them explicitly

arrange() Reorder the rows.

arrange( flights, year, month, desc(day))

select() Pick variables by their names.

select(flights, year, day, everything())

starts_with("abc")

ends_with("xyz")

contains("ijk")

matches("(.)\\1")

mutate() Create new variables with functions of existing variables.

transmute() Create new variables and keep only them.

transmute(flights, dep_time, hour = dep_time %/% 100, minute = dep_time %% 100)

summarize() Collapse many values down to a single summary.

summarize(flights, delay = mean(dep_delay, na.rm = TRUE))

dplyr

Parse a vector

readr uses UTF-8 everywhere:

readr()
parse_integer()
parse_double(" 1,23", locale = locale( decimal_mark = ","))
parse_number( "123.456.789", locale = locale( grouping_mark = ".") )

It ignores non-numeric characters before and after the number.
parse_logical()
parse_datetime() parse_date() parse_time()

`parse_date("01/02/15", "%m/%d/%y")
`parse_character( x2, locale = locale( encoding = "Shift-JIS"))

guess_encoding( charToRaw( x1)) Helping you figure out encoding. (Input accepts file or raw vector)
parse_factor()
problems(x)
stop_for_problems(x)

read_csv()

challenge <- read_csv(readr_example("challenge.csv"))

challenge <- read_csv( 
  readr_example("challenge.csv"), 
  col_types = cols( 
    x = col_double(), 
    y = col_character() 
  ) 
)

challenge2 <- read_csv(readr_example("challenge.csv"), 
  col_types = cols(.default = col_character()) 
)

Writing a File

CSV
- write_csv(challenge, "challenge.csv")
Excel
- write_write_excel_csv(challenge, "challenge.csv")
RDS
- write_rds(challenge, "challenge.rds")
- read_rds("challenge.rds")

Modify data layout

Gathering
- gather( '1999', '2000', key = "year", value = "cases")
Spreading
- spread(table2, key = type, value = count)
Separate
- table3 %>% separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE)
  
  By default, separate() will split values wherever it sees a non-alphanumeric character. It leaves the type of the column as is.
- table3 %>% separate( year, into = c(" century", "year"), sep = 2)
  
  It will interpret the integers as positions to split at.
Unite
- table5 %>% unite(new, century, year)
  
  The default will place an underscore (_) between the values from different columns.
- table5 %>% unite(new, century, year, sep = "")
Complete
- stocks %>% complete(year, qtr)
- complete() takes a set of columns, and finds all unique combinations.
  
  It then ensures the original dataset contains all those values, filling in explicit NAs where necessary.
Fill
- treatment %>% fill(person)
  
  It takes a set of columns where you want missing values to be replaced by the most recent nonmissing value.
guess_parser()
```
guess_parser("2010-10-01") 
#>[1] "date"
```
```
str(parse_guess(" 2010-10-10"))
#>Date[1:1], format: "2010-10-10"
```
The heuristic tries each of the following types, stopping when it finds a match:
- logical Contains only “F”, “T”, “FALSE”, or “TRUE”.
- integer Contains only numeric characters (and -).
- double Contains only valid doubles (including numbers like 4.5e-5).
- number Contains valid doubles with the grouping mark inside.
- time Matches the default time_format.
- date Matches the default date_format.
- date-time Any ISO8601 date.
If none of these rules apply, then the column will stay as a vector of strings.

Join tables - Define key columns

by = NULL Uses all variables that appear in both tables.
by = "x" Uses only declared common variables.
by = c("a" = "b") This will match variable a in table x to variable b in table y.
base::merge()

dplyr merge

inner_join(x, y) merge(x, y)

left_join(x, y) merge(x, y, all.x = TRUE)

right_join(x, y) merge(x, y, all.y = TRUE)

full_join(x, y) merge(x, y, all.x = TRUE, all.y = TRUE)
semi_join(x, y) keeps all observations in x that have a match in y.

dplyr	merge
inner_join(x, y)	merge(x, y)
left_join(x, y)	merge(x, y, all.x = TRUE)
right_join(x, y)	merge(x, y, all.y = TRUE)
full_join(x, y)	merge(x, y, all.x = TRUE, all.y = TRUE)

anti_join(x, y) drops all observations in x that have a match in y.

flights %>% 
anti_join(planes, by = "tailnum") %>% 
count(tailnum, sort = TRUE)

intersect(x, y) Return only observations in both x and y.
union(x, y) Return unique observations in x and y.
setdiff(x, y) Return observations in x, but not in y.

Stringr

library(stringr)

Customise string

str_length("abc")

str_c("x", "y", "z", sep=", ")

x <- c("abc", NA)
str_c("|-", str_replace_na(x), "-|") 
#>[1] "|-abc-|" "|-NA-|"

str_c( 
  "Good ", time_of_day, " ", name, 
  if (birthday) " and HAPPY BIRTHDAY", "."
)

str_sub(x, 1, 3)
str_to_lower(x)

Utilize regexp(regular expression)

str_view(x, ".a.")
- . match any single word.
- ^ match the start of the string.
- $ tmatch the end of the string.
- \d matches any digit.
- \s matches any whitespace (e.g., space, tab, newline).
- [abc] matches a, b, or c.
- [^abc] matches anything except a, b, or c.
- (a|b) matches a or b
- ? 0 or 1
- + 1 or more
- * 0 or more
- {n} exactly n
- {n,} n or more
- {,m} at most m
- {n,m} between n and m
str_view_all(x, ".a.")
str_detect(x, ".a.")
str_subset(x, ".a.")
str_count(x, ".a.")
str_extract(x, ".a.") only extracts the first match.
str_extract_all(x, ".a.") returns a list.
str_match(x, "a.")

str_extract() gives us the complete match; str_match() gives each individual component.
str_match_all(x, "a.")
str_replace(x, "a.", "--") replace only the first match.

str_replace_all(x, "a.", "--") replace all matches.

str_replace_all() can perform multiple replacements by supplying a named vector:
x <- c("1 house", "2 cars", "3 people") 
str_replace_all( x, c("1" = "one", "2" = "two", "3" = "three")) 
#>[1] "one house" "two cars" "three people"

str_split(sentences, " ")
str_locale(x, "a.")

regex()

bananas <- c(" banana", "Banana", "BANANA")
str_view(bananas, regex("banana", ignore_case = TRUE))
x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]] 
#>[1] "Line" "Line" "Line"
phone <- regex(" 
\\(? # optional opening parens
(\\ d{ 3}) # area code 
comments = TRUE allows you to use comments and white space to make complex regular expressions more understandable. Spaces are ignored, as is everything after #. To match a literal space, you’ll need to escape it: "\ ". dotall = TRUE allows . to match everything, including \n.

Date, Time

library(lubridate)

today()
now()

From Strings

ymd("2017-01-31", tz="UTC")
mdy("2017-01-31")
ymd_hms("2017-01-31 20:11:59")
make_datetime(year, month, day, time %/% 100, time %% 100)

Date-time component

year()
month(datetime, label = TRUE)
mday()
yday()
wday(datetime, label = TRUE, abbr = FALSE)
hour()
minute()
second()

Draw a line of the average departure delay by minute within the hour.

flights_dt %>% 
  mutate( minute = minute( dep_time)) %>% 
  group_by( minute) %>% 
  summarize( 
    avg_delay = mean( arr_delay, na.rm = TRUE), n = n()
  ) %>% 
  ggplot( aes( minute, avg_delay)) + 
    geom_line()

Rounding

floor_date() Round down
round_date()
ceiling_date() Round up

Plot the number of flights per week

flights_dt %>% 
  count(week = floor_date(dep_time, "week")) %>% 
  ggplot(aes( week, n)) + 
    geom_line()

Set components & Calc datetime

hour( datetime) <- hour( datetime) + 1
update( datetime, year = 2020, month = 2, mday = 2, hour = 2)

Show the distribution of flights across the course of the day for every day of the year.

flights_dt %>% 
  mutate(dep_hour = update(dep_time, yday = 1)) %>% 
  ggplot(aes( dep_hour)) + 
    geom_freqpoly(binwidth = 300)

Durations (Physical time)

tomorrow <- today() + ddays(1)
last_year <- today() - dyears(1)

However, because durations represent an exact number of seconds, sometimes you might get an unexpected result:
one_pm <- ymd_hms( "2016-03-12 13: 00: 00", tz = "America/ New_York" ) 
one_pm 
#>[1] "2016-03-12 13: 00: 00 EST" 
one_pm + ddays(1) 
#>[1] "2016-03-13 14: 00: 00 EDT" 
Why is one day after 1 p.m. on March 12, 2 p.m. on March 13?! If you look carefully at the date you might also notice that the time zones have changed. Because of DST, March 12 only has 23 hours, so if we add a full day’s worth of seconds we end up with a different time. In this case, we can use days().
one_pm + days(1) 
#>[1] "2016-03-13 13:00:00 EDT"

Periods (Human time)

tomorrow <- today() + days(1)
last_year <- today() - years(1)

Intervals

one_pm <- ymd_hms( "2016-03-12 13:00:00", tz = "America/New_York" ) 

# Add physical datetime (24 hours).
n_d <- one_pm + ddays(1) 
n_d
#>[1] "2016-03-13 14:00:00 EDT"
# Add human datetime (1 day).
n_p <- one_pm + days(1) 
n_p
#>[1] "2016-03-13 13:00:00 EDT"

(n_d-one_pm)
#> Time difference of 1 days
(n_p-one_pm)
#> Time difference of 23 hours

(n_p - one_pm) / dhours(1)   #Calculated physical time difference (duration).
#>[1] 23
(n_p %--% one_pm) / hours(1) #Calculated estimated time difference (period).
#>[1] -24

Time zone

with_tz( x4, tzone = "Australia/ Lord_Howe") Keep the instant in time the same, and change how it’s displayed.

x4a <- with_tz( x4, tzone = "Australia/ Lord_Howe") 
x4a 
#> [1] "2015-06-02 02: 30: 00 LHST" 
#> [2] "2015-06-02 02: 30: 00 LHST" 
#> [3] "2015-06-02 02: 30: 00 LHST" 
x4a - x4 
#> Time differences in secs 
#> [1] 0 0 0

force_tz( x4, tzone = "Australia/ Lord_Howe") Change the underlying instant in time.

x4b <- force_tz( x4, tzone = "Australia/ Lord_Howe") 
x4b 
#> [1] "2015-06-01 09: 00: 00 LHST" 
#> [2] "2015-06-01 09: 00: 00 LHST" 
#> [3] "2015-06-01 09: 00: 00 LHST" 
x4b - x4 
#> Time differences in hours 
#> [1] -17.5 -17.5 -17.5

Basic functions

seq(1, 10, length.out = 5)
filter(mpg, year==1999 & cyl==4)
View(nycflights13:: flights)
tibble(x = c(1, NA, 3))
cumsum(1:10)
median(x) Value where 50% of x is above it, and 50% is below it.
quantile( x, 0.25)
class(as.data.frame( tb))
who %>% mutate(r=row_number()) Add row number.
rownames(x)
colnames(x)
apropos("replace") searches all objects available from the global environment.
dir(pattern = "\\. Rmd $") lists all the files in a directory.
Sys.timezone() Show current timezone.
OlsonNames() Show the complete list of all time zone names

Tips

To see shortcuts

Alt-Shift-K

Show records including N/A

ts[!complete.cases(ts),]

To check R version

install.packages("rversions")
sessionInfo()

"==" is different from "near()"

sqrt(2) ^ 2 == 2
[1]False
near(sqrt(2) ^ 2, 2)
[1]True
Computers use finite precision arithmetic (they obviously can’t store an infinite number of digits!) so remember that every number you see is an approximation. Instead of relying on = =, use near():

Any operation involving NA returns NA.

NA + 10
[1] NA
NA = = NA
[1] NA

Missing values are always sorted at the end

Sample codes

delays <- flights %>% 
  group_by( dest) %>% 
  summarize( 
    count = n(), 
    dist = mean( distance, na.rm = TRUE), 
    delay = mean( arr_delay, na.rm = TRUE) 
  ) %>% 
  filter( count > 20, dest != "HNL")

daily <- group_by( flights, year, month, day)
(per_day   <- summarize( daily, flights = n()))
(per_month <- summarize( per_day, flights = sum( flights)))
(per_year  <- summarize( per_month, flights = sum( flights)))

flights_sml %>% 
  group_by( year, month, day) %>% 
  filter( rank( desc( arr_delay)) < 10)

popular_dests %>% 
  filter( arr_delay > 0) %>% 
  mutate( prop_delay = arr_delay / sum( arr_delay)) %>% 
  select( year:day, dest, arr_delay, prop_delay)

ggplot( diamonds) + 
  geom_histogram( mapping = aes( x = y), binwidth = 0.5) + 
  coord_cartesian( ylim = c( 0, 50))

diamonds2 <- diamonds %>% 
  mutate( y = ifelse( y < 3 | y > 20, NA, y))

who %>% 
  gather(code, value, new_sp_m014: newrel_f65, na.rm = TRUE) %>% 
  mutate(code = stringr::str_replace(code, "newrel", "new_rel") ) %>% 
  separate(code, c("new", "var", "sexage")) %>% 
  select(-new, -iso2, -iso3) %>% 
  separate(sexage, c("sex", "age"), sep = 1)

noun <- "(a|the) ([^ ]+)" 
has_noun <- stringr::sentences %>% 
  str_subset(noun) %>% 
  head(10) 
has_noun %>% 
  str_extract(noun)

tibble(sentence = sentences) %>% 
  tidyr::extract( 
    sentence, c("article", "noun"), "(a|the) ([^ ]+)", remove = FALSE 
  )

fct_reorder() reorder factor values.

relig <- gss_cat %>% 
  group_by(relig) %>% 
  summarize( 
    age = mean(age, na.rm = TRUE), 
    tvhours = mean(tvhours, na.rm = TRUE), 
    n = n() 
  ) 
relig %>% 
  mutate(relig = fct_reorder(relig, tvhours)) %>% 
  ggplot(aes(tvhours, relig)) + 
    geom_point()

fct_relevel(a, b) order value b to the front of the line.

rincome <- gss_cat %>% 
  group_by(rincome) %>% 
  summarize(
    age = mean(age, na.rm = TRUE), 
    tvhours = mean(tvhours, na.rm = TRUE), 
    n = n() 
  )
ggplot(rincome, aes(age, fct_relevel(rincome, "Not applicable"))) + 
  geom_point()

fct_infreq() reorder by frequency, and fct_rev() make it inverse.

gss_cat %>% 
  mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>% 
  ggplot(aes(marital)) + 
    geom_bar()

fct_recode() changes levels.

gss_cat %>% 
  mutate(partyid = fct_recode(partyid, 
    "Republican, strong" = "Strong republican", 
    "Republican, weak" = "Not str republican", 
    "Independent, near rep" = "Ind, near rep", 
    "Independent, near dem" = "Ind, near dem", 
    "Democrat, weak" = "Not str democrat", 
    "Democrat, strong" = "Strong democrat" )
  ) %>% 
  count(partyid)

fct_collapse() collapses levels.

gss_cat %>% 
  mutate(partyid = fct_collapse(partyid, 
    other = c("No answer", "Don't know", "Other party"), 
    rep = c("Strong republican", "Not str republican"), 
    ind = c("Ind, near rep", "Independent", "Ind, near dem"), 
    dem = c("Not str democrat", "Strong democrat") )
  ) %>% 
  count(partyid)

fct_lump() lumps small number levels together.

gss_cat %>% 
  mutate(relig = fct_lump(relig, n = 10)) %>% 
  count( relig, sort = TRUE) %>% 
  print(n = Inf)

Plot a number of flights through timeline.

make_datetime_100 <- 
  function( year, month, day, time) { 
    make_datetime( year, month, day, time %/% 100, time %% 100) 
  } 
flights_dt <- flights %>% 
  filter(! is.na( dep_time), !is.na( arr_time)) %>% 
  mutate( 
    dep_time = make_datetime_100( year, month, day, dep_time), 
    arr_time = make_datetime_100( year, month, day, arr_time), 
    sched_dep_time = make_datetime_100(year, month, day, sched_dep_time ), 
    sched_arr_time = make_datetime_100( year, month, day, sched_arr_time ) 
  ) %>% 
  select( origin, dest, ends_with(" delay"), ends_with(" time"))
### Across the year...
flights_dt %>% 
  ggplot( aes( dep_time)) + 
    geom_freqpoly( binwidth = 86400) # 86400 seconds = 1 day
### Within a day...
flights_dt %>% 
  filter( dep_time < ymd( 20130102)) %>% 
  ggplot( aes( dep_time)) + 
    geom_freqpoly( binwidth = 600) # 600 s = 10 minutes

plot bar chart with date x axis

loss_tibble %>% 
  ggplot + 
    geom_bar( mapping = aes(x = date, y = value), stat = "identity") + 
    scale_x_date(date_breaks = "1 month", date_labels =  "%b %Y")