Basic samples for R - Masa-Ochi/r-samples GitHub Wiki
scale(x, center = min(x), scale = max(x)-min(x))
scale(x, center = min(x), scale = max(x)-min(x))
ggplot(data = <DATA>) + <GEOM_FUNCTION>( mapping = aes(<MAPPINGS>), stat = <STAT>, position = <POSITION> ) + <COORDINATE_FUNCTION> + <FACET_FUNCTION>
ggplot( data = < DATA >) + < GEOM_FUNCTION >( mapping = aes( < MAPPINGS >))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, color = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, size = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, alpha = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy, shape = class))
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy), color = "blue")
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy)) + facet_wrap( ~ class, nrow = 2)
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy)) + facet_grid( .~ class)
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy)) + facet_grid( drv ~ cyl)
ggplot( data = mpg) + geom_smooth( mapping = aes( x = displ, y = hwy, linetype = drv))
ggplot( data = mpg, mapping = aes( x = displ, y = hwy)) + geom_point( mapping = aes( color = class)) + geom_smooth()
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut))
ggplot( data = demo) + geom_bar( mapping = aes( x = a, y = b), stat = "identity" )
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, y = ..prop.., group = 1) )
ggplot( data = diamonds) + stat_summary( mapping = aes( x = cut, y = depth), fun.ymin = min, fun.ymax = max, fun.y = median )
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, color = cut))
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = clarity))
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = clarity), position = "dodge" )
ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = clarity), position = "fill" )
ggplot( data = mpg) + geom_point( mapping = aes( x = displ, y = hwy), position = "jitter" )
ggplot(data=diamonds) + geom_freqpoly(mapping = aes( x = carat, color = cut), binwidth = 0.1)
ggplot( data = mpg, mapping = aes( x = class, y = hwy)) + geom_boxplot()
ggplot( data = mpg, mapping = aes( x = class, y = hwy)) + geom_boxplot() + coord_flip()
bar <- ggplot( data = diamonds) + geom_bar( mapping = aes( x = cut, fill = cut), show.legend = FALSE, width = 1 ) + theme( aspect.ratio = 1) + labs( x = NULL, y = NULL) bar + coord_flip() bar + coord_polar()
ggplot( data = mpg) + geom_boxplot( mapping = aes( x = reorder(class, hwy, FUN = median), y = hwy ) )
diamonds %>% count( color, cut) %>% ggplot( mapping = aes( x = color, y = cut)) + geom_tile( mapping = aes( fill = n))
library(nycflights13) library(tidyverse)
int
stands for integers.dbl
stands for doubles, or real numbers.chr
stands for character vectors, or strings.dttm
stands for date-times (a date + a time).lgl
stands for logical, vectors that contain only TRUE or FALSE.fctr
stands for factors, which R uses to represent categorical variables with fixed possible values.date
stands for dates.
filter()
Pick observations by their values.
filter( flights, month %in% c( 11, 12))
filter( flights, !( arr_delay > 120 | dep_delay > 120))
filter()
only includes rows where the condition is TRUE; it excludes both FALSE and NA values. If you want to preserve missing values, ask for them explicitly
arrange()
Reorder the rows.
arrange( flights, year, month, desc(day))
select()
Pick variables by their names.
select(flights, year, day, everything())
starts_with("abc")
ends_with("xyz")
contains("ijk")
matches("(.)\\1")
mutate()
Create new variables with functions of existing variables.transmute()
Create new variables and keep only them.
transmute(flights, dep_time, hour = dep_time %/% 100, minute = dep_time %% 100)
summarize()
Collapse many values down to a single summary.
summarize(flights, delay = mean(dep_delay, na.rm = TRUE))
readr uses UTF-8 everywhere:
readr()
parse_integer()
parse_double(" 1,23", locale = locale( decimal_mark = ","))
-
parse_number( "123.456.789", locale = locale( grouping_mark = ".") )
It ignores non-numeric characters before and after the number.
parse_logical()
-
parse_datetime()
parse_date()
parse_time()
`parse_date("01/02/15", "%m/%d/%y")
- `parse_character( x2, locale = locale( encoding = "Shift-JIS"))
guess_encoding( charToRaw( x1))
Helping you figure out encoding. (Input accepts file or raw vector) parse_factor()
problems(x)
stop_for_problems(x)
-
read_csv()
challenge <- read_csv(readr_example("challenge.csv"))
challenge <- read_csv( readr_example("challenge.csv"), col_types = cols( x = col_double(), y = col_character() ) )
challenge2 <- read_csv(readr_example("challenge.csv"), col_types = cols(.default = col_character()) )
- CSV
write_csv(challenge, "challenge.csv")
- Excel
write_write_excel_csv(challenge, "challenge.csv")
- RDS
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
-
Gathering
gather( '1999', '2000', key = "year", value = "cases")
-
Spreading
spread(table2, key = type, value = count)
-
Separate
-
table3 %>% separate(rate, into = c("cases", "population"), sep = "/", convert = TRUE)
By default, separate() will split values wherever it sees a non-alphanumeric character. It leaves the type of the column as is.
-
table3 %>% separate( year, into = c(" century", "year"), sep = 2)
It will interpret the integers as positions to split at.
-
-
Unite
-
table5 %>% unite(new, century, year)
The default will place an underscore (_) between the values from different columns.
table5 %>% unite(new, century, year, sep = "")
-
-
Complete
stocks %>% complete(year, qtr)
-
complete()
takes a set of columns, and finds all unique combinations.It then ensures the original dataset contains all those values, filling in explicit NAs where necessary.
-
Fill
-
treatment %>% fill(person)
It takes a set of columns where you want missing values to be replaced by the most recent nonmissing value.
-
-
guess_parser()
guess_parser("2010-10-01") #>[1] "date"
str(parse_guess(" 2010-10-10")) #>Date[1:1], format: "2010-10-10"
The heuristic tries each of the following types, stopping when it finds a match:
- logical Contains only “F”, “T”, “FALSE”, or “TRUE”.
- integer Contains only numeric characters (and -).
- double Contains only valid doubles (including numbers like 4.5e-5).
- number Contains valid doubles with the grouping mark inside.
- time Matches the default time_format.
- date Matches the default date_format.
- date-time Any ISO8601 date.
If none of these rules apply, then the column will stay as a vector of strings.
-
by = NULL
Uses all variables that appear in both tables. -
by = "x"
Uses only declared common variables. -
by = c("a" = "b")
This will match variable a in table x to variable b in table y. - base::merge()
dplyr merge inner_join(x, y) merge(x, y) left_join(x, y) merge(x, y, all.x = TRUE) right_join(x, y) merge(x, y, all.y = TRUE) full_join(x, y) merge(x, y, all.x = TRUE, all.y = TRUE) -
semi_join(x, y)
keeps all observations in x that have a match in y. -
anti_join(x, y)
drops all observations in x that have a match in y.flights %>% anti_join(planes, by = "tailnum") %>% count(tailnum, sort = TRUE)
-
intersect(x, y)
Return only observations in both x and y. -
union(x, y)
Return unique observations in x and y. -
setdiff(x, y)
Return observations in x, but not in y.
library(stringr)
str_length("abc")
-
str_c("x", "y", "z", sep=", ")
x <- c("abc", NA) str_c("|-", str_replace_na(x), "-|") #>[1] "|-abc-|" "|-NA-|"
str_c( "Good ", time_of_day, " ", name, if (birthday) " and HAPPY BIRTHDAY", "." )
str_sub(x, 1, 3)
str_to_lower(x)
-
str_view(x, ".a.")
-
.
match any single word. -
^
match the start of the string. -
$
tmatch the end of the string. -
\d
matches any digit. -
\s
matches any whitespace (e.g., space, tab, newline). -
[abc]
matches a, b, or c. -
[^abc]
matches anything except a, b, or c. -
(a|b)
matches a or b -
?
0 or 1 -
+
1 or more -
*
0 or more -
{n}
exactly n -
{n,}
n or more -
{,m}
at most m -
{n,m}
between n and m
-
str_view_all(x, ".a.")
str_detect(x, ".a.")
str_subset(x, ".a.")
str_count(x, ".a.")
-
str_extract(x, ".a.")
only extracts the first match. -
str_extract_all(x, ".a.")
returns a list. -
str_match(x, "a.")
str_extract()
gives us the complete match;str_match()
gives each individual component. str_match_all(x, "a.")
-
str_replace(x, "a.", "--")
replace only the first match. -
str_replace_all(x, "a.", "--")
replace all matches.str_replace_all()
can perform multiple replacements by supplying a named vector:x <- c("1 house", "2 cars", "3 people") str_replace_all( x, c("1" = "one", "2" = "two", "3" = "three")) #>[1] "one house" "two cars" "three people"
str_split(sentences, " ")
str_locale(x, "a.")
-
regex()
bananas <- c(" banana", "Banana", "BANANA") str_view(bananas, regex("banana", ignore_case = TRUE))
x <- "Line 1\nLine 2\nLine 3" str_extract_all(x, regex("^Line", multiline = TRUE))[[1]] #>[1] "Line" "Line" "Line"
phone <- regex(" \\(? # optional opening parens (\\ d{ 3}) # area code
comments = TRUE
allows you to use comments and white space to make complex regular expressions more understandable. Spaces are ignored, as is everything after #. To match a literal space, you’ll need to escape it: "\ ".dotall = TRUE
allows.
to match everything, including\n
.
library(lubridate)
today()
now()
ymd("2017-01-31", tz="UTC")
mdy("2017-01-31")
ymd_hms("2017-01-31 20:11:59")
make_datetime(year, month, day, time %/% 100, time %% 100)
- year()
- month(datetime, label = TRUE)
- mday()
- yday()
- wday(datetime, label = TRUE, abbr = FALSE)
- hour()
- minute()
- second()
Draw a line of the average departure delay by minute within the hour.
flights_dt %>%
mutate( minute = minute( dep_time)) %>%
group_by( minute) %>%
summarize(
avg_delay = mean( arr_delay, na.rm = TRUE), n = n()
) %>%
ggplot( aes( minute, avg_delay)) +
geom_line()
-
floor_date()
Round down round_date()
-
ceiling_date()
Round up
Plot the number of flights per week
flights_dt %>%
count(week = floor_date(dep_time, "week")) %>%
ggplot(aes( week, n)) +
geom_line()
hour( datetime) <- hour( datetime) + 1
update( datetime, year = 2020, month = 2, mday = 2, hour = 2)
Show the distribution of flights across the course of the day for every day of the year.
flights_dt %>%
mutate(dep_hour = update(dep_time, yday = 1)) %>%
ggplot(aes( dep_hour)) +
geom_freqpoly(binwidth = 300)
tomorrow <- today() + ddays(1)
last_year <- today() - dyears(1)
However, because durations represent an exact number of seconds, sometimes you might get an unexpected result:
one_pm <- ymd_hms( "2016-03-12 13: 00: 00", tz = "America/ New_York" ) one_pm #>[1] "2016-03-12 13: 00: 00 EST" one_pm + ddays(1) #>[1] "2016-03-13 14: 00: 00 EDT"
Why is one day after 1 p.m. on March 12, 2 p.m. on March 13?! If you look carefully at the date you might also notice that the time zones have changed. Because of DST, March 12 only has 23 hours, so if we add a full day’s worth of seconds we end up with a different time. In this case, we can use
days()
.one_pm + days(1) #>[1] "2016-03-13 13:00:00 EDT"
tomorrow <- today() + days(1)
last_year <- today() - years(1)
one_pm <- ymd_hms( "2016-03-12 13:00:00", tz = "America/New_York" )
# Add physical datetime (24 hours).
n_d <- one_pm + ddays(1)
n_d
#>[1] "2016-03-13 14:00:00 EDT"
# Add human datetime (1 day).
n_p <- one_pm + days(1)
n_p
#>[1] "2016-03-13 13:00:00 EDT"
(n_d-one_pm)
#> Time difference of 1 days
(n_p-one_pm)
#> Time difference of 23 hours
(n_p - one_pm) / dhours(1) #Calculated physical time difference (duration).
#>[1] 23
(n_p %--% one_pm) / hours(1) #Calculated estimated time difference (period).
#>[1] -24
-
with_tz( x4, tzone = "Australia/ Lord_Howe")
Keep the instant in time the same, and change how it’s displayed.
x4a <- with_tz( x4, tzone = "Australia/ Lord_Howe")
x4a
#> [1] "2015-06-02 02: 30: 00 LHST"
#> [2] "2015-06-02 02: 30: 00 LHST"
#> [3] "2015-06-02 02: 30: 00 LHST"
x4a - x4
#> Time differences in secs
#> [1] 0 0 0
-
force_tz( x4, tzone = "Australia/ Lord_Howe")
Change the underlying instant in time.
x4b <- force_tz( x4, tzone = "Australia/ Lord_Howe")
x4b
#> [1] "2015-06-01 09: 00: 00 LHST"
#> [2] "2015-06-01 09: 00: 00 LHST"
#> [3] "2015-06-01 09: 00: 00 LHST"
x4b - x4
#> Time differences in hours
#> [1] -17.5 -17.5 -17.5
seq(1, 10, length.out = 5)
filter(mpg, year==1999 & cyl==4)
View(nycflights13:: flights)
tibble(x = c(1, NA, 3))
cumsum(1:10)
-
median(x)
Value where 50% of x is above it, and 50% is below it. quantile( x, 0.25)
class(as.data.frame( tb))
-
who %>% mutate(r=row_number())
Add row number. rownames(x)
colnames(x)
-
apropos("replace")
searches all objects available from the global environment. -
dir(pattern = "\\. Rmd $")
lists all the files in a directory. -
Sys.timezone()
Show current timezone. -
OlsonNames()
Show the complete list of all time zone names
Alt-Shift-K
ts[!complete.cases(ts),]
install.packages("rversions") sessionInfo()
sqrt(2) ^ 2 == 2 [1]False near(sqrt(2) ^ 2, 2) [1]True
Computers use finite precision arithmetic (they obviously can’t store an infinite number of digits!) so remember that every number you see is an approximation. Instead of relying on = =, use near():
NA + 10 [1] NA NA = = NA [1] NA
delays <- flights %>%
group_by( dest) %>%
summarize(
count = n(),
dist = mean( distance, na.rm = TRUE),
delay = mean( arr_delay, na.rm = TRUE)
) %>%
filter( count > 20, dest != "HNL")
daily <- group_by( flights, year, month, day)
(per_day <- summarize( daily, flights = n()))
(per_month <- summarize( per_day, flights = sum( flights)))
(per_year <- summarize( per_month, flights = sum( flights)))
flights_sml %>%
group_by( year, month, day) %>%
filter( rank( desc( arr_delay)) < 10)
popular_dests %>%
filter( arr_delay > 0) %>%
mutate( prop_delay = arr_delay / sum( arr_delay)) %>%
select( year:day, dest, arr_delay, prop_delay)
ggplot( diamonds) +
geom_histogram( mapping = aes( x = y), binwidth = 0.5) +
coord_cartesian( ylim = c( 0, 50))
diamonds2 <- diamonds %>%
mutate( y = ifelse( y < 3 | y > 20, NA, y))
who %>%
gather(code, value, new_sp_m014: newrel_f65, na.rm = TRUE) %>%
mutate(code = stringr::str_replace(code, "newrel", "new_rel") ) %>%
separate(code, c("new", "var", "sexage")) %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)
noun <- "(a|the) ([^ ]+)"
has_noun <- stringr::sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article", "noun"), "(a|the) ([^ ]+)", remove = FALSE
)
fct_reorder() reorder factor values.
relig <- gss_cat %>%
group_by(relig) %>%
summarize(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
relig %>%
mutate(relig = fct_reorder(relig, tvhours)) %>%
ggplot(aes(tvhours, relig)) +
geom_point()
fct_relevel(a, b) order value b to the front of the line.
rincome <- gss_cat %>%
group_by(rincome) %>%
summarize(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(rincome, aes(age, fct_relevel(rincome, "Not applicable"))) +
geom_point()
fct_infreq() reorder by frequency, and fct_rev() make it inverse.
gss_cat %>%
mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
ggplot(aes(marital)) +
geom_bar()
fct_recode() changes levels.
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind, near rep",
"Independent, near dem" = "Ind, near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat" )
) %>%
count(partyid)
fct_collapse() collapses levels.
gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind, near rep", "Independent", "Ind, near dem"),
dem = c("Not str democrat", "Strong democrat") )
) %>%
count(partyid)
fct_lump() lumps small number levels together.
gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count( relig, sort = TRUE) %>%
print(n = Inf)
Plot a number of flights through timeline.
make_datetime_100 <-
function( year, month, day, time) {
make_datetime( year, month, day, time %/% 100, time %% 100)
}
flights_dt <- flights %>%
filter(! is.na( dep_time), !is.na( arr_time)) %>%
mutate(
dep_time = make_datetime_100( year, month, day, dep_time),
arr_time = make_datetime_100( year, month, day, arr_time),
sched_dep_time = make_datetime_100(year, month, day, sched_dep_time ),
sched_arr_time = make_datetime_100( year, month, day, sched_arr_time )
) %>%
select( origin, dest, ends_with(" delay"), ends_with(" time"))
### Across the year...
flights_dt %>%
ggplot( aes( dep_time)) +
geom_freqpoly( binwidth = 86400) # 86400 seconds = 1 day
### Within a day...
flights_dt %>%
filter( dep_time < ymd( 20130102)) %>%
ggplot( aes( dep_time)) +
geom_freqpoly( binwidth = 600) # 600 s = 10 minutes
plot bar chart with date x axis
loss_tibble %>%
ggplot +
geom_bar( mapping = aes(x = date, y = value), stat = "identity") +
scale_x_date(date_breaks = "1 month", date_labels = "%b %Y")