Data Distribution - littleclouds/R-for-everyone GitHub Wiki

BASIC FIRST

# Load the dslabs package and its `heights` data set.
library(dslabs)
data("heights")

to see names of variable in the data set

# Column names of the data set, then the class and the values of `sex`.
names(heights)
class(heights$sex)
print(heights$sex)

to check unique values in the variable

# Distinct values of a small example vector, and how many there are.
x <- c(3, 3, 3, 3, 4, 4, 2)
a <- unique(x)
length(a)

# Number of distinct values in the height column.
b <- unique(heights$height)
length(b)

To compute the frequency of each unique value of a categorical variable

# Frequency of each unique value.
table(x)
table(heights$sex)

# Frequency table of the height column.
tab <- table(heights$height)
sum(tab == 1) # number of height values that occur exactly once
# The original called sum(c), which passes the builtin function `c` to sum()
# and fails; summing the table gives the total number of observations.
sum(tab)

To see the proportional (relative) frequencies

# Share of each category of `sex`: counts divided by the total count.
prop.table(table(heights[["sex"]]))

Cumulative Distribution Function

By definition, a plot of the CDF for heights shows the values a on the x-axis and F(a), the proportion of students whose height is a or lower, on the y-axis. (The code below uses the heights of all students, not only males.)

# Grid of 100 evenly spaced values spanning the observed range of heights.
# `length.out` is spelled out in full instead of relying on partial matching.
a <- seq(min(heights$height), max(heights$height), length.out = 100)

# Empirical CDF: the proportion of observations with height <= x.
# `x` is a single cut-off value; the result is a number between 0 and 1.
cdf_func <- function(x) {
  mean(heights$height <= x)
}

# Evaluate the empirical CDF at every grid point.
cdf_val <- sapply(a, cdf_func)

lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X. sapply is a user-friendly version and wrapper of lapply by default returning a vector, matrix.

# Empirical CDF: grid values on the x-axis, cumulative proportion on the y-axis.
plot(a, cdf_val)

# Histogram of the grid values `a`, then of the observed heights.
hist(a)
hist(heights$height)

Normal Distribution

library(tidyverse)
library(dslabs)
data("heights")

# Keep only the heights of the rows whose sex is "Male".
index <- heights$sex == "Male"
x <- heights$height[index]

# Sample mean and standard deviation, shown as a named vector.
avg <- mean(x)
std <- sd(x)
c(avg = avg, std = std)

TO FIND STANDARDISED UNITS or Z-scores

# scale() with its defaults centers x on its mean and divides by its standard
# deviation, which gives z-scores (standard units).
z <- scale(x)

# Proportion of observations within 2 standard deviations of the mean.
mean(abs(z) < 2)

# Absolute z-score of every observation.
abs(z)

# Extract the heights of the male rows with dplyr.
# pull() works like [[ for local data frames, and automatically collects
# before indexing for remote data tables.
x <- heights %>%
  filter(sex == "Male") %>%
  pull(height)

pnorm gives the value of the CDF F(a) for the normal distribution

# pnorm() returns the lower tail of the normal CDF, so this is the probability
# of a value less than or equal to 70.5 under the normal approximation.
pnorm(70.5, mean(x), sd(x))

# The complement is the probability of a value greater than 70.5.
1 - pnorm(70.5, mean(x), sd(x))

plot distribution of exact heights

# Proportion of observations at each exact value of x.
# The x-axis label now uses `a`, matching the `pr(x=a)` notation on the y-axis
# (the original label said `s`).
plot(
  prop.table(table(x)),
  xlab = 'a = height in inches',
  ylab = 'pr(x=a)'
)

Discretization

# Probabilities in the actual data over intervals of length 1 that contain
# an integer.
mean(x <= 68.5) - mean(x <= 67.5)
mean(x <= 69.5) - mean(x <= 68.5)
mean(x <= 70.5) - mean(x <= 69.5)

# Probabilities from the normal approximation over the same intervals;
# these match the results above well.
pnorm(68.5, mean(x), sd(x)) - pnorm(67.5, mean(x), sd(x))
pnorm(69.5, mean(x), sd(x)) - pnorm(68.5, mean(x), sd(x))
pnorm(70.5, mean(x), sd(x)) - pnorm(69.5, mean(x), sd(x))

# Over other intervals (here one that contains no integer) the probabilities
# in the actual data do not match the normal approximation as well.
mean(x <= 70.9) - mean(x <= 70.1)
pnorm(70.9, mean(x), sd(x)) - pnorm(70.1, mean(x), sd(x))

Changes in summary statistics due to a mistake in the data

# Install HistData only when it is missing, instead of on every run.
if (!requireNamespace("HistData", quietly = TRUE)) {
  install.packages("HistData")
}
library(HistData)
data(Galton)

x <- Galton$child
mean(x)
median(x)
mad(x) # median absolute deviation
sd(x)
# The mean and median are similar, and so are the MAD and SD; this happens
# when the data follow the normal approximation.

# Simulate a data-entry mistake: copy the data and multiply the first value
# by 10.
x_with_error <- x
x_with_error[1] <- x_with_error[1] * 10

# Change in each summary statistic caused by the single erroneous value.
mean(x_with_error) - mean(x)
sd(x_with_error) - sd(x)
median(x_with_error) - median(x)
mad(x_with_error) - mad(x)

After a mistake is introduced into the data, the mean and SD change noticeably, while the median and MAD stay essentially the same: the median and MAD are robust to outliers.

x <- Galton$child

# Mean of `x` after its first entry is replaced by `k`.
# The assignment inside the function modifies a local copy only, so the
# global `x` is left unchanged.
error_avg <- function(k) {
  x[1] <- k
  mean(x)
}

# A single extreme value can move the mean arbitrarily far in either direction.
error_avg(10000)
error_avg(-10000)

qqplots boxplots

# Load the dslabs package and its `heights` data set.
library(dslabs)
data("heights")

summary()

# summary() gives the minimum, maximum, mean, and quartiles of a numeric vector.
summary(heights$height)

percentiles

# Percentiles are the quantiles that divide the data set into 100 intervals,
# each with 1% probability.
# `p` runs from 1% to 99% in steps of 1%; a different `p` divides the data
# into other desired intervals.
p <- seq(0.01, 0.99, 0.01)
percentiles <- quantile(heights$height, p)

# quantile() names its results "1%", "2%", ..., so the quartiles are looked
# up by name. The original `names(percentiles == 25)` compared the values,
# not the names, to 25.
percentiles[names(percentiles) == "25%"]
percentiles[names(percentiles) == "75%"]
print(percentiles)

theoretical percentile qnorm

# Theoretical quantiles are calculated in order to check whether a sample
# follows a normal distribution.
p <- seq(0.01, 0.99, 0.01)

# qnorm() gives the theoretical value of the quantile with probability p;
# here 69 is the mean and 3 is the standard deviation.
theoretical_quantiles <- qnorm(p, 69, 3)
print(theoretical_quantiles)

qq plot,

The sample quantiles of the observed data are compared with the theoretical quantiles. If the data are well approximated by a normal distribution, the points on the QQ plot fall near the identity line (observed = theoretical).

library(tidyverse)
library(dslabs)
data("heights")

# Heights of the male rows, and the same values in standard units.
index <- heights$sex == 'Male'
x <- heights$height[index]
z <- scale(x)

# Proportion of the data at or below 69.5.
mean(x <= 69.5)

# Calculate observed and theoretical quantiles.
p <- seq(0.05, 0.95, 0.05)
observed_quantiles <- quantile(x, p)
theoretical_quantiles <- qnorm(p, mean = mean(x), sd = sd(x))

# Make the QQ plot; abline(0, 1) draws the identity line.
plot(theoretical_quantiles, observed_quantiles)
abline(0, 1)

# Make the QQ plot with scaled values (standard units / z-scores).
observed_quantiles <- quantile(z, p)
theoretical_quantiles <- qnorm(p)
plot(theoretical_quantiles, observed_quantiles)
abline(0, 1)

# The same analysis for the female rows.
index <- heights$sex == 'Female'
x <- heights$height[index]
z <- scale(x)

# Proportion of the data at or below 69.5.
mean(x <= 69.5)

# Calculate observed and theoretical quantiles.
p <- seq(0.05, 0.95, 0.05)
observed_quantiles <- quantile(x, p)
theoretical_quantiles <- qnorm(p, mean = mean(x), sd = sd(x))

# Make the QQ plot; abline(0, 1) draws the identity line.
plot(theoretical_quantiles, observed_quantiles)
abline(0, 1)

# Make the QQ plot with scaled values (standard units / z-scores).
observed_quantiles <- quantile(z, p)
theoretical_quantiles <- qnorm(p)
plot(theoretical_quantiles, observed_quantiles)
abline(0, 1)