Data Distribution - littleclouds/R-for-everyone GitHub Wiki
BASIC FIRST
library(dslabs)
data("heights")
To see the names of the variables in the data set:
names(heights)
class(heights$sex)
print(heights$sex)
To check the unique values in a variable:
# Toy vector containing repeated values.
x <- c(rep(3, 4), 4, 4, 2)
# Distinct values of the toy vector, and how many of them there are.
a <- unique(x)
length(a)
# Same idea applied to the height column of the heights data.
b <- unique(heights[["height"]])
length(b)
To compute the frequency of each unique value in a categorical variable:
# Frequency of each distinct value in x and in the sex column.
table(x)
table(heights$sex)
# Frequency table of the height column.
tab <- table(heights$height)
# Number of height values that occur exactly once.
sum(tab == 1)
# Total number of observations counted in the table. sum() must be given the
# table itself; `c` is the base combine function and cannot be summed.
sum(tab)
To see the proportional frequencies:
prop.table(table(heights$sex))
Cumulative Distribution Function
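By definition, a plot of the CDF of heights has the height values a on the x-axis and the proportion of students with heights of that value or lower, F(a), on the y-axis. (The original note refers to male heights; the code below uses every height in the data set.)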
# Grid of 100 evenly spaced values spanning the observed range of heights.
# `length.out` is spelled out in full instead of relying on partial matching.
a <- seq(min(heights$height), max(heights$height), length.out = 100)
# Empirical CDF of the height column: the proportion of entries in
# heights$height that are less than or equal to x, i.e. Pr(height <= x).
# Reads `heights` from the enclosing (global) environment.
cdf_func <- function(x) {
  at_or_below <- heights$height <= x
  mean(at_or_below)
}
# Evaluate the empirical CDF at every grid point. vapply() guarantees a
# numeric vector with one value per element of `a` (sapply()'s return type
# depends on its input).
cdf_val <- vapply(a, cdf_func, numeric(1))
lapply returns a list of the same length as X, each element of which is the result of applying FUN to the corresponding element of X. sapply is a user-friendly version of, and wrapper around, lapply that by default returns a vector or matrix when possible.
plot(a,cdf_val)
hist(a)
hist(heights$height)
Normal Distribution
library(tidyverse)
library(dslabs)
data("heights")
# Logical mask that is TRUE for rows whose sex is "Male".
index <- heights$sex == "Male"
# Height values for those rows.
x <- heights[["height"]][index]
# Sample mean and standard deviation of the selected heights.
avg <- mean(x)
std <- sd(x)
c(avg = avg, std = std)
TO FIND STANDARDISED UNITS or Z-scores
# Convert the heights to standard units (z-scores).
z <- scale(x)
# Proportion of values lying within 2 standard deviations of the mean.
mean(-2 < z & z < 2)
abs(z)
# The same male heights, extracted with dplyr verbs this time.
x <- heights %>%
  filter(sex == "Male") %>%
  pull(height)
pull() works like [[ for local data frames,
and automatically collects before indexing for remote data tables.
pnorm gives the value of the CDF F(a) for the normal distribution
pnorm(70.5,mean(x),sd(x)) #normal-approximation probability of a height of 70.5 or less, i.e. F(70.5)
1 - pnorm(70.5,mean(x),sd(x)) #complement: probability of a height greater than 70.5
plot distribution of exact heights
# Proportion of observations at each exactly reported height value a.
# The x-axis label names the value `a` so that it matches the y-axis label.
plot(prop.table(table(x)), xlab = "a = height in inches", ylab = "pr(x=a)")
Discretization
Probabilities in the actual data over length-1 ranges containing an integer:
# Observed proportion of heights in each one-inch interval around an integer:
# the empirical CDF at the upper end minus the empirical CDF at the lower end.
diff(c(mean(x <= 67.5), mean(x <= 68.5)))
diff(c(mean(x <= 68.5), mean(x <= 69.5)))
diff(c(mean(x <= 69.5), mean(x <= 70.5)))
Probabilities from the normal approximation match the results above well:
# Normal-approximation probability of each one-inch interval: the normal CDF
# at the upper end minus the normal CDF at the lower end.
diff(pnorm(c(67.5, 68.5), mean = mean(x), sd = sd(x)))
diff(pnorm(c(68.5, 69.5), mean = mean(x), sd = sd(x)))
diff(pnorm(c(69.5, 70.5), mean = mean(x), sd = sd(x)))
Probabilities in the actual data over other ranges don't match the normal approximation as well:
# Interval (70.1, 70.9] contains no integer: observed proportion first,
# then the normal-approximation probability for the same interval.
diff(c(mean(x <= 70.1), mean(x <= 70.9)))
diff(pnorm(c(70.1, 70.9), mean = mean(x), sd = sd(x)))
Changes due to a mistake in the data
# Install HistData only when it is missing, so that re-running the script does
# not reinstall the package every time.
if (!requireNamespace("HistData", quietly = TRUE)) {
  install.packages("HistData")
}
library(HistData)
data(Galton)
# Work with the `child` column of the Galton data set.
x <- Galton$child
mean(x)
median(x)
mad(x)  # median absolute deviation
sd(x)
The mean and median, and likewise the mad and sd, are similar; this happens when the data follow the normal approximation.
The following simulates a mistake in one value of the data:
# Copy of x whose first entry is multiplied by 10 to mimic a data-entry error.
x_with_error <- replace(x, 1, x[1] * 10)
# Shift in each summary statistic caused by the single corrupted value.
mean(x_with_error) - mean(x)
sd(x_with_error) - sd(x)
median(x_with_error) - median(x)
mad(x_with_error) - mad(x)
We see that after a change or mistake in the data, the mean and sd change, but the median and mad remain essentially the same.
x <- Galton$child
# Mean of the vector x (read from the enclosing environment) after its first
# entry is replaced by k. The replacement is made on a local copy, so the
# outer x is left untouched.
error_avg <- function(k) {
  mean(replace(x, 1, k))
}
error_avg(10000)
error_avg(-10000)
qqplots boxplots
library(dslabs)
data("heights")
summary()
gives you the minimum, maximum, mean, median and quartiles
summary(heights$height)
percentiles
are the quantiles that divide the data set into 100 intervals, each with 1% probability
p <- seq(0.01,0.99,0.01)
p holds the probabilities from 1% to 99% in steps of 1%; it is used to divide the data set into the desired intervals
percentiles <- quantile(heights$height,p)
To get the quartiles, we find the 25th, 50th and 75th percentiles:
# Select individual percentiles by name. quantile() labels each result with
# its probability as a percentage string such as "25%", so the comparison has
# to be made on names(percentiles), not on the values.
percentiles[names(percentiles) == "25%"]
percentiles[names(percentiles) == "75%"]
print(percentiles)
Theoretical percentiles (qnorm)
are calculated in order to check whether the sample follows a normal distribution or not
# Probabilities 0.01, 0.02, ..., 0.99.
p <- seq(from = 0.01, to = 0.99, by = 0.01)
# Quantiles of a normal distribution with mean 69 and standard deviation 3.
theoretical_quantiles <- qnorm(p, mean = 69, sd = 3)
qnorm gives the theoretical value of the quantile with probability p; here 69 is the mean and 3 is the standard deviation
print(theoretical_quantiles)
QQ plot:
the sample quantiles in the observed data are compared to the theoretical quantiles. If the data are well approximated by the normal distribution, then the points on the QQ plot fall near the identity line (observed = theoretical).
library(tidyverse)
library(dslabs)
data("heights")
# Logical mask that is TRUE for rows whose sex is "Male".
index <- heights$sex == "Male"
# Heights for those rows, and the same values in standard units.
x <- heights[["height"]][index]
z <- scale(x)
Proportion of data at or below 69.5
mean(x <= 69.5)
calculate observed and theoretical quantiles
# Probabilities 0.05, 0.10, ..., 0.95.
p <- seq(from = 0.05, to = 0.95, by = 0.05)
# Sample quantiles of x at those probabilities.
observed_quantiles <- quantile(x, probs = p)
# Quantiles of a normal distribution with the sample's mean and sd.
theoretical_quantiles <- qnorm(p, mean(x), sd(x))
make qq plot
plot(theoretical_quantiles,observed_quantiles)
abline(0,1)
make qqplot with scaled value/standardised units/z-scores
# Sample quantiles of the z-scores against standard normal quantiles.
observed_quantiles <- quantile(z, probs = p)
theoretical_quantiles <- qnorm(p, mean = 0, sd = 1)
plot(theoretical_quantiles, observed_quantiles)
# Identity line: intercept 0, slope 1.
abline(a = 0, b = 1)
for female
# Logical mask that is TRUE for rows whose sex is "Female".
index <- heights$sex == "Female"
# Heights for those rows, and the same values in standard units.
x <- heights[["height"]][index]
z <- scale(x)
Proportion of data at or below 69.5
mean(x <= 69.5)
calculate observed and theoretical quantiles
# Probabilities 0.05, 0.10, ..., 0.95.
p <- seq(from = 0.05, to = 0.95, by = 0.05)
# Sample quantiles of x at those probabilities.
observed_quantiles <- quantile(x, probs = p)
# Quantiles of a normal distribution with the sample's mean and sd.
theoretical_quantiles <- qnorm(p, mean(x), sd(x))
make qq plot
plot(theoretical_quantiles,observed_quantiles)
abline(0,1)
make qqplot with scaled value/standardised units/z-scores
# Sample quantiles of the z-scores against standard normal quantiles.
observed_quantiles <- quantile(z, probs = p)
theoretical_quantiles <- qnorm(p, mean = 0, sd = 1)
plot(theoretical_quantiles, observed_quantiles)
# Identity line: intercept 0, slope 1.
abline(a = 0, b = 1)