Piecewise or Segmented Regression - vaibhavdobriyal/R_Tutorials GitHub Wiki
Example
Play with the following code in R Studio
###############################just some clean up
rm(list=ls())
set.seed(42)
############################## Data Preparation Started ################################
#loading data
#this sample only had data for one site and includes temperature and usage
dat <- read.table("334_13_temperatureData.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
head(dat)
library(plyr)
#renaming columns
dat <- rename(dat,c("dailytemperaturemean"="tmp_mean",
"dailyopenhourmean"="usage_mean_open"
))
head(dat)
#Only analysing for a meter and usage_mean_open
#dat <- subset(dat, siteid == '334_13')
relevantData <- dat[,c("tmp_mean","usage_mean_open")]
#aading the temperature category atttributed
relevantData<-rename(relevantData,c("usage_mean_open"="usage"))
relevantData <- subset(relevantData, usage > 0)
processedData <- rename(relevantData,c("tmp_mean"="temperature_mean","usage"="usageKWhr"))
head(processedData)
# color coding based on bands not relevant at all
#processedData = within(processedData, {
# category_temp = ifelse(temperature_mean >= 20, "Heating", ifelse(temperature_mean<=14,"Cooling","Flat"))
#})
#head(processedData)
# TODOoutlier correction
############################## Data Prepartaion Finished ################################
############################## Visualizing Raw Data #######################################
library(ggplot2)
# PLotting raw data
plot <- ggplot(aes(y = usageKWhr , x = temperature_mean), data = processedData) +
geom_point(aes(y = usageKWhr , x = temperature_mean)) +
theme_bw(16) +
ggtitle("Usage vs. Temperature")+
scale_size_area() +
xlab("Temperature Mean") +
ylab("Usage WHr")
################################## Data Analysis Begins####################################
library(segmented)
# usageKWhr is Y and temperature_mean is X
# we are doing a linear regression
lfit <- lm(usageKWhr ~ temperature_mean, data = processedData)
coef <- coef(lfit)
# The other alternate was to use MASS which has on option of rlm instead of lm above
# supposedly less impacted by outliers and noise
#library(MASS)
#lfit <- rlm(usageKWhr ~ temperature_mean, data = processedData)
#coef <- coef(lfit)
# Approach rejected for now because there was not much differece - for now sticking no simple linear model
######################outlier correction - let us take the basic approach##################################
#remove X% od points with max residual and repeat regression
outlier_percent = 0
fitted <- fitted(lfit)
prelim_model <- data.frame(temperature_mean = processedData$temperature_mean, usageKWhr = fitted, orig_usage=processedData$usageKWhr)
#calculating absolute residual that is |observed - estimated|
prelim_model$residual_diff <- abs(prelim_model$orig_usage - prelim_model$usageKWhr)
#sorting by residual
prelim_model <- prelim_model[order(-prelim_model[,4]),]
#removing X% extreme residuals and following is the subset data which may be used for next round of regression
processedData2 <- tail(prelim_model,nrow(prelim_model)-as.integer(nrow(prelim_model)*(outlier_percent)))
#repeat regression with the data minus outliers
#Rejected performing repeat as R square was running too close to 1
#Depends on dataset - uncomment following two in case results are to be analyzed
#lfit <- lm(usageKWhr ~ temperature_mean, data = processedData2)
#coef <- coef(lfit)
summary(lfit)
print(lfit)
################################outlier correction finished###################################################
plot <- plot +
geom_hline(aes(yintercept = 0), color="grey", linetype="dashed")+
geom_vline(aes(xintercept=0),color="grey",linetype="dashed")
#incase visualization of somple liner is required
# we could have plottted using the line as we have slope and intercept
plot <- plot + geom_smooth(method="lm", se = FALSE)
plot
################################Peicewise regression###########################################################
# performing segmentedregression using the linear model obtained
#without psi this may return anything for number of segments - algo is free to choose number of breakpoints
#as well as position of breakpoint
sfit <- segmented(lfit, seg.Z = ~temperature_mean)
#trying with position of breakpoints
#sfit <- segmented(lfit, seg.Z = ~temperature_mean, psi = list(temperature_mean = c(14,20)))
#trying with number of breakpoints
#sfit <- segmented(lfit, seg.Z = ~temperature_mean,psi=NA, control=seg.control(display=TRUE,K=1))
#plot(sfit, conf.level=0.95, shade=TRUE)
summary(sfit)
print(sfit)
# get the breakpoints
sfit$psi
slope(sfit)
# get the fitted data
fitted <- fitted(sfit)
model_residuals <- residuals(sfit)
sd_residuals <- sd(model_residuals,na.rm = T)
model <- data.frame(temperature_mean = processedData$temperature_mean, usageKWhr = fitted)
# plot the fitted model
plot <- plot+
geom_point(data=model,aes(y = usageKWhr , x = temperature_mean),color="red",size=2) +
#geom_line(data=model,aes(y = usageKWhr , x = temperature_mean),color="red") +
geom_vline(aes(xintercept=14),color="grey",linetype="dashed",size=1.5) +
geom_vline(aes(xintercept=20),color="grey",linetype="dashed",size=1.5) +
geom_ribbon(data=model,aes(ymin = usageKWhr - sd_residuals, ymax = usageKWhr + sd_residuals), fill = "grey70",alpha=0.2)
plot
Reference Links:
http://www.inside-r.org/packages/cran/segmented/docs/segmented http://www.r-bloggers.com/using-segmented-regression-to-analyse-world-record-running-times/
http://www.r-bloggers.com/r-for-ecologists-putting-together-a-piecewise-regression/ http://stackoverflow.com/questions/29382903/how-to-apply-piecewise-linear-fit-in-python http://stackoverflow.com/questions/12105435/how-can-i-obtain-segmented-linear-regressions-with-a-priori-breakpoints https://rpubs.com/MarkusLoew/12164 -- very good
http://stats.stackexchange.com/questions/20890/how-to-use-segmented-package-to-fit-a-piecewise-linear-regression-with-one-break http://stats.stackexchange.com/questions/19772/estimating-the-break-point-in-a-broken-stick-piecewise-linear-model-with-rando http://www.waterlog.info/segreg.htm