Commit 199a81d9 authored by nicolas.dupont

multilinear regression exercise Tuesday

parent 8b922dfe
rm(list=ls()) # clearing all variables from the environment
graphics.off() # closing all figures and plots
# setting the working directory with the path to the data file lm.exercise.data.txt
setwd("M:\\pc\\Dokumenter\\Conferences\\Runde Sommer school")
lm.exercise.data <- read.table("lm.exercise.data.txt", sep="", header=TRUE, row.names=NULL, blank.lines.skip=TRUE, na.strings="NA")
# Here are the variables:
# y is the response variable, it is going to be predicted as a linear combination of the 2 independent variables x and z
print(lm.exercise.data) # show the whole dataset
print(lm.exercise.data$y) # show the response variable "y" -> the one we want to predict
print(lm.exercise.data$x) # show the independent variable "x" -> one of the variables we predict from
print(lm.exercise.data$z) # show the independent variable "z" -> the other variable we predict from
# first let's plot the response variable against its order in the data to see if there are outliers (data points that stand out by their value) in the dataset.
plot(lm.exercise.data$y)
# another way to check for outliers is to draw a boxplot of the variable
boxplot(lm.exercise.data$y)
# the thick black line indicates the median of the data (the value of y that is greater than 50% of the data in y)
# the box edges indicate the 25% and 75% percentiles (the values of y that are greater than 25% and 75% of the data in y)
# the whiskers extend up to 1.5 times the interquartile range (the height of the box, i.e. the distance between the 25% and 75% percentiles) beyond the box
# any data point outside the range covered by the whiskers can be seen as an outlier in y (not the case here)
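# as an aside, boxplot.stats() (base R) lists any points beyond the whiskers in its $out component:
print(boxplot.stats(lm.exercise.data$y)$out) # an empty result would confirm the absence of outliers in y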
# we want to predict y as a function of x and z
# so we want to evaluate the regression model y = a + b*x + c*z (Eq. 1)
# where y, x, and z are the variables to use in the regression
# a is the intercept for the regression
# b is the regression parameter linked to the independent variable x
# c is the regression parameter linked to the independent variable z
# for each combination of x and z the observed value of y varies around a mean "mu" with a standard deviation "sigma" (stochasticity)
# we need to take this into account in our regression model, so a random variable "epsilon" (eps) is added to the model; it follows a normal distribution with mean=0 and standard deviation="sigma"
# so the equation becomes y = a + b*x + c*z + eps with eps~N(0,sigma) (Eq. 2)
# eps values for the dataset can be extracted by subtracting the predicted y (^y) given by equation (1) from the observed y (yobs)
# ^y values are called the "fit" and eps values the "residuals" of the multilinear regression
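# purely as an illustration of Eq. 2 (with made-up parameter values, not the ones estimated below), such data could be simulated as:
#set.seed(1) # for reproducibility
#x.sim <- runif(34,0,100); z.sim <- runif(34,-4,6) # 34 hypothetical observations
#y.sim <- 2 + 0.05*x.sim - 1.5*z.sim + rnorm(34,mean=0,sd=3.5) # a + b*x + c*z + eps with eps~N(0,sigma)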
# in order to estimate the intercept "a" and the two regression parameters "b" and "c"
# we run a multiple linear regression analysis
Model.object <- lm(y~x+z,data=lm.exercise.data) # "lm" is the linear regression modelling function; we then enter the equation to fit,
# with the response variable on the left side of the tilde sign (~) and the sum of the independent variables on the right
# the function generates a model object "Model.object" which contains the different pieces of information and results of the analysis
# the data argument tells the function in which dataset to find the "y", "x" and "z" values
# checking the results from the regression
summary(Model.object)
#Residuals:
#    Min      1Q  Median      3Q     Max
#-5.6924 -2.4609  0.0036  1.9967  6.0810
#
#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)
#(Intercept)  2.79203    1.29627   2.154   0.0391 *   <- estimate of "a", its standard error, t statistic, and P-value for the test that "a" = 0
#x            0.05489    0.02161   2.540   0.0163 *   <- estimate of "b", its standard error, t statistic, and P-value for the test that "b" = 0
#z           -1.46450    0.29388  -4.983 2.25e-05 *** <- estimate of "c", its standard error, t statistic, and P-value for the test that "c" = 0
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 3.567 on 31 degrees of freedom
#Multiple R-squared: 0.5665, Adjusted R-squared: 0.5386 <- R-squared of the model
#F-statistic: 20.26 on 2 and 31 DF, p-value: 2.359e-06
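# as a cross-check, the multiple R-squared can be recomputed by hand as 1 - residual sum of squares / total sum of squares:
rss <- sum(resid(Model.object)^2) # residual sum of squares
tss <- sum((lm.exercise.data$y - mean(lm.exercise.data$y))^2) # total sum of squares
print(1 - rss/tss) # should reproduce the Multiple R-squared of 0.5665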
#
# plotting the graphical summary for the model results
win.graph() # opening a new graphics window (Windows-only; dev.new() is the portable equivalent)
par(mfrow=c(2,2))
plot(Model.object) # The two upper panels are the most interesting. The one on the left plots residuals (eps) vs. predictions (^y) from the model given the x and z values from the dataset
# we can see that there is no obvious trend left in the residuals and that the spread of the residuals around the 0 mean is homogeneous, indicating homoscedasticity, i.e. sigma in eps~N(0,sigma) appears constant
# The one on the right is a normal qq plot. When the eps data points align themselves along the qq line, it indicates that they follow a normal distribution
# checking if the residuals follow a normal distribution with constant variance (a second method)
eps <- resid(Model.object) # extract the residuals "eps" from the model object
fit <- fitted(Model.object) # calculating the predicted y values from the x and z values in the data set
plot(fit,eps,type="p",cex=2) # plotting fitted values vs. residuals to check that the spread of residuals is constant over the range of fitted values ("homoscedasticity")
abline(mean(eps),0) # plotting the mean value of eps on the same plot (checking that the mean value of the distribution is ~ 0)
qqnorm(eps) # checking that the residuals are following a normal distribution
qqline(eps,col="red") # the points should align themselves around the red line if they originate from a normal distribution
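# as a complementary formal check (base R), the Shapiro-Wilk test gives a P-value for the null hypothesis that the residuals are normally distributed:
shapiro.test(eps) # a P-value > 0.05 is consistent with normally distributed residuals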
acf(eps) # checking for autocorrelation (eps values correlated against themselves at increasing time lags, e.g. line1 vs. line2, line2 vs. line3, etc. for lag 1)
# the first bar (0 time lag) should be 1, the remaining lags should stay below the significance level (dashed lines)
# autocorrelation in the residuals can be a sign that there is a remaining trend in the residuals (linear or non-linear), indicating that the model is not the best fit to the data and that the calculated p-values are biased
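# a formal test for lag-1 autocorrelation is the Durbin-Watson test (assuming the extra package "lmtest" is installed):
#library(lmtest)
#dwtest(Model.object) # a P-value > 0.05 indicates no evidence of lag-1 autocorrelation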
##### Not implemented #######
#library(mgcv)
#a.gam <- gam(eps~s(fit)) # checking for a remaining non-linear pattern in eps
#summary(a.gam) # a P-value > 0.05 for s(fit) indicates no remaining non-linear trend
##### Not implemented #######
# extracting estimates for "a", "b" and "c"
a <- summary(Model.object)$coefficients["(Intercept)","Estimate"]
b <- summary(Model.object)$coefficients["x","Estimate"]
c <- summary(Model.object)$coefficients["z","Estimate"]
# extracting standard errors for "a", "b", "c"
a.se <- summary(Model.object)$coefficients["(Intercept)","Std. Error"]
b.se <- summary(Model.object)$coefficients["x","Std. Error"]
c.se <- summary(Model.object)$coefficients["z","Std. Error"]
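# note: coef(Model.object) returns the same three estimates in one call, as a named vector:
print(coef(Model.object))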
# calculating the confidence interval at 95% for "b" and "c"
#a.ci.low <- a - 1.96*a.se
#a.ci.high <- a + 1.96*a.se
b.ci.low <- b - 1.96*b.se
b.ci.high <- b + 1.96*b.se
c.ci.low <- c - 1.96*c.se
c.ci.high <- c + 1.96*c.se
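# the same intervals can be obtained directly with confint(), which uses the exact t distribution instead of the 1.96 normal approximation (so the values differ slightly):
print(confint(Model.object,level=0.95))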
# We now want to see the predictions from the regression model when we change the value of one of the explanatory variables
# First we establish a new dataset where we allow one of the explanatory variables (here x) to vary
lm.exercise.data.predict4x <- data.frame(x=seq(0,100,4),z=rep(mean(lm.exercise.data$z),26))
print(lm.exercise.data.predict4x) # display the new data. x is varying from 0 to 100 by increments of 4
# z is constant at its mean value, which means that its contribution to the predictions is constant
b.predict <- predict(Model.object,newdata=lm.exercise.data.predict4x,type="response",se.fit=TRUE) # predict function given a new set of data
print(b.predict) # there are 4 different sub-objects in this object. The interesting ones for now are "fit", aka the predictions (b.predict$fit), and "se.fit", the standard error for "fit" (b.predict$se.fit)
b.predict.ci <- predict(Model.object,newdata=lm.exercise.data.predict4x,interval="predict") # the predict function used in a slightly different way, to give a 95% prediction interval for the observations
print(b.predict.ci) # this is a matrix with 3 columns. The first is "fit", identical to b.predict$fit; "lwr" and "upr" are the lower and upper bounds of the 95% prediction interval for the observations around the fit
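# for comparison, interval="confidence" would give the narrower interval for the regression line itself (essentially fit +/- 1.96*se.fit):
#b.predict.conf <- predict(Model.object,newdata=lm.exercise.data.predict4x,interval="confidence")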
# plotting regression line for "b" vs. observations with confidence interval for the regression line (blue) and for the observations (magenta)
win.graph() # new graphical window
par(mfrow=c(1,2)) # setting up two figures in the same window
plot(y~x,data=lm.exercise.data,cex=2,ylim=c(-15,15)) # another way to use plot: instead of plot("horizontal axis","vertical axis"), one can use plot("vertical axis"~"horizontal axis"), with the data argument specifying in which dataset to find the corresponding values
lines(seq(0,100,4),b.predict$fit,col="red",lwd=2) # adding lines to the current plot (use of plot would create another independent figure)
lines(seq(0,100,4),b.predict$fit-1.96*b.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the lower confidence interval at 95% (fit-1.96*se.fit)
lines(seq(0,100,4),b.predict$fit+1.96*b.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the upper confidence interval at 95% (fit+1.96*se.fit)
lines(seq(0,100,4),b.predict.ci[,2],col="magenta",lwd=1) # here we use the "lwr" column (b.predict.ci[,2]) to plot the lower confidence interval at 95%
lines(seq(0,100,4),b.predict.ci[,3],col="magenta",lwd=1) # here we use "upr" column (b.predict.ci[,3]) to plot the upper confidence interval at 95%
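# optionally, a legend can label the three line types (the labels here are just suggestions; the same call can be repeated for the second panel):
legend("topleft",legend=c("fit","95% CI (regression line)","95% interval (observations)"),col=c("red","blue","magenta"),lwd=c(2,1,1),cex=0.7)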
# Then we establish a new data set where we allow the other explanatory variable (here z) to vary
lm.exercise.data.predict4z <- data.frame(x=rep(mean(lm.exercise.data$x),26),z=seq(-4,6,(4/10)))
print(lm.exercise.data.predict4z) # display the new data. z is varying from -4 to 6 by increments of 0.4
# x is constant at its mean value, which means that its contribution to the predictions is constant
c.predict <- predict(Model.object,newdata=lm.exercise.data.predict4z,type="response",se.fit=TRUE) # predict function given a new set of data
print(c.predict) # there are 4 different sub-objects in this object. The interesting ones for now are "fit", aka the predictions (c.predict$fit), and "se.fit", the standard error for "fit" (c.predict$se.fit)
c.predict.ci <- predict(Model.object,newdata=lm.exercise.data.predict4z,interval="predict") # the predict function used in a slightly different way, to give a 95% prediction interval for the observations
print(c.predict.ci) # this is a matrix with 3 columns. The first is "fit", identical to c.predict$fit; "lwr" and "upr" are the lower and upper bounds of the 95% prediction interval for the observations around the fit
# plotting regression line for "c" vs. observations with confidence interval for the regression line (blue) and for the observations (magenta)
plot(y~z,data=lm.exercise.data,cex=2,ylim=c(-15,15)) # same formula interface as above: plot("vertical axis"~"horizontal axis"), with the data argument specifying in which dataset to find the corresponding values
lines(lm.exercise.data.predict4z$z,c.predict$fit,col="red",lwd=2) # adding lines to the current plot (a call to plot would create another independent figure)
lines(lm.exercise.data.predict4z$z,c.predict$fit-1.96*c.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the lower confidence interval at 95% (fit-1.96*se.fit)
lines(lm.exercise.data.predict4z$z,c.predict$fit+1.96*c.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the upper confidence interval at 95% (fit+1.96*se.fit)
lines(lm.exercise.data.predict4z$z,c.predict.ci[,2],col="magenta",lwd=1) # here we use "lwr" column (c.predict.ci[,2]) to plot the lower confidence interval at 95%
lines(lm.exercise.data.predict4z$z,c.predict.ci[,3],col="magenta",lwd=1) # here we use "upr" column (c.predict.ci[,3]) to plot the upper confidence interval at 95%
######################## Extra material ############################
##### Same procedure but with centered explanatory variables #######
####################################################################
#lm.exercise.data$y.centered <- lm.exercise.data$y - mean(lm.exercise.data$y)
lm.exercise.data$x.centered <- lm.exercise.data$x - mean(lm.exercise.data$x)
lm.exercise.data$z.centered <- lm.exercise.data$z - mean(lm.exercise.data$z)
Model.object.centered <- lm(y~x.centered+z.centered,data=lm.exercise.data)
summary(Model.object.centered)
#Call:
#lm(formula = y ~ x.centered + z.centered, data = lm.exercise.data)
#
#Residuals:
#    Min      1Q  Median      3Q     Max
#-5.6924 -2.4609  0.0036  1.9967  6.0810
#
#Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
#(Intercept) -2.722e-16 6.117e-01    0.000   1.0000
#x.centered     0.05489   0.02161    2.540   0.0163 *
#z.centered    -1.46450   0.29388   -4.983 2.25e-05 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 3.567 on 31 degrees of freedom
#Multiple R-squared: 0.5665, Adjusted R-squared: 0.5386
#F-statistic: 20.26 on 2 and 31 DF, p-value: 2.359e-06
# same "Estimate" and "Standard error" as previous regression, P-value are the same as well
# centering the data has changed the "Intercept" since now the "0" value for x and z is not the raw "0" but the corresponding predicted value for the mean values of "x" and "z"
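# sanity check: the two parameterisations give identical fitted values, only the meaning of the intercept differs:
print(all.equal(as.numeric(fitted(Model.object)),as.numeric(fitted(Model.object.centered))))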
# extracting estimates for "a.centered", "b.centered" and "c.centered"
a.centered <- summary(Model.object.centered)$coefficients["(Intercept)","Estimate"]
b.centered <- summary(Model.object.centered)$coefficients["x.centered","Estimate"]
c.centered <- summary(Model.object.centered)$coefficients["z.centered","Estimate"]
# extracting standard errors for "a.centered", "b.centered" and "c.centered"
a.centered.se <- summary(Model.object.centered)$coefficients["(Intercept)","Std. Error"]
b.centered.se <- summary(Model.object.centered)$coefficients["x.centered","Std. Error"]
c.centered.se <- summary(Model.object.centered)$coefficients["z.centered","Std. Error"]
# calculating the confidence interval at 95% for "a.centered", "b.centered" and "c.centered"
a.centered.ci.low <- a.centered - 1.96*a.centered.se
a.centered.ci.high <- a.centered + 1.96*a.centered.se
b.centered.ci.low <- b.centered - 1.96*b.centered.se
b.centered.ci.high <- b.centered + 1.96*b.centered.se
c.centered.ci.low <- c.centered - 1.96*c.centered.se
c.centered.ci.high <- c.centered + 1.96*c.centered.se
# building the prediction grid on the centered scale: x.centered varies while z.centered is held at 0 (i.e. z at its mean); note the centered model is used
x.centered.grid <- seq(0,100,4) - mean(lm.exercise.data$x) # the x grid from above, expressed on the centered scale
b.centered.predict <- predict(Model.object.centered,newdata=data.frame(x.centered=x.centered.grid,z.centered=rep(0,26)),type="response",se.fit=TRUE)
b.centered.predict.ci <- predict(Model.object.centered,newdata=data.frame(x.centered=x.centered.grid,z.centered=rep(0,26)),interval="predict")
# plotting regression line for "b.centered" vs. observations with confidenc interval for the regression line (blue) and for the observations (magenta)
win.graph()
par(mfrow=c(1,2))
plot(y~x.centered,data=lm.exercise.data,cex=2,ylim=c(-15,15))
lines(x.centered.grid,b.centered.predict$fit,col="red",lwd=2)
lines(x.centered.grid,b.centered.predict$fit-1.96*b.centered.predict$se.fit,col="blue",lwd=1)
lines(x.centered.grid,b.centered.predict$fit+1.96*b.centered.predict$se.fit,col="blue",lwd=1)
lines(x.centered.grid,b.centered.predict.ci[,2],col="magenta",lwd=1)
lines(x.centered.grid,b.centered.predict.ci[,3],col="magenta",lwd=1)
# plotting regression line for "c.centered" vs. observations with confidenc interval for the regression line (blue) and for the observations (magenta)
z.centered.grid <- seq(-4,6,(4/10)) - mean(lm.exercise.data$z) # the z grid from above, expressed on the centered scale
c.centered.predict <- predict(Model.object.centered,newdata=data.frame(x.centered=rep(0,26),z.centered=z.centered.grid),type="response",se.fit=TRUE)
c.centered.predict.ci <- predict(Model.object.centered,newdata=data.frame(x.centered=rep(0,26),z.centered=z.centered.grid),interval="predict")
plot(y~z.centered,data=lm.exercise.data,cex=2,ylim=c(-15,15))
lines(z.centered.grid,c.centered.predict$fit,col="red",lwd=2)
lines(z.centered.grid,c.centered.predict$fit-1.96*c.centered.predict$se.fit,col="blue",lwd=1)
lines(z.centered.grid,c.centered.predict$fit+1.96*c.centered.predict$se.fit,col="blue",lwd=1)
lines(z.centered.grid,c.centered.predict.ci[,2],col="magenta",lwd=1)
lines(z.centered.grid,c.centered.predict.ci[,3],col="magenta",lwd=1)