Commit 24dbac19 authored by Morten Brun

Merge branch 'master' of git.app.uib.no:Guttorm.Alendal/dcod-at-runde

parents 8ec03b8a f0658476
"y" "x" "z"
-0.909756548582618 2.46170637708295 1.27277445482252
11.376990080371 5.94145280796229 -1.48705218203546
1.72272082793011 3.14861800048537 4.5804969617416
-6.16312278765597 10.7768390378787 3.24229173869514
6.71769348271644 11.7954428557376 1.35442814978491
-0.169527146415094 14.5617227914136 0.911153184277097
1.74401220972197 18.9227896531973 -1.76327916894443
-8.80988741797101 17.2799660799045 6.8815925136262
-1.12670289770299 21.2636467370828 1.16375934669916
-0.00437256437925415 27.7596639304432 3.55907603643107
4.31162883198335 26.9122341923282 3.42075624005774
4.17529545289114 29.7108562853304 -0.739920265647183
8.20237716193544 39.2024873255502 -1.94375535715508
1.5977446231506 40.1253128588125 1.41911107483929
5.35056443168141 43.6040977838171 0.674308651653961
0.620342224472203 40.508677357116 1.75818022500943
6.28111895161037 49.2285478228924 0.922853110425292
1.84088282139783 51.7842036399679 -0.805334193437204
7.05230377388424 56.2097547247195 -0.212004580937585
4.34000995470397 51.2907121845547 -0.947179710734869
5.67213513184674 57.1567625503324 0.263820494299529
-0.22530916405757 58.7362303129754 0.374975496859908
4.41664813349567 62.9130507430897 2.13317916435549
14.3576094644326 66.5047616429968 -2.32203876824932
10.3846388518747 70.6986568950906 -0.732346591925287
14.6393986860606 79.4899266590262 -1.52443113229333
10.4636003861634 80.1806667931199 -3.50120306173641
9.17076816220583 82.9317015038247 2.17329971689274
1.39445484792714 83.5464350502361 1.0228089561488
8.62112531613397 88.749244347646 0.176686726314658
3.75104817018417 87.1979305914027 0.240844467688684
0.215985791486853 92.923306422665 4.22174214554689
4.7276332236672 93.5968100722317 2.10339999168194
10.311194012441 97.5155190326204 -0.405803508168145
rm(list=ls()) # clearing all variables from the environment
graphics.off() # closing all figures and plots
# setting the working directory with the path to the data file lm.exercise.data.txt
setwd("M:\\pc\\Dokumenter\\Conferences\\Runde Sommer school")
# loading the data in the R environment
lm.exercise.data <- read.table("lm.exercise.data.txt", sep="", header=TRUE, row.names=NULL, blank.lines.skip=TRUE, na.strings="NA")
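# a quick sanity check (a sketch, base R only): str() shows the dimensions and column types of the loaded data frame
str(lm.exercise.data)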
# Here are the variables:
# y is the response variable; it will be predicted as a linear combination of the two independent variables x and z
print(lm.exercise.data) # show the whole dataset
print(lm.exercise.data$y) # show the response variable "y" -> the one we want to predict
print(lm.exercise.data$x) # show the independent variable "x" -> one of the variables we predict from
print(lm.exercise.data$z) # show the independent variable "z" -> the other variable we predict from
# first let's plot the response variable against its position in the data to see if there are any outliers (data points that stand out by their value) in the dataset.
plot(lm.exercise.data$y)
# another way to check for outliers in the dataset is to draw the variable as a boxplot
boxplot(lm.exercise.data$y)
# the thick black line indicates the median of the data (the value of y that is greater than 50% of the data in y)
# the box indicates the 25th and 75th percentiles (the values of y that are greater than 25% and 75% of the data in y)
# the whiskers extend up to 1.5 times the interquartile range (the distance between the 25th and 75th percentiles) beyond the edges of the box
# any data point outside the range defined by the whiskers can be seen as an outlier in y (not the case here)
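# a hand-computed sketch of the boxplot statistics described above (base R only)
q <- quantile(lm.exercise.data$y, probs=c(0.25, 0.5, 0.75)) # 25th, 50th (median) and 75th percentiles
iqr <- q[3] - q[1] # interquartile range, the height of the box
# data points beyond 1.5*IQR from the box edges would be flagged as outliers:
lm.exercise.data$y[lm.exercise.data$y < q[1] - 1.5*iqr | lm.exercise.data$y > q[3] + 1.5*iqr]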
# we want to predict y as function of x and z
# so we want to evaluate the regression model y = a + b*x + c*z (Eq. 1)
# where y, x, and z are the variables to use in the regression
# a is the intercept for the regression
# b is the regression parameter linked to the independent variable x
# c is the regression parameter linked to the independent variable z
# for each combination of x and z the observed value of y varies around a mean "mu" with a standard deviation "sigma" (stochasticity)
# we need to take this into account in our regression model, so a random variable "epsilon" (eps) is added to the model; it follows a normal distribution with mean=0 and standard deviation="sigma"
# so the equation becomes y = a + b*x + c*z + eps with eps~N(0,sigma) (Eq. 2)
# the eps values for the dataset can be extracted by subtracting the predicted y (^y) given by equation (1) from the observed y (yobs)
# the ^y values are called the "fit" and the eps values the "residuals" of the multiple linear regression
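# a simulation sketch of Eq. 2, assuming hypothetical parameter values close to the
# estimates found below (a=2.8, b=0.055, c=-1.46, sigma=3.6), just to illustrate how the
# stochastic term eps enters the data-generating process
set.seed(1) # for reproducibility
x.sim <- runif(34, 0, 100) # hypothetical x values
z.sim <- rnorm(34, 1, 2) # hypothetical z values
y.sim <- 2.8 + 0.055*x.sim - 1.46*z.sim + rnorm(34, mean=0, sd=3.6) # Eq. 2 in action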
# in order to assess the intercept "a" and the two regression parameters "b" and "c"
# we run a multiple linear regression analysis
Model.object <- lm(y~x+z,data=lm.exercise.data) # "lm" is the linear regression modelling function, we then enter the equation to assess
# with the response variable on the left side of the tilde sign (~) and then the sum of the independent variables
# the function returns a model object "Model.object" which contains the various results of the analysis
# the data argument tells the function in which dataset to find the "y", "x" and "z" values
# checking the results from the regression
summary(Model.object)
#Residuals:
# Min 1Q Median 3Q Max
#-5.6924 -2.4609 0.0036 1.9967 6.0810
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 2.79203 1.29627 2.154 0.0391 * <- estimate of "a", its standard error, t value, and the P-value testing whether "a" equals 0
#x 0.05489 0.02161 2.540 0.0163 * <- estimate of "b", its standard error, t value, and the P-value testing whether "b" equals 0
#z -1.46450 0.29388 -4.983 2.25e-05 *** <- estimate of "c", its standard error, t value, and the P-value testing whether "c" equals 0
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 3.567 on 31 degrees of freedom
#Multiple R-squared: 0.5665, Adjusted R-squared: 0.5386 <- R-squared of the model; the variables "x" and "z" explain ~54% (adjusted) of the observed variation in "y"
#F-statistic: 20.26 on 2 and 31 DF, p-value: 2.359e-06
#
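# a sketch reproducing the coefficient table by hand: the t value is the estimate divided
# by its standard error, and the P-value is the two-sided tail area of a t distribution
# with 31 degrees of freedom (34 observations - 3 estimated coefficients)
coefs <- summary(Model.object)$coefficients
t.x <- coefs["x","Estimate"] / coefs["x","Std. Error"] # ~2.540, as in the table above
p.x <- 2 * pt(-abs(t.x), df=31) # ~0.0163, as in the table above
print(c(t.x=t.x, p.x=p.x))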
# plotting the graphical summary for the model results
win.graph() # opening a new graphics window (Windows only; dev.new() is the portable equivalent)
par(mfrow=c(2,2))
plot(Model.object) # The two upper panels are the most interesting. The one on the left plots residuals (eps) vs. predictions (^y) from the model given the x and z values from the dataset
# we can see that there is no obvious trend left in the residuals and that the spread of the residuals around the 0 mean is homogeneous, indicating homoscedasticity, i.e. sigma in eps~N(0,sigma) appears constant
# The one on the right is a normal Q-Q plot. When the eps data points align along the Q-Q line, it indicates that they follow a normal distribution
# checking if the residuals are following a normal distribution with constant variance (#2 method)
eps <- resid(Model.object) # extract the residuals "eps" from the model object
fit <- fitted(Model.object) # calculating the predicted y values from the x and z values in the data set
plot(fit,eps,type="p",cex=2) # plotting fitted values vs. residuals to check that the spread of the residuals is constant over the range of fitted values ("homoscedasticity")
abline(mean(eps),0) # plotting the mean value of eps on the same plot (checking that the mean value of the distribution is ~ 0)
qqnorm(eps) # checking that the residuals are following a normal distribution
qqline(eps,col="red") # the points should align themselves around the red line if they originate from a normal distribution
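# an additional check (a sketch, base R): the Shapiro-Wilk test gives a formal test of
# normality for the residuals; a P-value above 0.05 means no evidence against normality
shapiro.test(eps)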
acf(eps) # checking for autocorrelation (eps values correlated against themselves with a time lag, e.g. line 1 vs. line 2, line 2 vs. line 3, etc.)
# the first bar (lag 0) should be 1; the remaining lags should stay below the significance level (the dashed lines)
# autocorrelation in the residuals can be a sign that there is a remaining trend (linear or not) in the residuals, indicating that the model is not the best fit to the data and that the calculated p-values are biased
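# a formal counterpart to the acf plot (a sketch, base R): the Ljung-Box test checks
# whether the lag-1 autocorrelation of the residuals differs significantly from 0
Box.test(eps, lag=1, type="Ljung-Box")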
##### Not implemented #######
#library(mgcv)
#a.gam <- gam(eps~s(fit)) # checking for residual non-linear pattern in eps
#summary(a.gam) # P-value >0.05
##### Not implemented #######
# extracting estimates for "a", "b" and "c"
a <- summary(Model.object)$coefficients["(Intercept)","Estimate"]
b <- summary(Model.object)$coefficients["x","Estimate"]
c <- summary(Model.object)$coefficients["z","Estimate"]
# extracting standard errors for "a", "b", "c"
a.se <- summary(Model.object)$coefficients["(Intercept)","Std. Error"]
b.se <- summary(Model.object)$coefficients["x","Std. Error"]
c.se <- summary(Model.object)$coefficients["z","Std. Error"]
# calculating the confidence interval at 95% for "b" and "c"
#a.ci.low <- a - 1.96*a.se
#a.ci.high <- a + 1.96*a.se
b.ci.low <- b - 1.96*b.se
b.ci.high <- b + 1.96*b.se
c.ci.low <- c - 1.96*c.se
c.ci.high <- c + 1.96*c.se
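# a cross-check (a sketch): confint() computes the same intervals but uses the exact
# t quantile for 31 degrees of freedom (~2.04) instead of the normal approximation 1.96,
# so its intervals are slightly wider
confint(Model.object, level=0.95)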
# We now want to see the predictions from the regression model when we change the value of one of the explanatory variables
# First we establish a new dataset where we allow one of the explanatory variables (here x) to vary
lm.exercise.data.predict4x <- data.frame(x=seq(0,100,4),z=rep(mean(lm.exercise.data$z),26))
print(lm.exercise.data.predict4x) # display the new data. x is varying from 0 to 100 by increments of 4
# z is held constant at its mean value, so its contribution to the predictions is constant (it does not drive the variation seen in the plot)
b.predict <- predict(Model.object,newdata=lm.exercise.data.predict4x,type="response",se.fit=TRUE) # predict function given a new set of data
print(b.predict) # there are 4 different sub-objects in this object. The interesting ones for now are "fit", i.e. the predictions (b.predict$fit), and "se.fit", the standard error of "fit" (b.predict$se.fit)
b.predict.ci <- predict(Model.object,newdata=lm.exercise.data.predict4x,interval="predict") # the predict function is used in a slightly different way to give a prediction interval for new observations
print(b.predict.ci) # this is a matrix with 3 columns. The first, "fit", is similar to b.predict$fit; "lwr" and "upr" are the lower and upper bounds of the 95% prediction interval for observations around the fit
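# a sketch of where the wider magenta band (plotted below) comes from: the prediction
# interval adds the residual variance (residual.scale^2) to the variance of the fitted
# mean (se.fit^2) before taking the t quantile, so it is wider than the blue confidence band
lwr.check <- b.predict$fit - qt(0.975, df=b.predict$df)*sqrt(b.predict$se.fit^2 + b.predict$residual.scale^2)
print(head(cbind(lwr.check, b.predict.ci[,"lwr"]))) # the two columns should match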
# plotting the regression line for "b" vs. observations with the confidence interval for the regression line (blue) and the prediction interval for the observations (magenta)
win.graph() # new graphical window
par(mfrow=c(1,2)) # setting up two figures in the same window
plot(y~x,data=lm.exercise.data,cex=2,ylim=c(-15,15)) # another way to use plot. Instead of plot("horizontal axis","vertical axis"), one can use plot("vertical axis"~"horizontal axis"), specifying with the data argument in which dataset to find the corresponding values
lines(seq(0,100,4),b.predict$fit,col="red",lwd=2) # adding lines to the current plot (use of plot would create another independent figure)
lines(seq(0,100,4),b.predict$fit-1.96*b.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the lower confidence interval at 95% (fit-1.96*se.fit)
lines(seq(0,100,4),b.predict$fit+1.96*b.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the upper confidence interval at 95% (fit+1.96*se.fit)
lines(seq(0,100,4),b.predict.ci[,2],col="magenta",lwd=1) # here we use the "lwr" column (b.predict.ci[,2]) to plot the lower bound of the 95% prediction interval
lines(seq(0,100,4),b.predict.ci[,3],col="magenta",lwd=1) # here we use the "upr" column (b.predict.ci[,3]) to plot the upper bound of the 95% prediction interval
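# a sketch of a legend for the panel above, matching the colour coding used in the lines
legend("topleft", legend=c("observations","regression line","95% CI of the fitted mean","95% prediction interval"),
       col=c("black","red","blue","magenta"), pch=c(1,NA,NA,NA), lty=c(NA,1,1,1), lwd=c(NA,2,1,1), cex=0.8)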
# Then we establish a new data set where we allow the other explanatory variable (here z) to vary
lm.exercise.data.predict4z <- data.frame(x=rep(mean(lm.exercise.data$x),26),z=seq(-4,6,(4/10)))
print(lm.exercise.data.predict4z) # display the new data. z is varying from -4 to 6 by increments of 0.4
# x is held constant at its mean value, so its contribution to the predictions is constant
c.predict <- predict(Model.object,newdata=lm.exercise.data.predict4z,type="response",se.fit=TRUE) # predict function given a new set of data
print(c.predict) # there are 4 different sub-objects in this object. The interesting ones for now are "fit", i.e. the predictions (c.predict$fit), and "se.fit", the standard error of "fit" (c.predict$se.fit)
c.predict.ci <- predict(Model.object,newdata=lm.exercise.data.predict4z,interval="predict") # the predict function is used in a slightly different way to give a prediction interval for new observations
print(c.predict.ci) # this is a matrix with 3 columns. The first, "fit", is similar to c.predict$fit; "lwr" and "upr" are the lower and upper bounds of the 95% prediction interval for observations around the fit
# plotting the regression line for "c" vs. observations with the confidence interval for the regression line (blue) and the prediction interval for the observations (magenta)
plot(y~z,data=lm.exercise.data,cex=2,ylim=c(-15,15)) # same formula interface as for the x panel above
lines(lm.exercise.data.predict4z$z,c.predict$fit,type="l",col="red",lwd=2) # adding lines to the current plot (use of plot would create another independent figure)
lines(lm.exercise.data.predict4z$z,c.predict$fit-1.96*c.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the lower confidence interval at 95% (fit-1.96*se.fit)
lines(lm.exercise.data.predict4z$z,c.predict$fit+1.96*c.predict$se.fit,col="blue",lwd=1) # here we use "fit" and "se.fit" to calculate the upper confidence interval at 95% (fit+1.96*se.fit)
lines(lm.exercise.data.predict4z$z,c.predict.ci[,2],col="magenta",lwd=1) # here we use the "lwr" column (c.predict.ci[,2]) to plot the lower bound of the 95% prediction interval
lines(lm.exercise.data.predict4z$z,c.predict.ci[,3],col="magenta",lwd=1) # here we use the "upr" column (c.predict.ci[,3]) to plot the upper bound of the 95% prediction interval
######################## Extra material ############################
##### Same procedure but with centered explanatory variables #######
####################################################################
#lm.exercise.data$y.centered <- lm.exercise.data$y - mean(lm.exercise.data$y)
lm.exercise.data$x.centered <- lm.exercise.data$x - mean(lm.exercise.data$x)
lm.exercise.data$z.centered <- lm.exercise.data$z - mean(lm.exercise.data$z)
Model.object.centered <- lm(y~x.centered+z.centered,data=lm.exercise.data)
summary(Model.object.centered)
#Call:
#lm(formula = y ~ x.centered + z.centered, data = lm.exercise.data)
#
#Residuals:
# Min 1Q Median 3Q Max
#-5.6924 -2.4609 0.0036 1.9967 6.0810
#
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) -2.722e-16 6.117e-01 0.000 1.0000
#x.centered 0.05489 0.02161 2.540 0.0163 *
#z.centered -1.46450 0.29388 -4.983 2.25e-05 ***
#---
#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 3.567 on 31 degrees of freedom
#Multiple R-squared: 0.5665, Adjusted R-squared: 0.5386
#F-statistic: 20.26 on 2 and 31 DF, p-value: 2.359e-06
# same "Estimate" and "Standard error" as previous regression, P-value are the same as well
# centering the data has changed the "Intercept" since now the "0" value for x and z is not the raw "0" but the corresponding predicted value for the mean values of "x" and "z"
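# a quick check (a sketch): rerunning the script as written, the centered-model intercept
# should equal the mean of the response
c(intercept=unname(coef(Model.object.centered)["(Intercept)"]), mean.y=mean(lm.exercise.data$y))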
# extracting estimates for "a.centered", "b.centered" and "c.centered"
a.centered <- summary(Model.object.centered)$coefficients["(Intercept)","Estimate"]
b.centered <- summary(Model.object.centered)$coefficients["x.centered","Estimate"]
c.centered <- summary(Model.object.centered)$coefficients["z.centered","Estimate"]
# extracting standard errors for "a.centered", "b.centered" and "c.centered"
a.centered.se <- summary(Model.object.centered)$coefficients["(Intercept)","Std. Error"]
b.centered.se <- summary(Model.object.centered)$coefficients["x.centered","Std. Error"]
c.centered.se <- summary(Model.object.centered)$coefficients["z.centered","Std. Error"]
# calculating the 95% confidence intervals for "a.centered", "b.centered" and "c.centered"
a.centered.ci.low <- a.centered - 1.96*a.centered.se
a.centered.ci.high <- a.centered + 1.96*a.centered.se
b.centered.ci.low <- b.centered - 1.96*b.centered.se
b.centered.ci.high <- b.centered + 1.96*b.centered.se
c.centered.ci.low <- c.centered - 1.96*c.centered.se
c.centered.ci.high <- c.centered + 1.96*c.centered.se
x.c.grid <- seq(0,100,4) - mean(lm.exercise.data$x) # the same x grid as in the uncentered analysis, expressed on the centered scale
b.centered.predict <- predict(Model.object.centered,newdata=data.frame(x.centered=x.c.grid,z.centered=rep(0,26)),type="response",se.fit=TRUE) # the centered model must be fed centered newdata (predicting from Model.object with raw x and z and plotting on the centered axis would shift the lines by mean(x))
b.centered.predict.ci <- predict(Model.object.centered,newdata=data.frame(x.centered=x.c.grid,z.centered=rep(0,26)),interval="predict")
# plotting the regression line for "b.centered" vs. observations with the confidence interval for the regression line (blue) and the prediction interval for the observations (magenta)
win.graph()
par(mfrow=c(1,2))
plot(y~x.centered,data=lm.exercise.data,cex=2,ylim=c(-15,15))
lines(x.c.grid,b.centered.predict$fit,col="red",lwd=2)
lines(x.c.grid,b.centered.predict$fit-1.96*b.centered.predict$se.fit,col="blue",lwd=1)
lines(x.c.grid,b.centered.predict$fit+1.96*b.centered.predict$se.fit,col="blue",lwd=1)
lines(x.c.grid,b.centered.predict.ci[,2],col="magenta",lwd=1)
lines(x.c.grid,b.centered.predict.ci[,3],col="magenta",lwd=1)
# plotting the regression line for "c.centered" vs. observations with the confidence interval for the regression line (blue) and the prediction interval for the observations (magenta)
z.c.grid <- seq(-4,6,(4/10)) - mean(lm.exercise.data$z) # the same z grid as in the uncentered analysis, expressed on the centered scale
c.centered.predict <- predict(Model.object.centered,newdata=data.frame(x.centered=rep(0,26),z.centered=z.c.grid),type="response",se.fit=TRUE)
c.centered.predict.ci <- predict(Model.object.centered,newdata=data.frame(x.centered=rep(0,26),z.centered=z.c.grid),interval="predict")
plot(y~z.centered,data=lm.exercise.data,cex=2,ylim=c(-15,15))
lines(z.c.grid,c.centered.predict$fit,col="red",lwd=2)
lines(z.c.grid,c.centered.predict$fit-1.96*c.centered.predict$se.fit,col="blue",lwd=1)
lines(z.c.grid,c.centered.predict$fit+1.96*c.centered.predict$se.fit,col="blue",lwd=1)
lines(z.c.grid,c.centered.predict.ci[,2],col="magenta",lwd=1)
lines(z.c.grid,c.centered.predict.ci[,3],col="magenta",lwd=1)