Prof Sujit Sahu - Statistics and Data Science

Chapter 18: Multiple Linear Regression Model

# Please load the packages. 
library(ggplot2)
library(ipsRdbs)
# Start with a clean workspace 
rm(list=ls())

 summary(rice)
##      Yield           Days     
##  Min.   :2508   Min.   :16.0  
##  1st Qu.:3092   1st Qu.:23.5  
##  Median :3318   Median :31.0  
##  Mean   :3283   Mean   :31.0  
##  3rd Qu.:3549   3rd Qu.:38.5  
##  Max.   :3883   Max.   :46.0
 plot(rice$Days, rice$Yield, pch="*", xlab="Days", ylab="Yield")

 rice$daymin31 <- rice$Days-31
 rice.lm <- lm(Yield ~ daymin31, data=rice)
 summary(rice.lm)
## 
## Call:
## lm(formula = Yield ~ daymin31, data = rice)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -691.07 -217.65   45.85  271.77  612.14 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3283.12     103.95  31.582 2.05e-14 ***
## daymin31       12.26      11.28   1.088    0.295    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 415.8 on 14 degrees of freedom
## Multiple R-squared:  0.07791,    Adjusted R-squared:  0.01205 
## F-statistic: 1.183 on 1 and 14 DF,  p-value: 0.2951
 # Check the diagnostics 
 plot(rice.lm$fit, rice.lm$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)

 # Should be a random scatter
 # Needs a quadratic term
 
 qqnorm(rice.lm$res)
 qqline(rice.lm$res)

 rice.lm2 <- lm(Yield ~ daymin31 + I(daymin31^2) ,data=rice)
 old.par <- par(no.readonly = TRUE)
 par(mfrow=c(1, 2))
 plot(rice.lm2$fit, rice.lm2$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)
 # Should be a random scatter 
 # Much better plot!
 qqnorm(rice.lm2$res)
 qqline(rice.lm2$res)

 summary(rice.lm2)
## 
## Call:
## lm(formula = Yield ~ daymin31 + I(daymin31^2), data = rice)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -303.96 -118.11   13.86  115.67  319.06 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3668.6682    76.7086  47.826 5.33e-16 ***
## daymin31        12.2632     5.5286   2.218    0.045 *  
## I(daymin31^2)   -4.5358     0.6744  -6.726 1.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 203.9 on 13 degrees of freedom
## Multiple R-squared:  0.7942, Adjusted R-squared:  0.7625 
## F-statistic: 25.08 on 2 and 13 DF,  p-value: 3.452e-05
 par(old.par) # par(mfrow=c(1,1))
 plot(rice$Days,  rice$Yield, xlab="Days", ylab="Yield")
 lines(rice$Days, rice.lm2$fit, lty=1, col=3)

 rice.lm3 <- lm(Yield ~ daymin31 + I(daymin31^2)+I(daymin31^3) , data=rice)
 #check the diagnostics 
 summary(rice.lm3) # Will print the summary of the fitted model 
## 
## Call:
## lm(formula = Yield ~ daymin31 + I(daymin31^2) + I(daymin31^3), 
##     data = rice)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -281.97 -113.21   -6.11   97.75  330.92 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3668.66815   79.32668  46.248 6.82e-15 ***
## daymin31        17.52493   14.49451   1.209     0.25    
## I(daymin31^2)   -4.53580    0.69743  -6.504 2.92e-05 ***
## I(daymin31^3)   -0.03457    0.08751  -0.395     0.70    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 210.8 on 12 degrees of freedom
## Multiple R-squared:  0.7968, Adjusted R-squared:  0.746 
## F-statistic: 15.68 on 3 and 12 DF,  p-value: 0.0001876
 #### Predict at a new value of Days=31.1465
 
 # Create a new data set called new
 new <- data.frame(daymin31=32.1465-31)
 
 a <- predict(rice.lm2, newdata=new, se.fit=TRUE) 
 # Confidence interval for the mean of rice yield  at day=31.1465
 a <- predict(rice.lm2, newdata=new, interval="confidence") 
 a
##        fit      lwr      upr
## 1 3676.766 3511.904 3841.628
 
 b <- predict(rice.lm2, newdata=new, interval="prediction") 
 b
##        fit      lwr      upr
## 1 3676.766 3206.461 4147.071

summary(cheese)
##      Taste         AceticAcid          H2S             LacticAcid   
##  Min.   : 0.70   Min.   : 87.97   Min.   :   20.01   Min.   :0.860  
##  1st Qu.:13.55   1st Qu.:188.20   1st Qu.:   53.74   1st Qu.:1.250  
##  Median :20.95   Median :227.03   Median :  207.46   Median :1.450  
##  Mean   :24.53   Mean   :284.18   Mean   : 2702.98   Mean   :1.442  
##  3rd Qu.:36.70   3rd Qu.:358.84   3rd Qu.: 1950.35   3rd Qu.:1.667  
##  Max.   :57.20   Max.   :637.78   Max.   :26876.31   Max.   :2.010  
##      logH2S      
##  Min.   : 2.996  
##  1st Qu.: 3.977  
##  Median : 5.329  
##  Mean   : 5.942  
##  3rd Qu.: 7.575  
##  Max.   :10.199
 pairs(cheese)

 GGally::ggpairs(data=cheese)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

 cheese.lm <- lm(Taste ~ AceticAcid +  LacticAcid + logH2S, data=cheese, subset=2:30)
 # Check the diagnostics 
 plot(cheese.lm$fit, cheese.lm$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)

 # Should be a random scatter
 qqnorm(cheese.lm$res)
 qqline(cheese.lm$res)

 summary(cheese.lm)
## 
## Call:
## lm(formula = Taste ~ AceticAcid + LacticAcid + logH2S, data = cheese, 
##     subset = 2:30)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0718  -5.9432   0.3657   5.3919  25.0876 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -31.41789    9.95702  -3.155  0.00414 **
## AceticAcid    0.00478    0.01484   0.322  0.74997   
## LacticAcid   21.70560    8.69079   2.498  0.01945 * 
## logH2S        3.84994    1.21284   3.174  0.00396 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.06 on 25 degrees of freedom
## Multiple R-squared:  0.6631, Adjusted R-squared:  0.6227 
## F-statistic:  16.4 on 3 and 25 DF,  p-value: 4.222e-06
 cheese.lm2 <- lm(Taste ~ LacticAcid + logH2S, data=cheese)
 # Check the diagnostics 
 plot(cheese.lm2$fit, cheese.lm2$res, xlab="Fitted values", ylab = "Residuals")
 abline(h=0)

 qqnorm(cheese.lm2$res)
 qqline(cheese.lm2$res)

 summary(cheese.lm2)
## 
## Call:
## lm(formula = Taste ~ LacticAcid + logH2S, data = cheese)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.343  -6.529  -1.164   4.844  25.618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -27.592      8.982  -3.072  0.00481 **
## LacticAcid    19.888      7.959   2.499  0.01885 * 
## logH2S         3.946      1.136   3.475  0.00174 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.942 on 27 degrees of freedom
## Multiple R-squared:  0.6517, Adjusted R-squared:  0.6259 
## F-statistic: 25.26 on 2 and 27 DF,  p-value: 6.552e-07
 # How can we predict? 
 newcheese <- data.frame(AceticAcid = 300, LacticAcid = 1.5, logH2S=4)
 cheese.pred <- predict(cheese.lm2, newdata=newcheese, se.fit=TRUE)
 cheese.pred
## $fit
##        1 
## 18.02409 
## 
## $se.fit
## [1] 3.111924
## 
## $df
## [1] 27
## 
## $residual.scale
## [1] 9.942372
 # Obtain confidence interval 
 cheese.pred$fit + c(-1, 1) * qt(0.975, df=27) * cheese.pred$se.fit
## [1] 11.63895 24.40923
 # Using R to predict  
 cheese.pred.conf.limits <- predict(cheese.lm2, newdata=newcheese, interval="confidence")
 cheese.pred.conf.limits
##        fit      lwr      upr
## 1 18.02409 11.63895 24.40923
 # How to find prediction interval 
 cheese.pred.pred.limits <- predict(cheese.lm2, newdata=newcheese, interval="prediction")
 cheese.pred.pred.limits
##        fit       lwr      upr
## 1 18.02409 -3.351891 39.40007


pairs(puffin)

summary(puffin)
##  Nesting_Frequency  Grass_Cover    Mean_Soil_Depth  Slope_Angle   
##  Min.   : 0.000    Min.   : 0.00   Min.   :24.30   Min.   : 2.00  
##  1st Qu.: 0.000    1st Qu.:40.00   1st Qu.:32.75   1st Qu.: 7.25  
##  Median : 7.500    Median :60.00   Median :37.40   Median :10.00  
##  Mean   : 7.684    Mean   :56.45   Mean   :37.72   Mean   :15.00  
##  3rd Qu.:12.750    3rd Qu.:80.00   3rd Qu.:42.83   3rd Qu.:21.25  
##  Max.   :25.000    Max.   :95.00   Max.   :51.40   Max.   :38.00  
##  Distance_from_Edge
##  Min.   : 3.00     
##  1st Qu.:18.00     
##  Median :30.00     
##  Mean   :30.39     
##  3rd Qu.:44.25     
##  Max.   :60.00
puffin$sqrtfreq <- sqrt(puffin$Nesting_Frequency)


puff.sqlm <- lm(sqrtfreq~ Grass_Cover + Mean_Soil_Depth + Slope_Angle
      + Distance_from_Edge, data=puffin) 


#par(mfrow=c(2,1))
qqnorm(puff.sqlm$res,main="Normal probability plot")
qqline(puff.sqlm$res)

plot(puff.sqlm$fit, puff.sqlm$res,xlab="Fitted values",ylab="Residuals", main="Anscombe plot")
abline(h=0)


summary(puff.sqlm)
## 
## Call:
## lm(formula = sqrtfreq ~ Grass_Cover + Mean_Soil_Depth + Slope_Angle + 
##     Distance_from_Edge, data = puffin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17058 -0.28555  0.07289  0.43606  1.08192 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.4176153  0.7479664   5.906 1.27e-06 ***
## Grass_Cover         0.0004179  0.0045698   0.091   0.9277    
## Mean_Soil_Depth     0.0486374  0.0181385   2.681   0.0114 *  
## Slope_Angle        -0.0318777  0.0182695  -1.745   0.0903 .  
## Distance_from_Edge -0.1189727  0.0134970  -8.815 3.45e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6217 on 33 degrees of freedom
## Multiple R-squared:  0.8853, Adjusted R-squared:  0.8714 
## F-statistic: 63.67 on 4 and 33 DF,  p-value: 4.753e-15

#####################################
# F test for two betas at the  same time: 
######################################

puff.sqlm2 <- lm(sqrtfreq~ Mean_Soil_Depth + Distance_from_Edge, data=puffin) 

anova(puff.sqlm)
## Analysis of Variance Table
## 
## Response: sqrtfreq
##                    Df Sum Sq Mean Sq F value    Pr(>F)    
## Grass_Cover         1  7.307   7.307  18.904 0.0001241 ***
## Mean_Soil_Depth     1  0.820   0.820   2.122 0.1546449    
## Slope_Angle         1 60.278  60.278 155.945 4.725e-14 ***
## Distance_from_Edge  1 30.034  30.034  77.700 3.453e-10 ***
## Residuals          33 12.756   0.387                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(puff.sqlm2)
## Analysis of Variance Table
## 
## Response: sqrtfreq
##                    Df Sum Sq Mean Sq  F value Pr(>F)    
## Mean_Soil_Depth     1  0.513   0.513   1.2596 0.2694    
## Distance_from_Edge  1 96.437  96.437 236.9547 <2e-16 ***
## Residuals          35 14.245   0.407                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

fval <-  1/2*(14.245-12.756)/0.387

# 1.924 
 qf(0.95, 2, 33)
## [1] 3.284918
# 3.28
 1-pf(fval, 2, 33)
## [1] 0.1620835
# 0.162

anova(puff.sqlm2, puff.sqlm)
## Analysis of Variance Table
## 
## Model 1: sqrtfreq ~ Mean_Soil_Depth + Distance_from_Edge
## Model 2: sqrtfreq ~ Grass_Cover + Mean_Soil_Depth + Slope_Angle + Distance_from_Edge
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1     35 14.245                          
## 2     33 12.756  2    1.4889 1.926 0.1618


summary(emissions)

 Make              Odometer         Capacity         CS505

Length:54 Min. : 7285 Min. :1.300 Min. : 3.675
Class :character 1st Qu.: 44208 1st Qu.:1.600 1st Qu.: 8.364
Mode :character Median : 80164 Median :2.000 Median :10.598
Mean : 84444 Mean :2.346 Mean :10.949
3rd Qu.:117248 3rd Qu.:2.900 3rd Qu.:13.125
Max. :232574 Max. :5.000 Max. :21.475
T867 H505 ADR27 ADR37
Min. : 1.158 Min. : 1.340 Min. :0.480 Min. :0.340
1st Qu.: 4.188 1st Qu.: 4.884 1st Qu.:1.053 1st Qu.:1.005
Median : 6.769 Median : 8.795 Median :1.475 Median :1.423
Mean : 7.217 Mean : 9.376 Mean :1.525 Mean :1.478
3rd Qu.:10.001 3rd Qu.:12.906 3rd Qu.:1.981 3rd Qu.:2.029
Max. :15.982 Max. :21.465 Max. :2.851 Max. :2.862
logCS505 logT867 logH505 logADR27
Min. :1.302 Min. :0.1466 Min. :0.2927 Min. :-0.73397
1st Qu.:2.124 1st Qu.:1.4322 1st Qu.:1.5850 1st Qu.: 0.05191
Median :2.361 Median :1.9104 Median :2.1742 Median : 0.38857
Mean :2.318 Mean :1.8185 Mean :2.0440 Mean : 0.33895
3rd Qu.:2.575 3rd Qu.:2.3026 3rd Qu.:2.5577 3rd Qu.: 0.68382
Max. :3.067 Max. :2.7715 Max. :3.0664 Max. : 1.04765
logADR37
Min. :-1.078810
1st Qu.: 0.004947
Median : 0.352585
Mean : 0.290615
3rd Qu.: 0.707568
Max. : 1.051611

 
 rawdata <- emissions[, c(8, 4:7)]
 pairs(rawdata)

# Fit the model on the raw scale 
raw.lm <- lm(ADR37 ~ ADR27 + CS505  + T867 + H505, data=rawdata) 
old.par <- par(no.readonly = TRUE)
par(mfrow=c(2,1))
plot(raw.lm$fit, raw.lm$res,xlab="Fitted values",ylab="Residuals", main="Anscombe plot") 
abline(h=0)
qqnorm(raw.lm$res,main="Normal probability plot")
qqline(raw.lm$res)

# summary(raw.lm)
logdata <- log(rawdata)
# This only logs the values but not the column names!
# We can use the following command to change the column names or you can use
# fix(logdata) to do it. 
dimnames(logdata)[[2]] <- c("logADR37", "logCS505", "logT867", "logH505", "logADR27")
pairs(logdata)

log.lm <- lm(logADR37 ~ logADR27 + logCS505  + logT867 + logH505, data=logdata) 
plot(log.lm$fit, log.lm$res,xlab="Fitted values",ylab="Residuals", main="Anscombe plot") 
abline(h=0)
qqnorm(log.lm$res,main="Normal probability plot")
qqline(log.lm$res)

summary(log.lm)

Call: lm(formula = logADR37 ~ logADR27 + logCS505 + logT867 + logH505, data = logdata)

Residuals: Min 1Q Median 3Q Max -0.140390 -0.038389 -0.003712 0.040204 0.153275

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.12997 0.43073 -0.302 0.764119
logADR27 0.94173 0.24140 3.901 0.000292 logCS505 -0.16790 0.16086 -1.044 0.301712
logT867 0.15736 0.08986 1.751 0.086177 .
logH505 0.09997 0.01857 5.385 2.04e-06 — Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 0.06038 on 49 degrees of freedom Multiple R-squared: 0.9852, Adjusted R-squared: 0.984 F-statistic: 815.9 on 4 and 49 DF, p-value: < 2.2e-16

log.lm2 <- lm(logADR37 ~ logADR27 + logT867 + logH505, data=logdata) 
summary(log.lm2)

Call: lm(formula = logADR37 ~ logADR27 + logT867 + logH505, data = logdata)

Residuals: Min 1Q Median 3Q Max -0.157932 -0.039221 -0.004817 0.038672 0.157626

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.57484 0.06231 -9.226 2.25e-12 logADR27 0.69588 0.05289 13.157 < 2e-16 logT867 0.24471 0.03275 7.472 1.10e-09 logH505 0.09030 0.01610 5.608 8.85e-07 — Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 0.06043 on 50 degrees of freedom Multiple R-squared: 0.9849, Adjusted R-squared: 0.984 F-statistic: 1086 on 3 and 50 DF, p-value: < 2.2e-16

plot(log.lm2$fit, log.lm2$res,xlab="Fitted values",ylab="Residuals", main="Anscombe plot") 
abline(h=0)
qqnorm(log.lm2$res,main="Normal probability plot")
qqline(log.lm2$res)

par(old.par)
#####################################
# Multicollinearity Analysis 
######################################
mod.adr27 <-  lm(logADR27 ~ logT867 + logCS505 + logH505, data=logdata) 
summary(mod.adr27) # Multiple R^2 = 0.9936,

Call: lm(formula = logADR27 ~ logT867 + logCS505 + logH505, data = logdata)

Residuals: Min 1Q Median 3Q Max -0.060363 -0.020065 -0.005929 0.015471 0.153280

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.77188 0.02975 -59.56 < 2e-16 logT867 0.36474 0.01052 34.66 < 2e-16 logCS505 0.65020 0.02063 31.52 < 2e-16 * logH505 -0.02901 0.01007 -2.88 0.00584 — Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 0.03537 on 50 degrees of freedom Multiple R-squared: 0.9936, Adjusted R-squared: 0.9932 F-statistic: 2589 on 3 and 50 DF, p-value: < 2.2e-16

mod.t867 <-  lm(logT867 ~ logADR27 + logH505 + logCS505, data=logdata)  
summary(mod.t867) # Multiple R^2 = 0.977,

Call: lm(formula = logT867 ~ logADR27 + logH505 + logCS505, data = logdata)

Residuals: Min 1Q Median 3Q Max -0.46512 -0.04505 0.01989 0.05619 0.16509

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.64307 0.16840 27.572 <2e-16 logADR27 2.63215 0.07594 34.663 <2e-16 logH505 0.07192 0.02739 2.626 0.0114 *
logCS505 -1.66719 0.09218 -18.086 <2e-16 *** — Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 0.09503 on 50 degrees of freedom Multiple R-squared: 0.977, Adjusted R-squared: 0.9757 F-statistic: 709.1 on 3 and 50 DF, p-value: < 2.2e-16

mod.cs505 <-  lm(logCS505 ~ logADR27 + logH505 + logT867, data=logdata)  
summary(mod.cs505) # Multiple R^2 = 0.9837,

Call: lm(formula = logCS505 ~ logADR27 + logH505 + logT867, data = logdata)

Residuals: Min 1Q Median 3Q Max -0.197444 -0.025126 0.008913 0.028843 0.104478

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.64960 0.05473 48.411 < 2e-16 logADR27 1.46430 0.04646 31.519 < 2e-16 logH505 0.05760 0.01415 4.072 0.000166 logT867 -0.52029 0.02877 -18.086 < 2e-16 — Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 0.05309 on 50 degrees of freedom Multiple R-squared: 0.9837, Adjusted R-squared: 0.9828 F-statistic: 1008 on 3 and 50 DF, p-value: < 2.2e-16

mod.h505 <-  lm(logH505 ~ logADR27 + logCS505 + logT867, data=logdata)  
summary(mod.h505) # Multiple R^2 = 0.5784,

Call: lm(formula = logH505 ~ logADR27 + logCS505 + logT867, data = logdata)

Residuals: Min 1Q Median 3Q Max -1.50570 -0.11784 0.07984 0.24783 0.85079

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.3778 3.0009 -3.125 0.002959 ** logADR27 -4.9041 1.7029 -2.880 0.005844 ** logCS505 4.3236 1.0618 4.072 0.000166 ** logT867 1.6848 0.6417 2.626 0.011444
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

Residual standard error: 0.4599 on 50 degrees of freedom Multiple R-squared: 0.5784, Adjusted R-squared: 0.5531 F-statistic: 22.87 on 3 and 50 DF, p-value: 1.849e-09

# Variance inflation factors 
vifs <- c(0.9936, 0.977, 0.9837, 0.5784)
vifs <- 1/(1-vifs) 
#Condition numbers 
X <- logdata 
# X is a copy of logdata 
X[,1] <- 1
# the first column of X is 1
# this is for the intercept 
X <- as.matrix(X) 
# Coerces X to be a matrix
xtx <- t(X) %*% X # Gives X^T X
eigenvalues <- eigen(xtx)$values
kappa <- max(eigenvalues)/min(eigenvalues)
kappa <- sqrt(kappa)
# kappa = 244 is much LARGER than 30!

### Validation statistic
# Fit the log.lm2 model with the first 45 observations  
# use the fitted model to predict the remaining 9 observations 
# Calculate the mean square error validation statistic 
log.lmsub <- lm(logADR37 ~ logADR27 + logT867 + logH505, data=logdata, subset=1:45) 
# Now predict all 54 observations using the fitted model
mod.pred <- predict(log.lmsub, logdata, se.fit=TRUE) 
mod.pred$fit # provides all the 54 predicted values

       1            2            3            4            5            6

0.869733716 0.751945766 0.464377364 0.065106368 -0.750908661 0.676749888 7 8 9 10 11 12 0.863161161 0.533123773 0.724024242 0.101489732 0.758165626 -0.009235144 13 14 15 16 17 18 0.262467812 0.499359749 0.896844370 -0.512513262 0.860758626 0.288778088 19 20 21 22 23 24 0.638085289 0.691680440 0.047967101 -0.125585969 0.542661299 -0.298423547 25 26 27 28 29 30 -0.607263492 0.529236907 0.296110240 0.476668407 -0.343488224 0.286358929 31 32 33 34 35 36 0.732443474 0.740078857 -0.002182982 0.421013296 -0.201423341 0.002081194 37 38 39 40 41 42 0.782644479 -0.305494632 -0.978948479 0.130633957 0.245486987 0.071403922 43 44 45 46 47 48 0.781436122 0.341921707 0.195907253 0.425131377 -0.094842350 0.754611045 49 50 51 52 53 54 0.444664924 0.353051544 0.805738645 -0.303182437 -0.356706826 1.052803842

logdata$pred <- mod.pred$fit
# Get only last 9 
a <- logdata[46:54, ]
validation.residuals <- a$logADR37 - a$pred  
validation.stat <- mean(validation.residuals^2)
validation.stat

[1] 0.005814136

Chapter 18: Multiple Linear Regression Model

Sujit K. Sahu, https://www.sujitsahu.com/

2023-07-05