Analysis of possum
bodyweight data
poss <- possum
poss$region <- factor(poss$Location)
levels(poss$region) <- c("W.A", "S.A", "N.T", "QuL", "NSW", "Vic", "Tas")
colnames(poss)<-c("y","z","Location", "x")
head(poss)
## y z Location x
## 1 3.5 89.0 1 W.A
## 2 3.3 91.5 1 W.A
## 3 3.5 95.5 1 W.A
## 4 3.1 92.0 1 W.A
## 5 2.8 85.5 1 W.A
## 6 3.0 90.5 1 W.A
# Draw side by side boxplots
boxplot(y~x, data=poss, col=2:8, xlab="region", ylab="Weight")
# Obtain scatter plot
# Start with a skeleton plot
plot(poss$z, poss$y, type="n", xlab="Length", ylab="Weight")
# Add points for the seven regions
for (i in 1:7) {
points(poss$z[poss$Location==i],poss$y[poss$Location==i],type="p", pch=as.character(i), col=i)
}
## Add legends
legend(x=76, y=4.2, legend=paste(as.character(1:7), levels(poss$x)), lty=1:7, col=1:7)
# Start modelling
#Fit the model with interaction.
poss.lm1<-lm(y~z+x+z:x,data=poss)
summary(poss.lm1)
##
## Call:
## lm(formula = y ~ z + x + z:x, data = poss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67953 -0.15793 0.01999 0.14591 0.78255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.5842639 1.5008154 -2.388 0.0191 *
## z 0.0748795 0.0167202 4.478 2.27e-05 ***
## xS.A 0.3283443 2.3839136 0.138 0.8908
## xN.T 6.0527549 3.3898871 1.786 0.0777 .
## xQuL -5.9670098 4.5482536 -1.312 0.1930
## xNSW -2.4035332 2.7983463 -0.859 0.3927
## xVic 0.8984502 2.4135144 0.372 0.7106
## xTas -0.1263706 2.4142962 -0.052 0.9584
## z:xS.A -0.0031036 0.0280484 -0.111 0.9121
## z:xN.T -0.0675469 0.0383301 -1.762 0.0815 .
## z:xQuL 0.0663129 0.0494622 1.341 0.1835
## z:xNSW 0.0256881 0.0318912 0.805 0.4227
## z:xVic -0.0141738 0.0279034 -0.508 0.6128
## z:xTas -0.0003129 0.0276672 -0.011 0.9910
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2828 on 87 degrees of freedom
## Multiple R-squared: 0.6731, Adjusted R-squared: 0.6243
## F-statistic: 13.78 on 13 and 87 DF, p-value: 4.8e-16
plot(poss$z, poss$y,type="n", xlab="Length", ylab="Weight")
for (i in 1:7) {
lines(poss$z[poss$Location==i],poss.lm1$fit[poss$Location==i],type="l",
lty=i, col=i, lwd=1.8)
points(poss$z[poss$Location==i],poss$y[poss$Location==i],type="p",
pch=as.character(i), col=i)
}
poss.lm0 <- lm(y~z,data=poss)
abline(poss.lm0, lwd=3, col=9)
# Has drawn the seven parallel regression lines
legend(x=76, y=4.2, legend=paste(as.character(1:7), levels(poss$x)),
lty=1:7, col=1:7)
n <- length(possum$Body_Weight)
# Wrong model since Location is not a numeric covariate
wrong.lm <- lm(Body_Weight~Location, data=possum)
summary(wrong.lm)
##
## Call:
## lm(formula = Body_Weight ~ Location, data = possum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20753 -0.21060 0.01309 0.21309 1.35124
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.16630 0.07740 40.910 < 2e-16 ***
## Location -0.07939 0.01801 -4.409 2.64e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4239 on 99 degrees of freedom
## Multiple R-squared: 0.1641, Adjusted R-squared: 0.1557
## F-statistic: 19.44 on 1 and 99 DF, p-value: 2.643e-05
nis <- table(possum$Location)
meanwts <- tapply(possum$Body_Weight, possum$Location, mean)
varwts <- tapply(possum$Body_Weight, possum$Location, var)
datasums <- data.frame(nis=nis, mean=meanwts, var=varwts)
datasums <- data.frame(nis=nis, mean=meanwts, var=varwts)
modelss <- sum(datasums[,2] * (meanwts - mean(meanwts))^2)
residss <- sum( (datasums[,2] - 1) * varwts)
fvalue <- (modelss/6) / (residss/94)
fcritical <- qf(0.95, df1= 6, df2=94)
x <- seq(from=0, to=12, length=200)
y <- df(x, df1=6, df2=94)
plot(x, y, type="l", xlab="", ylab="Density of F(6, 94)", col=4)
abline(v=fcritical, lty=3, col=3)
abline(v=fvalue, lty=2, col=2)
pvalue <- 1-pf(fvalue, df1=6, df2=94)
### Doing the above in R
# Convert the Location column to a factor
possum$Location <- as.factor(possum$Location)
summary(possum) # Now Location is a factor
## Body_Weight Length Location
## Min. :1.80 Min. :75.00 1:33
## 1st Qu.:2.60 1st Qu.:84.00 2:12
## Median :2.80 Median :88.00 3: 7
## Mean :2.88 Mean :87.16 4: 6
## 3rd Qu.:3.20 3rd Qu.:90.00 5:13
## Max. :4.20 Max. :96.50 6:13
## 7:17
# Put the identifiability constraint:
options(contrasts=c("contr.treatment", "contr.poly"))
colnames(possum) <- c("y", "z", "x")
# Fit model M1
possum.lm1 <- lm(y~x, data=possum)
summary(possum.lm1)
##
## Call:
## lm(formula = y ~ x, data = possum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84167 -0.23333 0.01765 0.21765 0.96667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.13333 0.06402 48.943 < 2e-16 ***
## x2 -0.49167 0.12397 -3.966 0.000143 ***
## x3 -0.01905 0.15304 -0.124 0.901214
## x4 0.33333 0.16322 2.042 0.043929 *
## x5 -0.37949 0.12043 -3.151 0.002182 **
## x6 -0.68718 0.12043 -5.706 1.34e-07 ***
## x7 -0.45098 0.10979 -4.108 8.53e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3678 on 94 degrees of freedom
## Multiple R-squared: 0.4026, Adjusted R-squared: 0.3644
## F-statistic: 10.56 on 6 and 94 DF, p-value: 6.204e-09
anova(possum.lm1)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 6 8.5667 1.42778 10.556 6.204e-09 ***
## Residuals 94 12.7137 0.13525
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
possum.lm2 <- lm(y~z, data=poss)
summary(possum.lm2)
##
## Call:
## lm(formula = y ~ z, data = poss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77843 -0.19072 -0.02632 0.20928 0.82708
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.284148 0.633108 -6.767 9.36e-10 ***
## z 0.082202 0.007256 11.329 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3059 on 99 degrees of freedom
## Multiple R-squared: 0.5646, Adjusted R-squared: 0.5602
## F-statistic: 128.4 on 1 and 99 DF, p-value: < 2.2e-16
anova(possum.lm2)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## z 1 12.0139 12.0139 128.35 < 2.2e-16 ***
## Residuals 99 9.2665 0.0936
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Include both location and length but no interaction
possum.lm3 <- lm(y~x+z, data=poss)
summary(possum.lm3)
##
## Call:
## lm(formula = y ~ x + z, data = poss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67242 -0.19970 0.01845 0.17516 0.78209
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.390403 0.816320 -4.153 7.27e-05 ***
## xS.A 0.057028 0.117868 0.484 0.62964
## xN.T 0.100261 0.119312 0.840 0.40288
## xQuL 0.152418 0.128260 1.188 0.23772
## xNSW -0.176672 0.096536 -1.830 0.07044 .
## xVic -0.310958 0.104334 -2.980 0.00367 **
## xTas -0.161791 0.092289 -1.753 0.08288 .
## z 0.072719 0.009083 8.006 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2845 on 93 degrees of freedom
## Multiple R-squared: 0.6463, Adjusted R-squared: 0.6197
## F-statistic: 24.28 on 7 and 93 DF, p-value: < 2.2e-16
anova(possum.lm3)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 6 8.5667 1.4278 17.643 1.521e-13 ***
## z 1 5.1876 5.1876 64.102 3.287e-12 ***
## Residuals 93 7.5262 0.0809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Include interaction effect
possum.lm4 <- lm(y~x+z+x:z, data=poss)
summary(possum.lm4)
##
## Call:
## lm(formula = y ~ x + z + x:z, data = poss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67953 -0.15793 0.01999 0.14591 0.78255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.5842639 1.5008154 -2.388 0.0191 *
## xS.A 0.3283443 2.3839136 0.138 0.8908
## xN.T 6.0527549 3.3898871 1.786 0.0777 .
## xQuL -5.9670098 4.5482536 -1.312 0.1930
## xNSW -2.4035332 2.7983463 -0.859 0.3927
## xVic 0.8984502 2.4135144 0.372 0.7106
## xTas -0.1263706 2.4142962 -0.052 0.9584
## z 0.0748795 0.0167202 4.478 2.27e-05 ***
## xS.A:z -0.0031036 0.0280484 -0.111 0.9121
## xN.T:z -0.0675469 0.0383301 -1.762 0.0815 .
## xQuL:z 0.0663129 0.0494622 1.341 0.1835
## xNSW:z 0.0256881 0.0318912 0.805 0.4227
## xVic:z -0.0141738 0.0279034 -0.508 0.6128
## xTas:z -0.0003129 0.0276672 -0.011 0.9910
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2828 on 87 degrees of freedom
## Multiple R-squared: 0.6731, Adjusted R-squared: 0.6243
## F-statistic: 13.78 on 13 and 87 DF, p-value: 4.8e-16
anova(possum.lm4)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 6 8.5667 1.4278 17.8561 2.194e-13 ***
## z 1 5.1876 5.1876 64.8768 3.833e-12 ***
## x:z 6 0.5696 0.0949 1.1873 0.3206
## Residuals 87 6.9565 0.0800
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(possum.lm2, possum.lm3)
## Analysis of Variance Table
##
## Model 1: y ~ z
## Model 2: y ~ x + z
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 99 9.2665
## 2 93 7.5262 6 1.7404 3.5842 0.003065 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Check the diagnostics for M3
plot(possum.lm3$fit, possum.lm3$res,xlab="Fitted values",ylab="Residuals",
main="Anscombe plot")
abline(h=0)
qqnorm(possum.lm3$res,main="Normal probability plot")
qqline(possum.lm3$res)
Analysis of gas mileage
data
summary(gasmileage)
## mileage model
## Min. :22.00 Length:11
## 1st Qu.:24.00 Class :character
## Median :27.00 Mode :character
## Mean :26.55
## 3rd Qu.:28.50
## Max. :32.00
y <- c(22, 26, 28, 24, 29, 29, 32, 28, 23, 24)
xx <- c(1,1,2,2,2,3,3,3,4,4)
# Plot the observations
plot(xx, y, col="red", pch="*", xlab="Model", ylab="Mileage")
# Method1: Hand calculation
ni <- c(2, 3, 3, 2)
means <- tapply(y, xx, mean)
vars <- tapply(y, xx, var)
round(rbind(means, vars), 2)
## 1 2 3 4
## means 24 27 29.67 23.5
## vars 8 7 4.33 0.5
sum(y^2) # gives 7115
## [1] 7115
totalSS <- sum(y^2) - 10 * (mean(y))^2 # gives 92.5
RSSf <- sum(vars*(ni-1)) # gives 31.17
groupSS <- totalSS - RSSf # gives 61.3331.17/6
meangroupSS <- groupSS/3 # gives 20.44
meanErrorSS <- RSSf/6 # gives 5.194
Fvalue <- meangroupSS/meanErrorSS # gives 3.936
pvalue <- 1-pf(Fvalue, df1=3, df2=6)
#### Method 2: Illustrate using dummy variables
#################################
#Create the design matrix X for the full regression model
g <- 4
n1 <- 2
n2 <- 3
n3 <- 3
n4 <- 2
n <- n1+n2+n3+n4
X <- matrix(0, ncol=g, nrow=n) #Set X as a zero matrix initially
X[1:n1,1] <- 1 #Determine the first column of X
X[(n1+1):(n1+n2),2] <- 1 #the 2nd column
X[(n1+n2+1):(n1+n2+n3),3] <- 1 #the 3rd
X[(n1+n2+n3+1):(n1+n2+n3+n4),4] <- 1 #the 4th
#################################
####Fitting the full model####
#################################
#Estimation
XtXinv <- solve(t(X)%*%X)
betahat <- XtXinv %*%t(X)%*%y #Estimation of the coefficients
Yhat <- X%*%betahat #Fitted Y values
Resids <- y - Yhat #Residuals
SSE <- sum(Resids^2) #Error sum of squares
S2hat <- SSE/(n-g) #Estimation of sigma-square; mean square for error
Sigmahat <- sqrt(S2hat)
##############################################################
####Fitting the reduced model -- the 4 means are equal #####
##############################################################
Xr <- matrix(1, ncol=1, nrow=n)
kr <- dim(Xr)[2]
#Estimation
Varr <- solve(t(Xr)%*%Xr)
hbetar <- solve(t(Xr)%*%Xr)%*%t(Xr)%*% y #Estimation of the coefficients
hYr = Xr%*%hbetar #Fitted Y values
Resir <- y - hYr #Residuals
SSEr <- sum(Resir^2) #Total sum of squares
###################################################################
####F-test for comparing the reduced model with the full model ####
###################################################################
FStat <- ((SSEr-SSE)/(g-kr))/(SSE/(n-g)) #The test statistic of the F-test
alpha <- 0.05
Critical_value_F <- qf(1-alpha, g-kr,n-g) #The critical constant of F-test
pvalue_F <- 1-pf(FStat,g-kr, n-g) #p-value of F-test
modelA <- c(22, 26)
modelB <- c(28, 24, 29)
modelC <- c(29, 32, 28)
modelD <- c(23, 24)
SSerror = sum( (modelA-mean(modelA))^2 ) + sum( (modelB-mean(modelB))^2 )
+ sum( (modelC-mean(modelC))^2 ) + sum( (modelD-mean(modelD))^2 )
## [1] 9.166667
SStotal <- sum( (y-mean(y))^2 )
SSgroup <- SStotal-SSerror
####
#### Method 3: Use the built-in function lm directly
#####################################
aa <- "modelA"
bb <- "modelB"
cc <- "modelC"
dd <- "modelD"
Expl <- c(aa,aa,bb,bb,bb,cc,cc,cc,dd,dd)
is.factor(Expl)
## [1] FALSE
Expl <- factor(Expl)
model1 <- lm(y~Expl)
summary(model1)
##
## Call:
## lm(formula = y ~ Expl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.000 -1.417 0.000 1.750 2.333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.000 1.612 14.892 5.77e-06 ***
## ExplmodelB 3.000 2.081 1.442 0.1994
## ExplmodelC 5.667 2.081 2.724 0.0345 *
## ExplmodelD -0.500 2.279 -0.219 0.8336
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.279 on 6 degrees of freedom
## Multiple R-squared: 0.6631, Adjusted R-squared: 0.4946
## F-statistic: 3.936 on 3 and 6 DF, p-value: 0.07227
anova(model1)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## Expl 3 61.333 20.4444 3.9358 0.07227 .
## Residuals 6 31.167 5.1944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
###Alternatively ###
xxf <- factor(xx)
is.factor(xxf)
## [1] TRUE
model2 <- lm(y~xxf)
summary(model2)
##
## Call:
## lm(formula = y ~ xxf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.000 -1.417 0.000 1.750 2.333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.000 1.612 14.892 5.77e-06 ***
## xxf2 3.000 2.081 1.442 0.1994
## xxf3 5.667 2.081 2.724 0.0345 *
## xxf4 -0.500 2.279 -0.219 0.8336
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.279 on 6 degrees of freedom
## Multiple R-squared: 0.6631, Adjusted R-squared: 0.4946
## F-statistic: 3.936 on 3 and 6 DF, p-value: 0.07227
anova(model2)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## xxf 3 61.333 20.4444 3.9358 0.07227 .
## Residuals 6 31.167 5.1944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1