# Please load the packages.
library(ggplot2)
library(ipsRdbs)
# Start with a clean workspace
rm(list=ls())
summary(bodyfat)
## Skinfold Bodyfat
## Min. : 28.00 Min. : 5.630
## 1st Qu.: 37.52 1st Qu.: 6.968
## Median : 47.70 Median : 8.625
## Mean : 51.42 Mean : 9.251
## 3rd Qu.: 58.15 3rd Qu.:10.010
## Max. :113.50 Max. :19.940
plot(bodyfat$Skinfold, bodyfat$Bodyfat, xlab="Skin", ylab="Fat")
plot(bodyfat$Skinfold, log(bodyfat$Bodyfat), xlab="Skin", ylab="log Fat")
plot(log(bodyfat$Skinfold), log(bodyfat$Bodyfat), xlab="log Skin", ylab="log Fat")
<- par(no.readonly = TRUE)
old.par # par(mfrow=c(2,2)) # draws four plots in a graph
plot(bodyfat$Skinfold, bodyfat$Bodyfat, xlab="Skin", ylab="Fat")
plot(bodyfat$Skinfold, log(bodyfat$Bodyfat), xlab="Skin", ylab="log Fat")
plot(log(bodyfat$Skinfold), log(bodyfat$Bodyfat), xlab="log Skin", ylab="log Fat")
plot(1:5, 1:5, axes=FALSE, xlab="", ylab="", type="n")
text(2, 2, "Log both X and Y")
text(2, 1, "To have the best plot")
# Keep the transformed variables in the data set
$logskin <- log(bodyfat$Skinfold)
bodyfat$logbfat <- log(bodyfat$Bodyfat)
bodyfat$logskin <- log(bodyfat$Skinfold)
bodyfatpar(old.par)
# Create a grouped variable
$cutskin <- cut(log(bodyfat$Skinfold), breaks=6)
bodyfatboxplot(data=bodyfat, Bodyfat~cutskin, col=2:7)
head(bodyfat)
## Skinfold Bodyfat logskin logbfat cutskin
## 1 44.5 8.47 3.795489 2.136531 (3.57,3.8]
## 2 41.8 7.68 3.732896 2.038620 (3.57,3.8]
## 3 33.7 6.16 3.517498 1.818077 (3.33,3.57]
## 4 50.9 8.56 3.929863 2.147100 (3.8,4.03]
## 5 40.5 6.86 3.701302 1.925707 (3.57,3.8]
## 6 51.2 9.40 3.935740 2.240710 (3.8,4.03]
require(ggplot2)
<- ggplot(data=bodyfat, aes(x=cutskin, y=logbfat)) +
p2 geom_boxplot(col=2:7) +
stat_summary(fun=mean, geom="line", aes(group=1), col="blue", linewidth=1) +
labs(x="Skinfold", y="Percentage of log bodyfat",
title="Boxplot of log-bodyfat percentage vs grouped log-skinfold")
plot(p2)
# Calculate summary statistics
$logskin <- log(bodyfat$Skinfold)
bodyfat$logbfat <- log(bodyfat$Bodyfat)
bodyfat<- nrow(bodyfat)
n <- bodyfat$logskin
x <- bodyfat$logbfat
y <- mean(x)
xbar <- mean(y)
ybar <- var(x)
sx2 <- var(y)
sy2 <- cov(x, y)
sxy <- cor(x, y)
r print(list(n=n, xbar=xbar, ybar=ybar, sx2=sx2, sy2=sy2, sxy=sxy, r=r))
$n [1] 102
$xbar [1] 3.883108
$ybar [1] 2.175978
$sx2 [1] 0.1084285
$sy2 [1] 0.09106112
$sxy [1] 0.09566069
$r [1] 0.9627095
# Find parameter estimates by simple calculations in R
<- r * sqrt(sy2/sx2) # calculates estimate of the slope
hatbeta1 <- ybar - hatbeta1 * xbar # calculates estimate of the intercept
hatbeta0 <- y - hatbeta0 - hatbeta1 * x # calculates residuals
rs <- sum(rs^2)/(n-2) # calculates estimate of sigma2
s2 print(list(beta0=hatbeta0, beta1=hatbeta1, sigma2=s2))
## $beta0
## [1] -1.249881
##
## $beta1
## [1] 0.8822466
##
## $sigma2
## [1] 0.006731453
# Verify the above by model fitting
<- lm(logbfat ~ logskin, data=bodyfat)
bfat.lm summary(bfat.lm)
##
## Call:
## lm(formula = logbfat ~ logskin, data = bodyfat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.223408 -0.055661 0.000052 0.056390 0.152984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.24988 0.09661 -12.94 <2e-16 ***
## logskin 0.88225 0.02479 35.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08205 on 100 degrees of freedom
## Multiple R-squared: 0.9268, Adjusted R-squared: 0.9261
## F-statistic: 1266 on 1 and 100 DF, p-value: < 2.2e-16
anova(bfat.lm)
## Analysis of Variance Table
##
## Response: logbfat
## Df Sum Sq Mean Sq F value Pr(>F)
## logskin 1 8.5240 8.5240 1266.3 < 2.2e-16 ***
## Residuals 100 0.6731 0.0067
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Check the diagnostics
plot(bfat.lm$fit, bfat.lm$res, xlab="Fitted values", ylab = "Residuals")
abline(h=0)
### Should be a random scatter
qqnorm(bfat.lm$res)
qqline(bfat.lm$res)
plot(bodyfat$logskin, bodyfat$logbfat, xlab="log Skin", ylab="log Fat")
abline(bfat.lm, col=7)
title("Scatter plot with the fitted Linear Regression line")
# 95% CI for beta(1)
# 0.88225 + c(-1, 1) * qt(0.975, df=100) * 0.02479
# round(0.88225 + c(-1, 1) * qt(0.975, df=100) * 0.02479, 2)
# To test H0: beta1 = 1.
<- (0.88225 -1)/0.02479
tstat <- 2 * (1- pt(abs(tstat), df=100))
pval <- seq(from=-5, to=5, length=500)
x <- dt(x, df=100)
y plot(x, y, xlab="", ylab="", type="l")
title("T-density with df=100")
abline(v=abs(tstat))
abline(h=0)
<- seq(from=abs(tstat), to=10, length=100)
x1 <- rep(0, length=100)
y1 <- x1
x2 <- dt(x1, df=100)
y2 segments(x1, y1, x2, y2)
abline(h=0)
# Predict at a new value of Skinfold=70
# Create a new data set called new
<- data.frame(logskin=log(70))
newx <- predict(bfat.lm, newdata=newx, se.fit=TRUE)
a # Confidence interval for the mean of log bodyfat at skinfold=70
<- predict(bfat.lm, newdata=newx, interval="confidence")
a
a## fit lwr upr
## 1 2.498339 2.474198 2.52248
# Prediction interval for a future log bodyfat at skinfold=70
<- predict(bfat.lm, newdata=newx, interval="prediction")
a
a## fit lwr upr
## 1 2.498339 2.333783 2.662895
#Obtain prediction intervals for the mean
<- predict(bfat.lm, data=bodyfat, interval="confidence")
pred.bfat.clim #prediction intervals for future observation
<- suppressWarnings(predict(bfat.lm, data=bodyfat, interval="prediction"))
pred.bfat.plim plot(bodyfat$logskin, bodyfat$logbfat, xlab="log Skin", ylab="log Fat")
abline(bfat.lm, col=5)
lines(log(bodyfat$Skinfold), pred.bfat.clim[,2], lty=2, col=2)
lines(log(bodyfat$Skinfold), pred.bfat.clim[,3], lty=2, col=2)
lines(log(bodyfat$Skinfold), pred.bfat.plim[,2], lty=4, col=3)
lines(log(bodyfat$Skinfold), pred.bfat.plim[,3], lty=4, col=3)
title("Scatter plot with the fitted line and prediction intervals")
<- c("Fitted line", "95% CI for mean", "95% CI for observation")
symb ## legend(locator(1), legend = symb, lty = c(1, 2, 4), col = c(5, 2, 3))
# Shows where we predicted earlier
abline(v=log(70))