1 Code used to install libraries
# It is more convenient to install the package ipsRdbs
# Please issue the R commands
install.packages("ipsRdbs", dependencies = TRUE)
library(ipsRdbs)
ls("package:ipsRdbs")
# If these did not work please ask a local expert
# to help you with the installation.
# You may also search the internet for the error messages
# given by R.
# From now on it will be assumed that you have been able to
# install the package in your computer. Otherwise, the
# commands below will not work.
# If packages are installed successfully, please load those
library(ggplot2)
library(ipsRdbs)
2 Alternative ways to read data into R
# If installation of the package ipsRdbs is NOT successful, please
# Download .txt data files from # https://github.com/sujit-sahu/ipsRdbs/tree/main/data
# Or you may download the R data file
# from https://github.com/sujit-sahu/bookipsRdbs/blob/main/ipsRdbs.Rdata
# and save it in your computer.
# Then you may load all the data files by issuing the command
load(file="ipsRdbs.Rdata")
# Remember to set the working directory to the location # where the .Rdata file has been saved.
# Read the data sets directly from the book website if you
# were unable to install the R package ipsRdbs.
# It is not necessary to run these commands if the R package
# ipsRdbs has been installed and loaded already.
<- "https://www.sujitsahu.com/ipsRdbs/"
path <- scan(paste0(path, "cfail.txt"))
cfail <- read.csv(paste0(path, "ffood.csv"), head=T)
ffood <- read.table(paste0(path, "wtgain.txt"), head=T) # Reads the weight gain data set
wgain <- read.table(paste0(path, "billionaires.txt"), head=T)# Reads the billionaire data set bill
3 Code used to obtain summary statistics
## Assume that the ipsRdbs package has been installed and
## loaded by the library command:
library(ipsRdbs)
# Prints the data frame ffood
ffood## AM PM
## 1 38 45
## 2 100 62
## 3 64 52
## 4 43 72
## 5 63 81
## 6 59 88
## 7 107 64
## 8 52 75
## 9 86 59
## 10 77 70
## Prints first few rows
head(ffood)
## AM PM
## 1 38 45
## 2 100 62
## 3 64 52
## 4 43 72
## 5 63 81
## 6 59 88
## Prints last few rows
tail(ffood)
## AM PM
## 5 63 81
## 6 59 88
## 7 107 64
## 8 52 75
## 9 86 59
## 10 77 70
# Prints the dimension
dim(ffood)
## [1] 10 2
names(ffood) # Prints the column names of the data set
## [1] "AM" "PM"
$AM # Prints the values in the AM column
ffood## [1] 38 100 64 43 63 59 107 52 86 77
1,] # Prints the first row and all columns
ffood[## AM PM
## 1 38 45
1] # Prints the first column and all rows
ffood[,## [1] 38 100 64 43 63 59 107 52 86 77
1:2, ] # Prints the first two rows and all columns
ffood[## AM PM
## 1 38 45
## 2 100 62
1, 2] # Prints the first row second column entry
ffood[## [1] 45
# Prints the summaries
summary(ffood)
## AM PM
## Min. : 38.00 Min. :45.00
## 1st Qu.: 53.75 1st Qu.:59.75
## Median : 63.50 Median :67.00
## Mean : 68.90 Mean :66.80
## 3rd Qu.: 83.75 3rd Qu.:74.25
## Max. :107.00 Max. :88.00
summary(cfail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 1.00 3.00 3.75 5.00 17.00
summary(wgain)
## student initial final
## Min. : 1.00 Min. :42.64 Min. : 43.54
## 1st Qu.:17.75 1st Qu.:53.86 1st Qu.: 54.32
## Median :34.50 Median :60.78 Median : 60.78
## Mean :34.50 Mean :61.72 Mean : 62.59
## 3rd Qu.:51.25 3rd Qu.:68.04 3rd Qu.: 68.49
## Max. :68.00 Max. :99.79 Max. :101.60
summary(bill)
## wealth age region
## Min. : 1.000 Min. : 7.00 Length:225
## 1st Qu.: 1.300 1st Qu.: 56.00 Class :character
## Median : 1.800 Median : 65.00 Mode :character
## Mean : 2.726 Mean : 64.03
## 3rd Qu.: 3.000 3rd Qu.: 72.00
## Max. :37.000 Max. :102.00
dim(ffood)
## [1] 10 2
names(ffood) # Prints the column names of the data set
## [1] "AM" "PM"
$AM # Prints the values in the AM column
ffood## [1] 38 100 64 43 63 59 107 52 86 77
1,] # Prints the first row and all columns
ffood[## AM PM
## 1 38 45
1] # Prints the first column and all rows
ffood[,## [1] 38 100 64 43 63 59 107 52 86 77
1:2, ] # Prints the first two rows and all columns
ffood[## AM PM
## 1 38 45
## 2 100 62
1, 2] # Prints the first row second column entry
ffood[## [1] 45
table(cfail)
## cfail
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 17
## 12 16 21 12 11 8 7 2 4 2 3 2 2 1 1
var(ffood$AM) # variance of the AM data
## [1] 538.3222
var(c(ffood$AM, ffood$PM))
## [1] 337.2921
# variance of the AM and PM data combined
table(bill$region)
##
## A E M O U
## 37 76 22 28 62
4 Some basic plotting functions
stem(ffood$AM)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 2 | 8
## 4 | 329
## 6 | 347
## 8 | 6
## 10 | 07
stem(cfail)
##
## The decimal point is at the |
##
## 0 | 000000000000
## 1 | 0000000000000000
## 2 | 000000000000000000000
## 3 | 000000000000
## 4 | 00000000000
## 5 | 00000000
## 6 | 0000000
## 7 | 00
## 8 | 0000
## 9 | 00
## 10 | 000
## 11 | 00
## 12 | 00
## 13 | 0
## 14 |
## 15 |
## 16 |
## 17 | 0
hist(cfail)
hist(cfail, xlab="Number of weekly computer failures")
plot(wgain$initial, wgain$final) # plot wgain$final against wgain$initial
abline(0, 1, col="red") # add a 45 degree line
# Nicer and more informative plot
plot(wgain$initial, wgain$final,xlab="Wt in Week 1", ylab="Wt in Week 12", pch="*", las=1) # relabel both axes
abline(0, 1, col="red")
title("A scatterplot of the weights in Week 12 against the weights in Week 1") # add a title
boxplot(cfail)
## Example of programming in R
boxplot(ffood)
boxplot(data=bill, wealth ~ region, col=2:6)
# Get help by issuing the help command
# ?par
table(bill$region)
##
## A E M O U
## 37 76 22 28 62
barplot(table(cfail))
barplot(table(bill$region), col=2:6)
hist(cfail)
hist(cfail, xlab="Number of weekly computer failures")
plot(wgain$initial, wgain$final) # plot wgain$final against wgain$initial
abline(0, 1, col="red") # add a 45 degree line
# Nicer and more informative plot
plot(wgain$initial, wgain$final,xlab="Wt in Week 1", ylab="Wt in Week 12", pch="*", las=1) # relabel both axes
abline(0, 1, col="red")
title("A scatterplot of the weights in Week 12 against the weights in Week 1") # add a title
boxplot(cfail)
# Working with various R data types
<- seq(from=1, to=13, by =3)
x # prints out the help file.
?seq <- c(1,3,5,6,8,21) # if we have to input irregular data.
a1 <- seq(5,25, length=5)
a2 <- c(a1,a2)
a3 <- seq(from=min(a1), to=max(a1), length=10)
a4 <- rep(2, 5)
a5 <- c(1, 3, 9)
a6 <- rep(a6, times=2)
a7 <- rep(a6, each=2)
a8 <- rep(a6, c(2, 3, 1))
a9 cbind(a7, a8, a9) # Can we see the differences between a7, a8 and a9?
## a7 a8 a9
## [1,] 1 1 1
## [2,] 3 1 1
## [3,] 9 3 3
## [4,] 1 3 3
## [5,] 3 9 3
## [6,] 9 9 9
# Create a data frame called dframe by issuing the command:
<- data.frame(x=1:10, y=rnorm(10))
dframe
# You can add a new column to a data frame, dframe say, by issuing:
$xy <- dframe$x * dframe$y
dframe
# Certain statistical operations on vectors result in scalars
mean(dframe$x)
## [1] 5.5
var(dframe$x)
## [1] 9.166667
View(dframe)
<- list(mean=10, sd=3.32, values=5:15)
myresults
<- factor(c("uk", "us", "no", "in", "es", "in"))
citizen
table(citizen)
## citizen
## es in no uk us
## 1 2 1 1 1
levels(citizen)
## [1] "es" "in" "no" "uk" "us"
levels(bill$region) # Assuming you read the billionaire data set already.
## NULL
levels(bill$region) <- c("Asia", "Europe", "Mid-East", "Other", "USA")
>5]
a1[a1## [1] 6 8 21
$wealth> 5
bill## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
$region == "A"
bill## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [133] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [181] FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [193] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
<- bill[bill$wealth>5, ]
bill.wealth.ge5
bill.wealth.ge5## wealth age region
## 1 37.0 50 M
## 2 24.0 88 U
## 3 14.0 64 A
## 4 13.0 63 U
## 5 13.0 66 U
## 6 11.7 72 E
## 7 10.0 71 M
## 8 8.2 77 U
## 9 8.1 68 U
## 10 7.2 66 E
## 11 7.0 69 M
## 12 6.2 36 O
## 13 5.9 49 U
## 14 5.3 73 U
## 15 5.2 52 E
<- bill[ bill$region == "A", ]
bill.region.A
bill.region.A## wealth age region
## 3 14.0 64 A
## 18 4.9 62 A
## 31 4.0 68 A
## 35 4.0 49 A
## 36 3.9 64 A
## 37 3.9 83 A
## 38 3.8 41 A
## 39 3.8 78 A
## 40 3.6 80 A
## 44 3.4 54 A
## 46 3.3 69 A
## 54 3.0 57 A
## 60 2.8 68 A
## 75 2.5 49 A
## 78 2.4 76 A
## 80 2.3 54 A
## 85 2.2 45 A
## 93 2.0 73 A
## 111 1.8 62 A
## 114 1.8 68 A
## 115 1.8 60 A
## 116 1.8 53 A
## 119 1.8 67 A
## 127 1.7 53 A
## 128 1.7 67 A
## 130 1.6 62 A
## 132 1.6 69 A
## 135 1.6 78 A
## 162 1.4 52 A
## 163 1.4 73 A
## 177 1.3 59 A
## 182 1.2 69 A
## 188 1.2 59 A
## 192 1.2 68 A
## 193 1.2 69 A
## 195 1.2 64 A
## 211 1.0 69 A
<- 1:10
x
>3 & x<7
x## [1] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
<3 | x>7
x## [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
<- seq(1, 10, by =2)
a
<- bill[a, ] oddrows
5 Using ggplot2 package for better graphics
# Scatterplot
library(ggplot2)
library(ipsRdbs)
attach(wgain)
<- ggplot(data=wgain, aes(initial, final)) +
g geom_count() +
geom_smooth(method="lm", se=F)
plot(g)
<- ggplot2::ggplot(data=bill, aes(x=age, y=wealth)) +
gg geom_point(aes(col=region, size=wealth)) +
geom_smooth(method="loess", se=FALSE) +
xlim(c(7, 102)) +
ylim(c(1, 37)) +
labs(subtitle="Wealth vs Age of Billionaires",
y="Wealth (Billion US $)", x="Age",
title="Scatterplot", caption = "Source: Fortune Magazine, 1992.")
plot(gg)
## `geom_smooth()` using formula = 'y ~ x'
# Learning the apply and tapply functions
<- matrix(1:12, byrow=T, ncol=4) # type x to see what matrix you have got.
x
apply(x, 2, mean) # produces four column means of x
## [1] 5 6 7 8
apply(x, 1, mean) # produces three row means of x
## [1] 2.5 6.5 10.5
tapply(X=bill$wealth, INDEX=bill$region, FUN=mean)
## A E M O U
## 2.651351 2.257895 4.263636 2.278571 3.000000
tapply(X=bill$wealth, INDEX=bill$region, FUN=sd)
## A E M O U
## 2.192617 1.623187 7.657150 1.265308 3.659974
# Rounding the numbers make those look nice
round(tapply(X=bill$wealth, INDEX=bill$region, FUN=mean), 2)
## A E M O U
## 2.65 2.26 4.26 2.28 3.00
?tapply
6 Drawing the butterfly: coverpage graphics
# Makes a local version of the butterfly programme
# Handy if the ipsRdbs package has not been installed
<- function(color = 2, p1=2, p2=4) {
butterflylocal <- seq(from=0.0, to=24 * pi, len = 2000)
theta <- exp(cos(theta)) - p1 * cos(p2 * theta)
radius <- radius + sin(theta/12)
radius <- radius * sin(theta)
x <- - radius * cos(theta)
y plot(x, y, type = "l", lwd=1.5, axes = F, xlab = "", ylab = "", col = color)
}
par(mfrow=c(1,1))
butterflylocal(p1=20, p2=4)
butterflylocal(color = 6)
par(mfrow=c(2, 2))
butterflylocal(color = 6)
butterflylocal(p1=5, p2=5, color=2)
butterflylocal(p1=10, p2=1.5, color = "seagreen")
butterflylocal(p1=20, p2=4, color = "blue")