Abstract

This file supplements introduction of the R software package in Chapter 2. This page shows both input code and output results and graphics. The reader is invited to try the code by copy-pasting the code lines interactively on the R Console.

1 Code used to install libraries

# It is more convenient to install the package ipsRdbs 
# Please issue the R commands
install.packages("ipsRdbs", dependencies = TRUE)
library(ipsRdbs)
ls("package:ipsRdbs")
# If these did not work please ask a local expert 
# to help you with the installation. 
# You may also search the internet for the error messages
# given by R. 
# From now on it will be assumed that you have been able to 
# install the package in your computer. Otherwise, the 
# commands below will not work.

# If  packages are installed successfully, please load those 
library(ggplot2)
library(ipsRdbs)

2 Alternative ways to read data into R

# If installation of the package ipsRdbs is NOT successful, please 
# Download .txt data files from # https://github.com/sujit-sahu/ipsRdbs/tree/main/data
# Or you may download the R data file 
# from https://github.com/sujit-sahu/bookipsRdbs/blob/main/ipsRdbs.Rdata
# and save it in your computer. 
# Then you may load all the data files by issuing the command  
load(file="ipsRdbs.Rdata")
# Remember to set the working directory to the location # where the .Rdata  file has  been saved.

# Read the data sets directly from the book website if you
# were unable to install the R package ipsRdbs. 
# It is not necessary to run these commands if the R package
# ipsRdbs has been installed and loaded already. 
path <- "https://www.sujitsahu.com/ipsRdbs/"
cfail <- scan(paste0(path, "cfail.txt"))
ffood <- read.csv(paste0(path, "ffood.csv"), head=T)
wgain <- read.table(paste0(path, "wtgain.txt"), head=T) # Reads the weight gain data set  
bill <- read.table(paste0(path, "billionaires.txt"), head=T)# Reads the billionaire data set

3 Code used to obtain summary statistics

## Assume that the ipsRdbs package has been installed and 
## loaded by the library command: 
library(ipsRdbs)
# Prints the data frame ffood
ffood
##     AM PM
## 1   38 45
## 2  100 62
## 3   64 52
## 4   43 72
## 5   63 81
## 6   59 88
## 7  107 64
## 8   52 75
## 9   86 59
## 10  77 70
## Prints first few rows
head(ffood)
##    AM PM
## 1  38 45
## 2 100 62
## 3  64 52
## 4  43 72
## 5  63 81
## 6  59 88
## Prints last few rows
tail(ffood)
##     AM PM
## 5   63 81
## 6   59 88
## 7  107 64
## 8   52 75
## 9   86 59
## 10  77 70
# Prints the dimension 
dim(ffood)
## [1] 10  2
names(ffood)  # Prints the column names of the data set
## [1] "AM" "PM"
ffood$AM # Prints the values in the AM column
##  [1]  38 100  64  43  63  59 107  52  86  77
ffood[1,] # Prints the first row and all columns
##   AM PM
## 1 38 45
ffood[,1] # Prints the first column and all rows
##  [1]  38 100  64  43  63  59 107  52  86  77
ffood[1:2, ] # Prints the first two rows and all columns
##    AM PM
## 1  38 45
## 2 100 62
ffood[1, 2] # Prints the first row second column entry
## [1] 45

# Prints the summaries 
summary(ffood)
##        AM               PM       
##  Min.   : 38.00   Min.   :45.00  
##  1st Qu.: 53.75   1st Qu.:59.75  
##  Median : 63.50   Median :67.00  
##  Mean   : 68.90   Mean   :66.80  
##  3rd Qu.: 83.75   3rd Qu.:74.25  
##  Max.   :107.00   Max.   :88.00
summary(cfail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    3.00    3.75    5.00   17.00
summary(wgain)
##     student         initial          final       
##  Min.   : 1.00   Min.   :42.64   Min.   : 43.54  
##  1st Qu.:17.75   1st Qu.:53.86   1st Qu.: 54.32  
##  Median :34.50   Median :60.78   Median : 60.78  
##  Mean   :34.50   Mean   :61.72   Mean   : 62.59  
##  3rd Qu.:51.25   3rd Qu.:68.04   3rd Qu.: 68.49  
##  Max.   :68.00   Max.   :99.79   Max.   :101.60
summary(bill)
##      wealth            age            region         
##  Min.   : 1.000   Min.   :  7.00   Length:225        
##  1st Qu.: 1.300   1st Qu.: 56.00   Class :character  
##  Median : 1.800   Median : 65.00   Mode  :character  
##  Mean   : 2.726   Mean   : 64.03                     
##  3rd Qu.: 3.000   3rd Qu.: 72.00                     
##  Max.   :37.000   Max.   :102.00

dim(ffood)
## [1] 10  2
names(ffood)  # Prints the column names of the data set
## [1] "AM" "PM"
ffood$AM # Prints the values in the AM column 
##  [1]  38 100  64  43  63  59 107  52  86  77
ffood[1,] # Prints the first row and all columns
##   AM PM
## 1 38 45
ffood[,1] # Prints the first column and all rows 
##  [1]  38 100  64  43  63  59 107  52  86  77
ffood[1:2, ] # Prints the first two rows and all columns
##    AM PM
## 1  38 45
## 2 100 62
ffood[1, 2] # Prints the first row second column entry 
## [1] 45

table(cfail)
## cfail
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 17 
## 12 16 21 12 11  8  7  2  4  2  3  2  2  1  1

var(ffood$AM) # variance of the AM data
## [1] 538.3222

var(c(ffood$AM, ffood$PM)) 
## [1] 337.2921
# variance of the AM and PM data combined

table(bill$region)
## 
##  A  E  M  O  U 
## 37 76 22 28 62

4 Some basic plotting functions


stem(ffood$AM)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    2 | 8
##    4 | 329
##    6 | 347
##    8 | 6
##   10 | 07
stem(cfail)
## 
##   The decimal point is at the |
## 
##    0 | 000000000000
##    1 | 0000000000000000
##    2 | 000000000000000000000
##    3 | 000000000000
##    4 | 00000000000
##    5 | 00000000
##    6 | 0000000
##    7 | 00
##    8 | 0000
##    9 | 00
##   10 | 000
##   11 | 00
##   12 | 00
##   13 | 0
##   14 | 
##   15 | 
##   16 | 
##   17 | 0
hist(cfail)


hist(cfail, xlab="Number of weekly computer failures")


plot(wgain$initial, wgain$final) # plot wgain$final against wgain$initial

abline(0, 1, col="red") # add a 45 degree line

# Nicer and more informative plot
plot(wgain$initial, wgain$final,xlab="Wt in Week 1", ylab="Wt in Week 12", pch="*", las=1) # relabel both axes
abline(0, 1, col="red")
title("A scatterplot of the weights in Week 12 against the weights in Week 1") # add a title


boxplot(cfail)


## Example of programming in R

boxplot(ffood)


boxplot(data=bill, wealth ~ region, col=2:6)

# Get help by issuing the help command 
# ?par

table(bill$region)
## 
##  A  E  M  O  U 
## 37 76 22 28 62

barplot(table(cfail))

barplot(table(bill$region), col=2:6)

hist(cfail)

hist(cfail, xlab="Number of weekly computer failures")

plot(wgain$initial, wgain$final) # plot wgain$final against wgain$initial

abline(0, 1, col="red") # add a 45 degree line

# Nicer and more informative plot
plot(wgain$initial, wgain$final,xlab="Wt in Week 1", ylab="Wt in Week 12", pch="*", las=1) # relabel both axes
abline(0, 1, col="red")
title("A scatterplot of the weights in Week 12 against the weights in Week 1") # add a title

boxplot(cfail)

# Working with various R data types

x <- seq(from=1, to=13, by =3) 
?seq # prints out the help file. 
a1  <- c(1,3,5,6,8,21) # if we have to input irregular data. 
a2 <- seq(5,25, length=5)
a3  <- c(a1,a2)
a4  <- seq(from=min(a1), to=max(a1), length=10)
a5  <- rep(2, 5)
a6  <- c(1, 3, 9)
a7 <-  rep(a6, times=2)
a8 <-  rep(a6, each=2)
a9  <- rep(a6, c(2, 3, 1))
cbind(a7, a8, a9) # Can we see the differences between a7, a8 and a9?
##      a7 a8 a9
## [1,]  1  1  1
## [2,]  3  1  1
## [3,]  9  3  3
## [4,]  1  3  3
## [5,]  3  9  3
## [6,]  9  9  9

# Create a data frame called dframe by issuing the command:
dframe <- data.frame(x=1:10, y=rnorm(10))

# You can add a new column to a data frame, dframe say, by issuing:
dframe$xy <- dframe$x * dframe$y

# Certain statistical operations on vectors result in scalars
mean(dframe$x)
## [1] 5.5
var(dframe$x)
## [1] 9.166667

View(dframe)

myresults <- list(mean=10, sd=3.32, values=5:15)

citizen <- factor(c("uk", "us", "no", "in", "es", "in"))

table(citizen)
## citizen
## es in no uk us 
##  1  2  1  1  1

levels(citizen)
## [1] "es" "in" "no" "uk" "us"

levels(bill$region) # Assuming you read the billionaire data set already.
## NULL

levels(bill$region) <- c("Asia", "Europe", "Mid-East", "Other", "USA")

a1[a1>5]
## [1]  6  8 21

bill$wealth> 5
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [13]  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

bill$region == "A"
##   [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE
##  [37]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [85]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [133] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [181] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
## [193]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

bill.wealth.ge5 <- bill[bill$wealth>5, ]

bill.wealth.ge5
##    wealth age region
## 1    37.0  50      M
## 2    24.0  88      U
## 3    14.0  64      A
## 4    13.0  63      U
## 5    13.0  66      U
## 6    11.7  72      E
## 7    10.0  71      M
## 8     8.2  77      U
## 9     8.1  68      U
## 10    7.2  66      E
## 11    7.0  69      M
## 12    6.2  36      O
## 13    5.9  49      U
## 14    5.3  73      U
## 15    5.2  52      E

bill.region.A <- bill[ bill$region == "A", ]

bill.region.A
##     wealth age region
## 3     14.0  64      A
## 18     4.9  62      A
## 31     4.0  68      A
## 35     4.0  49      A
## 36     3.9  64      A
## 37     3.9  83      A
## 38     3.8  41      A
## 39     3.8  78      A
## 40     3.6  80      A
## 44     3.4  54      A
## 46     3.3  69      A
## 54     3.0  57      A
## 60     2.8  68      A
## 75     2.5  49      A
## 78     2.4  76      A
## 80     2.3  54      A
## 85     2.2  45      A
## 93     2.0  73      A
## 111    1.8  62      A
## 114    1.8  68      A
## 115    1.8  60      A
## 116    1.8  53      A
## 119    1.8  67      A
## 127    1.7  53      A
## 128    1.7  67      A
## 130    1.6  62      A
## 132    1.6  69      A
## 135    1.6  78      A
## 162    1.4  52      A
## 163    1.4  73      A
## 177    1.3  59      A
## 182    1.2  69      A
## 188    1.2  59      A
## 192    1.2  68      A
## 193    1.2  69      A
## 195    1.2  64      A
## 211    1.0  69      A

x <- 1:10

x>3 & x<7
##  [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE

x<3 | x>7
##  [1]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE

a <- seq(1, 10, by =2)

oddrows <- bill[a, ]

5 Using ggplot2 package for better graphics


# Scatterplot
library(ggplot2)
library(ipsRdbs)
attach(wgain) 
g <- ggplot(data=wgain, aes(initial, final)) + 
  geom_count() + 
  geom_smooth(method="lm", se=F)
plot(g)

gg <- ggplot2::ggplot(data=bill, aes(x=age, y=wealth)) +
geom_point(aes(col=region, size=wealth)) + 
geom_smooth(method="loess", se=FALSE) + 
xlim(c(7, 102)) + 
ylim(c(1, 37)) + 
labs(subtitle="Wealth vs Age of Billionaires", 
y="Wealth (Billion US $)", x="Age", 
title="Scatterplot", caption = "Source: Fortune Magazine, 1992.")
plot(gg)
## `geom_smooth()` using formula = 'y ~ x'

# Learning the apply and tapply functions

x <- matrix(1:12, byrow=T, ncol=4) # type x to see what matrix you have got.

apply(x, 2, mean) # produces four column means of x
## [1] 5 6 7 8

apply(x, 1, mean) # produces three row means of x
## [1]  2.5  6.5 10.5

tapply(X=bill$wealth, INDEX=bill$region, FUN=mean)
##        A        E        M        O        U 
## 2.651351 2.257895 4.263636 2.278571 3.000000
tapply(X=bill$wealth, INDEX=bill$region, FUN=sd)
##        A        E        M        O        U 
## 2.192617 1.623187 7.657150 1.265308 3.659974

# Rounding the numbers make those look nice 
round(tapply(X=bill$wealth, INDEX=bill$region, FUN=mean), 2)
##    A    E    M    O    U 
## 2.65 2.26 4.26 2.28 3.00

?tapply

6 Drawing the butterfly: coverpage graphics

# Makes a local version of the butterfly programme
# Handy if the ipsRdbs package has not been installed 

butterflylocal <- function(color = 2, p1=2, p2=4) {
  theta <- seq(from=0.0, to=24 * pi, len = 2000)
  radius <- exp(cos(theta)) - p1 * cos(p2 * theta)
  radius <- radius + sin(theta/12)
  x <- radius * sin(theta)
  y <-  - radius * cos(theta)
  plot(x, y, type = "l", lwd=1.5, axes = F, xlab = "", ylab = "", col = color)
}

par(mfrow=c(1,1))
butterflylocal(p1=20, p2=4)

butterflylocal(color = 6)

par(mfrow=c(2, 2))
butterflylocal(color = 6)
butterflylocal(p1=5, p2=5, color=2)
butterflylocal(p1=10, p2=1.5, color = "seagreen")
butterflylocal(p1=20, p2=4, color = "blue")

Part 1: Getting started with R

Sujit K. Sahu, https://www.sujitsahu.com/

2023-07-05