chapter3

Introduction to Statistical Learning
Chapter 3
Tarek Dib
tdib03@gmail.com
April 12, 2015
1 Data Manipulation
# Load the Auto data set.
# The file is addressed with an explicit path instead of setwd(): changing the
# working directory is a global side effect that alters how every later
# relative path in the session is resolved.
auto <- read.csv(file.path("/home/tarek/ISLR/dataSets", "auto.csv"))
# Structure of the data set
str(auto)
## 'data.frame': 397 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : Factor w/ 94 levels "?","100","102",..: 17 35 29 29 24 42 47 46 48 40 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2
# Remove the "name" column. It is selected by name rather than by position so
# that the step keeps working if the column order changes, and is harmless if
# the chunk is run a second time.
auto <- auto[, names(auto) != "name", drop = FALSE]
# Remove rows where horsepower is missing (recorded as "?")
auto <- auto[auto$horsepower != "?", ]
# Convert horsepower to numeric. Going through as.character() yields the
# printed values rather than the internal level codes when the column is a factor.
auto$horsepower <- as.numeric(as.character(auto$horsepower))
# Change cylinders into a factor
auto$cylinders <- as.factor(auto$cylinders)
# Convert year to a factor data type
auto$year <- factor(auto$year)
2 Exploratory Data Analysis
1

library(ggplot2)
# Scatter plot of mpg against weight, coloured by cylinders,
# with one facet column per cylinders level
p.weight <- ggplot(
  data = auto,
  mapping = aes(x = weight, y = mpg, colour = cylinders)
)
p.weight +
  geom_point(size = 2) +
  facet_grid(. ~ cylinders) +
  ggtitle("Mileage vs. Weight Grouped By Cylinders")
3 4 5 6 8
10
20
30
40
2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000
weight
mpg
cylinders
3
4
5
6
8
Mileage vs. Weight Grouped By Cylinders
# Scatter plot of mpg against horsepower, coloured by cylinders,
# with one facet column per cylinders level
p.HP <- ggplot(
  data = auto,
  mapping = aes(x = horsepower, y = mpg, colour = cylinders)
)
p.HP +
  geom_point(size = 2) +
  facet_grid(. ~ cylinders) +
  ggtitle("Mileage vs. Horsepower")
3 4 5 6 8
10
20
30
40
50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200
horsepower
mpg
cylinders
3
4
5
6
8
Mileage vs. Horsepower
# Base plot of mpg against horsepower for all cars; kept as an object because
# it is reused further down when the fitted regression line is overlaid.
p.mpgHP <- ggplot(auto, aes(horsepower, mpg))
# Points plus a least-squares line without its confidence band.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
p.mpgHP + geom_point() + stat_smooth(method = "lm", se = FALSE)
10
20
30
40
50 100 150 200
horsepower
mpg
2

# Box plots of mpg for each model year, split and coloured by cylinders
p.year <- ggplot(
  data = auto,
  mapping = aes(x = year, y = mpg, colour = cylinders)
)
p.year +
  geom_boxplot() +
  ggtitle("Mileage change over the years Grouped by Cylinders")
10
20
30
40
70 71 72 73 74 75 76 77 78 79 80 81 82
year
mpg
cylinders
3
4
5
6
8
Mileage change over the years Grouped by Cylinders
# Box plot of mpg for each cylinders level
plt1 <- ggplot(data = auto, mapping = aes(x = cylinders, y = mpg))
plt1 +
  geom_boxplot()
10
20
30
40
3 4 5 6 8
cylinders
mpg
3 Simple Linear Regression, mpg vs horsepower
# Simple linear regression of mpg on horsepower
lm.fit1 <- lm(mpg ~ horsepower, data = auto)
s <- summary(lm.fit1)
library(xtable)
# LaTeX table of the coefficient summary. The caption string is properly
# terminated, and the cross-reference label is supplied once, through `label`.
summaryTab <- xtable(
  s,
  caption = "Simple Linear Regression Model, Summary Table",
  label = "tab:mpg vs. horsepower"
)
Rsquared.simple <- s$r.squared
# Predict mpg for horsepower = 98 with its 95% confidence interval
# (the matching prediction interval is computed in the next call)
predict(lm.fit1, newdata = data.frame(horsepower = 98), interval = "confidence")
## fit lwr upr
## 1 24.46708 23.97308 24.96108
3

# Prediction interval for mpg at horsepower = 98
predict(
  lm.fit1,
  newdata = data.frame(horsepower = 98),
  interval = "prediction"
)
## fit lwr upr
## 1 24.46708 14.8094 34.12476
# Scatter plot with the stat_smooth() least-squares line and, on top of it, the
# line drawn from the intercept and slope of lm.fit1.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
p.mpgHP + geom_point() + stat_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = coef(lm.fit1)[1], slope = coef(lm.fit1)[2])
10
20
30
40
50 100 150 200
horsepower
mpg
Table 1: Simple Linear Regression Model, Summary Table
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.9359 0.7175 55.66 0.0000
horsepower -0.1578 0.0064 -24.49 0.0000
4 Removing Cylinders 3 and 5
Since there are only 4 cars with 3 cylinders and 3 cars with 5 cylinders, I have decided to exclude cars with 3 and 5
cylinders, and focus on presenting and analyzing cars with 4, 6 and 8 cylinders.
# Exclude 3 and 5 cylinder cars
auto <- auto[!(auto$cylinders %in% c("3", "5")), ]
# Re-create the factor so the now-empty "3" and "5" levels do not linger in
# tables, legends or model matrices built from the filtered data
auto$cylinders <- factor(auto$cylinders)
# plot it
h1 <- ggplot(auto, aes(horsepower, mpg, colour = cylinders))
h1 + geom_point() + facet_grid(. ~ cylinders)
4

4 6 8
10
20
30
40
50 100 150 200 50 100 150 200 50 100 150 200
horsepower
mpg
cylinders
4
6
8
# mpg against weight, coloured and faceted by cylinders, with a least-squares
# line (no confidence band) fitted in each panel.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
plt3 <- ggplot(auto, aes(weight, mpg, colour = cylinders))
plt3 + geom_point() +
  facet_grid(. ~ cylinders) +
  ggtitle("MPG vs. Weight Grouped by Cylinders") +
  stat_smooth(method = "lm", se = FALSE)
4 6 8
10
20
30
40
2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000
weight
mpg
cylinders
4
6
8
MPG vs. Weight Grouped by Cylinders
5 Correlation Matrix
# Create a dataframe with the numeric variables:
# mpg, displacement, horsepower, weight, acceleration
df1 <- auto[, c("mpg", "displacement", "horsepower", "weight", "acceleration")]
# Correlation matrix
Cor <- cor(df1)
# LaTeX table of the correlation matrix. The backslash before `label` is
# required: without it the text "label{tab:Correlation}" is printed as part of
# the caption instead of defining a LaTeX label.
# NOTE(review): the caption defines "tab:Correlation" while the `label`
# argument defines "Correlation"; both are kept, confirm which one is referenced.
CorTab <- xtable(
  Cor,
  caption = "Correlation Matrix\\label{tab:Correlation}",
  label = "Correlation"
)
6 Multiple Linear Regression Model
# Multiple linear regression of mpg on weight, horsepower, acceleration and
# cylinders, including the cylinders-by-horsepower interaction
lm.fit2 <- lm(
  mpg ~ weight + horsepower + acceleration + cylinders + cylinders:horsepower,
  data = auto
)
5

Table 2: Correlation Matrix
mpg displacement horsepower weight acceleration
mpg 1.00 -0.82 -0.78 -0.84 0.42
displacement -0.82 1.00 0.90 0.94 -0.56
horsepower -0.78 0.90 1.00 0.87 -0.69
weight -0.84 0.94 0.87 1.00 -0.43
acceleration 0.42 -0.56 -0.69 -0.43 1.00
# Variance inflation factors (vif) come from the car package.
# Rule used here: a predictor with vif >= 10 is a candidate for removal
# because of multicollinearity.
library(car)

# Coefficient summary of the multiple regression and its R-squared
s.fit2 <- summary(lm.fit2)
Rsquared.multi <- s.fit2$r.squared
s.fit2.Tab <- xtable(
  s.fit2,
  caption = "Multiple Linear Regression Model, Summary Table"
)

# Table of the variance inflation factors
vifTab <- xtable(vif(lm.fit2))
print(vifTab)
GVIF Df GVIF^(1/(2*Df))
weight 10.14 1.00 3.18
horsepower 26.99 1.00 5.20
acceleration 2.81 1.00 1.68
cylinders 2129.64 2.00 6.79
horsepower:cylinders 3527.06 2.00 7.71
# Print the regression summary table with its caption placed above the table
print(
  s.fit2.Tab,
  caption.placement = "top"
)
Table 3: Multiple Linear Regression Model, Summary Table
Estimate Std. Error t value Pr(>|t|)
(Intercept) 57.4208 2.7576 20.82 0.0000
weight -0.0027 0.0007 -3.76 0.0002
horsepower -0.2141 0.0256 -8.38 0.0000
acceleration -0.3145 0.1161 -2.71 0.0071
cylinders6 -22.2642 3.5438 -6.28 0.0000
cylinders8 -18.9358 2.9615 -6.39 0.0000
horsepower:cylinders6 0.1992 0.0356 5.60 0.0000
horsepower:cylinders8 0.1607 0.0240 6.69 0.0000
7 Model Diagnostics
6

−10
0
10
10 20 30
fitted
residuals
Figure 1: Residuals vs. Fitted Values. The figure shows that the homoscedasticity assumption, i.e. constant variance,
is violated. Thus, the multiple linear regression model developed above does not seem to be suitable. The response
variable may need to be transformed and the model refitted. A log transformation of the response (mpg) variable may rectify
the non-constant variance.
−10
0
10
−2 0 2
theoretical
sample
Figure 2: qqnorm and qqline plots to test the normality assumption. The data do not seem to deviate much from
the normality assumption. There seem to be a few outliers in the data set.
7

0
20
40
60
−10 0 10 20
residuals
count
0.00
0.05
0.10
0.15
0.20
0.25
0 100 200 300 400
Index
Leverages
Index plot of Leverages
8 Modifying the model
# Refit the multiple regression with a log-transformed response.
# The assignment arrow is restored: with a bare `-` the model was never stored,
# and the summary() call below could not find lm.fit3.
lm.fit3 <- lm(log(mpg) ~ weight + horsepower + acceleration + cylinders +
                cylinders * horsepower, data = auto)
summary(lm.fit3)
##
## Call:
## lm(formula = log(mpg) ~ weight + horsepower + acceleration +
## cylinders + cylinders * horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36326 -0.08544 -0.00528 0.08195 0.63418
##
8

## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.421e+00 1.063e-01 41.613 < 2e-16 ***
## weight -1.365e-04 2.738e-05 -4.986 9.40e-07 ***
## horsepower -6.784e-03 9.846e-04 -6.890 2.35e-11 ***
## acceleration -1.308e-02 4.473e-03 -2.924 0.00366 **
## cylinders6 -7.404e-01 1.365e-01 -5.422 1.05e-07 ***
## cylinders8 -4.742e-01 1.141e-01 -4.156 4.01e-05 ***
## horsepower:cylinders6 6.271e-03 1.371e-03 4.574 6.50e-06 ***
## horsepower:cylinders8 3.456e-03 9.263e-04 3.731 0.00022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1439 on 377 degrees of freedom
## Multiple R-squared: 0.8259,Adjusted R-squared: 0.8227
## F-statistic: 255.5 on 7 and 377 DF, p-value: < 2.2e-16
# Arrange the plotting device as a 2 x 2 grid, then draw the plot() output
# for the log-response model
par(mfrow = c(2, 2))
plot(lm.fit3)
2.4 2.6 2.8 3.0 3.2 3.4 3.6
−0.40.6
Fitted values
Residuals
Residuals vs Fitted
387
361365
−3 −2 −1 0 1 2 3
−24
Theoretical Quantiles
Standardizedresiduals
Normal Q−Q
387
361365
2.4 2.6 2.8 3.0 3.2 3.4 3.6
0.01.5
Fitted values
Scale−Location
387
361365
0.00 0.05 0.10 0.15 0.20 0.25
−24
Leverage
Cook's distance
0.5
0.5
1
Residuals vs Leverage
334361
387
# R-squared of the log-response model, stored for the model comparison table.
# The assignment arrow is restored; with a bare `-` nothing was stored.
Rsquared.mod <- summary(lm.fit3)$r.squared
8.1 Residuals and QQ Norm of Modified Model
# Three stacked diagnostic plots for the log-response model:
# residuals vs. fitted values, histogram of residuals, and normal Q-Q plot.
par(mfrow = c(3, 1))
# The axis labels are quoted strings, and the fitted()/resid() extractors
# replace the partially matched `$fit` / `$res` component names.
plot(fitted(lm.fit3), resid(lm.fit3), xlab = "Fitted Values", ylab = "Residuals")
hist(resid(lm.fit3))
qqnorm(resid(lm.fit3))
qqline(resid(lm.fit3))
9

2.4 2.6 2.8 3.0 3.2 3.4 3.6
−0.40.00.20.40.6
Fitted Values
Residuals
Histogram of resid(lm.fit3)
resid(lm.fit3)
Frequency
−0.4 −0.2 0.0 0.2 0.4 0.6
020406080
−3 −2 −1 0 1 2 3
−0.40.00.20.40.6
Normal Q−Q Plot
Theoretical Quantiles
SampleQuantiles
10

9 Comparison of Rsquared value among the Models
# Data frame with one row per model and a single column holding its R-squared.
# The assignment arrows and the quotes around the name strings are restored.
df2 <- data.frame(rbind(Rsquared.simple, Rsquared.multi, Rsquared.mod))
names(df2) <- "Rsquared"
rownames(df2) <- c("Simple", "Multiple", "Modified Multiple")
# LaTeX table of the comparison
dfTab <- xtable(df2)
Rsquared
Simple 0.61
Multiple 0.78
Modified Multiple 0.83
11

chapter3

Recommended

Recommended

More Related Content

Similar to chapter3

Similar to chapter3 (20)

chapter3