SlideShare a Scribd company logo
1 of 11
Download to read offline
Introduction to Statistical Learning
Chapter 3
Tarek Dib
tdib03@gmail.com
April 12, 2015
1 Data Manipulation
# Load the Auto data set.
# The full path is handed to read.csv() instead of calling setwd(), so the
# script does not change the session's working directory as a side effect.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0
# (where the default became FALSE), matching the str() output shown below.
auto <- read.csv(
  file.path("/home/tarek/ISLR/dataSets", "auto.csv"),
  stringsAsFactors = TRUE
)
# Structure of the data set
str(auto)
## 'data.frame': 397 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : Factor w/ 94 levels "?","100","102",..: 17 35 29 29 24 42 47 46 48 40 ...
## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2
# Remove the "name" column. It is selected by name rather than by position,
# so a change in the CSV column order cannot make this drop a different column.
auto <- auto[, names(auto) != "name", drop = FALSE]
# Remove rows where horsepower is missing (recorded as "?" in the raw data)
auto <- auto[auto$horsepower != "?", ]
# Convert horsepower to numeric. Going through as.character() yields the
# printed values instead of the factor's internal integer codes.
auto$horsepower <- as.numeric(as.character(auto$horsepower))
# Change cylinders into a factor
auto$cylinders <- factor(auto$cylinders)
# Convert year to a factor data type
auto$year <- factor(auto$year)
2 Exploratory Data Analysis
1
library(ggplot2)
# Mileage against weight, coloured by cylinder count, one panel per count
p.weight <- ggplot(auto, aes(x = weight, y = mpg, colour = cylinders))
p.weight +
  geom_point(size = 2) +
  facet_grid(. ~ cylinders) +
  ggtitle("Mileage vs. Weight Grouped By Cylinders")
3 4 5 6 8
10
20
30
40
2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000
weight
mpg
cylinders
3
4
5
6
8
Mileage vs. Weight Grouped By Cylinders
# Mileage against horsepower, coloured by cylinder count, one panel per count
p.HP <- ggplot(auto, aes(x = horsepower, y = mpg, colour = cylinders))
p.HP +
  geom_point(size = 2) +
  facet_grid(. ~ cylinders) +
  ggtitle("Mileage vs. Horsepower")
3 4 5 6 8
10
20
30
40
50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200
horsepower
mpg
cylinders
3
4
5
6
8
Mileage vs. Horsepower
# Mileage against horsepower for all cars, with a least-squares line.
# se = FALSE turns off the confidence band; the literal FALSE is used because
# the shorthand F is an ordinary variable that can be reassigned.
p.mpgHP <- ggplot(auto, aes(horsepower, mpg))
p.mpgHP +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
10
20
30
40
50 100 150 200
horsepower
mpg
2
# Box plots of mileage per model year, split by cylinder count
p.year <- ggplot(auto, aes(x = year, y = mpg, colour = cylinders))
p.year +
  geom_boxplot() +
  ggtitle("Mileage change over the years Grouped by Cylinders")
10
20
30
40
70 71 72 73 74 75 76 77 78 79 80 81 82
year
mpg
cylinders
3
4
5
6
8
Mileage change over the years Grouped by Cylinders
# Box plot of mileage for each cylinder count
plt1 <- ggplot(auto, aes(x = cylinders, y = mpg))
plt1 +
  geom_boxplot()
10
20
30
40
3 4 5 6 8
cylinders
mpg
3 Simple Linear Regression, mpg vs horsepower
# Simple linear regression of mileage on horsepower
lm.fit1 <- lm(mpg ~ horsepower, data = auto)
s <- summary(lm.fit1)
library(xtable)
# LaTeX table of the coefficient summary. The cross-reference label is given
# through the `label` argument only; the caption holds just the visible text.
summaryTab <- xtable(
  s,
  caption = "Simple Linear Regression Model, Summary Table",
  label = "tab:mpg vs. horsepower"
)
Rsquared.simple <- s$r.squared
# Predict mpg for horsepower = 98, and its 95% confidence and prediction
# interval, respectively
predict(lm.fit1, newdata = data.frame(horsepower = 98), interval = "confidence")
## fit lwr upr
## 1 24.46708 23.97308 24.96108
3
# Prediction interval for a single new observation with horsepower = 98
predict(lm.fit1, newdata = data.frame(horsepower = 98), interval = "prediction")
## fit lwr upr
## 1 24.46708 14.8094 34.12476
# Scatter plot with the fitted line drawn both by stat_smooth() and explicitly
# from the coefficients of lm.fit1. The literal FALSE replaces the
# reassignable shorthand F.
p.mpgHP +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  geom_abline(intercept = coef(lm.fit1)[1], slope = coef(lm.fit1)[2])
10
20
30
40
50 100 150 200
horsepower
mpg
Table 1: Simple Linear Regression Model, Summary Table
Estimate Std. Error t value Pr(>|t|)
(Intercept) 39.9359 0.7175 55.66 0.0000
horsepower -0.1578 0.0064 -24.49 0.0000
4 Removing Cylinders 3 and 5
Since there are only 4 cars with 3 cylinders and 3 cars with 5 cylinders, I have decided to exclude cars with 3 and 5
cylinders, and focus on presenting and analyzing cars with 4, 6 and 8 cylinders.
# Exclude 3 and 5 cylinder cars. %in% never returns NA, so the row filter
# cannot introduce all-NA rows, and droplevels() discards the factor levels
# that no longer have any observations.
auto <- droplevels(auto[!(auto$cylinders %in% c("3", "5")), ])
# plot it
h1 <- ggplot(auto, aes(horsepower, mpg, colour = cylinders))
h1 +
  geom_point() +
  facet_grid(. ~ cylinders)
4
4 6 8
10
20
30
40
50 100 150 200 50 100 150 200 50 100 150 200
horsepower
mpg
cylinders
4
6
8
# Mileage against weight with a separate least-squares line in each cylinder
# panel. se = FALSE turns off the confidence band; the literal FALSE replaces
# the reassignable shorthand F.
plt3 <- ggplot(auto, aes(weight, mpg, colour = cylinders))
plt3 +
  geom_point() +
  facet_grid(. ~ cylinders) +
  ggtitle("MPG vs. Weight Grouped by Cylinders") +
  stat_smooth(method = "lm", se = FALSE)
4 6 8
10
20
30
40
2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000
weight
mpg
cylinders
4
6
8
MPG vs. Weight Grouped by Cylinders
5 Correlation Matrix
# Create a dataframe with the numeric variables: mpg, displacement,
# horsepower, weight, acceleration
df1 <- auto[, c("mpg", "displacement", "horsepower", "weight", "acceleration")]
# Correlation matrix
Cor <- cor(df1)
# LaTeX table of the correlations. The cross-reference label is given through
# the `label` argument only, so the caption holds just the visible text.
CorTab <- xtable(
  Cor,
  caption = "Correlation Matrix",
  label = "Correlation"
)
6 Multiple Linear Regression Model
# Multiple linear regression of mileage on weight, horsepower, acceleration
# and cylinders, plus a horsepower-by-cylinders interaction
lm.fit2 <- lm(
  mpg ~ weight + horsepower + acceleration + cylinders + cylinders:horsepower,
  data = auto
)
5
Table 2: Correlation Matrix
mpg displacement horsepower weight acceleration
mpg 1.00 -0.82 -0.78 -0.84 0.42
displacement -0.82 1.00 0.90 0.94 -0.56
horsepower -0.78 0.90 1.00 0.87 -0.69
weight -0.84 0.94 0.87 1.00 -0.43
acceleration 0.42 -0.56 -0.69 -0.43 1.00
# vif() comes from the car package
library(car)
# Coefficient summary of the multiple regression, its LaTeX table and R-squared
s.fit2 <- summary(lm.fit2)
s.fit2.Tab <- xtable(
  s.fit2,
  caption = "Multiple Linear Regression Model, Summary Table"
)
Rsquared.multi <- s.fit2$r.squared
# Variance inflation factors of the model terms.
# Rule of thumb applied here: a predictor with vif >= 10 is a candidate for
# removal because of multicollinearity.
vifTab <- xtable(vif(lm.fit2))
print(vifTab)
GVIF Df GVIF^(1/(2*Df))
weight 10.14 1.00 3.18
horsepower 26.99 1.00 5.20
acceleration 2.81 1.00 1.68
cylinders 2129.64 2.00 6.79
horsepower:cylinders 3527.06 2.00 7.71
# Emit the coefficient table with its caption placed above the table
print(
  s.fit2.Tab,
  caption.placement = "top"
)
Table 3: Multiple Linear Regression Model, Summary Table
Estimate Std. Error t value Pr(>|t|)
(Intercept) 57.4208 2.7576 20.82 0.0000
weight -0.0027 0.0007 -3.76 0.0002
horsepower -0.2141 0.0256 -8.38 0.0000
acceleration -0.3145 0.1161 -2.71 0.0071
cylinders6 -22.2642 3.5438 -6.28 0.0000
cylinders8 -18.9358 2.9615 -6.39 0.0000
horsepower:cylinders6 0.1992 0.0356 5.60 0.0000
horsepower:cylinders8 0.1607 0.0240 6.69 0.0000
7 Model Diagnostics
6
−10
0
10
10 20 30
fitted
residuals
Figure 1: Residuals vs. Fitted Values. The figure indicates that the homoscedasticity assumption, i.e. constant variance,
is violated. Thus, the multiple linear regression model developed above seems not to be suitable. The response
variable may need to be transformed and the model then refitted. A log transformation of the response (mpg) variable may rectify
the non-constant variance.
−10
0
10
−2 0 2
theoretical
sample
Figure 2: qqnorm and qqline plots to test for the normality assumption. The data do not seem to deviate much from
the normality assumption. There seem to be a few outliers in the data set.
7
0
20
40
60
−10 0 10 20
residuals
count
0.00
0.05
0.10
0.15
0.20
0.25
0 100 200 300 400
Index
Leverages
Index plot of Leverages
8 Modifying the model
# Refit the multiple regression with a log-transformed response.
# The assignment arrow is restored: with a bare `-` the model was never
# stored in lm.fit3. The formula is kept exactly as written so that the Call
# shown in the summary output is unchanged.
lm.fit3 <- lm(
  log(mpg) ~ weight + horsepower + acceleration + cylinders +
    cylinders * horsepower,
  data = auto
)
summary(lm.fit3)
##
## Call:
## lm(formula = log(mpg) ~ weight + horsepower + acceleration +
## cylinders + cylinders * horsepower, data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36326 -0.08544 -0.00528 0.08195 0.63418
##
8
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.421e+00 1.063e-01 41.613 < 2e-16 ***
## weight -1.365e-04 2.738e-05 -4.986 9.40e-07 ***
## horsepower -6.784e-03 9.846e-04 -6.890 2.35e-11 ***
## acceleration -1.308e-02 4.473e-03 -2.924 0.00366 **
## cylinders6 -7.404e-01 1.365e-01 -5.422 1.05e-07 ***
## cylinders8 -4.742e-01 1.141e-01 -4.156 4.01e-05 ***
## horsepower:cylinders6 6.271e-03 1.371e-03 4.574 6.50e-06 ***
## horsepower:cylinders8 3.456e-03 9.263e-04 3.731 0.00022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1439 on 377 degrees of freedom
## Multiple R-squared: 0.8259,Adjusted R-squared: 0.8227
## F-statistic: 255.5 on 7 and 377 DF, p-value: < 2.2e-16
# Draw the diagnostic plots of lm.fit3 on a 2 x 2 grid
par(mfrow = c(2, 2))
plot(lm.fit3)
2.4 2.6 2.8 3.0 3.2 3.4 3.6
−0.40.6
Fitted values
Residuals
Residuals vs Fitted
387
361365
−3 −2 −1 0 1 2 3
−24
Theoretical Quantiles
Standardizedresiduals
Normal Q−Q
387
361365
2.4 2.6 2.8 3.0 3.2 3.4 3.6
0.01.5
Fitted values
Standardizedresiduals
Scale−Location
387
361365
0.00 0.05 0.10 0.15 0.20 0.25
−24
Leverage
Standardizedresiduals
Cook's distance
0.5
0.5
1
Residuals vs Leverage
334361
387
# R-squared of the log-response model. The assignment arrow is restored:
# with a bare `-` this line was a subtraction and stored nothing.
Rsquared.mod <- summary(lm.fit3)$r.squared
8.1 Residuals and QQ Norm of Modified Model
# Stack three residual diagnostics for lm.fit3 vertically
par(mfrow = c(3, 1))
# Residuals against fitted values. The axis labels are quoted strings, and
# fitted()/resid() replace `$fit`/`$res`, which relied on partial matching
# of list element names.
plot(
  fitted(lm.fit3),
  resid(lm.fit3),
  xlab = "Fitted Values",
  ylab = "Residuals"
)
# Distribution of the residuals
hist(resid(lm.fit3))
# Normal Q-Q plot of the residuals with its reference line
qqnorm(resid(lm.fit3))
qqline(resid(lm.fit3))
9
2.4 2.6 2.8 3.0 3.2 3.4 3.6
−0.40.00.20.40.6
Fitted Values
Residuals
Histogram of resid(lm.fit3)
resid(lm.fit3)
Frequency
−0.4 −0.2 0.0 0.2 0.4 0.6
020406080
−3 −2 −1 0 1 2 3
−0.40.00.20.40.6
Normal Q−Q Plot
Theoretical Quantiles
SampleQuantiles
10
9 Comparison of Rsquared value among the Models
# Dataframe of the rsquareds: one row per model, in a single "Rsquared" column.
# The assignment arrows and the quotes around the string literals are
# restored; without them none of these statements assigned anything.
df2 <- data.frame(rbind(Rsquared.simple, Rsquared.multi, Rsquared.mod))
names(df2) <- "Rsquared"
rownames(df2) <- c("Simple", "Multiple", "Modified Multiple")
# LaTeX table of the comparison
dfTab <- xtable(df2)
Rsquared
Simple 0.61
Multiple 0.78
Modified Multiple 0.83
11

More Related Content

Similar to chapter3

MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709Min-hyung Kim
 
R Programming: Transform/Reshape Data In R
R Programming: Transform/Reshape Data In RR Programming: Transform/Reshape Data In R
R Programming: Transform/Reshape Data In RRsquared Academy
 
library(tidyr) and library(ggplot2)
library(tidyr)  and library(ggplot2)library(tidyr)  and library(ggplot2)
library(tidyr) and library(ggplot2)Dr. Volkan OBAN
 
cars design code power system detai.pptx
cars design code power system detai.pptxcars design code power system detai.pptx
cars design code power system detai.pptxabomoayad19309
 
第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出しWataru Shito
 
More Than po: Debugging in LLDB @ CocoaConf SJ 2015
More Than po: Debugging in LLDB @ CocoaConf SJ 2015More Than po: Debugging in LLDB @ CocoaConf SJ 2015
More Than po: Debugging in LLDB @ CocoaConf SJ 2015Michele Titolo
 
System Integration
System IntegrationSystem Integration
System Integrationanjal3753
 
Introduction to R
Introduction to RIntroduction to R
Introduction to RStacy Irwin
 
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)Wataru Shito
 
Read/Import data from flat/delimited files into R
Read/Import data from flat/delimited files into RRead/Import data from flat/delimited files into R
Read/Import data from flat/delimited files into RRsquared Academy
 
Toyota 02 7 fdf30 forklift service repair manual
Toyota 02 7 fdf30 forklift service repair manualToyota 02 7 fdf30 forklift service repair manual
Toyota 02 7 fdf30 forklift service repair manualfjjsekdmme
 
Toyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manualToyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manualudfjjsjekdkdmm
 
Toyota 02 7 fdf20 forklift service repair manual
Toyota 02 7 fdf20 forklift service repair manualToyota 02 7 fdf20 forklift service repair manual
Toyota 02 7 fdf20 forklift service repair manualfusjejedrfjskekem
 
Toyota 42 7 fgf15 forklift service repair manual
Toyota 42 7 fgf15 forklift service repair manualToyota 42 7 fgf15 forklift service repair manual
Toyota 42 7 fgf15 forklift service repair manualfhjsjejfjskekemm
 
Toyota 42 7 fgf25 forklift service repair manual
Toyota 42 7 fgf25 forklift service repair manualToyota 42 7 fgf25 forklift service repair manual
Toyota 42 7 fgf25 forklift service repair manualfhjsjejfjskekemm
 
Toyota 02 7 fdf18 forklift service repair manual
Toyota 02 7 fdf18 forklift service repair manualToyota 02 7 fdf18 forklift service repair manual
Toyota 02 7 fdf18 forklift service repair manualfjjsefkkertsemme
 
Toyota 02 7 fdf15 forklift service repair manual
Toyota 02 7 fdf15 forklift service repair manualToyota 02 7 fdf15 forklift service repair manual
Toyota 02 7 fdf15 forklift service repair manualdujjsjekkkdmm
 
Toyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manualToyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manualfhsejkdkmem
 

Similar to chapter3 (20)

MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709MH prediction modeling and validation in r (1) regression 190709
MH prediction modeling and validation in r (1) regression 190709
 
R Programming: Transform/Reshape Data In R
R Programming: Transform/Reshape Data In RR Programming: Transform/Reshape Data In R
R Programming: Transform/Reshape Data In R
 
library(tidyr) and library(ggplot2)
library(tidyr)  and library(ggplot2)library(tidyr)  and library(ggplot2)
library(tidyr) and library(ggplot2)
 
cars design code power system detai.pptx
cars design code power system detai.pptxcars design code power system detai.pptx
cars design code power system detai.pptx
 
第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し第5回 様々なファイル形式の読み込みとデータの書き出し
第5回 様々なファイル形式の読み込みとデータの書き出し
 
More Than po: Debugging in LLDB @ CocoaConf SJ 2015
More Than po: Debugging in LLDB @ CocoaConf SJ 2015More Than po: Debugging in LLDB @ CocoaConf SJ 2015
More Than po: Debugging in LLDB @ CocoaConf SJ 2015
 
System Integration
System IntegrationSystem Integration
System Integration
 
Tsukubar8
Tsukubar8Tsukubar8
Tsukubar8
 
Introduction to tibbles
Introduction to tibblesIntroduction to tibbles
Introduction to tibbles
 
Introduction to R
Introduction to RIntroduction to R
Introduction to R
 
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
第5回 様々なファイル形式の読み込みとデータの書き出し(解答付き)
 
Read/Import data from flat/delimited files into R
Read/Import data from flat/delimited files into RRead/Import data from flat/delimited files into R
Read/Import data from flat/delimited files into R
 
Toyota 02 7 fdf30 forklift service repair manual
Toyota 02 7 fdf30 forklift service repair manualToyota 02 7 fdf30 forklift service repair manual
Toyota 02 7 fdf30 forklift service repair manual
 
Toyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manualToyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manual
 
Toyota 02 7 fdf20 forklift service repair manual
Toyota 02 7 fdf20 forklift service repair manualToyota 02 7 fdf20 forklift service repair manual
Toyota 02 7 fdf20 forklift service repair manual
 
Toyota 42 7 fgf15 forklift service repair manual
Toyota 42 7 fgf15 forklift service repair manualToyota 42 7 fgf15 forklift service repair manual
Toyota 42 7 fgf15 forklift service repair manual
 
Toyota 42 7 fgf25 forklift service repair manual
Toyota 42 7 fgf25 forklift service repair manualToyota 42 7 fgf25 forklift service repair manual
Toyota 42 7 fgf25 forklift service repair manual
 
Toyota 02 7 fdf18 forklift service repair manual
Toyota 02 7 fdf18 forklift service repair manualToyota 02 7 fdf18 forklift service repair manual
Toyota 02 7 fdf18 forklift service repair manual
 
Toyota 02 7 fdf15 forklift service repair manual
Toyota 02 7 fdf15 forklift service repair manualToyota 02 7 fdf15 forklift service repair manual
Toyota 02 7 fdf15 forklift service repair manual
 
Toyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manualToyota 02 7 fgjf35 forklift service repair manual
Toyota 02 7 fgjf35 forklift service repair manual
 

chapter3

  • 1. Introduction to Statistical Learning Chapter 3 Tarek Dib tdib03@gmail.com April 12, 2015 1 Data Manipulation setwd("/home/tarek/ISLR/dataSets") auto <- read.csv("auto.csv") # Structure of the data set str(auto) ## 'data.frame': 397 obs. of 9 variables: ## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ... ## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ... ## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ... ## $ horsepower : Factor w/ 94 levels "?","100","102",..: 17 35 29 29 24 42 47 46 48 40 ... ## $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ... ## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ... ## $ year : int 70 70 70 70 70 70 70 70 70 70 ... ## $ origin : int 1 1 1 1 1 1 1 1 1 1 ... ## $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 # Remove the "name" column auto <- auto[,-9] # Remove rows where horsepower is missing auto <- auto[auto$horsepower != "?",] # Convert horsepower to numeric auto$horsepower <- as.numeric(as.character(auto$horsepower)) # Change cylinders into a factor auto$cylinders <- as.factor(auto$cylinders) # Convert year to a factor data type auto$year <- factor(auto$year) 2 Exploratory Data Analysis 1
  • 2. library(ggplot2) p.weight <- ggplot(auto, aes(weight, mpg, colour=cylinders)) p.weight + geom_point(size=2) + facet_grid(.~ cylinders) + ggtitle("Mileage vs. Weight Grouped By Cylinders") 3 4 5 6 8 10 20 30 40 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 weight mpg cylinders 3 4 5 6 8 Mileage vs. Weight Grouped By Cylinders p.HP <- ggplot(auto, aes(horsepower, mpg, colour=cylinders)) p.HP + geom_point(size=2) + facet_grid(.~ cylinders) + ggtitle("Mileage vs. Horsepower") 3 4 5 6 8 10 20 30 40 50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200 50 100 150 200 horsepower mpg cylinders 3 4 5 6 8 Mileage vs. Horsepower p.mpgHP <- ggplot(auto, aes(horsepower, mpg)) p.mpgHP + geom_point() + stat_smooth(method="lm", se=F) 10 20 30 40 50 100 150 200 horsepower mpg 2
  • 3. p.year <- ggplot(auto, aes(year, mpg, colour=cylinders)) p.year + geom_boxplot() + ggtitle("Mileage change over the years Grouped by Cylinders") 10 20 30 40 70 71 72 73 74 75 76 77 78 79 80 81 82 year mpg cylinders 3 4 5 6 8 Mileage change over the years Grouped by Cylinders plt1 <- ggplot(auto, aes(cylinders, mpg)) plt1 + geom_boxplot() 10 20 30 40 3 4 5 6 8 cylinders mpg 3 Simple Linear Regression, mpg vs horsepower lm.fit1 <- lm(mpg ~ horsepower, data = auto) s <- summary(lm.fit1) library(xtable) summaryTab <- xtable(s, caption = "Simple Linear Regression Model, Summary Tablelabel{tab:mpg vs. hors ,label="tab:mpg vs. horsepower") Rsquared.simple <- s$r.squared # Predict mpg for horsepower = 98, and its 95% confidence and prediction interval, respectively predict(lm.fit1, newdata=data.frame(horsepower=98), interval="confidence") ## fit lwr upr ## 1 24.46708 23.97308 24.96108 3
  • 4. predict(lm.fit1, newdata=data.frame(horsepower=98), interval="prediction") ## fit lwr upr ## 1 24.46708 14.8094 34.12476 p.mpgHP + geom_point() + stat_smooth(method="lm", se=F) + geom_abline(intercept = coef(lm.fit1)[1], slope=coef(lm.fit1)[2]) 10 20 30 40 50 100 150 200 horsepower mpg Table 1: Simple Linear Regression Model, Summary Table Estimate Std. Error t value Pr(>|t|) (Intercept) 39.9359 0.7175 55.66 0.0000 horsepower -0.1578 0.0064 -24.49 0.0000 4 Removing Cylinders 3 and 5 Since there are only 4 cars with 3 cylinders and 3 cars with 5 cylinders, I have decided to exclude cars with 3 and 5 cylinders, and focus on presenting and analyzing cars with 4, 6 and 8 cylinders. # Exclude 3 and 5 cylinder cars auto <- auto[auto$cylinders != 3 & auto$cylinders != 5,] # plot it h1 <- ggplot(auto, aes(horsepower, mpg, colour = cylinders)) h1 + geom_point() + facet_grid(. ~ cylinders) 4
  • 5. 4 6 8 10 20 30 40 50 100 150 200 50 100 150 200 50 100 150 200 horsepower mpg cylinders 4 6 8 plt3 <- ggplot(auto, aes(weight, mpg, colour = cylinders)) plt3 + geom_point() + facet_grid(. ~ cylinders) + ggtitle("MPG vs. Weight Grouped by Cylinders") + stat_smooth(method="lm", se=F) 4 6 8 10 20 30 40 2000 3000 4000 5000 2000 3000 4000 5000 2000 3000 4000 5000 weight mpg cylinders 4 6 8 MPG vs. Weight Grouped by Cylinders 5 Correlation Matrix # Create a dataframe with the numeric variables: mpg, displacement, horsepower, weight, acceleration df1 <- auto[,c("mpg", "displacement", "horsepower", "weight", "acceleration")] # Correlation matrix Cor <- cor(df1) CorTab <- xtable(Cor, caption = "Correlation Matrixlabel{tab:Correlation}" ,label="Correlation") 6 Multiple Linear Regression Model # Multi linear regression model lm.fit2 <- lm(mpg ~ weight + horsepower + acceleration + cylinders + cylinders:horsepower, data=auto) 5
  • 6. Table 2: Correlation Matrix mpg displacement horsepower weight acceleration mpg 1.00 -0.82 -0.78 -0.84 0.42 displacement -0.82 1.00 0.90 0.94 -0.56 horsepower -0.78 0.90 1.00 0.87 -0.69 weight -0.84 0.94 0.87 1.00 -0.43 acceleration 0.42 -0.56 -0.69 -0.43 1.00 # Estimate Variance Inflation Factor (vif) found in the car package. # If vif >= 10, then remove the predictor for multicollinearity issue library(car) vifTab <- xtable(vif(lm.fit2)) s.fit2 <- summary(lm.fit2) s.fit2.Tab <- xtable(s.fit2, caption = "Multiple Linear Regression Model, Summary Table") Rsquared.multi <- s.fit2$r.squared print(vifTab) GVIF Df GVIF^(1/(2*Df)) weight 10.14 1.00 3.18 horsepower 26.99 1.00 5.20 acceleration 2.81 1.00 1.68 cylinders 2129.64 2.00 6.79 horsepower:cylinders 3527.06 2.00 7.71 print(s.fit2.Tab, caption.placement="top") Table 3: Multiple Linear Regression Model, Summary Table Estimate Std. Error t value Pr(>|t|) (Intercept) 57.4208 2.7576 20.82 0.0000 weight -0.0027 0.0007 -3.76 0.0002 horsepower -0.2141 0.0256 -8.38 0.0000 acceleration -0.3145 0.1161 -2.71 0.0071 cylinders6 -22.2642 3.5438 -6.28 0.0000 cylinders8 -18.9358 2.9615 -6.39 0.0000 horsepower:cylinders6 0.1992 0.0356 5.60 0.0000 horsepower:cylinders8 0.1607 0.0240 6.69 0.0000 7 Model Diagnostics 6
  • 7. −10 0 10 10 20 30 fitted residuals Figure 1: Residuals vs. Fitted Values. The figure indicates that the homoscedasticity assumption, i.e. constant variance, is violated. Thus, the multiple linear regression model developed above seems not to be suitable. The response variable may need to be transformed and the model then refitted. A log transformation of the response (mpg) variable may rectify the non-constant variance. −10 0 10 −2 0 2 theoretical sample Figure 2: qqnorm and qqline plots to test for the normality assumption. The data do not seem to deviate much from the normality assumption. There seem to be a few outliers in the data set. 7
  • 8. 0 20 40 60 −10 0 10 20 residuals count 0.00 0.05 0.10 0.15 0.20 0.25 0 100 200 300 400 Index Leverages Index plot of Leverages 8 Modifying the model lm.fit3 - lm(log(mpg) ~ weight + horsepower + acceleration + cylinders + cylinders * horsepower, data=auto) summary(lm.fit3) ## ## Call: ## lm(formula = log(mpg) ~ weight + horsepower + acceleration + ## cylinders + cylinders * horsepower, data = auto) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.36326 -0.08544 -0.00528 0.08195 0.63418 ## 8
  • 9. ## Coefficients: ## Estimate Std. Error t value Pr(|t|) ## (Intercept) 4.421e+00 1.063e-01 41.613 2e-16 *** ## weight -1.365e-04 2.738e-05 -4.986 9.40e-07 *** ## horsepower -6.784e-03 9.846e-04 -6.890 2.35e-11 *** ## acceleration -1.308e-02 4.473e-03 -2.924 0.00366 ** ## cylinders6 -7.404e-01 1.365e-01 -5.422 1.05e-07 *** ## cylinders8 -4.742e-01 1.141e-01 -4.156 4.01e-05 *** ## horsepower:cylinders6 6.271e-03 1.371e-03 4.574 6.50e-06 *** ## horsepower:cylinders8 3.456e-03 9.263e-04 3.731 0.00022 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.1439 on 377 degrees of freedom ## Multiple R-squared: 0.8259,Adjusted R-squared: 0.8227 ## F-statistic: 255.5 on 7 and 377 DF, p-value: 2.2e-16 par(mfrow=c(2,2)) plot(lm.fit3) 2.4 2.6 2.8 3.0 3.2 3.4 3.6 −0.40.6 Fitted values Residuals Residuals vs Fitted 387 361365 −3 −2 −1 0 1 2 3 −24 Theoretical Quantiles Standardizedresiduals Normal Q−Q 387 361365 2.4 2.6 2.8 3.0 3.2 3.4 3.6 0.01.5 Fitted values Standardizedresiduals Scale−Location 387 361365 0.00 0.05 0.10 0.15 0.20 0.25 −24 Leverage Standardizedresiduals Cook's distance 0.5 0.5 1 Residuals vs Leverage 334361 387 Rsquared.mod - summary(lm.fit3)$r.squared 8.1 Residuals and QQ Norm of Modied Model par(mfrow=c(3,1)) plot(lm.fit3$fit, lm.fit3$res, xlab=Fitted Values, ylab=Residuals) hist(resid(lm.fit3)) qqnorm(resid(lm.fit3)) qqline(resid(lm.fit3)) 9
  • 10. 2.4 2.6 2.8 3.0 3.2 3.4 3.6 −0.40.00.20.40.6 Fitted Values Residuals Histogram of resid(lm.fit3) resid(lm.fit3) Frequency −0.4 −0.2 0.0 0.2 0.4 0.6 020406080 −3 −2 −1 0 1 2 3 −0.40.00.20.40.6 Normal Q−Q Plot Theoretical Quantiles SampleQuantiles 10
  • 11. 9 Comparison of Rsquared value among the Models # Dataframe of the rsquareds df2 - data.frame(rbind(Rsquared.simple, Rsquared.multi, Rsquared.mod)) names(df2) - Rsquared rownames(df2) - c(Simple, Multiple, Modified Multiple) dfTab - xtable(df2) Rsquared Simple 0.61 Multiple 0.78 Modied Multiple 0.83 11