More Related Content Similar to Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practitioners (20) More from Raman Kannan (20) Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practitioners2. https://www.r-project.org/
Essential tools:
RGUI Basic R processing,
RScript to run batch scripts,
RCMD (to install in Unix/Linux) variants
RStudio is a compelling tool, but defer RStudio until you know R very well –
tools can limit you, and starting with RStudio to learn the language is a bad
idea, IMHO.
Reference sites (that I often use; don’t leave home without them):
https://www.r-bloggers.com
https://nabble.com/
http://rfunction.com
https://stackoverflow.com/
https://stats.stackexchange.com/
https://www.datasciencemadesimple.com/
http://www.r-tutor.com/
There are thousands, if not more, of useful R sites you can learn from.
Again, use them to do what you want to get done… otherwise you will be sucked
into a vortex.
…
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
2 of 31 10/18/2020, 4:30 AM
3. ls()
## character(0)
X<-5
7->Y
ifelse(X<Y,'X is Less than Y', 'X is atleast equal to Y')
## [1] "X is Less than Y"
vec<-1:13
is.vector(vec)
## [1] TRUE
vec[4]
## [1] 4
by2<-seq(1,13,2)
(xy2<-seq(1,13,2))
## [1] 1 3 5 7 9 11 13
xy2[4]
## [1] 7
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
3 of 31 10/18/2020, 4:30 AM
4. is.vector(xy2[4])
## [1] TRUE
length(xy2[4])
## [1] 1
vec[vec %in% by2]
## [1] 1 3 5 7 9 11 13
(xyeven<-seq(0,13,2))
## [1] 0 2 4 6 8 10 12
length(vec)
## [1] 13
mean(vec)
## [1] 7
sd(vec)
## [1] 3.89444
sum(vec)
## [1] 91
cumprod(vec)
## [1] 1 2 6 24 120 720
## [7] 5040 40320 362880 3628800 39916800 479001600
## [13] 6227020800
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
4 of 31 10/18/2020, 4:30 AM
5. L<-list(X=5,reason="I like 5")
L
## $X
## [1] 5
##
## $reason
## [1] "I like 5"
mx<-matrix(c(rep(0,5),seq(1:5)),nrow=2,ncol=5) # fixed the error now the mx
should have correct values not ALL zeros
mx
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 2 4
## [2,] 0 0 1 3 5
mxbyr<-matrix(c(rep(0,5),seq(1:5)),nrow=2,ncol=5,byrow=TRUE)
mxbyr
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 0 0
## [2,] 1 2 3 4 5
dd <- structure(list(
population = c(4.560667108, 1.275920972)
,continents = c('Asia', 'Africa'))
,.Names = c("Pop", "Continent")
,row.names = c(NA, -2L)
,class = "data.frame")
dd
## Pop Continent
## 1 4.560667 Asia
## 2 1.275921 Africa
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
5 of 31 10/18/2020, 4:30 AM
6. dd<-rbind(dd,c(4.1570842,'Oceania'))
dd
## Pop Continent
## 1 4.560667108 Asia
## 2 1.275920972 Africa
## 3 4.1570842 Oceania
dd<-cbind(dd,density=c(100,36,4))
dd<-rbind(dd,c(0,'pangea'))
dd
## Pop Continent density
## 1 4.560667108 Asia 100
## 2 1.275920972 Africa 36
## 3 4.1570842 Oceania 4
## 4 0 pangea 0
which(dd$Pop==0)
## [1] 4
dd<-dd[-which(dd$Pop==0),]
dd
## Pop Continent density
## 1 4.560667108 Asia 100
## 2 1.275920972 Africa 36
## 3 4.1570842 Oceania 4
birds<-data.frame(nlegs=rep(2,5),can_fly=c(0,1,1,0,1),height=c(25,40,20,150,
10),
color=c('black','black','blue','black','brown'))
birds2<-cbind(birds,c('chicken','vulture','parrot','ostrich','sparrow'))
names(birds2)<-c('nlegs','can_fly','height','color','species')
birds2
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
6 of 31 10/18/2020, 4:30 AM
7. ## nlegs can_fly height color species
## 1 2 0 25 black chicken
## 2 2 1 40 black vulture
## 3 2 1 20 blue parrot
## 4 2 0 150 black ostrich
## 5 2 1 10 brown sparrow
…
chickencolors<-c('black','white','red','mixed')
vulturecolors<-c('grey','black','white')
parrotcolors<-c('teal','green','blue','mixed','pink')
ostrichcolors<-c('grey','black')
sparrowcolors<-c('dark cement','brown')
hchicken<-sample(rnorm(10,25,6),5)
hvulture<-sample(rnorm(10,40,4),5)
hparrot<-sample(rnorm(10,20,2),5)
hostrich<-sample(rnorm(10,150,20),5)
hsparrow<-sample(rnorm(10,10,1),5)
cdset<-rbind(birds2,data.frame(nlegs=rep(2,5),can_fly=rep(0,5), height=hchic
ken,
color=sample(chickencolors,5,replace=T),species=rep('chicken',5)),
data.frame(nlegs=rep(2,5),can_fly=rep(1,5), height=hvulture,
color=sample(vulturecolors,5,replace=T),species=rep('vulture',5)),
data.frame(nlegs=rep(2,5),can_fly=rep(1,5), height=hparrot,
color=sample(parrotcolors,5,replace=T),species=rep('parrot',5)),
data.frame(nlegs=rep(2,5),can_fly=rep(0,5), height=hostrich,
color=sample(ostrichcolors,5,replace=T),species=rep('ostrich',5)),
data.frame(nlegs=rep(2,5),can_fly=rep(1,5), height=hsparrow,
color=sample(sparrowcolors,5,replace=T),species=rep('sparrow',5)))
cdset # just print out the contents
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
7 of 31 10/18/2020, 4:30 AM
8. ## nlegs can_fly height color species
## 1 2 0 25.000000 black chicken
## 2 2 1 40.000000 black vulture
## 3 2 1 20.000000 blue parrot
## 4 2 0 150.000000 black ostrich
## 5 2 1 10.000000 brown sparrow
## 6 2 0 21.795787 red chicken
## 7 2 0 39.459162 mixed chicken
## 8 2 0 22.981968 black chicken
## 9 2 0 17.744720 black chicken
## 10 2 0 25.911222 mixed chicken
## 11 2 1 39.016163 white vulture
## 12 2 1 40.037789 white vulture
## 13 2 1 42.251693 grey vulture
## 14 2 1 39.014589 grey vulture
## 15 2 1 38.475420 white vulture
## 16 2 1 20.316044 mixed parrot
## 17 2 1 22.712721 teal parrot
## 18 2 1 22.840455 mixed parrot
## 19 2 1 14.934359 blue parrot
## 20 2 1 21.195914 blue parrot
## 21 2 0 160.085412 black ostrich
## 22 2 0 140.594205 black ostrich
## 23 2 0 174.088029 grey ostrich
## 24 2 0 157.684178 grey ostrich
## 25 2 0 135.249085 grey ostrich
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
dim(cdset) # what are the dimensions
## [1] 30 5
nrow(cdset) # number of rows
## [1] 30
ncol(cdset) # number of columns
## [1] 5
names(cdset) # data.frames have names matrices dont
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
8 of 31 10/18/2020, 4:30 AM
9. ## [1] "nlegs" "can_fly" "height" "color" "species"
head(cdset)
## nlegs can_fly height color species
## 1 2 0 25.00000 black chicken
## 2 2 1 40.00000 black vulture
## 3 2 1 20.00000 blue parrot
## 4 2 0 150.00000 black ostrich
## 5 2 1 10.00000 brown sparrow
## 6 2 0 21.79579 red chicken
tail(cdset)
## nlegs can_fly height color species
## 25 2 0 135.249085 grey ostrich
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
row.names(cdset)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "1
4" "15"
## [16] "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "2
9" "30"
cdset[1,3]# just one cell
## [1] 25
cdset[1,] # entire observation
## nlegs can_fly height color species
## 1 2 0 25 black chicken
cdset[,3]# entire column or the feature
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
9 of 31 10/18/2020, 4:30 AM
10. ## [1] 25.000000 40.000000 20.000000 150.000000 10.000000 21.795787
## [7] 39.459162 22.981968 17.744720 25.911222 39.016163 40.037789
## [13] 42.251693 39.014589 38.475420 20.316044 22.712721 22.840455
## [19] 14.934359 21.195914 160.085412 140.594205 174.088029 157.684178
## [25] 135.249085 9.295639 11.266186 9.336063 10.169087 11.060101
cdset[cdset$species=='sparrow',]# review just the sparrow data: entire observations
## nlegs can_fly height color species
## 5 2 1 10.000000 brown sparrow
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
cdset[cdset$species=='sparrow',c(1,3,5)]# just some of the columns
## nlegs height species
## 5 2 10.000000 sparrow
## 26 2 9.295639 sparrow
## 27 2 11.266186 sparrow
## 28 2 9.336063 sparrow
## 29 2 10.169087 sparrow
## 30 2 11.060101 sparrow
cdset[cdset$species=='sparrow',c('nlegs','species')]# or by column names
## nlegs species
## 5 2 sparrow
## 26 2 sparrow
## 27 2 sparrow
## 28 2 sparrow
## 29 2 sparrow
## 30 2 sparrow
cdset[cdset$species=='sparrow',-which(names(cdset)=='species')]# filter OUT some columns
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
10 of 31 10/18/2020, 4:30 AM
11. ## nlegs can_fly height color
## 5 2 1 10.000000 brown
## 26 2 1 9.295639 dark cement
## 27 2 1 11.266186 dark cement
## 28 2 1 9.336063 brown
## 29 2 1 10.169087 brown
## 30 2 1 11.060101 brown
cdset[cdset$species=='sparrow',-which(names(cdset)%in%c('nlegs','species'))]
## can_fly height color
## 5 1 10.000000 brown
## 26 1 9.295639 dark cement
## 27 1 11.266186 dark cement
## 28 1 9.336063 brown
## 29 1 10.169087 brown
## 30 1 11.060101 brown
…
…
…
lapply(1:3,FUN=function(x)x*x) -> exl
exl
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
11 of 31 10/18/2020, 4:30 AM
12. mx
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 0 0 2 4
## [2,] 0 0 1 3 5
apply(mx,2,sd)
## [1] 0.0000000 0.0000000 0.7071068 0.7071068 0.7071068
(mx1<-sapply(mx,FUN=function(x)x+1))
## [1] 1 1 1 1 1 2 3 4 5 6
…
…
prodidlist<-c(paste("P0",1:9,sep=''),paste("P",10:99,sep=''))
cidlist<-c(paste("C0",1:9,sep=''),paste("C",10:22,sep=''))
(df<-data.frame(DID=1,CID="C01",
PID=sample(prodidlist,sample(1:20,1),replace=F),
stringsAsFactors=F))
## DID CID PID
## 1 1 C01 P88
## 2 1 C01 P53
## 3 1 C01 P86
## 4 1 C01 P90
## 5 1 C01 P21
## 6 1 C01 P34
(sample(prodidlist,sample(1:20,1),replace=F))
## [1] "P75" "P34" "P40" "P27" "P71" "P06" "P72"
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
12 of 31 10/18/2020, 4:30 AM
13. mdf<-do.call('rbind',lapply(1:50,FUN=function(x)
{
cidlist<-sample(cidlist,sample(1:length(cidlist),1),replace=F)
dfi<-do.call('rbind',lapply(cidlist,FUN=
function(cid)data.frame(DID=x,CID=cid,
PID=sample(prodidlist,sample(1:20,1),replace=F))))
}
))
write.table(mdf,
file='purchases.csv',
sep=',',row.names=F,
col.names=T,
quote=F)
head(mdf)
## DID CID PID
## 1 1 C19 P19
## 2 1 C19 P33
## 3 1 C19 P91
## 4 1 C19 P78
## 5 1 C19 P66
## 6 1 C19 P64
nrow(mdf)
## [1] 6436
…
read.csv('purchases.csv',head=T,sep=',')->rmdf
titanic<-read.csv("http://christianherta.de/lehre/dataScience/machineLearnin
g/data/titanic-train.csv",header=T)
head(titanic)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
13 of 31 10/18/2020, 4:30 AM
14. ## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Pa
rch
## 1 Braund, Mr. Owen Harris male 22 1
0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
0
## 3 Heikkinen, Miss. Laina female 26 0
0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
0
## 5 Allen, Mr. William Henry male 35 0
0
## 6 Moran, Mr. James male NA 0
0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
dim(titanic)
## [1] 891 12
table(mdf==rmdf)
##
## TRUE
## 19308
cumprod(dim(mdf)) # rows * columns = the number of elements; all of them match, as they should
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
14 of 31 10/18/2020, 4:30 AM
15. ## [1] 6436 19308
nrow(rmdf)* ncol(rmdf)
## [1] 19308
quantmod::getSymbols(c("IBM","SPY"),from='2020-01-01')
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "IBM" "SPY"
dim(IBM)
## [1] 201 6
#dim(JNJ)
#quantmod::getSymbols(c("SPY"),from='2020-01-01')
head(IBM)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
15 of 31 10/18/2020, 4:30 AM
16. ## IBM.Open IBM.High IBM.Low IBM.Close IBM.Volume IBM.Adjusted
## 2020-01-02 135.00 135.92 134.77 135.42 3148600 130.5377
## 2020-01-03 133.57 134.86 133.56 134.34 2373700 129.4967
## 2020-01-06 133.42 134.24 133.20 134.10 2425500 129.2654
## 2020-01-07 133.69 134.96 133.40 134.19 3090800 129.3521
## 2020-01-08 134.51 135.86 133.92 135.31 4346000 130.4317
## 2020-01-09 135.74 136.79 135.31 136.74 3730600 131.8102
head(IBM$IBM.Adjusted)
## IBM.Adjusted
## 2020-01-02 130.5377
## 2020-01-03 129.4967
## 2020-01-06 129.2654
## 2020-01-07 129.3521
## 2020-01-08 130.4317
## 2020-01-09 131.8102
DIBM<-c(head(IBM$IBM.Adjusted,1),head(IBM$IBM.Adjusted,200)) ## fixed the error
head((dailyIBMReturns<-(((as.numeric(IBM$IBM.Adjusted)/DIBM) -1)*100))) ## fixed the error; results now have full precision
## IBM.Adjusted
## 2020-01-02 0.00000000
## 2020-01-02 -0.79751719
## 2020-01-03 -0.17863237
## 2020-01-06 0.06709531
## 2020-01-07 0.83464824
## 2020-01-08 1.05682335
dailyReturnIBM<-dailyIBMReturns#(IBM[[6]]/DIBM)-1
head(dailyReturnIBM)
## IBM.Adjusted
## 2020-01-02 0.00000000
## 2020-01-02 -0.79751719
## 2020-01-03 -0.17863237
## 2020-01-06 0.06709531
## 2020-01-07 0.83464824
## 2020-01-08 1.05682335
DSPY<-c(head(SPY$SPY.Adjusted,1),head(SPY$SPY.Adjusted,200)) ## fixed the error
head((dailySPYReturns<-(((as.numeric(SPY$SPY.Adjusted)/DSPY) -1)*100))) ## fixed the error; results now have full precision
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
16 of 31 10/18/2020, 4:30 AM
17. ## SPY.Adjusted
## 2020-01-02 0.0000000
## 2020-01-02 -0.7572182
## 2020-01-03 0.3815075
## 2020-01-06 -0.2811862
## 2020-01-07 0.5329669
## 2020-01-08 0.6780544
dailyReturnSPY<-dailySPYReturns#(SPY[[6]]/DSPY)-1
lmModel<-lm(dailyReturnIBM~dailyReturnSPY)
summary(lmModel)
##
## Call:
## lm(formula = dailyReturnIBM ~ dailyReturnSPY)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7497 -0.7827 -0.1046 0.6906 7.0070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.04934 0.09647 -0.511 0.61
## dailyReturnSPY 1.02923 0.04193 24.547 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.367 on 199 degrees of freedom
## Multiple R-squared: 0.7517, Adjusted R-squared: 0.7505
## F-statistic: 602.6 on 1 and 199 DF, p-value: < 2.2e-16
oldPar<-par(mfrow=c(2,1))
plot(dailyReturnIBM,color='black')
plot(dailyReturnSPY,color='blue')
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
17 of 31 10/18/2020, 4:30 AM
19. ## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
purchases<-mdf
rpt01<-sqldf('select DID,count(distinct(CID)) from mdf group by DID')
head(rpt01)
## DID count(distinct(CID))
## 1 1 14
## 2 2 19
## 3 3 6
## 4 4 16
## 5 5 16
## 6 6 9
tail(rpt01)
## DID count(distinct(CID))
## 45 45 22
## 46 46 15
## 47 47 15
## 48 48 19
## 49 49 10
## 50 50 10
sqldf('select distinct CID from mdf where DID=50')
## CID
## 1 C06
## 2 C21
## 3 C03
## 4 C11
## 5 C04
## 6 C17
## 7 C08
## 8 C14
## 9 C18
## 10 C15
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
19 of 31 10/18/2020, 4:30 AM
21. SELECT A.p1,
A.p2,
A.p1p2c / B.p1c AS condProb
FROM (SELECT apid P1,
bpid P2,
Count(*) P1P2C
FROM (SELECT A.did AS ADID,
A.cid AS ACID,
A.pid AS APID,
B.did AS BDID,
B.cid AS BCID,
B.pid AS BPID
FROM purchases A
JOIN purchases B
ON A.cid = B.cid
AND A.did = B.did
AND A.pid < B.pid) X
GROUP BY apid,
bpid) A
JOIN (SELECT pid AS P1,
Count(*) P1C
FROM purchases
GROUP BY pid) B
ON A.p1 = B.p1
ORDER BY condprob DESC;
sqlstr<-"select A.P1,A.P2, (A.P1P2C*100)/B.P1C as condProb from ( Select API
D P1,BPID P2,count(*) P1P2C from ( select A.DID as
ADID, A.CID as ACID , A.PID as APID , B.DID as BDID, B.CID as BCID , B.PID a
s BPID from purchases A join purchases B on
A.CID=B.CID AND A.DID=B.DID AND A.PID < B.PID ) X group by APID,BPID ) A jo
in (select PID as P1, count(*) P1C from purchases group by PID) B on A.P1=B.
P1 order by condProb desc"
sqlstr
## [1] "select A.P1,A.P2, (A.P1P2C*100)/B.P1C as condProb from ( Select APID
P1,BPID P2,count(*) P1P2C from ( select A.DID asnADID, A.CID as ACID , A.P
ID as APID , B.DID as BDID, B.CID as BCID , B.PID as BPID from purchases A j
oin purchases B onnA.CID=B.CID AND A.DID=B.DID AND A.PID < B.PID ) X group
by APID,BPID ) A join (select PID as P1, count(*) P1C from purchases group b
y PID) B on A.P1=B.P1 order by condProb desc"
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
21 of 31 10/18/2020, 4:30 AM
22. condprob<-sqldf(sqlstr)
#condprob
head(condprob)
## P1 P2 condProb
## 1 P27 P61 31
## 2 P66 P97 31
## 3 P04 P92 29
## 4 P27 P53 29
## 5 P27 P95 29
## 6 P52 P79 29
require(rpart)
## Loading required package: rpart
require(rpart.plot)
## Loading required package: rpart.plot
require(klaR)
## Loading required package: klaR
## Loading required package: MASS
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
22 of 31 10/18/2020, 4:30 AM
23. set.seed(43)
tridx<-sample(1:30,20,replace=F)
trdata<-cdset[tridx,]
tstdata<-cdset[-tridx,]
trmodel.rpart<-rpart(species~.,data=trdata,minsplit=2)
rpart.plot(trmodel.rpart)
#compare this to
table(trdata$species)/nrow(trdata)
##
## chicken ostrich parrot sparrow vulture
## 0.20 0.25 0.15 0.20 0.20
predicted.trmodel.rpart<-predict(trmodel.rpart,trdata[,-5],type='class')
table(trdata[,5],predicted.trmodel.rpart)
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
23 of 31 10/18/2020, 4:30 AM
24. ## predicted.trmodel.rpart
## chicken ostrich parrot sparrow vulture
## chicken 4 0 0 0 0
## ostrich 0 5 0 0 0
## parrot 0 0 3 0 0
## sparrow 0 0 0 4 0
## vulture 0 0 0 0 4
# Keep only the test rows whose color also appears in the training set.
# With such a small data set, the tree cannot process colors that are
# present in test but not in train.
tstdatnw<-tstdata[tstdata$color %in% trdata$color,]
tstdatnw
## nlegs can_fly height color species
## 10 2 0 25.91122 mixed chicken
## 11 2 1 39.01616 white vulture
## 15 2 1 38.47542 white vulture
## 18 2 1 22.84045 mixed parrot
## 20 2 1 21.19591 blue parrot
## 25 2 0 135.24908 grey ostrich
## 29 2 1 10.16909 brown sparrow
## 30 2 1 11.06010 brown sparrow
predicted.tstdatnw.rpart<-predict(trmodel.rpart,tstdatnw[,-5],type='class')
table(tstdatnw[,5],predicted.tstdatnw.rpart)
## predicted.tstdatnw.rpart
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
caret::confusionMatrix( table(tstdatnw[,5],predicted.tstdatnw.rpart))
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
24 of 31 10/18/2020, 4:30 AM
25. ## Confusion Matrix and Statistics
##
## predicted.tstdatnw.rpart
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.6306, 1)
## No Information Rate : 0.25
## P-Value [Acc > NIR] : 1.526e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: chicken Class: ostrich Class: parrot Class: s
parrow
## Sensitivity 1.000 1.000 1.00
1.00
## Specificity 1.000 1.000 1.00
1.00
## Pos Pred Value 1.000 1.000 1.00
1.00
## Neg Pred Value 1.000 1.000 1.00
1.00
## Prevalence 0.125 0.125 0.25
0.25
## Detection Rate 0.125 0.125 0.25
0.25
## Detection Prevalence 0.125 0.125 0.25
0.25
## Balanced Accuracy 1.000 1.000 1.00
1.00
## Class: vulture
## Sensitivity 1.00
## Specificity 1.00
## Pos Pred Value 1.00
## Neg Pred Value 1.00
## Prevalence 0.25
## Detection Rate 0.25
## Detection Prevalence 0.25
## Balanced Accuracy 1.00
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
25 of 31 10/18/2020, 4:30 AM
26. tstdatnw[,5]
## [1] "chicken" "vulture" "vulture" "parrot" "parrot" "ostrich" "sparrow"
## [8] "sparrow"
predicted.tstdatnw.rpart
## 10 11 15 18 20 25 29 30
## chicken vulture vulture parrot parrot ostrich sparrow sparrow
## Levels: chicken ostrich parrot sparrow vulture
table(as.character(tstdatnw[,5]),as.character(predicted.tstdatnw.rpart))
##
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
caret::confusionMatrix( table(tstdatnw[,5],predicted.tstdatnw.rpart))
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
26 of 31 10/18/2020, 4:30 AM
27. ## Confusion Matrix and Statistics
##
## predicted.tstdatnw.rpart
## chicken ostrich parrot sparrow vulture
## chicken 1 0 0 0 0
## ostrich 0 1 0 0 0
## parrot 0 0 2 0 0
## sparrow 0 0 0 2 0
## vulture 0 0 0 0 2
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.6306, 1)
## No Information Rate : 0.25
## P-Value [Acc > NIR] : 1.526e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: chicken Class: ostrich Class: parrot Class: s
parrow
## Sensitivity 1.000 1.000 1.00
1.00
## Specificity 1.000 1.000 1.00
1.00
## Pos Pred Value 1.000 1.000 1.00
1.00
## Neg Pred Value 1.000 1.000 1.00
1.00
## Prevalence 0.125 0.125 0.25
0.25
## Detection Rate 0.125 0.125 0.25
0.25
## Detection Prevalence 0.125 0.125 0.25
0.25
## Balanced Accuracy 1.000 1.000 1.00
1.00
## Class: vulture
## Sensitivity 1.00
## Specificity 1.00
## Pos Pred Value 1.00
## Neg Pred Value 1.00
## Prevalence 0.25
## Detection Rate 0.25
## Detection Prevalence 0.25
## Balanced Accuracy 1.00
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
27 of 31 10/18/2020, 4:30 AM
28. …
cdset
## nlegs can_fly height color species
## 1 2 0 25.000000 black chicken
## 2 2 1 40.000000 black vulture
## 3 2 1 20.000000 blue parrot
## 4 2 0 150.000000 black ostrich
## 5 2 1 10.000000 brown sparrow
## 6 2 0 21.795787 red chicken
## 7 2 0 39.459162 mixed chicken
## 8 2 0 22.981968 black chicken
## 9 2 0 17.744720 black chicken
## 10 2 0 25.911222 mixed chicken
## 11 2 1 39.016163 white vulture
## 12 2 1 40.037789 white vulture
## 13 2 1 42.251693 grey vulture
## 14 2 1 39.014589 grey vulture
## 15 2 1 38.475420 white vulture
## 16 2 1 20.316044 mixed parrot
## 17 2 1 22.712721 teal parrot
## 18 2 1 22.840455 mixed parrot
## 19 2 1 14.934359 blue parrot
## 20 2 1 21.195914 blue parrot
## 21 2 0 160.085412 black ostrich
## 22 2 0 140.594205 black ostrich
## 23 2 0 174.088029 grey ostrich
## 24 2 0 157.684178 grey ostrich
## 25 2 0 135.249085 grey ostrich
## 26 2 1 9.295639 dark cement sparrow
## 27 2 1 11.266186 dark cement sparrow
## 28 2 1 9.336063 brown sparrow
## 29 2 1 10.169087 brown sparrow
## 30 2 1 11.060101 brown sparrow
klaR::partimat(Species~.,data=iris,method="lda")
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
28 of 31 10/18/2020, 4:30 AM
29. cat ("the sample proportions are")
## the sample proportions are
table(cdset$species)
##
## chicken ostrich parrot sparrow vulture
## 6 6 6 6 6
table(cdset$species)/sum(table(cdset$species))
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
29 of 31 10/18/2020, 4:30 AM
30. ##
## chicken ostrich parrot sparrow vulture
## 0.2 0.2 0.2 0.2 0.2
cat ("the training set proportions are")
## the training set proportions are
table(trdata$species)
##
## chicken ostrich parrot sparrow vulture
## 4 5 3 4 4
table(trdata$species)/sum(table(trdata$species))
##
## chicken ostrich parrot sparrow vulture
## 0.20 0.25 0.15 0.20 0.20
cat ("the test set proportions are")
## the test set proportions are
table(tstdata$species)
##
## chicken ostrich parrot sparrow vulture
## 2 1 3 2 2
table(tstdata$species)/sum(table(tstdata$species))
##
## chicken ostrich parrot sparrow vulture
## 0.2 0.1 0.3 0.2 0.2
Chapter-02-R-Tutorial file:///C:/Users/rk215/Documents/R-Tutorial.html#Learning_R_to_do_D...
30 of 31 10/18/2020, 4:30 AM