SlideShare una empresa de Scribd logo
1 de 14
R:


     sesejun@is.ocha.ac.jp
     2009/10/29(        )
•                                                contacts_train.csv


     •
     •                               (setwd
             >       >                        )
"Pred","Young","Myope","Astimatic","Tear"
"P","Y","Y","Y","N"
"P","Y","Y","N","N"
"P","N","Y","Y","N"
"P","N","Y","Y","N"
"N","Y","Y","Y","Y"
"N","Y","Y","N","Y"
"N","N","N","N","Y"
"N","N","N","N","N"
"N","N","N","N","Y"
"N","N","N","N","N"
                                                    contacts.csv
> contacts.train<-read.table("contacts_train.csv", header=T,
sep=",")
> contacts.train
   Pred Young Myope Astimatic Tear
1     P     Y     Y         Y    N
2     P     Y     Y         N    N
3     P     N     Y         Y    N
4     P     N     Y         Y    N
5     N     Y     Y         Y    Y
6     N     Y     Y         N    Y
7     N     N     N         N    Y
8     N     N     N         N    N
9     N     N     N         N    Y
10    N     N     N         N    N
> contacts.train[1,]
    Pred Young Myope Astimatic Tear
 1     P     Y     Y         Y    N
 > contacts.train[,2]
   [1] Y Y N N Y Y N N N N
 Levels: N Y
 > contacts.train[,"Pred"]
   [1] P P P P N N N N N N
 Levels: N P
 > contacts.train$Pred
   [1] P P P P N N N N N N
 Levels: N P



> contacts.train[c(-1,-3,-5,-7,-9),]
   Pred Young Myope Astimatic Tear
2     P     Y     Y         N    N
4     P     N     Y         Y    N
6     N     Y     Y         N    Y
8     N     N     N         N    N
10    N     N     N         N    N
> class(contacts.train)
[1] "data.frame"




> forecast <- data.frame(date=c("10/1","10/2","10/3"),
weather=c("sunny","sunny","rain"))
> forecast
  date weather
1 10/1   sunny
2 10/2   sunny
3 10/3     rain
> forecast$weather
[1] sunny sunny rain
Levels: rain sunny
> forecast$date
[1] 10/1 10/2 10/3
> nrow(contacts.train)
[1] 10
> ncol(contacts.train)
[1] 5
> rownames(contacts.train)
 [1] "1" "2" "3" "4" "5" "6" "7"         "8"   "9"   "10"
> colnames(contacts.train)
[1] "Pred"      "Young"    "Myope"       "Astimatic" "Tear"

> colnames(contacts.train)[2]
[1] "Young"

> colnames(contacts.train)[2] <- "Old"
> colnames(contacts.train)
[1] "Pred"      "Old"       "Myope"      "Astimatic" "Tear"

> colnames(contacts.train)[2] <- "Young"
> contacts.train$Young
  [1] Y Y N N Y Y N N N N
Levels: N Y
> order(contacts.train$Young)
  [1] 3 4 7 8 9 10 1 2 5 6
> contacts.train[order(contacts.train$Young),]
    Pred Young Myope Astimatic Tear
3      P     N     Y         Y    N
4      P     N     Y         Y    N
7      N     N     N         N    Y
8      N     N     N         N    N
9      N     N     N         N    Y
10     N     N     N         N    N
1      P     Y     Y         Y    N
2      P     Y     Y         N    N
5      N     Y     Y         Y    Y
6      N     Y     Y         N    Y
> library("mvpart")
> rpart(Young~., data=contacts.train, method="class")
n= 10

node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 10 4 N (0.6000000 0.4000000)
  2) Myope=N 4 0 N (1.0000000 0.0000000) *
  3) Myope=Y 6 2 Y (0.3333333 0.6666667) *


> rpart(Young~., data=contacts.train, method="class",
control=rpart.control(cp=-1))
n= 10

node), split, n, loss, yval, (yprob)
      * denotes terminal node
1) root 10 4 N (0.6000000 0.4000000)
  2) Myope=N 4 0 N (1.0000000 0.0000000) *
  3) Myope=Y 6 2 Y (0.3333333 0.6666667)
    6) Pred=P 4 2 N (0.5000000 0.5000000) *
    7) Pred=N 2 0 Y (0.0000000 1.0000000) *
IRIS
 •   http://archive.ics.uci.edu/ml/machine-learning-databases/iris/     iris.data


     •               iris.name
     •                                                                (setosa, versicolor, virginica)


 •                          http://togodb.sel.is.ocha.ac.jp/


> iris.train <- read.table("iris_train.csv", sep=",", header=T)
> length(rownames(iris.train))
[1] 120
> length(colnames(iris.train))
[1] 5




> hist(iris.train$Sepal.length)
> hist(iris.train$Petal.length)
> library("mvpart")
> rpart(Class~., data=iris.train, method="class",
control=rpart.control(cp=.1))
n= 120

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 120 77 Iris-setosa (0.35833333 0.34166667 0.30000000)
  2) Petal.length< 2.45 43 0 Iris-setosa (1.00000000 0.00000000
0.00000000) *
  3) Petal.length>=2.45 77 36 Iris-versicolor (0.00000000 0.53246753
0.46753247)
    6) Petal.length< 4.75 37 1 Iris-versicolor (0.00000000 0.97297297
0.02702703) *
    7) Petal.length>=4.75 40 5 Iris-virginica (0.00000000 0.12500000
0.87500000) *
> iris.dtree<-rpart(Class~., data=iris.train, method="class",
control=rpart.control(cp=.1))
> plot.new()
> plot(iris.dtree,uniform=T,margin=0.5)
> text(iris.dtree,use.n=T,all.leaves=F)
> plot(iris.train$Petal.length, iris.train$Petal.width, pch =
c(1,2,3)[unclass(iris.train$Class)])
> iris.test <- read.table("iris_test.csv", sep=",", header=T)


> iris.predict <- predict(iris.dtree, iris.test[1:4], type="class")
> iris.predict
               2              4              18              34
    Iris-setosa     Iris-setosa     Iris-setosa     Iris-setosa
...

> iris.predict ==   iris.test$Class
 [1] TRUE TRUE      TRUE TRUE TRUE    TRUE   TRUE   TRUE FALSE   TRUE
[11] TRUE TRUE      TRUE TRUE TRUE    TRUE   TRUE   TRUE TRUE    TRUE
[21] TRUE TRUE      TRUE TRUE TRUE    TRUE   TRUE   TRUE TRUE    TRUE

> sum(iris.predict == iris.test$Class) / length(iris.test$Class)
[1] 0.9666667
> sum(iris.predict != iris.test$Class) / length(iris.test$Class)
[1] 0.03333333
•
    •
        •
        •
        •   rpart       control=rpart.control(cp=.1)   .1


    •                                                       10


    •               3                2                      3

Más contenido relacionado

Similar a Datamining R 2nd

ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-
ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-
ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-ssusere0a682
 
Oceans 2019 tutorial-geophysical-nav_7-updated
Oceans 2019 tutorial-geophysical-nav_7-updatedOceans 2019 tutorial-geophysical-nav_7-updated
Oceans 2019 tutorial-geophysical-nav_7-updatedFrancisco Curado-Teixeira
 
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」Ken'ichi Matsui
 
機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明Satoshi Hara
 
Data Science for Folks Without (or With!) a Ph.D.
Data Science for Folks Without (or With!) a Ph.D.Data Science for Folks Without (or With!) a Ph.D.
Data Science for Folks Without (or With!) a Ph.D.Douglas Starnes
 
การสุ่มตัวอย่างในงานวิจัยสาธารณสุข
การสุ่มตัวอย่างในงานวิจัยสาธารณสุขการสุ่มตัวอย่างในงานวิจัยสาธารณสุข
การสุ่มตัวอย่างในงานวิจัยสาธารณสุขUltraman Taro
 
Introduction to Perl Best Practices
Introduction to Perl Best PracticesIntroduction to Perl Best Practices
Introduction to Perl Best PracticesJosé Castro
 
IIT-JEE Mains 2016 Online Previous Question Paper Day 1
IIT-JEE Mains 2016 Online Previous Question Paper Day 1IIT-JEE Mains 2016 Online Previous Question Paper Day 1
IIT-JEE Mains 2016 Online Previous Question Paper Day 1Eneutron
 
Data Manipulation Using R (& dplyr)
Data Manipulation Using R (& dplyr)Data Manipulation Using R (& dplyr)
Data Manipulation Using R (& dplyr)Ram Narasimhan
 
A note on estimation of population mean in sample survey using auxiliary info...
A note on estimation of population mean in sample survey using auxiliary info...A note on estimation of population mean in sample survey using auxiliary info...
A note on estimation of population mean in sample survey using auxiliary info...Alexander Decker
 
Datamining R 4th
Datamining R 4thDatamining R 4th
Datamining R 4thsesejun
 
Intoduction to numpy
Intoduction to numpyIntoduction to numpy
Intoduction to numpyFaraz Ahmed
 
Jamieson_Jain2018
Jamieson_Jain2018Jamieson_Jain2018
Jamieson_Jain2018Masa Kato
 
Regression and Classification with R
Regression and Classification with RRegression and Classification with R
Regression and Classification with RYanchang Zhao
 
Datamining r 1st
Datamining r 1stDatamining r 1st
Datamining r 1stsesejun
 
Ai2418281871
Ai2418281871Ai2418281871
Ai2418281871IJMER
 
Eカードをゲーム理論で分析
Eカードをゲーム理論で分析Eカードをゲーム理論で分析
Eカードをゲーム理論で分析ssusere0a682
 
PRE: Datamining 2nd R
PRE: Datamining 2nd RPRE: Datamining 2nd R
PRE: Datamining 2nd Rsesejun
 

Similar a Datamining R 2nd (20)

ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-
ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-
ゲーム理論 BASIC 演習83 -アナウンスは効果あるか-
 
Oceans 2019 tutorial-geophysical-nav_7-updated
Oceans 2019 tutorial-geophysical-nav_7-updatedOceans 2019 tutorial-geophysical-nav_7-updated
Oceans 2019 tutorial-geophysical-nav_7-updated
 
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
第13回数学カフェ「素数!!」二次会 LT資料「乱数!!」
 
機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明
 
Data Science for Folks Without (or With!) a Ph.D.
Data Science for Folks Without (or With!) a Ph.D.Data Science for Folks Without (or With!) a Ph.D.
Data Science for Folks Without (or With!) a Ph.D.
 
Simpatía
SimpatíaSimpatía
Simpatía
 
การสุ่มตัวอย่างในงานวิจัยสาธารณสุข
การสุ่มตัวอย่างในงานวิจัยสาธารณสุขการสุ่มตัวอย่างในงานวิจัยสาธารณสุข
การสุ่มตัวอย่างในงานวิจัยสาธารณสุข
 
Introduction to Perl Best Practices
Introduction to Perl Best PracticesIntroduction to Perl Best Practices
Introduction to Perl Best Practices
 
IIT-JEE Mains 2016 Online Previous Question Paper Day 1
IIT-JEE Mains 2016 Online Previous Question Paper Day 1IIT-JEE Mains 2016 Online Previous Question Paper Day 1
IIT-JEE Mains 2016 Online Previous Question Paper Day 1
 
Data Manipulation Using R (& dplyr)
Data Manipulation Using R (& dplyr)Data Manipulation Using R (& dplyr)
Data Manipulation Using R (& dplyr)
 
A note on estimation of population mean in sample survey using auxiliary info...
A note on estimation of population mean in sample survey using auxiliary info...A note on estimation of population mean in sample survey using auxiliary info...
A note on estimation of population mean in sample survey using auxiliary info...
 
Datamining R 4th
Datamining R 4thDatamining R 4th
Datamining R 4th
 
Intoduction to numpy
Intoduction to numpyIntoduction to numpy
Intoduction to numpy
 
Jamieson_Jain2018
Jamieson_Jain2018Jamieson_Jain2018
Jamieson_Jain2018
 
Regression and Classification with R
Regression and Classification with RRegression and Classification with R
Regression and Classification with R
 
Slides ensae-2016-11
Slides ensae-2016-11Slides ensae-2016-11
Slides ensae-2016-11
 
Datamining r 1st
Datamining r 1stDatamining r 1st
Datamining r 1st
 
Ai2418281871
Ai2418281871Ai2418281871
Ai2418281871
 
Eカードをゲーム理論で分析
Eカードをゲーム理論で分析Eカードをゲーム理論で分析
Eカードをゲーム理論で分析
 
PRE: Datamining 2nd R
PRE: Datamining 2nd RPRE: Datamining 2nd R
PRE: Datamining 2nd R
 

Más de sesejun

RNAseqによる変動遺伝子抽出の統計: A Review
RNAseqによる変動遺伝子抽出の統計: A ReviewRNAseqによる変動遺伝子抽出の統計: A Review
RNAseqによる変動遺伝子抽出の統計: A Reviewsesejun
 
バイオインフォマティクスによる遺伝子発現解析
バイオインフォマティクスによる遺伝子発現解析バイオインフォマティクスによる遺伝子発現解析
バイオインフォマティクスによる遺伝子発現解析sesejun
 
次世代シーケンサが求める機械学習
次世代シーケンサが求める機械学習次世代シーケンサが求める機械学習
次世代シーケンサが求める機械学習sesejun
 
20110602labseminar pub
20110602labseminar pub20110602labseminar pub
20110602labseminar pubsesejun
 
20110524zurichngs 2nd pub
20110524zurichngs 2nd pub20110524zurichngs 2nd pub
20110524zurichngs 2nd pubsesejun
 
20110524zurichngs 1st pub
20110524zurichngs 1st pub20110524zurichngs 1st pub
20110524zurichngs 1st pubsesejun
 
20110214nips2010 read
20110214nips2010 read20110214nips2010 read
20110214nips2010 readsesejun
 
Datamining 9th association_rule.key
Datamining 9th association_rule.keyDatamining 9th association_rule.key
Datamining 9th association_rule.keysesejun
 
Datamining 8th hclustering
Datamining 8th hclusteringDatamining 8th hclustering
Datamining 8th hclusteringsesejun
 
Datamining r 4th
Datamining r 4thDatamining r 4th
Datamining r 4thsesejun
 
Datamining r 3rd
Datamining r 3rdDatamining r 3rd
Datamining r 3rdsesejun
 
Datamining 6th svm
Datamining 6th svmDatamining 6th svm
Datamining 6th svmsesejun
 
Datamining 5th knn
Datamining 5th knnDatamining 5th knn
Datamining 5th knnsesejun
 
Datamining 4th adaboost
Datamining 4th adaboostDatamining 4th adaboost
Datamining 4th adaboostsesejun
 
Datamining 3rd naivebayes
Datamining 3rd naivebayesDatamining 3rd naivebayes
Datamining 3rd naivebayessesejun
 
Datamining 2nd decisiontree
Datamining 2nd decisiontreeDatamining 2nd decisiontree
Datamining 2nd decisiontreesesejun
 
Datamining 7th kmeans
Datamining 7th kmeansDatamining 7th kmeans
Datamining 7th kmeanssesejun
 
100401 Bioinfoinfra
100401 Bioinfoinfra100401 Bioinfoinfra
100401 Bioinfoinfrasesejun
 
Datamining 8th Hclustering
Datamining 8th HclusteringDatamining 8th Hclustering
Datamining 8th Hclusteringsesejun
 
Datamining 9th Association Rule
Datamining 9th Association RuleDatamining 9th Association Rule
Datamining 9th Association Rulesesejun
 

Más de sesejun (20)

RNAseqによる変動遺伝子抽出の統計: A Review
RNAseqによる変動遺伝子抽出の統計: A ReviewRNAseqによる変動遺伝子抽出の統計: A Review
RNAseqによる変動遺伝子抽出の統計: A Review
 
バイオインフォマティクスによる遺伝子発現解析
バイオインフォマティクスによる遺伝子発現解析バイオインフォマティクスによる遺伝子発現解析
バイオインフォマティクスによる遺伝子発現解析
 
次世代シーケンサが求める機械学習
次世代シーケンサが求める機械学習次世代シーケンサが求める機械学習
次世代シーケンサが求める機械学習
 
20110602labseminar pub
20110602labseminar pub20110602labseminar pub
20110602labseminar pub
 
20110524zurichngs 2nd pub
20110524zurichngs 2nd pub20110524zurichngs 2nd pub
20110524zurichngs 2nd pub
 
20110524zurichngs 1st pub
20110524zurichngs 1st pub20110524zurichngs 1st pub
20110524zurichngs 1st pub
 
20110214nips2010 read
20110214nips2010 read20110214nips2010 read
20110214nips2010 read
 
Datamining 9th association_rule.key
Datamining 9th association_rule.keyDatamining 9th association_rule.key
Datamining 9th association_rule.key
 
Datamining 8th hclustering
Datamining 8th hclusteringDatamining 8th hclustering
Datamining 8th hclustering
 
Datamining r 4th
Datamining r 4thDatamining r 4th
Datamining r 4th
 
Datamining r 3rd
Datamining r 3rdDatamining r 3rd
Datamining r 3rd
 
Datamining 6th svm
Datamining 6th svmDatamining 6th svm
Datamining 6th svm
 
Datamining 5th knn
Datamining 5th knnDatamining 5th knn
Datamining 5th knn
 
Datamining 4th adaboost
Datamining 4th adaboostDatamining 4th adaboost
Datamining 4th adaboost
 
Datamining 3rd naivebayes
Datamining 3rd naivebayesDatamining 3rd naivebayes
Datamining 3rd naivebayes
 
Datamining 2nd decisiontree
Datamining 2nd decisiontreeDatamining 2nd decisiontree
Datamining 2nd decisiontree
 
Datamining 7th kmeans
Datamining 7th kmeansDatamining 7th kmeans
Datamining 7th kmeans
 
100401 Bioinfoinfra
100401 Bioinfoinfra100401 Bioinfoinfra
100401 Bioinfoinfra
 
Datamining 8th Hclustering
Datamining 8th HclusteringDatamining 8th Hclustering
Datamining 8th Hclustering
 
Datamining 9th Association Rule
Datamining 9th Association RuleDatamining 9th Association Rule
Datamining 9th Association Rule
 

Datamining R 2nd

  • 1. R: sesejun@is.ocha.ac.jp 2009/10/29( )
  • 2. contacts_train.csv • • (setwd > > ) "Pred","Young","Myope","Astimatic","Tear" "P","Y","Y","Y","N" "P","Y","Y","N","N" "P","N","Y","Y","N" "P","N","Y","Y","N" "N","Y","Y","Y","Y" "N","Y","Y","N","Y" "N","N","N","N","Y" "N","N","N","N","N" "N","N","N","N","Y" "N","N","N","N","N" contacts.csv
  • 3. > contacts.train<-read.table("contacts_train.csv", header=T, sep=",") > contacts.train Pred Young Myope Astimatic Tear 1 P Y Y Y N 2 P Y Y N N 3 P N Y Y N 4 P N Y Y N 5 N Y Y Y Y 6 N Y Y N Y 7 N N N N Y 8 N N N N N 9 N N N N Y 10 N N N N N
  • 4. > contacts.train[1,] Pred Young Myope Astimatic Tear 1 P Y Y Y N > contacts.train[,2] [1] Y Y N N Y Y N N N N Levels: N Y > contacts.train[,"Pred"] [1] P P P P N N N N N N Levels: N P > contacts.train$Pred [1] P P P P N N N N N N Levels: N P > contacts.train[c(-1,-3,-5,-7,-9),] Pred Young Myope Astimatic Tear 2 P Y Y N N 4 P N Y Y N 6 N Y Y N Y 8 N N N N N 10 N N N N N
  • 5. > class(contacts.train) [1] "data.frame" > forecast <- data.frame(date=c("10/1","10/2","10/3"), weather=c("sunny","sunny","rain")) > forecast date weather 1 10/1 sunny 2 10/2 sunny 3 10/3 rain > forecast$weather [1] sunny sunny rain Levels: rain sunny > forecast$date [1] 10/1 10/2 10/3
  • 6. > nrow(contacts.train) [1] 10 > ncol(contacts.train) [1] 5 > rownames(contacts.train) [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" > colnames(contacts.train) [1] "Pred" "Young" "Myope" "Astimatic" "Tear" > colnames(contacts.train)[2] [1] "Young" > colnames(contacts.train)[2] <- "Old" > colnames(contacts.train) [1] "Pred" "Old" "Myope" "Astimatic" "Tear" > colnames(contacts.train)[2] <- "Young"
  • 7. > contacts.train$Young [1] Y Y N N Y Y N N N N Levels: N Y > order(contacts.train$Young) [1] 3 4 7 8 9 10 1 2 5 6 > contacts.train[order(contacts.train$Young),] Pred Young Myope Astimatic Tear 3 P N Y Y N 4 P N Y Y N 7 N N N N Y 8 N N N N N 9 N N N N Y 10 N N N N N 1 P Y Y Y N 2 P Y Y N N 5 N Y Y Y Y 6 N Y Y N Y
  • 8. > library("mvpart") > rpart(Young~., data=contacts.train, method="class") n= 10 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 10 4 N (0.6000000 0.4000000) 2) Myope=N 4 0 N (1.0000000 0.0000000) * 3) Myope=Y 6 2 Y (0.3333333 0.6666667) * > rpart(Young~., data=contacts.train, method="class", control=rpart.control(cp=-1)) n= 10 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 10 4 N (0.6000000 0.4000000) 2) Myope=N 4 0 N (1.0000000 0.0000000) * 3) Myope=Y 6 2 Y (0.3333333 0.6666667) 6) Pred=P 4 2 N (0.5000000 0.5000000) * 7) Pred=N 2 0 Y (0.0000000 1.0000000) *
  • 9. IRIS • http://archive.ics.uci.edu/ml/machine-learning-databases/iris/ iris.data • iris.name • (setosa, versicolor, virginica) • http://togodb.sel.is.ocha.ac.jp/ > iris.train <- read.table("iris_train.csv", sep=",", header=T) > length(rownames(iris.train)) [1] 120 > length(colnames(iris.train)) [1] 5 > hist(iris.train$Sepal.length) > hist(iris.train$Petal.length)
  • 10. > library("mvpart") > rpart(Class~., data=iris.train, method="class", control=rpart.control(cp=.1)) n= 120 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 120 77 Iris-setosa (0.35833333 0.34166667 0.30000000) 2) Petal.length< 2.45 43 0 Iris-setosa (1.00000000 0.00000000 0.00000000) * 3) Petal.length>=2.45 77 36 Iris-versicolor (0.00000000 0.53246753 0.46753247) 6) Petal.length< 4.75 37 1 Iris-versicolor (0.00000000 0.97297297 0.02702703) * 7) Petal.length>=4.75 40 5 Iris-virginica (0.00000000 0.12500000 0.87500000) *
  • 11. > iris.dtree<-rpart(Class~., data=iris.train, method="class", control=rpart.control(cp=.1)) > plot.new() > plot(iris.dtree,uniform=T,margin=0.5) > text(iris.dtree,use.n=T,all.leaves=F)
  • 12. > plot(iris.train$Petal.length, iris.train$Petal.width, pch = c(1,2,3)[unclass(iris.train$Class)])
  • 13. > iris.test <- read.table("iris_test.csv", sep=",", header=T) > iris.predict <- predict(iris.dtree, iris.test[1:4], type="class") > iris.predict 2 4 18 34 Iris-setosa Iris-setosa Iris-setosa Iris-setosa ... > iris.predict == iris.test$Class [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE [11] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE [21] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE > sum(iris.predict == iris.test$Class) / length(iris.test$Class) [1] 0.9666667 > sum(iris.predict != iris.test$Class) / length(iris.test$Class) [1] 0.03333333
  • 14. • • • • rpart control=rpart.control(cp=.1) .1 • 10 • 3 2 3