Togaware DATA MINING
Desktop Survival Guide
by Graham Williams
Google

Predicting Wine Type

A traditional decision tree can be built from the Wine data (described elsewhere in this guide) using the rpart (recursive partitioning) function from the rpart package. Also see the mvpart function in the mvpart package.



# Build a classification tree that predicts wine Type from all of the
# other variables, then draw the tree with labelled nodes.
library("rpart")

# Presumably restores the `wine` data frame used below -- confirm that
# wine.Rdata is in the working directory.
load("wine.Rdata")

wine.rpart <- rpart(formula = Type ~ ., data = wine)

# Allow drawing outside the plot region (so the node labels are not
# clipped) and use narrow margins on all four sides.
par(xpd = TRUE, mar = rep(1.1, times = 4))

plot(wine.rpart)
# use.n = TRUE adds the observation counts to the node labels.
text(wine.rpart, use.n = TRUE)

http://rattle.togaware.com/code/rplot-rpart.R



> wine.rpart
n= 178

node), split, n, loss, yval, (yprob)
      * denotes terminal node

 1) root 178 107 2 (0.33146067 0.39887640 0.26966292)
   2) Proline>=755 67  10 1 (0.85074627 0.05970149 0.08955224)
     4) Flavanoids>=2.165 59   2 1 (0.96610169 0.03389831 0.00000000) *
     5) Flavanoids< 2.165 8   2 3 (0.00000000 0.25000000 0.75000000) *
   3) Proline< 755 111  44 2 (0.01801802 0.60360360 0.37837838)
     6) Dilution>=2.115 65   4 2 (0.03076923 0.93846154 0.03076923) *
     7) Dilution< 2.115 46   6 3 (0.00000000 0.13043478 0.86956522)
      14) Hue>=0.9 7   2 2 (0.00000000 0.71428571 0.28571429) *
      15) Hue< 0.9 39   1 3 (0.00000000 0.02564103 0.97435897) *

The tree is displayed with the plot function.

You can even browse the plot with:

> path.rpart(wine.rpart)

Click on a node in the tree to display the path to that node. Exit with the right mouse button.

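Use printcp to view the performance of the model. It displays the complexity parameter (CP) table: for each number of splits it lists the error on the training data relative to the root node (rel error) and the cross-validated error (xerror) together with its standard error (xstd).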

> printcp(wine.rpart)

Classification tree:
rpart(formula = Type ~ ., data = wine)

Variables actually used in tree construction:
[1] Dilution   Flavanoids Hue        Proline

Root node error: 107/178 = 0.60112

n= 178

        CP nsplit rel error  xerror     xstd
1 0.495327      0   1.00000 1.00000 0.061056
2 0.317757      1   0.50467 0.47664 0.056376
3 0.056075      2   0.18692 0.28037 0.046676
4 0.028037      3   0.13084 0.23364 0.043323
5 0.010000      4   0.10280 0.21495 0.041825



> formula(wine.rpart)
Type ~ Alcohol + Malic + Ash + Alcalinity + Magnesium + Phenols +
    Flavanoids + Nonflavanoids + Proanthocyanins + Color + Hue +
    Dilution + Proline
attr(,"variables")
list(Type, Alcohol, Malic, Ash, Alcalinity, Magnesium, Phenols,
    Flavanoids, Nonflavanoids, Proanthocyanins, Color, Hue, Dilution,
    Proline)
attr(,"factors")
                Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids
Type                  0     0   0          0         0       0          0
Alcohol               1     0   0          0         0       0          0
Malic                 0     1   0          0         0       0          0
Ash                   0     0   1          0         0       0          0
Alcalinity            0     0   0          1         0       0          0
Magnesium             0     0   0          0         1       0          0
Phenols               0     0   0          0         0       1          0
Flavanoids            0     0   0          0         0       0          1
Nonflavanoids         0     0   0          0         0       0          0
Proanthocyanins       0     0   0          0         0       0          0
Color                 0     0   0          0         0       0          0
Hue                   0     0   0          0         0       0          0
Dilution              0     0   0          0         0       0          0
Proline               0     0   0          0         0       0          0
                Nonflavanoids Proanthocyanins Color Hue Dilution Proline
Type                        0               0     0   0        0       0
Alcohol                     0               0     0   0        0       0
Malic                       0               0     0   0        0       0
Ash                         0               0     0   0        0       0
Alcalinity                  0               0     0   0        0       0
Magnesium                   0               0     0   0        0       0
Phenols                     0               0     0   0        0       0
Flavanoids                  0               0     0   0        0       0
Nonflavanoids               1               0     0   0        0       0
Proanthocyanins             0               1     0   0        0       0
Color                       0               0     1   0        0       0
Hue                         0               0     0   1        0       0
Dilution                    0               0     0   0        1       0
Proline                     0               0     0   0        0       1
attr(,"term.labels")
 [1] "Alcohol"         "Malic"           "Ash"             "Alcalinity"
 [5] "Magnesium"       "Phenols"         "Flavanoids"      "Nonflavanoids"
 [9] "Proanthocyanins" "Color"           "Hue"             "Dilution"
[13] "Proline"
attr(,"order")
 [1] 1 1 1 1 1 1 1 1 1 1 1 1 1
attr(,"intercept")
[1] 1
attr(,"response")
[1] 1
attr(,"predvars")
list(Type, Alcohol, Malic, Ash, Alcalinity, Magnesium, Phenols,
    Flavanoids, Nonflavanoids, Proanthocyanins, Color, Hue, Dilution,
    Proline)
attr(,"dataClasses")
           Type         Alcohol           Malic             Ash      Alcalinity
       "factor"       "numeric"       "numeric"       "numeric"       "numeric"
      Magnesium         Phenols      Flavanoids   Nonflavanoids Proanthocyanins
      "numeric"       "numeric"       "numeric"       "numeric"       "numeric"
          Color             Hue        Dilution         Proline
      "numeric"       "numeric"       "numeric"       "numeric"

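The where component of the fitted object records which leaf (terminal node) each observation in the training dataset ends up in. Note that the values are row numbers of wine.rpart$frame, not the node numbers shown in the printed tree.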

> wine.rpart$where
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
  3   3   3   3   6   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3

 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40
  3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3

[...]

161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
  9   8   9   9   9   9   9   9   9   9   9   9   9   9   9   4   4   9

The predict function applies the model to data. The data must contain all of the variables that appear in the formula from which the model was built; if not, an error is generated. This is a common problem when you want to apply the model to a new dataset that does not contain all of the original variables but does contain the variables you are interested in.

> cols <- c("Type", "Dilution", "Flavanoids", "Hue", "Proline")
> predict(wine.rpart, wine[,cols])
Error in eval(expr, envir, enclos) : Object "Alcohol" not found

Fix this by rebuilding the model using only the variables that are available in the data:

> wine.rpart <- rpart(Type ~ Dilution + Flavanoids + Hue + Proline, data=wine)
> predict(wine.rpart, wine[,cols])
             1          2          3
1   0.96610169 0.03389831 0.00000000
2   0.96610169 0.03389831 0.00000000
[...]
70  0.03076923 0.93846154 0.03076923
71  0.00000000 0.25000000 0.75000000
[...]
177 0.00000000 0.25000000 0.75000000
178 0.00000000 0.02564103 0.97435897

Display a confusion matrix (rows are the predicted classes, columns are the actual classes).

> table(predict(wine.rpart, wine, type="class"), wine$Type)

     1  2  3
  1 57  2  0
  2  2 66  4
  3  0  3 44

Copyright © 2004-2006 Graham.Williams@togaware.com
Support further development through the purchase of the PDF version of the book.
Brought to you by Togaware.