Saturday, March 29, 2014

Classification, Statistical Learning within R

Logistic regression

This text contains:

  • Logistic regression
  • Linear Discriminant Analysis
  • K-Nearest Neighbors
# Load the ISLR package for the Smarket data (daily S&P 500 returns,
# 2001-2005). library() stops with an error if the package is missing,
# whereas require() would only return FALSE with a warning.
library(ISLR)
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
summary(Smarket)
##       Year           Lag1             Lag2             Lag3       
##  Min.   :2001   Min.   :-4.922   Min.   :-4.922   Min.   :-4.922  
##  1st Qu.:2002   1st Qu.:-0.640   1st Qu.:-0.640   1st Qu.:-0.640  
##  Median :2003   Median : 0.039   Median : 0.039   Median : 0.038  
##  Mean   :2003   Mean   : 0.004   Mean   : 0.004   Mean   : 0.002  
##  3rd Qu.:2004   3rd Qu.: 0.597   3rd Qu.: 0.597   3rd Qu.: 0.597  
##  Max.   :2005   Max.   : 5.733   Max.   : 5.733   Max.   : 5.733  
##       Lag4             Lag5            Volume          Today       
##  Min.   :-4.922   Min.   :-4.922   Min.   :0.356   Min.   :-4.922  
##  1st Qu.:-0.640   1st Qu.:-0.640   1st Qu.:1.257   1st Qu.:-0.640  
##  Median : 0.038   Median : 0.038   Median :1.423   Median : 0.038  
##  Mean   : 0.002   Mean   : 0.006   Mean   :1.478   Mean   : 0.003  
##  3rd Qu.: 0.597   3rd Qu.: 0.597   3rd Qu.:1.642   3rd Qu.: 0.597  
##  Max.   : 5.733   Max.   : 5.733   Max.   :3.152   Max.   : 5.733  
##  Direction 
##  Down:602  
##  Up  :648  
# ?Smarket
# Scatterplot matrix of all variables, points colored by Direction
# (Down/Up). The factor is coerced to integer color codes by pairs().
pairs(Smarket, col = Smarket$Direction)

plot of chunk unnamed-chunk-1

Logistic regression

# Logistic regression of market Direction on the five lagged returns and
# trading Volume, fit on ALL of Smarket (no train/test split yet).
# family = binomial gives the logit link.
glm.fit = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket, 
    family = binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -1.45   -1.20    1.07    1.15    1.33  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12600    0.24074   -0.52     0.60
## Lag1        -0.07307    0.05017   -1.46     0.15
## Lag2        -0.04230    0.05009   -0.84     0.40
## Lag3         0.01109    0.04994    0.22     0.82
## Lag4         0.00936    0.04997    0.19     0.85
## Lag5         0.01031    0.04951    0.21     0.83
## Volume       0.13544    0.15836    0.86     0.39
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
## Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1742
## 
## Number of Fisher Scoring iterations: 3
# Fitted probabilities of the "Up" class on the training observations
glm.probs = predict(glm.fit, type = "response")
glm.probs[1:5]
##      1      2      3      4      5 
## 0.5071 0.4815 0.4811 0.5152 0.5108
# Use the training data as the test set (gives an optimistic accuracy
# estimate); threshold the probabilities at 0.5.
glm.pred = ifelse(glm.probs > 0.5, "Up", "Down")
# NOTE(review): attach() is generally discouraged; it is used here so
# that Direction (and later Year, Lag1, Lag2) resolve without Smarket$.
attach(Smarket)
# Confusion matrix: predicted vs actual direction
table(glm.pred, Direction)
##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507
# Training-set accuracy — barely better than chance
mean(glm.pred == Direction)
## [1] 0.5216
# Make training and test set: train on pre-2005 observations, hold out
# the year 2005 for testing. Year is visible via the earlier
# attach(Smarket).
train = Year < 2005
# Refit the full logistic model on the training subset only
glm.fit = glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume, data = Smarket, 
    family = binomial, subset = train)
# Predicted "Up" probabilities on the held-out 2005 rows
glm.probs = predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred = ifelse(glm.probs > 0.5, "Up", "Down")
# Actual directions for the 2005 test set
Direction.2005 = Smarket$Direction[!train]
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
# Out-of-sample accuracy — worse than chance with the full model
mean(glm.pred == Direction.2005)
## [1] 0.4802
# Fit smaller model: drop the weak predictors, keep only Lag1 + Lag2
glm.fit = glm(Direction ~ Lag1 + Lag2, data = Smarket, family = binomial, subset = train)
glm.probs = predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred = ifelse(glm.probs > 0.5, "Up", "Down")
table(glm.pred, Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
# Test accuracy of the smaller model — modestly above chance
mean(glm.pred == Direction.2005)
## [1] 0.5595
# Precision when the model predicts "Up": 106 correct of 182 predictions
106/(76 + 106)
## [1] 0.5824

Linear Discriminant Analysis

# MASS provides lda(). library() stops with an error if the package is
# missing, whereas require() would only return FALSE with a warning.
library(MASS)
# Linear Discriminant Analysis of Direction on the two most recent lags,
# trained on pre-2005 observations (Year < 2005 is evaluated within
# Smarket via the subset argument).
lda.fit = lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = Year < 2005)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = Year < 
##     2005)
## 
## Prior probabilities of groups:
##  Down    Up 
## 0.492 0.508 
## 
## Group means:
##          Lag1     Lag2
## Down  0.04279  0.03389
## Up   -0.03955 -0.03133
## 
## Coefficients of linear discriminants:
##          LD1
## Lag1 -0.6420
## Lag2 -0.5135
# Histograms of the linear discriminant values for each class
plot(lda.fit)

plot of chunk unnamed-chunk-3

# Hold-out set: all observations from 2005
Smarket.2005 = subset(Smarket, Year == 2005)
# predict() on an lda fit returns a list with $class (predicted labels),
# $posterior (class probabilities), and $x (discriminant scores)
lda.pred = predict(lda.fit, Smarket.2005)
class(lda.pred)
## [1] "list"
# First five test predictions, flattened into a data frame for display
data.frame(lda.pred)[1:5, ]
##      class posterior.Down posterior.Up      LD1
## 999     Up         0.4902       0.5098  0.08293
## 1000    Up         0.4792       0.5208  0.59114
## 1001    Up         0.4668       0.5332  1.16723
## 1002    Up         0.4740       0.5260  0.83335
## 1003    Up         0.4928       0.5072 -0.03793
# Confusion matrix on the 2005 test set
table(lda.pred$class, Smarket.2005$Direction)
##       
##        Down  Up
##   Down   35  35
##   Up     76 106
# Test accuracy — identical to the Lag1 + Lag2 logistic regression above
mean(lda.pred$class == Smarket.2005$Direction)
## [1] 0.5595

K-Nearest Neighbors

# The class package provides knn()
library(class)
# ?knn
# NOTE(review): attach() is generally discouraged; used here so Lag1,
# Lag2, Year, and Direction resolve without the Smarket$ prefix.
attach(Smarket)
# Predictor matrix: the two lagged returns
Xlag = cbind(Lag1, Lag2)
# Logical index for the pre-2005 training observations
train = Year < 2005
# k-nearest neighbors with k = 10: train on pre-2005, classify 2005
knn.pred = knn(Xlag[train, ], Xlag[!train, ], Direction[train], k = 10)
# Confusion matrix on the 2005 test set
table(knn.pred, Direction[!train])
##         
## knn.pred Down Up
##     Down   51 62
##     Up     60 79
# Test accuracy — close to chance for this k
mean(knn.pred == Direction[!train])
## [1] 0.5159

Credit

Please note, this material is extracted from the online Statistical Learning course at Stanford University by Prof. T. Hastie and Prof. R. Tibshirani. It is intended only as a quick future reference for R and statistical learning. Please visit the course page for more information and materials.


No comments: