This document covers the following classification methods:
- Logistic regression
- Linear Discriminant Analysis
- K-Nearest Neighbors
# Load the ISLR package for the Smarket data: S&P 500 daily percentage
# returns for 2001-2005, with five lagged returns, trading Volume, Today's
# return, and the Direction (Up/Down) of the market.
# library() errors if the package is missing; require() only returns FALSE.
library(ISLR)
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922 Min. :-4.922 Min. :-4.922
## 1st Qu.:2002 1st Qu.:-0.640 1st Qu.:-0.640 1st Qu.:-0.640
## Median :2003 Median : 0.039 Median : 0.039 Median : 0.038
## Mean :2003 Mean : 0.004 Mean : 0.004 Mean : 0.002
## 3rd Qu.:2004 3rd Qu.: 0.597 3rd Qu.: 0.597 3rd Qu.: 0.597
## Max. :2005 Max. : 5.733 Max. : 5.733 Max. : 5.733
## Lag4 Lag5 Volume Today
## Min. :-4.922 Min. :-4.922 Min. :0.356 Min. :-4.922
## 1st Qu.:-0.640 1st Qu.:-0.640 1st Qu.:1.257 1st Qu.:-0.640
## Median : 0.038 Median : 0.038 Median :1.423 Median : 0.038
## Mean : 0.002 Mean : 0.006 Mean :1.478 Mean : 0.003
## 3rd Qu.: 0.597 3rd Qu.: 0.597 3rd Qu.:1.642 3rd Qu.: 0.597
## Max. : 5.733 Max. : 5.733 Max. :3.152 Max. : 5.733
## Direction
## Down:602
## Up :648
# ?Smarket
# Scatterplot matrix of all pairs of variables, points coloured by Direction.
pairs(Smarket, col = Smarket$Direction)
Logistic regression
# Logistic regression of Direction on the five lagged returns and Volume,
# fitted to all 1250 observations.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.45 -1.20 1.07 1.15 1.33
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12600 0.24074 -0.52 0.60
## Lag1 -0.07307 0.05017 -1.46 0.15
## Lag2 -0.04230 0.05009 -0.84 0.40
## Lag3 0.01109 0.04994 0.22 0.82
## Lag4 0.00936 0.04997 0.19 0.85
## Lag5 0.01031 0.04951 0.21 0.83
## Volume 0.13544 0.15836 0.86 0.39
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1742
##
## Number of Fisher Scoring iterations: 3
# Fitted probabilities that the market goes Up, on the training data.
glm.probs <- predict(glm.fit, type = "response")
glm.probs[1:5]
## 1 2 3 4 5
## 0.5071 0.4815 0.4811 0.5152 0.5108
# Use the training data as the test set (gives the training error rate).
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
# Explicit column reference instead of attach(), which pollutes the
# search path; the printed table headers are unchanged.
Direction <- Smarket$Direction
table(glm.pred, Direction)
## Direction
## glm.pred Down Up
## Down 145 141
## Up 457 507
mean(glm.pred == Direction)
## [1] 0.5216
# Make training (years 2001-2004) and test (year 2005) sets.
train <- Smarket$Year < 2005
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial, subset = train)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
Direction.2005 <- Smarket$Direction[!train]
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
mean(glm.pred == Direction.2005)
## [1] 0.4802
# Fit smaller model with only the two most recent lags.
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Smarket, family = binomial,
               subset = train)
glm.probs <- predict(glm.fit, newdata = Smarket[!train, ], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "Up", "Down")
table(glm.pred, Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
mean(glm.pred == Direction.2005)
## [1] 0.5595
# Precision on "Up" calls: correct Up predictions / all Up predictions.
106/(76 + 106)
## [1] 0.5824
Linear Discriminant Analysis
# Linear discriminant analysis on the two most recent lags.
# library() errors if MASS is missing; require() only returns FALSE.
library(MASS)
# subset is evaluated inside the data frame by the formula interface,
# so Year resolves to Smarket$Year; the fit uses the 2001-2004 years.
lda.fit <- lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = Year < 2005)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = Year <
## 2005)
##
## Prior probabilities of groups:
## Down Up
## 0.492 0.508
##
## Group means:
## Lag1 Lag2
## Down 0.04279 0.03389
## Up -0.03955 -0.03133
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420
## Lag2 -0.5135
plot(lda.fit)
# Hold-out set: the 2005 observations.
Smarket.2005 <- subset(Smarket, Year == 2005)
lda.pred <- predict(lda.fit, Smarket.2005)
# predict() for lda returns a list: class labels, posterior
# probabilities per group, and the linear discriminant scores.
class(lda.pred)
## [1] "list"
data.frame(lda.pred)[1:5, ]
## class posterior.Down posterior.Up LD1
## 999 Up 0.4902 0.5098 0.08293
## 1000 Up 0.4792 0.5208 0.59114
## 1001 Up 0.4668 0.5332 1.16723
## 1002 Up 0.4740 0.5260 0.83335
## 1003 Up 0.4928 0.5072 -0.03793
table(lda.pred$class, Smarket.2005$Direction)
##
## Down Up
## Down 35 35
## Up 76 106
mean(lda.pred$class == Smarket.2005$Direction)
## [1] 0.5595
K-Nearest Neighbors
# K-nearest neighbours (k = 10) on Lag1 and Lag2.
library(class)
# ?knn
# Explicit column references instead of attach(); with() keeps the
# column names Lag1/Lag2 on the predictor matrix.
Xlag <- with(Smarket, cbind(Lag1, Lag2))
train <- Smarket$Year < 2005
Direction <- Smarket$Direction
# NOTE: knn() breaks distance ties at random, so repeated runs can give
# slightly different tables unless set.seed() is called first.
knn.pred <- knn(Xlag[train, ], Xlag[!train, ], Direction[train], k = 10)
table(knn.pred, Direction[!train])
##
## knn.pred Down Up
## Down 51 62
## Up 60 79
mean(knn.pred == Direction[!train])
## [1] 0.5159
Credit
Please note: this material is extracted from the online Statistical Learning course at Stanford University by Prof. T. Hastie and Prof. R. Tibshirani. It is intended only as a quick future reference for R and statistical learning. Please visit the course page for more information and materials.
No comments:
Post a Comment