Saturday, March 29, 2014

Linear Regression, Statistical Learning within R

Simple linear regression

This text contains:

  • Simple linear regression
  • Multiple linear regression
  • Nonlinear terms and Interactions
  • Qualitative predictors
# Libraries containing example data
library(MASS)
library(ISLR)

Simple linear regression

# Variables available in the Boston data set
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
# ?Boston  # uncomment to open the help page for the data set
# Scatterplot of the response (medv) against the single predictor (lstat)
plot(medv ~ lstat, data = Boston)
# Simple linear regression of medv on lstat; printing the fit shows the
# call and the estimated coefficients
fit1 <- lm(medv ~ lstat, data = Boston)
fit1
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Coefficients:
## (Intercept)        lstat  
##       34.55        -0.95
# Detailed summary of the fit: residual quantiles, coefficient table with
# standard errors and p-values, R-squared and the overall F-statistic
summary(fit1)
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.17  -3.99  -1.32   2.03  24.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.5538     0.5626    61.4   <2e-16 ***
## lstat        -0.9500     0.0387   -24.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.22 on 504 degrees of freedom
## Multiple R-squared:  0.544,  Adjusted R-squared:  0.543 
## F-statistic:  602 on 1 and 504 DF,  p-value: <2e-16
# Draw the fitted regression line from fit1 in red on the current plot
abline(fit1, col = "red")

plot of chunk unnamed-chunk-2

# Components stored inside the fitted lm object
names(fit1)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
# Confidence intervals (2.5 % and 97.5 % bounds) for the coefficients
confint(fit1)
##              2.5 % 97.5 %
## (Intercept) 33.448 35.659
## lstat       -1.026 -0.874
# Predicted medv with confidence bounds at lstat = 5, 10 and 15
predict(
  fit1,
  newdata = data.frame(lstat = c(5, 10, 15)),
  interval = "confidence"
)
##     fit   lwr   upr
## 1 29.80 29.01 30.60
## 2 25.05 24.47 25.63
## 3 20.30 19.73 20.87

Multiple linear regression

# Regression of medv on two predictors, lstat and age
fit2 <- lm(medv ~ lstat + age, data = Boston)
summary(fit2)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.98  -3.98  -1.28   1.97  23.16 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  33.2228     0.7308   45.46   <2e-16 ***
## lstat        -1.0321     0.0482  -21.42   <2e-16 ***
## age           0.0345     0.0122    2.83   0.0049 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.17 on 503 degrees of freedom
## Multiple R-squared:  0.551,  Adjusted R-squared:  0.549 
## F-statistic:  309 on 2 and 503 DF,  p-value: <2e-16
# "." on the right-hand side stands for every other column of Boston,
# so this regresses medv on all remaining variables
fit3 <- lm(medv ~ ., data = Boston)
summary(fit3)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.594  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.65e+01   5.10e+00    7.14  3.3e-12 ***
## crim        -1.08e-01   3.29e-02   -3.29  0.00109 ** 
## zn           4.64e-02   1.37e-02    3.38  0.00078 ***
## indus        2.06e-02   6.15e-02    0.33  0.73829    
## chas         2.69e+00   8.62e-01    3.12  0.00193 ** 
## nox         -1.78e+01   3.82e+00   -4.65  4.2e-06 ***
## rm           3.81e+00   4.18e-01    9.12  < 2e-16 ***
## age          6.92e-04   1.32e-02    0.05  0.95823    
## dis         -1.48e+00   1.99e-01   -7.40  6.0e-13 ***
## rad          3.06e-01   6.63e-02    4.61  5.1e-06 ***
## tax         -1.23e-02   3.76e-03   -3.28  0.00111 ** 
## ptratio     -9.53e-01   1.31e-01   -7.28  1.3e-12 ***
## black        9.31e-03   2.69e-03    3.47  0.00057 ***
## lstat       -5.25e-01   5.07e-02  -10.35  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.75 on 492 degrees of freedom
## Multiple R-squared:  0.741,  Adjusted R-squared:  0.734 
## F-statistic:  108 on 13 and 492 DF,  p-value: <2e-16
# Split the plotting device into a 2 x 2 grid so that the plots produced
# for the fitted model share one page
par(mfrow = c(2, 2))
# Plotting an lm object draws its diagnostic plots
plot(fit3)

plot of chunk unnamed-chunk-3

# Refit fit3 with the same response and predictors, minus age and indus
fit4 <- update(fit3, . ~ . - age - indus)
summary(fit4)
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + black + lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.598  -2.739  -0.505   1.727  26.237 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.34115    5.06749    7.17  2.7e-12 ***
## crim         -0.10841    0.03278   -3.31  0.00101 ** 
## zn            0.04584    0.01352    3.39  0.00075 ***
## chas          2.71872    0.85424    3.18  0.00155 ** 
## nox         -17.37602    3.53524   -4.92  1.2e-06 ***
## rm            3.80158    0.40632    9.36  < 2e-16 ***
## dis          -1.49271    0.18573   -8.04  6.8e-15 ***
## rad           0.29961    0.06340    4.73  3.0e-06 ***
## tax          -0.01178    0.00337   -3.49  0.00052 ***
## ptratio      -0.94652    0.12907   -7.33  9.2e-13 ***
## black         0.00929    0.00267    3.47  0.00056 ***
## lstat        -0.52255    0.04742  -11.02  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.74 on 494 degrees of freedom
## Multiple R-squared:  0.741,  Adjusted R-squared:  0.735 
## F-statistic:  128 on 11 and 494 DF,  p-value: <2e-16

Nonlinear terms and Interactions

# Note: in a model formula `*` is not arithmetic multiplication;
# lstat * age expands to lstat + age + lstat:age (both main effects plus
# their interaction)
fit5 <- lm(medv ~ lstat * age, data = Boston)
summary(fit5)
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.81  -4.04  -1.33   2.08  27.55 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.088536   1.469835   24.55  < 2e-16 ***
## lstat       -1.392117   0.167456   -8.31  8.8e-16 ***
## age         -0.000721   0.019879   -0.04    0.971    
## lstat:age    0.004156   0.001852    2.24    0.025 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.15 on 502 degrees of freedom
## Multiple R-squared:  0.556,  Adjusted R-squared:  0.553 
## F-statistic:  209 on 3 and 502 DF,  p-value: <2e-16
# Quadratic term in lstat; I() protects ^ so it is evaluated as arithmetic
# squaring rather than interpreted as a formula operator
fit6 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(fit6)
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.28  -3.83  -0.53   2.31  25.41 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.86201    0.87208    49.1   <2e-16 ***
## lstat       -2.33282    0.12380   -18.8   <2e-16 ***
## I(lstat^2)   0.04355    0.00375    11.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.52 on 503 degrees of freedom
## Multiple R-squared:  0.641,  Adjusted R-squared:  0.639 
## F-statistic:  449 on 2 and 503 DF,  p-value: <2e-16
# Compare the quadratic fit with a degree-4 polynomial fit on one plot.
# Boston is referenced explicitly (data = / Boston$lstat) instead of being
# attach()-ed, so no columns are placed on the search path where they
# could mask, or be masked by, other objects.
# Return to a single plotting panel
par(mfrow = c(1, 1))
plot(medv ~ lstat, data = Boston)
# Fitted values of the quadratic model, drawn as red points
points(Boston$lstat, fitted(fit6), col = "red", pch = 20)
# Alternatively, poly() builds the polynomial terms directly (degree 4 here)
fit7 <- lm(medv ~ poly(lstat, 4), data = Boston)
# Fitted values of the degree-4 polynomial model, drawn as blue points
points(Boston$lstat, fitted(fit7), col = "blue", pch = 20)

plot of chunk unnamed-chunk-4

Qualitative predictors

# fix(Carseats) # optional: the fix command can be used to edit the data
# Column names of the Carseats data set
names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"
# Per-column summary: quantiles and mean for numeric columns, level counts
# for the factors (ShelveLoc, Urban, US)
summary(Carseats)
##      Sales         CompPrice       Income       Advertising   
##  Min.   : 0.00   Min.   : 77   Min.   : 21.0   Min.   : 0.00  
##  1st Qu.: 5.39   1st Qu.:115   1st Qu.: 42.8   1st Qu.: 0.00  
##  Median : 7.49   Median :125   Median : 69.0   Median : 5.00  
##  Mean   : 7.50   Mean   :125   Mean   : 68.7   Mean   : 6.63  
##  3rd Qu.: 9.32   3rd Qu.:135   3rd Qu.: 91.0   3rd Qu.:12.00  
##  Max.   :16.27   Max.   :175   Max.   :120.0   Max.   :29.00  
##    Population      Price      ShelveLoc        Age         Education   
##  Min.   : 10   Min.   : 24   Bad   : 96   Min.   :25.0   Min.   :10.0  
##  1st Qu.:139   1st Qu.:100   Good  : 85   1st Qu.:39.8   1st Qu.:12.0  
##  Median :272   Median :117   Medium:219   Median :54.5   Median :14.0  
##  Mean   :265   Mean   :116                Mean   :53.3   Mean   :13.9  
##  3rd Qu.:398   3rd Qu.:131                3rd Qu.:66.0   3rd Qu.:16.0  
##  Max.   :509   Max.   :191                Max.   :80.0   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
## 
# Regress Sales on every other column, plus two interaction terms:
# Income:Advertising and Age:Price.
# Note: this reassigns fit1, replacing the earlier Boston model.
fit1 <- lm(Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
summary(fit1)
## 
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.921 -0.750  0.018  0.675  3.341 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.575565   1.008747    6.52  2.2e-10 ***
## CompPrice           0.092937   0.004118   22.57  < 2e-16 ***
## Income              0.010894   0.002604    4.18  3.6e-05 ***
## Advertising         0.070246   0.022609    3.11  0.00203 ** 
## Population          0.000159   0.000368    0.43  0.66533    
## Price              -0.100806   0.007440  -13.55  < 2e-16 ***
## ShelveLocGood       4.848676   0.152838   31.72  < 2e-16 ***
## ShelveLocMedium     1.953262   0.125768   15.53  < 2e-16 ***
## Age                -0.057947   0.015951   -3.63  0.00032 ***
## Education          -0.020852   0.019613   -1.06  0.28836    
## UrbanYes            0.140160   0.112402    1.25  0.21317    
## USYes              -0.157557   0.148923   -1.06  0.29073    
## Income:Advertising  0.000751   0.000278    2.70  0.00729 ** 
## Price:Age           0.000107   0.000133    0.80  0.42381    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01 on 386 degrees of freedom
## Multiple R-squared:  0.876,  Adjusted R-squared:  0.872 
## F-statistic:  210 on 13 and 386 DF,  p-value: <2e-16
# contrasts() shows how R codes a qualitative (factor) variable as dummy
# variables in the regression; here Bad is the baseline level
contrasts(Carseats$ShelveLoc)
##        Good Medium
## Bad       0      0
## Good      1      0
## Medium    0      1

Credit

Please note: this material is extracted from the online Statistical Learning course at Stanford University by Prof. T. Hastie and Prof. R. Tibshirani. It is intended only as a quick reference for future use in R and statistical learning. Please visit the course page for more information and materials.


No comments: