This text contains:
- Simple linear regression
- Multiple linear regression
- Nonlinear terms and Interactions
- Qualitative predictors
# Libraries containing example data
library(MASS)
library(ISLR)
Simple linear regression
# Column names of the Boston data set
colnames(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
## ?Boston
# Scatter plot of the response medv against the predictor lstat
plot(medv ~ lstat, data = Boston)
# Least-squares fit of medv on the single predictor lstat
fit1 <- lm(medv ~ lstat, data = Boston)
# Printing the model object shows only the call and the coefficients
fit1
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Coefficients:
## (Intercept) lstat
## 34.55 -0.95
# summary() adds residual quantiles, standard errors, t tests,
# R-squared and the overall F statistic
summary(fit1)
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.17 -3.99 -1.32 2.03 24.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.5538 0.5626 61.4 <2e-16 ***
## lstat -0.9500 0.0387 -24.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.22 on 504 degrees of freedom
## Multiple R-squared: 0.544, Adjusted R-squared: 0.543
## F-statistic: 602 on 1 and 504 DF, p-value: <2e-16
# Draw the fitted regression line on the existing scatter plot
abline(fit1, col = "red")
# Components stored in the fitted model object
names(fit1)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# Confidence intervals for the coefficients (2.5 % and 97.5 % bounds)
confint(fit1)
## 2.5 % 97.5 %
## (Intercept) 33.448 35.659
## lstat -1.026 -0.874
# Predicted medv with confidence bounds at lstat = 5, 10 and 15
predict(
  fit1,
  newdata = data.frame(lstat = c(5, 10, 15)),
  interval = "confidence"
)
## fit lwr upr
## 1 29.80 29.01 30.60
## 2 25.05 24.47 25.63
## 3 20.30 19.73 20.87
Multiple linear regression
# Two-predictor model: medv explained by lstat and age
fit2 <- lm(medv ~ lstat + age, data = Boston)
summary(fit2)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.98 -3.98 -1.28 1.97 23.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.2228 0.7308 45.46 <2e-16 ***
## lstat -1.0321 0.0482 -21.42 <2e-16 ***
## age 0.0345 0.0122 2.83 0.0049 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.17 on 503 degrees of freedom
## Multiple R-squared: 0.551, Adjusted R-squared: 0.549
## F-statistic: 309 on 2 and 503 DF, p-value: <2e-16
# The `.` on the right-hand side stands for every column of Boston
# other than the response medv
fit3 <- lm(medv ~ ., data = Boston)
summary(fit3)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.594 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.65e+01 5.10e+00 7.14 3.3e-12 ***
## crim -1.08e-01 3.29e-02 -3.29 0.00109 **
## zn 4.64e-02 1.37e-02 3.38 0.00078 ***
## indus 2.06e-02 6.15e-02 0.33 0.73829
## chas 2.69e+00 8.62e-01 3.12 0.00193 **
## nox -1.78e+01 3.82e+00 -4.65 4.2e-06 ***
## rm 3.81e+00 4.18e-01 9.12 < 2e-16 ***
## age 6.92e-04 1.32e-02 0.05 0.95823
## dis -1.48e+00 1.99e-01 -7.40 6.0e-13 ***
## rad 3.06e-01 6.63e-02 4.61 5.1e-06 ***
## tax -1.23e-02 3.76e-03 -3.28 0.00111 **
## ptratio -9.53e-01 1.31e-01 -7.28 1.3e-12 ***
## black 9.31e-03 2.69e-03 3.47 0.00057 ***
## lstat -5.25e-01 5.07e-02 -10.35 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.75 on 492 degrees of freedom
## Multiple R-squared: 0.741, Adjusted R-squared: 0.734
## F-statistic: 108 on 13 and 492 DF, p-value: <2e-16
# Arrange the plotting device as a 2 x 2 grid, then draw the
# diagnostic plots for the full model
par(mfrow = c(2, 2))
plot(fit3)
# Refit the full model without age and indus; `. ~ .` keeps the
# original response and all remaining predictors
fit4 <- update(fit3, . ~ . - age - indus)
summary(fit4)
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + black + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.598 -2.739 -0.505 1.727 26.237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.34115 5.06749 7.17 2.7e-12 ***
## crim -0.10841 0.03278 -3.31 0.00101 **
## zn 0.04584 0.01352 3.39 0.00075 ***
## chas 2.71872 0.85424 3.18 0.00155 **
## nox -17.37602 3.53524 -4.92 1.2e-06 ***
## rm 3.80158 0.40632 9.36 < 2e-16 ***
## dis -1.49271 0.18573 -8.04 6.8e-15 ***
## rad 0.29961 0.06340 4.73 3.0e-06 ***
## tax -0.01178 0.00337 -3.49 0.00052 ***
## ptratio -0.94652 0.12907 -7.33 9.2e-13 ***
## black 0.00929 0.00267 3.47 0.00056 ***
## lstat -0.52255 0.04742 -11.02 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.74 on 494 degrees of freedom
## Multiple R-squared: 0.741, Adjusted R-squared: 0.735
## F-statistic: 128 on 11 and 494 DF, p-value: <2e-16
Nonlinear terms and Interactions
# In a model formula `*` is not arithmetic multiplication:
# lstat * age expands to lstat + age + lstat:age
# (both main effects plus their interaction)
fit5 <- lm(medv ~ lstat * age, data = Boston)
summary(fit5)
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.81 -4.04 -1.33 2.08 27.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.088536 1.469835 24.55 < 2e-16 ***
## lstat -1.392117 0.167456 -8.31 8.8e-16 ***
## age -0.000721 0.019879 -0.04 0.971
## lstat:age 0.004156 0.001852 2.24 0.025 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.15 on 502 degrees of freedom
## Multiple R-squared: 0.556, Adjusted R-squared: 0.553
## F-statistic: 209 on 3 and 502 DF, p-value: <2e-16
# Quadratic term in lstat; I() makes `^` act as arithmetic
# squaring rather than as a formula operator
fit6 <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(fit6)
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.28 -3.83 -0.53 2.31 25.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.86201 0.87208 49.1 <2e-16 ***
## lstat -2.33282 0.12380 -18.8 <2e-16 ***
## I(lstat^2) 0.04355 0.00375 11.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.52 on 503 degrees of freedom
## Multiple R-squared: 0.641, Adjusted R-squared: 0.639
## F-statistic: 449 on 2 and 503 DF, p-value: <2e-16
# Overlay the fitted values of the nonlinear models on the raw data.
# Columns are taken from Boston explicitly (data = / Boston$...) rather
# than via attach(Boston): attach() changes the search path for the rest
# of the session and is never detached here, so unqualified names such as
# lstat could silently resolve to the wrong object.
par(mfrow = c(1, 1)) # back to a single plot per page
plot(medv ~ lstat, data = Boston)
# Fitted values of the quadratic model fit6, in red
points(Boston$lstat, fitted(fit6), col = "red", pch = 20)
# Alternatively, poly() builds the polynomial terms directly
# (degree 4 here); its fitted values are shown in blue
fit7 <- lm(medv ~ poly(lstat, 4), data = Boston)
points(Boston$lstat, fitted(fit7), col = "blue", pch = 20)
Qualitative predictors
# fix(Carseats) # uncomment to edit the data set interactively
# Column names of the Carseats data set
colnames(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
# Per-column summary: quantiles for numeric columns, level counts
# for the factors (ShelveLoc, Urban, US)
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.00 Min. : 77 Min. : 21.0 Min. : 0.00
## 1st Qu.: 5.39 1st Qu.:115 1st Qu.: 42.8 1st Qu.: 0.00
## Median : 7.49 Median :125 Median : 69.0 Median : 5.00
## Mean : 7.50 Mean :125 Mean : 68.7 Mean : 6.63
## 3rd Qu.: 9.32 3rd Qu.:135 3rd Qu.: 91.0 3rd Qu.:12.00
## Max. :16.27 Max. :175 Max. :120.0 Max. :29.00
## Population Price ShelveLoc Age Education
## Min. : 10 Min. : 24 Bad : 96 Min. :25.0 Min. :10.0
## 1st Qu.:139 1st Qu.:100 Good : 85 1st Qu.:39.8 1st Qu.:12.0
## Median :272 Median :117 Medium:219 Median :54.5 Median :14.0
## Mean :265 Mean :116 Mean :53.3 Mean :13.9
## 3rd Qu.:398 3rd Qu.:131 3rd Qu.:66.0 3rd Qu.:16.0
## Max. :509 Max. :191 Max. :80.0 Max. :18.0
## Urban US
## No :118 No :142
## Yes:282 Yes:258
##
##
##
##
# Regress Sales on every other column plus two interaction terms,
# Income:Advertising and Age:Price.
# Note: this reuses the name fit1, replacing the earlier Boston model.
fit1 <- lm(Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
summary(fit1)
##
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.921 -0.750 0.018 0.675 3.341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.575565 1.008747 6.52 2.2e-10 ***
## CompPrice 0.092937 0.004118 22.57 < 2e-16 ***
## Income 0.010894 0.002604 4.18 3.6e-05 ***
## Advertising 0.070246 0.022609 3.11 0.00203 **
## Population 0.000159 0.000368 0.43 0.66533
## Price -0.100806 0.007440 -13.55 < 2e-16 ***
## ShelveLocGood 4.848676 0.152838 31.72 < 2e-16 ***
## ShelveLocMedium 1.953262 0.125768 15.53 < 2e-16 ***
## Age -0.057947 0.015951 -3.63 0.00032 ***
## Education -0.020852 0.019613 -1.06 0.28836
## UrbanYes 0.140160 0.112402 1.25 0.21317
## USYes -0.157557 0.148923 -1.06 0.29073
## Income:Advertising 0.000751 0.000278 2.70 0.00729 **
## Price:Age 0.000107 0.000133 0.80 0.42381
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.01 on 386 degrees of freedom
## Multiple R-squared: 0.876, Adjusted R-squared: 0.872
## F-statistic: 210 on 13 and 386 DF, p-value: <2e-16
# contrasts() shows the dummy-variable coding R uses when a
# qualitative (factor) predictor enters a linear regression
contrasts(Carseats[["ShelveLoc"]])
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
Credit
Please note: this material is extracted from the online Statistical Learning course at Stanford University by Prof. T. Hastie and Prof. R. Tibshirani. It is intended only as a quick reference for R and statistical learning. Please visit the course page for more information and materials.
No comments:
Post a Comment