Thursday, February 25, 2021

Chapter 3 Lab: Linear Regression in Python - An Introduction to Statistical Learning

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model as lm
import statsmodels.api as sm
import seaborn as sns; sns.set(style="ticks", color_codes=True)
In [2]:
df=pd.read_csv('data/Boston.csv')
In [3]:
df.describe()
Out[3]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000
In [4]:
sns.pairplot(df)
Out[4]:
<seaborn.axisgrid.PairGrid at 0x1cce7a3d888>
In [5]:
sns.heatmap(df.cov())
df.cov()
df.corr()
sns.heatmap(df.corr())
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf0d6d808>
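As a small variation (not in the original cell), annotating the heatmap makes the strong pairwise correlations, such as lstat with medv, easy to read off:

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)  # annotate each cell with its correlation
plt.show()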

Simple Linear Regression

In [10]:
X=df.lstat.values.reshape(-1,1)   # predictor as a column vector, as sklearn expects
Y=df.medv.values.reshape(-1,1)   # response
In [11]:
model=lm.LinearRegression()
model.fit(X,Y)
Out[11]:
LinearRegression()
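scikit-learn stores the estimates on the fitted object itself; a quick check (not shown in the original run) that should roughly match the statsmodels table below:

print(model.intercept_)  # intercept, expected near 34.55
print(model.coef_)       # slope on lstat, expected near -0.95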
In [12]:
X=sm.add_constant(X)   # statsmodels OLS needs the intercept column added explicitly
In [13]:
model=sm.OLS(Y,X)
In [14]:
result=model.fit()
In [15]:
result.summary2()
Out[15]:
Model: OLS Adj. R-squared: 0.543
Dependent Variable: y AIC: 3286.9750
Date: 2021-02-25 16:35 BIC: 3295.4280
No. Observations: 506 Log-Likelihood: -1641.5
Df Model: 1 F-statistic: 601.6
Df Residuals: 504 Prob (F-statistic): 5.08e-88
R-squared: 0.544 Scale: 38.636
Coef. Std.Err. t P>|t| [0.025 0.975]
const 34.5538 0.5626 61.4151 0.0000 33.4485 35.6592
x1 -0.9500 0.0387 -24.5279 0.0000 -1.0261 -0.8740
Omnibus: 137.043 Durbin-Watson: 0.892
Prob(Omnibus): 0.000 Jarque-Bera (JB): 291.373
Skew: 1.453 Prob(JB): 0.000
Kurtosis: 5.319 Condition No.: 30
In [56]:
newX=np.array([[1,5],[1,10], [1,15]])   # constant column plus lstat = 5, 10, 15
pre=result.get_prediction(newX)
In [17]:
pre.summary_frame()
Out[17]:
mean mean_se mean_ci_lower mean_ci_upper obs_ci_lower obs_ci_upper
0 29.803594 0.405247 29.007412 30.599776 17.565675 42.041513
1 25.053347 0.294814 24.474132 25.632563 12.827626 37.279068
2 20.303101 0.290893 19.731588 20.874613 8.077742 32.528459
In [18]:
pre.conf_int()
Out[18]:
array([[29.00741194, 30.59977628],
       [24.47413202, 25.63256267],
       [19.73158815, 20.87461299]])
In [19]:
pre.conf_int(obs=True)
Out[19]:
array([[17.56567478, 42.04151344],
       [12.82762635, 37.27906833],
       [ 8.0777421 , 32.52845905]])
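To see both intervals at once, here is a sketch (my addition, reusing the fitted result) that draws the regression line with its 95% confidence band and the wider prediction band:

import matplotlib.pyplot as plt
grid = sm.add_constant(np.linspace(df.lstat.min(), df.lstat.max(), 100))
frame = result.get_prediction(grid).summary_frame()
plt.scatter(df.lstat, df.medv, s=10, alpha=0.5)
plt.plot(grid[:, 1], frame["mean"], color="k")                                            # fitted line
plt.fill_between(grid[:, 1], frame["mean_ci_lower"], frame["mean_ci_upper"], alpha=0.4)   # CI for the mean
plt.fill_between(grid[:, 1], frame["obs_ci_lower"], frame["obs_ci_upper"], alpha=0.2)     # prediction interval
plt.xlabel("lstat"); plt.ylabel("medv"); plt.show()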
In [20]:
np.std(result.resid)
Out[20]:
6.20346413142642
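Note that np.std divides by n, while the residual standard error reported in textbooks divides by the residual degrees of freedom (n - 2 here); statsmodels exposes the latter directly:

np.sqrt(result.mse_resid)   # residual standard error, about 6.216 (sqrt of Scale = 38.636 above)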
In [21]:
inf=result.get_influence()
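The influence object is computed above but never inspected; a short sketch (my addition) of the diagnostics it carries, roughly analogous to hatvalues() and the studentized residuals in the R version of this lab:

import matplotlib.pyplot as plt
leverage = inf.hat_matrix_diag                 # hat values (leverage)
student = inf.resid_studentized_internal       # internally studentized residuals
print(np.argmax(leverage))                     # index of the highest-leverage observation
plt.scatter(leverage, student, s=10)
plt.xlabel("leverage"); plt.ylabel("studentized residuals"); plt.show()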
In [22]:
from statsmodels.compat import lzip
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
In [23]:
model2=ols("medv~lstat",data=df).fit()
In [24]:
model2.summary2()
Out[24]:
Model: OLS Adj. R-squared: 0.543
Dependent Variable: medv AIC: 3286.9750
Date: 2021-02-25 16:35 BIC: 3295.4280
No. Observations: 506 Log-Likelihood: -1641.5
Df Model: 1 F-statistic: 601.6
Df Residuals: 504 Prob (F-statistic): 5.08e-88
R-squared: 0.544 Scale: 38.636
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 34.5538 0.5626 61.4151 0.0000 33.4485 35.6592
lstat -0.9500 0.0387 -24.5279 0.0000 -1.0261 -0.8740
Omnibus: 137.043 Durbin-Watson: 0.892
Prob(Omnibus): 0.000 Jarque-Bera (JB): 291.373
Skew: 1.453 Prob(JB): 0.000
Kurtosis: 5.319 Condition No.: 30
In [25]:
fig, ax = plt.subplots()
sm.graphics.plot_fit(model2,1,ax=ax)
plt.show()
In [26]:
sns.regplot(x='lstat',y='medv', data=df)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf3d23288>
In [27]:
from statsmodels.graphics.regressionplots import abline_plot
fig, ax = plt.subplots()                  # fresh axes; the figure from In [25] was already rendered
ax.scatter(df.lstat, df.medv, s=10)       # scatter the data so the fitted line has context
abline_plot(model_results=model2, ax=ax)
Out[27]:
In [28]:
sns.residplot(x='lstat',y='medv', data=df)
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf3e7a588>
In [29]:
fig,(ax0,ax1)=plt.subplots(1,2)
sns.regplot(x='lstat',y='medv', data=df,ax=ax0)
sns.residplot(x='lstat',y='medv', data=df,ax=ax1)
#abline_plot(model_results=model2,ax=axs[0,0])
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf3efb848>
In [30]:
sm.graphics.plot_partregress_grid(model2)
Out[30]:

Multiple Linear Regression

In [32]:
mul_model=ols("medv~lstat+age",data=df)
In [33]:
res_mm=mul_model.fit()
In [34]:
res_mm.summary2()
Out[34]:
Model: OLS Adj. R-squared: 0.549
Dependent Variable: medv AIC: 3281.0064
Date: 2021-02-25 16:35 BIC: 3293.6860
No. Observations: 506 Log-Likelihood: -1637.5
Df Model: 2 F-statistic: 309.0
Df Residuals: 503 Prob (F-statistic): 2.98e-88
R-squared: 0.551 Scale: 38.108
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 33.2228 0.7308 45.4579 0.0000 31.7869 34.6586
lstat -1.0321 0.0482 -21.4163 0.0000 -1.1267 -0.9374
age 0.0345 0.0122 2.8256 0.0049 0.0105 0.0586
Omnibus: 124.288 Durbin-Watson: 0.945
Prob(Omnibus): 0.000 Jarque-Bera (JB): 244.026
Skew: 1.362 Prob(JB): 0.000
Kurtosis: 5.038 Condition No.: 201
In [35]:
formula="medv~"+"+".join(df.columns[df.columns!='medv'].values)   # regress medv on all other columns
mul_modelAll=ols(formula,data=df)
res_mmAll=mul_modelAll.fit()
res_mmAll.summary2()
Out[35]:
Model: OLS Adj. R-squared: 0.734
Dependent Variable: medv AIC: 3025.6086
Date: 2021-02-25 16:35 BIC: 3084.7801
No. Observations: 506 Log-Likelihood: -1498.8
Df Model: 13 F-statistic: 108.1
Df Residuals: 492 Prob (F-statistic): 6.72e-135
R-squared: 0.741 Scale: 22.518
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 36.4595 5.1035 7.1441 0.0000 26.4322 46.4868
crim -0.1080 0.0329 -3.2865 0.0011 -0.1726 -0.0434
zn 0.0464 0.0137 3.3816 0.0008 0.0194 0.0734
indus 0.0206 0.0615 0.3343 0.7383 -0.1003 0.1414
chas 2.6867 0.8616 3.1184 0.0019 0.9939 4.3796
nox -17.7666 3.8197 -4.6513 0.0000 -25.2716 -10.2616
rm 3.8099 0.4179 9.1161 0.0000 2.9887 4.6310
age 0.0007 0.0132 0.0524 0.9582 -0.0253 0.0266
dis -1.4756 0.1995 -7.3980 0.0000 -1.8675 -1.0837
rad 0.3060 0.0663 4.6129 0.0000 0.1757 0.4364
tax -0.0123 0.0038 -3.2800 0.0011 -0.0197 -0.0049
ptratio -0.9527 0.1308 -7.2825 0.0000 -1.2098 -0.6957
black 0.0093 0.0027 3.4668 0.0006 0.0040 0.0146
lstat -0.5248 0.0507 -10.3471 0.0000 -0.6244 -0.4251
Omnibus: 178.041 Durbin-Watson: 1.078
Prob(Omnibus): 0.000 Jarque-Bera (JB): 783.126
Skew: 1.521 Prob(JB): 0.000
Kurtosis: 8.281 Condition No.: 15114
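Individual fit statistics can be pulled off the results object rather than read from the table, for example:

print(res_mmAll.rsquared)             # 0.741
print(np.sqrt(res_mmAll.mse_resid))   # residual standard error
print(res_mmAll.fvalue)               # F-statistic, 108.1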
In [36]:
wantedColumns=set(df.columns.values).difference(set(['medv','indus']))   # drop the response and indus (p = 0.74 above)
formula="medv~"+"+".join(wantedColumns)
mul_modelAllRefined=ols(formula,data=df)
res_mmAllRefined=mul_modelAllRefined.fit()
res_mmAllRefined.summary2()
Out[36]:
Model: OLS Adj. R-squared: 0.734
Dependent Variable: medv AIC: 3023.7235
Date: 2021-02-25 16:35 BIC: 3078.6685
No. Observations: 506 Log-Likelihood: -1498.9
Df Model: 12 F-statistic: 117.3
Df Residuals: 493 Prob (F-statistic): 6.42e-136
R-squared: 0.741 Scale: 22.477
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 36.3639 5.0908 7.1430 0.0000 26.3614 46.3663
age 0.0007 0.0132 0.0528 0.9579 -0.0252 0.0266
chas 2.7164 0.8562 3.1725 0.0016 1.0341 4.3987
zn 0.0459 0.0136 3.3681 0.0008 0.0191 0.0727
crim -0.1084 0.0328 -3.3042 0.0010 -0.1729 -0.0439
nox -17.4295 3.6809 -4.7351 0.0000 -24.6618 -10.1973
rad 0.2999 0.0637 4.7101 0.0000 0.1748 0.4250
lstat -0.5235 0.0505 -10.3611 0.0000 -0.6227 -0.4242
rm 3.7970 0.4158 9.1323 0.0000 2.9801 4.6139
ptratio -0.9471 0.1296 -7.3075 0.0000 -1.2017 -0.6924
black 0.0093 0.0027 3.4607 0.0006 0.0040 0.0146
tax -0.0118 0.0034 -3.4888 0.0005 -0.0184 -0.0051
dis -1.4896 0.1948 -7.6477 0.0000 -1.8724 -1.1069
Omnibus: 178.124 Durbin-Watson: 1.079
Prob(Omnibus): 0.000 Jarque-Bera (JB): 784.481
Skew: 1.521 Prob(JB): 0.000
Kurtosis: 8.287 Condition No.: 14978
In [37]:
fig = plt.figure(figsize=(20, 20))   # one large canvas so all 13 partial-regression panels fit
sm.graphics.plot_partregress_grid(res_mmAll, fig=fig)
D:\apps\Anaconda3\envs\dev\lib\site-packages\statsmodels\graphics\regressionplots.py:561: UserWarning: Tight layout not applied. tight_layout cannot make axes height small enough to accommodate all axes decorations
  fig.tight_layout()
Out[37]:
In [38]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
In [39]:
X=df.iloc[:,0:-1].values   # all predictors; the response medv is the last column and is excluded
#{vif(X,i)  for i in range(X.shape[1])}
{n:vif(X,i)  for n,i in zip(df.columns[:-1].values,range(X.shape[1]))}
Out[39]:
{'crim': 2.1003728199615233,
 'zn': 2.8440132669462637,
 'indus': 14.485757706539308,
 'chas': 1.1529518589418775,
 'nox': 73.89494652814788,
 'rm': 77.94828304638538,
 'age': 21.38685048994314,
 'dis': 14.6996523837492,
 'rad': 15.167724857920897,
 'tax': 61.227274009649456,
 'ptratio': 85.02954731061801,
 'black': 20.104942636229136,
 'lstat': 11.102024772203539}
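These values are noticeably larger than the ones usually quoted for this data. The reason is that variance_inflation_factor regresses each column on the others exactly as given, without adding an intercept, so a constant column must be supplied explicitly. A sketch of the corrected computation, which should give the familiar textbook values (crim around 1.8, ptratio around 1.8, rad around 7.5):

X_c = sm.add_constant(df.iloc[:, 0:-1])                              # predictors plus an explicit intercept column
{n: vif(X_c.values, i + 1) for i, n in enumerate(X_c.columns[1:])}   # skip the const column itself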
In [40]:
{n:vif(df.values,i)  for n,i in zip(df.columns.values,range(df.shape[1]))}
Out[40]:
{'crim': 2.1314042398916406,
 'zn': 2.9100040831335887,
 'indus': 14.485874343285056,
 'chas': 1.1762659532928834,
 'nox': 74.00426852716569,
 'rm': 136.1017425464841,
 'age': 21.39886253165994,
 'dis': 15.430455486916713,
 'rad': 15.36997984977581,
 'tax': 61.93971317434026,
 'ptratio': 87.22723281176584,
 'black': 21.35101506934191,
 'lstat': 12.615187641345385,
 'medv': 24.503206285254794}
In [41]:
wantedColumns=set(df.columns.values).difference(set(['medv','indus','rm']))   # additionally drop rm, which showed a high VIF above
formula="medv~"+"+".join(wantedColumns)
mul_modelAllRefined2=ols(formula,data=df)
res_mmAllRefined2=mul_modelAllRefined2.fit()
res_mmAllRefined2.summary2()
Out[41]:
Model: OLS Adj. R-squared: 0.690
Dependent Variable: medv AIC: 3100.8063
Date: 2021-02-25 16:36 BIC: 3151.5247
No. Observations: 506 Log-Likelihood: -1538.4
Df Model: 11 F-statistic: 103.2
Df Residuals: 494 Prob (F-statistic): 2.45e-120
R-squared: 0.697 Scale: 26.226
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 69.7816 3.8230 18.2529 0.0000 62.2701 77.2930
age 0.0257 0.0139 1.8437 0.0658 -0.0017 0.0531
chas 2.8877 0.9247 3.1230 0.0019 1.0710 4.7045
zn 0.0668 0.0145 4.5966 0.0000 0.0382 0.0953
crim -0.1153 0.0354 -3.2545 0.0012 -0.1849 -0.0457
nox -21.5501 3.9461 -5.4611 0.0000 -29.3034 -13.7969
rad 0.4115 0.0675 6.0975 0.0000 0.2789 0.5441
lstat -0.7753 0.0457 -16.9550 0.0000 -0.8651 -0.6855
ptratio -1.1526 0.1379 -8.3599 0.0000 -1.4234 -0.8817
black 0.0068 0.0029 2.3524 0.0190 0.0011 0.0124
tax -0.0157 0.0036 -4.3473 0.0000 -0.0228 -0.0086
dis -1.6983 0.2090 -8.1276 0.0000 -2.1088 -1.2877
Omnibus: 125.993 Durbin-Watson: 1.244
Prob(Omnibus): 0.000 Jarque-Bera (JB): 294.346
Skew: 1.279 Prob(JB): 0.000
Kurtosis: 5.724 Condition No.: 12346

Interaction Terms

In [42]:
mul_model_i=ols("medv~lstat*age",data=df).fit()
In [43]:
mul_model_i.summary2()
Out[43]:
Model: OLS Adj. R-squared: 0.553
Dependent Variable: medv AIC: 3277.9547
Date: 2021-02-25 16:36 BIC: 3294.8609
No. Observations: 506 Log-Likelihood: -1635.0
Df Model: 3 F-statistic: 209.3
Df Residuals: 502 Prob (F-statistic): 4.86e-88
R-squared: 0.556 Scale: 37.804
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 36.0885 1.4698 24.5528 0.0000 33.2007 38.9763
lstat -1.3921 0.1675 -8.3134 0.0000 -1.7211 -1.0631
age -0.0007 0.0199 -0.0363 0.9711 -0.0398 0.0383
lstat:age 0.0042 0.0019 2.2443 0.0252 0.0005 0.0078
Omnibus: 135.601 Durbin-Watson: 0.965
Prob(Omnibus): 0.000 Jarque-Bera (JB): 296.955
Skew: 1.417 Prob(JB): 0.000
Kurtosis: 5.461 Condition No.: 6878
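In patsy formula notation, lstat*age expands to the two main effects plus their interaction, so the model above could equivalently be written as:

ols("medv ~ lstat + age + lstat:age", data=df).fit().params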

Non-Linear Transformation of the Predictors

In [44]:
mul_model_non_lin=ols("medv~lstat+I(lstat**2)",data=df).fit()
In [45]:
mul_model_non_lin.summary2()
Out[45]:
Model: OLS Adj. R-squared: 0.639
Dependent Variable: medv AIC: 3168.5160
Date: 2021-02-25 16:36 BIC: 3181.1956
No. Observations: 506 Log-Likelihood: -1581.3
Df Model: 2 F-statistic: 448.5
Df Residuals: 503 Prob (F-statistic): 1.56e-112
R-squared: 0.641 Scale: 30.511
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 42.8620 0.8721 49.1489 0.0000 41.1486 44.5754
lstat -2.3328 0.1238 -18.8430 0.0000 -2.5761 -2.0896
I(lstat ** 2) 0.0435 0.0037 11.6275 0.0000 0.0362 0.0509
Omnibus: 107.006 Durbin-Watson: 0.921
Prob(Omnibus): 0.000 Jarque-Bera (JB): 228.388
Skew: 1.128 Prob(JB): 0.000
Kurtosis: 5.397 Condition No.: 1135
In [46]:
from statsmodels.stats.anova import anova_lm
model2=ols("medv~lstat",data=df).fit()
In [47]:
anova_lm(model2, mul_model_non_lin)
D:\apps\Anaconda3\envs\dev\lib\site-packages\scipy\stats\_distn_infrastructure.py:1932: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= _a)
Out[47]:
df_resid ssr df_diff ss_diff F Pr(>F)
0 504.0 19472.381418 0.0 NaN NaN NaN
1 503.0 15347.243158 1.0 4125.13826 135.199822 7.630116e-28
In [48]:
model_poly=ols("medv~I(lstat**1)+I(lstat**2)+I(lstat**3)+I(lstat**4)+I(lstat**5)",data=df).fit()
In [49]:
model_poly.summary2()
Out[49]:
Model: OLS Adj. R-squared: 0.679
Dependent Variable: medv AIC: 3113.2474
Date: 2021-02-25 16:36 BIC: 3138.6066
No. Observations: 506 Log-Likelihood: -1550.6
Df Model: 5 F-statistic: 214.2
Df Residuals: 500 Prob (F-statistic): 8.73e-122
R-squared: 0.682 Scale: 27.194
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 67.6997 3.6043 18.7831 0.0000 60.6182 74.7811
I(lstat ** 1) -11.9911 1.5257 -7.8593 0.0000 -14.9887 -8.9935
I(lstat ** 2) 1.2728 0.2232 5.7034 0.0000 0.8344 1.7113
I(lstat ** 3) -0.0683 0.0144 -4.7471 0.0000 -0.0965 -0.0400
I(lstat ** 4) 0.0017 0.0004 4.1426 0.0000 0.0009 0.0025
I(lstat ** 5) -0.0000 0.0000 -3.6919 0.0002 -0.0000 -0.0000
Omnibus: 144.085 Durbin-Watson: 0.987
Prob(Omnibus): 0.000 Jarque-Bera (JB): 494.545
Skew: 1.292 Prob(JB): 0.000
Kurtosis: 7.096 Condition No.: 137417262
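The condition number of roughly 1.4e8 reflects how collinear the raw powers of lstat are. One mitigation (my variation, not part of the original lab) is to standardize lstat before forming the powers; the fit is the same up to reparameterization but far better conditioned:

df_z = df.assign(z=(df.lstat - df.lstat.mean()) / df.lstat.std())   # hypothetical helper column 'z'
ols("medv ~ z + I(z**2) + I(z**3) + I(z**4) + I(z**5)", data=df_z).fit().condition_number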

Qualitative Predictors

In [50]:
car_df=pd.read_csv('data/Carseats.csv', index_col=0)
In [51]:
car_df.describe()
Out[51]:
Sales CompPrice Income Advertising Population Price Age Education
count 400.000000 400.000000 400.000000 400.000000 400.000000 400.000000 400.000000 400.000000
mean 7.496325 124.975000 68.657500 6.635000 264.840000 115.795000 53.322500 13.900000
std 2.824115 15.334512 27.986037 6.650364 147.376436 23.676664 16.200297 2.620528
min 0.000000 77.000000 21.000000 0.000000 10.000000 24.000000 25.000000 10.000000
25% 5.390000 115.000000 42.750000 0.000000 139.000000 100.000000 39.750000 12.000000
50% 7.490000 125.000000 69.000000 5.000000 272.000000 117.000000 54.500000 14.000000
75% 9.320000 135.000000 91.000000 12.000000 398.500000 131.000000 66.000000 16.000000
max 16.270000 175.000000 120.000000 29.000000 509.000000 191.000000 80.000000 18.000000
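describe() only summarizes the numeric columns, which is why the three qualitative predictors are missing from the table above; they can be summarized separately:

car_df.describe(include="object")   # counts, unique levels, and modes for ShelveLoc, Urban, US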
In [71]:
#car_df.boxplot()
sns.set(rc={'figure.figsize':(9,4)})
sns.boxplot(data=car_df)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccfff3b248>
In [59]:
car_df.head()
Out[59]:
Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US
1 9.50 138 73 11 276 120 Bad 42 17 Yes Yes
2 11.22 111 48 16 260 83 Good 65 10 Yes Yes
3 10.06 113 35 10 269 80 Medium 59 12 Yes Yes
4 7.40 117 100 4 466 97 Medium 55 14 Yes Yes
5 4.15 141 64 3 340 128 Bad 38 13 Yes No
In [58]:
features="+".join(set(car_df.columns.values).difference(["Sales"]))
print(features)
cat_model=ols("Sales~"+features+"+Income:Advertising+Price:Age",data=car_df).fit()
Advertising+Population+Income+Age+Education+US+Price+ShelveLoc+Urban+CompPrice
In [55]:
cat_model.summary2()
Out[55]:
Model: OLS Adj. R-squared: 0.872
Dependent Variable: Sales AIC: 1157.3378
Date: 2021-02-25 16:36 BIC: 1213.2183
No. Observations: 400 Log-Likelihood: -564.67
Df Model: 13 F-statistic: 210.0
Df Residuals: 386 Prob (F-statistic): 6.14e-166
R-squared: 0.876 Scale: 1.0213
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 6.5756 1.0087 6.5185 0.0000 4.5922 8.5589
US[T.Yes] -0.1576 0.1489 -1.0580 0.2907 -0.4504 0.1352
ShelveLoc[T.Good] 4.8487 0.1528 31.7243 0.0000 4.5482 5.1492
ShelveLoc[T.Medium] 1.9533 0.1258 15.5307 0.0000 1.7060 2.2005
Urban[T.Yes] 0.1402 0.1124 1.2470 0.2132 -0.0808 0.3612
Advertising 0.0702 0.0226 3.1070 0.0020 0.0258 0.1147
Population 0.0002 0.0004 0.4329 0.6653 -0.0006 0.0009
Income 0.0109 0.0026 4.1828 0.0000 0.0058 0.0160
Age -0.0579 0.0160 -3.6329 0.0003 -0.0893 -0.0266
Education -0.0209 0.0196 -1.0632 0.2884 -0.0594 0.0177
Price -0.1008 0.0074 -13.5494 0.0000 -0.1154 -0.0862
CompPrice 0.0929 0.0041 22.5668 0.0000 0.0848 0.1010
Income:Advertising 0.0008 0.0003 2.6976 0.0073 0.0002 0.0013
Price:Age 0.0001 0.0001 0.8007 0.4238 -0.0002 0.0004
Omnibus: 1.281 Durbin-Watson: 2.047
Prob(Omnibus): 0.527 Jarque-Bera (JB): 1.147
Skew: 0.129 Prob(JB): 0.564
Kurtosis: 3.050 Condition No.: 130576
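patsy builds the dummy variables behind the scenes, using treatment coding with the alphabetically first level (Bad) as the baseline, which is why the table shows ShelveLoc[T.Good] and ShelveLoc[T.Medium]. To look at the coding directly (roughly what contrasts() shows in the R version of this lab):

pd.get_dummies(car_df["ShelveLoc"], drop_first=True).head()   # 'Bad' is the dropped baseline level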
