Thursday, February 25, 2021

Chapter 7 Moving Beyond Linearity in Python - An Introduction to Statistical Learning

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sms

from sklearn.preprocessing import PolynomialFeatures 

import matplotlib.pyplot as plt
from patsy import dmatrix
In [2]:
data=pd.read_csv("data/Wage.csv", index_col=0)
df=data.loc[:,['age', 'wage']]
In [12]:
data.info()
#data.describe()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 231655 to 453557
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   year        3000 non-null   int64  
 1   age         3000 non-null   int64  
 2   sex         3000 non-null   object 
 3   maritl      3000 non-null   object 
 4   race        3000 non-null   object 
 5   education   3000 non-null   object 
 6   region      3000 non-null   object 
 7   jobclass    3000 non-null   object 
 8   health      3000 non-null   object 
 9   health_ins  3000 non-null   object 
 10  logwage     3000 non-null   float64
 11  wage        3000 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 384.7+ KB
In [17]:
sns.pairplot(df)
Out[17]:
<seaborn.axisgrid.PairGrid at 0x22908ad5288>
In [26]:
sns.heatmap(df.cov())
#sns.heatmap(np.cov(df.T))
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x2290aba1c88>
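The two covariances sit on very different scales, so the raw covariance heatmap is hard to read. A small sketch (my addition, not in the original notebook): plotting correlations instead puts every cell on a unit-free -1..1 scale.

In [ ]:
# correlations are unit-free, unlike covariances
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1)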
In [3]:
age_grid=df.age.unique()
age_grid=np.sort(age_grid).reshape(-1,1)

# extract columns 
Y=df.wage.values
X=df.age.values.reshape(-1,1)

pf = PolynomialFeatures(degree=4)
X_pol = pf.fit_transform(X)
age_grid_pol = pf.transform(age_grid)  # reuse the fitted transformer on the grid
In [154]:
model=sms.OLS(Y,X_pol)
result=model.fit()
result.summary2()
Out[154]:
Model: OLS Adj. R-squared: 0.085
Dependent Variable: y AIC: 30639.1079
Date: 2020-12-20 17:19 BIC: 30669.1397
No. Observations: 3000 Log-Likelihood: -15315.
Df Model: 4 F-statistic: 70.69
Df Residuals: 2995 Prob (F-statistic): 2.77e-57
R-squared: 0.086 Scale: 1593.2
Coef. Std.Err. t P>|t| [0.025 0.975]
const -184.1542 60.0404 -3.0672 0.0022 -301.8787 -66.4296
x1 21.2455 5.8867 3.6090 0.0003 9.7030 32.7880
x2 -0.5639 0.2061 -2.7357 0.0063 -0.9680 -0.1597
x3 0.0068 0.0031 2.2214 0.0264 0.0008 0.0128
x4 -0.0000 0.0000 -1.9519 0.0510 -0.0001 0.0000
Omnibus: 1097.594 Durbin-Watson: 1.960
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4965.521
Skew: 1.722 Prob(JB): 0.000
Kurtosis: 8.279 Condition No.: 567467311
In [186]:
pre_result=result.get_prediction(age_grid_pol)
prediction_ci=pre_result.summary_frame()
#prediction_ci.plot()
In [190]:
fig, (ax1,ax2) =plt.subplots(1,2)
ax1.plot(df.age, df.wage, 'm.')
ax2.plot(age_grid,prediction_ci.loc[:,'mean'], 'r-', 
         age_grid,prediction_ci.mean_ci_lower.values,'y--', 
         age_grid,prediction_ci.mean_ci_upper.values,'y--' )
fig.suptitle("Degree 4 Polynomial")
Out[190]:
Text(0.5, 0.98, 'Degree 4 Polynomial')
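The raw powers of age are highly collinear, which is why the summary above reports a condition number near 5.7e8. A minimal sketch (my addition, not in the original notebook): standardizing age before expanding tames the conditioning while leaving the fitted curve unchanged; the ISLR lab gets the same effect with R's orthogonal poly().

In [ ]:
# standardize age first, then expand; coefficients change but fitted values do not
X_std = (X - X.mean()) / X.std()
X_pol_std = PolynomialFeatures(degree=4).fit_transform(X_std)
result_std = sms.OLS(Y, X_pol_std).fit()
print(result_std.condition_number)  # far smaller than for raw powers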

Choosing Correct Model by ANOVA

In [206]:
def poly_linear_regression(d, X, Y):
    # fit a degree-d polynomial regression of Y on X
    X = X.reshape(-1, 1)

    pf = PolynomialFeatures(degree=d)
    X_pol = pf.fit_transform(X)
    model = sms.OLS(Y, X_pol)
    return model.fit()
In [207]:
fit1=poly_linear_regression(1,df.age.values,df.wage.values)
fit2=poly_linear_regression(2,df.age.values,df.wage.values)
fit3=poly_linear_regression(3,df.age.values,df.wage.values)
fit4=poly_linear_regression(4,df.age.values,df.wage.values)
fit5=poly_linear_regression(5,df.age.values,df.wage.values)
In [208]:
sms.stats.anova_lm(fit1,fit2,fit3,fit4,fit5, typ=1)
D:\apps\Anaconda3\envs\dev\lib\site-packages\scipy\stats\_distn_infrastructure.py:1932: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= _a)
Out[208]:
df_resid ssr df_diff ss_diff F Pr(>F)
0 2998.0 5.022216e+06 0.0 NaN NaN NaN
1 2997.0 4.793430e+06 1.0 228786.010128 143.593107 2.363850e-32
2 2996.0 4.777674e+06 1.0 15755.693664 9.888756 1.679202e-03
3 2995.0 4.771604e+06 1.0 6070.152124 3.809813 5.104620e-02
4 2994.0 4.770322e+06 1.0 1282.563017 0.804976 3.696820e-01
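Reading the table: each row tests the degree-d fit against the degree-(d-1) fit above it. The quadratic and cubic terms are clearly significant, the quartic is borderline (p ~ 0.051), and the quintic adds nothing (p ~ 0.37), so a cubic or quartic polynomial is a reasonable choice, matching the ISLR lab. As a complementary check (my addition), the information criteria of the fits point the same way:

In [ ]:
# lower AIC/BIC is better; fit1..fit5 are the results objects fitted above
for d, f in enumerate([fit1, fit2, fit3, fit4, fit5], start=1):
    print(f"degree {d}: AIC={f.aic:.1f}, BIC={f.bic:.1f}")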

Logistic Regression with Polynomial

In [5]:
model=sms.Logit(Y>250,X_pol)
model_log_results=model.fit()
model_log_results.summary2()
Optimization terminated successfully.
         Current function value: 0.116870
         Iterations 12
Out[5]:
Model: Logit Pseudo R-squared: 0.040
Dependent Variable: y AIC: 711.2198
Date: 2020-12-27 13:57 BIC: 741.2517
No. Observations: 3000 Log-Likelihood: -350.61
Df Model: 4 LL-Null: -365.27
Df Residuals: 2995 LLR p-value: 6.7472e-06
Converged: 1.0000 Scale: 1.0000
No. Iterations: 12.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
const -109.5530 47.6553 -2.2989 0.0215 -202.9556 -16.1504
x1 8.9950 4.1869 2.1484 0.0317 0.7888 17.2012
x2 -0.2816 0.1353 -2.0811 0.0374 -0.5469 -0.0164
x3 0.0039 0.0019 2.0219 0.0432 0.0001 0.0076
x4 -0.0000 0.0000 -1.9664 0.0492 -0.0000 -0.0000
In [103]:
prob_avg=model_log_results.predict(age_grid_pol, transform=False)
In [120]:
# use bootstrap method to obtain confidence intervals
def random_logit_model():
    n=X_pol.shape[0]
    seq=np.random.randint(0,n,n)
    Y_r=Y[seq]
    X_pol_r=X_pol[seq,:]
    return sms.Logit(Y_r>250,X_pol_r).fit(disp=False)

N=1000
res=np.zeros((N,age_grid_pol.shape[0]))
for i in range(N):
    model_random = random_logit_model()
    res[i, :] = model_random.predict(age_grid_pol, transform=False)

prob_std = np.std(res, axis=0)
# the 5th/95th bootstrap percentiles bracket a 90% interval for each fitted probability
prob_upper = np.percentile(res, 95, axis=0)
prob_lower = np.percentile(res, 5, axis=0)
In [121]:
fig, (ax1,ax2) =plt.subplots(1,2)
ax1.plot(df.age, df.wage, 'm.')
ax2.plot(age_grid,prob_avg, 'r-',
         age_grid,prob_lower,'g--', 
         age_grid,prob_upper,'g--' )
ax2.plot(age_grid,res[0:100,:].T, alpha=0.1)
fig.suptitle("Probability of Wage>250")
Out[121]:
Text(0.5, 0.98, 'Probability of Wage>250')
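An alternative to the bootstrap band (a sketch under the same fitted model, not in the original notebook) is the delta method: compute pointwise standard errors on the logit scale from the estimated coefficient covariance, then map the band through the logistic function.

In [ ]:
from scipy.special import expit  # logistic function

# linear predictor on the grid and its delta-method standard error
eta = age_grid_pol @ model_log_results.params
eta_se = np.sqrt(np.diagonal(age_grid_pol @ model_log_results.cov_params() @ age_grid_pol.T))

# transform the +/-2*SE band from the logit scale back to probabilities
band_lower = expit(eta - 2 * eta_se)
band_upper = expit(eta + 2 * eta_se)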

Step Function

In [154]:
X_c = pd.cut(df.age, 4)
X_d = pd.get_dummies(X_c)
# the four dummies sum to one, so adding a constant makes the design
# rank-deficient (hence the huge condition number below); OLS still fits
# via the pseudo-inverse, but drop_first=True would avoid the dummy trap
X_d_c = sms.add_constant(X_d)

model = sms.OLS(Y, X_d_c)
result = model.fit()
result.summary2()
Out[154]:
Model: OLS Adj. R-squared: 0.062
Dependent Variable: y AIC: 30714.1427
Date: 2020-12-27 16:56 BIC: 30738.1681
No. Observations: 3000 Log-Likelihood: -15353.
Df Model: 3 F-statistic: 66.58
Df Residuals: 2996 Prob (F-statistic): 1.13e-41
R-squared: 0.062 Scale: 1634.1
Coef. Std.Err. t P>|t| [0.025 0.975]
const 86.3984 1.0609 81.4354 0.0000 84.3182 88.4787
(17.938, 33.5] 7.7599 1.5598 4.9751 0.0000 4.7016 10.8183
(33.5, 49.0] 31.8134 1.3515 23.5402 0.0000 29.1636 34.4633
(49.0, 64.5] 31.4245 1.5441 20.3515 0.0000 28.3969 34.4521
(64.5, 80.0] 15.4005 3.8397 4.0109 0.0001 7.8719 22.9292
Omnibus: 1062.354 Durbin-Watson: 1.965
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4551.200
Skew: 1.681 Prob(JB): 0.000
Kurtosis: 8.011 Condition No.: 12514609934612772
In [185]:
# pd.cut needs a 1-D array; age_grid spans the same range as df.age,
# so four equal-width bins reproduce the training edges
X_new = pd.get_dummies(pd.cut(age_grid.ravel(), 4))
X_new = sms.add_constant(X_new)
y_hat = result.predict(X_new)
In [208]:
plt.scatter(X,Y, alpha=0.1)
plt.plot(age_grid,y_hat,'r-')
Out[208]:
[<matplotlib.lines.Line2D at 0x25e49339348>]
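A more defensive pattern (a sketch, assuming the variables defined above): capture the training bin edges with retbins=True and reuse them on any new grid, instead of relying on the grid spanning the same range as the training data.

In [ ]:
# retbins=True also returns the edges that were used on the training data
_, edges = pd.cut(df.age, 4, retbins=True)
grid_binned = pd.cut(age_grid.ravel(), bins=edges, include_lowest=True)
X_new2 = sms.add_constant(pd.get_dummies(grid_binned))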

Splines

In [221]:
X_bs = dmatrix("bs(X, knots=(25,40,60), degree=3, include_intercept=False)", {'X': X})
age_grid_bs = dmatrix("bs(X, knots=(25,40,60), degree=3, include_intercept=False)", {'X': age_grid})
model_bs = sms.OLS(Y, X_bs).fit()

yHat_bs = model_bs.predict(age_grid_bs)
# pointwise standard error of the fit: sqrt(diag(X @ cov(beta_hat) @ X.T))
yHat_bs_se = np.sqrt(np.diagonal((age_grid_bs @ model_bs.cov_params()) @ age_grid_bs.T))

plt.scatter(X,Y,alpha=0.1)
plt.plot(age_grid,yHat_bs,'r-')
plt.plot(age_grid,yHat_bs-2*yHat_bs_se,'g--')
plt.plot(age_grid,yHat_bs+2*yHat_bs_se,'g--')
Out[221]:
[<matplotlib.lines.Line2D at 0x25e4a6d7608>]
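Instead of hand-picking the knots, patsy can place them at quantiles of the data when given only the degrees of freedom. A short sketch (df=6 mirrors the ISLR lab's bs(age, df=6), which puts the three interior knots at the 25th, 50th and 75th percentiles):

In [ ]:
# df=6 with a cubic basis implies three interior knots at quantiles of X
X_bs_df = dmatrix("bs(X, df=6, degree=3, include_intercept=False)", {'X': X})
model_bs_df = sms.OLS(Y, X_bs_df).fit()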
In [220]:
# natural splines
# natural cubic splines (linear beyond the boundary knots)
X_ns = dmatrix("cr(X, df=4)", {'X': X})
age_grid_ns = dmatrix("cr(X, df=4)", {'X': age_grid})
model_ns = sms.OLS(Y, X_ns).fit()

yHat_ns = model_ns.predict(age_grid_ns)
# pointwise standard error of the fit: sqrt(diag(X @ cov(beta_hat) @ X.T))
yHat_ns_se = np.sqrt(np.diagonal((age_grid_ns @ model_ns.cov_params()) @ age_grid_ns.T))

plt.scatter(X,Y,alpha=0.1)
plt.plot(age_grid,yHat_ns,'r-')
plt.plot(age_grid,yHat_ns-2*yHat_ns_se,'g--')
plt.plot(age_grid,yHat_ns+2*yHat_ns_se,'g--')
Out[220]:
[<matplotlib.lines.Line2D at 0x25e4a74ad48>]

Local Regression

In [4]:
yHat_local_7=sms.nonparametric.lowess(Y,X[:,0], it=3,frac=0.7, return_sorted=False)
yHat_local_2=sms.nonparametric.lowess(Y,X[:,0], it=3,frac=0.2, return_sorted=False)
plt.scatter(X,Y,alpha=0.05)
plt.plot(X[:,0],yHat_local_7,'r.')
plt.plot(X[:,0],yHat_local_2,'g.')
Out[4]:
[<matplotlib.lines.Line2D at 0x26f8e4c7fc8>]
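With return_sorted=True (the default), lowess returns (x, fitted) pairs sorted by x, which draw as a connected line instead of a point cloud. A small sketch on the same data:

In [ ]:
# sorted output makes a line plot straightforward
smooth = sms.nonparametric.lowess(Y, X[:, 0], frac=0.2)
plt.plot(smooth[:, 0], smooth[:, 1], 'g-')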

Generalized Additive Model

In [39]:
# additive model: natural splines in year and age plus education dummies
X_gam = dmatrix("cr(year, df=4) + cr(age, df=5) + education", data)
model_gam = sms.OLS(Y, X_gam).fit()
Yhat_gam = model_gam.predict(X_gam)

sns.lineplot(data['year'], Yhat_gam)
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x26f91732888>
In [32]:
sns.lineplot(data['age'],Yhat_gam)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x26f91a876c8>
In [40]:
sns.boxplot(data['education'],Yhat_gam)
Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x26f92cb5d08>
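The dmatrix approach above fits an unpenalized GAM by plain least squares. statsmodels also ships a dedicated GAM class with penalized B-spline smoothers; a minimal sketch (my addition, assuming statsmodels >= 0.9; alpha=0 disables the penalty) that smooths year and age and keeps education as a linear term:

In [ ]:
from statsmodels.gam.api import GLMGam, BSplines

# B-spline smoothers for the two continuous predictors
x_spline = data[['year', 'age']]
bs_smoother = BSplines(x_spline, df=[4, 5], degree=[3, 3])

gam = GLMGam.from_formula('wage ~ education', data=data,
                          smoother=bs_smoother, alpha=0)
gam_res = gam.fit()
gam_res.plot_partial(1, cpr=True)  # partial effect of age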