In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sms
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from patsy import dmatrix
In [2]:
# Load the ISLR Wage data set; the first CSV column is the row index.
data = pd.read_csv("data/Wage.csv", index_col=0)
# Keep only the two variables used in most of the analysis below.
df = data[['age', 'wage']]
In [12]:
# Quick structural overview: column names, dtypes, and non-null counts.
data.info()
In [17]:
# Pairwise scatter plots of age and wage, with marginal distributions.
sns.pairplot(df)
Out[17]:
In [26]:
# Heat map of the 2x2 covariance matrix of age and wage.
sns.heatmap(df.cov())
Out[26]:
In [3]:
# Sorted grid of the unique ages, as a column vector, for plotting predictions.
age_grid = np.sort(df.age.unique()).reshape(-1, 1)
# Extract response and predictor as plain arrays.
Y = df.wage.values
X = df.age.values.reshape(-1, 1)
# Degree-4 polynomial basis (includes the intercept column).
pf = PolynomialFeatures(degree=4)
X_pol = pf.fit_transform(X)
# Fix: use transform, not fit_transform — the transformer is already fitted
# on X, and the grid must be expanded with the same fitted basis.
age_grid_pol = pf.transform(age_grid)
In [154]:
# Ordinary least squares: wage regressed on the degree-4 polynomial of age.
model = sms.OLS(Y, X_pol)
result = model.fit()
# Rich regression summary as the cell output.
result.summary2()
Out[154]:
In [186]:
# Point predictions and confidence/prediction intervals over the age grid.
prediction_ci = result.get_prediction(age_grid_pol).summary_frame()
In [190]:
# Left panel: raw scatter of the data; right: fitted curve with its mean CI band.
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.plot(df.age, df.wage, 'm.')
ax2.plot(age_grid, prediction_ci['mean'], 'r-',
         age_grid, prediction_ci['mean_ci_lower'], 'y--',
         age_grid, prediction_ci['mean_ci_upper'], 'y--')
fig.suptitle("Degree 4 Polynomial")
Out[190]:
Choosing the Correct Model by ANOVA
In [206]:
def poly_linear_regression(d, X, Y):
    """Fit an OLS regression of Y on a degree-d polynomial basis of X.

    Parameters
    ----------
    d : int
        Polynomial degree.
    X : array-like
        Predictor values; reshaped internally to a column vector.
    Y : array-like
        Response values.

    Returns
    -------
    statsmodels RegressionResults for the fitted model.
    """
    # Bug fix: the original ignored the X argument and re-read the global
    # df.age, so the parameter had no effect. Use the argument instead.
    X = np.asarray(X).reshape(-1, 1)
    pf = PolynomialFeatures(degree=d)
    X_pol = pf.fit_transform(X)
    model = sms.OLS(Y, X_pol)
    return model.fit()
In [207]:
# Fit polynomials of degree 1 through 5 for nested-model comparison.
fit1, fit2, fit3, fit4, fit5 = [
    poly_linear_regression(d, df.age.values, df.wage.values)
    for d in range(1, 6)
]
Out[207]:
In [208]:
# Sequential (Type I) F-tests comparing each nested polynomial fit to the
# next; a large p-value suggests the higher-degree term is unnecessary.
sms.stats.anova_lm(fit1,fit2,fit3,fit4,fit5, typ=1)
Out[208]:
Logistic Regression with a Polynomial Basis
In [5]:
# Logistic regression for the binary event wage > 250, on the same
# degree-4 polynomial basis of age.
high_earner = Y > 250
model = sms.Logit(high_earner, X_pol)
model_log_results = model.fit()
model_log_results.summary2()
Out[5]:
In [103]:
# Predicted P(wage > 250) over the age grid; transform=False because the
# grid is already expanded into the polynomial basis columns.
prob_avg=model_log_results.predict(age_grid_pol, transform=False)
In [120]:
# Use the bootstrap to obtain pointwise uncertainty bands for P(wage > 250).
def random_logit_model():
    """Fit the logit model on one bootstrap resample of (X_pol, Y)."""
    n = X_pol.shape[0]
    seq = np.random.randint(0, n, n)  # row indices sampled with replacement
    Y_r = Y[seq]
    X_pol_r = X_pol[seq, :]
    return sms.Logit(Y_r > 250, X_pol_r).fit(disp=False)

np.random.seed(0)  # seed so the bootstrap is reproducible on re-run
N = 1000
res = np.zeros((N, age_grid_pol.shape[0]))
for i in range(N):
    model_random = random_logit_model()
    res[i, :] = model_random.predict(age_grid_pol, transform=False)
prob_std = np.std(res, axis=0)
# NOTE(review): the 5th/95th percentiles form a 90% interval, not 95% —
# use 2.5/97.5 if a 95% band is intended.
prob_upper = np.percentile(res, 95, axis=0)
prob_lower = np.percentile(res, 5, axis=0)
In [121]:
# Left: raw scatter; right: fitted probability curve with the bootstrap band.
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.plot(df.age, df.wage, 'm.')
ax2.plot(age_grid, prob_avg, 'r-')
ax2.plot(age_grid, prob_lower, 'g--')
ax2.plot(age_grid, prob_upper, 'g--')
# Overlay the first 100 bootstrap curves, faintly, to show the variability.
ax2.plot(age_grid, res[:100, :].T, alpha=0.1)
fig.suptitle("Probability of Wage>250")
Out[121]:
Step Function
In [154]:
# Step function: cut age into 4 equal-width bins and one-hot encode them.
X_c=pd.cut(df.age,4)
X_d=pd.get_dummies(X_c)
# NOTE(review): all 4 bin dummies plus a constant are perfectly collinear
# (dummy-variable trap); statsmodels still fits via the pseudo-inverse, but
# individual coefficients are not identified — consider drop_first=True.
X_d_c=sms.add_constant(X_d)
model=sms.OLS(Y,X_d_c)
result=model.fit()
result.summary2()
#X_s=df_s['age'].values
Out[154]:
In [185]:
# Encode the age grid with the same 4-bin step basis and predict.
# Fix: pd.cut requires 1-D input and age_grid is an (n, 1) column vector,
# so it must be flattened (the original only ran against stale kernel state).
X_new = pd.get_dummies(pd.cut(age_grid.ravel(), 4))
X_new = sms.add_constant(X_new)
# NOTE(review): this re-derives bin edges from the grid; they match the
# training bins only because the grid spans the same min/max ages.
y_hat = result.predict(X_new)
In [208]:
# Step-function fit (red) overlaid on the raw data.
plt.scatter(X, Y, alpha=0.1)
plt.plot(age_grid, y_hat, 'r-')
Out[208]:
Splines
In [221]:
# Cubic B-spline basis with interior knots at ages 25, 40 and 60.
X_bs = dmatrix("bs(X, knots=(25,40,60),degree=3, include_intercept=False)", {'X': X})
age_grid_bs = dmatrix("bs(X, knots=(25,40,60),degree=3, include_intercept=False)", {'X': age_grid})
# Fix: OLS.fit has no `disp` option (it was silently swallowed) — removed.
model_bs = sms.OLS(Y, X_bs).fit()
yHat_bs = model_bs.predict(age_grid_bs)
# Pointwise standard error of the fitted mean: sqrt(diag(G @ Cov(beta) @ G.T)).
yHat_bs_se = np.sqrt(np.diagonal((age_grid_bs @ model_bs.cov_params()) @ age_grid_bs.T))
plt.scatter(X, Y, alpha=0.1)
plt.plot(age_grid, yHat_bs, 'r-')
plt.plot(age_grid, yHat_bs - 2 * yHat_bs_se, 'g--')
plt.plot(age_grid, yHat_bs + 2 * yHat_bs_se, 'g--')
Out[221]:
In [220]:
# Natural cubic spline basis with 4 degrees of freedom.
X_ns = dmatrix("cr(X, df=4)", {'X': X})
age_grid_ns = dmatrix("cr(X, df=4)", {'X': age_grid})
# Fix: OLS.fit has no `disp` option (it was silently swallowed) — removed.
model_ns = sms.OLS(Y, X_ns).fit()
yHat_ns = model_ns.predict(age_grid_ns)
# Pointwise standard error of the fitted mean: sqrt(diag(G @ Cov(beta) @ G.T)).
yHat_ns_se = np.sqrt(np.diagonal((age_grid_ns @ model_ns.cov_params()) @ age_grid_ns.T))
plt.scatter(X, Y, alpha=0.1)
plt.plot(age_grid, yHat_ns, 'r-')
plt.plot(age_grid, yHat_ns - 2 * yHat_ns_se, 'g--')
plt.plot(age_grid, yHat_ns + 2 * yHat_ns_se, 'g--')
Out[220]:
Local Regression
In [4]:
# LOWESS local regression at two smoothing spans (70% and 20% of the data);
# the smaller span follows the data more closely but is noisier.
ages = X[:, 0]
yHat_local_7 = sms.nonparametric.lowess(Y, ages, it=3, frac=0.7, return_sorted=False)
yHat_local_2 = sms.nonparametric.lowess(Y, ages, it=3, frac=0.2, return_sorted=False)
plt.scatter(X, Y, alpha=0.05)
plt.plot(ages, yHat_local_7, 'r.')
plt.plot(ages, yHat_local_2, 'g.')
Out[4]:
Generalized Additive Model
In [39]:
# Additive model: natural splines in year (df=4) and age (df=5), plus
# education entering as categorical dummies.
X_gam = dmatrix("cr(year, df=4)+cr(age,5)+education", data)
# Fix: OLS.fit has no `disp` option (it was silently swallowed) — removed.
model_gam = sms.OLS(Y, X_gam).fit()
Yhat_gam = model_gam.predict(X_gam)
# Fix: seaborn >= 0.12 requires x/y as keyword arguments.
sns.lineplot(x=data['year'], y=Yhat_gam)
Out[39]:
In [32]:
# Fitted wage vs. age (fix: seaborn >= 0.12 requires keyword x/y arguments).
sns.lineplot(x=data['age'], y=Yhat_gam)
Out[32]:
In [40]:
# Fitted wage by education level (fix: seaborn >= 0.12 requires keyword x/y).
sns.boxplot(x=data['education'], y=Yhat_gam)
Out[40]:
In [ ]:
No comments:
Post a Comment