In [27]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from sklearn.preprocessing import scale
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn import model_selection
import matplotlib.pyplot as plt
In [2]:
hitters=pd.read_csv('data/Hitters.csv')
In [5]:
hitters.describe()
Out[5]:
In [ ]:
sns.pairplot(hitters)
Out[ ]:
In [190]:
sns.heatmap(hitters.corr())
hitters.info()
In [3]:
data = hitters[~pd.isna(hitters.Salary)]
data = data.rename(columns={'Unnamed: 0': 'Name'})
dummies = pd.get_dummies(data[['League', 'Division', 'NewLeague']])
y = data.Salary
X_ = data.drop(['Name', 'Salary', 'League', 'Division', 'NewLeague'],
axis=1).astype('float64')
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.5,random_state=1)
In [272]:
[len(X_train),len(y_train), len(X_test),len(y_test)]
#dummies
Out[272]:
In [11]:
data.shape
Out[11]:
Subset Selection Methods
OLS
In [10]:
X_train_ols = X_train.copy()
X_test_ols = X_test.copy()
X_train_ols = sm.add_constant(X_train_ols)
X_test_ols = sm.add_constant(X_test_ols)
model_ols = sm.OLS(y_train,X_train_ols)
model_ols_fit=model_ols.fit()
mean_squared_error(y_test.values,model_ols_fit.predict(X_test_ols))
Out[10]:
Best Subset Selection
In [21]:
def reg(X, Y):
    # fit OLS with an intercept and return the adjusted R^2
    X = X.copy()
    X = sm.add_constant(X)
    model = sm.OLS(Y, X)
    result = model.fit()
    return result.rsquared_adj
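The helper returns the adjusted R^2 of an OLS fit with an intercept. As a quick usage check (a sketch, reusing the X and y built earlier), for a single predictor:
In [ ]:
# adjusted R^2 of Salary regressed on Hits alone
reg(X[['Hits']], y)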
In [16]:
column_set = set(np.array(X.columns.values, dtype='str')) - {'Salary'}
column_comb = [list(combinations(column_set, k)) for k in range(1, len(column_set))]
num_features = len(column_set)
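Before running the exhaustive search it helps to know its size: best subset selection fits one regression per candidate subset, so the number of fits grows as roughly 2^p for p predictors. A quick count, using column_comb from the cell above (the empty and full subsets are excluded by the range used there):
In [ ]:
# total number of candidate models the exhaustive search below will fit
sum(len(combs) for combs in column_comb)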
In [179]:
# exhaustive search: for each subset size, keep the subset with the best adjusted R^2
# (note: despite the name, rss stores the adjusted R^2 values returned by reg())
features = {}
for k in range(0, len(column_comb)):
    rss = {}
    for cols in column_comb[k]:
        colNames = list(cols)
        x = X[colNames]
        rss[str(colNames)] = reg(x, y)
    best = max(rss, key=rss.get)
    features[best] = rss[best]
In [173]:
rss
Out[173]:
In [170]:
#min(rss, key=rss.get)
In [12]:
features
In [184]:
bestFS=max(features, key=features.get)
print(f' {bestFS} -> {features[bestFS]}')
In [191]:
len(bestFS.split(','))
Out[191]:
In [186]:
bestFS
Out[186]:
Forward and Backward Stepwise Selection
In [55]:
# forward stepwise: at each step add the single feature that most improves adjusted R^2
fwd_features = []
models_adj_r2 = {}
for k in range(1, 20):
    adj_r2 = {}
    for c in column_set:
        if c in fwd_features:
            continue
        tmp_features = fwd_features.copy()
        tmp_features.append(c)
        x = X[tmp_features]
        adj_r2[c] = reg(x, y)
    best = max(adj_r2, key=adj_r2.get)
    fwd_features.append(best)
    models_adj_r2[str(fwd_features)] = adj_r2.get(best)
best_fwd_model = max(models_adj_r2, key=models_adj_r2.get)
best_fwd_mdl_len = len(best_fwd_model.split(','))
print(f'{best_fwd_mdl_len} {best_fwd_model} {models_adj_r2[best_fwd_model]}')
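To see how the forward-selected model generalizes, one option is to refit it on the training half and score it on the held-out half. A minimal sketch, assuming fwd_features (in the order features were added) and the train/test split from earlier cells:
In [ ]:
# refit OLS on the first k forward-selected features and compute the test MSE
k = best_fwd_mdl_len                  # size of the best forward model
sel = fwd_features[:k]                # features in the order they were added
ols_fwd = sm.OLS(y_train, sm.add_constant(X_train[sel])).fit()
mean_squared_error(y_test, ols_fwd.predict(sm.add_constant(X_test[sel])))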
In [54]:
# backward stepwise: start from all features and drop the one whose removal
# leaves the highest adjusted R^2; use a list so X[...] column indexing works
bwd_features = list(column_set)
bwd_models_adj_r2 = {}
for k in range(len(bwd_features), 1, -1):
    adj_r2 = {}
    for c in bwd_features:
        tmp_features = bwd_features.copy()
        tmp_features.remove(c)
        x = X[tmp_features]
        adj_r2[c] = reg(x, y)
    worst_feature = max(adj_r2, key=adj_r2.get)
    bwd_features.remove(worst_feature)
    bwd_models_adj_r2[str(bwd_features)] = adj_r2.get(worst_feature)
    #print(f' {k} {worst_feature} {adj_r2} {bwd_features} \n')
best_bwd_model = max(bwd_models_adj_r2, key=bwd_models_adj_r2.get)
best_bwd_mdl_len = len(best_bwd_model.split(','))
print(f'{best_bwd_mdl_len} {best_bwd_model} {bwd_models_adj_r2[best_bwd_model]}')
Ridge Regression and the Lasso
Ridge
In [209]:
grid = 10**np.linspace(10, -2, num=100)
# Ridge accepts one alpha per target, so stacking 100 copies of y_train as targets
# fits the model across the whole alpha grid in a single call
clf = lm.Ridge(grid, normalize=True)
allY = np.tile(y_train, (100, 1)).T
model = clf.fit(X_train, allY)
In [210]:
# see some coefficients along the alpha grid
[model.coef_[0,:],model.coef_[50,:],model.coef_[60,:],model.coef_[-1,:]]
Out[210]:
In [217]:
# ridge shrinks the coefficients toward zero as the penalty grows, trading some bias for lower variance
plt.plot(np.log10(grid), np.linalg.norm(model.coef_, ord=2, axis=1))
Out[217]:
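The norm plot summarizes the overall shrinkage; the individual coefficient paths are also worth a look. A sketch, assuming model.coef_ (one row per alpha) from the multi-target fit above:
In [ ]:
# one line per predictor: how each ridge coefficient shrinks as alpha grows
plt.plot(np.log10(grid), model.coef_)
plt.xlabel('log10(alpha)')
plt.ylabel('Coefficient value')
plt.title('Ridge coefficient paths');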
In [218]:
# let's apply ridge regression with alpha (lambda) = 4
ridge4=lm.Ridge(alpha=4, normalize=True)
ridge4.fit(X_train,y_train)
ridge4.predict(X_test)
mean_squared_error(y_test,ridge4.predict(X_test))
Out[218]:
In [219]:
clfCV=lm.RidgeCV(grid,normalize=True,cv=10, scoring = 'neg_mean_squared_error')
ridgeCv_model=clfCV.fit(X_train,y_train)
In [220]:
ridgeCv_model.alpha_
Out[220]:
In [221]:
bestRidge=lm.Ridge(alpha=ridgeCv_model.alpha_, normalize=True)
bestRidge.fit(X_train,y_train)
mean_squared_error(y_test,bestRidge.predict(X_test))
Out[221]:
In [222]:
np.linalg.norm(bestRidge.coef_,ord=2)
Out[222]:
Lasso
In [233]:
lasso4=lm.Lasso(alpha=4,normalize=True,max_iter=1000,tol=0.0001)
lasso4.fit(X_train,y_train)
mean_squared_error(y_test,lasso4.predict(X_test))
Out[233]:
In [234]:
np.linalg.norm(lasso4.coef_,ord=2)
Out[234]:
In [242]:
lasso_cv=lm.LassoCV(cv=10, max_iter=100000, normalize=True)
lasso_cv.fit(X_train,y_train)
lasso_cv.alpha_
Out[242]:
In [255]:
lasso4.set_params(alpha=lasso_cv.alpha_)
lasso4.fit(X_train,y_train)
mean_squared_error(y_test,lasso4.predict(X_test))
Out[255]:
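Unlike ridge, the lasso sets some coefficients exactly to zero, so it is worth checking which predictors survive at the CV-chosen alpha. A sketch, using the refit lasso4 from the cell above:
In [ ]:
# coefficients at the CV-chosen alpha; zeros mark predictors dropped by the lasso
pd.Series(lasso4.coef_, index=X.columns)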
PCR and PLS Regression
Principal Components Regression (PCR)
In [24]:
pca=PCA()
X_reduced=pca.fit_transform(scale(X))
np.cumsum(pca.explained_variance_ratio_)
Out[24]:
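The cumulative ratios are easier to read as a plot. A sketch of the usual cumulative-variance view, using pca from the cell above:
In [ ]:
# cumulative proportion of variance explained by the first k principal components
plt.plot(np.arange(1, len(pca.explained_variance_ratio_) + 1),
         np.cumsum(pca.explained_variance_ratio_), '-o')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance ratio');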
In [39]:
n=len(X_reduced)
kf_10=model_selection.KFold(n_splits=10,shuffle=True, random_state=1)
regr=LinearRegression()
mse=[]
# Calculate MSE with only the intercept (no principal components in regression)
#score = -1*model_selection.cross_val_score(regr, np.ones((n,1)), y.ravel(), cv=kf_10, scoring='neg_mean_squared_error').mean()
#mse.append(score)
for i in np.arange(1, 20):
    score = -1*model_selection.cross_val_score(regr, X_reduced[:,:i], y.ravel(), cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)
# Plot results
plt.plot(np.arange(1, 20), mse, '-v')
plt.xlabel('Number of principal components in regression')
plt.ylabel('MSE')
plt.title('Salary')
plt.xlim(left=0);
In [51]:
pca2 = PCA()
X_reduced_train = pca2.fit_transform(scale(X_train))
X_reduced_test = pca2.transform(scale(X_test))[:,:7]
# Train regression model on training data
regr = LinearRegression()
regr.fit(X_reduced_train[:,:7], y_train)
# Prediction with test data
pred = regr.predict(X_reduced_test)
mean_squared_error(y_test, pred)
Out[51]:
Partial Least Squares (PLS)
In [49]:
n = len(X_train)
# 10-fold CV, with shuffle
kf_10 = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
mse = []
for i in np.arange(1, 20):
    pls = PLSRegression(n_components=i)
    score = model_selection.cross_val_score(pls, scale(X_train), y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(-score)
# Plot results
plt.plot(np.arange(1, 20), np.array(mse), '-v')
plt.xlabel('Number of PLS components in regression')
plt.ylabel('MSE')
plt.title('Salary')
plt.xlim(left=0)
Out[49]:
In [50]:
pls = PLSRegression(n_components=2)
pls.fit(scale(X_train), y_train)
mean_squared_error(y_test, pls.predict(scale(X_test)))
Out[50]:
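For a quick side-by-side comparison, the test-set MSEs of the models fitted above can be collected in one place. A sketch, assuming the earlier cells were run in order so the fitted objects are still in memory:
In [ ]:
# test-set MSE for each of the fitted models above
pd.Series({
    'OLS (all predictors)': mean_squared_error(y_test, model_ols_fit.predict(X_test_ols)),
    'Ridge (CV alpha)': mean_squared_error(y_test, bestRidge.predict(X_test)),
    'Lasso (CV alpha)': mean_squared_error(y_test, lasso4.predict(X_test)),
    'PCR (7 components)': mean_squared_error(y_test, regr.predict(X_reduced_test)),
    'PLS (2 components)': mean_squared_error(y_test, pls.predict(scale(X_test))),
})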