Thursday, February 25, 2021

Chapter 6 - Linear Model Selection and Regularization within Python - An Introduction to Statistical Learning

Chapter 6 - Linear Model Selection and Regularization - An Introduction to Statistical Learning
In [27]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from itertools import combinations

from sklearn.preprocessing import scale
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression, PLSSVD
from sklearn import model_selection

import matplotlib.pyplot as plt
In [2]:
hitters=pd.read_csv('data/Hitters.csv')
In [5]:
hitters.describe()
Out[5]:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI CWalks PutOuts Assists Errors Salary
count 322.000000 322.000000 322.000000 322.000000 322.000000 322.000000 322.000000 322.00000 322.000000 322.000000 322.000000 322.000000 322.000000 322.000000 322.000000 322.000000 263.000000
mean 380.928571 101.024845 10.770186 50.909938 48.027950 38.742236 7.444099 2648.68323 717.571429 69.490683 358.795031 330.118012 260.239130 288.937888 106.913043 8.040373 535.925882
std 153.404981 46.454741 8.709037 26.024095 26.166895 21.639327 4.926087 2324.20587 654.472627 86.266061 334.105886 333.219617 267.058085 280.704614 136.854876 6.368359 451.118681
min 16.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 19.00000 4.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 67.500000
25% 255.250000 64.000000 4.000000 30.250000 28.000000 22.000000 4.000000 816.75000 209.000000 14.000000 100.250000 88.750000 67.250000 109.250000 7.000000 3.000000 190.000000
50% 379.500000 96.000000 8.000000 48.000000 44.000000 35.000000 6.000000 1928.00000 508.000000 37.500000 247.000000 220.500000 170.500000 212.000000 39.500000 6.000000 425.000000
75% 512.000000 137.000000 16.000000 69.000000 64.750000 53.000000 11.000000 3924.25000 1059.250000 90.000000 526.250000 426.250000 339.250000 325.000000 166.000000 11.000000 750.000000
max 687.000000 238.000000 40.000000 130.000000 121.000000 105.000000 24.000000 14053.00000 4256.000000 548.000000 2165.000000 1659.000000 1566.000000 1378.000000 492.000000 32.000000 2460.000000
In [ ]:
sns.pairplot(hitters)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x2556256aa08>
In [190]:
sns.heatmap(hitters.corr())
hitters.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  322 non-null    object 
 1   AtBat       322 non-null    int64  
 2   Hits        322 non-null    int64  
 3   HmRun       322 non-null    int64  
 4   Runs        322 non-null    int64  
 5   RBI         322 non-null    int64  
 6   Walks       322 non-null    int64  
 7   Years       322 non-null    int64  
 8   CAtBat      322 non-null    int64  
 9   CHits       322 non-null    int64  
 10  CHmRun      322 non-null    int64  
 11  CRuns       322 non-null    int64  
 12  CRBI        322 non-null    int64  
 13  CWalks      322 non-null    int64  
 14  League      322 non-null    object 
 15  Division    322 non-null    object 
 16  PutOuts     322 non-null    int64  
 17  Assists     322 non-null    int64  
 18  Errors      322 non-null    int64  
 19  Salary      263 non-null    float64
 20  NewLeague   322 non-null    object 
dtypes: float64(1), int64(16), object(4)
memory usage: 53.0+ KB
In [3]:
data = hitters[~pd.isna(hitters.Salary)]
data = data.rename(columns={'Unnamed: 0': 'Name'})
dummies = pd.get_dummies(data[['League', 'Division', 'NewLeague']])
y = data.Salary
X_ = data.drop(['Name', 'Salary', 'League', 'Division', 'NewLeague'],
               axis=1).astype('float64')
X = pd.concat([X_, dummies[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.5,random_state=1)
In [272]:
[len(X_train),len(y_train), len(X_test),len(y_test)]
#dummies
Out[272]:
[131, 131, 132, 132]
In [11]:
data.shape
Out[11]:
(263, 21)

Subset Selection Methods

OLS

In [10]:
X_train_ols = X_train.copy()
X_test_ols = X_test.copy()
X_train_ols = sm.add_constant(X_train_ols)
X_test_ols = sm.add_constant(X_test_ols)
model_ols = sm.OLS(y_train,X_train_ols)
model_ols_fit=model_ols.fit()
mean_squared_error(y_test.values,model_ols_fit.predict(X_test_ols))
Out[10]:
116690.46856661131

Best Subset Selection

In [21]:
def reg(X, Y):
    X = X.copy()
    X = sm.add_constant(X)
    model = sm.OLS(Y, X)
    result = model.fit()
    return result.rsquared_adj
In [16]:
column_set = set(np.array(X.columns.values, dtype='str')) - {'Salary'}
column_comb = [list(combinations(columnSet, k)) for k in range(1, len(columnSet))]
num_features=len(columnSet)
In [179]:
features = {}
for k in range(0, len(column_comb)):
    rss = {}
    for cols in column_comb[k]:
        colNames = list(cols)
        x = X[colNames]
        rss[str(colNames)] = reg(x, y)
    best = max(rss, key=rss.get)
    features[best] = rss[best]
In [173]:
rss
Out[173]:
{"['CHmRun', 'CWalks']": 0.28209443655784894,
 "['CHmRun', 'CAtBat']": 0.3012563293005055,
 "['CHmRun', 'League_N']": 0.27178135459779873,
 "['CHmRun', 'HmRun']": 0.2794759215370033,
 "['CHmRun', 'PutOuts']": 0.33414637935159397,
 "['CHmRun', 'Division_W']": 0.30205541912796274,
 "['CHmRun', 'CRuns']": 0.322885781950222,
 "['CHmRun', 'Walks']": 0.34779878467397485,
 "['CHmRun', 'RBI']": 0.329148001190001,
 "['CHmRun', 'AtBat']": 0.3546562112363083,
 "['CHmRun', 'Assists']": 0.28619980333496864,
 "['CHmRun', 'NewLeague_N']": 0.2725054301266332,
 "['CHmRun', 'CRBI']": 0.31624020228939564,
 "['CHmRun', 'Years']": 0.270950144928948,
 "['CHmRun', 'Hits']": 0.390284082287933,
 "['CHmRun', 'Runs']": 0.3652653369711405,
 "['CHmRun', 'CHits']": 0.3188556322019439,
 "['CHmRun', 'Errors']": 0.2768452122881023,
 "['CWalks', 'CAtBat']": 0.27217912764827,
 "['CWalks', 'League_N']": 0.2340789082209268,
 "['CWalks', 'HmRun']": 0.29114481574046225,
 "['CWalks', 'PutOuts']": 0.308880773912391,
 "['CWalks', 'Division_W']": 0.26251854002786446,
 "['CWalks', 'CRuns']": 0.3188586002634136,
 "['CWalks', 'Walks']": 0.3015179050354999,
 "['CWalks', 'RBI']": 0.3537137450778777,
 "['CWalks', 'AtBat']": 0.34556106581350843,
 "['CWalks', 'Assists']": 0.23747007077993898,
 "['CWalks', 'NewLeague_N']": 0.23417486619889738,
 "['CWalks', 'CRBI']": 0.3172127401258553,
 "['CWalks', 'Years']": 0.23438870662229272,
 "['CWalks', 'Hits']": 0.3806140131383714,
 "['CWalks', 'Runs']": 0.3535467272595695,
 "['CWalks', 'CHits']": 0.2959310361762484,
 "['CWalks', 'Errors']": 0.23755615599715463,
 "['CAtBat', 'League_N']": 0.2712587319624711,
 "['CAtBat', 'HmRun']": 0.32653548383250475,
 "['CAtBat', 'PutOuts']": 0.3462355008195678,
 "['CAtBat', 'Division_W']": 0.30490920440776836,
 "['CAtBat', 'CRuns']": 0.3325613129878169,
 "['CAtBat', 'Walks']": 0.37042174253570437,
 "['CAtBat', 'RBI']": 0.37160996405839763,
 "['CAtBat', 'AtBat']": 0.3572403950293861,
 "['CAtBat', 'Assists']": 0.272137856358006,
 "['CAtBat', 'NewLeague_N']": 0.27125576178810606,
 "['CAtBat', 'CRBI']": 0.31797414001128144,
 "['CAtBat', 'Years']": 0.3123022563939002,
 "['CAtBat', 'Hits']": 0.38584419480845766,
 "['CAtBat', 'Runs']": 0.38396316750540693,
 "['CAtBat', 'CHits']": 0.33704828388740515,
 "['CAtBat', 'Errors']": 0.27227182014394014,
 "['League_N', 'HmRun']": 0.11498383380979571,
 "['League_N', 'PutOuts']": 0.08398126036478515,
 "['League_N', 'Division_W']": 0.029875483317689966,
 "['League_N', 'CRuns']": 0.311617254788909,
 "['League_N', 'Walks']": 0.19106626484755607,
 "['League_N', 'RBI']": 0.20087912802126362,
 "['League_N', 'AtBat']": 0.1515519291004731,
 "['League_N', 'Assists']": -0.00679424506894577,
 "['League_N', 'NewLeague_N']": -0.00713234561479692,
 "['League_N', 'CRBI']": 0.31645394971814855,
 "['League_N', 'Years']": 0.15406976196669997,
 "['League_N', 'Hits']": 0.18912947590676854,
 "['League_N', 'Runs']": 0.1754320784241694,
 "['League_N', 'CHits']": 0.29593014522761696,
 "['League_N', 'Errors']": -0.00746976811439648,
 "['HmRun', 'PutOuts']": 0.16031647350050737,
 "['HmRun', 'Division_W']": 0.14377356817660614,
 "['HmRun', 'CRuns']": 0.3535358209851682,
 "['HmRun', 'Walks']": 0.21805110270417105,
 "['HmRun', 'RBI']": 0.2012570074203066,
 "['HmRun', 'AtBat']": 0.1717058843828041,
 "['HmRun', 'Assists']": 0.11764809255205189,
 "['HmRun', 'NewLeague_N']": 0.11539350064315768,
 "['HmRun', 'CRBI']": 0.34026253544680574,
 "['HmRun', 'Years']": 0.24445460884377135,
 "['HmRun', 'Hits']": 0.20327407847124435,
 "['HmRun', 'Runs']": 0.18015042009445426,
 "['HmRun', 'CHits']": 0.3488304175168727,
 "['HmRun', 'Errors']": 0.11088536582441011,
 "['PutOuts', 'Division_W']": 0.11776261658147835,
 "['PutOuts', 'CRuns']": 0.38356443500640547,
 "['PutOuts', 'Walks']": 0.2246585374123431,
 "['PutOuts', 'RBI']": 0.2245323951507575,
 "['PutOuts', 'AtBat']": 0.18476505178969826,
 "['PutOuts', 'Assists']": 0.0847851133008769,
 "['PutOuts', 'NewLeague_N']": 0.08367010332720137,
 "['PutOuts', 'CRBI']": 0.37797504923721326,
 "['PutOuts', 'Years']": 0.25001207191416663,
 "['PutOuts', 'Hits']": 0.21784973744402736,
 "['PutOuts', 'Runs']": 0.2078297724866741,
 "['PutOuts', 'CHits']": 0.36621885814326327,
 "['PutOuts', 'Errors']": 0.08408682519963029,
 "['Division_W', 'CRuns']": 0.339236705636372,
 "['Division_W', 'Walks']": 0.21685038730156425,
 "['Division_W', 'RBI']": 0.21934357159676388,
 "['Division_W', 'AtBat']": 0.1786593796765581,
 "['Division_W', 'Assists']": 0.030145552746343562,
 "['Division_W', 'NewLeague_N']": 0.029665573144196045,
 "['Division_W', 'CRBI']": 0.34899966381821046,
 "['Division_W', 'Years']": 0.18833824419763323,
 "['Division_W', 'Hits']": 0.21091394799546315,
 "['Division_W', 'Runs']": 0.19206771505883857,
 "['Division_W', 'CHits']": 0.32842687024145023,
 "['Division_W', 'Errors']": 0.02968514766489372,
 "['CRuns', 'Walks']": 0.385920122780967,
 "['CRuns', 'RBI']": 0.3964732664409548,
 "['CRuns', 'AtBat']": 0.3842346875832007,
 "['CRuns', 'Assists']": 0.31360930360911354,
 "['CRuns', 'NewLeague_N']": 0.3116288311858887,
 "['CRuns', 'CRBI']": 0.3229314105354729,
 "['CRuns', 'Years']": 0.34868727036594604,
 "['CRuns', 'Hits']": 0.41027739308673106,
 "['CRuns', 'Runs']": 0.39873847136239826,
 "['CRuns', 'CHits']": 0.3121937518348886,
 "['CRuns', 'Errors']": 0.31364675021353705,
 "['Walks', 'RBI']": 0.24852923519925663,
 "['Walks', 'AtBat']": 0.21368463833061235,
 "['Walks', 'Assists']": 0.19125158737541115,
 "['Walks', 'NewLeague_N']": 0.19093818934945883,
 "['Walks', 'CRBI']": 0.39560236209642263,
 "['Walks', 'Years']": 0.3100640825902411,
 "['Walks', 'Hits']": 0.23957382705896524,
 "['Walks', 'Runs']": 0.21476130115886838,
 "['Walks', 'CHits']": 0.39070561041451624,
 "['Walks', 'Errors']": 0.1926113935549798,
 "['RBI', 'AtBat']": 0.19963803716451434,
 "['RBI', 'Assists']": 0.19588143622703325,
 "['RBI', 'NewLeague_N']": 0.19978824606014245,
 "['RBI', 'CRBI']": 0.37875440765789503,
 "['RBI', 'Years']": 0.3160169619092684,
 "['RBI', 'Hits']": 0.21479746997757931,
 "['RBI', 'Runs']": 0.2083704014961152,
 "['RBI', 'CHits']": 0.38800746930943997,
 "['RBI', 'Errors']": 0.20135043550467813,
 "['AtBat', 'Assists']": 0.16306519990641954,
 "['AtBat', 'NewLeague_N']": 0.15043229703193228,
 "['AtBat', 'CRBI']": 0.3930481541276224,
 "['AtBat', 'Years']": 0.307105920758842,
 "['AtBat', 'Hits']": 0.19746612606429914,
 "['AtBat', 'Runs']": 0.17146979027160458,
 "['AtBat', 'CHits']": 0.37393737626461965,
 "['AtBat', 'Errors']": 0.16956867921406027,
 "['Assists', 'NewLeague_N']": -0.007024523923139903,
 "['Assists', 'CRBI']": 0.3227700066275804,
 "['Assists', 'Years']": 0.15766678306126491,
 "['Assists', 'Hits']": 0.19915226025591537,
 "['Assists', 'Runs']": 0.17252978089602766,
 "['Assists', 'CHits']": 0.2970015833939966,
 "['Assists', 'Errors']": -0.005957660262193354,
 "['NewLeague_N', 'CRBI']": 0.3165664693667427,
 "['NewLeague_N', 'Years']": 0.1541147063217585,
 "['NewLeague_N', 'Hits']": 0.1877571036988065,
 "['NewLeague_N', 'Runs']": 0.17394018199081507,
 "['NewLeague_N', 'CHits']": 0.2959382181067932,
 "['NewLeague_N', 'Errors']": -0.0076566263829809245,
 "['CRBI', 'Years']": 0.3477419673723352,
 "['CRBI', 'Hits']": 0.4208023906703695,
 "['CRBI', 'Runs']": 0.41406315382820014,
 "['CRBI', 'CHits']": 0.31765678412934917,
 "['CRBI', 'Errors']": 0.3199045556096528,
 "['Years', 'Hits']": 0.3415173041695664,
 "['Years', 'Runs']": 0.33581483785285304,
 "['Years', 'CHits']": 0.34009066340411487,
 "['Years', 'Errors']": 0.15746098034423872,
 "['Hits', 'Runs']": 0.18867658768747064,
 "['Hits', 'CHits']": 0.39802730079060555,
 "['Hits', 'Errors']": 0.20418577627073875,
 "['Runs', 'CHits']": 0.3996074675257747,
 "['Runs', 'Errors']": 0.17773346751411623,
 "['CHits', 'Errors']": 0.29696020482379715}
In [170]:
#min(rss, key=rss.get)
In [12]:
features
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-224628ecf868> in <module>
----> 1 features

NameError: name 'features' is not defined
In [184]:
bestFS=max(features, key=features.get)
print(f' {bestFS} -> {features[bestFS]}')
 ['CWalks', 'CAtBat', 'League_N', 'PutOuts', 'Division_W', 'CRuns', 'Walks', 'AtBat', 'Assists', 'CRBI', 'Hits'] -> 0.5225705787309167
In [191]:
len(bestFS.split(','))
Out[191]:
11
In [186]:
bestFS
Out[186]:
"['CWalks', 'CAtBat', 'League_N', 'PutOuts', 'Division_W', 'CRuns', 'Walks', 'AtBat', 'Assists', 'CRBI', 'Hits']"
In [194]:
 

Forward and Backward stepwise Selection

In [55]:
fwd_features=[]
models_adj_r2={}
for k in range(1,20):    
    adj_r2={}
    for c in  column_set:
        if c in  fwd_features:
            continue
        tmp_features=fwd_features.copy()
        tmp_features.append(c)
        x = X[tmp_features]
        adj_r2[c] = reg(x, y)            
    best=max(adj_r2,key=adj_r2.get)    
    fwd_features.append(best)    
    models_adj_r2[str(fwd_features)] = adj_r2.get(best)
    
best_fwd_model=max(models_adj_r2,key=models_adj_r2.get)
best_fwd_mdl_len=len(best_fwd_model.split(','))
print(f'{best_fwd_mdl_len} {best_fwd_model} {models_adj_r2[best_fwd_model]}')
   
11 ['CRBI', 'Hits', 'PutOuts', 'Division_W', 'AtBat', 'Walks', 'CWalks', 'CRuns', 'CAtBat', 'Assists', 'League_N'] 0.522570578730917
In [54]:
bwd_features=column_set.copy()
bwd_models_adj_r2={}
for k in range(len(bwd_features),1,-1):    
    adj_r2={}
    for c in  bwd_features:
        tmp_features=bwd_features.copy()
        tmp_features.remove(c)
        x = X[tmp_features]
        adj_r2[c] = reg(x, y)        
    worst_feature=max(adj_r2,key=adj_r2.get)        
    bwd_features.remove(worst_feature)    
    bwd_models_adj_r2[str(bwd_features)] = adj_r2.get(worst_feature)
    #print(f' {k} {worst_feature} {adj_r2} {bwd_features} \n')    

best_bwd_model=max(bwd_models_adj_r2,key=bwd_models_adj_r2.get)
best_bwd_mdl_len=len(best_bwd_model.split(','))
print(f'{best_bwd_mdl_len} {best_bwd_model} {bwd_models_adj_r2[best_bwd_model]}')
11 {'League_N', 'Hits', 'Walks', 'AtBat', 'CRuns', 'CAtBat', 'Division_W', 'CRBI', 'Assists', 'CWalks', 'PutOuts'} 0.522570578730917

Ridge Regression and the Lasso

Ridge

In [209]:
grid=10**np.linspace(10,-2,num=100)
clf=lm.Ridge(grid,normalize=True)
allY=np.tile(y_train, (100,1)).T
model=clf.fit(X_train,allY)
In [210]:
# see some coeffiecients
[model.coef_[0,:],model.coef_[50,:],model.coef_[60,:],model.coef_[-1,:]]
Out[210]:
[array([ 1.31746375e-10,  4.64748588e-10,  2.07986517e-09,  7.72617506e-10,
         9.39063959e-10,  9.76921945e-10,  3.96144226e-09,  1.06053327e-11,
         3.99360518e-11,  2.95942756e-10,  8.24524707e-11,  7.79545054e-11,
         9.89438678e-11,  7.26899124e-11, -2.61588516e-12,  2.08451397e-10,
        -2.50128060e-09, -1.54995108e-08, -2.02319577e-09]),
 array([ 1.51341339e-04,  5.33930479e-04,  2.38904385e-03,  8.87576538e-04,
         1.07880122e-03,  1.12238119e-03,  4.55017300e-03,  1.21821824e-05,
         4.58759986e-05,  3.39969176e-04,  9.47160464e-05,  8.95512865e-05,
         1.13659215e-04,  8.35363060e-05, -3.00167754e-06,  2.39667092e-04,
        -2.86725960e-03, -1.78189905e-02, -2.31826282e-03]),
 array([ 2.43328132e-03,  8.59903476e-03,  3.83710012e-02,  1.42818130e-02,
         1.73618719e-02,  1.80850012e-02,  7.30452502e-02,  1.95744258e-04,
         7.37648015e-04,  5.46862512e-03,  1.52292790e-03,  1.44042823e-03,
         1.82724827e-03,  1.35172041e-03, -4.74738449e-05,  3.90508320e-03,
        -4.46201103e-02, -2.89984018e-01, -3.58413694e-02]),
 array([  -2.1195973 ,    6.09169851,   -5.97208979,   -0.68059024,
           3.38880642,    4.68750515,   -4.1991478 ,   -0.12110116,
           0.23820712,    0.99830479,    0.5881033 ,    0.41409643,
          -0.26127304,    0.42756182,    0.28988444,   -3.44469648,
          93.19296264, -117.08603969,  -56.81711903])]
In [217]:
# aim in ridge regression is to reduce coefficients to control variance 
plot(np.log10(grid),np.linalg.norm(model.coef_,ord=2,axis=1))
Out[217]:
[<matplotlib.lines.Line2D at 0x2550bd9fb48>]
In [218]:
# lets apply Ridge regression when alpha (\lambda) is 4
ridge4=lm.Ridge(alpha=4, normalize=True)
ridge4.fit(X_train,y_train)
ridge4.predict(X_test)
mean_squared_error(y_test,ridge4.predict(X_test))
Out[218]:
106216.52238005561
In [219]:
clfCV=lm.RidgeCV(grid,normalize=True,cv=10, scoring = 'neg_mean_squared_error')
ridgeCv_model=clfCV.fit(X_train,y_train)
In [220]:
ridgeCv_model.alpha_
Out[220]:
0.6579332246575682
In [221]:
bestRidge=lm.Ridge(alpha=ridgeCv_model.alpha_, normalize=True)
bestRidge.fit(X_train,y_train)
mean_squared_error(y_test,bestRidge.predict(X_test))
Out[221]:
99691.75835893235
In [222]:
np.linalg.norm(bestRidge.coef_,ord=2)
Out[222]:
90.41841836091815

Lassso

In [233]:
lasso4=lm.Lasso(alpha=4,normalize=True,max_iter=1000,tol=0.0001)
lasso4.fit(X_train,y_train)
mean_squared_error(y_test,lasso4.predict(X_test))
Out[233]:
107288.70675263616
In [234]:
np.linalg.norm(lasso4.coef_,ord=2)
Out[234]:
52.560705201079166
In [242]:
lasso_cv=lm.LassoCV(cv=10, max_iter=100000, normalize=True)
lasso_cv.fit(X_train,y_train)
lasso_cv.alpha_
Out[242]:
2.402973015740077
In [255]:
lasso4.set_params(alpha=lasso_cv.alpha_)
lasso4.fit(X_train,y_train)
mean_squared_error(y_test,lasso4.predict(X_test))
Out[255]:
104960.65853895503

PCR and PLS Regression

Principal Components Regression (PCR)

In [24]:
pca=PCA()
X_reduced=pca.fit_transform(scale(X))
np.cumsum(pca.explained_variance_ratio_)
Out[24]:
array([0.3831424 , 0.60155315, 0.70841675, 0.79034194, 0.84290275,
       0.88634779, 0.92262888, 0.94963043, 0.96282691, 0.97255413,
       0.97977754, 0.986487  , 0.99151787, 0.99473033, 0.99746591,
       0.99893988, 0.99968159, 0.99993751, 1.        ])
In [39]:
n=len(X_reduced)
kf_10=model_selection.KFold(n_splits=10,shuffle=True, random_state=1)

regr=LinearRegression()
mse=[]

# Calculate MSE with only the intercept (no principal components in regression)
#score = -1*model_selection.cross_val_score(regr, np.ones((n,1)), y.ravel(), cv=kf_10, scoring='neg_mean_squared_error').mean()    
#mse.append(score)

for i in np.arange(1, 20):
    score = -1*model_selection.cross_val_score(regr, X_reduced[:,:i], y.ravel(), cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(score)
# Plot results    
plt.plot(mse, '-v')
plt.xlabel('Number of principal components in regression')
plt.ylabel('MSE')
plt.title('Salary')
plt.xlim(xmin=-1);
In [51]:
pca2 = PCA()
X_reduced_train = pca2.fit_transform(scale(X_train))
X_reduced_test = pca2.transform(scale(X_test))[:,:7]

# Train regression model on training data 
regr = LinearRegression()
regr.fit(X_reduced_train[:,:7], y_train)

# Prediction with test data
pred = regr.predict(X_reduced_test)
mean_squared_error(y_test, pred)
Out[51]:
111994.42273636989

Partial Least Squares(PLS)

In [49]:
n = len(X_train)

# 10-fold CV, with shuffle
kf_10 = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)

mse = []

for i in np.arange(1, 20):
    pls = PLSRegression(n_components=i)
    score = model_selection.cross_val_score(pls, scale(X_train), y_train, cv=kf_10, scoring='neg_mean_squared_error').mean()
    mse.append(-score)

# Plot results
plt.plot(np.arange(1, 20), np.array(mse), '-v')
plt.xlabel('Number of principal components in regression')
plt.ylabel('MSE')
plt.title('Salary')
plt.xlim(xmin=-1)
Out[49]:
(-1.0, 19.9)
In [50]:
pls = PLSRegression(n_components=2)
pls.fit(scale(X_train), y_train)

mean_squared_error(y_test, pls.predict(scale(X_test)))
Out[50]:
104838.51042760804

No comments: