Thursday, February 25, 2021

Chapter 8 Tree Based Methods within Python - An Introduction to Statistical Learning

Chapter 8 Tree Based Methods - An Introduction to Statistical Learning
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
import patsy as pt

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, BaggingClassifier,RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from catboost import CatBoostRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from six import StringIO

import matplotlib.pyplot as plt
import graphviz 
from mpl_toolkits.mplot3d import Axes3D

import pydotplus 
from IPython.display import HTML
from IPython.display import Image
import catboost

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
In [2]:
# This function creates images of tree models using pydot
def print_tree(estimator, features, clazz_names=None, filled=True):    
    dot_data = StringIO()
    tree.export_graphviz(estimator, out_file=dot_data, feature_names=features, class_names=clazz_names, filled=filled)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())    
    return graph
In [3]:
data=pd.read_csv('data/Carseats.csv',index_col=[0])
#data.describe()
data['High']=(data['Sales']>8).astype(np.float64)
frml='High~ 0 + '+' + '.join(data.columns.difference(["High","Sales"]))
print(frml)
y,X=pt.dmatrices(frml,data)
High~ 0 + Advertising + Age + CompPrice + Education + Income + Population + Price + ShelveLoc + US + Urban
In [4]:
clf=DecisionTreeClassifier(max_depth=4)
clf=clf.fit(X,y)
In [7]:
data.ShelveLoc
Out[7]:
1         Bad
2        Good
3      Medium
4      Medium
5         Bad
        ...  
396      Good
397    Medium
398    Medium
399       Bad
400      Good
Name: ShelveLoc, Length: 400, dtype: object
In [9]:
# Visualise the tree with GraphViz
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=X.design_info.column_names,
                                class_names=['Low', 'High'],  
                                filled=True, rounded=True)


graph = graphviz.Source(dot_data) 
# grap will be exported to pdf 
graph.render("decisionTree",view=True)
#graph.size="3010,2010"
#svg=graph._repr_svg_()

#display(HTML(svg))
Out[9]:
'decisionTree.pdf'
In [10]:
list(enumerate(X.design_info.column_names))
Out[10]:
[(0, 'ShelveLoc[Bad]'),
 (1, 'ShelveLoc[Good]'),
 (2, 'ShelveLoc[Medium]'),
 (3, 'US[T.Yes]'),
 (4, 'Urban[T.Yes]'),
 (5, 'Advertising'),
 (6, 'Age'),
 (7, 'CompPrice'),
 (8, 'Education'),
 (9, 'Income'),
 (10, 'Population'),
 (11, 'Price')]
In [4]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.5,random_state=1)
In [12]:
clf1=DecisionTreeClassifier(max_depth=4)
clf1=clf1.fit(X_train,y_train)
In [13]:
yHat=clf1.predict(X_test)
cm=confusion_matrix(y_test,yHat,labels=[0,1])
sum(np.diag(cm))/cm.reshape(-1).sum()
Out[13]:
0.71
In [14]:
clf2=DecisionTreeClassifier(max_depth=4)
clf2.cost_complexity_pruning_path(X_train,y_train)
Out[14]:
{'ccp_alphas': array([0.        , 0.00604167, 0.00875   , 0.00943214, 0.01157051,
        0.01320714, 0.0207712 , 0.04225   , 0.04233753, 0.05415078,
        0.0648    ]),
 'impurities': array([0.18542616, 0.1975095 , 0.2062595 , 0.21569163, 0.22726215,
        0.24046929, 0.28201169, 0.32426169, 0.36659922, 0.42075   ,
        0.48555   ])}
In [15]:
print(clf1.tree_.node_count)
clf1.feature_importances_
fi=pd.DataFrame({'Feature':X.design_info.column_names,'importance':clf1.feature_importances_})

ax=sns.barplot(y='Feature',x='importance',data=fi.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
25
Out[15]:
(array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 ]),
 <a list of 9 Text major ticklabel objects>)
In [6]:
boston_df=pd.read_csv('data/Boston.csv')
boston_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB
In [7]:
boston_df.head()
frml_boston='medv~ 0 + '+' + '.join(boston_df.columns.difference(["medv"]))
print(frml_boston)
y_boston,X_boston=pt.dmatrices(frml_boston,boston_df)
medv~ 0 + age + black + chas + crim + dis + indus + lstat + nox + ptratio + rad + rm + tax + zn
In [8]:
X_b_train,X_b_test,y_b_train,y_b_test=train_test_split(X_boston,y_boston, test_size=0.5,random_state=12)
In [125]:
tree_reg=DecisionTreeRegressor(max_depth=5) #depth 5 is best according to CV (see below)
tree_reg.fit(X_b_train,y_b_train)
Out[125]:
DecisionTreeRegressor(max_depth=5)
In [126]:
graph=print_tree(tree_reg,features=X_boston.design_info.column_names)
Image(graph.create_png())
Out[126]:
In [128]:
print(tree_reg.get_depth())
print(tree_reg.get_n_leaves())
print(tree_reg.feature_importances_)
fiR=pd.DataFrame({'Feature':X_boston.design_info.column_names,'importance':tree_reg.feature_importances_})

ax=sns.barplot(y='Feature',x='importance',data=fiR.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
5
28
[0.00354772 0.00218041 0.         0.02438228 0.08260226 0.
 0.60005043 0.01196751 0.01223328 0.         0.25128143 0.01175468
 0.        ]
Out[128]:
(array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]),
 <a list of 8 Text major ticklabel objects>)
In [100]:
y_b_test[:,0]
Out[100]:
array([20.7, 12.7,  8.5, 25.1, 28.2, 22.5, 18.2, 43.5, 36.1, 23.8, 22.6,
       22.6, 22. , 22.9, 35.4, 50. , 17.8, 24.1, 21.7, 20.6, 26.7, 19.7,
       21.2, 13.4, 23.1, 18.8, 20.9, 11.7, 21.6, 37.6, 26.5, 26.6, 23.2,
       31.6, 50. ,  7.2, 18.9, 50. , 15.4, 21.6, 22.5, 25.3, 21.7, 34.9,
       20.8, 17.5, 21.8, 27.5, 50. , 21.7, 23.3, 21.5, 10.2, 23.1, 12.1,
       18.1, 11.9, 21.7, 27.5, 13.1, 16.7, 32.7, 23.2, 19.8, 19.6, 22.2,
       22.3, 13.5, 18.9, 50. , 14. ,  7.5, 20.6, 18.5, 19.6, 23.7, 21.1,
       10.5, 21. , 32.9, 15. , 19. , 31.5, 19.1, 11.3, 23.1, 26.4, 21.9,
       23.8, 21. , 23.8, 18. , 31.1, 14.4, 37. , 21.2, 24.4, 14.9, 31. ,
       14.3, 32. , 21.8, 30.7, 22.4, 20.3, 10.2, 12. , 18.5, 25. , 29.8,
       35.1, 23.2, 23.9, 13.9, 24. , 21.9, 50. , 16.1, 30.8, 48.8, 33.4,
       17.8, 24.4, 46.7, 29. , 24.1, 22.2, 17.2, 33. , 18.4, 22.9, 19.9,
        8.4, 20.5, 15.6, 24.6,  7.4, 20.5, 42.3, 17.9, 15.3, 22.3, 19.7,
       19.1, 14.1, 19.4, 17.2, 15.6, 18.2, 21.9, 50. , 20.9, 24.8, 19.9,
       17.1, 22.1, 11.9, 24.7, 31.7, 17.6, 22.6, 36. , 20.6, 16.2, 17.8,
       21.4, 21.2, 10.2, 13.8, 26.2,  8.3, 39.8, 24.4, 26.6, 11.8, 22.8,
       14.6, 19.5, 10.9, 12.5, 22. , 12.3,  7.2, 24.3, 41.3, 23.9, 23.2,
       19.6, 23.9, 21. , 19.3, 21.4, 27.1, 19.5, 16.6, 23.1, 21.4, 24.8,
       25. , 28.7, 17.8, 22. , 25. , 28.5, 15.7, 22.5, 16.2, 16.4, 33.4,
       24.5,  8.3, 13.4, 22.2, 16.5, 50. , 19. , 16.5, 24.7, 22. , 28.4,
       20.5, 28. , 33.2, 15.4, 20.1, 20.2, 13.6, 29.6,  7. , 36.2, 14.5,
        5. , 22.9, 13.5, 14.5, 10.4, 27.5, 16. , 14.5, 45.4, 15. , 22.9,
       50. , 16.6, 20.8, 25. , 30.1, 20.2, 10.8, 17.7, 22.8, 33.1, 34.9])
In [127]:
yhat_b=tree_reg.predict(X_b_test)
#mse=np.mean(np.power((y_b_test[:,0]-yhat_b),2))
mse=mean_squared_error(y_b_test,yhat_b)
print(f"mse:{mse}")
mse:17.28281623263508
In [129]:
# pruning is useless in skitlearn
tree_reg_p=DecisionTreeRegressor(random_state=0)
path=tree_reg_p.cost_complexity_pruning_path(X_b_train,y_b_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

clfs = []
mse=[]
for ccp_alpha in ccp_alphas[:-1]:
    clf = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    clf=clf.fit(X_b_train, y_b_train)
    y_bHat_p=clf.predict(X_b_test)
    avg=mean_squared_error(y_b_test,yhat_b)
    mse.append(avg)
    clfs.append(clf)

best_model=np.argmin(mse)

print(len(ccp_alphas))
print(f'best_model:{best_model}')
print(mse[best_model])
best_tree=clfs[best_model]
print(best_tree)
print(best_tree.get_depth())
print(best_tree.get_n_leaves())
print(best_tree.feature_importances_)
fiR=pd.DataFrame({'Feature':X_boston.design_info.column_names,'importance':best_tree.feature_importances_})

ax=sns.barplot(y='Feature',x='importance',data=fiR.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
230
best_model:0
17.28281623263508
DecisionTreeRegressor(random_state=0)
17
239
[4.14466439e-03 1.05996608e-02 8.61767351e-04 2.67701359e-02
 8.67952377e-02 3.67307860e-03 5.74911142e-01 1.27558465e-02
 1.32331503e-02 4.58760392e-04 2.50289247e-01 1.54026806e-02
 1.04628789e-04]
Out[129]:
(array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]),
 <a list of 8 Text major ticklabel objects>)
In [109]:
graph=print_tree(best_tree,features=X_boston.design_info.column_names)
Image(graph.create_png())
Out[109]:
In [124]:
# find best depth
param_grid = {'max_depth':[1,2,3,4,5,6,7,8,9,10]}
reg_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, 
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, random_state=1,
           splitter='best')

grid_clf=GridSearchCV(cv=5, error_score='raise',
       estimator=reg_estimator,
        n_jobs=1,
       param_grid=param_grid,
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
grid_clf.fit(X_b_train, y_b_train)
Out[124]:
GridSearchCV(cv=5, error_score='raise',
             estimator=DecisionTreeRegressor(random_state=1), n_jobs=1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             return_train_score=True)
In [123]:
grid_clf.best_params_
Out[123]:
{'max_depth': 5}
In [122]:
grid_clf.best_params_
Out[122]:
{'max_depth': 5}

Bagging and Random Forest

In [175]:
bag_estimator=DecisionTreeRegressor(random_state=0)
bagging=BaggingRegressor(bag_estimator,random_state=0, n_estimators=500, n_jobs=4)
bagging.fit(X_b_train, y_b_train.ravel())
Out[175]:
BaggingRegressor(base_estimator=DecisionTreeRegressor(random_state=0),
                 n_estimators=500, n_jobs=4, random_state=0)
In [169]:
print(bagging.n_estimators)
print(bagging.n_features_)
print(bagging.n_jobs)
500
13
4
In [176]:
bagging_yHat=bagging.predict(X_b_test)
mse_bagging=mean_squared_error(y_b_test,bagging_yHat)
print(f'mse_bagging:{mse_bagging}')
mse_bagging:12.451899292490118
In [217]:
rf_regr=RandomForestRegressor(n_estimators=500,max_features=6,  n_jobs=4, random_state=0,verbose=True)
rf_regr.fit(X_b_train,y_b_train.ravel())
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.5s finished
Out[217]:
RandomForestRegressor(max_features=6, n_estimators=500, n_jobs=4,
                      random_state=0, verbose=True)
In [218]:
rf_yHat=rf_regr.predict(X_b_test)
mse_rf=mean_squared_error(y_b_test,rf_yHat)
print(f'mse_rf:{mse_rf}')
mse_rf:11.067864989723333
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.0s finished
In [219]:
importance_df=pd.DataFrame({'Feature':X_boston.design_info.column_names,'Importance':rf_regr.feature_importances_})
sorted_rf_df=importance_df.set_index('Feature').sort_values(by='Importance', ascending=True)
sorted_rf_df.plot(kind='barh')
plt.xlabel('Feature Importance')
Out[219]:
Text(0.5, 0, 'Feature Importance')
In [223]:
ada_boost_regr=AdaBoostRegressor(n_estimators=5000, random_state=0)
ada_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[223]:
AdaBoostRegressor(n_estimators=5000, random_state=0)
In [224]:
ada_boost_yHat=ada_boost_regr.predict(X_b_test)
mse_ada_boost=mean_squared_error(y_b_test,ada_boost_yHat)
print(f'mse_ada_boost:{mse_ada_boost}')
mse_ada_boost:15.515534288829329
In [280]:
grad_boost_regr=GradientBoostingRegressor(n_estimators=5000,                                           
                                          learning_rate=0.01, random_state=0)
grad_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[280]:
GradientBoostingRegressor(learning_rate=0.01, n_estimators=5000, random_state=0)
In [290]:
grad_boost_yHat=grad_boost_regr.predict(X_b_test)
mse_grad_boost=mean_squared_error(y_b_test,grad_boost_yHat)
print(f'mse_grad_boost:{mse_grad_boost}')
plt.plot(range(len(grad_boost_regr.train_score_)),grad_boost_regr.train_score_)
mse_grad_boost:11.609469986324811
Out[290]:
[<matplotlib.lines.Line2D at 0x227a895cd88>]
In [278]:
importance_grad_df=pd.DataFrame({'Importance':grad_boost_regr.feature_importances_}, index=X_boston.design_info.column_names)
sorted_grad_df=importance_grad_df.sort_values(by='Importance', ascending=True)
sorted_grad_df.plot(kind='barh')
plt.xlabel('Graident Boost Feature Importance')
Out[278]:
Text(0.5, 0, 'Graident Boost Feature Importance')
In [17]:
cat_boost_regr=CatBoostRegressor(iterations=50000,learning_rate=0.01,random_state=0, depth=4, thread_count=3)
pool = catboost.Pool(X_b_train,y_b_train.ravel(), feature_names=X_boston.design_info.column_names)
cat_boost_regr.fit(pool, verbose=False)
Out[17]:
<catboost.core.CatBoostRegressor at 0x1e87902f808>
In [13]:
print(cat_boost_regr.is_fitted())
cat_boost_yHat=cat_boost_regr.predict(X_b_test)
mse_cat_boost=mean_squared_error(y_b_test,cat_boost_yHat)
print(f'mse_cat_boost:{mse_cat_boost}')
pr
True
mse_cat_boost:11.614984306900931
In [325]:
importance_cat_df=pd.DataFrame({'Importance':cat_boost_regr.feature_importances_}, index=X_boston.design_info.column_names)
sorted_cat_df=importance_cat_df.sort_values(by='Importance', ascending=True)
sorted_cat_df.plot(kind='barh')
plt.xlabel('CatBoost Feature Importance')
Out[325]:
Text(0.5, 0, 'CatBoost Feature Importance')
In [326]:
cat_boost_regr.plot_tree(0, pool=pool)
Out[326]:
%3 0 lstat, value>10.33 1 crim, value>3.75753 0->1 No 2 crim, value>3.75753 0->2 Yes 3 lstat, value>12.795 1->3 No 4 lstat, value>12.795 1->4 Yes 5 lstat, value>12.795 2->5 No 6 lstat, value>12.795 2->6 Yes 7 rm, value>7.06 3->7 No 8 rm, value>7.06 3->8 Yes 9 rm, value>7.06 4->9 No 10 rm, value>7.06 4->10 Yes 11 rm, value>7.06 5->11 No 12 rm, value>7.06 5->12 Yes 13 rm, value>7.06 6->13 No 14 rm, value>7.06 6->14 Yes 15 val = 0.024 7->15 No 16 val = 0.156 7->16 Yes 17 val = 0.000 8->17 No 18 val = 0.000 8->18 Yes 19 val = 0.120 9->19 No 20 val = 0.000 9->20 Yes 21 val = 0.000 10->21 No 22 val = 0.000 10->22 Yes 23 val = -0.017 11->23 No 24 val = 0.000 11->24 Yes 25 val = -0.040 12->25 No 26 val = 0.000 12->26 Yes 27 val = 0.008 13->27 No 28 val = 0.000 13->28 Yes 29 val = -0.091 14->29 No 30 val = 0.000 14->30 Yes
In [352]:
cat_boost_regr.get_leaf_values().shape[0]
Out[352]:
79808
In [353]:
cat_boost_regr.tree_count_
Out[353]:
5000
In [387]:
# let see if we can produce prediction results manually ..?
test_x=X_b_test[0,:]
tree_start_leaf_index=np.cumsum(cat_boost_regr.get_tree_leaf_counts())
tree_start_leaf_index=np.insert(tree_start_leaf_index,0,0)
tree_leaf_index_0=cat_boost_regr.calc_leaf_indexes(test_x).ravel()
leaf_indexes=tree_start_leaf_index[:-1]+tree_leaf_index_0

leaf_values=cat_boost_regr.get_leaf_values()
leaf_weights=cat_boost_regr.get_leaf_weights()

values=leaf_values[leaf_indexes]
weights=leaf_weights[leaf_indexes]
sum(values*weights)
Out[387]:
46.99058082896288
In [388]:
cat_boost_regr.predict(test_x) ## we could not produce same result 
Out[388]:
23.299944998586998
In [393]:
xgb_boost_regr=XGBRegressor(n_estimators=5000,learning_rate=0.01, random_state=0)
xgb_boost_regr.fit(X_b_train,y_b_train.ravel())
[17:13:13] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[393]:
XGBRegressor(learning_rate=0.01, n_estimators=5000)
In [392]:
xgb_boost_yHat=xgb_boost_regr.predict(X_b_test)
mse_xgb_boost=mean_squared_error(y_b_test,xgb_boost_yHat)
print(f'mse_xgb_boost:{mse_xgb_boost}')
mse_xgb_boost:12.784486572339242
In [401]:
xgbrf_boost_regr=XGBRFRegressor(n_estimators=10000,learning_rate=0.01, random_state=0)
xgbrf_boost_regr.fit(X_b_train,y_b_train.ravel())
[17:20:26] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[401]:
XGBRFRegressor(learning_rate=0.01, n_estimators=10000)
In [402]:
xgbrf_boost_yHat=xgbrf_boost_regr.predict(X_b_test)
mse_xgbrf_boost=mean_squared_error(y_b_test,xgbrf_boost_yHat)
print(f'mse_xgbrf_boost:{mse_xgbrf_boost}')
mse_xgbrf_boost:559.8321126176156
In [ ]:
 

No comments: