In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
import patsy as pt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from six import StringIO
import matplotlib.pyplot as plt
import graphviz
from mpl_toolkits.mplot3d import Axes3D
import pydotplus
from IPython.display import HTML
from IPython.display import Image
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
In [2]:
# This function creates images of tree models using pydot
def print_tree(estimator, features, clazz_names=None, filled=True):
    dot_data = StringIO()
    tree.export_graphviz(estimator, out_file=dot_data, feature_names=features,
                         class_names=clazz_names, filled=filled)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return graph
In [3]:
data=pd.read_csv('data/Carseats.csv',index_col=[0])
#data.describe()
data['High']=(data['Sales']>8).astype(np.float64)
frml='High~ 0 + '+' + '.join(data.columns.difference(["High","Sales"]))
print(frml)
y,X=pt.dmatrices(frml,data)
In [4]:
clf=DecisionTreeClassifier(max_depth=4)
clf=clf.fit(X,y)
In [7]:
data.ShelveLoc
Out[7]:
In [9]:
# Visualise the tree with GraphViz
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=X.design_info.column_names,
                                class_names=['Low', 'High'],
                                filled=True, rounded=True)
graph = graphviz.Source(dot_data)
# the graph will be exported to PDF and opened
graph.render("decisionTree", view=True)
#graph.size="3010,2010"
#svg=graph._repr_svg_()
#display(HTML(svg))
Out[9]:
In [10]:
list(enumerate(X.design_info.column_names))
Out[10]:
In [4]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.5,random_state=1)
In [12]:
clf1=DecisionTreeClassifier(max_depth=4)
clf1=clf1.fit(X_train,y_train)
In [13]:
yHat=clf1.predict(X_test)
cm=confusion_matrix(y_test,yHat,labels=[0,1])
sum(np.diag(cm))/cm.reshape(-1).sum()
Out[13]:
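The ratio above, the trace of the confusion matrix over its total count, is simply the test-set accuracy. A minimal sanity-check sketch (my addition, not part of the original run) using sklearn's accuracy_score:
In [ ]:
from sklearn.metrics import accuracy_score
# should match the diagonal-sum ratio computed above
accuracy_score(y_test.ravel(), yHat)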
In [14]:
clf2=DecisionTreeClassifier(max_depth=4)
clf2.cost_complexity_pruning_path(X_train,y_train)
Out[14]:
In [15]:
print(clf1.tree_.node_count)
clf1.feature_importances_
fi=pd.DataFrame({'Feature':X.design_info.column_names,'importance':clf1.feature_importances_})
ax=sns.barplot(y='Feature',x='importance',data=fi.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
Out[15]:
In [6]:
boston_df=pd.read_csv('data/Boston.csv')
boston_df.info()
In [7]:
boston_df.head()
frml_boston='medv~ 0 + '+' + '.join(boston_df.columns.difference(["medv"]))
print(frml_boston)
y_boston,X_boston=pt.dmatrices(frml_boston,boston_df)
In [8]:
X_b_train,X_b_test,y_b_train,y_b_test=train_test_split(X_boston,y_boston, test_size=0.5,random_state=12)
In [125]:
tree_reg=DecisionTreeRegressor(max_depth=5) #depth 5 is best according to CV (see below)
tree_reg.fit(X_b_train,y_b_train)
Out[125]:
In [126]:
graph=print_tree(tree_reg,features=X_boston.design_info.column_names)
Image(graph.create_png())
Out[126]:
In [128]:
print(tree_reg.get_depth())
print(tree_reg.get_n_leaves())
print(tree_reg.feature_importances_)
fiR=pd.DataFrame({'Feature':X_boston.design_info.column_names,'importance':tree_reg.feature_importances_})
ax=sns.barplot(y='Feature',x='importance',data=fiR.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
Out[128]:
In [100]:
y_b_test[:,0]
Out[100]:
In [127]:
yhat_b=tree_reg.predict(X_b_test)
#mse=np.mean(np.power((y_b_test[:,0]-yhat_b),2))
mse=mean_squared_error(y_b_test,yhat_b)
print(f"mse:{mse}")
In [129]:
# Cost-complexity pruning: fit a tree for each alpha on the pruning path and compare test MSE
tree_reg_p=DecisionTreeRegressor(random_state=0)
path=tree_reg_p.cost_complexity_pruning_path(X_b_train,y_b_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
clfs = []
mse=[]
for ccp_alpha in ccp_alphas[:-1]:
    clf = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    clf = clf.fit(X_b_train, y_b_train)
    y_bHat_p = clf.predict(X_b_test)
    avg = mean_squared_error(y_b_test, y_bHat_p)  # score the pruned tree, not the unpruned one
    mse.append(avg)
    clfs.append(clf)
best_model=np.argmin(mse)
print(len(ccp_alphas))
print(f'best_model:{best_model}')
print(mse[best_model])
best_tree=clfs[best_model]
print(best_tree)
print(best_tree.get_depth())
print(best_tree.get_n_leaves())
print(best_tree.feature_importances_)
fiR=pd.DataFrame({'Feature':X_boston.design_info.column_names,'importance':best_tree.feature_importances_})
ax=sns.barplot(y='Feature',x='importance',data=fiR.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
Out[129]:
In [109]:
graph=print_tree(best_tree,features=X_boston.design_info.column_names)
Image(graph.create_png())
Out[109]:
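Instead of selecting alpha on the test set as above, the pruning strength can also be chosen by cross-validation. A minimal sketch (my addition, reusing the imported GridSearchCV over the ccp_alpha values from the pruning path):
In [ ]:
# sketch: 5-fold CV over the cost-complexity pruning path computed above
alpha_grid = {'ccp_alpha': list(ccp_alphas[:-1])}
alpha_search = GridSearchCV(DecisionTreeRegressor(random_state=0),
                            param_grid=alpha_grid, cv=5,
                            scoring='neg_mean_squared_error')
alpha_search.fit(X_b_train, y_b_train.ravel())
print(alpha_search.best_params_)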
In [124]:
# find best depth
param_grid = {'max_depth':[1,2,3,4,5,6,7,8,9,10]}
reg_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                                    max_leaf_nodes=None,
                                    min_samples_leaf=1, min_samples_split=2,
                                    min_weight_fraction_leaf=0.0, random_state=1,
                                    splitter='best')
grid_clf=GridSearchCV(cv=5, error_score='raise',
                      estimator=reg_estimator,
                      n_jobs=1,
                      param_grid=param_grid,
                      pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
                      scoring=None, verbose=0)
grid_clf.fit(X_b_train, y_b_train)
Out[124]:
In [123]:
grid_clf.best_params_
Out[123]:
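The full cross-validation table can be inspected as well; a minimal sketch (my addition):
In [ ]:
# sketch: mean CV score for each candidate depth
pd.DataFrame(grid_clf.cv_results_)[['param_max_depth', 'mean_test_score']]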
In [122]:
grid_clf.best_params_
Out[122]:
Bagging and Random Forest
In [175]:
bag_estimator=DecisionTreeRegressor(random_state=0)
bagging=BaggingRegressor(bag_estimator,random_state=0, n_estimators=500, n_jobs=4)
bagging.fit(X_b_train, y_b_train.ravel())
Out[175]:
In [169]:
print(bagging.n_estimators)
print(bagging.n_features_)
print(bagging.n_jobs)
In [176]:
bagging_yHat=bagging.predict(X_b_test)
mse_bagging=mean_squared_error(y_b_test,bagging_yHat)
print(f'mse_bagging:{mse_bagging}')
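Bagging also provides an out-of-bag error estimate without touching the test set. A minimal sketch (my addition; note that for regressors oob_score_ is an R-squared, not an MSE):
In [ ]:
# sketch: request out-of-bag scoring for the bagged trees
bagging_oob = BaggingRegressor(DecisionTreeRegressor(random_state=0),
                               n_estimators=500, oob_score=True,
                               n_jobs=4, random_state=0)
bagging_oob.fit(X_b_train, y_b_train.ravel())
print(f'OOB R^2: {bagging_oob.oob_score_}')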
In [217]:
rf_regr=RandomForestRegressor(n_estimators=500,max_features=6, n_jobs=4, random_state=0,verbose=True)
rf_regr.fit(X_b_train,y_b_train.ravel())
Out[217]:
In [218]:
rf_yHat=rf_regr.predict(X_b_test)
mse_rf=mean_squared_error(y_b_test,rf_yHat)
print(f'mse_rf:{mse_rf}')
In [219]:
importance_df=pd.DataFrame({'Feature':X_boston.design_info.column_names,'Importance':rf_regr.feature_importances_})
sorted_rf_df=importance_df.set_index('Feature').sort_values(by='Importance', ascending=True)
sorted_rf_df.plot(kind='barh')
plt.xlabel('Feature Importance')
Out[219]:
In [223]:
ada_boost_regr=AdaBoostRegressor(n_estimators=5000, random_state=0)
ada_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[223]:
In [224]:
ada_boost_yHat=ada_boost_regr.predict(X_b_test)
mse_ada_boost=mean_squared_error(y_b_test,ada_boost_yHat)
print(f'mse_ada_boost:{mse_ada_boost}')
In [280]:
grad_boost_regr=GradientBoostingRegressor(n_estimators=5000,
                                          learning_rate=0.01, random_state=0)
grad_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[280]:
In [290]:
grad_boost_yHat=grad_boost_regr.predict(X_b_test)
mse_grad_boost=mean_squared_error(y_b_test,grad_boost_yHat)
print(f'mse_grad_boost:{mse_grad_boost}')
plt.plot(range(len(grad_boost_regr.train_score_)),grad_boost_regr.train_score_)
Out[290]:
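train_score_ only tracks the training loss; the test error at each boosting stage can be traced with staged_predict. A minimal sketch (my addition):
In [ ]:
# sketch: test MSE after each boosting iteration
stage_mse = [mean_squared_error(y_b_test, yhat)
             for yhat in grad_boost_regr.staged_predict(X_b_test)]
plt.plot(stage_mse)
plt.xlabel('Boosting iterations')
plt.ylabel('Test MSE')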
In [278]:
importance_grad_df=pd.DataFrame({'Importance':grad_boost_regr.feature_importances_}, index=X_boston.design_info.column_names)
sorted_grad_df=importance_grad_df.sort_values(by='Importance', ascending=True)
sorted_grad_df.plot(kind='barh')
plt.xlabel('Gradient Boost Feature Importance')
Out[278]:
In [17]:
cat_boost_regr=CatBoostRegressor(iterations=50000,learning_rate=0.01,random_state=0, depth=4, thread_count=3)
pool = catboost.Pool(X_b_train,y_b_train.ravel(), feature_names=X_boston.design_info.column_names)
cat_boost_regr.fit(pool, verbose=False)
Out[17]:
In [13]:
print(cat_boost_regr.is_fitted())
cat_boost_yHat=cat_boost_regr.predict(X_b_test)
mse_cat_boost=mean_squared_error(y_b_test,cat_boost_yHat)
print(f'mse_cat_boost:{mse_cat_boost}')
In [325]:
importance_cat_df=pd.DataFrame({'Importance':cat_boost_regr.feature_importances_}, index=X_boston.design_info.column_names)
sorted_cat_df=importance_cat_df.sort_values(by='Importance', ascending=True)
sorted_cat_df.plot(kind='barh')
plt.xlabel('CatBoost Feature Importance')
Out[325]:
In [326]:
cat_boost_regr.plot_tree(0, pool=pool)
Out[326]:
In [352]:
cat_boost_regr.get_leaf_values().shape[0]
Out[352]:
In [353]:
cat_boost_regr.tree_count_
Out[353]:
In [387]:
# let's see if we can reproduce the model's prediction manually from its leaf values
test_x=X_b_test[0,:]
tree_start_leaf_index=np.cumsum(cat_boost_regr.get_tree_leaf_counts())
tree_start_leaf_index=np.insert(tree_start_leaf_index,0,0)
tree_leaf_index_0=cat_boost_regr.calc_leaf_indexes(test_x).ravel()
leaf_indexes=tree_start_leaf_index[:-1]+tree_leaf_index_0
leaf_values=cat_boost_regr.get_leaf_values()
leaf_weights=cat_boost_regr.get_leaf_weights()
values=leaf_values[leaf_indexes]
weights=leaf_weights[leaf_indexes]
sum(values*weights)
Out[387]:
In [388]:
cat_boost_regr.predict(test_x)  # the weighted sum above does not reproduce this value
Out[388]:
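A likely reason the weighted sum above does not match: as far as I understand CatBoost, the raw prediction is the plain sum of the selected leaf values adjusted by the model's scale and bias, while the leaf weights are training-sample counts that play no role at prediction time. A minimal sketch, assuming get_scale_and_bias() is available in this catboost version:
In [ ]:
# sketch: manual prediction as scale * sum(leaf values) + bias
scale, bias = cat_boost_regr.get_scale_and_bias()
manual_pred = scale * leaf_values[leaf_indexes].sum() + bias
print(manual_pred, cat_boost_regr.predict(test_x))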
In [393]:
xgb_boost_regr=XGBRegressor(n_estimators=5000,learning_rate=0.01, random_state=0)
xgb_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[393]:
In [392]:
xgb_boost_yHat=xgb_boost_regr.predict(X_b_test)
mse_xgb_boost=mean_squared_error(y_b_test,xgb_boost_yHat)
print(f'mse_xgb_boost:{mse_xgb_boost}')
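For comparison with the earlier models, the XGBoost feature importances can be plotted the same way. A minimal sketch (my addition):
In [ ]:
# sketch: feature importances from the fitted XGBRegressor
importance_xgb_df = pd.DataFrame({'Importance': xgb_boost_regr.feature_importances_},
                                 index=X_boston.design_info.column_names)
importance_xgb_df.sort_values(by='Importance', ascending=True).plot(kind='barh')
plt.xlabel('XGBoost Feature Importance')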
In [401]:
xgbrf_boost_regr=XGBRFRegressor(n_estimators=10000,learning_rate=0.01, random_state=0)
xgbrf_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[401]:
In [402]:
xgbrf_boost_yHat=xgbrf_boost_regr.predict(X_b_test)
mse_xgbrf_boost=mean_squared_error(y_b_test,xgbrf_boost_yHat)
print(f'mse_xgbrf_boost:{mse_xgbrf_boost}')
In [ ]: