In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
import patsy as pt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from six import StringIO
import matplotlib.pyplot as plt
import graphviz
from mpl_toolkits.mplot3d import Axes3D
import pydotplus
from IPython.display import HTML
from IPython.display import Image
import catboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
In [2]:
# This function creates images of tree models using pydot
def print_tree(estimator, features, clazz_names=None, filled=True):
    dot_data = StringIO()
    tree.export_graphviz(estimator, out_file=dot_data, feature_names=features,
                         class_names=clazz_names, filled=filled)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return graph
In [3]:
data=pd.read_csv('data/Carseats.csv',index_col=[0])
#data.describe()
data['High']=(data['Sales']>8).astype(np.float64)
frml='High~ 0 + '+' + '.join(data.columns.difference(["High","Sales"]))
print(frml)
y,X=pt.dmatrices(frml,data)
In [4]:
clf=DecisionTreeClassifier(max_depth=4)
clf=clf.fit(X,y)
In [7]:
data.ShelveLoc
Out[7]:
In [9]:
# Visualise the tree with GraphViz
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=X.design_info.column_names,
                                class_names=['Low', 'High'],
                                filled=True, rounded=True)
graph = graphviz.Source(dot_data)
# the graph will be exported to PDF and opened
graph.render("decisionTree", view=True)
#graph.size="3010,2010"
#svg=graph._repr_svg_()
#display(HTML(svg))
Out[9]:
In [10]:
list(enumerate(X.design_info.column_names))
Out[10]:
In [4]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.5,random_state=1)
In [12]:
clf1=DecisionTreeClassifier(max_depth=4)
clf1=clf1.fit(X_train,y_train)
In [13]:
yHat=clf1.predict(X_test)
cm=confusion_matrix(y_test,yHat,labels=[0,1])
sum(np.diag(cm))/cm.reshape(-1).sum()
Out[13]:
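The ratio above, the trace of the confusion matrix over its total count, is simply the test-set accuracy. A minimal sanity-check sketch (my addition, not part of the original run) using sklearn's accuracy_score:
In [ ]:
from sklearn.metrics import accuracy_score
# should match the diagonal-sum ratio computed above
accuracy_score(y_test.ravel(), yHat)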
In [14]:
clf2=DecisionTreeClassifier(max_depth=4)
clf2.cost_complexity_pruning_path(X_train,y_train)
Out[14]:
In [15]:
print(clf1.tree_.node_count)
clf1.feature_importances_
fi=pd.DataFrame({'Feature':X.design_info.column_names,'importance':clf1.feature_importances_})
ax=sns.barplot(y='Feature',x='importance',data=fi.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
Out[15]:
In [6]:
boston_df=pd.read_csv('data/Boston.csv')
boston_df.info()
In [7]:
boston_df.head()
frml_boston='medv~ 0 + '+' + '.join(boston_df.columns.difference(["medv"]))
print(frml_boston)
y_boston,X_boston=pt.dmatrices(frml_boston,boston_df)
In [8]:
X_b_train,X_b_test,y_b_train,y_b_test=train_test_split(X_boston,y_boston, test_size=0.5,random_state=12)
In [125]:
tree_reg=DecisionTreeRegressor(max_depth=5) #depth 5 is best according to CV (see below)
tree_reg.fit(X_b_train,y_b_train)
Out[125]:
In [126]:
graph=print_tree(tree_reg,features=X_boston.design_info.column_names)
Image(graph.create_png())
Out[126]:
In [128]:
print(tree_reg.get_depth())
print(tree_reg.get_n_leaves())
print(tree_reg.feature_importances_)
fiR=pd.DataFrame({'Feature':X_boston.design_info.column_names,'importance':tree_reg.feature_importances_})
ax=sns.barplot(y='Feature',x='importance',data=fiR.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
Out[128]:
In [100]:
y_b_test[:,0]
Out[100]:
In [127]:
yhat_b=tree_reg.predict(X_b_test)
#mse=np.mean(np.power((y_b_test[:,0]-yhat_b),2))
mse=mean_squared_error(y_b_test,yhat_b)
print(f"mse:{mse}")
In [129]:
# Cost-complexity pruning: fit a tree for each alpha on the pruning path and compare test MSE
tree_reg_p=DecisionTreeRegressor(random_state=0)
path=tree_reg_p.cost_complexity_pruning_path(X_b_train,y_b_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
clfs = []
mse=[]
for ccp_alpha in ccp_alphas[:-1]:
    clf = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    clf = clf.fit(X_b_train, y_b_train)
    y_bHat_p = clf.predict(X_b_test)
    avg = mean_squared_error(y_b_test, y_bHat_p)  # score the pruned tree, not the unpruned one
    mse.append(avg)
    clfs.append(clf)
best_model=np.argmin(mse)
print(len(ccp_alphas))
print(f'best_model:{best_model}')
print(mse[best_model])
best_tree=clfs[best_model]
print(best_tree)
print(best_tree.get_depth())
print(best_tree.get_n_leaves())
print(best_tree.feature_importances_)
fiR=pd.DataFrame({'Feature':X_boston.design_info.column_names,'importance':best_tree.feature_importances_})
ax=sns.barplot(y='Feature',x='importance',data=fiR.sort_values('importance',ascending=False))
plt.xticks(rotation=90)
Out[129]:
In [109]:
graph=print_tree(best_tree,features=X_boston.design_info.column_names)
Image(graph.create_png())
Out[109]:
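Instead of selecting alpha on the test set as above, the pruning strength can also be chosen by cross-validation. A minimal sketch (my addition, reusing the imported GridSearchCV over the ccp_alpha values from the pruning path):
In [ ]:
# sketch: 5-fold CV over the cost-complexity pruning path computed above
alpha_grid = {'ccp_alpha': list(ccp_alphas[:-1])}
alpha_search = GridSearchCV(DecisionTreeRegressor(random_state=0),
                            param_grid=alpha_grid, cv=5,
                            scoring='neg_mean_squared_error')
alpha_search.fit(X_b_train, y_b_train.ravel())
print(alpha_search.best_params_)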
In [124]:
# find best depth
param_grid = {'max_depth':[1,2,3,4,5,6,7,8,9,10]}
reg_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                                    max_leaf_nodes=None,
                                    min_samples_leaf=1, min_samples_split=2,
                                    min_weight_fraction_leaf=0.0, random_state=1,
                                    splitter='best')
grid_clf=GridSearchCV(cv=5, error_score='raise',
                      estimator=reg_estimator,
                      n_jobs=1,
                      param_grid=param_grid,
                      pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
                      scoring=None, verbose=0)
grid_clf.fit(X_b_train, y_b_train)
Out[124]:
In [123]:
grid_clf.best_params_
Out[123]:
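The full cross-validation table can be inspected as well; a minimal sketch (my addition):
In [ ]:
# sketch: mean CV score for each candidate depth
pd.DataFrame(grid_clf.cv_results_)[['param_max_depth', 'mean_test_score']]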
In [122]:
grid_clf.best_params_
Out[122]:
Bagging and Random Forest
In [175]:
bag_estimator=DecisionTreeRegressor(random_state=0)
bagging=BaggingRegressor(bag_estimator,random_state=0, n_estimators=500, n_jobs=4)
bagging.fit(X_b_train, y_b_train.ravel())
Out[175]:
In [169]:
print(bagging.n_estimators)
print(bagging.n_features_)
print(bagging.n_jobs)
In [176]:
bagging_yHat=bagging.predict(X_b_test)
mse_bagging=mean_squared_error(y_b_test,bagging_yHat)
print(f'mse_bagging:{mse_bagging}')
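Bagging also provides an out-of-bag error estimate without touching the test set. A minimal sketch (my addition; note that for regressors oob_score_ is an R-squared, not an MSE):
In [ ]:
# sketch: request out-of-bag scoring for the bagged trees
bagging_oob = BaggingRegressor(DecisionTreeRegressor(random_state=0),
                               n_estimators=500, oob_score=True,
                               n_jobs=4, random_state=0)
bagging_oob.fit(X_b_train, y_b_train.ravel())
print(f'OOB R^2: {bagging_oob.oob_score_}')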
In [217]:
rf_regr=RandomForestRegressor(n_estimators=500,max_features=6, n_jobs=4, random_state=0,verbose=True)
rf_regr.fit(X_b_train,y_b_train.ravel())
Out[217]:
In [218]:
rf_yHat=rf_regr.predict(X_b_test)
mse_rf=mean_squared_error(y_b_test,rf_yHat)
print(f'mse_rf:{mse_rf}')
In [219]:
importance_df=pd.DataFrame({'Feature':X_boston.design_info.column_names,'Importance':rf_regr.feature_importances_})
sorted_rf_df=importance_df.set_index('Feature').sort_values(by='Importance', ascending=True)
sorted_rf_df.plot(kind='barh')
plt.xlabel('Feature Importance')
Out[219]:
In [223]:
ada_boost_regr=AdaBoostRegressor(n_estimators=5000, random_state=0)
ada_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[223]:
In [224]:
ada_boost_yHat=ada_boost_regr.predict(X_b_test)
mse_ada_boost=mean_squared_error(y_b_test,ada_boost_yHat)
print(f'mse_ada_boost:{mse_ada_boost}')
In [280]:
grad_boost_regr=GradientBoostingRegressor(n_estimators=5000,
                                          learning_rate=0.01, random_state=0)
grad_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[280]:
In [290]:
grad_boost_yHat=grad_boost_regr.predict(X_b_test)
mse_grad_boost=mean_squared_error(y_b_test,grad_boost_yHat)
print(f'mse_grad_boost:{mse_grad_boost}')
plt.plot(range(len(grad_boost_regr.train_score_)),grad_boost_regr.train_score_)
Out[290]:
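train_score_ only tracks the training loss; the test error at each boosting stage can be traced with staged_predict. A minimal sketch (my addition):
In [ ]:
# sketch: test MSE after each boosting iteration
stage_mse = [mean_squared_error(y_b_test, yhat)
             for yhat in grad_boost_regr.staged_predict(X_b_test)]
plt.plot(stage_mse)
plt.xlabel('Boosting iterations')
plt.ylabel('Test MSE')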
In [278]:
importance_grad_df=pd.DataFrame({'Importance':grad_boost_regr.feature_importances_}, index=X_boston.design_info.column_names)
sorted_grad_df=importance_grad_df.sort_values(by='Importance', ascending=True)
sorted_grad_df.plot(kind='barh')
plt.xlabel('Gradient Boost Feature Importance')
Out[278]:
In [17]:
cat_boost_regr=CatBoostRegressor(iterations=50000,learning_rate=0.01,random_state=0, depth=4, thread_count=3)
pool = catboost.Pool(X_b_train,y_b_train.ravel(), feature_names=X_boston.design_info.column_names)
cat_boost_regr.fit(pool, verbose=False)
Out[17]:
In [13]:
print(cat_boost_regr.is_fitted())
cat_boost_yHat=cat_boost_regr.predict(X_b_test)
mse_cat_boost=mean_squared_error(y_b_test,cat_boost_yHat)
print(f'mse_cat_boost:{mse_cat_boost}')
In [325]:
importance_cat_df=pd.DataFrame({'Importance':cat_boost_regr.feature_importances_}, index=X_boston.design_info.column_names)
sorted_cat_df=importance_cat_df.sort_values(by='Importance', ascending=True)
sorted_cat_df.plot(kind='barh')
plt.xlabel('CatBoost Feature Importance')
Out[325]:
In [326]:
cat_boost_regr.plot_tree(0, pool=pool)
Out[326]:
In [352]:
cat_boost_regr.get_leaf_values().shape[0]
Out[352]:
In [353]:
cat_boost_regr.tree_count_
Out[353]:
In [387]:
# let's see if we can reproduce the model's prediction manually from its leaf values
test_x=X_b_test[0,:]
tree_start_leaf_index=np.cumsum(cat_boost_regr.get_tree_leaf_counts())
tree_start_leaf_index=np.insert(tree_start_leaf_index,0,0)
tree_leaf_index_0=cat_boost_regr.calc_leaf_indexes(test_x).ravel()
leaf_indexes=tree_start_leaf_index[:-1]+tree_leaf_index_0
leaf_values=cat_boost_regr.get_leaf_values()
leaf_weights=cat_boost_regr.get_leaf_weights()
values=leaf_values[leaf_indexes]
weights=leaf_weights[leaf_indexes]
sum(values*weights)
Out[387]:
In [388]:
cat_boost_regr.predict(test_x)  # the weighted sum above does not reproduce this value
Out[388]:
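A likely reason the weighted sum above does not match: as far as I understand CatBoost, the raw prediction is the plain sum of the selected leaf values adjusted by the model's scale and bias, while the leaf weights are training-sample counts that play no role at prediction time. A minimal sketch, assuming get_scale_and_bias() is available in this catboost version:
In [ ]:
# sketch: manual prediction as scale * sum(leaf values) + bias
scale, bias = cat_boost_regr.get_scale_and_bias()
manual_pred = scale * leaf_values[leaf_indexes].sum() + bias
print(manual_pred, cat_boost_regr.predict(test_x))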
In [393]:
xgb_boost_regr=XGBRegressor(n_estimators=5000,learning_rate=0.01, random_state=0)
xgb_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[393]:
In [392]:
xgb_boost_yHat=xgb_boost_regr.predict(X_b_test)
mse_xgb_boost=mean_squared_error(y_b_test,xgb_boost_yHat)
print(f'mse_xgb_boost:{mse_xgb_boost}')
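For comparison with the earlier models, the XGBoost feature importances can be plotted the same way. A minimal sketch (my addition):
In [ ]:
# sketch: feature importances from the fitted XGBRegressor
importance_xgb_df = pd.DataFrame({'Importance': xgb_boost_regr.feature_importances_},
                                 index=X_boston.design_info.column_names)
importance_xgb_df.sort_values(by='Importance', ascending=True).plot(kind='barh')
plt.xlabel('XGBoost Feature Importance')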
In [401]:
xgbrf_boost_regr=XGBRFRegressor(n_estimators=10000,learning_rate=0.01, random_state=0)
xgbrf_boost_regr.fit(X_b_train,y_b_train.ravel())
Out[401]:
In [402]:
xgbrf_boost_yHat=xgbrf_boost_regr.predict(X_b_test)
mse_xgbrf_boost=mean_squared_error(y_b_test,xgbrf_boost_yHat)
print(f'mse_xgbrf_boost:{mse_xgbrf_boost}')
In [ ]: