In [64]:
%config IPCompleter.greedy=True
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, classification_report, plot_roc_curve, plot_confusion_matrix
from sklearn import svm
from sklearn.model_selection import GridSearchCV
In [2]:
# Training set: 20 two-dimensional standard-normal points, the last ten
# labelled 1 and shifted by +1 on both axes.
np.random.seed(2)
X = np.random.normal(0, 1, size=(20, 2))
Y = np.repeat([0.0, 1.0], 10)
X[Y == 1, :] += 1
df = pd.DataFrame({'Y': Y, 'X_1': X[:, 0], 'X_2': X[:, 1]})
sns.scatterplot(data=df, x='X_1', y='X_2', hue='Y', style='Y')
Out[2]:
In [102]:
def summary_svm(df, model):
    """Show fit diagnostics for a fitted SVM evaluated on the rows of ``df``.

    Displays the support vectors and the per-class support-vector counts,
    (for a linear kernel) the intercept and coefficients, then prints a
    classification report and draws the confusion matrix.  A ROC curve is
    drawn only when ``df`` contains exactly two distinct labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label column ``Y`` and the feature columns
        ``X_1`` and ``X_2``.
    model : fitted sklearn.svm.SVC
        The classifier to summarise.
    """
    display(model.support_vectors_)
    display(model.n_support_)
    if model.kernel == 'linear':
        # coef_ is only read for the linear kernel.
        display([model.intercept_] + [model.coef_])

    X = df.loc[:, ["X_1", "X_2"]].values
    y_true = df.Y.values
    y_pred = model.predict(X)
    print(classification_report(y_true, y_pred))
    plot_confusion_matrix(model, X, y_true)

    # Decide on the ROC plot from the frame that was passed in, not from a
    # notebook-global frame that may not exist (or may hold other data).
    if len(np.unique(y_true)) == 2:
        plot_roc_curve(model, X, y_true)
In [144]:
def plot_decision_boundary(df, model):
    """Draw the decision regions of a fitted SVM together with the data.

    The predicted class is evaluated on a 100x100 grid spanning the data
    range padded by 3 units, drawn as filled contours, and overlaid with the
    observations (coloured by ``Y``) and the support vectors (red ``+``).
    For a linear kernel the separating line and the two margin lines are
    added as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label column ``Y`` and the feature columns
        ``X_1`` and ``X_2``.
    model : fitted sklearn.svm.SVC
        Needs ``predict``, ``support_vectors_`` and ``kernel``; with
        ``kernel == 'linear'`` also ``coef_`` and ``intercept_``.

    Notes
    -----
    The linear overlay uses ``coef_[0]`` / ``intercept_[0]`` only and assumes
    the second coefficient is non-zero (i.e. the boundary is not vertical).
    """
    df = df.astype({'Y': 'int'})

    # Select the features by name so the bounds do not depend on column order.
    features = df[['X_1', 'X_2']]
    X1_min, X2_min = features.min(axis=0) - 3
    X1_max, X2_max = features.max(axis=0) + 3

    x1v, x2v = np.meshgrid(np.linspace(X1_min, X1_max, num=100),
                           np.linspace(X2_min, X2_max, num=100))
    z = model.predict(np.c_[x1v.ravel(), x2v.ravel()]).reshape(x1v.shape)

    plt.contourf(x1v, x2v, z)
    sns.scatterplot(x='X_1', y='X_2', hue='Y', data=df)
    sns.scatterplot(x=model.support_vectors_[:, 0],
                    y=model.support_vectors_[:, 1],
                    color='red', marker='+', s=500)

    if model.kernel == 'linear':
        # Boundary: w[0]*x1 + w[1]*x2 + b = 0  ->  x2 = slope*x1 - b/w[1]
        w = model.coef_[0]
        b = model.intercept_[0]
        slope = -w[0] / w[1]
        xx = np.linspace(X1_min + 1.5, X1_max - 2, 100)
        yy = slope * xx - b / w[1]

        # The margin 1/||w|| is measured perpendicular to the boundary.  The
        # margin lines are drawn as vertical shifts of the boundary, so the
        # shift has to be scaled by sqrt(1 + slope^2) (which equals 1/|w[1]|).
        margin = 1 / np.linalg.norm(w)
        offset = np.sqrt(1 + slope ** 2) * margin

        plt.plot(xx, yy, 'r-')
        plt.plot(xx, yy - offset, 'k--')
        plt.plot(xx, yy + offset, 'k--')
In [141]:
# Linear-kernel support vector classifier with C=1, fitted on the training data.
clf = svm.SVC(C=1, kernel='linear')
clf.fit(X, Y)
Out[141]:
In [145]:
# Decision regions and diagnostics for the C=1 linear classifier.
plot_decision_boundary(model=clf, df=df)
summary_svm(model=clf, df=df)
In [78]:
# Same linear classifier, refitted with C=10000.
clf_10000 = svm.SVC(C=10000, kernel='linear').fit(X, Y)
plot_decision_boundary(model=clf_10000, df=df)
summary_svm(model=clf_10000, df=df)
In [71]:
# Same linear classifier, refitted with C=0.01.
clf_01 = svm.SVC(C=0.01, kernel='linear').fit(X, Y)
plot_decision_boundary(model=clf_01, df=df)
summary_svm(model=clf_01, df=df)
In [73]:
# Cross-validated grid search over the cost parameter C for the linear kernel.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}
clf_cv = svm.SVC(kernel='linear')
grid_clf = GridSearchCV(estimator=clf_cv, param_grid=param_grid)
grid_clf.fit(X, Y)

# Per-candidate mean and standard deviation of the cross-validation score.
means = grid_clf.cv_results_['mean_test_score']
stds = grid_clf.cv_results_['std_test_score']

print(grid_clf.best_params_)
# Rich display renders the wide cv_results_ table as a proper table instead
# of the truncated, line-wrapped text that print() produces.
display(pd.DataFrame(grid_clf.cv_results_).set_index('params'))
In [79]:
# Linear classifier refitted with C=10.
clf_10 = svm.SVC(C=10, kernel='linear').fit(X, Y)
plot_decision_boundary(model=clf_10, df=df)
summary_svm(model=clf_10, df=df)
In [80]:
# Test set built the same way as the training set: 20 standard-normal points,
# the last ten labelled 1 and shifted by +1 on both axes.
X_test = np.random.normal(0, 1, size=(20, 2))
Y_test = np.repeat([0.0, 1.0], 10)
X_test[Y_test == 1, :] += 1
df_test = pd.DataFrame({'Y': Y_test, 'X_1': X_test[:, 0], 'X_2': X_test[:, 1]})
sns.scatterplot(data=df_test, x='X_1', y='X_2', hue='Y', style='Y')
Out[80]:
In [123]:
# Confusion matrix of the C=10 linear classifier on the test set.
yHat_test = clf_10.predict(X_test)
confusion_matrix(y_true=Y_test, y_pred=yHat_test)
Out[123]:
In [84]:
# Diagnostics of the C=10 linear classifier evaluated on the test frame.
summary_svm(df_test,clf_10)
In [62]:
# Decision-function values for the test points, followed by the predicted labels.
test_scores = clf_10.decision_function(X_test)
test_labels = clf_10.predict(X_test)
display(test_scores, test_labels)
In [ ]:
Linearly separable
In [36]:
# Copy of the training data with class 1 shifted by a further +2 on both axes.
X_lp = X.copy()
X_lp[Y == 1, :] += 2
df_lp = pd.DataFrame({'Y': Y, 'X_1': X_lp[:, 0], 'X_2': X_lp[:, 1]})
sns.scatterplot(data=df_lp, x='X_1', y='X_2', hue='Y', style='Y')
Out[36]:
In [81]:
# Linear classifier (C=10) fitted on the shifted data.
clf_lp = svm.SVC(C=10, kernel='linear').fit(X_lp, Y)
plot_decision_boundary(model=clf_lp, df=df_lp)
summary_svm(model=clf_lp, df=df_lp)
In [82]:
# Diagnostics of the classifier fitted on the shifted data, evaluated on the test frame.
summary_svm(df_test,clf_lp)
Support Vector Machine with Kernel
In [83]:
# RBF-kernel classifier (C=10, gamma=0.5) on the original training data.
clf_nl_rd = svm.SVC(C=10, gamma=0.5, kernel='rbf').fit(X, Y)
plot_decision_boundary(model=clf_nl_rd, df=df)
summary_svm(model=clf_nl_rd, df=df)
In [146]:
# Degree-2 polynomial-kernel classifier (C=1) on the original training data.
clf_nl_pol2 = svm.SVC(C=1, degree=2, kernel='poly').fit(X, Y)
plot_decision_boundary(model=clf_nl_pol2, df=df)
summary_svm(model=clf_nl_pol2, df=df)
In [59]:
# Cross-validated grid search over C and gamma for the RBF kernel.
param_rbf_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
                  'gamma': [0.5, 1, 2, 3, 4]}
clf_cv_rbf = svm.SVC(kernel='rbf')
grid_clf_rbf = GridSearchCV(estimator=clf_cv_rbf, param_grid=param_rbf_grid)
grid_clf_rbf.fit(X, Y)

# Per-candidate mean and standard deviation of the cross-validation score.
means = grid_clf_rbf.cv_results_['mean_test_score']
stds = grid_clf_rbf.cv_results_['std_test_score']

print(grid_clf_rbf.best_params_)
# Rich display renders the wide cv_results_ table as a proper table instead
# of the truncated, line-wrapped text that print() produces.
display(pd.DataFrame(grid_clf_rbf.cv_results_).set_index('params'))
ROC Curves
In [72]:
# ROC curve of the RBF-kernel classifier on the data it was fitted on (X, Y).
plot_roc_curve(clf_nl_rd,X,Y)
Out[72]:
SVM with Multiple Classes
In [147]:
# Three-class data: 50 standard-normal points split 20 / 15 / 15 into classes
# 0, 1 and 2; classes 1 and 2 are shifted by +1 and +2 on both axes.
np.random.seed(123)
X_m = np.random.normal(0, 1, size=(50, 2))
Y_m = np.repeat([0.0, 1.0, 2.0], [20, 15, 15])
X_m[Y_m == 1, :] += 1
X_m[Y_m == 2, :] += 2
df_m = pd.DataFrame({'Y': Y_m, 'X_1': X_m[:, 0], 'X_2': X_m[:, 1]})
sns.scatterplot(data=df_m, x='X_1', y='X_2', hue=Y_m.astype('int'))
Out[147]:
In [152]:
# RBF-kernel classifier (C=10, gamma=0.5, one-vs-one decision function) on the three-class data.
clf_m = svm.SVC(C=10, gamma=0.5, kernel='rbf', decision_function_shape='ovo').fit(X_m, Y_m)
plot_decision_boundary(model=clf_m, df=df_m)
summary_svm(model=clf_m, df=df_m)
Application to Gene Expression Data
In [173]:
# Khan data set: feature tables are read as-is, label tables are read as integers.
# The first CSV column is used as the index in every file.
df_k_xtrain, df_k_xtest = (
    pd.read_csv(path, index_col=0)
    for path in ('data/Khan_xtrain.csv', 'data/Khan_xtest.csv')
)
df_k_ytrain, df_k_ytest = (
    pd.read_csv(path, index_col=0, dtype='int')
    for path in ('data/Khan_ytrain.csv', 'data/Khan_ytest.csv')
)
In [174]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only adds a stray "None" after each report.
for frame in (df_k_xtrain, df_k_xtest, df_k_ytrain, df_k_ytest):
    frame.info()
In [178]:
# Only the last expression of a cell is rendered automatically, so the first
# three previews are displayed explicitly; the last one stays the cell output.
display(df_k_xtrain.head(), df_k_xtest.head(), df_k_ytest.head())
df_k_ytrain.head()
Out[178]:
In [200]:
# Linear classifier (C=10, one-vs-one decision function) on the Khan training data.
clf_k = svm.SVC(kernel='linear', C=10, decision_function_shape='ovo')
# Flatten the labels to 1-D: the label frame yields an (n, 1) column vector,
# which scikit-learn only accepts with a conversion warning.
# NOTE(review): assumes the label CSV holds a single column — confirm.
clf_k.fit(df_k_xtrain.values, df_k_ytrain.values.ravel())
Out[200]:
In [201]:
# Predict on the raw array, matching how clf_k was fitted (on .values), and
# compare against the test labels flattened to 1-D.
y_k_pred = clf_k.predict(df_k_xtest.values)
print(classification_report(df_k_ytest.values.ravel(), y_k_pred))
In [202]:
# The plot is drawn as a side effect; printing the returned display object
# only adds an object repr, so the call is made directly and its repr is
# suppressed with the trailing semicolon.
plot_confusion_matrix(clf_k, df_k_xtest.values, df_k_ytest.values.ravel());
In [ ]:
No comments:
Post a Comment