Thursday, February 25, 2021

Chapter 4 Classification in Python - An Introduction to Statistical Learning

In [28]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)
In [31]:
smarket=pd.read_csv('data/Smarket.csv',index_col=0)
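index_col=0 tells pandas to use the file's first, unnamed column of row numbers as the DataFrame index rather than keeping it as a data column. A minimal sanity-check sketch on the resulting frame:

# 9 data columns remain once the row numbers become the index (which runs 1..1250)
print(smarket.shape)
print(smarket.index.min(), smarket.index.max())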
In [32]:
smarket
Out[32]:
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.19130 0.959 Up
2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.29650 1.032 Up
3 2001 1.032 0.959 0.381 -0.192 -2.624 1.41120 -0.623 Down
4 2001 -0.623 1.032 0.959 0.381 -0.192 1.27600 0.614 Up
5 2001 0.614 -0.623 1.032 0.959 0.381 1.20570 0.213 Up
... ... ... ... ... ... ... ... ... ...
1246 2005 0.422 0.252 -0.024 -0.584 -0.285 1.88850 0.043 Up
1247 2005 0.043 0.422 0.252 -0.024 -0.584 1.28581 -0.955 Down
1248 2005 -0.955 0.043 0.422 0.252 -0.024 1.54047 0.130 Up
1249 2005 0.130 -0.955 0.043 0.422 0.252 1.42236 -0.298 Down
1250 2005 -0.298 0.130 -0.955 0.043 0.422 1.38254 -0.489 Down

1250 rows × 9 columns

In [34]:
smarket.describe()
Out[34]:
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
count 1250.000000 1250.000000 1250.000000 1250.000000 1250.000000 1250.00000 1250.000000 1250.000000
mean 2003.016000 0.003834 0.003919 0.001716 0.001636 0.00561 1.478305 0.003138
std 1.409018 1.136299 1.136280 1.138703 1.138774 1.14755 0.360357 1.136334
min 2001.000000 -4.922000 -4.922000 -4.922000 -4.922000 -4.92200 0.356070 -4.922000
25% 2002.000000 -0.639500 -0.639500 -0.640000 -0.640000 -0.64000 1.257400 -0.639500
50% 2003.000000 0.039000 0.039000 0.038500 0.038500 0.03850 1.422950 0.038500
75% 2004.000000 0.596750 0.596750 0.596750 0.596750 0.59700 1.641675 0.596750
max 2005.000000 5.733000 5.733000 5.733000 5.733000 5.73300 3.152470 5.733000
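describe() covers only the numeric columns, so the categorical response Direction is missing from the summary above. A minimal sketch of the class balance (the counts quoted in the comment come from the query result and the classification-report supports further down):

# class balance of the response: 648 'Up' days vs. 602 'Down' days
smarket['Direction'].value_counts()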
In [5]:
smarket.columns
Out[5]:
Index(['Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today',
       'Direction'],
      dtype='object')

In [38]:
smarket.query("Today>=0 and Direction=='Up'")
Out[38]:
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
1 2001 0.381 -0.192 -2.624 -1.055 5.010 1.19130 0.959 Up
2 2001 0.959 0.381 -0.192 -2.624 -1.055 1.29650 1.032 Up
4 2001 -0.623 1.032 0.959 0.381 -0.192 1.27600 0.614 Up
5 2001 0.614 -0.623 1.032 0.959 0.381 1.20570 0.213 Up
6 2001 0.213 0.614 -0.623 1.032 0.959 1.34910 1.392 Up
... ... ... ... ... ... ... ... ... ...
1239 2005 0.555 0.084 0.281 -0.122 -0.501 2.39002 0.419 Up
1244 2005 -0.024 -0.584 -0.285 -0.141 0.419 1.99669 0.252 Up
1245 2005 0.252 -0.024 -0.584 -0.285 -0.141 2.06517 0.422 Up
1246 2005 0.422 0.252 -0.024 -0.584 -0.285 1.88850 0.043 Up
1248 2005 -0.955 0.043 0.422 0.252 -0.024 1.54047 0.130 Up

648 rows × 9 columns

In [39]:
# count of 'Up' days -- matches the 648 rows returned by the query above
len(np.where(smarket.Direction == 'Up')[0])
Out[39]:
648

In [36]:
sns.pairplot(smarket)
Out[36]:
<seaborn.axisgrid.PairGrid at 0x20f2c0b4e08>
In [44]:
import matplotlib.pyplot as plt
In [42]:
smarket.cov()
Out[42]:
Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today
Year 1.985332 0.047551 0.048986 0.053259 0.057264 0.048165 0.273680 0.048186
Lag1 0.047551 1.291175 -0.033950 -0.013979 -0.003864 -0.007399 0.016752 -0.033772
Lag2 0.048986 -0.033950 1.291133 -0.033507 -0.014044 -0.004639 -0.017764 -0.013235
Lag3 0.053259 -0.013979 -0.033507 1.296644 -0.031188 -0.024577 -0.017162 -0.003167
Lag4 0.057264 -0.003864 -0.014044 -0.031188 1.296806 -0.035393 -0.019868 -0.008928
Lag5 0.048165 -0.007399 -0.004639 -0.024577 -0.035393 1.316871 -0.009099 -0.045458
Volume 0.273680 0.016752 -0.017764 -0.017162 -0.019868 -0.009099 0.129857 0.005975
Today 0.048186 -0.033772 -0.013235 -0.003167 -0.008928 -0.045458 0.005975 1.291255
In [46]:
sns.heatmap(smarket.cov())
Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x20f32a7c7c8>
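Since the lag variables all have nearly the same variance (about 1.29-1.32 on the diagonal above), the covariance heatmap is dominated by Volume's much smaller scale. The ISLR text looks at correlations instead; a sketch of that version (annot and fmt are purely display choices) makes the picture clearer: the pairwise correlations are essentially zero apart from Year vs. Volume, around 0.54, since trading volume grew over 2001-2005.

import matplotlib.pyplot as plt

# correlation matrix of the numeric columns (Direction is categorical and excluded)
corr = smarket.drop(columns='Direction').corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()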
In [67]:
X = smarket.iloc[:, 1:-1]   # predictors: all columns between Year and Direction (Lag1-Lag5, Volume, Today)
Y = smarket.iloc[:, -1]     # response: Direction
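Because iloc slicing is positional, it is worth confirming what actually landed in X; with index_col=0 the slice 1:-1 skips Year and Direction but keeps Today. A minimal check:

# expected: ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today']
print(X.columns.tolist())
print(Y.name)   # 'Direction'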

Logistic Regression

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import statsmodels.api as sm
In [146]:
model = LogisticRegression(solver='liblinear', random_state=0)
In [147]:
model.fit(X,Y)
Out[147]:
LogisticRegression(random_state=0, solver='liblinear')
In [148]:
model.intercept_
Out[148]:
array([-0.15875624])
In [77]:
model.coef_
Out[77]:
array([[-0.02227623, -0.02078998,  0.04633308,  0.01859488,  0.07907447,
         0.2019268 ,  9.61023592]])
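For a binary problem, scikit-learn orders the classes alphabetically ('Down', 'Up') and the single row of coef_ refers to the second class, 'Up'; the last entry (about 9.6) belongs to Today, the final column of X. A minimal sketch pairing coefficients with their columns and pulling out fitted probabilities:

# class ordering used by coef_ / predict_proba
print(model.classes_)                        # ['Down' 'Up']

# coefficient per predictor column
print(dict(zip(X.columns, model.coef_[0])))

# P(Direction = 'Up') for the first five days (column 1 follows classes_ order)
print(model.predict_proba(X)[:5, 1])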
In [150]:
y_hat=model.predict(X)
In [153]:
confusion_matrix(Y,y_hat)
Out[153]:
array([[595,   7],
       [  0, 648]], dtype=int64)
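confusion_matrix puts the true labels on the rows and the predictions on the columns, in the same sorted order as model.classes_, so the 7 in the top row is the only block of errors: seven 'Down' days predicted as 'Up'. A quick sketch of the training accuracy this implies:

# (595 + 648) correct out of 1250 -> about 0.994 training accuracy
cm = confusion_matrix(Y, y_hat)
print(np.trace(cm) / cm.sum())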
In [155]:
report=classification_report(Y,y_hat)
print('report:', report, sep='\n')
report:
              precision    recall  f1-score   support

        Down       1.00      0.99      0.99       602
          Up       0.99      1.00      0.99       648

    accuracy                           0.99      1250
   macro avg       0.99      0.99      0.99      1250
weighted avg       0.99      0.99      0.99      1250
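
A near-perfect classifier for daily market direction is a warning sign rather than a result: Direction is just the sign of Today, and Today sits inside X, so the response has leaked into the predictors (which is also why its coefficient of roughly 9.6 dwarfs all the others). A sketch of the fit the ISLR lab actually performs, using only the five lags and Volume, is far less impressive; the book reports roughly 52% training accuracy for this model.

# refit without the leaking Today column (and without Year), as in the ISLR lab
predictors = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
X_lags = smarket[predictors]

model_lags = LogisticRegression(solver='liblinear', random_state=0)
model_lags.fit(X_lags, Y)

print(model_lags.score(X_lags, Y))                       # around 0.52 in-sample
print(confusion_matrix(Y, model_lags.predict(X_lags)))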

In [134]:
Y_dummy=np.zeros([Y.size,1])
Y_dummy[np.where(Y=='Up')]=1
In [135]:
model=sm.Logit(Y_dummy,X.values)
In [141]:
result=model.fit(method='newton', maxiter=100)
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.000000
         Iterations: 100
D:\apps\Anaconda3\envs\dev\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
In [145]:
result.summary2()
Out[145]:
Model: Logit Pseudo R-squared: 1.000
Dependent Variable: y AIC: 14.0003
Date: 2020-06-28 17:22 BIC: 49.9166
No. Observations: 1250 Log-Likelihood: -0.00013095
Df Model: 6 LL-Null: -865.59
Df Residuals: 1243 LLR p-value: 0.0000
Converged: 0.0000 Scale: 1.0000
No. Iterations: 100.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
x1 2.4529 160.2746 0.0153 0.9878 -311.6796 316.5853
x2 4.8854 384.1694 0.0127 0.9899 -748.0727 757.8435
x3 -3.6706 478.5313 -0.0077 0.9939 -941.5747 934.2334
x4 -3.4798 371.8356 -0.0094 0.9925 -732.2642 725.3046
x5 8.4028 252.1767 0.0333 0.9734 -485.8545 502.6601
x6 11.6681 225.1737 0.0518 0.9587 -429.6642 453.0003
x7 4032.2539 33024.6793 0.1221 0.9028 -60694.9282 68759.4359
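The convergence warning above and the enormous standard errors in this summary are two symptoms of the same problem: with Today among the predictors the classes are perfectly separated, so the likelihood can be pushed arbitrarily close to 1 and the coefficient on Today (x7 here) is driven toward infinity. Note also that sm.Logit does not add an intercept on its own. A sketch of a statsmodels fit that does converge, adding a constant and dropping Today, mirroring the book's glm() output:

# explicit intercept; Today excluded to avoid perfect separation
X_sm = sm.add_constant(smarket[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']])
y_sm = (smarket['Direction'] == 'Up').astype(int)

logit_result = sm.Logit(y_sm, X_sm).fit()   # Newton's method converges in a handful of iterations
print(logit_result.summary2())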
In [125]:
X.values.shape
Out[125]:
(1250, 7)