In [3]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)
In [4]:
import random
In [5]:
# Seed both generators used by this notebook. scikit-learn helpers that are called
# without an explicit random_state (e.g. sklearn.utils.resample) draw from NumPy's
# global RNG, which random.seed() alone does not control.
random.seed(1)
np.random.seed(1)
In [6]:
# Load the Auto data set, drop the rows whose horsepower is the "?" placeholder and
# convert the column to float. Building the frame in one chain with assign() avoids
# assigning a column on a filtered frame, which raises SettingWithCopyWarning.
data = (
    pd.read_csv('data/Auto.csv')
    .query('horsepower!="?"')
    .assign(horsepower=lambda frame: frame.horsepower.astype(float))
)
In [7]:
data.info()
#data.describe()
#data
In [8]:
sns.pairplot(data)
Out[8]:
The Validation Set Approach¶
In [9]:
from sklearn.model_selection import train_test_split, cross_val_score
In [10]:
import statsmodels.api as sm
In [11]:
# Split predictor (horsepower) and response (mpg) into two equally sized halves.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.horsepower,
    data.mpg,
    train_size=0.5,
    random_state=1,
)
In [12]:
# Prepend the intercept column that statsmodels' OLS expects in the design matrix.
X_train, X_test = sm.add_constant(X_train), sm.add_constant(X_test)
In [13]:
# Ordinary least squares on the training half, fitted on the raw NumPy arrays.
model = sm.OLS(endog=Y_train.values, exog=X_train.values).fit()
In [14]:
# Fitted values of the OLS model on the training design matrix.
result = model.predict(exog=X_train)
In [15]:
model.summary2()
##model.params
Out[15]:
In [16]:
# Mean squared error on the training half.
np.mean(np.square(Y_train - model.predict(X_train)))
Out[16]:
In [17]:
# Mean squared error on the held-out half.
np.mean(np.square(Y_test - model.predict(X_test)))
Out[17]:
In [18]:
from sklearn.linear_model import LinearRegression

# Same regression through scikit-learn; fit() returns the estimator itself,
# so modelLR ends up bound to the fitted model.
modelLR = LinearRegression()
modelLR.fit(X_train, Y_train)
In [19]:
# R^2 of the scikit-learn model on the training half, then on the held-out half.
[modelLR.score(features, target) for features, target in ((X_train, Y_train), (X_test, Y_test))]
Out[19]:
Cross validation¶
In [20]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.exceptions import NotFittedError
import statsmodels.formula.api as smf
import statsmodels.api as sm


class statsmodelWrapper(BaseEstimator, RegressorMixin):
    """Adapter exposing a statsmodels formula model through the scikit-learn API.

    Parameters
    ----------
    sm_class : callable
        Model factory invoked as ``sm_class(formula, data)``; the object it
        returns must provide ``fit()``.
    formula : str
        Formula string handed to ``sm_class``.

    Attributes
    ----------
    model : object or None
        Object returned by ``sm_class``; ``None`` until ``fit`` is called.
    result : object or None
        Object returned by ``model.fit()``; ``None`` until ``fit`` is called.
    """

    def __init__(self, sm_class, formula):
        self.sm_class = sm_class
        self.formula = formula
        self.model = None
        self.result = None

    def fit(self, data, dummy=None):
        """Build the model from ``formula`` and ``data`` and fit it.

        ``dummy`` occupies the ``y`` slot of the scikit-learn ``fit(X, y)``
        signature and is ignored: the formula selects the response from ``data``.

        Returns
        -------
        self
            The fitted estimator, as the scikit-learn estimator contract requires.
        """
        self.model = self.sm_class(self.formula, data)
        self.result = self.model.fit()
        return self

    def predict(self, X):
        """Return ``result.predict(X)`` from the fitted model.

        Raises
        ------
        NotFittedError
            If called before ``fit``. NotFittedError derives from AttributeError,
            so code that caught the previous failure keeps working.
        """
        if self.result is None:
            raise NotFittedError(
                "This statsmodelWrapper instance is not fitted yet; call 'fit' first."
            )
        return self.result.predict(X)
In [21]:
# Cross-validated R^2 of mpg ~ horsepower via the statsmodels wrapper.
clf = statsmodelWrapper(smf.ols, 'mpg~horsepower')
print(cross_val_score(clf, data, data['mpg']))
# Reference run of the same model in scikit-learn: horsepower is the feature matrix X
# and mpg is the target y (previously the two were passed the other way round, so the
# reference regressed horsepower on mpg instead).
print(cross_val_score(LinearRegression(), data.horsepower.values.reshape(-1, 1), data.mpg.values))
In [ ]:
In [22]:
# 10-fold cross-validated R^2 of the linear model on the training half.
cross_val_score(estimator=LinearRegression(), X=X_train, y=Y_train, cv=10)
Out[22]:
Leave One Out¶
In [27]:
from sklearn.model_selection import LeaveOneOut

# Materialise every leave-one-out split and show the first three of them.
loo = LeaveOneOut()
loo_data = list(loo.split(data))
for train, test in loo_data[0:3]:
    print("train:%s\n test:%s" % (train, test))
Bootstrapping¶
In [26]:
from sklearn.utils import resample

N=10000
# Dedicated, seeded generator: resample() without random_state draws from NumPy's
# global RNG, which made the bootstrap estimates change on every run. Passing the
# same RandomState instance to every call keeps the replicates distinct yet
# reproducible.
boot_rng=np.random.RandomState(1)
params=[]
for i in range(N):
    # Draw len(data) rows with replacement and refit the regression on the replicate.
    rData=resample(data, random_state=boot_rng)
    res=smf.ols("mpg~horsepower",rData).fit()
    params.append(res.params)
In [28]:
# Reference values copied from an OLS summary table (presumably the model.summary2()
# output of the validation-set fit — TODO confirm). Apparent column order:
# coefficient, std. error, t, P>|t|, [0.025, 0.975] — verify against the summary.
#   const  39.5927  1.0142   39.0366  0.0000  37.5923  41.5931
#   x1     -0.1565  0.0093  -16.7494  0.0000  -0.1749  -0.1381
# Mean and standard deviation, taken across the entries of params, of each coefficient.
[np.mean(params,axis=0),np.std(params,axis=0)]
Out[28]:
In [ ]:
No comments:
Post a Comment