In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model as lm
import statsmodels.api as sm
import seaborn as sns; sns.set(style="ticks", color_codes=True)
In [2]:
# Load the Boston data set from a CSV path that is relative to the working directory.
df=pd.read_csv('data/Boston.csv')
In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
Out[3]:
In [4]:
# Scatter-plot matrix of every pair of numeric columns, with each column's own distribution on the diagonal.
sns.pairplot(df)
Out[4]:
In [5]:
import matplotlib.pyplot as plt  # needed here: this cell runs before the later cell that imports plt

# Compute each matrix once and keep it under a name so it can be inspected afterwards.
cov_matrix = df.cov()
corr_matrix = df.corr()

# Give each heatmap its own axes. Two sns.heatmap calls without `ax` both draw on the
# current axes, so the correlation map would be painted over the covariance map.
heat_fig, (ax_cov, ax_corr) = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(cov_matrix, ax=ax_cov)
ax_cov.set_title("Covariance")
sns.heatmap(corr_matrix, ax=ax_corr)
ax_corr.set_title("Correlation")
heat_fig.tight_layout()
Out[5]:
Simple Linear Regression¶
In [10]:
# Single predictor (lstat) and response (medv) as (n, 1) column vectors.
X = df.lstat.values[:, np.newaxis]
Y = df.medv.values[:, np.newaxis]
In [11]:
# Fit scikit-learn's least-squares regression. fit() returns the estimator itself,
# so echoing `model` displays the same object the chained call produced.
model = lm.LinearRegression().fit(X, Y)
model
Out[11]:
In [12]:
# sm.OLS does not add an intercept on its own, so prepend a column of ones to the design matrix.
X=sm.add_constant(X)
In [13]:
# Set up ordinary least squares of Y on X. This rebinds `model`, replacing the scikit-learn estimator fitted earlier.
model=sm.OLS(Y,X)
In [14]:
# Estimate the coefficients; `result` is the fitted-results object the following cells query.
result=model.fit()
In [15]:
# Render the fit summary using statsmodels' summary2 layout.
result.summary2()
Out[15]:
In [56]:
# Each row is [intercept, lstat], matching the constant-first design matrix used for the fit.
lstat_levels = (5, 10, 15)
newX = np.array([[1, level] for level in lstat_levels])
pre = result.get_prediction(newX)
In [17]:
# Tabulate the predictions: mean, its standard error, and the confidence and prediction interval bounds.
pre.summary_frame()
Out[17]:
In [18]:
# Confidence interval for the mean response at each row of newX (statsmodels' default alpha of 0.05).
pre.conf_int()
Out[18]:
In [19]:
# obs=True switches to the prediction interval for a new observation rather than the interval for the mean.
pre.conf_int(obs=True)
Out[19]:
In [20]:
# Spread of the residuals. np.std defaults to ddof=0 (divides by n), so this is not the
# degrees-of-freedom-adjusted residual standard error.
# NOTE(review): if the residual standard error is what is wanted, np.sqrt(result.mse_resid)
# gives it — confirm intent.
np.std(result.resid)
Out[20]:
In [21]:
# Build the influence / outlier diagnostics object for the fit.
# NOTE(review): `inf` is not read by any later cell visible here.
inf=result.get_influence()
In [22]:
from statsmodels.compat import lzip
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
In [23]:
# Same regression through the formula interface: columns are referenced by name and the
# formula adds the intercept automatically.
model2=ols("medv~lstat",data=df).fit()
In [24]:
# Summary of the formula-based fit.
model2.summary2()
Out[24]:
In [25]:
# Observed and fitted medv against the lstat regressor (index 1, after the intercept),
# drawn on an explicitly created axes.
fig, ax = plt.subplots()
sm.graphics.plot_fit(model2, exog_idx="lstat", ax=ax)
plt.show()
In [26]:
# Scatter of medv against lstat with seaborn's own fitted regression line.
sns.regplot(x='lstat',y='medv', data=df)
Out[26]:
In [27]:
from statsmodels.graphics.regressionplots import abline_plot
# Draw model2's fitted line onto `ax`, the axes created earlier for the plot_fit figure
# (no cell in between rebinds `ax`), so the line is added to that older figure.
abline_plot(model_results=model2,ax=ax)
Out[27]:
In [28]:
# Residuals of a simple linear fit of medv on lstat, plotted against lstat.
sns.residplot(x='lstat',y='medv', data=df)
Out[28]:
In [29]:
# Side-by-side panels: the regression fit on the left, its residuals on the right.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2)
lstat_vs_medv = dict(x='lstat', y='medv', data=df)
sns.regplot(ax=ax0, **lstat_vs_medv)
sns.residplot(ax=ax1, **lstat_vs_medv)
Out[29]:
In [30]:
# Grid of partial regression plots for the regressors of model2.
sm.graphics.plot_partregress_grid(model2)
Out[30]:
Multiple Linear Regression¶
In [32]:
# Specify (without fitting yet) a two-predictor model: medv on lstat and age.
mul_model=ols("medv~lstat+age",data=df)
In [33]:
# Fit the two-predictor model.
res_mm=mul_model.fit()
In [34]:
# Summary table of the lstat + age fit.
res_mm.summary2()
Out[34]:
In [35]:
# Regress medv on every other column of the frame, in column order.
predictors = [col for col in df.columns if col != 'medv']
formula = "medv~" + "+".join(predictors)
mul_modelAll = ols(formula, data=df)
res_mmAll = mul_modelAll.fit()
res_mmAll.summary2()
Out[35]:
In [36]:
# Refit without `indus`. The predictors are kept in DataFrame column order: iterating a
# set of strings has no stable order from one interpreter session to the next, so a
# set-built formula (and the row order of the summary) would change between runs.
excluded = {'medv', 'indus'}
wantedColumns = [col for col in df.columns if col not in excluded]
formula = "medv~" + "+".join(wantedColumns)
mul_modelAllRefined = ols(formula, data=df)
res_mmAllRefined = mul_modelAllRefined.fit()
res_mmAllRefined.summary2()
Out[36]:
In [37]:
# plot_partregress_grid lays out its own subplots, so hand it one empty, suitably sized
# figure. Creating a 20x20 grid beforehand only produces a second, unused figure with
# 400 blank axes.
partregress_fig = plt.figure(figsize=(12, 16))  # tall enough for one panel per regressor
sm.graphics.plot_partregress_grid(res_mmAll, fig=partregress_fig)
Out[37]:
In [38]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
In [ ]:
In [39]:
# Variance inflation factor of each predictor (every column except the response medv).
# variance_inflation_factor regresses one column on the others and does not add an
# intercept itself, so the constant must be part of the matrix; without it the values
# are uncentered and inflated. has_constant='add' guarantees the constant sits in
# column 0, which is why the predictor indices start at 1.
predictor_names = df.columns[df.columns != 'medv']
X = df[predictor_names].values
vif_design = sm.add_constant(X, has_constant='add')
{name: vif(vif_design, idx) for idx, name in enumerate(predictor_names, start=1)}
Out[39]:
In [40]:
# VIF over every column of the frame, the response medv included.
# statsmodels does not add an intercept inside variance_inflation_factor, so a constant
# is prepended (column 0) and skipped in the result; without it the values are inflated.
all_columns_design = sm.add_constant(df.values, has_constant='add')
{name: vif(all_columns_design, idx) for idx, name in enumerate(df.columns, start=1)}
Out[40]:
In [41]:
# Refit without `indus` and `rm`. The predictors are kept in DataFrame column order:
# iterating a set of strings has no stable order from one interpreter session to the
# next, so a set-built formula (and the row order of the summary) would change between runs.
excluded = {'medv', 'indus', 'rm'}
wantedColumns = [col for col in df.columns if col not in excluded]
formula = "medv~" + "+".join(wantedColumns)
mul_modelAllRefined2 = ols(formula, data=df)
res_mmAllRefined2 = mul_modelAllRefined2.fit()
res_mmAllRefined2.summary2()
Out[41]:
Interaction Terms¶
In [42]:
# In a formula, lstat*age expands to lstat + age + lstat:age: both main effects plus their interaction.
mul_model_i=ols("medv~lstat*age",data=df).fit()
In [43]:
# Summary of the interaction model.
mul_model_i.summary2()
Out[43]:
Non-Linear Trasnformation of the Predictors¶
In [44]:
# I(...) makes the formula evaluate lstat**2 as arithmetic instead of formula syntax, adding a quadratic term.
mul_model_non_lin=ols("medv~lstat+I(lstat**2)",data=df).fit()
In [45]:
# Summary of the quadratic model.
mul_model_non_lin.summary2()
Out[45]:
In [46]:
from statsmodels.stats.anova import anova_lm
# Refit the linear-only baseline (same formula and data as the earlier model2 cell) for the ANOVA comparison.
model2=ols("medv~lstat",data=df).fit()
In [47]:
# Compare the nested fits: the linear baseline against the model with the added quadratic term.
anova_lm(model2, mul_model_non_lin)
Out[47]:
In [48]:
# Fifth-degree polynomial in lstat: one I(lstat**k) term for each power k = 1..5.
poly_terms = ["I(lstat**{})".format(power) for power in range(1, 6)]
model_ploy = ols("medv~" + "+".join(poly_terms), data=df).fit()
In [49]:
# Summary of the polynomial model.
model_ploy.summary2()
Out[49]:
Qualitive Predictors¶
In [50]:
# Load the Carseats data; index_col=0 uses the first CSV column as the row index instead of a data column.
car_df=pd.read_csv('data/Carseats.csv', index_col=0)
In [51]:
# Summary statistics of the numeric Carseats columns.
car_df.describe()
Out[51]:
In [71]:
# Size this one figure directly instead of calling sns.set(rc=...). sns.set also
# re-applies seaborn's default theme, which would silently replace the style="ticks"
# chosen at import time for this and every later plot.
box_fig, box_ax = plt.subplots(figsize=(9, 4))
sns.boxplot(data=car_df, ax=box_ax)
Out[71]:
In [59]:
# Peek at the first five rows.
car_df.head()
Out[59]:
In [58]:
# Every column except the response, kept in DataFrame column order. Joining a set would
# give a different term order from one interpreter session to the next, so the printed
# formula and the row order of the summary would not be reproducible.
features = "+".join(col for col in car_df.columns if col != "Sales")
print(features)
# Add two explicit interaction terms on top of all main effects.
cat_model = ols("Sales~" + features + "+Income:Advertising+Price:Age", data=car_df).fit()
In [55]:
# Summary of the Carseats fit, including the two interaction terms.
cat_model.summary2()
Out[55]:
In [ ]:
No comments:
Post a Comment