Thursday, February 25, 2021

Chapter 3 Lab: Linear Regression in Python - An Introduction to Statistical Learning

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model as lm
import statsmodels.api as sm
import seaborn as sns; sns.set(style="ticks", color_codes=True)
In [2]:
df=pd.read_csv('data/Boston.csv')
In [3]:
df.describe()
Out[3]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000
In [4]:
sns.pairplot(df)
Out[4]:
<seaborn.axisgrid.PairGrid at 0x1cce7a3d888>
In [5]:
sns.heatmap(df.cov())
df.cov()
df.corr()
sns.heatmap(df.corr())
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf0d6d808>
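As a small variation (not in the original cell), annotating the heatmap makes the strong pairwise correlations, such as lstat with medv, easy to read off:

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)  # annotate each cell with its correlation
plt.show()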

Simple Linear Regression

In [10]:
X=df.lstat.values.reshape(-1,1)   # predictor as a column vector, as sklearn expects
Y=df.medv.values.reshape(-1,1)   # response
In [11]:
model=lm.LinearRegression()
model.fit(X,Y)
Out[11]:
LinearRegression()
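scikit-learn stores the estimates on the fitted object itself; a quick check (not shown in the original run) that should roughly match the statsmodels table below:

print(model.intercept_)  # intercept, expected near 34.55
print(model.coef_)       # slope on lstat, expected near -0.95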
In [12]:
X=sm.add_constant(X)   # statsmodels OLS needs the intercept column added explicitly
In [13]:
model=sm.OLS(Y,X)
In [14]:
result=model.fit()
In [15]:
result.summary2()
Out[15]:
Model: OLS Adj. R-squared: 0.543
Dependent Variable: y AIC: 3286.9750
Date: 2021-02-25 16:35 BIC: 3295.4280
No. Observations: 506 Log-Likelihood: -1641.5
Df Model: 1 F-statistic: 601.6
Df Residuals: 504 Prob (F-statistic): 5.08e-88
R-squared: 0.544 Scale: 38.636
Coef. Std.Err. t P>|t| [0.025 0.975]
const 34.5538 0.5626 61.4151 0.0000 33.4485 35.6592
x1 -0.9500 0.0387 -24.5279 0.0000 -1.0261 -0.8740
Omnibus: 137.043 Durbin-Watson: 0.892
Prob(Omnibus): 0.000 Jarque-Bera (JB): 291.373
Skew: 1.453 Prob(JB): 0.000
Kurtosis: 5.319 Condition No.: 30
In [56]:
newX=np.array([[1,5],[1,10], [1,15]])   # constant column plus lstat = 5, 10, 15
pre=result.get_prediction(newX)
In [17]:
pre.summary_frame()
Out[17]:
mean mean_se mean_ci_lower mean_ci_upper obs_ci_lower obs_ci_upper
0 29.803594 0.405247 29.007412 30.599776 17.565675 42.041513
1 25.053347 0.294814 24.474132 25.632563 12.827626 37.279068
2 20.303101 0.290893 19.731588 20.874613 8.077742 32.528459
In [18]:
pre.conf_int()
Out[18]:
array([[29.00741194, 30.59977628],
       [24.47413202, 25.63256267],
       [19.73158815, 20.87461299]])
In [19]:
pre.conf_int(obs=True)
Out[19]:
array([[17.56567478, 42.04151344],
       [12.82762635, 37.27906833],
       [ 8.0777421 , 32.52845905]])
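To see both intervals at once, here is a sketch (my addition, reusing the fitted result) that draws the regression line with its 95% confidence band and the wider prediction band:

import matplotlib.pyplot as plt
grid = sm.add_constant(np.linspace(df.lstat.min(), df.lstat.max(), 100))
frame = result.get_prediction(grid).summary_frame()
plt.scatter(df.lstat, df.medv, s=10, alpha=0.5)
plt.plot(grid[:, 1], frame["mean"], color="k")                                            # fitted line
plt.fill_between(grid[:, 1], frame["mean_ci_lower"], frame["mean_ci_upper"], alpha=0.4)   # CI for the mean
plt.fill_between(grid[:, 1], frame["obs_ci_lower"], frame["obs_ci_upper"], alpha=0.2)     # prediction interval
plt.xlabel("lstat"); plt.ylabel("medv"); plt.show()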
In [20]:
np.std(result.resid)
Out[20]:
6.20346413142642
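Note that np.std divides by n, while the residual standard error reported in textbooks divides by the residual degrees of freedom (n - 2 here); statsmodels exposes the latter directly:

np.sqrt(result.mse_resid)   # residual standard error, about 6.216 (sqrt of Scale = 38.636 above)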
In [21]:
inf=result.get_influence()
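The influence object is computed above but never inspected; a short sketch (my addition) of the diagnostics it carries, roughly analogous to hatvalues() and the studentized residuals in the R version of this lab:

import matplotlib.pyplot as plt
leverage = inf.hat_matrix_diag                 # hat values (leverage)
student = inf.resid_studentized_internal       # internally studentized residuals
print(np.argmax(leverage))                     # index of the highest-leverage observation
plt.scatter(leverage, student, s=10)
plt.xlabel("leverage"); plt.ylabel("studentized residuals"); plt.show()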
In [22]:
from statsmodels.compat import lzip
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
In [23]:
model2=ols("medv~lstat",data=df).fit()
In [24]:
model2.summary2()
Out[24]:
Model: OLS Adj. R-squared: 0.543
Dependent Variable: medv AIC: 3286.9750
Date: 2021-02-25 16:35 BIC: 3295.4280
No. Observations: 506 Log-Likelihood: -1641.5
Df Model: 1 F-statistic: 601.6
Df Residuals: 504 Prob (F-statistic): 5.08e-88
R-squared: 0.544 Scale: 38.636
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 34.5538 0.5626 61.4151 0.0000 33.4485 35.6592
lstat -0.9500 0.0387 -24.5279 0.0000 -1.0261 -0.8740
Omnibus: 137.043 Durbin-Watson: 0.892
Prob(Omnibus): 0.000 Jarque-Bera (JB): 291.373
Skew: 1.453 Prob(JB): 0.000
Kurtosis: 5.319 Condition No.: 30
In [25]:
fig, ax = plt.subplots()
sm.graphics.plot_fit(model2,1,ax=ax)
plt.show()
In [26]:
sns.regplot(x='lstat',y='medv', data=df)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf3d23288>
In [27]:
from statsmodels.graphics.regressionplots import abline_plot
fig, ax = plt.subplots()                  # fresh axes; the figure from In [25] was already rendered
ax.scatter(df.lstat, df.medv, s=10)       # scatter the data so the fitted line has context
abline_plot(model_results=model2, ax=ax)
Out[27]:
In [28]:
sns.residplot(x='lstat',y='medv', data=df)
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf3e7a588>
In [29]:
fig,(ax0,ax1)=plt.subplots(1,2)
sns.regplot(x='lstat',y='medv', data=df,ax=ax0)
sns.residplot(x='lstat',y='medv', data=df,ax=ax1)
#abline_plot(model_results=model2,ax=axs[0,0])
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccf3efb848>
In [30]:
sm.graphics.plot_partregress_grid(model2)
Out[30]:

Multiple Linear Regression

In [32]:
mul_model=ols("medv~lstat+age",data=df)
In [33]:
res_mm=mul_model.fit()
In [34]:
res_mm.summary2()
Out[34]:
Model: OLS Adj. R-squared: 0.549
Dependent Variable: medv AIC: 3281.0064
Date: 2021-02-25 16:35 BIC: 3293.6860
No. Observations: 506 Log-Likelihood: -1637.5
Df Model: 2 F-statistic: 309.0
Df Residuals: 503 Prob (F-statistic): 2.98e-88
R-squared: 0.551 Scale: 38.108
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 33.2228 0.7308 45.4579 0.0000 31.7869 34.6586
lstat -1.0321 0.0482 -21.4163 0.0000 -1.1267 -0.9374
age 0.0345 0.0122 2.8256 0.0049 0.0105 0.0586
Omnibus: 124.288 Durbin-Watson: 0.945
Prob(Omnibus): 0.000 Jarque-Bera (JB): 244.026
Skew: 1.362 Prob(JB): 0.000
Kurtosis: 5.038 Condition No.: 201
In [35]:
formula="medv~"+"+".join(df.columns[df.columns!='medv'].values)   # regress medv on all other columns
mul_modelAll=ols(formula,data=df)
res_mmAll=mul_modelAll.fit()
res_mmAll.summary2()
Out[35]:
Model: OLS Adj. R-squared: 0.734
Dependent Variable: medv AIC: 3025.6086
Date: 2021-02-25 16:35 BIC: 3084.7801
No. Observations: 506 Log-Likelihood: -1498.8
Df Model: 13 F-statistic: 108.1
Df Residuals: 492 Prob (F-statistic): 6.72e-135
R-squared: 0.741 Scale: 22.518
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 36.4595 5.1035 7.1441 0.0000 26.4322 46.4868
crim -0.1080 0.0329 -3.2865 0.0011 -0.1726 -0.0434
zn 0.0464 0.0137 3.3816 0.0008 0.0194 0.0734
indus 0.0206 0.0615 0.3343 0.7383 -0.1003 0.1414
chas 2.6867 0.8616 3.1184 0.0019 0.9939 4.3796
nox -17.7666 3.8197 -4.6513 0.0000 -25.2716 -10.2616
rm 3.8099 0.4179 9.1161 0.0000 2.9887 4.6310
age 0.0007 0.0132 0.0524 0.9582 -0.0253 0.0266
dis -1.4756 0.1995 -7.3980 0.0000 -1.8675 -1.0837
rad 0.3060 0.0663 4.6129 0.0000 0.1757 0.4364
tax -0.0123 0.0038 -3.2800 0.0011 -0.0197 -0.0049
ptratio -0.9527 0.1308 -7.2825 0.0000 -1.2098 -0.6957
black 0.0093 0.0027 3.4668 0.0006 0.0040 0.0146
lstat -0.5248 0.0507 -10.3471 0.0000 -0.6244 -0.4251
Omnibus: 178.041 Durbin-Watson: 1.078
Prob(Omnibus): 0.000 Jarque-Bera (JB): 783.126
Skew: 1.521 Prob(JB): 0.000
Kurtosis: 8.281 Condition No.: 15114
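Individual fit statistics can be pulled off the results object rather than read from the table, for example:

print(res_mmAll.rsquared)             # 0.741
print(np.sqrt(res_mmAll.mse_resid))   # residual standard error
print(res_mmAll.fvalue)               # F-statistic, 108.1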
In [36]:
wantedColumns=set(df.columns.values).difference(set(['medv','indus']))   # drop the response and indus (p = 0.74 above)
formula="medv~"+"+".join(wantedColumns)
mul_modelAllRefined=ols(formula,data=df)
res_mmAllRefined=mul_modelAllRefined.fit()
res_mmAllRefined.summary2()
Out[36]:
Model: OLS Adj. R-squared: 0.734
Dependent Variable: medv AIC: 3023.7235
Date: 2021-02-25 16:35 BIC: 3078.6685
No. Observations: 506 Log-Likelihood: -1498.9
Df Model: 12 F-statistic: 117.3
Df Residuals: 493 Prob (F-statistic): 6.42e-136
R-squared: 0.741 Scale: 22.477
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 36.3639 5.0908 7.1430 0.0000 26.3614 46.3663
age 0.0007 0.0132 0.0528 0.9579 -0.0252 0.0266
chas 2.7164 0.8562 3.1725 0.0016 1.0341 4.3987
zn 0.0459 0.0136 3.3681 0.0008 0.0191 0.0727
crim -0.1084 0.0328 -3.3042 0.0010 -0.1729 -0.0439
nox -17.4295 3.6809 -4.7351 0.0000 -24.6618 -10.1973
rad 0.2999 0.0637 4.7101 0.0000 0.1748 0.4250
lstat -0.5235 0.0505 -10.3611 0.0000 -0.6227 -0.4242
rm 3.7970 0.4158 9.1323 0.0000 2.9801 4.6139
ptratio -0.9471 0.1296 -7.3075 0.0000 -1.2017 -0.6924
black 0.0093 0.0027 3.4607 0.0006 0.0040 0.0146
tax -0.0118 0.0034 -3.4888 0.0005 -0.0184 -0.0051
dis -1.4896 0.1948 -7.6477 0.0000 -1.8724 -1.1069
Omnibus: 178.124 Durbin-Watson: 1.079
Prob(Omnibus): 0.000 Jarque-Bera (JB): 784.481
Skew: 1.521 Prob(JB): 0.000
Kurtosis: 8.287 Condition No.: 14978
In [37]:
fig = plt.figure(figsize=(20, 20))   # one large canvas so all 13 partial-regression panels fit
sm.graphics.plot_partregress_grid(res_mmAll, fig=fig)
D:\apps\Anaconda3\envs\dev\lib\site-packages\statsmodels\graphics\regressionplots.py:561: UserWarning: Tight layout not applied. tight_layout cannot make axes height small enough to accommodate all axes decorations
  fig.tight_layout()
Out[37]:
In [38]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
In [39]:
X=df.iloc[:,0:-1].values   # all predictors; the response medv is the last column and is excluded
#{vif(X,i)  for i in range(X.shape[1])}
{n:vif(X,i)  for n,i in zip(df.columns[:-1].values,range(X.shape[1]))}
Out[39]:
{'crim': 2.1003728199615233,
 'zn': 2.8440132669462637,
 'indus': 14.485757706539308,
 'chas': 1.1529518589418775,
 'nox': 73.89494652814788,
 'rm': 77.94828304638538,
 'age': 21.38685048994314,
 'dis': 14.6996523837492,
 'rad': 15.167724857920897,
 'tax': 61.227274009649456,
 'ptratio': 85.02954731061801,
 'black': 20.104942636229136,
 'lstat': 11.102024772203539}
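These values are noticeably larger than the ones usually quoted for this data. The reason is that variance_inflation_factor regresses each column on the others exactly as given, without adding an intercept, so a constant column must be supplied explicitly. A sketch of the corrected computation, which should give the familiar textbook values (crim around 1.8, ptratio around 1.8, rad around 7.5):

X_c = sm.add_constant(df.iloc[:, 0:-1])                              # predictors plus an explicit intercept column
{n: vif(X_c.values, i + 1) for i, n in enumerate(X_c.columns[1:])}   # skip the const column itself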
In [40]:
{n:vif(df.values,i)  for n,i in zip(df.columns.values,range(df.shape[1]))}
Out[40]:
{'crim': 2.1314042398916406,
 'zn': 2.9100040831335887,
 'indus': 14.485874343285056,
 'chas': 1.1762659532928834,
 'nox': 74.00426852716569,
 'rm': 136.1017425464841,
 'age': 21.39886253165994,
 'dis': 15.430455486916713,
 'rad': 15.36997984977581,
 'tax': 61.93971317434026,
 'ptratio': 87.22723281176584,
 'black': 21.35101506934191,
 'lstat': 12.615187641345385,
 'medv': 24.503206285254794}
In [41]:
wantedColumns=set(df.columns.values).difference(set(['medv','indus','rm']))   # additionally drop rm, which showed a high VIF above
formula="medv~"+"+".join(wantedColumns)
mul_modelAllRefined2=ols(formula,data=df)
res_mmAllRefined2=mul_modelAllRefined2.fit()
res_mmAllRefined2.summary2()
Out[41]:
Model: OLS Adj. R-squared: 0.690
Dependent Variable: medv AIC: 3100.8063
Date: 2021-02-25 16:36 BIC: 3151.5247
No. Observations: 506 Log-Likelihood: -1538.4
Df Model: 11 F-statistic: 103.2
Df Residuals: 494 Prob (F-statistic): 2.45e-120
R-squared: 0.697 Scale: 26.226
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 69.7816 3.8230 18.2529 0.0000 62.2701 77.2930
age 0.0257 0.0139 1.8437 0.0658 -0.0017 0.0531
chas 2.8877 0.9247 3.1230 0.0019 1.0710 4.7045
zn 0.0668 0.0145 4.5966 0.0000 0.0382 0.0953
crim -0.1153 0.0354 -3.2545 0.0012 -0.1849 -0.0457
nox -21.5501 3.9461 -5.4611 0.0000 -29.3034 -13.7969
rad 0.4115 0.0675 6.0975 0.0000 0.2789 0.5441
lstat -0.7753 0.0457 -16.9550 0.0000 -0.8651 -0.6855
ptratio -1.1526 0.1379 -8.3599 0.0000 -1.4234 -0.8817
black 0.0068 0.0029 2.3524 0.0190 0.0011 0.0124
tax -0.0157 0.0036 -4.3473 0.0000 -0.0228 -0.0086
dis -1.6983 0.2090 -8.1276 0.0000 -2.1088 -1.2877
Omnibus: 125.993 Durbin-Watson: 1.244
Prob(Omnibus): 0.000 Jarque-Bera (JB): 294.346
Skew: 1.279 Prob(JB): 0.000
Kurtosis: 5.724 Condition No.: 12346

Interaction Terms

In [42]:
mul_model_i=ols("medv~lstat*age",data=df).fit()
In [43]:
mul_model_i.summary2()
Out[43]:
Model: OLS Adj. R-squared: 0.553
Dependent Variable: medv AIC: 3277.9547
Date: 2021-02-25 16:36 BIC: 3294.8609
No. Observations: 506 Log-Likelihood: -1635.0
Df Model: 3 F-statistic: 209.3
Df Residuals: 502 Prob (F-statistic): 4.86e-88
R-squared: 0.556 Scale: 37.804
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 36.0885 1.4698 24.5528 0.0000 33.2007 38.9763
lstat -1.3921 0.1675 -8.3134 0.0000 -1.7211 -1.0631
age -0.0007 0.0199 -0.0363 0.9711 -0.0398 0.0383
lstat:age 0.0042 0.0019 2.2443 0.0252 0.0005 0.0078
Omnibus: 135.601 Durbin-Watson: 0.965
Prob(Omnibus): 0.000 Jarque-Bera (JB): 296.955
Skew: 1.417 Prob(JB): 0.000
Kurtosis: 5.461 Condition No.: 6878
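In patsy formula notation, lstat*age expands to the two main effects plus their interaction, so the model above could equivalently be written as:

ols("medv ~ lstat + age + lstat:age", data=df).fit().params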

Non-Linear Transformation of the Predictors

In [44]:
mul_model_non_lin=ols("medv~lstat+I(lstat**2)",data=df).fit()
In [45]:
mul_model_non_lin.summary2()
Out[45]:
Model: OLS Adj. R-squared: 0.639
Dependent Variable: medv AIC: 3168.5160
Date: 2021-02-25 16:36 BIC: 3181.1956
No. Observations: 506 Log-Likelihood: -1581.3
Df Model: 2 F-statistic: 448.5
Df Residuals: 503 Prob (F-statistic): 1.56e-112
R-squared: 0.641 Scale: 30.511
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 42.8620 0.8721 49.1489 0.0000 41.1486 44.5754
lstat -2.3328 0.1238 -18.8430 0.0000 -2.5761 -2.0896
I(lstat ** 2) 0.0435 0.0037 11.6275 0.0000 0.0362 0.0509
Omnibus: 107.006 Durbin-Watson: 0.921
Prob(Omnibus): 0.000 Jarque-Bera (JB): 228.388
Skew: 1.128 Prob(JB): 0.000
Kurtosis: 5.397 Condition No.: 1135
In [46]:
from statsmodels.stats.anova import anova_lm
model2=ols("medv~lstat",data=df).fit()
In [47]:
anova_lm(model2, mul_model_non_lin)
D:\apps\Anaconda3\envs\dev\lib\site-packages\scipy\stats\_distn_infrastructure.py:1932: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= _a)
Out[47]:
df_resid ssr df_diff ss_diff F Pr(>F)
0 504.0 19472.381418 0.0 NaN NaN NaN
1 503.0 15347.243158 1.0 4125.13826 135.199822 7.630116e-28
In [48]:
model_poly=ols("medv~I(lstat**1)+I(lstat**2)+I(lstat**3)+I(lstat**4)+I(lstat**5)",data=df).fit()
In [49]:
model_poly.summary2()
Out[49]:
Model: OLS Adj. R-squared: 0.679
Dependent Variable: medv AIC: 3113.2474
Date: 2021-02-25 16:36 BIC: 3138.6066
No. Observations: 506 Log-Likelihood: -1550.6
Df Model: 5 F-statistic: 214.2
Df Residuals: 500 Prob (F-statistic): 8.73e-122
R-squared: 0.682 Scale: 27.194
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 67.6997 3.6043 18.7831 0.0000 60.6182 74.7811
I(lstat ** 1) -11.9911 1.5257 -7.8593 0.0000 -14.9887 -8.9935
I(lstat ** 2) 1.2728 0.2232 5.7034 0.0000 0.8344 1.7113
I(lstat ** 3) -0.0683 0.0144 -4.7471 0.0000 -0.0965 -0.0400
I(lstat ** 4) 0.0017 0.0004 4.1426 0.0000 0.0009 0.0025
I(lstat ** 5) -0.0000 0.0000 -3.6919 0.0002 -0.0000 -0.0000
Omnibus: 144.085 Durbin-Watson: 0.987
Prob(Omnibus): 0.000 Jarque-Bera (JB): 494.545
Skew: 1.292 Prob(JB): 0.000
Kurtosis: 7.096 Condition No.: 137417262
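The condition number of roughly 1.4e8 reflects how collinear the raw powers of lstat are. One mitigation (my variation, not part of the original lab) is to standardize lstat before forming the powers; the fit is the same up to reparameterization but far better conditioned:

df_z = df.assign(z=(df.lstat - df.lstat.mean()) / df.lstat.std())   # hypothetical helper column 'z'
ols("medv ~ z + I(z**2) + I(z**3) + I(z**4) + I(z**5)", data=df_z).fit().condition_number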

Qualitative Predictors

In [50]:
car_df=pd.read_csv('data/Carseats.csv', index_col=0)
In [51]:
car_df.describe()
Out[51]:
Sales CompPrice Income Advertising Population Price Age Education
count 400.000000 400.000000 400.000000 400.000000 400.000000 400.000000 400.000000 400.000000
mean 7.496325 124.975000 68.657500 6.635000 264.840000 115.795000 53.322500 13.900000
std 2.824115 15.334512 27.986037 6.650364 147.376436 23.676664 16.200297 2.620528
min 0.000000 77.000000 21.000000 0.000000 10.000000 24.000000 25.000000 10.000000
25% 5.390000 115.000000 42.750000 0.000000 139.000000 100.000000 39.750000 12.000000
50% 7.490000 125.000000 69.000000 5.000000 272.000000 117.000000 54.500000 14.000000
75% 9.320000 135.000000 91.000000 12.000000 398.500000 131.000000 66.000000 16.000000
max 16.270000 175.000000 120.000000 29.000000 509.000000 191.000000 80.000000 18.000000
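describe() only summarizes the numeric columns, which is why the three qualitative predictors are missing from the table above; they can be summarized separately:

car_df.describe(include="object")   # counts, unique levels, and modes for ShelveLoc, Urban, US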
In [71]:
#car_df.boxplot()
sns.set(rc={'figure.figsize':(9,4)})
sns.boxplot(data=car_df)
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ccfff3b248>
In [59]:
car_df.head()
Out[59]:
Sales CompPrice Income Advertising Population Price ShelveLoc Age Education Urban US
1 9.50 138 73 11 276 120 Bad 42 17 Yes Yes
2 11.22 111 48 16 260 83 Good 65 10 Yes Yes
3 10.06 113 35 10 269 80 Medium 59 12 Yes Yes
4 7.40 117 100 4 466 97 Medium 55 14 Yes Yes
5 4.15 141 64 3 340 128 Bad 38 13 Yes No
In [58]:
features="+".join(set(car_df.columns.values).difference(["Sales"]))
print(features)
cat_model=ols("Sales~"+features+"+Income:Advertising+Price:Age",data=car_df).fit()
Advertising+Population+Income+Age+Education+US+Price+ShelveLoc+Urban+CompPrice
In [55]:
cat_model.summary2()
Out[55]:
Model: OLS Adj. R-squared: 0.872
Dependent Variable: Sales AIC: 1157.3378
Date: 2021-02-25 16:36 BIC: 1213.2183
No. Observations: 400 Log-Likelihood: -564.67
Df Model: 13 F-statistic: 210.0
Df Residuals: 386 Prob (F-statistic): 6.14e-166
R-squared: 0.876 Scale: 1.0213
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 6.5756 1.0087 6.5185 0.0000 4.5922 8.5589
US[T.Yes] -0.1576 0.1489 -1.0580 0.2907 -0.4504 0.1352
ShelveLoc[T.Good] 4.8487 0.1528 31.7243 0.0000 4.5482 5.1492
ShelveLoc[T.Medium] 1.9533 0.1258 15.5307 0.0000 1.7060 2.2005
Urban[T.Yes] 0.1402 0.1124 1.2470 0.2132 -0.0808 0.3612
Advertising 0.0702 0.0226 3.1070 0.0020 0.0258 0.1147
Population 0.0002 0.0004 0.4329 0.6653 -0.0006 0.0009
Income 0.0109 0.0026 4.1828 0.0000 0.0058 0.0160
Age -0.0579 0.0160 -3.6329 0.0003 -0.0893 -0.0266
Education -0.0209 0.0196 -1.0632 0.2884 -0.0594 0.0177
Price -0.1008 0.0074 -13.5494 0.0000 -0.1154 -0.0862
CompPrice 0.0929 0.0041 22.5668 0.0000 0.0848 0.1010
Income:Advertising 0.0008 0.0003 2.6976 0.0073 0.0002 0.0013
Price:Age 0.0001 0.0001 0.8007 0.4238 -0.0002 0.0004
Omnibus: 1.281 Durbin-Watson: 2.047
Prob(Omnibus): 0.527 Jarque-Bera (JB): 1.147
Skew: 0.129 Prob(JB): 0.564
Kurtosis: 3.050 Condition No.: 130576
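patsy builds the dummy variables behind the scenes, using treatment coding with the alphabetically first level (Bad) as the baseline, which is why the table shows ShelveLoc[T.Good] and ShelveLoc[T.Medium]. To look at the coding directly (roughly what contrasts() shows in the R version of this lab):

pd.get_dummies(car_df["ShelveLoc"], drop_first=True).head()   # 'Bad' is the dropped baseline level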
