In [179]:
%config IPCompleter.greedy=True
%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import seaborn as sns
from pca import pca # https://pypi.org/project/pca/
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans,AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, complete, single, ward
import matplotlib.pyplot as plt
In [110]:
usArrest=pd.read_csv(r"data/USArrests.csv",index_col=0)
In [111]:
usArrest.head()
Out[111]:
In [9]:
usArrest.info()
In [10]:
usArrest.describe()
Out[10]:
In [11]:
sns.pairplot(usArrest)
Out[11]:
Lab 1: Principal Component Analysis¶
In [112]:
scaler=StandardScaler()
X=scaler.fit_transform(usArrest.values)
print(f'mean={np.mean(X,axis=0)}')
print(f'std={np.std(X,axis=0)}')
In [113]:
pca4=PCA(n_components=4)
pca4.fit(X)
Out[113]:
In [14]:
pca4.components_
Out[14]:
In [15]:
print(f'explained_var={pca4.explained_variance_}')
print(f'explained_var %={pca4.explained_variance_ratio_*100}')
print(f'mean={pca4.mean_}')
print(f'singular values (\u03BB)={pca4.singular_values_}')
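As a sanity check, the explained variances can be recovered from the singular values: for n samples, the explained variance equals λ²/(n−1). A minimal verification sketch, reusing the pca4 fit above:
In [ ]:
# Sanity check (sketch): squared singular values / (n-1) should match explained_variance_
n = X.shape[0]
print(pca4.singular_values_**2 / (n - 1))
print(pca4.explained_variance_)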
In [114]:
XHat=pca4.transform(X)
print(f'Xhat shape={XHat.shape}')
print(f'XHat mean={np.mean(XHat,axis=0)}')
print(f'XHat std={np.std(XHat,axis=0)}')
In [118]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.scatter(XHat[:,0],-XHat[:,1],alpha=0.2)
for i, state in enumerate(usArrest.index.values):
    ax.annotate(state, (XHat[i,0], -XHat[i,1]))
features = usArrest.columns
loadings = pca4.components_.T * np.sqrt(pca4.explained_variance_)
for i, feature in enumerate(features):
    plt.arrow(0, 0, loadings[i,0], -loadings[i,1], color='r', alpha=0.5)
    plt.text(loadings[i,0]*1.15, -loadings[i,1]*1.15, feature, color='g', ha='center', va='center')
The first component (x-axis) is dominated by the Murder, Assault, and Rape loadings, so it roughly measures overall crime. The second component is dominated by UrbanPop, so it captures the level of urbanization as the next most important dimension.
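One way to back this reading is to tabulate the loadings directly; a minimal sketch, assuming the pca4 fit and usArrest columns from above:
In [ ]:
# Loadings table (sketch): rows are features, columns are principal components
loadings_df = pd.DataFrame(pca4.components_.T,
                           index=usArrest.columns,
                           columns=['PC1', 'PC2', 'PC3', 'PC4'])
print(loadings_df.round(3))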
Plot with the pca library (https://pypi.org/project/pca/)¶
In [31]:
model = pca(n_components=4)
# Fit transform
results = model.fit_transform(X)
# Plot explained variance
fig, ax = model.plot()
# Scatter first 2 PCs
fig, ax = model.scatter()
# Make biplot with the number of features
fig, ax = model.biplot(n_feat=4)
PCA as Dimensionality Reduction¶
In [19]:
pca1=PCA(n_components=1)
pca1.fit(X)
XHat1=pca1.transform(X)
X_new1=pca1.inverse_transform(XHat1)
print(f"original shape:{X.shape}")
print(f"transformed shape:{XHat1.shape}")
plt.scatter(X[:,0],X[:,1],alpha=0.2)
plt.scatter(X_new1[:,0],X_new1[:,1],alpha=0.8)
plt.axis('equal')
Out[19]:
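How much is lost by keeping a single component? A short sketch: since X is standardized, the mean squared reconstruction error should match the variance ratio of the dropped components (pca1 and X_new1 as above):
In [ ]:
# Reconstruction error of the 1-component projection (sketch)
mse = np.mean((X - X_new1) ** 2)
print(f'reconstruction MSE={mse:.4f}')
# for standardized X this matches the variance ratio of the discarded components
print(f'discarded variance ratio={1 - pca1.explained_variance_ratio_[0]:.4f}')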
Lab 2: Clustering¶
K-Means Clustering¶
In [59]:
np.random.seed(123)
X=np.random.normal(0,1,size=(50,2))
X[0:25,0]=X[0:25,0]+3
X[0:25,1]=X[0:25,1]-4
plt.scatter(X[:,0],X[:,1])
Out[59]:
In [61]:
kmean=KMeans(n_clusters=2,random_state=0).fit(X)
In [90]:
print(f"cluester_centers:{kmean.cluster_centers_}")
print(f"labels:{kmean.labels_}")
print(f"inertia:{kmean.inertia_}")
print(f"#iter:{kmean.n_iter_}")
plt.scatter(X[:,0],X[:,1],alpha=0.3,c=kmean.labels_)
plt.scatter(kmean.cluster_centers_[:,0],kmean.cluster_centers_[:,1],c=['r','b'],marker='*',s=80)
Out[90]:
In [91]:
kmean3=KMeans(n_clusters=3,random_state=0).fit(X)
print(f"cluester_centers:{kmean3.cluster_centers_}")
print(f"labels:{kmean3.labels_}")
print(f"inertia:{kmean3.inertia_}")
print(f"#iter:{kmean3.n_iter_}")
plt.scatter(X[:,0],X[:,1],alpha=0.3,c=kmean3.labels_)
plt.scatter(kmean3.cluster_centers_[:,0],kmean3.cluster_centers_[:,1],c=['r','b','g'],marker='*',s=80)
Out[91]:
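K-means only finds a local optimum, so it is worth running several random initializations; sklearn's n_init parameter does this and keeps the run with the lowest inertia. A quick sketch on the simulated X:
In [ ]:
# Compare a single initialization against 20 restarts (sketch); lower inertia is better
km1 = KMeans(n_clusters=3, n_init=1, random_state=1).fit(X)
km20 = KMeans(n_clusters=3, n_init=20, random_state=1).fit(X)
print(f'inertia with n_init=1 : {km1.inertia_:.3f}')
print(f'inertia with n_init=20: {km20.inertia_:.3f}')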
Hierarchical Clustering¶
In [122]:
agg_cluster_complete=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='complete').fit(X)
agg_cluster_single=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='single').fit(X)
agg_cluster_ward=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='ward').fit(X)
In [120]:
print(f'#clusters:{agg_cluster_complete.n_clusters_}')
print(f'#labels:{agg_cluster_complete.labels_}')
print(f'#leaves:{agg_cluster_complete.n_leaves_}')
In [109]:
print(f'#clusters:{agg_cluster_single.n_clusters_}')
print(f'#labels:{agg_cluster_single.labels_}')
print(f'#leaves:{agg_cluster_single.n_leaves_}')
In [160]:
def plot_dendrogram(model, **kwargs):
    # Create the linkage matrix and then plot the dendrogram.
    # First, count the samples under each node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
In [123]:
plt.title('Hierarchical Clustering Complete linkage Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(agg_cluster_complete, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
In [124]:
plt.title('Hierarchical Clustering Single linkage Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(agg_cluster_single, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
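With distance_threshold=0 every observation ends up in its own leaf, so a flat clustering requires cutting the tree. One option is scipy's fcluster on the same linkage matrix; a sketch cutting the complete-linkage tree into two clusters:
In [ ]:
from scipy.cluster.hierarchy import fcluster
# Cut the complete-linkage tree into two flat clusters (sketch)
flat_labels = fcluster(complete(X), t=2, criterion='maxclust')
print(f'flat labels:{flat_labels}')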
NCI60 Data Example¶
In [120]:
nci_labels=pd.read_csv('data/NCI60_Y.csv',index_col=0)
nci_data=pd.read_csv('data/NCI60_X.csv', index_col=0)
In [121]:
nci_labels.info()
In [122]:
print(nci_labels.x.unique())
nci_labels.head()
Out[122]:
In [123]:
nci_data.info()
In [124]:
nci_data.head()
Out[124]:
PCA on the NCI60 Data¶
In [125]:
scaler=StandardScaler()
X=scaler.fit_transform(nci_data.values)
pca_nc=PCA()
pca_nc.fit(X)
Out[125]:
In [126]:
pca_nc.components_
Out[126]:
In [127]:
df_variance=pd.DataFrame({'explained_var':pca_nc.explained_variance_,
'explained_var_percentage':pca_nc.explained_variance_ratio_*100,
'explained_var_cum_percentage':np.cumsum(pca_nc.explained_variance_ratio_*100)})
In [128]:
sns.set(rc={'figure.figsize':(14,4)})
sns.barplot(x=list(range(df_variance.shape[0])),y='explained_var',data=df_variance).set_title('Explained Variance')
Out[128]:
In [129]:
sns.set(rc={'figure.figsize':(14,4)})
sns.barplot(x=list(range(df_variance.shape[0])),y='explained_var_percentage',data=df_variance).set_title('Explained Variance %')
Out[129]:
In [130]:
sns.set(rc={'figure.figsize':(14,4)})
sns.pointplot(x=list(range(df_variance.shape[0])),y='explained_var_cum_percentage',data=df_variance).set_title('Explained Cum Variance %')
Out[130]:
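The cumulative curve can also be read off programmatically, e.g. to find how many components reach a given coverage; a sketch with an illustrative 80% threshold:
In [ ]:
# Number of components needed to reach 80% cumulative explained variance (sketch)
cum = np.cumsum(pca_nc.explained_variance_ratio_) * 100
print(f'components for 80% variance: {np.argmax(cum >= 80) + 1}')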
In [131]:
XHat=pca_nc.transform(X)
print(f'Xhat shape={XHat.shape}')
In [154]:
sns.set(rc={'figure.figsize':(10,5)})
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,5))
df_pcas=pd.DataFrame({'Z1':XHat[:,0],
'Z2':-XHat[:,1],
'Z3':XHat[:,2],
'Labels':nci_labels.x})
g=sns.scatterplot(x='Z1',y='Z2',hue='Labels',data=df_pcas, ax=ax1,legend=False)
g.set_title('Projection of NCI60 to Z1 and Z2 components')
g=sns.scatterplot(x='Z1',y='Z3',hue='Labels',data=df_pcas,ax=ax2)
g.set_title('Projection of NCI60 to Z1 and Z3 components')
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)
Out[154]:
Clustering the Observations of the NCI60 Data¶
In [155]:
agg_nci_complete=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='complete').fit(X)
agg_nci_single=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='single').fit(X)
agg_nci_ward=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='ward').fit(X)
In [156]:
print(f'#clusters:{agg_nci_complete.n_clusters_}')
print(f'#labels:{agg_nci_complete.labels_}')
print(f'#leaves:{agg_nci_complete.n_leaves_}')
In [157]:
print(f'#clusters:{agg_nci_single.n_clusters_}')
print(f'#labels:{agg_nci_single.labels_}')
print(f'#leaves:{agg_nci_single.n_leaves_}')
In [158]:
print(f'#clusters:{agg_nci_ward.n_clusters_}')
print(f'#labels:{agg_nci_ward.labels_}')
print(f'#leaves:{agg_nci_ward.n_leaves_}')
In [222]:
fig, (ax1,ax2,ax3) = plt.subplots(1, 3, figsize=(17, 15))
ax1.title.set_text('NCI60 Complete Linkage Dendrogram')
dendrogram(complete(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax1)
ax2.title.set_text('NCI60 Ward Linkage Dendrogram')
dendrogram(ward(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax2)
ax3.title.set_text('NCI60 Single Linkage Dendrogram')
dendrogram(single(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax3)
plt.show()
In [254]:
fig, axes = plt.subplots(1,1, figsize=(6,15))
plt.title('NCI60 Complete Linkage Dendrogram')
lenX=nci_labels.shape[0]
dendrogram(complete(X),labels=list(nci_labels.x), color_threshold=139, leaf_font_size=9, orientation='right')
plt.vlines(140,0,plt.gca().yaxis.get_data_interval()[1], colors='b', linestyles='dashed')
plt.show()
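Cutting at this height yields four clusters; cross-tabulating them against the cancer-type labels shows how the groups line up. A sketch using scipy's fcluster:
In [ ]:
from scipy.cluster.hierarchy import fcluster
# Four flat clusters from the complete-linkage tree vs. cancer types (sketch)
hc4 = fcluster(complete(X), t=4, criterion='maxclust')
print(pd.crosstab(hc4, nci_labels.x))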
In [260]:
nci_kmean=KMeans(n_clusters=4,random_state=0).fit(X)
print(f"labels:{nci_kmean.labels_}")
In [ ]: