In [179]:
%config IPCompleter.greedy=True
%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import seaborn as sns
from pca import pca # https://pypi.org/project/pca/
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans,AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, complete, single, ward
import matplotlib.pyplot as plt
In [110]:
usArrest=pd.read_csv(r"data/USArrests.csv",index_col=0)
In [111]:
usArrest.head()
Out[111]:
In [9]:
usArrest.info()
In [10]:
usArrest.describe()
Out[10]:
In [11]:
sns.pairplot(usArrest)
Out[11]:
Lab 1: Principal Component Analysis¶
In [112]:
scaler=StandardScaler()
X=scaler.fit_transform(usArrest.values)
print(f'mean={np.mean(X,axis=0)}')
print(f'std={np.std(X,axis=0)}')
In [113]:
pca4=PCA(n_components=4)
pca4.fit(X)
Out[113]:
In [14]:
pca4.components_
Out[14]:
In [15]:
print(f'explained_var={pca4.explained_variance_}')
print(f'explained_var %={pca4.explained_variance_ratio_*100}')
print(f'mean={pca4.mean_}')
print(f'singular values (\u03BB)={pca4.singular_values_}')
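As a sanity check, the explained variances can be recovered from the singular values: for n samples, the explained variance equals λ²/(n−1). A minimal verification sketch, reusing the pca4 fit above:
In [ ]:
# Sanity check (sketch): squared singular values / (n-1) should match explained_variance_
n = X.shape[0]
print(pca4.singular_values_**2 / (n - 1))
print(pca4.explained_variance_)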
In [114]:
XHat=pca4.transform(X)
print(f'Xhat shape={XHat.shape}')
print(f'XHat mean={np.mean(XHat,axis=0)}')
print(f'XHat std={np.std(XHat,axis=0)}')
In [118]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.scatter(XHat[:,0],-XHat[:,1],alpha=0.2)
for i, state in enumerate(usArrest.index.values):
    ax.annotate(state, (XHat[i,0], -XHat[i,1]))
features = usArrest.columns
loadings = pca4.components_.T * np.sqrt(pca4.explained_variance_)
for i, feature in enumerate(features):
    plt.arrow(0, 0, loadings[i,0], -loadings[i,1], color='r', alpha=0.5)
    plt.text(loadings[i,0]*1.15, -loadings[i,1]*1.15, feature, color='g', ha='center', va='center')
The first component (x-axis) is dominated by the Murder, Assault, and Rape loadings, so it roughly measures overall crime. The second component is dominated by UrbanPop, so it captures the level of urbanization as the next most important dimension.
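One way to back this reading is to tabulate the loadings directly; a minimal sketch, assuming the pca4 fit and usArrest columns from above:
In [ ]:
# Loadings table (sketch): rows are features, columns are principal components
loadings_df = pd.DataFrame(pca4.components_.T,
                           index=usArrest.columns,
                           columns=['PC1', 'PC2', 'PC3', 'PC4'])
print(loadings_df.round(3))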
Plot with the pca library (https://pypi.org/project/pca/)¶
In [31]:
model = pca(n_components=4)
# Fit transform
results = model.fit_transform(X)
# Plot explained variance
fig, ax = model.plot()
# Scatter first 2 PCs
fig, ax = model.scatter()
# Make biplot with the number of features
fig, ax = model.biplot(n_feat=4)
PCA as Dimensionality Reduction¶
In [19]:
pca1=PCA(n_components=1)
pca1.fit(X)
XHat1=pca1.transform(X)
X_new1=pca1.inverse_transform(XHat1)
print(f"original shape:{X.shape}")
print(f"transformed shape:{XHat1.shape}")
plt.scatter(X[:,0],X[:,1],alpha=0.2)
plt.scatter(X_new1[:,0],X_new1[:,1],alpha=0.8)
plt.axis('equal')
Out[19]:
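How much is lost by keeping a single component? A short sketch: since X is standardized, the mean squared reconstruction error should match the variance ratio of the dropped components (pca1 and X_new1 as above):
In [ ]:
# Reconstruction error of the 1-component projection (sketch)
mse = np.mean((X - X_new1) ** 2)
print(f'reconstruction MSE={mse:.4f}')
# for standardized X this matches the variance ratio of the discarded components
print(f'discarded variance ratio={1 - pca1.explained_variance_ratio_[0]:.4f}')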
Lab 2: Clustering¶
K-Means Clustering¶
In [59]:
np.random.seed(123)
X=np.random.normal(0,1,size=(50,2))
X[0:25,0]=X[0:25,0]+3
X[0:25,1]=X[0:25,1]-4
plt.scatter(X[:,0],X[:,1])
Out[59]:
In [61]:
kmean=KMeans(n_clusters=2,random_state=0).fit(X)
In [90]:
print(f"cluester_centers:{kmean.cluster_centers_}")
print(f"labels:{kmean.labels_}")
print(f"inertia:{kmean.inertia_}")
print(f"#iter:{kmean.n_iter_}")
plt.scatter(X[:,0],X[:,1],alpha=0.3,c=kmean.labels_)
plt.scatter(kmean.cluster_centers_[:,0],kmean.cluster_centers_[:,1],c=['r','b'],marker='*',s=80)
Out[90]:
In [91]:
kmean3=KMeans(n_clusters=3,random_state=0).fit(X)
print(f"cluester_centers:{kmean3.cluster_centers_}")
print(f"labels:{kmean3.labels_}")
print(f"inertia:{kmean3.inertia_}")
print(f"#iter:{kmean3.n_iter_}")
plt.scatter(X[:,0],X[:,1],alpha=0.3,c=kmean3.labels_)
plt.scatter(kmean3.cluster_centers_[:,0],kmean3.cluster_centers_[:,1],c=['r','b','g'],marker='*',s=80)
Out[91]:
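K-means only finds a local optimum, so it is worth running several random initializations; sklearn's n_init parameter does this and keeps the run with the lowest inertia. A quick sketch on the simulated X:
In [ ]:
# Compare a single initialization against 20 restarts (sketch); lower inertia is better
km1 = KMeans(n_clusters=3, n_init=1, random_state=1).fit(X)
km20 = KMeans(n_clusters=3, n_init=20, random_state=1).fit(X)
print(f'inertia with n_init=1 : {km1.inertia_:.3f}')
print(f'inertia with n_init=20: {km20.inertia_:.3f}')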
Hierarchical Clustering¶
In [122]:
agg_cluster_complete=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='complete').fit(X)
agg_cluster_single=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='single').fit(X)
agg_cluster_ward=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='ward').fit(X)
In [120]:
print(f'#clusters:{agg_cluster_complete.n_clusters_}')
print(f'#labels:{agg_cluster_complete.labels_}')
print(f'#leaves:{agg_cluster_complete.n_leaves_}')
In [109]:
print(f'#clusters:{agg_cluster_single.n_clusters_}')
print(f'#labels:{agg_cluster_single.labels_}')
print(f'#leaves:{agg_cluster_single.n_leaves_}')
In [160]:
def plot_dendrogram(model, **kwargs):
    # Create the linkage matrix and then plot the dendrogram.
    # First, count the samples under each node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
In [123]:
plt.title('Hierarchical Clustering Complete linkage Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(agg_cluster_complete, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
In [124]:
plt.title('Hierarchical Clustering Single linkage Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(agg_cluster_single, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
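With distance_threshold=0 every observation ends up in its own leaf, so a flat clustering requires cutting the tree. One option is scipy's fcluster on the same linkage matrix; a sketch cutting the complete-linkage tree into two clusters:
In [ ]:
from scipy.cluster.hierarchy import fcluster
# Cut the complete-linkage tree into two flat clusters (sketch)
flat_labels = fcluster(complete(X), t=2, criterion='maxclust')
print(f'flat labels:{flat_labels}')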
NCI60 Data Example¶
In [120]:
nci_labels=pd.read_csv('data/NCI60_Y.csv',index_col=0)
nci_data=pd.read_csv('data/NCI60_X.csv', index_col=0)
In [121]:
nci_labels.info()
In [122]:
print(nci_labels.x.unique())
nci_labels.head()
Out[122]:
In [123]:
nci_data.info()
In [124]:
nci_data.head()
Out[124]:
PCA on the NCI60 Data¶
In [125]:
scaler=StandardScaler()
X=scaler.fit_transform(nci_data.values)
pca_nc=PCA()
pca_nc.fit(X)
Out[125]:
In [126]:
pca_nc.components_
Out[126]:
In [127]:
df_variance=pd.DataFrame({'explained_var':pca_nc.explained_variance_,
'explained_var_percentage':pca_nc.explained_variance_ratio_*100,
'explained_var_cum_percentage':np.cumsum(pca_nc.explained_variance_ratio_*100)})
In [128]:
sns.set(rc={'figure.figsize':(14,4)})
sns.barplot(x=list(range(df_variance.shape[0])),y='explained_var',data=df_variance).set_title('Explained Variance')
Out[128]:
In [129]:
sns.set(rc={'figure.figsize':(14,4)})
sns.barplot(x=list(range(df_variance.shape[0])),y='explained_var_percentage',data=df_variance).set_title('Explained Variance %')
Out[129]:
In [130]:
sns.set(rc={'figure.figsize':(14,4)})
sns.pointplot(x=list(range(df_variance.shape[0])),y='explained_var_cum_percentage',data=df_variance).set_title('Explained Cum Variance %')
Out[130]:
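The cumulative curve can also be read off programmatically, e.g. to find how many components reach a given coverage; a sketch with an illustrative 80% threshold:
In [ ]:
# Number of components needed to reach 80% cumulative explained variance (sketch)
cum = np.cumsum(pca_nc.explained_variance_ratio_) * 100
print(f'components for 80% variance: {np.argmax(cum >= 80) + 1}')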
In [131]:
XHat=pca_nc.transform(X)
print(f'Xhat shape={XHat.shape}')
In [154]:
sns.set(rc={'figure.figsize':(10,5)})
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,5))
df_pcas=pd.DataFrame({'Z1':XHat[:,0],
'Z2':-XHat[:,1],
'Z3':XHat[:,2],
'Labels':nci_labels.x})
g=sns.scatterplot(x='Z1',y='Z2',hue='Labels',data=df_pcas, ax=ax1,legend=False)
g.set_title('Projection of NCI60 to Z1 and Z2 components')
g=sns.scatterplot(x='Z1',y='Z3',hue='Labels',data=df_pcas,ax=ax2)
g.set_title('Projection of NCI60 to Z1 and Z3 components')
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)
Out[154]:
Clustering the Observations of the NCI60 Data¶
In [155]:
agg_nci_complete=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='complete').fit(X)
agg_nci_single=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='single').fit(X)
agg_nci_ward=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='ward').fit(X)
In [156]:
print(f'#clusters:{agg_nci_complete.n_clusters_}')
print(f'#labels:{agg_nci_complete.labels_}')
print(f'#leaves:{agg_nci_complete.n_leaves_}')
In [157]:
print(f'#clusters:{agg_nci_single.n_clusters_}')
print(f'#labels:{agg_nci_single.labels_}')
print(f'#leaves:{agg_nci_single.n_leaves_}')
In [158]:
print(f'#clusters:{agg_nci_ward.n_clusters_}')
print(f'#labels:{agg_nci_ward.labels_}')
print(f'#leaves:{agg_nci_ward.n_leaves_}')
In [222]:
fig, (ax1,ax2,ax3) = plt.subplots(1, 3, figsize=(17, 15))
ax1.title.set_text('NCI60 Complete Linkage Dendrogram')
dendrogram(complete(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax1)
ax2.title.set_text('NCI60 Ward Linkage Dendrogram')
dendrogram(ward(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax2)
ax3.title.set_text('NCI60 Single Linkage Dendrogram')
dendrogram(single(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax3)
plt.show()
In [254]:
fig, axes = plt.subplots(1,1, figsize=(6,15))
plt.title('NCI60 Complete Linkage Dendrogram')
lenX=nci_labels.shape[0]
dendrogram(complete(X),labels=list(nci_labels.x), color_threshold=139, leaf_font_size=9, orientation='right')
plt.vlines(140,0,plt.gca().yaxis.get_data_interval()[1], colors='b', linestyles='dashed')
plt.show()
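Cutting at this height yields four clusters; cross-tabulating them against the cancer-type labels shows how the groups line up. A sketch using scipy's fcluster:
In [ ]:
from scipy.cluster.hierarchy import fcluster
# Four flat clusters from the complete-linkage tree vs. cancer types (sketch)
hc4 = fcluster(complete(X), t=4, criterion='maxclust')
print(pd.crosstab(hc4, nci_labels.x))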
In [260]:
nci_kmean=KMeans(n_clusters=4,random_state=0).fit(X)
print(f"labels:{nci_kmean.labels_}")
In [ ]: