Thursday, February 25, 2021

Chapter 10 Unsupervised Learning within Python - An Introduction to Statistical Learning

Chapter 10 Unsupervised Learning - An Introduction to Statistical Learning
In [179]:
%config IPCompleter.greedy=True
%config Completer.use_jedi = False

import numpy as np
import pandas as pd
import seaborn as sns

from pca import pca # https://pypi.org/project/pca/
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans,AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, complete, single, ward
import matplotlib.pyplot as plt
In [110]:
usArrest=pd.read_csv(r"data/USArrests.csv",index_col=0)
In [111]:
usArrest.head()
Out[111]:
Murder Assault UrbanPop Rape
Alabama 13.2 236 58 21.2
Alaska 10.0 263 48 44.5
Arizona 8.1 294 80 31.0
Arkansas 8.8 190 50 19.5
California 9.0 276 91 40.6
In [9]:
usArrest.info()
<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, Alabama to Wyoming
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Murder    50 non-null     float64
 1   Assault   50 non-null     int64  
 2   UrbanPop  50 non-null     int64  
 3   Rape      50 non-null     float64
dtypes: float64(2), int64(2)
memory usage: 2.0+ KB
In [10]:
usArrest.describe()
Out[10]:
Murder Assault UrbanPop Rape
count 50.00000 50.000000 50.000000 50.000000
mean 7.78800 170.760000 65.540000 21.232000
std 4.35551 83.337661 14.474763 9.366385
min 0.80000 45.000000 32.000000 7.300000
25% 4.07500 109.000000 54.500000 15.075000
50% 7.25000 159.000000 66.000000 20.100000
75% 11.25000 249.000000 77.750000 26.175000
max 17.40000 337.000000 91.000000 46.000000
In [11]:
sns.pairplot(usArrest)
Out[11]:
<seaborn.axisgrid.PairGrid at 0x2aa74feca48>

Lab 1: Principal Component Analysis

In [112]:
scaler=StandardScaler()
X=scaler.fit(usArrest.values).transform(usArrest.values)
print(f'mean={np.mean(X,axis=0)}')
print(f'std={np.std(X,axis=0)}')
mean=[-7.10542736e-17  1.38777878e-16 -4.39648318e-16  8.59312621e-16]
std=[1. 1. 1. 1.]
In [113]:
pca4=PCA(n_components=4)
pca4.fit(X)
Out[113]:
PCA(n_components=4)
In [14]:
pca4.components_
Out[14]:
array([[ 0.53589947,  0.58318363,  0.27819087,  0.54343209],
       [ 0.41818087,  0.1879856 , -0.87280619, -0.16731864],
       [-0.34123273, -0.26814843, -0.37801579,  0.81777791],
       [ 0.6492278 , -0.74340748,  0.13387773,  0.08902432]])
In [15]:
print(f'explained_var={pca4.explained_variance_}')
print(f'explained_var %={pca4.explained_variance_ratio_*100}')
print(f'mean={pca4.mean_}')
print(f'singular values (\u03BB)={pca4.singular_values_}')
explained_var=[2.53085875 1.00996444 0.36383998 0.17696948]
explained_var %=[62.00603948 24.74412881  8.91407951  4.33575219]
mean=[-7.10542736e-17  1.38777878e-16 -4.39648318e-16  8.59312621e-16]
singular values (λ)=[11.13607107  7.0347891   4.22234047  2.94474182]
In [114]:
XHat=pca4.transform(X)
print(f'Xhat shape={XHat.shape}')
print(f'XHat mean={np.mean(XHat,axis=0)}')
print(f'XHat std={np.std(XHat,axis=0)}')
Xhat shape=(50, 4)
XHat mean=[-8.88178420e-18 -2.66453526e-17 -5.55111512e-18 -4.99600361e-18]
XHat std=[1.57487827 0.99486941 0.59712912 0.41644938]
In [118]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.scatter(XHat[:,0],-XHat[:,1],alpha=0.2)
for i,state in enumerate(usArrest.index.values):
    ax.annotate(state,(XHat[i,0],-XHat[i,1]))

features = usArrest.columns
loadings = pca4.components_.T * np.sqrt(pca4.explained_variance_)

for i, feature in enumerate(features):
    plt.arrow(0, 0, loadings[i,0], -loadings[i,1],color = 'r',alpha = 0.5)
    plt.text(loadings[i,0]* 1.15, -loadings[i,1] * 1.15, feature, color = 'g', ha = 'center', va = 'center')
    

First component (x-axis) dominated with Murder, Assualt and Rape features, hence it corresponds to crime. On the other hand, the second component has UrbanPop which indicates urbanization has next important component.

Plot with PCA library (https://pypi.org/project/pca/)

In [31]:
model = pca(n_components=4)
# Fit transform
results = model.fit_transform(X)
# Plot explained variance
fig, ax = model.plot()
# Scatter first 2 PCs
fig, ax = model.scatter()
# Make biplot with the number of features
fig, ax = model.biplot(n_feat=4)
[pca] >Column labels are auto-completed.
[pca] >Row labels are auto-completed.
[pca] >The PCA reduction is performed on the [4] columns of the input dataframe.
[pca] >Fitting using PCA..
[pca] >Computing loadings and PCs..
[pca] >Computing explained variance..
[pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[4]
[pca] >Outlier detection using SPE/DmodX with n_std=[2]
<Figure size 432x288 with 0 Axes>

PCA as dimensinality Reduction

In [19]:
pca1=PCA(n_components=1)
pca1.fit(X)
XHat1=pca1.transform(X)
X_new1=pca1.inverse_transform(XHat1)
print(f"original shape:{X.shape}")
print(f"transformed shape:{XHat1.shape}")
plt.scatter(X[:,0],X[:,1],alpha=0.2)
plt.scatter(X_new1[:,0],X_new1[:,1],alpha=0.8)
plt.axis('equal')
original shape:(50, 4)
transformed shape:(50, 1)
Out[19]:
(-1.813191341735609,
 2.4217631095424945,
 -1.9330192845420575,
 2.2030307413924994)

Lab 2 Clustering

K-Means Clustering

In [59]:
np.random.seed(123)
X=np.random.normal(0,1,size=(50,2))
X[0:25,0]=X[0:25,0]+3
X[0:25,1]=X[0:25,1]-4
plt.scatter(X[:,0],X[:,1])
Out[59]:
<matplotlib.collections.PathCollection at 0x2aa7a778748>
In [61]:
kmean=KMeans(n_clusters=2,random_state=0).fit(X)
In [90]:
print(f"cluester_centers:{kmean.cluster_centers_}")
print(f"labels:{kmean.labels_}")
print(f"inertia:{kmean.inertia_}")
print(f"#iter:{kmean.n_iter_}")
plt.scatter(X[:,0],X[:,1],alpha=0.3,c=kmean.labels_)
plt.scatter(kmean.cluster_centers_[:,0],kmean.cluster_centers_[:,0],c=['r','b'],marker='*',s=80)
cluester_centers:[[ 0.20742071 -0.1255273 ]
 [ 3.03552977 -4.00898689]]
labels:[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
inertia:125.86305660307792
#iter:3
Out[90]:
<matplotlib.collections.PathCollection at 0x2aa78c64388>
In [91]:
kmean3=KMeans(n_clusters=3,random_state=0).fit(X)
print(f"cluester_centers:{kmean3.cluster_centers_}")
print(f"labels:{kmean3.labels_}")
print(f"inertia:{kmean3.inertia_}")
print(f"#iter:{kmean3.n_iter_}")
plt.scatter(X[:,0],X[:,1],alpha=0.3,c=kmean3.labels_)
plt.scatter(kmean3.cluster_centers_[:,0],kmean3.cluster_centers_[:,1],c=['r','b','g'],marker='*',s=80)
cluester_centers:[[ 0.20742071 -0.1255273 ]
 [ 2.77052727 -4.80173812]
 [ 3.43303352 -2.81986004]]
labels:[2 1 2 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 2 1 1 2 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
inertia:99.66252501749231
#iter:3
Out[91]:
<matplotlib.collections.PathCollection at 0x2aa78ccdd08>

Hierarchical Clustering

In [122]:
agg_cluster_complete=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='complete').fit(X)
agg_cluster_single=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='single').fit(X)
agg_cluster_ward=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='ward').fit(X)
In [120]:
print(f'#clusters:{agg_cluster_complete.n_clusters_}')
print(f'#labels:{agg_cluster_complete.labels_}')
print(f'#leaves:{agg_cluster_complete.n_leaves_}')
#clusters:2
#labels:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]
#leaves:50
In [109]:
print(f'#clusters:{agg_cluster_single.n_clusters_}')
print(f'#labels:{agg_cluster_single.labels_}')
print(f'#leaves:{agg_cluster_single.n_leaves_}')
#clusters:50
#labels:[47 45 41 29 44 37 49 30 39 26 25 36 28 38 43 48 31 12 27 35 13 18 33 32
 21 40 17 23 19 46 15  8 22 24 34 16 42 11  7 20 14  6  3  9  5  2 10  4
  1  0]
#leaves:50
In [160]:
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
In [123]:
plt.title('Hierarchical Clustering Complete linkage Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(agg_cluster_complete, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
In [124]:
plt.title('Hierarchical Clustering Single linkage Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(agg_cluster_single, truncate_mode='level', p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

NCI60 Data Example

In [120]:
nci_labels=pd.read_csv('data/NCI60_Y.csv',index_col=0)
nci_data=pd.read_csv('data/NCI60_X.csv', index_col=0)
In [121]:
nci_labels.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 64 entries, 1 to 64
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   x       64 non-null     object
dtypes: object(1)
memory usage: 1.0+ KB
In [122]:
print(nci_labels.x.unique())
nci_labels.head()
['CNS' 'RENAL' 'BREAST' 'NSCLC' 'UNKNOWN' 'OVARIAN' 'MELANOMA' 'PROSTATE'
 'LEUKEMIA' 'K562B-repro' 'K562A-repro' 'COLON' 'MCF7A-repro'
 'MCF7D-repro']
Out[122]:
x
1 CNS
2 CNS
3 CNS
4 RENAL
5 BREAST
In [123]:
nci_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 64 entries, V1 to V64
Columns: 6830 entries, 1 to 6830
dtypes: float64(6830)
memory usage: 3.3+ MB
In [124]:
nci_data.head()
Out[124]:
1 2 3 4 5 6 7 8 9 10 ... 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830
V1 0.300000 1.180000 0.550000 1.140000 -0.265000 -7.000000e-02 0.350000 -0.315000 -0.450000 -0.654981 ... -0.990019 0.000000 0.030000 -0.175000 0.629981 -0.030000 0.000000 0.280000 -0.340000 -1.930000
V2 0.679961 1.289961 0.169961 0.379961 0.464961 5.799610e-01 0.699961 0.724961 -0.040039 -0.285020 ... -0.270058 -0.300039 -0.250039 -0.535039 0.109941 -0.860039 -1.250049 -0.770039 -0.390039 -2.000039
V3 0.940000 -0.040000 -0.170000 -0.040000 -0.605000 0.000000e+00 0.090000 0.645000 0.430000 0.475019 ... 0.319981 0.120000 -0.740000 -0.595000 -0.270020 -0.150000 0.000000 -0.120000 -0.410000 0.000000
V4 0.280000 -0.310000 0.680000 -0.810000 0.625000 -1.387779e-17 0.170000 0.245000 0.020000 0.095019 ... -1.240020 -0.110000 -0.160000 0.095000 -0.350020 -0.300000 -1.150010 1.090000 -0.260000 -1.100000
V5 0.485000 -0.465000 0.395000 0.905000 0.200000 -5.000000e-03 0.085000 0.110000 0.235000 1.490019 ... 0.554980 -0.775000 -0.515000 -0.320000 0.634980 0.605000 0.000000 0.745000 0.425000 0.145000

5 rows × 6830 columns

PCA on the NCI60 Data

In [125]:
scaler=StandardScaler()
X=scaler.fit(nci_data.values).transform(nci_data.values)
pca_nc=PCA()
pca_nc.fit(X)
Out[125]:
PCA()
In [126]:
pca_nc.components_
Out[126]:
array([[-0.01068237, -0.00231208, -0.00587975, ..., -0.00197109,
         0.00779109,  0.00771462],
       [-0.00132441, -0.00167527,  0.00628943, ..., -0.00937144,
        -0.00231535,  0.00354385],
       [ 0.00850351,  0.01025659,  0.01005541, ...,  0.01049791,
         0.02013372,  0.01815808],
       ...,
       [ 0.02337703, -0.00246957,  0.01216671, ...,  0.00040545,
         0.00067878, -0.0159314 ],
       [ 0.01131504, -0.02353899,  0.02160489, ...,  0.00559842,
        -0.00780919, -0.02065693],
       [ 0.00911709, -0.15645104, -0.12943769, ..., -0.00587106,
        -0.00154497,  0.00089959]])
In [127]:
df_variance=pd.DataFrame({'explained_var':pca_nc.explained_variance_,
                          'explained_var_percentage':pca_nc.explained_variance_ratio_*100,
                         'explained_var_cum_percentage':np.cumsum(pca_nc.explained_variance_ratio_*100)})
In [128]:
sns.set(rc={'figure.figsize':(14,4)})
sns.barplot(x=list(range(df_variance.shape[0])),y='explained_var',data=df_variance).set_title('Explained Variance')
Out[128]:
Text(0.5, 1.0, 'Explained Variance')
In [129]:
sns.set(rc={'figure.figsize':(14,4)})
sns.barplot(x=list(range(df_variance.shape[0])),y='explained_var_percentage',data=df_variance).set_title('Explained Variance %')
Out[129]:
Text(0.5, 1.0, 'Explained Variance %')
In [130]:
sns.set(rc={'figure.figsize':(14,4)})
sns.pointplot(x=list(range(df_variance.shape[0])),y='explained_var_cum_percentage',data=df_variance).set_title('Explained Cum Variance %')
Out[130]:
Text(0.5, 1.0, 'Explained Cum Variance %')
In [131]:
XHat=pca_nc.transform(X)
print(f'Xhat shape={XHat.shape}')
Xhat shape=(64, 64)
In [154]:
sns.set(rc={'figure.figsize':(10,5)})
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,5))
df_pcas=pd.DataFrame({'Z1':XHat[:,0],
              'Z2':-XHat[:,1],
              'Z3':XHat[:,2],                      
            'Labels':nci_labels.x})

g=sns.scatterplot(x='Z1',y='Z2',hue='Labels',data=df_pcas, ax=ax1,legend=False)
g.set_title('Projection of NCI60 to Z1 and Z2 components')

g=sns.scatterplot(x='Z1',y='Z3',hue='Labels',data=df_pcas,ax=ax2)
g.set_title('Projection of NCI60 to Z1 and Z3 components')
g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)
Out[154]:
<matplotlib.legend.Legend at 0x26441639508>

Clustering the Observations of the NCI60 Data

In [155]:
agg_nci_complete=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='complete').fit(X)
agg_nci_single=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='single').fit(X)
agg_nci_ward=AgglomerativeClustering(distance_threshold=0, n_clusters=None,affinity='euclidean',linkage='ward').fit(X)
In [156]:
print(f'#clusters:{agg_nci_complete.n_clusters_}')
print(f'#labels:{agg_nci_complete.labels_}')
print(f'#leaves:{agg_nci_complete.n_leaves_}')
#clusters:64
#labels:[57 32 47 41 48 37 49 59 42 39 50 55 61 60 52 58 40 33 63 35 28 34 38 16
 56 45 23 27 46 19 18 51 31 20 62 54 29 43 53 44 25 13 15 36 24 22 26 30
 14  6 21 11 17  8 12  7  3  5  9  2 10  4  1  0]
#leaves:64
In [157]:
print(f'#clusters:{agg_nci_single.n_clusters_}')
print(f'#labels:{agg_nci_single.labels_}')
print(f'#leaves:{agg_nci_single.n_leaves_}')
#clusters:64
#labels:[57 52 53 49 55 51 47 39 42 59 40 50 56 43 48 37 25 63 58 23 28 27 32 31
 41 33 54 19 34 20 38 46 45 35 60 36 29 44 24 11 22 13 16 21  9 26 15 10
 61 62 30 18 14 17  8 12  5  7  6  3  2  4  1  0]
#leaves:64
In [158]:
print(f'#clusters:{agg_nci_ward.n_clusters_}')
print(f'#labels:{agg_nci_ward.labels_}')
print(f'#leaves:{agg_nci_ward.n_leaves_}')
#clusters:64
#labels:[57 61 44 42 59 47 51 53 43 45 48 55 46 39 52 36 22 21 63 56 58 54 35 37
 60 41 50 28 23 40 38 25 20 26 62 33 29 49 12 34 16 27 32 19 24 17 13  9
 30 14 31 18 15  8  6 11  7 10  5  2  3  4  1  0]
#leaves:64
In [222]:
fig, (ax1,ax2,ax3) = plt.subplots(1, 3, figsize=(17, 15))
ax1.title.set_text('NCI60 Complete Linkage Dendrogram')
dendrogram(complete(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax1)
ax2.title.set_text('NCI60 Ward Linkage Dendrogram')
dendrogram(ward(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax2)
ax3.title.set_text('NCI60 Single Linkage Dendrogram')
dendrogram(single(X),labels=list(nci_labels.x),orientation='right', color_threshold=0, leaf_font_size=9, ax=ax3)
plt.show()