plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()


plt.scatter(X[:, 0], X[:, 1], c=y, marker='o', s=50, cmap='jet')
plt.show()


from sklearn.datasets import make_blobs

X2, y2 = make_blobs(n_samples=150, 
                  n_features=2, 
                  centers=3, 
                  cluster_std=.7, 
                  shuffle=True, 
                  random_state=0)

plt.scatter(X2[:, 0], X2[:, 1], c='black', marker='o', s=50)
plt.show()


plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()


from sklearn.cluster import KMeans

km = KMeans(n_clusters=3, 
            init='random', 
            n_init=10, 
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0],
            X[y_km == 0, 1],
            s=50,
            c='lightgreen',
            marker='s',
            label='cluster 1')
plt.scatter(X[y_km == 1, 0],
            X[y_km == 1, 1],
            s=50,
            c='orange',
            marker='o',
            label='cluster 2')
plt.scatter(X[y_km == 2, 0],
            X[y_km == 2, 1],
            s=50,
            c='lightblue',
            marker='v',
            label='cluster 3')
plt.scatter(km.cluster_centers_[:, 0],
            km.cluster_centers_[:, 1],
            s=250,
            marker='*',
            c='red',
            label='centroids')
plt.legend()
plt.tight_layout()
plt.show()


plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()


print(X.shape[0])
print(X.shape[1])

150
2


plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50); 
plt.show()


def init(X, k):
    N, dim = X.shape
    # we select k samples from the points we have randomly
    idx = np.random.choice(N, k, replace=False)
    # the centers are just the k-random sampled points
    return X[idx]


def get_assignments(X, centers):
    P = X[:, np.newaxis, :]  # Nx1xD
    diff_sq = (P - centers)**2  # Nx1xD - KxD -> NxKxD # broadcasting
    # more info on broadcasting https://numpy.org/doc/stable/user/basics.broadcasting.html
    # dist = sum_over_axis(diff^2) then take argmin of dist across centers
    y = diff_sq.sum(axis=2).argmin(axis=1)
    return y  # Nx1 and the values inside go from [1...K]


def update_centers(X, y, K):
    # select the points to the k center and remove the N dimension with mean over axis 0
    y_row = y.reshape((1, -1)) #1xN
    k_col = np.arange(K).reshape(-1, 1) #Kx1
    
    #M[i][j] = 1 if X[j,:] belongs to cluster i else 0 
    M = k_col == y_row  #KxN
    
    #cluster_sum[i,:] contains the sum of all vectors in cluster i
    cluster_sum = M @ X #KxN @ NxD -> KxD 
    
    #counts[i,:] contains the number of vectors in cluster i
    counts = np.sum(M, axis=1).reshape(-1, 1) #Kx1
    
    return cluster_sum / counts #KxD


# Double check
np.random.seed(0)  # make the method deterministic
from sklearn.metrics import pairwise_distances_argmin
centers = init(X, 3)
y_assignments = get_assignments(X, centers)
labels = pairwise_distances_argmin(X, centers)
assert np.array_equal(labels, y_assignments), 'Assignment not correct!'


def kmeans(X, n_clusters):
    np.random.seed(0)  # make the method deterministic
    iter = 1
    #1. init random sample the centers
    centers = init(X, n_clusters)
    
    while True:
        # 2. Assignment step 
        y_assignments = get_assignments(X, centers)
        
        # 3. Update step
        new_centers = update_centers(X, y_assignments, n_clusters)
        
        # 4. Check for convergence
        # This is a possible implementation
        # A more formal is to check that the loss is plateauing (more on this later)
        if np.allclose(centers, new_centers):
            break
        # update cluster for next iter
        centers = new_centers
        
        print(f'Iteration {iter}')
        iter += 1
    
    return centers, y_assignments


centers, y_est = kmeans(X, 3)
plt.scatter(*X.T, c=y_est,
            s=50, cmap='viridis')
_ = plt.scatter(*centers.T,
                s=250,
                marker='*',
                c='red',
                label='centroids')

Iteration 1
Iteration 2
Iteration 3


from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=5000, 
                  n_features=3, 
                  centers=25, 
                  cluster_std=1.25, 
                  shuffle=True, 
                  random_state=0)


fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(projection='3d')

ax.scatter(*X.T, c=y,
            s=50, cmap='jet')

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
_ = ax.set_zlabel('Z Label')


centers, y_est = kmeans(X, 20)

Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32


fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(projection='3d')

ax.scatter(*X.T, c=y_est,
            s=10, cmap='viridis')

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
_ = ax.set_zlabel('Z Label')
_ = ax.scatter(*centers.T,
                s=250,
                marker='*',
                c='red',
                label='centroids')


from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances_argmin

X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.60, random_state=0)

rng = np.random.RandomState(42)
centers = [0, 4] + rng.randn(4, 2)

def draw_points(ax, c, factor=1):
    ax.scatter(X[:, 0], X[:, 1], c=c, cmap='viridis',
               s=50 * factor, alpha=0.3)
    
def draw_centers(ax, centers, factor=1, alpha=1.0):
    ax.scatter(centers[:, 0], centers[:, 1],
               c=np.arange(4), cmap='viridis', s=200 * factor,
               alpha=alpha)
    ax.scatter(centers[:, 0], centers[:, 1],
               c='black', s=50 * factor, alpha=alpha)

def make_ax(fig, gs):
    ax = fig.add_subplot(gs)
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.yaxis.set_major_formatter(plt.NullFormatter())
    return ax

fig = plt.figure(figsize=(15, 4))
gs = plt.GridSpec(4, 15, left=0.02, right=0.98, bottom=0.05, top=0.95, wspace=0.2, hspace=0.2)
ax0 = make_ax(fig, gs[:4, :4])
ax0.text(0.98, 0.98, "Random Initialization", transform=ax0.transAxes,
         ha='right', va='top', size=16)
draw_points(ax0, 'gray', factor=2)
draw_centers(ax0, centers, factor=2)

for i in range(3):
    ax1 = make_ax(fig, gs[:2, 4 + 2 * i:6 + 2 * i])
    ax2 = make_ax(fig, gs[2:, 5 + 2 * i:7 + 2 * i])
    
    # E-step
    y_pred = pairwise_distances_argmin(X, centers)
    draw_points(ax1, y_pred)
    draw_centers(ax1, centers)
    
    # M-step
    new_centers = np.array([X[y_pred == i].mean(0) for i in range(4)])
    draw_points(ax2, y_pred)
    draw_centers(ax2, centers, alpha=0.3)
    draw_centers(ax2, new_centers)
    for i in range(4):
        ax2.annotate('', new_centers[i], centers[i],
                     arrowprops=dict(arrowstyle='->', linewidth=1))
        
    
    # Finish iteration
    centers = new_centers
    ax1.text(0.95, 0.95, "E-Step", transform=ax1.transAxes, ha='right', va='top', size=14)
    ax2.text(0.95, 0.95, "M-Step", transform=ax2.transAxes, ha='right', va='top', size=14)


# Final E-step    
y_pred = pairwise_distances_argmin(X, centers)
axf = make_ax(fig, gs[:4, -4:])
draw_points(axf, y_pred, factor=2)
draw_centers(axf, centers, factor=2)
axf.text(0.98, 0.98, "Final Clustering", transform=axf.transAxes,
         ha='right', va='top', size=16)


fig.savefig('figs/05.11-expectation-maximization.png')

/var/folders/rt/lg7n4lt1489270pz_18qn1_c0000gp/T/ipykernel_88826/3638793124.py:11: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  ax.scatter(X[:, 0], X[:, 1], c=c, cmap='viridis',


from sklearn.datasets import make_moons
from sklearn.cluster import KMeans

X, y = make_moons(200, noise=.05, random_state=0)
labels = KMeans(2, random_state=0).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels,
            s=50, cmap='viridis');

dist = np.array([1.18518298, 1.30917493, 1.10973212, 2.24523519, 1.01625606])


fig, axs = plt.subplots(1, 2)
fig.set_figheight(3)
fig.set_figwidth(15)
# PDF
axs[0].stem(pmf, linefmt='b-', markerfmt='bo', basefmt='--')
axs[0].set_title('PMF')
axs[0].set_xlabel('Index of distance')
axs[0].set_ylabel('Probability')
axs[0].set_aspect('auto')
# CUMSUM
axs[1].plot(np.insert(pmf.cumsum(),0,0), 'o--')
axs[1].set_title('CMF')
axs[1].set_xlabel('Index of distance')
axs[1].set_ylabel('Cumulative Probability')
axs[1].set_aspect('auto')
plt.show()


fig, axs = plt.subplots(1, 2)
fig.set_figheight(5)
fig.set_figwidth(20)
# PDF
axs[0].stem(pmf, linefmt='b-', markerfmt='bo', basefmt='--')
axs[0].set_title('PMF')
axs[0].set_xlabel('Index of distance')
axs[0].set_ylabel('Probability')
axs[0].set_aspect('auto')
# CUMSUM
axs[1].plot(np.insert(pmf.cumsum(),0,0), 'o--')
axs[1].set_title('CMF')
axs[1].set_xlabel('Index of distance')
axs[1].set_ylabel('Cumulative Probability')
axs[1].set_aspect('auto')
#axs[1].stem(pmf, linefmt='b-', markerfmt='bo', basefmt='--')
plt.show()

U = np.random.rand(1000) #random sample uniformly from [0, 1]. 
# u ~ U[0,1]
# for each sampled u.
# find the first bin for which u > cumsum. 
# Get the index of that bin as the sample.
sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)

import random
random.choices(range(0,5), weights=pmf, k=1000) #support pmf, pmf, k=how many to sample


U = np.random.rand(5)
sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)
_=plt.bar(*np.unique(sampled_idx,return_counts=True))


U = np.random.rand(10000)
sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)
#_ = plt.hist(sampled_idx, histtype='bar')
_=plt.bar(*np.unique(sampled_idx,return_counts=True))


from sklearn.datasets import make_blobs
N_centers = 25
X, y = make_blobs(n_samples=1500,
                    n_features=2,
                    centers=N_centers,
                    cluster_std=.42,
                    shuffle=True,
                    random_state=0)

plt.scatter(X[:, 0], X[:, 1], c='black', marker='o', s=50)
plt.show()


def inverse_sampling(pmf, n_samples):
    U = np.random.rand(n_samples)
    sampled_idx = np.argmin((U[:, None] > pmf.cumsum()), axis=1)
    return sampled_idx


def distance(X, centers):
    P = X[:, np.newaxis, :]  # Nx1xD
    diff_sq = (P - centers)**2  # Nx1xD - KxD -> NxKxD # broadcasting
    # more info on broadcasting https://numpy.org/doc/stable/user/basics.broadcasting.html
    # dist = sum_over_axis(diff^2) then take argmin of dist across centers
    diff_sq_sum = diff_sq.sum(axis=2)
    return diff_sq_sum


def init(X, K, plusplus=True):
    N, dim = X.shape
    if not plusplus:
        print('> Init with random sampling')
        # we select k samples from the points we have randomly
        idx = np.random.choice(N, K, replace=False)
        # the centers are just the k-random sampled points
        return X[idx]
    else:
        print('> Init with kmeans++')
        # we select a point
        idx_first = np.random.choice(N, 1)
        # init the centers and distances
        centers = X[idx_first]
        dists = distance(X, centers)
        for k in range(1, K):  # for [1.....K-1]
            # compute min across all k-centers, for all points
            # dists is NxK so min_dist is Nx1
            min_dist = dists.min(axis=1)
            # select one of the N proportional to higher distance
            idx = inverse_sampling(min_dist/min_dist.sum(), 1)
            new_center = X[idx]
            # compute distance between all points to new center
            dist_new = distance(X, new_center)
            # update pool of centers and distances
            centers = np.vstack([centers, new_center])
            dists = np.hstack([dists, dist_new])
        # the centers are computed after the loop
        return centers


def get_assignments(X, centers):
    dist_sq = distance(X, centers)  # get distance #Nxk
    y = dist_sq.argmin(axis=1)  # select argmin distance #Nx1
    return y, dist_sq.min(axis=1)  # Nx1 and the values inside go from [1...K]


def update_centers(X, y, K):
    # select the points to the k center and remove the N dimension with mean over axis 0
    y_row = y.reshape((1, -1)) #1xN
    k_col = np.arange(K).reshape(-1, 1) #Kx1
    
    #M[i][j] = 1 if X[j,:] belongs to cluster i else 0 
    M = k_col == y_row  #KxN
    
    #cluster_sum[i,:] contains the sum of all vectors in cluster i
    cluster_sum = M @ X #KxN @ NxD -> KxD 
    
    #counts[i,:] contains the number of vectors in cluster i
    counts = np.sum(M, axis=1).reshape(-1, 1) #Kx1
    
    return cluster_sum / counts #KxD


# Double check
np.random.seed(0)  # make the method deterministic
from sklearn.metrics import pairwise_distances_argmin
centers = init(X, N_centers)
y_assignments, _ = get_assignments(X, centers)
labels = pairwise_distances_argmin(X, centers)
assert np.array_equal(labels, y_assignments), 'Assignment not correct!'

> Init with kmeans++


def kmeans(X, n_clusters):
    np.random.seed(0)  # make the method deterministic
    iter = 1
    # 1. init random sample the centers
    centers = init(X, n_clusters)
    loss = []
    while True:
        # 2. Assignment step
        y_assignments, dist_sq = get_assignments(X, centers)
        # distance between points and center, average over points
        loss.append(dist_sq.mean())
        # 3. Update step
        new_centers = update_centers(X, y_assignments, n_clusters)

        # 4. Check for convergence
        # If two last losses are close enough we stop
        if len(loss) > 2 and np.allclose(loss[-2], loss[-1], atol=1e-5):
            break
        # update cluster for next iter
        centers = new_centers

        print(f'> Iteration {iter}: loss {loss[-1]}')
        iter += 1

    return centers, y_assignments, loss


centers, y_est, loss = kmeans(X, N_centers)

> Init with kmeans++
> Iteration 1: loss 1.7296452170971994
> Iteration 2: loss 0.8938248433703356
> Iteration 3: loss 0.7897878328750901
> Iteration 4: loss 0.7802986159291133
> Iteration 5: loss 0.779175352001348
> Iteration 6: loss 0.7782299977540579
> Iteration 7: loss 0.777968597663401
> Iteration 8: loss 0.7778315053744624


fig, ax = plt.subplots(1, 3, figsize=(20, 5))
ax[0].scatter(*X.T, c=y_est,
              s=50, cmap='viridis')
ax[0].set_title('Result')
_ = ax[0].scatter(*centers.T,
                  s=250,
                  marker='*',
                  c='red',
                  label='centroids')
_ = ax[1].plot(loss)
_ = ax[1].set_title('Loss function')
_ = ax[2].scatter(*X.T, c=y,
              s=50, cmap='viridis')
_ = ax[2].set_title('Expected')


from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape

(1797, 64)


_ = plt.hist(digits.data.flatten())


np.random.seed(13)  # fixing the seed to see same pictures

fig, ax = plt.subplots(2, 5, figsize=(8, 3))
idx = np.random.choice(digits.data.shape[0], 10)
labels = digits.target[idx, ...]
points = digits.data[idx, ...].reshape(-1, 8, 8)
for axi, vector, y in zip(ax.flat, points, labels):
    axi.set(xticks=[], yticks=[])
    axi.imshow(vector, interpolation='none', cmap='gray')
    axi.set_title(y)


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(digits.data,digits.target, test_size=0.1,
                                                    random_state=0, stratify=digits.target)


plt.bar(*np.unique(y_train, return_counts=True));


plt.bar(*np.unique(y_test, return_counts=True));

u, counts = np.unique([1,2,3,4,1,1,1,], return_counts=True)
mode = u[counts.argmax()] # will print 1 since 1 is most frequent
                          # 4 time present


N_centers = 10 # we assume to know that there are 10 digits
centers, y_est, loss = kmeans(X_train, N_centers)

> Init with kmeans++
> Iteration 1: loss 1314.5627705627705
> Iteration 2: loss 756.8259891509574
> Iteration 3: loss 725.4550366992323
> Iteration 4: loss 716.4499767121214
> Iteration 5: loss 711.2049934919781
> Iteration 6: loss 708.236358647983
> Iteration 7: loss 705.2994723513523
> Iteration 8: loss 701.9520972059448
> Iteration 9: loss 698.6658028658622
> Iteration 10: loss 695.5144766870222
> Iteration 11: loss 693.5266489678621
> Iteration 12: loss 692.1208906914004
> Iteration 13: loss 691.3495761746228
> Iteration 14: loss 690.657571369928
> Iteration 15: loss 690.0861205462734
> Iteration 16: loss 689.7669567135049
> Iteration 17: loss 689.6771929699328
> Iteration 18: loss 689.6572694423297
> Iteration 19: loss 689.6474580308259
> Iteration 20: loss 689.6267235993998
> Iteration 21: loss 689.6114085109643


fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers_img = centers.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers_img):
    axi.set(xticks=[], yticks=[])
    axi.imshow(center, interpolation='none', cmap='gray')


def get_mode(X):
    values, counts = np.unique(X, return_counts=True)
    return values[counts.argmax()]

# Nx1
labels = np.zeros_like(y_est)
# 10x1
center_label = np.zeros((N_centers), dtype=np.int32)

for i in range(N_centers):
    mask = (y_est == i)
    mode_y = get_mode(y_train[mask])
    labels[mask] = mode_y
    center_label[i] = mode_y


from sklearn.metrics import accuracy_score
accuracy_score(y_train, labels)

0.7043908472479901


# sum over axis of dimensionality of squared difference
# Nx1x64 vs 10x64 --> Nx10x64 --sum axis 2 ---> Nx10
# we match the test wrt to the center (there might be other classification methods)
L2_square = ((X_test[...,np.newaxis,:] - centers)**2).sum(axis=2)


# Nx10 -- argmin over axis --> Nx1
idx_cluster = L2_square.argmin(axis=1)


y_test_estimated = center_label[idx_cluster]


accuracy_score(y_test, y_test_estimated)

0.6722222222222223

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)


# Note: this requires the ``pillow`` package to be installed
from sklearn.datasets import load_sample_image
china = load_sample_image("china.jpg")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set(xticks=[], yticks=[])
ax.imshow(china)

<matplotlib.image.AxesImage at 0x7f9b3152b370>


# Note: this requires the ``pillow`` package to be installed
from sklearn.datasets import load_sample_image
china = load_sample_image("china.jpg")
fig, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.set(xticks=[], yticks=[])
ax.imshow(china)

<matplotlib.image.AxesImage at 0x7f9b40bc81c0>

data = china/255.0 # scale all points to 0...1
data = data.reshape(-1,3)


data = china/255.0 # scale all points to 0...1
data = data.reshape(-1,3)
data.shape

(273280, 3)


fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(projection='3d')
data_sampled = data[::10] #1 point every 10 to go faster
ax.scatter(*data_sampled.T, marker='.', color=data_sampled)
ax.set_xlabel('R')
ax.set_ylabel('G')
_ = ax.set_zlabel('B')
ax.view_init(elev=10, azim=(0))


from sklearn.cluster import KMeans
kmeans = KMeans(16)
kmeans.fit(data)
y_est = kmeans.predict(data) # predict cluster center (project)
new_colors = kmeans.cluster_centers_[y_est] # backproject to the colors centers


fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(projection='3d')
data_sampled = new_colors[::10]  # 1 point every 10 to go faster
ax.scatter(*data_sampled.T, marker='.', color=data_sampled, s=200)
ax.set_xlabel('R')
ax.set_ylabel('G')
_ = ax.set_zlabel('B')
ax.view_init(elev=10, azim=(0))


china_recolored = new_colors.reshape(china.shape)

fig, ax = plt.subplots(1, 2, figsize=(16, 6),
                       subplot_kw=dict(xticks=[], yticks=[]))
fig.subplots_adjust(wspace=0.05)
ax[0].imshow(china)
ax[0].set_title('Original Image (24 bit per pixel)', size=16) # 8 bit per channel, for each pixel
ax[1].imshow(china_recolored)
ax[1].set_title('16-color Image (4 bit per pixel + 24 bit per centroid)', size=16);

Attribute	Value
Classes	10
Samples per class	~180
Samples total	1797
Dimensionality	64
Features	int 0-15

AI & Machine Learning - Unit 2¶

5. Clustering with K-means¶

Recap previous lecture¶

Today's lecture¶

Unsupervised Learning: Clustering¶

K-means¶

Coordinate Descent¶

Application: Pixel Segmentation¶

Image recognition: Visual Bag of Words (hints)¶

This lecture material is taken from¶

Unsupervised Learning¶

Unsupervised Learning¶

Unsupervised Learning: Clustering¶

Supervised Learning is more constrained¶

Clustering is ill-posed¶

Why Clustering is useful¶

Clustering: formalizing the problem¶

Clustering: K-means Algorithm¶

Let's think how we can solve it¶

What would you do?¶

1. We need some "reference" centroids to start¶

2. Assignment Step¶

3. Update Step¶

Indicator function¶

K-means (Lloyd’s method)¶

K-means convergence¶

Let's implement a basic version with numpy¶

Let's try a larger dataset¶

Visualization¶

Assumptions and Limitations of K-means¶

Bounded by linear cluster boundaries¶

Interpretation cluster boundaries as Voronoi Diagram¶

What is a Voronoi Region?¶

"Theory" behind K-means¶

K-means as a loss minimization problem¶

K-means as a loss minimization problem¶

On Convergence¶

On Convergence 👍🏼¶

Practical Consideration¶

Theoretical limitations of K-means¶

Theoretical limitations of K-means¶

Main problem of local optimum: Initialization¶

Lloyd’s method (random init) can be far from global optimum¶

Furthest-first Heuristic¶

Furthest-first Heuristic¶

Furthest-first Heuristic¶

K-means++¶

K-means++¶

How can we sample from a set of distances? 🤔¶

Higher distances should be more likely to be selected¶

Inverse Transform Sampling¶

Inverse Transform Sampling for Discrete Distribution¶

Inverse Transform Sampling with python¶

Let's perform sampling and see if works¶

K-means++¶

K-means: a family of algorithms parametrized by the norm used¶

How do we choose K?¶

How do we choose K?¶

K-means Complexity¶

Putting all together¶

Applications¶

Clustering of Digits (2D images) for "classification"¶

Let's see the data¶

Let's see the histogram of features (pixel colors)¶

Compute the mode from a distribution¶

Let's run our K-means on the digits¶

Now let's visualize the centers¶

It looks like a smoothed version of before...¶

Accuracy of ~70% is not bad just with an unsupervised method on the "training set" though no label has been used for training¶

Remember: Random chance accuracy over 10 classes is 1/10 ~10%.¶

We also have validated our implementation of K-means¶

Further, There is hope this may translate to test set¶

Accuracy of ~67% is not bad just with an unsupervised method on the "test split"¶

Remember: Random chance accuracy over 10 classes is 1/10 ~10%¶

Note now things are easy because:¶

Note: I did not try so I do not know what happens, but you can make a hypothesis. Before running it, make a hypothesis of what is going to happen following a rationale in your mind, based on what we studied so far. Then after formulating the hypothesis, do the experiments.¶

Tips 1: Try to retain all the features (i.e. 100% of variance) in case accuracy does not improve. In this case, you are just rotating the data and decorrelating the covariance matrix¶

Tips 2: If you try multiple runs, remember to FIX THE RANDOM SEED to make the experiments comparable¶

Homework 1b: To debug what happens, you can plot the data in 2D or 3D with PCA and show the digits ID with a color using a colormap¶

Homework 1c [Advanced]: To visualize the digits dataset, now use an advanced dimensionality reduction algorithm called t-SNE. Unlike PCA this method supports, multiple non-linear structures so you should see the digits more separated in 2D. For this use sklearn!¶

Homework 1c [Advanced]: To visualize the digits dataset, now use an advanced dimensionality reduction algorithm called `t-SNE`. Unlike PCA this method supports, multiple non-linear structures so you should see the digits more separated in 2D. For this use sklearn!¶