from sklearn.datasets import fetch_lfw_people

# Load the face dataset, keeping only the people that have at least
# 60 pictures each.
faces = fetch_lfw_people(min_faces_per_person=60)

# Report who is in the dataset and the shape of the image stack.
for summary in (faces.target_names, faces.images.shape):
    print(summary)

['Ariel Sharon' 'Colin Powell' 'Donald Rumsfeld' 'George W Bush'
 'Gerhard Schroeder' 'Hugo Chavez' 'Junichiro Koizumi' 'Tony Blair']
(1348, 62, 47)


# Shape of one image: the first axis of faces.images indexes the samples.
faces.images[0].shape

(62, 47)


# Plot the faces: the first N_ax * N_img images of the dataset on a grid
N_ax, N_img = 10, 10  # grid layout: 10 rows with 10 images per row
fig, ax = plt.subplots(
    N_ax, N_img,
    figsize=(10, 10),
    subplot_kw=dict(xticks=[], yticks=[]),  # no ticks on any image
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)
for row, row_axes in enumerate(ax):
    # Label each row once, on its left-most image.
    row_axes[0].set_ylabel(f'Faces {row}')
    for col, cell in enumerate(row_axes):
        # faces.data stores every image flattened; restore the 62x47 layout.
        flat_face = faces.data[row * N_img + col]
        cell.imshow(flat_face.reshape(62, 47), cmap='gray')


# Display on its own the face at index 9 * N_img, i.e. the first image of
# grid row 9 when the images are laid out N_img per row.
first_of_row_9 = faces.data[9 * N_img]
plt.imshow(first_of_row_9.reshape(62, 47), cmap='gray')

<matplotlib.image.AxesImage at 0x7fd499e50ca0>


# Grid layout used to display the samples
N_ax, N_img = 10, 10  # 10 rows with 10 images per row

# Sampling: draw 100 random 62x47 images whose pixel values are uniform in
# [0, 255); the cast to uint8 drops the fractional part.
samples = np.random.uniform(0, 255, (100, 62, 47)).astype(np.uint8)

# Plot the samples
fig, ax = plt.subplots(
    N_ax, N_img,
    figsize=(10, 10),
    subplot_kw=dict(xticks=[], yticks=[]),  # no ticks on any image
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)
for row, row_axes in enumerate(ax):
    # Label each row once, on its left-most image.
    row_axes[0].set_ylabel(f'Samples {row}')
    for col, cell in enumerate(row_axes):
        sample = samples[row * N_img + col]
        cell.imshow(sample.reshape(62, 47), cmap='gray')


# Grid layout used to display the samples
N_ax, N_img = 10, 10  # 10 rows with 10 images per row

# Sampling: draw 100 random 62x47 images whose pixel values are uniform in
# [0, 255); the cast to uint8 drops the fractional part.
samples = np.random.uniform(0, 255, (100, 62, 47)).astype(np.uint8)

# Plot the samples
fig, ax = plt.subplots(
    N_ax, N_img,
    figsize=(10, 10),
    subplot_kw=dict(xticks=[], yticks=[]),  # no ticks on any image
    gridspec_kw={'hspace': 0.1, 'wspace': 0.1},
)
for row, row_axes in enumerate(ax):
    # Label each row once, on its left-most image.
    row_axes[0].set_ylabel(f'Samples {row}')
    for col, cell in enumerate(row_axes):
        sample = samples[row * N_img + col]
        cell.imshow(sample.reshape(62, 47), cmap='gray')


# Show the very first face of the dataset (index 0 * N_img == 0); the
# assignment to `_` keeps the returned image object out of the cell output.
_ = plt.imshow(faces.data[0].reshape(62, 47), cmap='gray')


from sklearn.decomposition import PCA


# Data matrix is N x D: N=1348 samples in a ~3K-dimensional space
print(faces.data.shape)

# Fit a PCA model that retains 150 components.
pca = PCA(n_components=150)
pca.fit(faces.data)

(1348, 2914)

PCA(n_components=150)


# Show the mean face followed by the leading principal components on a
# 3x8 grid (1 mean + 23 components).
fig, axes = plt.subplots(3, 8, figsize=(9, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    if i == 0:
        ax.imshow(pca.mean_.reshape(62, 47), cmap='bone')  # plot mean
    else:
        # Panel 0 is taken by the mean, so panel i must show component i-1;
        # indexing with i directly would never display the first component.
        ax.imshow(pca.components_[i - 1].reshape(62, 47), cmap='gray')


# Cumulative explained variance as a function of the number of retained
# components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Entry k of the cumulative sum is the variance explained by the first k+1
# components, so the x values must start at 1 (the default x axis of
# plt.plot starts at 0 and would shift the whole curve by one component).
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(20, 10))
plt.plot(component_counts, cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
_ = plt.ylim([0, 1])


plt.rcParams['axes.grid'] = False
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
# Project onto the PCA subspace and then back to the input space
projected = pca.transform(faces.data) # project with P = U_t*X_t
unprojected = pca.inverse_transform(projected) # unproject with U*P = U(U_t*X_t)
# Now plot: inputs (row 0), reconstructions (row 1), error maps (row 2)
fig, ax = plt.subplots(3, 10, figsize=(35, 10),
                       subplot_kw={'xticks': [], 'yticks': []},
                       gridspec_kw=dict(hspace=0.1, wspace=0.1))

# it is important to get the max value of the error so that
# we plot heatmap error with the SAME SCALE!
# Squared per-pixel reconstruction error of the first 10 faces.
errors_img = (unprojected[:10] - faces.data[:10]) ** 2
max_val = errors_img.max()

for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='gray')
    ax[1, i].imshow(unprojected[i].reshape(62, 47), cmap='gray')
    # vmin/vmax pin the color normalization to [0, max_val] on every panel.
    # (`extent` only places the image in data coordinates; it leaves each
    # panel normalized to its own min/max, so the colors are not comparable.)
    erri = ax[2, i].imshow(errors_img[i].reshape(62, 47), cmap='jet',
                           vmin=0, vmax=max_val)
    if i == 9:
        # All error panels share one color scale, so a single colorbar
        # attached to the last panel describes the whole row.
        divider = make_axes_locatable(ax[2, i])
        cax = divider.append_axes("right", size="5%", pad=0.005)
        plt.colorbar(erri, cax=cax)

ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction')
_ = ax[2, 0].set_ylabel('L2 rec. err')


plt.rcParams['axes.grid'] = False
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
# Project onto the PCA subspace and then back to the input space
projected = pca.transform(faces.data) # project with P = U_t*X_t
unprojected = pca.inverse_transform(projected) # unproject with U*P = U(U_t*X_t)
# Now plot: inputs (row 0), reconstructions (row 1), error maps (row 2)
fig, ax = plt.subplots(3, 10, figsize=(35, 10),
                       subplot_kw={'xticks': [], 'yticks': []},
                       gridspec_kw=dict(hspace=0.1, wspace=0.1))

# it is important to get the max value of the error so that
# we plot heatmap error with the SAME SCALE!
# Squared per-pixel reconstruction error of the first 10 faces.
errors_img = (unprojected[:10] - faces.data[:10]) ** 2
max_val = errors_img.max()

for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='gray')
    ax[1, i].imshow(unprojected[i].reshape(62, 47), cmap='gray')
    # vmin/vmax pin the color normalization to [0, max_val] on every panel.
    # (`extent` only places the image in data coordinates; it leaves each
    # panel normalized to its own min/max, so the colors are not comparable.)
    erri = ax[2, i].imshow(errors_img[i].reshape(62, 47), cmap='jet',
                           vmin=0, vmax=max_val)
    if i == 9:
        # All error panels share one color scale, so a single colorbar
        # attached to the last panel describes the whole row.
        divider = make_axes_locatable(ax[2, i])
        cax = divider.append_axes("right", size="5%", pad=0.005)
        plt.colorbar(erri, cax=cax)

ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction')
_ = ax[2, 0].set_ylabel('L2 rec. err')


from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable

############### Fitting with 3 components ######
pca = PCA(n_components=3) # retain 3 components
pca.fit(faces.data)
#############################################
##### Plot
# Project onto the PCA subspace and then back to the input space
projected = pca.transform(faces.data) # project with P = U_t*X_t
unprojected = pca.inverse_transform(projected) # unproject with U*P = U(U_t*X_t)
# Now plot: inputs (row 0), reconstructions (row 1), error maps (row 2)
fig, ax = plt.subplots(3, 10, figsize=(15, 4.5),
                       subplot_kw={'xticks': [], 'yticks': []},
                       gridspec_kw=dict(hspace=0.1, wspace=0.1))

# it is important to get the max value of the error so that
# we plot heatmap error with the SAME SCALE!
# Squared per-pixel reconstruction error of the first 10 faces.
errors_img = (unprojected[:10] - faces.data[:10]) ** 2
max_val = errors_img.max()

for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='gray')
    ax[1, i].imshow(unprojected[i].reshape(62, 47), cmap='gray')
    # vmin/vmax pin the color normalization to [0, max_val] on every panel.
    # (`extent` only places the image in data coordinates; it leaves each
    # panel normalized to its own min/max, so the colors are not comparable.)
    erri = ax[2, i].imshow(errors_img[i].reshape(62, 47), cmap='jet',
                           vmin=0, vmax=max_val)
    if i == 9:
        # All error panels share one color scale, so a single colorbar
        # attached to the last panel describes the whole row.
        divider = make_axes_locatable(ax[2, i])
        cax = divider.append_axes("right", size="5%", pad=0.005)
        plt.colorbar(erri, cax=cax)

ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('3-dim\nreconstruction')
_ = ax[2, 0].set_ylabel('L2 rec. err')


# `projected` is Nx3 here: scatter it in 3D, one color per target label.
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(projection='3d')
pc_1, pc_2, pc_3 = projected.T  # one coordinate array per retained component
_ = ax.scatter(pc_1, pc_2, pc_3, c=faces.target, marker='.', cmap='jet')


# Build and plot a toy 2D dataset made of two horizontal lines of points.
plt.figure(figsize=(10,10))
np.random.seed(0)  # fixed seed so the toy dataset is reproducible
N_samples = 50
# samples points for class 1: random x in [50, 200), constant y = 1
X_1 = np.random.uniform(50, 200, N_samples)
X_1 = np.vstack((X_1, (1,)*N_samples))
# samples points for class 2: random x in [50, 200), constant y = 20
X_2 = np.random.uniform(50, 200, N_samples)
X_2 = np.vstack((X_2, (20,)*N_samples))
# data: put the two classes side by side -> shape (2, 2*N_samples)
X = np.concatenate((X_1, X_2), axis=1)
# labels: the constant y coordinate (1 or 20) identifies the class
labels = X[1, ...]
# Plot also the training points
plt.scatter(
    x=X[0, ...],
    y=X[1, ...],
    c=labels,
    cmap='jet',
)
# Code below wants Nx2
X = X.T
_ = plt.axis('equal')

Any questions about previous lectures before moving on?¶

We will review a few concepts of PCA at the end of matrix calculus


# Plot y = sin(x^x) on a wide interval, from 0.01 up to about 3.
x_big = np.arange(0.01, 3.01, 0.01)
ys = np.sin(np.power(x_big, x_big))
_ = plt.plot(x_big, ys, 'b-')
plt.xlabel('x')
plt.ylabel('y')
_ = plt.axis('equal')  # same scale on both axes


# Plot y = sin(x^x) on a narrower interval around x = 2 (1.75 to 2.25).
x_med = np.arange(1.75, 2.25, 0.001)
ys = np.sin(np.power(x_med, x_med))
_ = plt.plot(x_med, ys, 'b-')
plt.xlabel('x')
plt.ylabel('y')
_ = plt.axis('equal')  # same scale on both axes


# Plot y = sin(x^x) on a tiny interval starting at x = 2 (2.0 to 2.01).
x_small = np.arange(2.0, 2.01, 0.0001)
ys = np.sin(np.power(x_small, x_small))
_ = plt.plot(x_small, ys, 'b-')
plt.xlabel('x')
plt.ylabel('y')
_ = plt.axis('equal')  # same scale on both axes

| Method | $\mathbf{A}$ | Decomposition |
|---|---|---|
| SVD | any | $\mathbf{A}=\mathbf{U}\mathbf{\Sigma}\mathbf{V}^T$ |
| Eigen | square | $\mathbf{A}=\mathbf{U}\mathbf{\Sigma}\mathbf{U}^{-1}$ |
| Eigen | square/sym | as above but $\mathbf{U}\mathbf{U}^T=\mathbf{I}$ |

| Method | Step 1 | Step 2 | Step 3 |
|---|---|---|---|
| geometry | rotate | scale axis | rotate |
| SVD | $\mathbf{V}^T$ | $\mathbf{\Sigma}$ | $\mathbf{U}$ |
| geometry | rotate | scale axis | rotate back |
| Eig | $\mathbf{U}^{-1}$ | $\mathbf{\Sigma}$ | $\mathbf{U}$ |

Machine Learning¶

4. PCA in higher dimension, 3DMM, the curse of dimensionality¶

Recap previous lecture¶

Today's lecture¶

PCA with SVD¶

PCA in a high dimensional space¶

Application: Eigenfaces¶

Eigendecomposition of Matrices¶

Operations on Eigendecompositions¶

Eigendecompositions of Symmetric Matrices¶

Eigendecompositions and Singular Value Decomposition¶

Decomposition as a Geometric Pipeline¶

Decomposition as a Geometric Pipeline¶

Geometry of SVD¶

Principal Component Analysis (PCA)¶

PCA works in unsupervised learning settings¶

Assumptions¶

Objective: find a transformation (subspace) for compressing the data¶

Find the projection that maximizes the spread of the data¶

Full recipe¶

Alternative interpretation: find the orthogonal projection that minimizes the reconstruction error¶

What happens if input dimensionality is large?¶

What happens in high dimensions?¶

High Dimensional Space¶

Let's practice with 2D real data¶

Best Practice: Always look at the data before you start working!¶

For example, what is this?¶

2D images are high-dimensional vectors¶

Long Story Short¶

Let's uniformly sample random points in the 2914-dimensional space¶

The probability of hitting a face is very small!¶

The curse of dimensionality¶

The curse of dimensionality¶

Ambient space¶

Embedded space¶

Stranger things happen in higher dimensions¶

Distances in high-dimensional space¶

Dimensionality in ML: some reference numbers¶

Eigenfaces (i.e. PCA applied to 2D images)¶

Subspaces in decreasing order of variance¶

How do we choose where to cut the dimension of the subspace?¶

Data compression¶

Let's try with only 3 Components!¶

Color maps to Identity¶

This does not look like a great projection for separating the identities!¶

In the end, this makes sense, since PCA is an unsupervised method.¶

Another example: PCA on RGB color images from CIFAR-10¶

Now I will show you some data and then you need to tell me if using¶

PCA is a good way to separate the color of the data¶

Why compression¶

How to apply PCA¶

Usage of PCA¶

Inductive Settings¶

On the side: Transductive Settings¶

Artificial Intelligence and Machine Learning

Unit II

Any questions about previous lectures before moving on?¶

Matrix Calculus¶

Basic example: scalar to scalar¶

Vector to scalar¶

Vector to vector¶

Why Calculus?¶

Finite Difference¶

Linear Approximation¶

Linear Approximation¶

Gradient¶

Try it yourself - Gradient of dot product¶

Hessian¶

Quadratic Forms¶

Definiteness and relation with Eigenvalues¶

Common Derivatives¶

Derivative Rules¶

Try it yourself¶