# Normalise the values in `dist` so that they sum to one and form a PMF.
pmf = dist / dist.sum()


fig, axs = plt.subplots(1, 2)
fig.set_figwidth(15)
fig.set_figheight(3)
pmf_ax, cdf_ax = axs

# Left panel: the probability mass function as a stem plot.
pmf_ax.stem(pmf, linefmt='b-', markerfmt='bo', basefmt='--')
pmf_ax.set(title='PMF',
           xlabel='Index of distance',
           ylabel='Probability',
           aspect='auto')

# Right panel: the running sum of the PMF, i.e. the CDF.
cdf_ax.plot(pmf.cumsum(), 'o--')
cdf_ax.set(title='CDF',
           xlabel='Index of distance',
           ylabel='Cumulative Probability',
           aspect='auto')
plt.show()


# Univariate Gaussian (1-D Gaussian)
mu, sigma = 0, 1
x = np.arange(-3, 3, 0.01)
# Closed-form density: normalising constant times the exponential term.
norm_const = 1 / np.sqrt(2 * np.pi * sigma**2)
p = norm_const * np.exp(-(x - mu)**2 / (2 * sigma**2))
_ = plt.plot(x, p)
_ = plt.title('Gaussian PDF')


from scipy.stats import multivariate_normal
# Zero-mean 2-D Gaussian with a diagonal covariance: variance 2 along the
# first coordinate and variance 1 along the second.
mu = [0, 0]
Sigma = [[2, 0.0], [0.0, 1]]
F = multivariate_normal(mean=mu, cov=Sigma)


from scipy.stats import multivariate_normal
# Evaluate the 2-D Gaussian F on a regular grid over [-2, 2) x [-2, 2)
# with a spacing of 0.01 along each axis.
X, Y = np.mgrid[-2:2:0.01, -2:2:0.01]
# Stack the coordinates so that pos[i, j] is the point (X[i, j], Y[i, j]).
pos = np.dstack((X, Y))
Z = F.pdf(pos)

# plot using subplots
fig = plt.figure(figsize=(20,20))
ax1 = fig.add_subplot(1, 2, 1, projection='3d')

# Left panel: the density drawn as a 3-D surface.
ax1.plot_surface(X, Y, Z, rstride=3, cstride=3, linewidth=1, antialiased=True,
                 cmap='jet')
ax1.view_init(55, -70)
# Uncomment to hide the tick marks on the surface plot:
# ax1.set_xticks([])
# ax1.set_yticks([])
# ax1.set_zticks([])
ax1.set_xlabel(r'$x_1$')
ax1.set_ylabel(r'$x_2$')

# Right panel: filled contours of the same density placed at z = 0 on a 3-D
# axes; the 90-degree elevation presumably gives a top-down view.
ax2 = fig.add_subplot(1, 2, 2, projection='3d')
ax2.contourf(X, Y, Z, zdir='z', offset=0, cmap='jet')
ax2.view_init(90, 270)

# Hide the grid and all ticks on the contour panel.
ax2.grid(False)
ax2.set_xticks([])
ax2.set_yticks([])
ax2.set_zticks([])
ax2.set_xlabel(r'$x_1$')
ax2.set_ylabel(r'$x_2$')

plt.show()


from scipy.stats import multivariate_normal
# Same visualisation as for the previous Gaussian, but with a much smaller
# covariance so the density is concentrated around the mean.
step = 0.01
X, Y = np.mgrid[-2:2:step, -2:2:step]
# pos[i, j] is the grid point (X[i, j], Y[i, j]).
pos = np.dstack((X, Y))
##############################
### Gaussian PDF definition
# Isotropic covariance 0.001 * I, i.e. a standard deviation of about 0.032
# per axis, which is only a few grid steps wide.
Sigma = [[.001, 0.0],
         [0.0, .001]]
mu = [0, 0]
F = multivariate_normal(mu, Sigma )
##############################
Z = F.pdf(pos)

# plot using subplots
fig = plt.figure(figsize=(20,20))
ax1 = fig.add_subplot(1, 2, 1, projection='3d')

# Left panel: the density drawn as a 3-D surface.
ax1.plot_surface(X, Y, Z, rstride=3, cstride=3, linewidth=1, antialiased=True,
                 cmap='jet')
ax1.view_init(55, -70)
# Uncomment to hide the tick marks on the surface plot:
# ax1.set_xticks([])
# ax1.set_yticks([])
# ax1.set_zticks([])
ax1.set_xlabel(r'$x_1$')
ax1.set_ylabel(r'$x_2$')
# Pin the horizontal limits to the grid extent.
ax1.set_xlim3d(-2,2)
ax1.set_ylim3d(-2,2)
# Uncomment to fix the z-range as well:
#ax1.set_zlim3d(0,0.15)

# Right panel: filled contours of the same density placed at z = 0.
ax2 = fig.add_subplot(1, 2, 2, projection='3d')
ax2.contourf(X, Y, Z, zdir='z', offset=0, cmap='jet')
ax2.view_init(90, 270)

# Hide the grid and all ticks on the contour panel.
ax2.grid(False)
ax2.set_xticks([])
ax2.set_yticks([])
ax2.set_zticks([])
ax2.set_xlabel(r'$X_1$')
ax2.set_ylabel(r'$X_2$')
ax2.set_xlim(-2,2)
ax2.set_ylim(-2,2)


plt.show()


from scipy.stats import norm
np.random.seed(1)
########################################
# Mean, standard deviation and sample size of the generative process.
# In practice we only have the points; we do not know the generative process.
mu = 0
sigma = 1
N = 10
########################################
points = np.random.normal(loc=mu, scale=sigma, size=N)
X_plot = np.linspace(-5, 5, 1000)

true_dens = norm(loc=mu, scale=sigma).pdf(X_plot)

fig, ax = plt.subplots(figsize=(5, 5))
_ = ax.fill(X_plot, true_dens, fc="black",
            alpha=0.2, label="Input distribution (unknown)")
# Mark the samples just below the x-axis, with random vertical jitter.
rug_y = -0.005 - 0.01 * np.random.random(points.shape[0])
_ = ax.plot(points, rug_y, "+k")
ax.legend()

<matplotlib.legend.Legend at 0x7f7881b48b20>


def estimate_gaussian_mle(points, X_plot, plot=True):
    """Return the closed-form maximum-likelihood Gaussian fit to *points*.

    The mean is the sample mean and the standard deviation is computed with
    ``ddof=0`` (divide by N).

    Parameters
    ----------
    points : array of samples to fit.
    X_plot : locations at which the fitted density is evaluated; only used
        when ``plot`` is true.
    plot : when true, the fitted density is filled in red on the
        module-level ``ax`` (this function does not create an axes itself).

    Returns
    -------
    (mu_mle, std_mle) : the estimated mean and standard deviation.
    """
    mu_mle = np.mean(points)
    std_mle = points.std(ddof=0)
    if not plot:
        return mu_mle, std_mle

    fitted_density = norm(loc=mu_mle, scale=std_mle).pdf(X_plot)
    _ = ax.fill(X_plot, fitted_density, fc="red",
                alpha=0.2, label="estimated")
    return mu_mle, std_mle


# Fit the closed-form MLE to the sampled points (the fitted density is drawn
# on the existing `ax`) and print it next to the generating parameters.
mu_mle, std_mle = estimate_gaussian_mle(points, X_plot)
print(f'Estimated ({mu_mle}, {std_mle}) vs Ground-truth ({mu}, {sigma})')

Estimated (-0.09714089080609985, 1.190898552063902) vs Ground-truth (0, 1)


#help(np.std) #toggle to get help on the standard deviation function in numpy


# Redraw the true density and the samples on a fresh axes, then overlay the
# maximum-likelihood fit on the same axes.
fig, ax = plt.subplots(figsize=(5, 5))
_ = ax.fill(X_plot, true_dens, fc="black",
            alpha=0.2, label="input distribution")
rug_y = -0.005 - 0.01 * np.random.random(points.shape[0])
_ = ax.plot(points, rug_y, "+k")
mu_mle, std_mle = estimate_gaussian_mle(points, X_plot)
print(f'Estimated ({mu_mle}, {std_mle}) vs Ground-truth ({mu}, {sigma})')

Estimated (-0.09714089080609985, 1.190898552063902) vs Ground-truth (0, 1)


from scipy.stats import norm
np.random.seed(1)
########################################
# Mean, standard deviation and sample size of the generative process.
# In practice we only have the points; we do not know the generative process.
mu = 0
sigma = 1
N = 100
########################################
points = np.random.normal(loc=mu, scale=sigma, size=N)
X_plot = np.linspace(-5, 5, 1000)

true_dens = norm(loc=mu, scale=sigma).pdf(X_plot)

# First figure: true density plus the jittered samples.
fig, ax = plt.subplots(figsize=(5, 5))
_ = ax.fill(X_plot, true_dens, fc="black",
            alpha=0.2, label="input distribution")
rug_y = -0.005 - 0.01 * np.random.random(points.shape[0])
_ = ax.plot(points, rug_y, "+k")


# Second figure: the same content with the maximum-likelihood fit overlaid.
fig, ax = plt.subplots(figsize=(5, 5))
_ = ax.fill(X_plot, true_dens, fc="black",
            alpha=0.2, label="input distribution")
rug_y = -0.005 - 0.01 * np.random.random(points.shape[0])
_ = ax.plot(points, rug_y, "+k")
mu_mle, std_mle = estimate_gaussian_mle(points, X_plot)
print(f'Estimated ({mu_mle}, {std_mle}) vs Ground-truth ({mu}, {sigma})')

Estimated (0.060582852075698704, 0.885156213831585) vs Ground-truth (0, 1)


from scipy.stats import norm
np.random.seed(1)
########################################
# Mean, standard deviation and sample size of the generative process.
# In practice we only have the points; we do not know the generative process.
mu = 0
sigma = 1
N = 1000
########################################
points = np.random.normal(loc=mu, scale=sigma, size=N)
X_plot = np.linspace(-5, 5, 1000)

true_dens = norm(loc=mu, scale=sigma).pdf(X_plot)

# First figure: true density plus the jittered samples.
fig, ax = plt.subplots(figsize=(5, 5))
_ = ax.fill(X_plot, true_dens, fc="black",
            alpha=0.2, label="input distribution")
rug_y = -0.005 - 0.01 * np.random.random(points.shape[0])
_ = ax.plot(points, rug_y, "+k")


# Second figure: the same content with the maximum-likelihood fit overlaid.
fig, ax = plt.subplots(figsize=(5, 5))
_ = ax.fill(X_plot, true_dens, fc="black",
            alpha=0.2, label="input distribution")
rug_y = -0.005 - 0.01 * np.random.random(points.shape[0])
_ = ax.plot(points, rug_y, "+k")
mu_mle, std_mle = estimate_gaussian_mle(points, X_plot)
print(f'Estimated ({mu_mle}, {std_mle}) vs Ground-truth ({mu}, {sigma})')

Estimated (0.03881247615960185, 0.9810041339322116) vs Ground-truth (0, 1)


import matplotlib.pyplot as plt
np.random.seed(0)

def inverse_sampling(pmf):
    """Draw one category index from *pmf* by inverse transform sampling.

    A uniform number u in [0, 1) is drawn with ``np.random.rand`` and the
    index of the first CDF entry that is >= u is returned.

    Parameters
    ----------
    pmf : 1-D sequence of non-negative probabilities (array or list).

    Returns
    -------
    Integer array of shape (1,) holding the sampled index.

    Raises
    ------
    ValueError : if *pmf* is empty.

    Notes
    -----
    If u exceeds the last CDF entry (possible when the cumulative sum ends
    slightly below 1 because of floating-point round-off), the last category
    is returned instead of silently falling back to category 0.
    A standard-library alternative is
    ``random.choices(range(k), weights=pmf, k=n)``.
    """
    cdf = np.cumsum(pmf)
    if cdf.size == 0:
        raise ValueError("pmf must contain at least one probability")
    u = np.random.rand(1)
    # side='left' yields the first index i with u <= cdf[i].
    idx = np.searchsorted(cdf, u, side='left')
    # Clamp so that a draw beyond the total mass maps to the last category.
    return np.minimum(idx, cdf.size - 1)


################### model params #######################
N_samples = 1000  # number of points drawn from the mixture
# Component index -> (mean, covariance, plotting colour).
z_to_gaussian = {
    0: ([0, 0], [[0.5, 0], [0, 0.15]], 'red'),
    1: ([3, 2], [[1.25, 0], [0, 0.75]], 'green'),
    2: ([-2, -2], [[1, -0.74], [-0.74, 1]], 'blue'),
}
mixing = np.array([0.05, 0.05, 0.9])  # mixing coefficients
########################################################
for _ in range(N_samples):
    # First pick the component using the mixing coefficients ...
    z = inverse_sampling(mixing)
    mean, cov, color = z_to_gaussian[z[0]]
    # ... then draw a single point from that component's Gaussian.
    x, y = np.random.multivariate_normal(mean, cov, 1).T
    plt.plot(x, y, '.', color=color)
plt.title('p(x,z)')
plt.axis('equal')
plt.show()


import matplotlib.pyplot as plt
np.random.seed(0)

def inverse_sampling(pmf):
    """Draw one category index from *pmf* by inverse transform sampling.

    A uniform number u in [0, 1) is drawn with ``np.random.rand`` and the
    index of the first CDF entry that is >= u is returned.

    Parameters
    ----------
    pmf : 1-D sequence of non-negative probabilities (array or list).

    Returns
    -------
    Integer array of shape (1,) holding the sampled index.

    Raises
    ------
    ValueError : if *pmf* is empty.

    Notes
    -----
    If u exceeds the last CDF entry (possible when the cumulative sum ends
    slightly below 1 because of floating-point round-off), the last category
    is returned instead of silently falling back to category 0.
    """
    cdf = np.cumsum(pmf)
    if cdf.size == 0:
        raise ValueError("pmf must contain at least one probability")
    u = np.random.rand(1)
    # side='left' yields the first index i with u <= cdf[i].
    idx = np.searchsorted(cdf, u, side='left')
    # Clamp so that a draw beyond the total mass maps to the last category.
    return np.minimum(idx, cdf.size - 1)


################### model params #######################
N_samples = 1000  # number of points drawn from the mixture
# Component index -> (mean, covariance, colour); the colour is not used here
# because every point is drawn in black.
z_to_gaussian = {
    0: ([0, 0], [[0.5, 0], [0, 0.15]], 'red'),
    1: ([3, 2], [[1.25, 0], [0, 0.75]], 'green'),
    2: ([-2, -2], [[1, -0.74], [-0.74, 1]], 'blue'),
}
mixing = np.array([0.2, 0.5, 0.3])  # mixing coefficients
########################################################
for _ in range(N_samples):
    # First pick the component using the mixing coefficients ...
    z = inverse_sampling(mixing)
    mean, cov, color = z_to_gaussian[z[0]]
    # ... then draw a single point from that component's Gaussian.
    x, y = np.random.multivariate_normal(mean, cov, 1).T
    plt.plot(x, y, '.', color='black')
plt.axis('equal')
plt.title('p(x)')
plt.show()


import matplotlib.pyplot as plt
np.random.seed(0)

def inverse_sampling(pmf):
    """Draw one category index from *pmf* by inverse transform sampling.

    A uniform number u in [0, 1) is drawn with ``np.random.rand`` and the
    index of the first CDF entry that is >= u is returned.

    Parameters
    ----------
    pmf : 1-D sequence of non-negative probabilities (array or list).

    Returns
    -------
    Integer array of shape (1,) holding the sampled index.

    Raises
    ------
    ValueError : if *pmf* is empty.

    Notes
    -----
    If u exceeds the last CDF entry (possible when the cumulative sum ends
    slightly below 1 because of floating-point round-off), the last category
    is returned instead of silently falling back to category 0.
    """
    cdf = np.cumsum(pmf)
    if cdf.size == 0:
        raise ValueError("pmf must contain at least one probability")
    u = np.random.rand(1)
    # side='left' yields the first index i with u <= cdf[i].
    idx = np.searchsorted(cdf, u, side='left')
    # Clamp so that a draw beyond the total mass maps to the last category.
    return np.minimum(idx, cdf.size - 1)


################### model params #######################
N_samples = 1000  # we sample this amount of points
# Component index -> (mean, covariance, colour); the colour is not used here
# because every point is drawn in black.
z_to_gaussian = {0: ([0, 0], [[0.5, 0], [0, 0.15]], 'red'),
                 1: ([3, 2], [[1.25, 0], [0, 0.75]], 'green'),
                 2: ([-2, -2], [[1, -0.74], [-0.74, 1]], 'blue'),
                 }
mixing = np.array([0.2, 0.5, 0.3])  # mixing coefficients
########################################################
samples = []
for _ in range(0, N_samples):  # for each sample
    z = inverse_sampling(mixing)  # sample k using mixing coefficients
    mean, cov, _color = z_to_gaussian[z[0]]  # now sample the data from N_k
    # Draw one point at a time (same random stream as before) and keep it.
    samples.append(np.random.multivariate_normal(mean, cov, 1)[0])

# Keep the samples as a 2xN matrix: the cells that follow fit and plot `X`
# in that layout, and without this assignment `X` would still hold the
# plotting grid created earlier in the file.
X = np.array(samples).T

# A single plot call draws the same black dots as one call per point.
plt.plot(X[0], X[1], '.', color='black')
plt.axis('equal')
plt.title('p(x)')
plt.show()


from sklearn.mixture import GaussianMixture
# X is 2xN; it is transposed so that each row passed to the model is one sample.
gmm = GaussianMixture(n_components=3, random_state=0)
gmm.fit(X.T)
# gmm.means_.T has one row per data dimension (it is unpacked as x, y below).
assignments = gmm.predict(X.T)


# Colour every point by its assigned component and mark the fitted means.
z_to_col = {0: 'red', 1: 'green', 2: 'blue'}
point_colors = [z_to_col[k] for k in assignments]
_ = plt.scatter(X[0, ...], X[1, ...], color=point_colors)
centroid_x, centroid_y = gmm.means_.T
_ = plt.scatter(centroid_x, centroid_y,
                s=250,
                marker='*',
                c='black',
                label='centroids')


from matplotlib.colors import LogNorm
fig = plt.figure(figsize=(10, 10))
# Score the fitted model on a regular grid and show the result as filled contours.
x = np.linspace(-4.0, 6.0)
y = np.linspace(-4.0, 6.0)
X1, Y1 = np.meshgrid(x, y)
# One grid point per row: (x-coordinate, y-coordinate).
XX = np.vstack((X1.ravel(), Y1.ravel())).T
nll = -gmm.score_samples(XX)  # negative log likelihood
Z = nll.reshape(X1.shape)

CS = plt.contourf(X1, Y1, Z, cmap='hot')
plt.grid(False)
CB = plt.colorbar(CS, shrink=0.8, extend="both")
# Overlay the data points (X is indexed as 2xN here).
plt.scatter(X[0, ...], X[1, ...])

plt.title("Negative log-likelihood predicted by a GMM")
plt.axis("tight")
plt.show()


%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np


# Generate some data: 400 points around 4 centres.
from sklearn.datasets import make_blobs
X, y_true = make_blobs(n_samples=400,
                       centers=4,
                       cluster_std=0.60,
                       random_state=0)
X = X[:, ::-1]  # flip axes for better plotting
# Cluster with K-Means and colour each point by its cluster label.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(X)
labels = kmeans.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');


from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

def plot_kmeans(kmeans, X, n_clusters=4, rseed=0, ax=None):
    """Fit *kmeans* on *X* and plot the points with one circle per cluster.

    Each circle is centred on a cluster centre and its radius is the largest
    distance from that centre to a point assigned to the cluster.

    Parameters
    ----------
    kmeans : estimator providing ``fit_predict`` and ``cluster_centers_``.
    X : 2-D array; the first two columns are plotted.
    n_clusters, rseed : accepted but not used by this function.
    ax : axes to draw on; the current axes is used when omitted.
    """
    labels = kmeans.fit_predict(X)

    # Scatter the input data, coloured by cluster label.
    ax = plt.gca() if not ax else ax
    ax.axis('equal')
    ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)

    # Draw the K-Means model: one circle per cluster, behind the points.
    for cluster_id, center in enumerate(kmeans.cluster_centers_):
        members = X[labels == cluster_id]
        radius = cdist(members, [center]).max()
        circle = plt.Circle(center, radius, fc='#CCCCCC', lw=3, alpha=0.5,
                            zorder=1)
        ax.add_patch(circle)
# K-Means on the original blobs.
kmeans = KMeans(random_state=0, n_clusters=4)
plot_kmeans(kmeans, X)


# Apply a random linear map to the data, then cluster the transformed
# points with the same settings.
rng = np.random.RandomState(13)
transform = rng.randn(2, 2)
X_stretched = np.dot(X, transform)

kmeans = KMeans(random_state=0, n_clusters=4)
plot_kmeans(kmeans, X_stretched)


from matplotlib.patches import Ellipse

def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw 1-, 2- and 3-sigma ellipses for a 2-D Gaussian component.

    Parameters
    ----------
    position : centre of the ellipses.
    covariance : a (2, 2) matrix, a length-2 vector of per-axis variances,
        or a single scalar variance shared by both axes.
    ax : axes to draw on; the current axes is used when omitted.
    **kwargs : forwarded to ``matplotlib.patches.Ellipse``.
    """
    ax = ax or plt.gca()
    covariance = np.asarray(covariance)

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, _ = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    elif covariance.ndim == 0:
        # Scalar variance: a circle, so there is nothing to unpack.
        angle = 0
        width = height = 2 * np.sqrt(covariance)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)

    # Draw the Ellipse
    for nsig in range(1, 4):
        # `angle` is passed by keyword: recent Matplotlib releases no longer
        # accept it as a positional argument.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
        
def plot_gmm(gmm, X, label=True, ax=None):
    """Fit *gmm* on *X* and plot the points together with the component ellipses.

    Parameters
    ----------
    gmm : mixture model providing ``fit``, ``predict``, ``means_``,
        ``covariances_`` and ``weights_``; it is (re)fitted in place.
    X : 2-D array; the first two columns are plotted.
    label : when true, points are coloured by their predicted component.
    ax : axes to draw on; the current axes is used when omitted.
    """
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')

    # Scale the opacity so that the heaviest component is drawn with alpha 0.2.
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        # Forward `ax` so the ellipses land on the same axes as the points
        # instead of on whatever the current axes happens to be.
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)


from sklearn.mixture import GaussianMixture
# Fit a four-component mixture and colour each point by its predicted component.
gmm = GaussianMixture(n_components=4)
gmm.fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');


# Same model with a fixed seed, drawn together with its component ellipses.
gmm = GaussianMixture(random_state=42, n_components=4)
plot_gmm(gmm, X)


# Explicit full covariance matrices, fitted on the stretched data.
gmm = GaussianMixture(random_state=42, n_components=4, covariance_type='full')
plot_gmm(gmm, X_stretched)


from sklearn.datasets import make_moons
# Two interleaving half-moons: 200 points with a small amount of noise.
Xmoon, ymoon = make_moons(200, noise=.05, random_state=0)
plt.scatter(Xmoon[:, 0], Xmoon[:, 1]);


# Two components, used as a clustering model.
gmm2 = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
plot_gmm(gmm2, Xmoon)


# More components, used as a density model rather than for clustering.
# NOTE(review): the variable is called gmm16 but n_components is 8 —
# confirm which of the two is intended.
gmm16 = GaussianMixture(n_components=8, covariance_type='full', random_state=0)
plot_gmm(gmm16, Xmoon, label=False)


# Draw new points from the fitted mixture and show them in red.
# plot_gmm fits gmm16 on Xmoon again before plotting.
Xnew, _ = gmm16.sample(4000)
plot_gmm(gmm16, Xmoon, label=True)
plt.scatter(Xnew[:, 0], Xnew[:, 1], color='red')


# Draw a larger sample and colour every point by its log-likelihood under the model.
Xnew, _ = gmm16.sample(40000)
loglike = gmm16.score_samples(Xnew)
plt.scatter(Xnew[:, 0], Xnew[:, 1], c=loglike, cmap='jet')


# Fit one full-covariance mixture per candidate component count (1..20) and
# compare the information criteria.
n_components = np.arange(1, 21)
models = []
for n in n_components:
    candidate = GaussianMixture(n, covariance_type='full', random_state=0)
    models.append(candidate.fit(Xmoon))

bic_scores = [m.bic(Xmoon) for m in models]
aic_scores = [m.aic(Xmoon) for m in models]
plt.plot(n_components, bic_scores, label='BIC')
plt.plot(n_components, aic_scores, label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components');


from sklearn.datasets import load_digits
# Load the digits dataset and inspect the shape of its data matrix.
digits = load_digits()
digits.data.shape


def plot_digits(data):
    """Show the first 100 rows of *data* as 8x8 grayscale images in a 10x10 grid.

    Every row must be reshapeable to (8, 8); the colour range is fixed to
    [0, 16] for all panels.
    """
    fig, axes = plt.subplots(10, 10, figsize=(8, 8),
                             subplot_kw={'xticks': [], 'yticks': []})
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    for idx, panel in enumerate(axes.flat):
        image = data[idx].reshape(8, 8)
        panel.imshow(image, cmap='gray').set_clim(0, 16)
plot_digits(digits.data)


from sklearn.decomposition import PCA
# Project the digits with PCA before fitting mixtures; the 0.99 argument
# presumably selects the components explaining 99% of the variance —
# verify against the scikit-learn PCA documentation.
pca = PCA(0.99, whiten=False)
data = pca.fit_transform(digits.data)
data.shape


# One diagonal-covariance mixture per candidate component count (1..39).
n_components = np.arange(1, 40, 1)
models = [GaussianMixture(n, covariance_type='diag', random_state=0)
          for n in n_components]
# NOTE(review): the list is named `aics` but it stores BIC values —
# confirm which criterion is intended.
aics = []
for candidate in models:
    candidate.fit(data)
    aics.append(candidate.bic(data))
plt.plot(n_components, aics);


# Pick the fitted model that has exactly 10 components.
idx = np.flatnonzero(n_components == 10)[0]
model = models[idx]


# Draw 100 new points from the selected mixture (in the PCA-projected space).
data_new, y_new = model.sample(100)
data_new.shape


# Map the samples back through the PCA projection and display them as images.
digits_new = pca.inverse_transform(data_new)
plot_digits(digits_new)


# How often each target value occurs in the dataset.
plt.bar(*np.unique(digits.target, return_counts=True));


# Mixing weights of the selected model: one bar per component. The count is
# taken from the model so it stays correct if a different model is selected.
plt.bar(range(len(model.weights_)), model.weights_);


# Mean of the first component, one bar per feature dimension.
plt.bar(range(model.means_[0].shape[0]),model.means_[0]);


# Covariance entries of the first component, one bar per feature dimension.
plt.bar(range(model.means_[0].shape[0]),model.covariances_[0]);

Machine Learning¶

5. Gaussian PDF and Maximum Likelihood Estimation (MLE)¶

Today's lecture¶

Unsupervised Learning¶

Density Estimator¶

Single Gaussian¶

Maximum Likelihood Estimation (MLE)¶

Intro to GMM (Gaussian Mixture Model)¶

This lecture material is taken from¶

K-means as a loss minimization problem¶

Homework 2a¶

Homework 2b¶

Unsupervised Learning¶

Discrete Random Variable¶

Inverse Transform Sampling¶

Inverse Transform Sampling in Action¶

Gaussian (Normal) Distribution¶

Univariate Gaussian (1-D Gaussian)¶

Cumulative Distribution Function¶

Multivariate Gaussian (N-D Gaussian)¶

The Mahalanobis distance¶

Joint, Marginal, Conditional and Bayes Theorem¶

Joint¶

Joint = Marginal $\times$ Conditional¶

Bayes Theorem¶

Bayes Theorem (other direction)¶

Independent Random Variables¶

Terminology for Statistics¶

Terminology for Machine Learning¶

Problem: Given data, learn the parameters¶

The Maximum Likelihood Principle¶

Maximum Likelihood Estimator (MLE) for a single Gaussian¶

Probability Density can be interpreted in two ways:¶

Probability Density can be interpreted in two ways:¶

Minimizing the Negative Log-Likelihood¶

Minimizing the Negative Log-Likelihood¶

General recipe for optimizing with MLE¶

Universal Template¶

MLE with a single Gaussian¶

Let's write it as a gradient¶

Continue the proof by hand...¶

MLE for a single Gaussian¶

Remember: MLE gives you an estimate, NOT the underlying distribution¶

Let's see a practical example (10 datapoints)¶

Let's see a practical example (100 datapoints)¶

Better than before but the std. deviation is still wrong¶

Let's see a practical example (1000 data points)¶

With 1000 training points the estimate seems to be OK¶

But what about different shapes?¶

MLE tends to underestimate the standard deviation of the Gaussian¶

Machine Learning¶

5. Maximum Likelihood Estimation, Gaussian Mixture Model, Clustering¶

Today's lecture¶

Unsupervised Learning¶

Density Estimator¶

Single Gaussian¶

Maximum Likelihood Estimation (MLE)¶

Intro to GMM (Gaussian Mixture Model)¶

This lecture material is taken from¶

K-means as a loss minimization problem¶

Homework 2a¶

Homework 2b¶

Unsupervised Learning¶

Minimizing the Negative Log-Likelihood¶

Minimizing the Negative Log-Likelihood¶

General recipe for optimizing with MLE¶

Universal Template¶

Today's lecture¶

Gaussian Mixture Model (GMM)¶

Density Estimator with GMM¶

Generate data with GMM¶

This lecture material is taken from¶

Unsupervised Learning¶

Problem: What if the data is generated by a multi-modal distribution?¶

Quick Note on Visualization in 1D/2D with Histograms¶

Gaussian¶

Histogram¶

A single Gaussian for each pixel¶

Why model the background?¶

Problem: what if the data is multi-modal?¶