import numpy as np
from matplotlib import pyplot as plt

from sklearn import linear_model, datasets


# Dataset size and side length (in inches) of the square figure.
n_samples, size = 100, 10

# Synthetic regression problem with two features, one of them informative.
# coef=True additionally returns the coefficients used to build the targets.
X, y, coef_gt = datasets.make_regression(
    n_samples=n_samples, n_features=2, n_informative=1,
    noise=20, coef=True, random_state=42)

# 3D scatter: feature 0 on x, feature 1 on y, target on z.
fig = plt.figure(figsize=(size, size))
ax = fig.add_subplot(projection='3d')
ax.scatter(X[:, 0], X[:, 1], y, c='red', marker='o')
# Camera at elevation 0 and azimuth -90 degrees.
ax.view_init(elev=0, azim=-90)


import numpy as np
from matplotlib import pyplot as plt

from sklearn import linear_model, datasets


# Dataset size and side length (in inches) of the square figure.
n_samples = 100
size = 10

# Synthetic regression data with two features, one of which is informative;
# coef=True also returns the coefficients used to generate the targets.
X, y, coef_gt = datasets.make_regression(
    n_samples=n_samples,
    n_features=2,
    n_informative=1,
    noise=20,
    coef=True,
    random_state=42,
)
# 3D scatter: feature 0 on x, feature 1 on y, target on z.
fig = plt.figure(figsize=(size, size))
ax = fig.add_subplot(projection='3d')
ax.scatter(X[..., 0], X[..., 1], y, c='red', marker='o')
# view_init(elev, azim): elevation 0, azimuth -90 degrees.
ax.view_init(0, -90)


%matplotlib notebook
from sklearn import linear_model, datasets
from matplotlib import pyplot as plt
import numpy as np

# Dataset size and side length (in inches) of the square figure.
n_samples, size = 100, 8

# Same synthetic 2-feature problem (one informative feature).
X, y, coef_gt = datasets.make_regression(
    n_samples=n_samples, n_features=2, n_informative=1,
    noise=20, coef=True, random_state=42)
fig = plt.figure(figsize=(size, size))
ax = fig.add_subplot(projection='3d')

# Least squares through the normal equation: theta = (X^T X)^-1 X^T y.
# A column of ones is appended so the last entry of theta acts as intercept.
bias = np.ones((X.shape[0], 1))
X = np.c_[X, bias]
theta = np.linalg.inv(X.T @ X) @ X.T @ y

# Evaluate the fitted plane on a 10x10 grid.
# X already contains the bias column here, so the constant 1 takes part in
# the min/max as well.
Xmin, Xmax = X.min(), X.max()
support = np.linspace(Xmin, Xmax, 10)
xx, yy = np.meshgrid(support, support)
# One row per grid node: (x, y, 1), in the same row-major order as xx / yy.
data = np.column_stack((xx.ravel(), yy.ravel(), np.ones(xx.size)))
z = np.dot(theta, data.T).reshape(xx.shape)

# Semi-transparent plane with the samples on top of it.
ax.plot_surface(xx, yy, z, alpha=0.2)
ax.scatter(X[:, 0], X[:, 1], y, c='red', marker='o')
ax.view_init(elev=0, azim=90)


%matplotlib inline
# Bar chart of the estimated parameters, one bar per entry of theta.
plt.figure()
plt.bar(x=list(range(theta.size)), height=theta)
plt.xlabel('Coefficients');


%matplotlib inline
# Bar chart of the coefficients returned by make_regression, for comparison
# with the estimated ones.
plt.figure()
plt.bar(x=list(range(coef_gt.size)), height=coef_gt)
plt.xlabel('Coefficients GT');


%matplotlib notebook
from sklearn import linear_model, datasets
from matplotlib import pyplot as plt
import numpy as np

# Only three samples: with two features plus the bias column added below,
# the design matrix becomes square (3x3).
n_samples = 3
size = 12

X, y, coef_gt = datasets.make_regression(
    n_samples=n_samples,
    n_features=2,
    n_informative=1,
    noise=20,
    coef=True,
    random_state=42,
)
fig = plt.figure(figsize=(size, size))
ax = fig.add_subplot(projection='3d')
# First scatter of the raw samples (blue); the same points are drawn again in
# red after the plane is plotted.
ax.scatter(X[..., 0], X[..., 1], y, c='blue', marker='o')
# Linear regression on a square system: append a column of ones and solve
# X @ theta = y by inverting X directly (no normal equation needed).
bias = np.ones((X.shape[0], 1))
X = np.hstack((X, bias))
theta = np.linalg.inv(X)@y
# Evaluate the fitted plane on a 10x10 grid. X already includes the bias
# column here, so the constant 1 also takes part in the min/max.
Xmin, Xmax = X.min(), X.max()
support = np.linspace(Xmin, Xmax, 10)
xx, yy = np.meshgrid(support, support)
# Flatten the grid into rows (x, y), then append the bias term to each row.
data = np.stack((xx, yy), axis=2)
data = data.reshape(-1, 2)
data = np.hstack((data, np.ones((data.shape[0], 1))))
# Plane height at every grid node, reshaped back to the grid layout.
z = np.dot(theta, data.T)
z = z.reshape(xx.shape)
ax.plot_surface(xx, yy, z, alpha=0.2)
ax.scatter(X[..., 0], X[..., 1], y, c='red', marker='o')
# view_init(elev, azim): elevation 0, azimuth 90 degrees.
ax.view_init(0, 90)


%matplotlib inline
from sklearn import linear_model, datasets
from matplotlib import pyplot as plt
import numpy as np

# Dataset size and side length (in inches) of the square figure.
n_samples = 100
size = 7

# Synthetic 1-feature regression problem.
X, y, coef_gt = datasets.make_regression(
    n_samples=n_samples,
    n_features=1,
    n_informative=1,
    noise=20,
    coef=True,
    random_state=42,
)
fig = plt.figure(figsize=(size, size))
ax = fig.add_subplot()
# Least squares through the normal equation: theta = (X^T X)^-1 X^T y.
# A column of ones is appended so that theta[1] acts as the intercept.
bias = np.ones((X.shape[0], 1))
X = np.hstack((X, bias))
theta = np.linalg.inv(X.T@X)@X.T@y
# Evaluate the fitted line on 100 points spanning THIS dataset's feature
# range. The range is taken from column 0 only (column 1 is the bias), rather
# than from Xmin/Xmax left over from a previously executed cell, which were
# computed on a different dataset.
x_interp = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
x_interp = x_interp.reshape(-1,1)
x_interp = np.c_[x_interp,np.ones_like(x_interp)]
y_interp = np.dot(theta, x_interp.T)
# Fitted line as small dots, samples as red circles.
ax.scatter(x_interp[:,0], y_interp, alpha=0.7, marker='.')
ax.scatter(X[..., 0], y, c='red', marker='o')
ax.set_xlabel('$x$');
ax.set_ylabel('$y$');


%matplotlib inline
from sklearn import linear_model, datasets
from matplotlib import pyplot as plt
import numpy as np

# Dataset size and side length (in inches) of the square figure.
n_samples = 100
size = 7

# Synthetic 1-feature regression problem.
X, y, coef_gt = datasets.make_regression(
    n_samples=n_samples,
    n_features=1,
    n_informative=1,
    noise=20,
    coef=True,
    random_state=42,
)
fig = plt.figure(figsize=(size, size))
ax = fig.add_subplot()
# Least squares through the normal equation: theta = (X^T X)^-1 X^T y.
# A column of ones is appended so that theta[1] acts as the intercept.
bias = np.ones((X.shape[0], 1))
X = np.hstack((X, bias))
theta = np.linalg.inv(X.T@X)@X.T@y
# Evaluate the fitted line on 100 points spanning THIS dataset's feature
# range. The range is taken from column 0 only (column 1 is the bias), rather
# than from Xmin/Xmax left over from a previously executed cell, which were
# computed on a different dataset.
x_interp = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
x_interp = x_interp.reshape(-1,1)
x_interp = np.c_[x_interp,np.ones_like(x_interp)]
y_interp = np.dot(theta, x_interp.T)
# Fitted line as small dots, samples as red circles.
ax.scatter(x_interp[:,0], y_interp, alpha=0.7, marker='.')
ax.scatter(X[..., 0], y, c='red', marker='o')
ax.set_xlabel('$x$');
ax.set_ylabel('$y$');


# Filled contour plot of the loss over the two parameters.
# NOTE(review): xxt, yyt and losses are not defined in the visible part of
# this file; presumably a parameter grid and the loss evaluated on it.
fig = plt.figure(figsize=(size-1, size-1))
plt.rcParams['axes.grid'] = False
plt.contourf(xxt, yyt, losses, levels=50, cmap='jet');
plt.colorbar()
# Red marker at (coef_gt, 0) in the (theta_1, theta_2) plane.
plt.scatter(coef_gt,0,color='red',marker='o',s=50)
plt.axis('scaled')
# Raw strings with an escaped \theta so mathtext renders the Greek letter
# instead of the literal word "theta".
plt.xlabel(r'$\theta_1$')
plt.ylabel(r'$\theta_2$');


## Implementation of Gradient Descent for Linear Regression

import time
%matplotlib notebook


def get_diff(X, theta, y):
    """Return the residuals ``X @ theta - y``.

    ``y`` gets a trailing axis of length one before the subtraction, so a
    1-D target vector lines up with a column-vector prediction.
    """
    targets = np.expand_dims(y, axis=-1)
    predictions = np.matmul(X, theta)
    return predictions - targets


def get_loss(diff):
    """Return half the squared norm of ``diff``, computed as ``diff.T @ diff``.

    For a column vector of shape (n, 1) the result is a (1, 1) array.
    """
    squared_norm = diff.T.dot(diff)
    return 0.5 * squared_norm


def plot_line(plot3, theta):
    """Evaluate the line given by ``theta`` on 100 points in [Xmin, Xmax].

    ``Xmin`` and ``Xmax`` are read from module-level globals and must exist
    before the call.

    If ``plot3`` is truthy, its x/y data are updated in place and nothing is
    returned. Otherwise the pair ``(x_interp, y_interp)`` is returned, where
    ``x_interp`` has two columns: the grid and a bias column of ones.
    """
    grid = np.linspace(Xmin, Xmax, 100)[:, np.newaxis]
    x_interp = np.column_stack((grid, np.ones_like(grid)))
    y_interp = x_interp.dot(theta)
    if not plot3:
        return x_interp, y_interp
    plot3.set_xdata(x_interp[:, 0])
    plot3.set_ydata(y_interp)
    return None


# Live demo of batch gradient descent on the least-squares loss.
# Four panels are redrawn at every iteration:
#   ax0: trajectory of theta over the loss contours,
#   ax1: loss per iteration,
#   ax2: data with the current fitted line,
#   ax3: gradient norm per iteration.
# NOTE(review): xxt, yyt and losses are not defined in the visible part of
# this file; presumably a parameter grid and the loss evaluated on it.
plt.ion()
figure, (axes_1, axes_2) = plt.subplots(2, 2, figsize=(9, 9))
plt.rcParams['axes.grid'] = False
ax0, ax1 = axes_1
ax2, ax3 = axes_2
ax0.contourf(xxt, yyt, losses, levels=50, cmap='jet')
ax0.scatter(coef_gt, 0, color='red', marker='o', s=50)
# Raw strings with an escaped \theta so mathtext renders the Greek letter
# instead of the literal word "theta".
ax0.set_xlabel(r'$\theta_1$',fontsize=18)
ax0.set_ylabel(r'$\theta_2$',fontsize=18)
ax1.set_ylabel('$loss$',fontsize=18)
ax1.set_xlabel('$iter$',fontsize=18)
ax1.set(xlim=(0, 100), ylim=(0, 1.28e6))
ax2.scatter(X[..., 0], y, c='red', marker='o')
ax2.set_xlabel('$x$',fontsize=18)
ax2.set_ylabel('$y$',fontsize=18)
ax3.set_ylabel('$Grad. Norm.$',fontsize=18)
ax3.set_xlabel('$iter$',fontsize=18)
ax3.set(xlim=(0, 100), ylim=(0, 20000))

# Starting point as a (2, 1) column vector, plus the per-iteration histories.
theta_curr = np.array([[-100, -100]]).T
losses_track = [get_loss(get_diff(X, theta_curr, y))]
# Placeholder first entry; it is skipped ([1:]) whenever the norms are plotted.
grad_norm_track = [1000]

# theta_track collects one column per visited parameter vector.
theta_track = np.array(theta_curr)
lr = 1e-3
loss_tol = 10

plot1, = ax0.plot(*theta_curr, color='violet',
                  marker='.', markersize=5, linestyle='-')
plot2, = ax1.plot(*losses_track, color='blue',
                  marker='.', markersize=10, linestyle='--')
# plot_line returns a two-column x (grid and bias), so ax2.plot yields two
# line objects; both are updated inside the loop.
xi, yi = plot_line(None, theta_curr)
plot3a,plot3b = ax2.plot(xi, yi, color='blue', marker='.',
                  markersize=3, linestyle='--')
plot4, = ax3.plot(1000, color='blue',
                  marker='.', markersize=10, linestyle='--')
# There is no iteration cap: the loop ends only through the loss test below.
while True:
    # Gradient of 0.5 * ||X theta - y||^2: per-sample residual times features,
    # summed over all samples and turned into a (2, 1) column.
    diff = get_diff(X, theta_curr, y)
    grad = (diff * X).sum(axis=0, keepdims=True).T
    theta_curr = theta_curr - lr*grad
    theta_track = np.append(theta_track, theta_curr, axis=1)
    # Loss at the updated parameters.
    diff = get_diff(X, theta_curr, y)
    losses_track.append(get_loss(diff))
    grad_norm_track.append(np.linalg.norm(grad,2))
    # Stop once the loss changes by less than loss_tol between iterations.
    if abs(losses_track[-2]-losses_track[-1]) < loss_tol:
        break
    # Push the new histories into the existing artists and redraw.
    plot1.set_xdata(theta_track[0, :])
    plot1.set_ydata(theta_track[1, :])
    plot2.set_xdata(range(len(losses_track)))
    plot2.set_ydata(losses_track)
    plot4.set_xdata(range(len(grad_norm_track[1:])))
    plot4.set_ydata(grad_norm_track[1:])
    plot_line(plot3a, theta_curr)
    plot_line(plot3b, theta_curr)
    figure.canvas.draw()
    figure.canvas.flush_events()
    # Short pause so the animation can be followed by eye.
    time.sleep(0.1)
print(*theta_curr)
plt.show()

[46.09610615] [1.80913084]


# 3D surface of the loss J over the two parameters.
# NOTE(review): xxt, yyt and losses are not defined in the visible part of
# this file; presumably a parameter grid and the loss evaluated on it.
plt.rcParams['axes.grid'] = False
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(xxt, yyt, losses, cmap='jet')
# Raw strings with an escaped \theta so mathtext renders the Greek letter
# instead of the literal word "theta".
ax.set_xlabel(r'$\theta_1$')
ax.set_zlabel('$J$')
ax.set_ylabel(r'$\theta_2$');


# Implementation of Stochastic Gradient Descent for Linear Regression

import time
%matplotlib notebook


def get_diff(X, theta, y):
    """Return the prediction error ``X @ theta - y``.

    A new trailing axis is added to ``y`` first, so a 1-D target vector is
    subtracted as a column.
    """
    column_targets = y[..., None]
    return np.matmul(X, theta) - column_targets


def get_loss(diff):
    """Return ``0.5 * diff.T @ diff``, i.e. half the squared residual norm.

    For a column vector of shape (n, 1) the result is a (1, 1) array.
    """
    half = 0.5
    return half * np.dot(np.transpose(diff), diff)


def plot_line(plot3, theta):
    """Sample the line defined by ``theta`` on 100 points in [Xmin, Xmax].

    ``Xmin`` and ``Xmax`` are module-level globals that must be defined
    before this function is called.

    With a truthy ``plot3`` the artist's x/y data are overwritten and the
    function returns nothing; otherwise it returns ``(x_interp, y_interp)``,
    where ``x_interp`` carries the grid in column 0 and ones in column 1.
    """
    abscissa = np.linspace(Xmin, Xmax, 100).reshape(-1, 1)
    ones = np.ones_like(abscissa)
    x_interp = np.concatenate((abscissa, ones), axis=1)
    y_interp = np.dot(x_interp, theta)
    if not plot3:
        return x_interp, y_interp
    plot3.set_xdata(x_interp[:, 0])
    plot3.set_ydata(y_interp)
    return None


# Live demo of stochastic gradient descent on the least-squares loss.
# Same four panels as a batch-gradient demo would use:
#   ax0: trajectory of theta over the loss contours,
#   ax1: loss per iteration,
#   ax2: data with the current fitted line,
#   ax3: gradient norm per iteration.
# NOTE(review): xxt, yyt and losses are not defined in the visible part of
# this file; presumably a parameter grid and the loss evaluated on it.
plt.ion()
figure, (axes_1, axes_2) = plt.subplots(2, 2, figsize=(7, 7))
plt.rcParams['axes.grid'] = False
ax0, ax1 = axes_1
ax2, ax3 = axes_2
ax0.contourf(xxt, yyt, losses, levels=50, cmap='jet')
ax0.scatter(coef_gt, 0, color='red', marker='o', s=50)
# Raw strings with an escaped \theta so mathtext renders the Greek letter
# instead of the literal word "theta".
ax0.set_xlabel(r'$\theta_1$', fontsize=18)
ax0.set_ylabel(r'$\theta_2$', fontsize=18)
ax1.set_ylabel('$loss$', fontsize=18)
ax1.set_xlabel('$iter$', fontsize=18)
ax1.set(xlim=(0, 100), ylim=(0, 1.28e6))
ax2.scatter(X[..., 0], y, c='red', marker='o')
ax2.set_xlabel('$x$', fontsize=18)
ax2.set_ylabel('$y$', fontsize=18)
ax3.set_ylabel('$Grad. Norm.$', fontsize=18)
ax3.set_xlabel('$iter$', fontsize=18)
ax3.set(xlim=(0, 100), ylim=(0, 300))

# Starting point as a (2, 1) column vector, plus the per-iteration histories.
theta_curr = np.array([[-100, -100]]).T
losses_track = [get_loss(get_diff(X, theta_curr, y))]
# Placeholder first entry; it is skipped ([1:]) whenever the norms are plotted.
grad_norm_track = [1000]

# theta_track collects one column per visited parameter vector.
theta_track = np.array(theta_curr)
lr = 1e-1
loss_tol = 10
# Fixed seed so the sequence of sampled indices is reproducible.
np.random.seed(42)

plot1, = ax0.plot(*theta_curr, color='violet',
                  marker='.', markersize=5, linestyle='-')
plot2, = ax1.plot(*losses_track, color='blue',
                  marker='.', markersize=10, linestyle='--')
# plot_line returns a two-column x (grid and bias), so ax2.plot yields two
# line objects; both are updated inside the loop.
xi, yi = plot_line(None, theta_curr)
plot3a, plot3b = ax2.plot(xi, yi, color='blue', marker='.',
                          markersize=3, linestyle='--')
plot4, = ax3.plot(1000, color='blue',
                  marker='.', markersize=10, linestyle='--')
# There is no iteration cap: the loop ends only through the loss test below.
while True:
    diff = get_diff(X, theta_curr, y)
    # STOCHASTIC PART ########################
    # Gradient from ONE uniformly sampled example instead of the sum over all
    # samples: its residual times its features, as a (2, 1) column.
    idx_sampled = np.random.randint(n_samples)
    grad = (diff * X)[idx_sampled, :].T.reshape(-1, 1)
    ##############################
    theta_curr = theta_curr - lr*grad
    theta_track = np.append(theta_track, theta_curr, axis=1)
    # Loss on the full dataset at the updated parameters.
    diff = get_diff(X, theta_curr, y)
    losses_track.append(get_loss(diff))
    grad_norm_track.append(np.linalg.norm(grad, 2))
    # Stop once the loss changes by less than loss_tol between iterations.
    if abs(losses_track[-2]-losses_track[-1]) < loss_tol:
        break
    # Push the new histories into the existing artists and redraw.
    plot1.set_xdata(theta_track[0, :])
    plot1.set_ydata(theta_track[1, :])
    plot2.set_xdata(range(len(losses_track)))
    plot2.set_ydata(losses_track)
    plot4.set_xdata(range(len(grad_norm_track[1:])))
    plot4.set_ydata(grad_norm_track[1:])
    plot_line(plot3a, theta_curr)
    plot_line(plot3b, theta_curr)
    figure.canvas.draw()
    figure.canvas.flush_events()
    # Short pause so the animation can be followed by eye.
    time.sleep(0.1)
print(*theta_curr)
plt.show()

[42.27801342] [-3.01691851]


def solve_lstq(x, y):
    """Return the least-squares parameters for ``x @ theta ~= y``.

    Solves the normal equations ``(x.T @ x) @ theta = x.T @ y`` as a linear
    system instead of forming the explicit inverse of ``x.T @ x``; the result
    is the same up to rounding, but the computation is cheaper and
    numerically better behaved.

    Parameters:
        x: design matrix of shape (n, d).
        y: targets of shape (n,) or (n, k).

    Returns:
        Array of shape (d,) or (d, k), matching the layout of ``y``.

    Raises:
        numpy.linalg.LinAlgError: if ``x.T @ x`` is singular.
    """
    gram = x.T @ x
    moment = x.T @ y
    return np.linalg.solve(gram, moment)


# Linear fit (no bias column) on the training points, drawn over the
# training (red) and validation (blue) samples.
# NOTE(review): x, y, x_valid, y_valid and support_X are not defined in the
# visible part of this file.
plt.figure(figsize=(7, 7))
plt.scatter(x, y, c='red', marker='o', s=30)
plt.scatter(x_valid, y_valid, c='blue', marker='o', s=30)
theta = solve_lstq(x, y)
# 100 evaluation points on [-1.5 * support_X, 1.5 * support_X], as a column.
x_interp = np.linspace(-support_X*1.5, support_X*1.5, 100).reshape(-1, 1)
y_interp = np.dot(theta, x_interp.T)
plt.plot(x_interp[:, 0], y_interp.T, alpha=0.7, marker='.');

# Mean squared error of the fit on the training points.
err = np.power(y - np.dot(theta, x.T), 2).mean()


%matplotlib inline
# Fit on the feature matrix xq and plot the resulting curve over the data.
# NOTE(review): xq is not defined in the visible part of this file;
# presumably np.c_[x, x**2], mirroring x_interp_q below - confirm.
plt.figure(figsize=(7, 7))
plt.scatter(x, y, c='red',marker='o', s=30)
theta_q = solve_lstq(xq, y)
x_interp = np.linspace(-support_X*1.5, support_X*1.5, 100).reshape(-1, 1)
# Quadratic feature map of the evaluation grid: columns [x, x^2].
x_interp_q = np.c_[x_interp, x_interp**2]
y_interp_q = np.dot(theta_q.T, x_interp_q.T)
plt.plot(x_interp_q[:, 0], y_interp_q.T, alpha=0.7, marker='.');

# Mean squared error of the fit on the training points.
err = np.power(y - np.dot(theta_q.T, xq.T), 2).mean()


%matplotlib inline
# Cubic fit: expand x into [x, x^2, x^3], solve least squares on the expanded
# features and plot the resulting curve over the data.
xc = np.c_[x, x**2, x**3]  # make cubic features
plt.figure(figsize=(7, 7))
plt.scatter(x, y, c='red', marker='o', s=30);
theta_c = solve_lstq(xc,y)
# Same cubic feature map applied to the evaluation grid; x_interp is reused
# from a previously executed cell.
x_interp_c = np.c_[x_interp, x_interp**2, x_interp**3]
y_interp_c = np.dot(theta_c.T, x_interp_c.T)
plt.plot(x_interp_c[:, 0], y_interp_c.T, alpha=0.7, marker='.');

# Mean squared error of the fit on the training points.
err = np.power(y - np.dot(theta_c.T, xc.T), 2).mean()


from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np

# Sweep the polynomial degree from 1 to 19, keeping every fitted model and
# its mean squared error on the training and on the validation points.
errors, errors_valid, models = [], [], []
for m in range(1, 20):
    # Degree-m polynomial features without the constant column, followed by
    # a linear fit without intercept.
    model = Pipeline(steps=[
        ('poly', PolynomialFeatures(degree=m, include_bias=False,
                                    interaction_only=False)),
        ('linear', LinearRegression(fit_intercept=False)),
    ])
    models.append(model)
    model = model.fit(x, y)

    # Dense grid used to draw the fitted curve.
    x_interp = np.linspace(
        -support_X-offset_valid, support_X+offset_valid, 100).reshape(-1, 1)
    y_interp = model.predict(x_interp)

    # Training and validation mean squared errors for this degree.
    y_est = model.predict(x)
    errors.append(np.power(y - y_est, 2).mean())
    errors_valid.append(np.power(y_valid - model.predict(x_valid), 2).mean())

    # Draw: one figure per degree, with a fixed y-range.
    plt.figure(figsize=(7, 7))
    plt.scatter(x, y, c='red', marker='o', s=30)
    plt.plot(x_interp, y_interp, alpha=0.7, marker='.')
    plt.ylim([-10, 10])


# Side-by-side bar charts of the per-degree errors:
# training on the left, validation on the right, both clipped to [0, 200].
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
axes[0].bar(range(1, 20), errors)
axes[0].set_ylim([0, 200])
axes[1].bar(range(1, 20), errors_valid)
axes[1].set_ylim([0, 200])
# argmin is 0-based while the degrees start at 1, hence the +1 when printing.
m_best = np.argmin(errors_valid)
print(f'M best (polynomial degree is) {m_best+1}')

M best (polynomial degree is) 3


# Curve of the model with the lowest validation error, evaluated on x_interp.
y_interp = models[m_best].predict(x_interp)

# Draw: training points in red, validation points in blue, model on top.
plt.figure(figsize=(7, 7))
plt.scatter(x, y, c='red', marker='.')
plt.scatter(x_valid, y_valid, c='blue', marker='.')
plt.plot(x_interp, y_interp, alpha=0.7, marker='.')
plt.ylim(-100, 100);


# Draw the last model stored in `models` (the one appended last by the
# sweep) over the training (red) and validation (blue) points.
plt.figure(figsize=(7, 7))
plt.scatter(x, y, c='red', marker='.')
plt.scatter(x_valid, y_valid, c='blue', marker='.')
y_interp = models[-1].predict(x_interp)
plt.plot(x_interp, y_interp, alpha=0.7, marker='.');
# Fixed y-range for the plot.
plt.ylim([-100,100]);

# Overwrite X with its polynomial feature expansion. Per the scikit-learn
# documentation, interaction_only=True keeps only products of distinct input
# features (no powers of a single feature).
X = PolynomialFeatures(interaction_only=True).fit_transform(X)


# One sample with two features, expanded to all degree-2 polynomial terms.
X_rand = np.array([[3.0, 7.0]])
print(f'Input Dimension {X_rand.shape}')

expander = PolynomialFeatures(degree=2, interaction_only=False)
X_rand_poly = expander.fit_transform(X_rand)
print(f'Output Dimension {X_rand_poly.shape}')

# Original features on the first line, expanded features on the second.
print(X_rand[0], X_rand_poly[0], sep='\n')

Input Dimension (1, 2)
Output Dimension (1, 6)
[3. 7.]
[ 1.  3.  7.  9. 21. 49.]


%matplotlib inline
# Number of output features of a degree-2 polynomial expansion for input
# dimensions d = 1..49, measured on a random single-row input.
new_size = [
    PolynomialFeatures(degree=2, interaction_only=False)
    .fit_transform(np.random.rand(1, d))
    .shape[1]
    for d in range(1, 50)
]
plt.figure(figsize=(12, 12))
plt.bar(range(1, 50), new_size);

x_1	x_2	y
-1.415	-0.420	-116.6
0.5219	0.2969	58.737
-0.889	-0.815	-73.89
-0.883	0.1537	-113.3
0.7384	0.1713	63.998
-0.264	2.7201	-30.33
1.1428	0.7519	81.616
0.3613	1.5380	48.283
0.8125	1.3562	119.06
-0.223	0.7140	-12.36
-1.106	-1.196	-122.4

Machine Learning¶

7. Linear Regression¶

Recap previous lecture¶

Today's lecture¶

We go back to your loved 🥰 Linear Algebra¶

Supervised, Parametric Models¶

1) Ordinary Linear Regression with Least Squares¶

2) Probabilistic Interpretation¶

3) Gradient Descent "Family"¶

This lecture material is taken from¶

The data¶

Living area vs Apartment Price¶

Linear Regression settings¶

Linear Hypothesis¶

Trick for Notation Compactness¶

Parametric Nature¶

Loss or Cost Function for Linear Regression¶

Minimize the Total Loss with a Closed Form Solution¶

Explicit Cost¶

Vectorizing the Explicit Cost¶

Vectorizing the Explicit Cost¶

Vectorizing the Explicit Cost¶

Solve it¶

Set the gradient to zero¶

To get the normal equation¶

Final Least Squares solution¶

Debugging the Coefficients¶

Important: The distance is NOT orthogonal¶

Important: The distance is NOT orthogonal (1D case)¶

Interpretation as solving an overdetermined Linear System ($n\gg d$)¶

The normal equation gives you a way to invert $\mbf{X}^T\mbf{X}$¶

What happens if $n=d+1$?¶

What happens if $n=d$?¶

Probabilistic Interpretation¶

Probabilistic Interpretation for Linear Regression¶

Probabilistic Interpretation for Linear Regression¶

What does the noise look like?¶

What does the noise look like?¶

Probabilistic Interpretation for Linear Regression¶

Probabilistic Interpretation for Linear Regression¶

Estimate $\bmf{\theta}$ by Maximum Likelihood (MLE)¶

Estimate $\bmf{\theta}$ by Maximum Likelihood (MLE)¶

Maximizing the Log Likelihood (MLE)¶

Maximizing the Log Likelihood (MLE) equals Minimizing the Squared Loss¶

(Under the assumption that the errors are Gaussian distributed)¶

Let's assume we could not find a closed form solution but we know how to program plus a bit of calculus, can we still solve Linear Regression?¶

We cannot derive a closed form solution...¶

👋🏼 Closed Form Solution; 🤗 Iterative Methods¶

A very simple yet effective Iterative method is Gradient Descent¶

Ready for an awesome demo?¶

Gradient Descent (GD)¶

Analysis¶

Gradient Descent Algorithm as an Iterative Method¶

Convergence¶

0) Always: validation loss/metric (early stopping) (required)¶

1) No significant decrease in the loss function (preferred)¶

2) No variations in the parameters¶

3) Gradient Norm goes to zero¶

Gradient Descent on Linear Regression¶

Dimension Check¶

Stochastic Gradient Descent (SGD)¶

Let's see the dynamic of SGD!¶

Another demo¶

Stochastic Gradient Descent (SGD): lots of Variations¶

Notable Variations for Deep Learning¶

Machine Learning¶

7. Polynomial Regression, Feature Maps, Ridge Regression¶

Recap¶

We go back to your loved 🥰 Linear Algebra¶

Supervised, Parametric Models¶

1) Ordinary Linear Regression with Least Squares¶

2) Probabilistic Interpretation¶

3) Gradient Descent "Family"¶

Gradient Descent and [Stochastic] GD¶

GD¶

SGD¶

Maximizing the Log Likelihood (MLE) equals Minimizing the Squared Loss¶

(Under the assumption that the errors are Gaussian distributed)¶

Today¶

Make Linear Regression... Non-Linear¶