from random import shuffle
# Toy mini-batch SGD schedule: the integers below stand in for training samples.
training = list(range(1,11)) # each index points to a training sample, e.g. an image x of size HxWx3 with its label y
shuffle(training)
converge, it, max_it, k, epoch = False, 0, 100, 3, 0
while not converge and it < max_it: # your training convergence scheme (evaluated once per epoch, not per step)
    print(f'[Epoch {epoch}]')
    for b in range(0, len(training), k): # the data loader gives you one batch of k samples at a time
        mini_batch = training[b:b+k] # so the mini-batch is a tensor HxWx3xk
        if len(mini_batch) != k: # a possible way of handling the leftover samples: skip the incomplete last batch
            continue
        print('SGD step taken over', mini_batch) # compute the loss/gradients and update your model
        loss.backward()  # get the gradients (NOTE(review): `loss` is not defined in the visible code; placeholder)
        optimizer.step() # incorporate them in the model (NOTE(review): `optimizer` is likewise a placeholder)
        # check convergence and set `converge` to True
        it += 1
    epoch += 1 # an epoch is done, we reshuffle the training set
    shuffle(training)

> Original unshuffled training set [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
> Training set [10, 8, 1, 2, 6, 4, 9, 7, 5, 3]
[Epoch 0]
SGD step taken over [10, 8, 1]
SGD step taken over [2, 6, 4]
SGD step taken over [9, 7, 5]
> Training set [1, 10, 6, 9, 3, 7, 8, 4, 5, 2]
[Epoch 1]
SGD step taken over [1, 10, 6]
SGD step taken over [9, 3, 7]
SGD step taken over [8, 4, 5]
> Training set [6, 3, 10, 5, 9, 8, 4, 7, 2, 1]
[Epoch 2]
SGD step taken over [6, 3, 10]
SGD step taken over [5, 9, 8]
SGD step taken over [4, 7, 2]
> Training set [1, 2, 5, 10, 6, 7, 9, 8, 3, 4]
[Epoch 3]
SGD step taken over [1, 2, 5]
SGD step taken over [10, 6, 7]
SGD step taken over [9, 8, 3]
> Training set [2, 3, 1, 9, 6, 8, 4, 10, 7, 5]
[Epoch 4]


import torch
import random
random.seed(0) # fix the seed of Python's `random` module to make the code deterministic
torch.manual_seed(0) # fix the seed of PyTorch's random number generator to make the code deterministic
print(torch.__version__)

2.1.2


## params
A = torch.tensor([[0., 2.], [2.,0.]]) # 2x2
b = torch.tensor([[-0.5],[ 0.5]])  # 2x1
c = torch.tensor([-2.], dtype=torch.float32) # shape (1,)
## input
x = torch.tensor([1., 1.], requires_grad=True) # shape (2,); note how here we require the gradients
thetas = (A, b, c) # pack the parameters


A.shape, b.shape, c.shape, x.shape

(torch.Size([2, 2]), torch.Size([2, 1]), torch.Size([1]), torch.Size([2]))


thetas

(tensor([[0., 2.],
         [2., 0.]]),
 tensor([[-0.5000],
         [ 0.5000]]),
 tensor([-2.]))


def function(x, thetas):
    """Evaluate ``(x^T A x + b^T x + c) ** 2``.

    Args:
        x: Input tensor. The notebook passes a 1-D vector of length 2; a 2-D
            column vector is transposed as before.
        thetas: Tuple ``(A, b, c)`` with the parameters of the quadratic form.

    Returns:
        The tensor ``torch.pow(x^T A x + b^T x + c, 2)``.
    """
    A, b, c = thetas
    # `.T` on a tensor that is not 2-D is deprecated in PyTorch (it warns) and
    # is a no-op for 0-D/1-D tensors, so only transpose when x has 2+ dims.
    x_t = x.T if x.dim() >= 2 else x
    return torch.pow(x_t @ A @ x + b.T @ x + c, 2)


# Plain gradient descent on the input x: 200 steps with a fixed step size of 1e-3.
for epoch in range(200):
    # gradients accumulate across backward() calls, so clear the previous one first
    if x.grad is not None:
        x.grad.zero_()
    func_v = function(x, thetas)  # forward pass, tracked by autograd
    func_v.backward()  # fills x.grad
    if epoch % 20 == 0:
        print(f'{epoch}) y before SGD = ', func_v.item(), 'input x=', x.detach().numpy())
    with torch.no_grad():  # the parameter update itself must not be tracked
        x -= 1e-3 * x.grad  # plain SGD step
        if epoch % 20 == 0:
            print(f'{epoch}) y after SGD  = ', function(x, thetas).item(),
                  'input x=', x.detach().numpy())

0) y before SGD =  4.0 input x= [1. 1.]
0) y after SGD  =  3.5006706714630127 input x= [0.986 0.982]
20) y before SGD =  0.4590446949005127 input x= [0.84928155 0.79601455]
20) y after SGD  =  0.4189494550228119 input x= [0.8456445 0.7907337]
40) y before SGD =  0.08419127017259598 input x= [0.802158  0.7256682]
40) y after SGD  =  0.07775045186281204 input x= [0.80076367 0.72351605]
60) y before SGD =  0.018030276522040367 input x= [0.78288543 0.6954951 ]
60) y after SGD  =  0.01672857254743576 input x= [0.7822726 0.6945199]
80) y before SGD =  0.0041193654760718346 input x= [0.77417433 0.68153346]
80) y after SGD  =  0.0038299155421555042 input x= [0.7738886  0.68107176]
100) y before SGD =  0.000969333981629461 input x= [0.7700594 0.6748613]
100) y after SGD  =  0.0009021023870445788 input x= [0.76992244 0.6746384 ]
120) y before SGD =  0.0002313051954843104 input x= [0.7680748 0.6716252]
120) y after SGD  =  0.00021535974519792944 input x= [0.7680083  0.67151654]
140) y before SGD =  5.5571563279954717e-05 input x= [0.767108  0.6700446]
140) y after SGD  =  5.1754243031609803e-05 input x= [0.76707554 0.6699914 ]
160) y before SGD =  1.3398824194155168e-05 input x= [0.76663494 0.6692699 ]
160) y after SGD  =  1.2477918062359095e-05 input x= [0.766619  0.6692438]
180) y before SGD =  3.2350690162274987e-06 input x= [0.76640296 0.66888946]
180) y after SGD  =  3.0142657578835497e-06 input x= [0.76639515 0.66887665]


function(x, thetas).item()

7.811336786289758e-07


x.T @ A @ x + b.T@x + c

tensor([0.0009], grad_fn=<AddBackward0>)

x

tensor([0.7663, 0.6687], requires_grad=True)

with torch.no_grad(): # we do not track the backward pass
    code that uses the function...


# Evaluate the function without recording the operations for autograd.
with torch.no_grad():
    f = function(x, thetas)
# The printed tensor carries no grad_fn because it was computed under no_grad.
print(f, '\n\n-->if you see in the tensor, there is NO grad_fn=<AddBackward0>) as before')

tensor([7.8113e-07]) 

-->if you see in the tensor, there is NO grad_fn=<AddBackward0>) as before

from torch import nn, optim

# Build a ResNet-18 using `ResNet18_Weights.DEFAULT` (presumably pretrained weights).
# NOTE(review): `resnet18` and `ResNet18_Weights` are not imported in the visible code;
# presumably they come from torchvision.models. Confirm.
model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Freeze all the parameters in the network
for param in model.parameters():
    param.requires_grad = False  # autograd will no longer compute gradients for this parameter


def function(x, thetas):
    """Evaluate ``(x^T A x + b^T x + c) ** 2`` with the linear term detached.

    The linear term ``b^T x`` is detached from the autograd graph, so it
    contributes to the value but not to the gradient with respect to ``x``.

    Args:
        x: Input tensor. The notebook passes a 1-D vector of length 2; a 2-D
            column vector is transposed as before.
        thetas: Tuple ``(A, b, c)`` with the parameters of the quadratic form.

    Returns:
        The tensor ``torch.pow(x^T A x + detach(b^T x) + c, 2)``.
    """
    A, b, c = thetas
    # `.T` on a tensor that is not 2-D is deprecated in PyTorch (it warns) and
    # is a no-op for 0-D/1-D tensors, so only transpose when x has 2+ dims.
    x_t = x.T if x.dim() >= 2 else x
    quad = x_t @ A @ x
    linear_detached = (b.T @ x).detach()  # detaching b*x: no gradient flows through it
    return torch.pow(quad + linear_detached + c, 2)

from torch import tensor

def neural_net(x, y, z):
    """Tiny computational graph: add x and y, then multiply the sum by z."""
    summed = x + y
    return summed * z

# Leaf tensors that require gradients, so autograd tracks them through the net.
x = tensor(-2., requires_grad=True)
y = tensor(5., requires_grad=True)
z = tensor(-4., requires_grad=True)
loss = neural_net(x, y, z)  # forward pass
loss.backward()  # backward pass (after this the gradients can be inspected)
# print d(loss)/dx, d(loss)/dy and d(loss)/dz in turn
for el in (x, y, z):
    print(el.grad)

p = [0.2 0.7 0.1] y =[0 1 0]

dL/dz  = [0.2 0.7-1 0.1] = [0.2 -0.3 0.1]


# Plot y = sin(x^x) on a wide range: x from 0.01 up to about 3 in steps of 0.01.
x_big = np.arange(0.01, 3.01, 0.01)
ys = np.sin(np.power(x_big, x_big))
_ = plt.plot(x_big, ys, 'b-')
plt.xlabel('x')
plt.ylabel('y')
_ = plt.axis('equal')  # same scale on both axes


# Zoom in: plot y = sin(x^x) for x from 1.75 up to about 2.25 in steps of 0.001.
x_med = np.arange(1.75, 2.25, 0.001)
ys = np.sin(np.power(x_med, x_med))
_ = plt.plot(x_med, ys, 'b-')
plt.xlabel('x')
plt.ylabel('y')
_ = plt.axis('equal')  # same scale on both axes


# Zoom in further: plot y = sin(x^x) for x from 2.0 up to about 2.01 in steps of 0.0001.
x_small = np.arange(2.0, 2.01, 0.0001)
ys = np.sin(np.power(x_small, x_small))
_ = plt.plot(x_small, ys, 'b-')
plt.xlabel('x')
plt.ylabel('y')
_ = plt.axis('equal')  # same scale on both axes

Machine Learning¶

10. Multi-Layer Perceptron and Introduction to Deep Learning¶

Recap previous lecture¶

Today's lecture¶

Supervised, Parametric Models¶

Propaedeutic part for Deep Learning¶

0) Optimization in Deep Learning¶

1) Network Structure: Multi-Layer Perceptron (MLP) is a Fully-Connected Neural Net¶

2) Backpropagation¶

This lecture material is taken from¶

Deep Learning¶

0) Quick Intro to Optimization in Deep Learning¶

1) What is a Neural Net (just Multi-Layer Perceptron)¶

2) How to obtain gradients on the weights¶

Gradient Descent or Batch GD¶

Stochastic Gradient Descent or SGD¶

How to optimize a Neural Net - SGD over mini-batches¶

NN training scheme - Pseudo-code¶

Images - Mini-batch is a tensor HxWx3xk¶

as an example with RGB images of size $H\times W$, you have a tensor that contains $k$ images in the mini-batch¶

Video Frames - Mini-batch is a tensor HxWx3xtxk¶

as a set of frames from a video, you have a tensor that contains $k$ frames over $t$ time instants of the videos in the mini-batch¶

1) SGD over mini-batches¶

Change of vocabulary - A bunch of training samples is a mini-batch¶

Mini-Batch, Visually¶

Mini-Batch SGD vs [Batch] GD¶

Mini-batch is a sort of smoothing of the single point SGD¶

There is another smoothing technique: Momentum¶

Moving average¶

Moving average¶

Exponential moving average used in SGD¶

SGD over mini-batches with Momentum¶

SGD over mini-batches with Momentum¶

Loss Surface for Linear Regression $\ell_2^2$ loss with $d=2$ parameters in $\bmf{\theta}$¶

Loss Surface for Linear Regression $\ell_2^2$ loss with $d=10$ parameters in $\bmf{\theta}$¶

With Deep Learning optimization is highly non-convex and params explode!¶

Loss Surface for ResNet-20 with no skip connection on ImageNet¶

ResNet-20, number of parameters $\bmf{\theta}$ of the order of.....millions!¶

You will cover ResNet (residual connections) in Deep Learning course¶

Learning rate is very important¶

Babysitting the training process¶

Loss as a function of epochs¶

Valleys, Hills, Noisy Surface¶

Dynamics of Training¶

Dynamics of Training¶

Just to give you a hint on where the community is headed with Deep Learning¶

DALL-E OpenAI (January 2021)¶

DALL-E OpenAI¶

OpenAI DALL-E - 12-billion parameters trained with self-supervision¶

0) Quick Intro to Optimization in Deep Learning¶

1) Network Structure: Multi-Layer Perceptron (MLP) is a Fully-Connected Neural Net¶

2) Backpropagation¶

1) Network Structure: Multi-Layer Perceptron (MLP)¶

is a Fully-Connected Neural Net¶

Networks and Topics that we do NOT cover¶

Let's go back to single layer, linear soft-max regression or linear neural network¶

Let's recall last classification layer of a neural net as pipeline¶

$\mbf{x} \implies \mbf{z}= \mbf{W}\mbf{x} + \mbf{b} \implies e^{\mbf{z}} \implies \mbf{p} = \frac{e^{\mbf{z}}}{\sum_k e^{\mbf{z}_k}} \implies -\ln(\mbf{p}_y) $¶

Representation of a Single Layer¶

Representation of a Single Layer¶

Representation of a Single Layer¶

Representation of a Single Layer: Linear plus non-Linear¶

Representation of a Single Layer: Linear plus non-Linear¶

Representation as a computational graph¶

Damn, until now it is all linear. So now the "Deep"!¶

Question: May a Single Linear Soft-Max Layer suffer from a Bias or a Variance problem?¶

A single linear layer is not enough for highly non-linear problems¶

Adding another non-linear layer before the classifier¶

$\mathbf{W}^1 \in \mathbb{R}^{d\times p}$ is a Hidden Layer¶

Let's update our visualizations¶

Multi-Layer Perceptron (MLP) with one hidden layer¶

Given the nature of these layers, they're called Fully-Connected NN¶

Multi-Layer Perceptron with one hidden layer¶

Non-linear activation functions: Sigmoid¶

Non-linear activation functions: Sigmoid¶

Non-linear activation functions: ReLU - Rectified Linear Unit¶

Sigmoid¶

ReLU¶

There are other activation functions we do not cover¶

TanH, Leaky ReLU, parametrized ReLU, ELU¶

Images - Mini-batch is a tensor `HxWx3xk`¶

Video Frames - Mini-batch is a tensor `HxWx3xtxk`¶