from sklearn.linear_model import Perceptron
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Toggle for the demo: when True the raw inputs are expanded with the
# interaction term before the perceptron is trained.
lift_features = True

# The four corners of the unit square, labelled with XOR of the two bits.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = X[:, 0] ^ X[:, 1]

if lift_features:
    print('X before=', X, sep='\n')
    # interaction_only=True yields the columns [1, x0, x1, x0*x1]
    # (bias column included), cast back to integers for readable printing.
    lifter = PolynomialFeatures(interaction_only=True)
    X = lifter.fit_transform(X).astype(int)
    # NOTE(review): this label still says "before" although X is now the
    # lifted matrix; kept verbatim because the recorded output matches it.
    print('X before=', X, y.T, sep='\n')

# No separate intercept is fitted -- presumably because the lifted features
# already carry a constant 1 column (confirm if lift_features is turned off).
clf = Perceptron(
    fit_intercept=False,
    max_iter=10,
    tol=None,
    shuffle=False,
)
clf.fit(X, y)
clf.predict(X)
clf.score(X, y)

X before=
[[0 0]
 [0 1]
 [1 0]
 [1 1]]
X before=
[[1 0 0 0]
 [1 0 1 0]
 [1 1 0 0]
 [1 1 1 1]]
[0 1 1 0]

1.0


from sklearn.datasets import make_classification

# Two informative features, one cluster per class, fixed seed: the 2-D
# dataset used by the perceptron animation further down.
X, y = make_classification(
    n_features=2,
    n_redundant=0,
    n_informative=2,
    n_clusters_per_class=1,
    random_state=0,
    class_sep=2.8,
)


# Colour each point by its class label.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='jet');


from sklearn.datasets import make_classification
from matplotlib import cm
from celluloid import Camera
from IPython.display import HTML

def get_support(minx, maxx):
    """Build a 100 x 100 evaluation grid over the square [minx, maxx]^2.

    Returns a tuple ``(points, xx, yy)``: ``points`` lists every grid
    location as a row ``(x, y, 1)`` (the trailing 1 is appended by
    ``add_ones``), while ``xx`` and ``yy`` are the meshgrid coordinate
    matrices the rows were flattened from.
    """
    ticks = np.linspace(minx, maxx, 100)
    xx, yy = np.meshgrid(ticks, ticks)
    # Flatten both coordinate matrices and pair them up column-wise.
    grid = np.stack((xx.ravel(), yy.ravel()), axis=1)
    return add_ones(grid), xx, yy

def plot_separating_plane(w, minx, maxx, points, xx, yy):
    """Shade the plane by the perceptron output for weights ``w``.

    Parameters
    ----------
    w : weight vector passed straight to ``perceptron``.
    minx, maxx : limits applied to the x axis.
    points : grid locations, one per row; transposed before being fed to
        ``perceptron`` and reshaped back to ``xx.shape``.
    xx, yy : meshgrid coordinate matrices matching ``points``.
    """
    dist = perceptron(w, points.T).reshape(xx.shape)
    # Pass the colormap by name: ``matplotlib.cm.get_cmap`` no longer exists
    # in recent Matplotlib releases, while a registered name always works.
    plt.contourf(xx, yy, dist, cmap="RdBu")
    plt.plot(0, 0, 'rx')  # mark the origin
    plt.axis('scaled')
    plt.xlim(minx, maxx)


def perceptron(w, xi):
    """Hard-threshold unit: 1 where ``w . xi`` is positive, otherwise 0.

    The sign of the dot product is taken first (-1, 0 or +1) and then
    floored at zero, so both negative and exactly-zero activations map to 0.
    """
    activation = np.dot(w, xi)
    polarity = np.sign(activation)
    return np.maximum(polarity, 0)


def add_ones(X):
    """Return ``X`` with an extra last column filled with ones.

    ``X`` is expected to be two-dimensional (one sample per row); the
    appended column has one entry per row.
    """
    n_rows = X.shape[0]
    ones_column = np.ones((n_rows, 1))
    return np.concatenate((X, ones_column), axis=1)

def done(w, X, labels=None):
    """Tell whether the perceptron with weights ``w`` classifies every row of ``X`` correctly.

    Parameters
    ----------
    w : weight vector passed to ``perceptron``.
    X : samples, one per row (transposed before prediction).
    labels : expected 0/1 targets. When omitted, the notebook-level ``y``
        is used, which is what the original two-argument call relied on.

    Returns
    -------
    The result of ``np.all`` over the element-wise comparison.
    """
    if labels is None:
        labels = y  # backward-compatible fallback to the global targets
    return np.all(labels == perceptron(w, X.T))


def plot_classification(theta_curr, minx, maxx, support, xx, yy, X, labels=None):
    """Draw the current decision regions with the samples on top.

    Parameters
    ----------
    theta_curr : current perceptron weights.
    minx, maxx, support, xx, yy : forwarded unchanged to
        ``plot_separating_plane``.
    X : samples; only the first two columns are plotted.
    labels : per-sample values used to colour the points. When omitted,
        the notebook-level ``y`` is used, as the original code did.
    """
    if labels is None:
        labels = y  # backward-compatible fallback to the global targets
    plot_separating_plane(theta_curr, minx, maxx, support, xx, yy)
    # Colormap given by name: ``matplotlib.cm.get_cmap`` is gone from
    # recent Matplotlib releases.
    plt.scatter(X[:, 0], X[:, 1], s=80, c=labels,
                facecolors="none",
                zorder=10,  # keep the points above the filled contours
                edgecolors="k",
                cmap="RdBu")

### CAMERA ##########
# Every camera.snap() below records the current state of `fig` as one frame.
fig = plt.figure()
camera = Camera(fig)

################## DATA #####################
# X and y are reused from the make_classification cell earlier in the
# notebook (n_features=2, n_clusters_per_class=1, random_state=0,
# class_sep=2.8).
minx, maxx = X.min(), X.max()
support, xx, yy = get_support(minx, maxx)
# Append the constant column so the third weight acts as the bias.
# NOTE: re-running this cell without regenerating X appends another column.
X = add_ones(X)
############### PARAMS ########################
theta_curr = np.array([[-1, -2, 0]])  # initial weights (w1, w2, bias)
gamma = 1e-2                          # learning rate
i = 0                                 # number of weight updates so far
text_kwargs = dict(ha='center', va='center', fontsize=10, color='C1')
################################################
plot_separating_plane(theta_curr, minx, maxx, support, xx, yy)
# Sweep over the data until everything is classified correctly. The cap on
# `i` is only checked between sweeps, so a sweep in progress always finishes.
while not done(theta_curr, X) and i < 300:
    for xi, yi in zip(X, y):
        prediction = perceptron(theta_curr, xi)
        # Only misclassified samples move the weights.
        if prediction != yi:
            # Perceptron rule: theta <- theta + gamma * (y - prediction) * x
            theta_curr = theta_curr + gamma * (yi - prediction) * xi
            plot_classification(theta_curr, minx, maxx, support, xx, yy, X)
            i += 1
            plt.text(-4, 4.5, str(i), **text_kwargs)  # update counter overlay
            camera.snap()
# Hold the final result for 20 extra frames. A separate loop variable keeps
# the update counter `i` intact.
for _frame in range(20):
    plot_classification(theta_curr, minx, maxx, support, xx, yy, X)
    camera.snap()
animation = camera.animate()
HTML(animation.to_html5_video())


import numpy as np;
import matplotlib.pyplot as plt;
# Family of logistic curves 1 / (1 + exp(-w * x)) for non-negative slopes w:
# w = 0 gives the flat line at 0.5, larger w approaches a step at x = 0.
x = np.arange(-20.0, 20.0, 0.1)
ws = np.array([0, 0.01, 0.1, 0.5, 1, 2, 10, 30])
for w in ws:
    y = 1 / (1 + np.exp(-w * x))
    plt.plot(x, y)
# One legend entry per slope, in plotting order. The trailing semicolon is
# kept from the original cell (presumably to silence the notebook echo).
plt.legend(ws);


import numpy as np;
import matplotlib.pyplot as plt;
# Same family of logistic curves as before but with the slopes negated,
# which mirrors every curve around x = 0.
x = np.arange(-20.0, 20.0, 0.1)
ws = -1 * np.array([0, 0.01, 0.1, 0.5, 1, 2, 10, 30])
for w in ws:
    y = 1 / (1 + np.exp(-w * x))
    plt.plot(x, y)
# One legend entry per slope, in plotting order. The trailing semicolon is
# kept from the original cell (presumably to silence the notebook echo).
plt.legend(ws);


import numpy as np
import matplotlib.pyplot as plt

# The logistic curve and its complement, labelled as the two class posteriors.
x = np.arange(-20.0, 20.0, 0.1)
y = 1 / (1 + np.exp(-x))
plt.plot(x, y)
plt.plot(x, 1 - y)
_ = plt.legend(['p(y=1|x)', 'p(y=0|x)'])


import numpy as np
import matplotlib.pyplot as plt

# Logistic curve p(y=1|x) together with 1 - p, i.e. p(y=0|x).
x = np.arange(-20.0, 20.0, 0.1)
y = 1 / (1 + np.exp(-x))
plt.plot(x, y)
plt.plot(x, 1 - y)
_ = plt.legend(['p(y=1|x)', 'p(y=0|x)'])


# Toy 1-D dataset: standard-normal samples labelled by their sign, with the
# positive ones stretched by 4 and Gaussian noise added afterwards (so a few
# points end up on the "wrong" side of zero).
xmin, xmax = -5, 5
n_samples = 100
np.random.seed(0)
X = np.random.normal(size=n_samples)
positive = X > 0
y = positive.astype(float)
X[positive] *= 4
X += 0.3 * np.random.normal(size=n_samples)

# Column vector: one sample per row, a single feature.
X = X[:, np.newaxis]

# Both classes drawn on the horizontal axis: class 1 in the default colour,
# class 0 in red.
is_one = y == 1
is_zero = y == 0
plt.plot(X[is_one], [0] * int(is_one.sum()), '.')
plt.plot(X[is_zero], [0] * int(is_zero.sum()), 'r.');


import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, LinearRegression
from scipy.special import expit



# Fit a logistic regression with C=1e5 on the 1-D toy data.
clf = LogisticRegression(C=1e5).fit(X, y)

# Plot the labelled samples.
plt.figure(1, figsize=(8, 6))
plt.clf()
plt.scatter(X.ravel(), y, color="black", zorder=20)
X_test = np.linspace(-5, 10, 300)

# Despite its name, `loss` holds the sigmoid of the fitted linear score,
# i.e. the logistic curve evaluated on X_test.
linear_score = X_test * clf.coef_ + clf.intercept_
loss = expit(linear_score).ravel()
plt.plot(X_test, loss, color="red", linewidth=3)

# Ordinary least squares on the same 0/1 targets, for comparison.
ols = LinearRegression().fit(X, y)
plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1)
plt.axhline(0.5, color=".5")  # the 0.5 level

plt.xlabel("X")
plt.ylabel("y")
plt.xticks(range(-5, 10))
plt.yticks([0, 0.5, 1])
plt.xlim(-4, 10)
plt.ylim(-0.25, 1.25)
legend_labels = ("data", "Logistic Regression Model", "Linear Regression Model")
plt.legend(legend_labels, loc="lower right", fontsize="small")
plt.tight_layout()
plt.show()


from sklearn.datasets import make_classification

# Same generator settings as the earlier blobs except class_sep=0.8.
X, y = make_classification(
    n_features=2,
    n_redundant=0,
    n_informative=2,
    n_clusters_per_class=1,
    random_state=0,
    class_sep=0.8,
)


# Colormap given by name: ``matplotlib.cm.get_cmap`` is gone from recent
# Matplotlib releases, while a registered name is accepted everywhere.
plt.scatter(*X.T, c=y, cmap="RdBu");


# Unregularised logistic regression. ``penalty=None`` replaces the string
# 'none', which recent scikit-learn releases reject.
clf = LogisticRegression(penalty=None)
clf.fit(X, y)

# Square evaluation grid spanning the overall min/max of the data.
minx, maxx = X.min(), X.max()
support = np.linspace(minx, maxx, 100)
xx, yy = np.meshgrid(support, support)
points = np.stack((xx.flatten(), yy.flatten()), axis=1)

# Keep only the first predict_proba column (presumably p(class 0 | x),
# given the 0/1 labels) and fold it back into the grid shape.
prob_mesh = clf.predict_proba(points)
prob_mesh = prob_mesh[:, 0].reshape(xx.shape)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Left: probability map with the samples on top. The points are coloured
# by 1 - y so that they follow the same colour scale as the first-column map.
# Colormaps are given by name because ``matplotlib.cm.get_cmap`` is gone
# from recent Matplotlib releases.
axes[0].contourf(xx, yy, prob_mesh, cmap="RdBu")
axes[0].plot(0, 0, 'rx')
axes[0].scatter(*X.T, c=(1 - y), cmap="RdBu")
axes[0].axis('scaled')
# Right: the two learned coefficients as bars.
axes[1].bar([0, 1], clf.coef_[0])
axes[1].axis('equal')
axes[1].set_xlim((-5, 5))
axes[1].set_ylim((-2, 8));


import numpy as np
# Three linear scorers, one per row of W. Every grid point is assigned the
# index of the scorer with the largest value.
ww = [[1, 2], [-1, 4], [8, 2]]
W = np.array(ww)
support = np.linspace(-10, 10, 100)
xx, yy = np.meshgrid(support, support)
dim = xx.shape
points = np.stack((xx.flatten(), yy.flatten()), axis=1)
dist = np.argmax(W @ points.T, axis=0).reshape(dim)

plt.figure(figsize=(7, 7))
plt.contourf(xx, yy, dist, cmap='jet')
plt.plot(0, 0, 'rx')  # mark the origin
plt.axis('scaled')
#plt.colorbar()
plt.xlim(-10, 10);

# Flat vector of winning indices, one per grid point.
y = np.argmax(W @ points.T, axis=0)


import numpy as np
from scipy.special import softmax
# Hard assignment (_softmax = False): each grid point gets the index of the
# row of W with the highest score. The softmax branch is kept for parity
# with the following cells.
_softmax = False
ww = [[1, 2], [-1, 4], [8, 2]]
support = np.linspace(-10, 10, 100)
xx, yy = np.meshgrid(support, support)
W = np.array(ww)
dim = xx.shape
points = np.stack((xx.flatten(), yy.flatten()), axis=1)
if not _softmax:
    prob = np.argmax(W @ points.T, axis=0)
else:
    # Class index (0, 1, 2) times its softmax weight, then the largest
    # product per grid point.
    weighted = np.arange(0, 3)[:, np.newaxis] * softmax(0.2 * W @ points.T, axis=0)
    prob = np.max(weighted, axis=0)
prob = prob.reshape(dim)

# 2-D view of the regions.
plt.figure(figsize=(7, 7))
plt.contourf(xx, yy, prob, cmap='jet')
plt.plot(0, 0, 'rx')
plt.axis('scaled')
plt.xlim(-10, 10)

# Same values as a 3-D surface.
plt.figure(figsize=(7, 7))
ax = plt.axes(projection='3d')
ax.plot_surface(xx, yy, prob, rstride=1, cstride=1, cmap='jet', edgecolor='none')
ax.view_init(30, -110)


import numpy as np
from scipy.special import softmax
# Soft assignment (_softmax = True): the scores are scaled by 0.2, pushed
# through a softmax over the three rows of W, and each class index is
# weighted by its softmax value before taking the maximum.
_softmax = True
ww = [[1, 2], [-1, 4], [8, 2]]
support = np.linspace(-10, 10, 100)
xx, yy = np.meshgrid(support, support)
W = np.array(ww)
dim = xx.shape
points = np.stack((xx.flatten(), yy.flatten()), axis=1)
if not _softmax:
    prob = np.argmax(W @ points.T, axis=0)
else:
    weighted = np.arange(0, 3)[:, np.newaxis] * softmax(0.2 * W @ points.T, axis=0)
    prob = np.max(weighted, axis=0)
prob = prob.reshape(dim)

# 2-D view.
plt.figure(figsize=(7, 7))
plt.contourf(xx, yy, prob, cmap='jet')
plt.plot(0, 0, 'rx')
plt.axis('scaled')
# plt.colorbar()
plt.xlim(-10, 10);

# Same values as a 3-D surface.
plt.figure(figsize=(7, 7))
ax = plt.axes(projection='3d')
ax.plot_surface(
    xx, yy, prob,
    rstride=1, cstride=1, cmap='jet', edgecolor='none',
)
ax.view_init(30, -130)


import numpy as np
from scipy.special import softmax
# Softmax applied directly to the two grid coordinates, sharpened by
# `temperature`. Each coordinate index (0 or 1) is weighted by its softmax
# value and the larger product is kept. W and ww are defined but not used
# in this computation.
_softmax = True
temperature = 10
ww = [[1, 2], [-1, 4], [8, 2]]
support = np.linspace(-10, 10, 100)
xx, yy = np.meshgrid(support, support)
W = np.array(ww)
dim = xx.shape
points = np.stack((xx.flatten(), yy.flatten()), axis=1)
if not _softmax:
    prob = np.argmax(points, axis=1)
else:
    weighted = np.arange(0, 2) * softmax(temperature * points, axis=1)
    prob = np.max(weighted, axis=1)
prob = prob.reshape(dim)

# 2-D view with 500 contour levels.
plt.figure(figsize=(4, 4))
plt.contourf(xx, yy, prob, cmap='jet', levels=500)
plt.plot(0, 0, 'rx')
plt.axis('scaled')
plt.xlim(-10, 10)

# Same values as a 3-D surface.
plt.figure(figsize=(7, 7))
ax = plt.axes(projection='3d')
ax.plot_surface(
    xx, yy, prob,
    rstride=1, cstride=1, cmap='jet', edgecolor='none',
)
ax.view_init(30, 30)

Data Type $y$	Expo. Family	Name/ML Topic
$\mathbb{R}$	Gaussian, Laplace	Regression
$\{0,1\}$	Bernoulli	Binary Classification
$\{1,\dots,K\}$	Categorical	Multi-class Classification
$\mathbb{N}_{+}$	Poisson	Poisson Regression (Counts)
Categorical	Dirichlet	More advanced Topics

Data Type $y$	Expo. Family	Name
$\mathbb{R}$	Gaussian, Laplace	Regression
$\{0,1\}$	Bernoulli	Binary Classification
$\{1,\dots,K\}$	Categorical	Multi-class Classification
$\mathbb{N}_{+}$	Poisson	Poisson Regression
Categorical	Dirichlet	More advanced Topics

AI & Machine Learning - Unit 2¶

8. Perceptron and Logistic Regression¶

Yet Another Text Book¶

Today's lecture¶

Supervised, Parametric Models¶

Propaedeutic part for Deep Learning¶

1) The Perceptron Algorithm¶

2) Logistic Regression¶

This lecture material is taken from¶

The Perceptron Algorithm¶

The Perceptron Algorithm¶

The Perceptron Algorithm¶

The Perceptron Algorithm¶

Observations¶

Observations¶

Observations¶

Learning the Perceptron¶

Learning the Perceptron¶

Learning the Perceptron¶

Learning the Perceptron¶

Learning the Perceptron¶

Learning the Perceptron¶

Key Observation¶

Key Observation¶

Key Observation¶

Learning the Perceptron¶

Perceptron Update Rule: $\bmf{\theta} \leftarrow \bmf{\theta} +\gamma \left(y - \sigma(\mbf{\theta}^T\mbf{x})\right)\mbf{x}$¶

Theory:¶

Perceptron cannot learn XOR logic function¶

Awesome demo coming up¶

Perceptron Weakness¶

Perceptron History¶

Logistic Regression¶

Logistic Regression¶

Logistic Regression¶

Logistic Regression (also called Logit)¶

Logistic Regression - Probabilistic View¶

Logistic Regression - Probabilistic View¶

Logistic Regression - Probabilistic View¶

How changing $\mathbf{w}$ changes the output of the function¶

How changing $\mathbf{w}$ changes the output of the function¶

What is the probability of points on the decision boundary?¶

What is the probability of points on the decision boundary?¶

Logistic Regression + threshold for probability = classifier¶

Learning Logistic Regression - No closed form solution¶

Bernoulli Distribution¶

Before moving to optimizing Logistic Regression¶

let's have a "bird's eye" view¶

Generalized Linear Models (GLM) - $\sigma(\bmf{\theta}^T\mbf{x})$¶

Generalized Linear Models (GLM) - $\sigma(\bmf{\theta}^T\mbf{x})$¶

Linear Regression Revisited¶

Linear Regression Revisited¶

Linear Regression Revisited¶

Linear Regression Revisited¶

What we see when we start¶

Linear Regression Revisited¶

What we have to invert¶

Something similar holds for Logistic Regression¶

Something similar holds for Logistic Regression¶

Something similar holds for Logistic Regression¶

Logistic Regression Data¶

Learning Logistic Regression¶

Maximizing Log Likelihood¶

Learning Logistic Regression¶

Maximizing Log Likelihood¶

Learning Logistic Regression¶

Maximizing Log Likelihood¶

Learning Logistic Regression¶

Gradient of Log Likelihood¶

Gradient/Derivative of Logistic Function¶

Gradient of Logistic Function¶

Learning Logistic Regression - Gradient of Log Likelihood¶

Learning Logistic Regression - Gradient of Log Likelihood¶

Learning Logistic Regression - Gradient of Log Likelihood¶

Learning Logistic Regression - Gradient of Log Likelihood¶

Learning Logistic Regression - Gradient of Log Likelihood¶

Learning Logistic Regression - Gradient of Log Likelihood¶

Gradient Ascent for Logistic Regression since maximizing Log Likelihood¶

Same update rule for linear regression but $f_{\theta}\left(x^{(i)}\right)$ changes!¶

Because of the property of GLM!¶

In other words: is `argmax` itself differentiable?¶

In other words: is `argmax` itself differentiable? Nope!¶