# Fit the GDA parameters (class means, covariances and priors) on the Iris data.
# X (feature matrix) and iris_y (integer class labels) are defined outside this excerpt.
d = 2 # number of features in our toy dataset
K = 3 # number of classes
n = X.shape[0] # size of the dataset

# these are the shapes of the parameters
mus = np.zeros([K,d])       # one mean vector per class
Sigmas = np.zeros([K,d,d])  # one covariance matrix per class
phis = np.zeros([K])        # one prior probability per class

# we now compute the parameters
for k in range(K):  # loop over K instead of repeating the literal 3
    X_k = X[iris_y == k]               # rows whose label equals k
    mus[k] = np.mean(X_k, axis=0)      # per-feature mean of class k
    Sigmas[k] = np.cov(X_k.T)          # np.cov treats rows as variables, hence the transpose
    phis[k] = X_k.shape[0] / float(n)  # fraction of the dataset in class k


print('mus:', mus, '\nphis:', phis)

mus: [[5.01 3.43]
 [5.94 2.77]
 [6.59 2.97]] 
phis: [0.33 0.33 0.33]


# scatter the first two columns of X, colouring each point by its entry in Y
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=Y,
    s=50,
    edgecolors='k',
    cmap=plt.cm.Paired,
)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width');


# we can implement this in numpy
def gda_predictions(x, mus, Sigmas, phis):
    r"""Return class assignments and per-class scores under the GDA model.

    The intent is to compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y).

    NOTE(review): the score evaluated below is
        sqrt(|(2*pi)^d * det(Sigma_k)|) * (-0.5)
            * exp((x - mu_k)^T Sigma_k^{-1} (x - mu_k)) * phi_k
    which is always negative and differs from the usual Gaussian density
    (normaliser in the denominator, -0.5 inside the exponent). The callers in
    this file take log(-1/3 * score), so the numerical behaviour is kept
    as-is here; confirm the intended formula before treating the returned
    values as probabilities.

    Args:
        x: array of shape (n, d), one query point per row.
        mus: array of shape (K, d), one mean per class.
        Sigmas: array of shape (K, d, d), one covariance matrix per class.
        phis: array of shape (K,), class priors.

    Returns:
        A tuple (idx, scores): idx has shape (n,) and holds the index of the
        class with the largest score for each row of x; scores has shape
        (K, n).
    """
    # adjust shapes
    n, d = x.shape
    # Take the class count from the parameters rather than a module-level K,
    # so the function keeps working after K is rebound elsewhere.
    K = np.shape(mus)[0]
    x = np.reshape(x, (1, n, d, 1))
    mus = np.reshape(mus, (K, 1, d, 1))
    Sigmas = np.reshape(Sigmas, (K, 1, d, d))

    # compute probabilities
    py = np.tile(phis.reshape((K,1)), (1,n)).reshape([K,n,1,1])
    diff = x - mus  # broadcasts to (K, n, d, 1)
    # quadratic form (x-mu)^T Sigma^{-1} (x-mu), shape (K, n, 1, 1)
    quad = np.matmul(np.matmul(diff.transpose([0,1,3,2]), np.linalg.inv(Sigmas)), diff)
    scale = np.sqrt(np.abs((2*np.pi)**d*np.linalg.det(Sigmas))).reshape((K,1,1,1))
    pxy = scale * -.5*np.exp(quad)
    pyx = pxy * py
    return pyx.argmax(axis=0).flatten(), pyx.reshape([K,n])

# classify every row of X; idx holds the arg-max class index per row,
# pyx the (K, n) array of per-class scores
idx, pyx = gda_predictions(X, mus, Sigmas, phis)
print(idx)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 2 2 2 1 2 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1
 2 2 2 2 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 2 2
 2 1]


# evaluate the model on every point of the (xx, yy) grid
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z, pyx = gda_predictions(grid_points, mus, Sigmas, phis)
# gda_predictions returns negative scores (it multiplies by -.5),
# so the sign is flipped before taking the log
logpy = np.log(-1./3*pyx)

# bring predictions and log-scores back to the grid layout
Z = Z.reshape(xx.shape)
contours = np.zeros([K, xx.shape[0], xx.shape[1]])
for k in range(K):
    contours[k] = logpy[k].reshape(xx.shape)

# predicted-class regions, then one contour set per class
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
for k in range(K):
    plt.contour(xx, yy, contours[k], levels=np.logspace(0, 1, 1))

# overlay the data points coloured by their labels
plt.scatter(X[:, 0], X[:, 1], c=iris_y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width');


# Fit the parameters again, this time with a single covariance shared by all classes.
# X (feature matrix) and iris_y (integer class labels) are defined outside this excerpt.
d = 2 # number of features in our toy dataset
K = 3 # number of classes
n = X.shape[0] # size of the dataset

# these are the shapes of the parameters
mus = np.zeros([K,d])
Sigmas = np.zeros([K,d,d])
phis = np.zeros([K])

# The shared covariance does not depend on k, so compute it once and
# broadcast it into every class slot (this is now X.T instead of X_k.T).
# NOTE(review): np.cov(X.T) is the covariance of the whole dataset around the
# global mean, not the pooled within-class covariance — confirm this is intended.
Sigmas[:] = np.cov(X.T)

# we now compute the remaining per-class parameters
for k in range(K):  # loop over K instead of repeating the literal 3
    X_k = X[iris_y == k]               # rows whose label equals k
    mus[k] = np.mean(X_k, axis=0)      # per-feature mean of class k
    phis[k] = X_k.shape[0] / float(n)  # fraction of the dataset in class k

# print out the means
print(mus)

[[5.01 3.43]
 [5.94 2.77]
 [6.59 2.97]]


# we can implement this in numpy
def gda_predictions(x, mus, Sigmas, phis):
    r"""Return class assignments and per-class scores under the GDA model.

    The intent is to compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y).

    NOTE(review): the score evaluated below is
        sqrt(|(2*pi)^d * det(Sigma_k)|) * (-0.5)
            * exp((x - mu_k)^T Sigma_k^{-1} (x - mu_k)) * phi_k
    which is always negative and differs from the usual Gaussian density
    (normaliser in the denominator, -0.5 inside the exponent). The callers in
    this file take log(-1/3 * score), so the numerical behaviour is kept
    as-is here; confirm the intended formula before treating the returned
    values as probabilities.

    Args:
        x: array of shape (n, d), one query point per row.
        mus: array of shape (K, d), one mean per class.
        Sigmas: array of shape (K, d, d), one covariance matrix per class.
        phis: array of shape (K,), class priors.

    Returns:
        A tuple (idx, scores): idx has shape (n,) and holds the index of the
        class with the largest score for each row of x; scores has shape
        (K, n).
    """
    # adjust shapes
    n, d = x.shape
    # Take the class count from the parameters rather than a module-level K,
    # so the function keeps working after K is rebound elsewhere.
    K = np.shape(mus)[0]
    x = np.reshape(x, (1, n, d, 1))
    mus = np.reshape(mus, (K, 1, d, 1))
    Sigmas = np.reshape(Sigmas, (K, 1, d, d))

    # compute probabilities
    py = np.tile(phis.reshape((K,1)), (1,n)).reshape([K,n,1,1])
    diff = x - mus  # broadcasts to (K, n, d, 1)
    # quadratic form (x-mu)^T Sigma^{-1} (x-mu), shape (K, n, 1, 1)
    quad = np.matmul(np.matmul(diff.transpose([0,1,3,2]), np.linalg.inv(Sigmas)), diff)
    scale = np.sqrt(np.abs((2*np.pi)**d*np.linalg.det(Sigmas))).reshape((K,1,1,1))
    pxy = scale * -.5*np.exp(quad)
    pyx = pxy * py
    return pyx.argmax(axis=0).flatten(), pyx.reshape([K,n])

# classify every row of X with the shared-covariance parameters;
# idx holds the arg-max class index per row, pyx the (K, n) array of scores
idx, pyx = gda_predictions(X, mus, Sigmas, phis)
print(idx)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 2 2 2 1 2 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 1 2 2
 1 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]


# evaluate the model on every point of the (xx, yy) grid
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z, pyx = gda_predictions(grid_points, mus, Sigmas, phis)
# gda_predictions returns negative scores (it multiplies by -.5),
# so the sign is flipped before taking the log
logpy = np.log(-1./3*pyx)

# bring predictions and log-scores back to the grid layout
Z = Z.reshape(xx.shape)
contours = np.zeros([K, xx.shape[0], xx.shape[1]])
contours[:] = logpy.reshape(contours.shape)  # row k of logpy becomes grid k

# predicted-class regions, then one contour set per class
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
for k in range(K):
    plt.contour(xx, yy, contours[k], levels=np.logspace(0, 1, 1))

# overlay the data points coloured by their labels
plt.scatter(X[:, 0], X[:, 1], c=iris_y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width');


# side-by-side comparison of the LDA regions (Z) and the logistic-regression regions (Z_lr)
nrow, ncol = 1, 2
fig, axs = plt.subplots(nrow, ncol, figsize=(6*ncol, 2*nrow))

panels = [(Z, 'LDA'), (Z_lr, 'Logistic Regression')]
for ax, (regions, title) in zip(axs, panels):
    # background: predicted class per grid cell
    ax.pcolormesh(xx, yy, regions, cmap=plt.cm.Paired)
    ax.set_title(title)
    # foreground: the data points coloured by their labels
    ax.scatter(X[:, 0], X[:, 1], c=iris_y, edgecolors='k', cmap=plt.cm.Paired)
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')


#https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
    
from IPython.display import Markdown, display
import numpy as np; np.set_printoptions(precision=2)
import pandas as pd; pd.options.display.float_format = "{:,.2f}".format
from sklearn.datasets import fetch_20newsgroups

# for this lecture, we will restrict our attention to just 4 different newsgroups:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# load the training split of the dataset, restricted to the categories above;
# the fixed random_state makes the shuffle reproducible
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

# print some information on it (first 1088 characters of the dataset description)
Markdown(twenty_train.DESCR[:1088])

=================   ==========
Classes                     20
Samples total            18846
Dimensionality               1
Features                  text
=================   ==========


# The targets in this dataset are the newsgroup topics:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


# Let's examine one data point: the raw text of the document at index 3
print(twenty_train.data[3])

From: s0612596@let.rug.nl (M.M. Zwart)
Subject: catholic church poland
Organization: Faculteit der Letteren, Rijksuniversiteit Groningen, NL
Lines: 10

Hello,

I'm writing a paper on the role of the catholic church in Poland after 1989. 
Can anyone tell me more about this, or fill me in on recent books/articles(
in english, german or french). Most important for me is the role of the 
church concerning the abortion-law, religious education at schools,
birth-control and the relation church-state(government). Thanx,

                                                 Masja,
"M.M.Zwart"<s0612596@let.rug.nl>


from sklearn.feature_extraction.text import CountVectorizer

# vectorize the training set
# binary=True records word presence (0/1) rather than word counts
count_vect = CountVectorizer(binary=True)
# learn the vocabulary and build the document-by-word matrix in one pass
X_train = count_vect.fit_transform(twenty_train.data)
X_train.shape

(2257, 35788)


# show the beginning of the word -> index mapping (truncated to 94 characters)
vocab_preview = str(count_vect.vocabulary_)[:94] + ' ...'
print('Word to index `.vocabulary`: ', vocab_preview)

# and the first ten entries of the index -> word array
feature_names = count_vect.get_feature_names_out()
print('Index to word: ', feature_names[:10])

Word to index `.vocabulary`:  {'from': 14887, 'sd345': 29022, 'city': 8696, 'ac': 4017, 'uk': 33256, 'michael': 21661, 'coll ...
Index to word:  ['00' '000' '0000' '0000001200' '000005102000' '0001' '000100255pixel'
 '00014' '000406' '0007']


# The CountVectorizer class records the index j associated with each word in V
vocabulary = count_vect.vocabulary_
for word in ('church', 'computer'):
    print(f'Index for the word "{word}": ', vocabulary.get(word))

# And we can map the indices back to words
for j in (8609, 10000):
    print(f"Word for the index '{j}': ", feature_names[j])

Index for the word "church":  8609
Index for the word "computer":  9338
Word for the index '8609':  church
Word for the index '10000':  counseling


# We can examine if any of these words are present in our previous datapoint
doc_id = 3
print(twenty_train.data[doc_id])

# let's see if it contains these words: look up each word's column and read
# the corresponding entry in the row of X_train for this document
print('-' * 60)
vocabulary = count_vect.vocabulary_
for word in ('church', 'computer', 'doctor', 'important'):
    print(f'Value at the index for the word "{word}": ', X_train[doc_id, vocabulary.get(word)])

From: s0612596@let.rug.nl (M.M. Zwart)
Subject: catholic church poland
Organization: Faculteit der Letteren, Rijksuniversiteit Groningen, NL
Lines: 10

Hello,

I'm writing a paper on the role of the catholic church in Poland after 1989. 
Can anyone tell me more about this, or fill me in on recent books/articles(
in english, german or french). Most important for me is the role of the 
church concerning the abortion-law, religious education at schools,
birth-control and the relation church-state(government). Thanx,

                                                 Masja,
"M.M.Zwart"<s0612596@let.rug.nl>

------------------------------------------------------------
Value at the index for the word "church":  1
Value at the index for the word "computer":  0
Value at the index for the word "doctor":  0
Value at the index for the word "important":  1


from sklearn.linear_model import LogisticRegression

# Create an instance of Softmax and fit the data.
# The deprecated multi_class='multinomial' argument is omitted: with the
# default lbfgs solver scikit-learn already fits a multinomial (softmax)
# model whenever there are more than two classes.
# C=1e5 makes the regularisation very weak.
logreg = LogisticRegression(C=1e5, verbose=True)
logreg.fit(X_train, twenty_train.target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =       143156     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.12887D+03    |proj g|=  2.12500D+02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
*****     38     43      1     0     0   9.100D-05   1.095D-02
  F =   1.0948389171855248E-002

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished

LogisticRegression(C=100000.0, multi_class='multinomial', verbose=True)

LogisticRegression(C=100000.0, multi_class='multinomial', verbose=True)


# two unseen documents to classify
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# reuse the fitted vectorizer so the columns match the training features
X_new = count_vect.transform(docs_new)
predicted = logreg.predict(X_new)

# report the predicted newsgroup name next to each document
for doc, category in zip(docs_new, predicted):
    print(f'{doc!r} => {twenty_train.target_names[category]}')

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


#https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
from sklearn.datasets import fetch_20newsgroups

# for this lecture, we will restrict our attention to just 4 different newsgroups:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# load the training split restricted to those categories (reproducible shuffle)
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
# the trailing semicolon suppresses the notebook's echo of this expression
Markdown(twenty_train.DESCR[:1088]);


from sklearn.feature_extraction.text import CountVectorizer

# vectorize the training set
# binary=True records word presence (0/1); max_features=1000 caps the vocabulary at 1000 words
count_vect = CountVectorizer(binary=True, max_features=1000)
y_train = twenty_train.target
# .toarray() converts the sparse result into a dense numpy array
X_train = count_vect.fit_transform(twenty_train.data).toarray()
feature_names = count_vect.get_feature_names_out()
X_train.shape

(2257, 1000)


# Fit the Bernoulli Naive Bayes parameters on the vectorized training set.
n = X_train.shape[0] # size of the dataset
d = X_train.shape[1] # number of features in our dataset
K = len(twenty_train.target_names) # number of classes, taken from the label set

# these are the shapes of the parameters
psis = np.zeros([K,d])  # psis[k, j]: fraction of class-k documents containing word j
phis = np.zeros([K])    # phis[k]: fraction of documents in class k

# we now compute the parameters
for k in range(K):
    X_k = X_train[y_train == k]        # documents labelled k
    psis[k] = np.mean(X_k, axis=0)     # per-word mean of the 0/1 features
    phis[k] = X_k.shape[0] / float(n)

# print out the class proportions
print(phis)

[0.21 0.26 0.26 0.27]


# For each class, take the 200 words with the largest psi (ascending order,
# so the highest-probability words come last).
top_words = [feature_names[np.argsort(class_psis)[-200:]] for class_psis in psis]

for category, class_words in enumerate(top_words):
    # Words appearing in any other class's top list; a set gives constant-time
    # membership tests instead of scanning the concatenated array per word.
    words_from_other_categories = set(
        np.concatenate(top_words[:category] + top_words[category+1:])
    )
    unique_words = [word for word in class_words if word not in words_from_other_categories]
    print(f'\n# top 10 ~unique words occurring in {twenty_train.target_names[category]}:')
    print(unique_words[-10:])

# top 10 ~unique words occurring in alt.atheism:
['someone', 'again', 'allan', 'political', 'schneider', 'atheism', 'caltech', 'cco', 'keith', 'atheists']

# top 10 ~unique words occurring in comp.graphics:
['files', 'version', 'file', 'mail', 'image', 'keywords', 'program', 'looking', 'please', 'graphics']

# top 10 ~unique words occurring in sci.med:
['soon', 'univ', 'pittsburgh', 'disease', 'geb', 'banks', 'medical', 'years', 'gordon', 'pitt']

# top 10 ~unique words occurring in soc.religion.christian:
['faith', 'athos', 'church', 'bible', 'christ', 'jesus', 'apr', 'christians', '1993', 'rutgers']


def nb_predictions(x, psis, phis):
    r"""Return class assignments and log-scores under the Bernoulli NB model.

    We compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y), working in log
    space: log p(y=k) + sum_j [x_j log psi_kj + (1 - x_j) log(1 - psi_kj)].

    Args:
        x: array of shape (n, d) with 0/1 features, one document per row.
        psis: array of shape (K, d), per-class Bernoulli parameters.
        phis: array of shape (K,), class priors.

    Returns:
        A tuple (idx, logpyx): idx has shape (n,) and holds the index of the
        highest-scoring class for each row of x; logpyx has shape (K, n) and
        holds the unnormalised log-scores.
    """
    # adjust shapes
    n, d = x.shape
    # Take the class count from the parameters rather than a module-level K,
    # so the function does not depend on notebook state.
    K = np.shape(psis)[0]
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))

    psis = psis.clip(1e-14, 1-1e-14) # clip probabilities to avoid log(0)

    # compute log-probabilities
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)  # broadcasts to (K, n, d)
    logpyx = logpxy.sum(axis=2) + logpy                 # sum over words -> (K, n)

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

# classify the training documents and show the first ten predicted class indices
idx_train, logpyx = nb_predictions(X_train, psis, phis)
print(idx_train[:10])

[1 1 3 0 3 3 3 2 2 2]


# training accuracy: fraction of documents whose predicted class equals the label
(idx_train==y_train).mean()

0.8692955250332299


# an unseen document to classify
docs_new = ['OpenGL on the GPU is fast']

# reuse the fitted vectorizer and densify, as was done for the training matrix
X_new = count_vect.transform(docs_new).toarray()
predicted, logpyx_new = nb_predictions(X_new, psis, phis)

# report the predicted newsgroup name next to each document
for doc, category in zip(docs_new, predicted):
    print(f'{doc!r} => {twenty_train.target_names[category]}')

'OpenGL on the GPU is fast' => comp.graphics

Machine Learning¶

6. Generative Models¶

Generative vs Discriminative Models in Machine Learning¶

Learning from complex distributions is common to both vision and NLP (curse of dimensionality)¶

Vision: high-dimensional, continuous data¶

NLP: combinatorial/compositional problem of discrete symbols¶

Unknown density of the data¶

Known data samples dataset¶

Generative vs Discriminative Models¶

Generative¶

Discriminative¶

Generative and Discriminative are Connected¶

Generative and Discriminative are Connected¶

From Generative to Discriminative¶

Discriminative vs. Generative Models¶

Relation to text-to-image (TTI) models¶

Generative Classifiers: Intuition¶

Generative Classifiers: Details¶

Part 3: Gaussian Discriminant Analysis¶

The Parameters of a GMM¶

Learning the Parameters $\mu_k, \Sigma_k$¶

Learning the Parameters $\phi$¶

Querying the Model¶

Algorithm: Gaussian Discriminant Analysis (GDA)

Example: Iris Flower Classification¶

Special Cases of GDA¶

Part 4: Discriminative vs. Generative Classifiers¶

Linear Discriminant Analysis¶

LDA vs. Logistic Regression¶

Discriminative Approaches¶

Other Useful Features of Generative Models¶

Naive Bayes¶

Part 1: Text Classification¶

Text Classification¶

Bag of Words Representations¶

Classification Dataset: Twenty Newsgroups¶

The 20 newsgroups text dataset¶

Practical Considerations¶

Classification Using BoW Features¶

Summary of Text Classification¶

Part 2: Generative Classifiers for Text¶

Generative Classifiers: Intuition¶

Example: Gaussian Discriminant Analysis¶

A Generative Text Classifier¶

Part 3: Naive Bayes¶

A Generative Model for Text Classification¶

Review: Categorical Distribution¶

First Attempt at a Generative Model¶

Problem: High Dimensionality¶

The Naive Bayes Assumption¶

Naive Bayes Assumption for Bag of Words¶

Is Naive Bayes a Good Assumption?¶

Bernoulli Naive Bayes Model¶

Part 4: Naive Bayes: Learning¶

The Parameters of a Naive Bayes Model¶

Learning the Parameters $\phi$¶

Learning the Parameters $\psi_{jk}$¶

Querying the Model¶

Classification Dataset: Twenty Newsgroups¶

Algorithm: Bernoulli Naive Bayes¶

Part 4: Naive Bayes: Learning (Advanced)¶

Review: Maximum Likelihood Learning¶

Learning a Bernoulli Naive Bayes Model¶

Learning the Parameters $\phi$¶

Learning the Parameters $\psi_{jk}$¶

Generative vs. Discriminative Approaches¶

Summary¶