# Run this cell to get everything set up.
from lec_utils import *
import lec25_util as util
Lecture 25¶
Computer Vision, Conclusion¶
EECS 398: Practical Data Science, Winter 2025¶
practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed
Final exam details 🙇¶
- The Final Exam is on Monday, April 28th from 10:30AM-12:30PM.
You'll receive your assigned room via email by Thursday. If we've made alternate arrangements, I'll confirm them with you on Thursday as well.
- Lectures 13-24, Homeworks 7-11, and Discussions 8-13 are all in scope. The exam is not cumulative, but post-midterm material builds upon pre-midterm material.
For instance, you'll need to know about loss functions and basic pandas, since we've used such ideas repeatedly in the second half of the semester.
- You can bring two double-sided handwritten notes sheets.
Feel free to bring your midterm notes sheet as one of your two sheets!
- We have two review sessions this week:
- Wednesday 4/23, 3-5PM, 1670 BBB.
- Saturday 4/26, 1-3PM, 1670 BBB.
There will be a different worksheet for each review session. We will post both worksheets today.
Do them in advance as if they were mock exams, and we'll take them up together.
- The study site is the most important resource, followed by lecture slides, and then assignments.
When working through old discussion problems, work as if you're in a real exam – time yourself, don't use the internet, no music, sit somewhere public, etc.
Agenda 📆¶
- Computer vision 👾.
- Parting thoughts 💭.
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Computer vision 👾¶
We'll wrap up the semester by looking at a fun application area of the tools we've seen – computer vision.
None of the new material today is in scope for the exam, but a lot of it is implicitly review.

Computer vision is a branch of machine learning that deals with learning patterns in images and videos.
The MNIST dataset¶
- The MNIST dataset contains 70,000 labeled images of digits, all of which are 28 pixels by 28 pixels and grayscale (no color).
MNIST stands for "Modified National Institute of Standards and Technology". Yann LeCun et al. are responsible for curating the dataset; read the original webpage for it here.

- The dataset is pre-split into a training set of 60,000 images and test set of 10,000 images.
- Test set performance on MNIST is a common benchmark for evaluating the quality of a classifier.
Convolutional neural networks, one of the most popular model architectures for computer vision, were developed specifically in the context of achieving high accuracy on MNIST.
There are other, similarly-purposed datasets too, like FashionMNIST.
- State-of-the-art neural network-based models have achieved test set accuracies as high as 99.87%.
See the leaderboard here!
Loading the data¶
- sklearn has a built-in function for loading datasets from OpenML.
from sklearn.datasets import fetch_openml
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=True)
# The documentation here: https://www.openml.org/search?type=data&status=active&id=554
# tells us that the first 60,000 rows constitute the training set.
X_train, X_test = X.iloc[:60000], X.iloc[60000:]
y_train, y_test = y.iloc[:60000].astype(int), y.iloc[60000:].astype(int)
- What do X_train and y_train actually look like?
X_train
| | pixel1 | pixel2 | pixel3 | pixel4 | ... | pixel781 | pixel782 | pixel783 | pixel784 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 59997 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 59998 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 59999 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
60000 rows × 784 columns
y_train
0        5
1        0
2        4
        ..
59997    5
59998    6
59999    8
Name: class, Length: 60000, dtype: int64
From vectors to images¶
- In X_train, each image is represented by a $28 \cdot 28 = 784$-dimensional vector, representing a flattened version of the image.
X_train.iloc[98]
pixel1      0
pixel2      0
pixel3      0
           ..
pixel782    0
pixel783    0
pixel784    0
Name: 98, Length: 784, dtype: int64
- The first 28 pixels are the first row of the image, the second 28 pixels are the second row of the image, and so on. To view the image, we can reshape the vector into a 2D grid.
X_train.iloc[98].to_numpy().reshape((28, 28))
array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]])
- Each pixel is represented with a value from 0 to 255, where larger values are more intense (darker in the plot below).
# We'll keep using image 98 as an example, so remember that it's a 3!
util.show_image(X_train.iloc[98])
The distribution of the training set¶
- Before training any models, we should assess whether there's any class imbalance in the training set.
Remember, we shouldn't peek at the test set until we've actually trained a model.
y_train.value_counts(normalize=True).sort_index().plot(kind='bar', title='Distribution of Digits in the Training Set')
- The 10 possible digits seem to appear at roughly the same frequency.
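- For context, since the classes are roughly balanced, a baseline that always predicts the most common digit would only be right about 11% of the time. That's a useful reference point for the accuracies of the models we fit below.
# Accuracy of a baseline that always predicts the most common digit in the training set.
y_train.value_counts(normalize=True).max()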
Model #1: $k$-Nearest Neighbors 🏡🏠¶
- We can use $k$-nearest neighbors to predict the digit in a new image, $\vec{x}_\text{new} \in \mathbb{R}^{784}$.
Intuitively, this means finding the $k$ "most similar" images to $\vec{x}_\text{new}$.
Remember, $k$-nearest neighbors is a classification method for supervised learning.
- Since we're treating each image in our training set as a "flat" vector in $\mathbb{R}^{784}$, we're ignoring any spatial patterns in each image.
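- To make the prediction rule concrete, here's a rough, unoptimized sketch of what a single 100-nearest neighbors prediction involves, assuming Euclidean distance (the default metric) and assuming np and pd are available from our setup imports. sklearn's implementation, used below, is more efficient.
# A rough sketch of a single 100-nearest neighbors prediction.
x_new = X_train.iloc[98].to_numpy()                           # The query image, as a flat vector.
dists = np.linalg.norm(X_train.to_numpy() - x_new, axis=1)    # Distances to all 60,000 training images.
nearest_labels = y_train.to_numpy()[np.argsort(dists)[:100]]  # Labels of the 100 closest training images.
pd.Series(nearest_labels).mode()[0]                           # Most common label among those neighbors.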
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier(n_neighbors=100) # Arbitrary choice; remember, there are 60,000 points in the training set.
model_knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=100)
- Note that "training" a $k$-NN classifier (i.e. calling fit) is instant, because nearest neighbor models do all of their calculation upon calling predict.
- Calling predict is much slower than calling fit – for each input to predict, the model must find the distance between the input vector and all 60,000 vectors in the training set to see which $k=100$ are the closest.
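- Just to illustrate this asymmetry, here's a rough timing sketch; exact numbers will vary by machine.
import time
start = time.perf_counter()
model_knn.predict(X_test.iloc[:500])  # Even a small batch requires scanning all 60,000 training images.
print(f'Predicting 500 test images took {time.perf_counter() - start:.2f} seconds.')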
model_knn.predict(X_train.iloc[[98]])
array([3])
# Accuracy on the test set. Computing predictions takes ~10 seconds on my computer, but the resulting accuracy is fairly high.
y_test_pred = model_knn.predict(X_test)
(y_test == y_test_pred).mean()
0.944
What kinds of errors does the model make?¶
- A 100-nearest neighbor classifier achieves 94.4% test set accuracy.
- To further understand the classifier's performance, we can draw its confusion matrix.
X_test_labeled = X_test.assign(
true=y_test,
pred=y_test_pred
)
util.show_confusion(y_test, y_test_pred, title='Confusion Matrix for 100-Nearest Neighbors Model')
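- util.show_confusion is a helper from our lecture utilities; if you just want the raw counts, sklearn computes them directly:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_pred)  # Entry (i, j) counts test images whose true digit is i but were predicted as j.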
- Some of the most common errors seem to be:
- Predicting 1 when the true digit is 2 or 7.
- Predicting 7 when the true digit is 2.
- Predicting 9 when the true digit is 4 or 7.
- Let's peek at some of those cases!
Examining misclassified images¶
- Run the cell below repeatedly to see a randomly-chosen image from the test set that we classified incorrectly.
util.show_image_and_label(X_test_labeled.query('pred != true').sample().iloc[0])
- Run the cell below repeatedly to see a randomly-chosen image from the test set that we incorrectly classified as a 1.
util.show_image_and_label(X_test_labeled.query('pred != true and pred == 1').sample().iloc[0])
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Downsides of $k$-nearest neighbors¶
- In the example below, our 100-nearest neighbor classifier predicted 1, when the true label was 2.
util.show_image_and_label(X_test_labeled.query('pred != true and pred == 1').iloc[1])
- One downside: $k$-nearest neighbors doesn't incorporate any probability in its decision-making process.
What if we could get a probability that the above image is of a 0, 1, 2, 3, ..., 9?
- Another downside: Classifying a new image takes relatively long.
What if we could learn patterns from the training set in advance, so that predicting new images is relatively quick?
Model #2: Multinomial logistic regression 📈¶
- Multinomial logistic regression, or softmax regression, predicts the probability that an image $\vec x_i \in \mathbb{R}^{784}$ belongs to each class.
- To train a multinomial logistic regression model for this task, we'll ultimately need to find 10 optimal parameter vectors, $\vec w_0^*$, $\vec w_1^*$, ..., $\vec w_9^*$.
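- Concretely, the model converts the 10 dot products $\vec w_0 \cdot \vec x_i, \vec w_1 \cdot \vec x_i, ..., \vec w_9 \cdot \vec x_i$ into probabilities using the softmax function. As a sketch (ignoring intercept terms), the predicted probability that image $\vec x_i$ is of digit $j$ is:
$$P(y_i = j \mid \vec x_i) = \frac{e^{\vec w_j \cdot \vec x_i}}{\sum_{k = 0}^{9} e^{\vec w_k \cdot \vec x_i}}$$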
Multinomial logistic regression in sklearn¶
- As we've seen before, the LogisticRegression class supports multinomial logistic regression.
from sklearn.linear_model import LogisticRegression
model_log = LogisticRegression(multi_class='multinomial', penalty='l1', solver='saga')
model_log
LogisticRegression(multi_class='multinomial', penalty='l1', solver='saga')
- Given that we have 60,000 training examples, each of which is 784-dimensional, training on the full training set takes quite a while – longer than we have time to run in lecture! Just to demonstrate, we'll fit on just the first 10,000 rows of the training set.
model_log.fit(X_train.head(10000), y_train.head(10000))
LogisticRegression(multi_class='multinomial', penalty='l1', solver='saga')
- While calling fit takes a while, calling predict is fast.
util.show_image(X_train.iloc[[98]])
model_log.predict(X_train.iloc[[98]])
array([3])
# MUCH faster than with the k-nearest neighbors model.
model_log.score(X_test, y_test)
0.9012
What kinds of errors does this model make?¶
- Our multinomial logistic regression model, trained on just the first 10,000 rows of the training set, achieves 90.12% test set accuracy.
- Let's peek at the confusion matrix once again.
X_test_labeled = X_test.assign(
true=y_test,
pred=model_log.predict(X_test)
)
util.show_confusion(y_test, model_log.predict(X_test), title='Confusion Matrix for Multinomial Logistic Regression Model')
- The most common types of errors are now different! Common errors:
- Predicting 8 when the true digit is a 2 or 5.
- Predicting 3 when the true digit is a 5 (or vice versa).
Modeling uncertainty¶
- Let's look at test set images that the multinomial logistic regression model misclassified.
util.show_image_and_label(X_test_labeled.query('pred != true and pred == 8').iloc[15])
- We can use predict_proba to see the distribution of predicted probabilities per class.
model_log.predict_proba(X_test_labeled.query('pred != true and pred == 8').iloc[[15], :-2])
array([[0. , 0. , 0.44, 0.02, 0. , 0. , 0. , 0. , 0.54, 0. ]])
util.visualize_probs(model_log.predict_proba(X_test_labeled.query('pred != true and pred == 8').iloc[[15], :-2]))
Other close calls¶
- Repeatedly run the cell below to look at the distribution of predicted probabilities for misclassified examples.
t = X_test_labeled.query('pred != true').reset_index(drop=True)
t = t.assign(second_highest_prob = pd.DataFrame(model_log.predict_proba(t.iloc[:, :-2])).apply(lambda r: r.sort_values().iloc[-2], axis=1))
p = t[t['second_highest_prob'] >= 0.3].sample().iloc[0]
util.show_image_and_label(p.iloc[:-1].astype(int)).show()
util.visualize_probs(model_log.predict_proba(p.iloc[:-3].to_frame().T))
Visualizing coefficients¶
- Since there are 10 classes, model_log has 10 parameter vectors (each in $\mathbb{R}^{784}$) – one per class.
model_log.coef_
array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]])
model_log.coef_.shape
(10, 784)
- We can visualize these coefficients, too!
Below, pixels in blue had a positive coefficient, i.e. increase the probability of class 0. Pixels in red had a negative coefficient, i.e. decrease the probability of class 0.
px.imshow(model_log.coef_[0].reshape((28, 28)), color_continuous_scale='Rdbu', title='Class 0 Coefficients')
util.plot_model_coefficients(model_log.coef_)
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Reflection¶
- We've fit two models to the MNIST training set so far:
- $k$-nearest neighbors.
- Multinomial logistic regression.
- Both models achieved a test set accuracy above 90%.
- Both models are slow in some sense:
- $k$-nearest neighbors is slow at classifying new images.
- Multinomial logistic regression is slow to train.
- These issues, in part, stem from the fact that our design matrix has $28 \cdot 28 = 784$ columns, i.e. 784 features:
X_train
| | pixel1 | pixel2 | pixel3 | pixel4 | ... | pixel781 | pixel782 | pixel783 | pixel784 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 59997 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 59998 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 59999 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
60000 rows × 784 columns
- Is there a way we could reduce the number of features we use, i.e. reduce the number of columns in the design matrix, and still achieve decent test set performance?
Principal component analysis (PCA)¶
- Principal component analysis (PCA) is an unsupervised learning technique used for dimensionality reduction.
- It'll allow us to take X_train, which has 60,000 rows and 784 columns, and transform it into X_train_approx, which has 60,000 rows and $p$ columns, where $p$ is as small as we want (e.g. $p = 2$).
It creates $p$ new features, each of which is a linear combination of all existing 784 features.
$$\text{new feature 1} = 0.05 \cdot \text{pixel 1} + 0.93 \cdot \text{pixel 2} + ... - \: 0.35 \cdot \text{pixel 784}$$
$$\text{new feature 2} = - 0.06 \cdot \text{pixel 1} + 0.5 \cdot \text{pixel 2} + ... + \: 0.04 \cdot \text{pixel 784}$$
$$...$$
These new features are chosen to capture as much variability (information) in the original data as possible.
- How? The details are out of scope for us, but it leverages the singular value decomposition from linear algebra:
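As a rough sketch, still out of scope: if $X_c$ is the design matrix with each column's mean subtracted off, and $X_c = U \Sigma V^T$ is its singular value decomposition, then the first $p$ columns of $V$ point in the directions of the principal components, and the transformed data is
$$X_\text{approx} = X_c \, V_{[:, \: :p]}$$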
PCA in sklearn¶
- sklearn has an implementation of PCA, which operates like a transformer.
Remember, PCA is an unsupervised technique! We don't use the actual digit labels for each image when computing this transformation.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X_train)
PCA(n_components=2)
- Once fit, pca can transform X_train into a 2-column matrix in a way that retains the bulk of the information:
X_train_approx = pca.transform(X_train)
X_train_approx.shape
(60000, 2)
X_train_approx
array([[ 123.93, 312.67], [1011.72, 294.86], [ -51.85, -392.17], ..., [-178.05, -160.08], [ 130.61, 5.59], [-173.44, 24.72]])
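- We can verify the "linear combination" claim from earlier: pca stores the mean it subtracts off (pca.mean_) and the weights of each combination (pca.components_), so we can reproduce X_train_approx by hand. A quick sanity check, assuming np is available from our setup imports:
# transform subtracts the training set mean, then projects onto the learned directions.
manual_approx = (X_train.to_numpy() - pca.mean_) @ pca.components_.T
np.allclose(manual_approx, X_train_approx)  # True, up to floating-point error.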
- When each data point was 784-dimensional, we couldn't visualize our training set.
But now, each data point is 2-dimensional, which we can easily visualize with a scatter plot!
Visualizing principal components¶
- The new features that PCA creates are called principal components (PCs).
Note that the principal component values no longer correspond to pixel intensities, which used to range between 0 and 255.
- Plotting PC 2 vs. PC 1 doesn't lead to a ton of insight:
util.show_2_pcs(X_train_approx, y_train)
Clusters in principal components¶
- But what if we color each point by its true class?
util.show_2_pcs(X_train_approx, y_train, color=True)
- Key idea: Even when projected onto just two principal components, the 0s tend to look alike, the 1s tend to look alike, and so on!
- This doesn't always happen when using PCA; use it as part of your exploratory data analysis toolkit.
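- As a rough sanity check on how much of the original variability just two components retain, we can look at pca.explained_variance_ratio_, which gives the fraction of the training set's total variance captured by each component:
pca.explained_variance_ratio_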
PCA as a preprocessing step¶
- We can use PCA as part of a larger modeling pipeline. We've chosen a number of principal components to use in advance, but in practice we should cross-validate (see the sketch at the end of this section).
from sklearn.pipeline import make_pipeline
model_pca_log = make_pipeline(
PCA(n_components=30),
LogisticRegression(multi_class='multinomial', penalty='l1', solver='saga')
)
model_pca_log
Pipeline(steps=[('pca', PCA(n_components=30)), ('logisticregression', LogisticRegression(multi_class='multinomial', penalty='l1', solver='saga'))])
- The transformed data is of a much lower dimension than the raw data. As a result, we can train – and predict – on the full training set very quickly!
model_pca_log.fit(X_train, y_train)
Pipeline(steps=[('pca', PCA(n_components=30)), ('logisticregression', LogisticRegression(multi_class='multinomial', penalty='l1', solver='saga'))])
- And the test set accuracy is still good!
model_pca_log.score(X_test, y_test)
0.8869
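- Above, we fixed n_components=30 in advance. If we wanted to choose it by cross-validating, as suggested earlier, here's one possible sketch; the candidate values below are arbitrary, and actually running the search takes a while, so the fit is commented out.
from sklearn.model_selection import GridSearchCV
searcher = GridSearchCV(
    model_pca_log,
    param_grid={'pca__n_components': [10, 30, 50, 100]},  # Arbitrary candidate values.
    cv=3,
)
# searcher.fit(X_train, y_train)  # Slow: refits the pipeline once per candidate value per fold.
# searcher.best_params_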
Parting thoughts 👋¶
You've accomplished a lot!¶
- You learned a lot this semester – you're now among the most qualified data scientists in the world!
- You're now able to start with raw data and come up with accurate, meaningful insights that you can share with others.
- You know how to use industry-standard data manipulation tools, and you understand the inner workings of complicated machine learning models.
- You're well-prepared for internships and data science interviews, ready to create your own portfolio of personal projects, and have the background and maturity to succeed in more advanced data science-adjacent courses.
- Data science is a relatively new, rapidly evolving field, so you'll need to keep evolving with it.
- The tools of the trade may change, but the core principles won't – fortunately, you now have a strong foundation on which you can develop new skills.
Future of Practical Data Science¶
- I've had a blast teaching this class this past year (and will be teaching it again this spring, starting on May 6th).
- Next year, I'll be focusing on my new EECS 298: Mathematics for Machine Learning course, and won't be teaching Practical Data Science.
- I'm hoping to offer Practical Data Science under a permanent course number the following year.

My freshman year transcript.
Thank you, and keep in touch!¶
- Thank you for signing up for this brand-new class!
- The course would not have been possible without our IAs and readers: Abhinav Attaluri, Hana Ahmed, Caleb Hyun, Angela Li, Pranavi Pratapa, Akanksha Rai, Aditya Ashok, In Lorthongpanich, and Jingrui Zhuang.
- Don't be a stranger – our contact information is at practicaldsc.org/staff. We want to hear about what you do after this class.
This semester's course website will remain online permanently at practicaldsc.org/wn25, so you can refer back to the content.