from lec_utils import *
import lec21_util as util
np.random.seed(23) # For reproducibility.
def sample_from_pop(n=100):
x = np.linspace(-2, 3, n)
y = x ** 3 + (np.random.normal(0, 3, size=n))
return pd.DataFrame({'x': x, 'y': y})
sample_1 = sample_from_pop()
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import warnings
warnings.simplefilter('ignore')
from IPython.display import Markdown
Announcements 📣¶
- The Portfolio Homework has been released! Read all about it here. It has two due dates:
- A checkpoint (worth 15 points / 100) is due on Monday, November 25th (no slip days!).
- The full homework is due on Saturday, December 7th (no slip days!).
- Homework 10 will be out later this week.
- The Grade Report now includes scores and slip days through Homework 8.
- Please help spread the word about this class by submitting an anonymous testimony here 🙏.
We'll share some of the responses we get on this form at practicaldsc.org/next, and in advertisement emails/posts we share with other students.
- Interested in research opportunities in data science? See #302 on Ed.
Agenda¶
- Recap: Ridge Regression.
- LASSO.
- Gradient descent.
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Recap: Ridge Regression¶
Motivation: Polynomial regression¶
- Last class, we fit a degree 25 polynomial model to Sample 1.
X_train, X_test, y_train, y_test = train_test_split(sample_1[['x']], sample_1['y'], random_state=23)
px.scatter(x=X_train['x'], y=y_train, title="Sample 1's Training Data")
# include_bias=False makes sure that PolynomialFeatures
# doesn't create a new column of all 1s in the design matrix, since
# LinearRegression already makes one.
model = make_pipeline(PolynomialFeatures(25, include_bias=False), LinearRegression())
model.fit(X_train, y_train)
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=25, include_bias=False)), ('linearregression', LinearRegression())])
- This degree 25 polynomial is clearly overfit to the training data.
fig = px.scatter(x=X_train['x'], y=y_train, title="Sample 1's Training Data")
fig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4),
name='Fit Polynomial of Degree 25'
))
Inspecting the fit degree 25 polynomial¶
- What are the resulting coefficients of the fit polynomial?
pd.Series(model.named_steps['linearregression'].coef_, index=range(1, 26))
1 1.32e+01 2 -3.43e+00 3 -1.85e+02 ... 23 6.85e-02 24 -1.95e-02 25 1.47e-03 Length: 25, dtype: float64
- What does the resulting polynomial actually look like, as an equation?
# These coefficients are rounded to two decimal places.
# The coefficient on x^25 is not 0.00, but is something very small.
util.display_features(model.named_steps['linearregression'])
- sklearn assigned really large values to many features.
This means that if $x$ changes a little, the output is going to change a lot. It seems like some of the terms are trying to "cancel" each other out – some have large negative coefficients, some have large positive coefficients.
- Intuition: In general, the bigger the optimal parameters $w_0^*, w_1^*, ..., w_d^*$ are, the more overfit the model is to the training data.
Ridge regression¶
- Idea: In addition to just minimizing mean squared error, what if we could also try and prevent large parameter values?
Maybe this would lead to less overfitting!
- Minimizing mean squared error, with $L_2$ regularization, is called ridge regression. The objective function for ridge regression is:
$$R_\text{ridge}(\vec{w}) = \frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2 + \lambda \sum_{j = 1}^d w_j^2$$
- Intuition: Instead of just minimizing mean squared error, we balance minimizing mean squared error and a penalty on the size of the fit coefficients, $w_1^*$, $w_2^*$, ..., $w_d^*$.
We don't regularize the intercept term!
- $\lambda$ is a hyperparameter, which we choose through cross-validation.
- The $\vec{w}_\text{ridge}^*$ that minimizes $R_\text{ridge}(\vec{w})$ is not necessarily the same as $\vec{w}_\text{OLS}^*$, which minimizes $R_\text{sq}(\vec{w})$!
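- To make the objective concrete, here's a minimal NumPy sketch of $R_\text{ridge}(\vec{w})$. The name ridge_objective is ours (not something from sklearn), and it assumes X has no intercept column, since we don't regularize the intercept.
# A minimal sketch of the ridge regression objective, assuming X is an n x d
# design matrix with no intercept column, y has length n, and w has length d.
def ridge_objective(w, X, y, lam):
    mse = np.mean((y - X @ w) ** 2)    # Mean squared error term.
    penalty = lam * np.sum(w ** 2)     # L2 penalty on the coefficients.
    return mse + penalty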
Another interpretation of ridge regression¶
- As $\lambda$ increases, the penalty on the size of $\vec{w}_\text{ridge}^*$ increases, meaning that each $w_j^*$ inches closer to 0.
An equivalent way of formulating the ridge regression objective function,
$$\text{minimize} \:\:\:\: \frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2 + \lambda \sum_{j = 1}^d w_j^2$$
is as a constrained optimization problem:
$$\text{minimize} \:\:\:\:\frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2 \text{ such that } \sum_{j = 1}^d w_j^2 \leq Q$$
- $Q$ and $\lambda$ are inversely related: the larger $Q$ is, the less of a penalty we're putting on size of $\vec{w}_\text{ridge}^*$, so the smaller $\lambda$ is.
The exact relationship between $Q$ and $\lambda$ is outside of the scope of this course, as is the proof of this fact. Take IOE 310!
Ridge regression, visualized¶
Intuitively:
- The loss surface for just the mean squared error component is in blue.
- The constraint, $\sum_{j = 1}^d w_j^2 \leq Q$, is in green.
The larger $Q$ is, the larger the radius of the ball is.
- The bigger $Q$ is – so, the smaller $\lambda$ is – the larger the green circle is!
- The bigger $Q$ is, the larger the range of possible values for $\vec{w}_\text{ridge}^*$ is, and the closer $\vec{w}_\text{ridge}^*$ gets to $\vec{w}_\text{OLS}^*$.
Finding $\vec{w}_\text{ridge}^*$¶
- We know that the $\vec{w}_\text{OLS}^*$ that minimizes mean squared error,
$$R_\text{sq}(\vec{w}) = \frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2$$
is the one that satisfies the normal equations, $X^TX \vec{w} = X^T \vec{y}$.
Recall, linear regression that minimizes mean squared error, without any other constraints, is called ordinary least squares (OLS).
- Sometimes, $\vec{w}^*_\text{OLS}$ is unique, and sometimes there are infinitely many possible $\vec{w}^*_\text{OLS}$.
There are infinitely many possible $\vec{w}^*_\text{OLS}$ when the design matrix, $X$, is not full rank! All of these infinitely many solutions minimize mean squared error.
- Which vector $\vec{w}_\text{ridge}^*$ minimizes the ridge regression objective function?
- It turns out there is always a unique solution for $\vec{w}_\text{ridge}^*$, even if $X$ is not full rank. It is:
$$\vec{w}_\text{ridge}^* = (X^TX + n \lambda I)^{-1} X^T \vec{y}$$
The proof is outside of the scope of the class, and requires vector calculus.
- Since there is always a unique solution, ridge regression is often used in the presence of multicollinearity!
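- As a sanity check, here's a sketch of this closed-form solution, compared against sklearn's Ridge. Since sklearn's Ridge minimizes $\lVert \vec{y} - X \vec{w} \rVert^2 + \alpha \lVert \vec{w} \rVert^2$, we'd expect alpha = n * lam with fit_intercept=False to line up with the formula above; the data and variable names below are made up for illustration.
# A sketch of the closed-form ridge solution, assuming X (n x d) and y (length n)
# are NumPy arrays and X has no intercept column.
from sklearn.linear_model import Ridge

def ridge_closed_form(X, y, lam):
    n, d = X.shape
    # Solve (X^T X + n*lam*I) w = X^T y rather than explicitly inverting the matrix.
    return np.linalg.solve(X.T @ X + n * lam * np.eye(d), X.T @ y)

rng = np.random.default_rng(23)
X_demo, y_demo = rng.normal(size=(100, 3)), rng.normal(size=100)
lam = 0.5
w_formula = ridge_closed_form(X_demo, y_demo, lam)
w_sklearn = Ridge(alpha=len(y_demo) * lam, fit_intercept=False).fit(X_demo, y_demo).coef_
np.allclose(w_formula, w_sklearn)  # Should be True, up to numerical precision.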
Taking a step back¶
- $\vec{w}_\text{ridge}^*$ doesn't minimize mean squared error – it minimizes a slightly different objective function.
- So, why would we ever use ridge regression?
Ridge regression in sklearn¶
- Fortunately, sklearn can perform ridge regression for us.
from sklearn.linear_model import Ridge
- Just to experiment, let's set $\lambda$ to something extremely large and look at the resulting predictions.
# The name of the lambda hyperparameter in sklearn is alpha.
model_large_lambda = make_pipeline(PolynomialFeatures(25, include_bias=False),
Ridge(alpha=1000000000000000000000000000))
model_large_lambda.fit(X_train, y_train)
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=25, include_bias=False)), ('ridge', Ridge(alpha=1000000000000000000000000000))])
Visualizing the extremely regularized model¶
- What do the resulting predictions look like?
fig = px.scatter(x=X_train['x'], y=y_train, title="Sample 1's Training Data")
fig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model_large_lambda.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4, color='purple'),
name='Extremely Regularized Polynomial of Degree 25'
))
fig.update_layout(width=1000, height=800)
- What do you notice?
model_large_lambda.named_steps['ridge'].intercept_
3.6606542664388497
# All 0!
model_large_lambda.named_steps['ridge'].coef_
array([0., 0., 0., ..., 0., 0., 0.])
y_train.mean()
3.662415381952351
Using GridSearchCV to choose $\lambda$¶
- In general, we won't just arbitrarily choose a value of $\lambda$.
- Instead, we'll perform $k$-fold cross-validation to choose the $\lambda$ that leads to predictions that work best on unseen test data.
hyperparams = {
'ridge__alpha': 10.0 ** np.arange(-2, 15) # Try 0.01, 0.1, 1, 10, 100, 1000, ...
}
model_regularized = GridSearchCV(
estimator=make_pipeline(PolynomialFeatures(25, include_bias=False), Ridge()),
param_grid=hyperparams,
scoring='neg_mean_squared_error'
)
model_regularized.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=25, include_bias=False)), ('ridge', Ridge())]), param_grid={'ridge__alpha': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10, 1.e+11, 1.e+12, 1.e+13, 1.e+14])}, scoring='neg_mean_squared_error')
- Let's check the optimal $\lambda$ it found!
model_regularized.best_params_
{'ridge__alpha': 1000000000.0}
Visualizing the regularized degree 25 model¶
- What do the resulting predictions look like?
fig = px.scatter(x=X_train['x'], y=y_train, title="Sample 1's Training Data")
fig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4),
name='Unregularized Polynomial of Degree 25'
))
fig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model_regularized.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4, color='green'),
name='Regularized Polynomial of Degree 25'
))
fig.update_layout(width=1000, height=800)
- It seems that the regularized polynomial is less overfit to the specific noise in the training data than the unregularized polynomial!
- The largest coefficients are all much smaller now, too.
The coefficient on $x^{20}$ is 0.000136.
util.display_features(model_regularized.best_estimator_.named_steps['ridge'], precision=8)
- Note that none of them are exactly 0, but many of them are close!
This will be important later.
Tuning multiple hyperparameters at once¶
- What if we don't want to fix a polynomial degree in advance, and instead want to choose the degree using cross-validation, while also using ridge regression?
- No problem!
Note that the next cell takes much longer than the previous call to fit took, since it needs to try every combination of $\alpha$ and polynomial degree.
hyperparams = {
'ridge__alpha': 10.0 ** np.arange(-2, 15),
'polynomialfeatures__degree': range(1, 26)
}
model_regularized_degree = GridSearchCV(
estimator=make_pipeline(PolynomialFeatures(include_bias=False), Ridge()),
param_grid=hyperparams,
scoring='neg_mean_squared_error'
)
model_regularized_degree.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(include_bias=False)), ('ridge', Ridge())]), param_grid={'polynomialfeatures__degree': range(1, 26), 'ridge__alpha': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10, 1.e+11, 1.e+12, 1.e+13, 1.e+14])}, scoring='neg_mean_squared_error')
- Now, let's check the optimal $\lambda$ and polynomial degree it found!
model_regularized_degree.best_params_
{'polynomialfeatures__degree': 3, 'ridge__alpha': 100.0}
Visualizing the regularized degree 3 model¶
- What do the resulting predictions look like?
fig = px.scatter(x=X_train['x'], y=y_train, title="Sample 1's Training Data")
fig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4),
name='Unregularized Polynomial of Degree 25'
))
fig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model_regularized.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4, color='green'),
name='Regularized Polynomial of Degree 25'
))
fig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model_regularized_degree.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4, color='skyblue'),
name='Regularized Polynomial of Degree 3'
))
polyfig = fig.update_layout(width=1000, height=800)
polyfig
util.display_features(model_regularized_degree.best_estimator_.named_steps['ridge'])
Run the cell below to set up the next slide.
from sklearn.metrics import mean_squared_error
unregularized_train = mean_squared_error(y_train, model.predict(X_train))
unregularized_test = mean_squared_error(y_test, model.predict(X_test))
regularized_lambda_train = mean_squared_error(y_train, model_regularized.predict(X_train))
regularized_lambda_validation = (-model_regularized.cv_results_['mean_test_score']).min()
regularized_lambda_test = mean_squared_error(y_test, model_regularized.predict(X_test))
regularized_lambda_degree_train = mean_squared_error(y_train, model_regularized_degree.predict(X_train))
regularized_lambda_degree_validation = (-model_regularized_degree.cv_results_['mean_test_score']).min()
regularized_lambda_degree_test = mean_squared_error(y_test, model_regularized_degree.predict(X_test))
results_df = pd.DataFrame(index=['training MSE', 'average validation MSE (across all folds)', 'test MSE']).assign(
unregularized=[unregularized_train, np.nan, unregularized_test],
regularized_lambda_only=[regularized_lambda_train, regularized_lambda_validation, regularized_lambda_test],
regularized_lambda_and_degree=[regularized_lambda_degree_train, regularized_lambda_degree_validation, regularized_lambda_degree_test]
)
reprs = {'unregularized': '<b><span style="color:#ff7f0f">Unregularized (Degree 25)</span></b>',
'regularized_lambda_only': '<b><span style="color:green">Regularized (Degree 25)<br><small>Used cross-validation to choose $\lambda$</span></b>',
'regularized_lambda_and_degree': '<b><span style="color:skyblue">Regularized (Degree 3)<br><small>Used cross-validation to choose $\lambda$ and degree</small></span></b>'}
results_df_str = results_df.to_html()
for rep in reprs:
results_df_str = results_df_str.replace(rep, reprs[rep])
Comparing training, validation, and test errors¶
- Let's compare the training and testing error of the three polynomials below.
polyfig
display(HTML(results_df_str))
| | Unregularized (Degree 25) | Regularized (Degree 25)<br>Used cross-validation to choose $\lambda$ | Regularized (Degree 3)<br>Used cross-validation to choose $\lambda$ and degree |
|---|---|---|---|
| training MSE | 4.72 | 10.33 | 7.11 |
| average validation MSE (across all folds) | NaN | 17.60 | 7.40 |
| test MSE | 14.21 | 17.17 | 10.52 |
- It seems that the regularized polynomial, in which we used cross-validation to choose both the regularization penalty $\lambda$ and degree, generalizes best to unseen data!
What's next?¶
- Could we have chosen a different method of penalizing each $w_j$ other than $w_j^2$?
We're about to see another option!
- Ridge regression's objective function happened to have a closed-form solution.
What if we want to minimize a function that can't be minimized by hand?
We'll talk about how towards the end of lecture!
- Why is it called ridge regression?
See Homework 10!
Question 🤔 (Answer at practicaldsc.org/q)
What questions do you have about ridge regression?
LASSO¶
Penalizing large parameters¶
- The ridge regression objective function, $$R_\text{ridge}(\vec{w}) = \frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2 + \lambda \sum_{j = 1}^d w_j^2$$ minimizes mean squared error, plus a squared penalty on the size of the fit coefficients, $w_1^*, w_2^*, ..., w_d^*$.
- We called the regularization strategy above "$L_2$ regularization." Could we have regularized another way?
The LASSO objective function uses $L_1$ regularization, which penalizes the absolute value of each coefficient:
$$R_\text{LASSO}(\vec{w}) = \frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2 + \lambda \sum_{j = 1}^d |w_j| $$
- LASSO stands for "least absolute shrinkage and selection operator."
We'll make sense of this name shortly.
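- Analogous to ridge, here's a minimal NumPy sketch of the LASSO objective – only the penalty term changes. Again, lasso_objective is just an illustrative name, and X is assumed to have no intercept column.
# A minimal sketch of the LASSO objective, assuming X is an n x d design matrix
# with no intercept column, y has length n, and w has length d.
def lasso_objective(w, X, y, lam):
    mse = np.mean((y - X @ w) ** 2)       # Mean squared error term.
    penalty = lam * np.sum(np.abs(w))     # L1 penalty on the coefficients.
    return mse + penalty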
LASSO in sklearn¶
- Unlike with ridge regression or ordinary least squares, there is no general closed-form solution for $\vec{w}_\text{LASSO}^*$.
- But, it can be estimated using numerical methods, which sklearn uses under the hood. Let's test it out.
More on numerical methods soon!
from sklearn.linear_model import Lasso
- Let's use LASSO to fit a degree 25 polynomial to Sample 1.
Here, we'll fix the degree, and cross-validate to find $\lambda$.
hyperparams = {
'lasso__alpha': 10.0 ** np.arange(-2, 15)
}
model_regularized_lasso = GridSearchCV(
estimator=make_pipeline(PolynomialFeatures(25, include_bias=False), Lasso()),
param_grid=hyperparams,
scoring='neg_mean_squared_error'
)
model_regularized_lasso.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=25, include_bias=False)), ('lasso', Lasso())]), param_grid={'lasso__alpha': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10, 1.e+11, 1.e+12, 1.e+13, 1.e+14])}, scoring='neg_mean_squared_error')
- Our cross-validation routine ends up choosing $\lambda = 0.1$, though on its own, this doesn't really tell us anything.
model_regularized_lasso.best_params_
{'lasso__alpha': 0.1}
Visualizing the regularized degree 25 model, fit with LASSO¶
- What do the resulting predictions look like, relative to the fit polynomials from earlier in the lecture?
polyfig.add_trace(go.Scatter(
x=X_train['x'].sort_values(),
y=model_regularized_lasso.predict(X_train.sort_values('x')),
mode='lines',
line=dict(width=4, color='red'),
name='Regularized Polynomial of Degree 25, using LASSO'
))
- What do you notice about the coefficients of the polynomial themselves?
util.display_features(model_regularized_lasso.best_estimator_.named_steps['lasso'], precision=8)
- Important: Note that we fit a degree 25 polynomial, but many of the higher-order terms are missing, since their coefficients ended up being 0!
There are no $x^{18}, x^{19}, x^{20}, ..., x^{25}$ terms above, and also no $x$ term.
- The resulting polynomial ends up being of degree 17.
- Run the cell below to set up the calculations.
regularized_lasso_train = mean_squared_error(y_train, model_regularized_lasso.predict(X_train))
regularized_lasso_validation = (-model_regularized_lasso.cv_results_['mean_test_score']).min()
regularized_lasso_test = mean_squared_error(y_test, model_regularized_lasso.predict(X_test))
results_df['regularized_lasso'] = [regularized_lasso_train, regularized_lasso_validation, regularized_lasso_test]
results_df_str = results_df.to_html()
reprs['regularized_lasso'] = '<b><span style="color:red">Regularized using LASSO (Degree 25)<br><small>Used cross-validation to choose $\lambda$</small></span></b>'
for rep in reprs:
results_df_str = results_df_str.replace(rep, reprs[rep])
- How does our LASSO-regularized polynomial compare, in terms of errors, to our earlier polynomials?
display(HTML(results_df_str))
| | Unregularized (Degree 25) | Regularized (Degree 25)<br>Used cross-validation to choose $\lambda$ | Regularized (Degree 3)<br>Used cross-validation to choose $\lambda$ and degree | Regularized using LASSO (Degree 25)<br>Used cross-validation to choose $\lambda$ |
|---|---|---|---|---|
| training MSE | 4.72 | 10.33 | 7.11 | 6.34 |
| average validation MSE (across all folds) | NaN | 17.60 | 7.40 | 7.89 |
| test MSE | 14.21 | 17.17 | 10.52 | 12.28 |
When using LASSO, many coefficients are set to 0!¶
- When using $L_1$ regularization – that is, when performing LASSO – many of the optimal coefficients $w_1^*, w_2^*, ..., w_d^*$ end up being exactly 0.
- This was not the case in ridge regression – there, the optimal coefficients were all very small, but none were exactly 0.
display(Markdown('#### Fit using Ridge:'))
util.display_features(model_regularized.best_estimator_.named_steps['ridge'], precision=8)
Fit using Ridge:¶
- If a feature has a coefficient of 0, it means it's not being used at all in making predictions.
display(Markdown('#### Fit using LASSO (notice the larger coefficient on $x^3$):'))
util.display_features(model_regularized_lasso.best_estimator_.named_steps['lasso'], precision=8)
Fit using LASSO (notice the larger coefficient on $x^3$):¶
- LASSO implicitly performs feature selection for us – it automatically tells us which features we don't need to use.
Here, it told us "don't use $x$, don't use $x^{18}$, don't use $x^{19}$, ..., don't use $x^{25}$, and instead weigh the $x^2$ and $x^3$ terms more."
- This is where the name "least absolute shrinkage and selection operator" comes from.
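- One way to see this feature selection programmatically: pull out the features whose LASSO coefficients are nonzero. The sketch below assumes the model_regularized_lasso fit from above, and that your sklearn version supports get_feature_names_out.
# A sketch: which features did LASSO keep? Uses model_regularized_lasso from above.
best_lasso_pipe = model_regularized_lasso.best_estimator_
feature_names = best_lasso_pipe.named_steps['polynomialfeatures'].get_feature_names_out()
lasso_coefs = pd.Series(best_lasso_pipe.named_steps['lasso'].coef_, index=feature_names)
lasso_coefs[lasso_coefs != 0]  # The features LASSO "selected", i.e. those with nonzero coefficients.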
Why does LASSO encourage sparsity?¶
- The fact that many of the optimal coefficients – $w_1^*, w_2^*, ..., w_d^*$ – are 0 when performing LASSO is often stated as: LASSO induces sparsity – that is, it produces solutions in which many coefficients are exactly 0.
To make sense of this, let's look at the equivalent formulation of LASSO as a constrained optimization problem.
$$\text{minimize} \:\:\:\: \frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2 + \lambda \sum_{j = 1}^d | w_j |$$
is equivalent to:
$$\text{minimize} \:\:\:\:\frac{1}{n} \lVert \vec{y} - X \vec{w} \rVert^2 \text{ such that } \sum_{j = 1}^d | w_j | \leq Q$$
- Again, $Q$ and $\lambda$ are inversely related: the larger $Q$ is, the less of a penalty we're putting on size of $\vec{w}_\text{LASSO}^*$, so the smaller $\lambda$ is.
LASSO, visualized¶
- Again:
- The loss surface for just the mean squared error component is in blue.
- The constraint, $\sum_{j = 1}^d |w_j| \leq Q$, is in green.
The larger $Q$ is, the larger the side length of the diamond is.
- Notice that the constraint set has clearly defined "corners," which lie on the axes. The axes are where the parameter values, $w_1$ and $w_2$ here, are 0.
- Due to the shape of the constraint set, it's likely that the minimum value of mean squared error, among all options in the green diamond, will occur at a corner, where some of the parameter values are 0.
Question 🤔 (Answer at practicaldsc.org/q)
What questions do you have about LASSO, or regularization in general?
Example: Commute times¶
Another example: Commute times¶
- Last class, before we learned about regularization, we used $k$-fold cross-validation to choose between the following five models that predict commute time in 'minutes'.
- The most complicated model, labeled departure_hour with poly features + day OHE + month OHE + week, didn't generalize well to unseen data, relative to simpler models.
At least, not when we used ordinary least squares to train it.
- Let's use ordinary least squares, ridge regression, and LASSO to train commute time models, and compare the results.
df = pd.read_csv('data/commute-times.csv')
df['day_of_month'] = pd.to_datetime(df['date']).dt.day
df['month'] = pd.to_datetime(df['date']).dt.month_name()
df.head()
| | date | day | home_departure_time | home_departure_mileage | ... | work_departure_time_hr | mileage_to_home | day_of_month | month |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 5/15/2023 | Mon | 2023-05-15 10:49:00 | 15873.0 | ... | 17.17 | 53.0 | 15 | May |
| 1 | 5/16/2023 | Tue | 2023-05-16 07:45:00 | 15979.0 | ... | NaN | NaN | 16 | May |
| 2 | 5/22/2023 | Mon | 2023-05-22 08:27:00 | 50407.0 | ... | 15.90 | 54.0 | 22 | May |
| 3 | 5/23/2023 | Tue | 2023-05-23 07:08:00 | 50535.0 | ... | NaN | NaN | 23 | May |
| 4 | 5/30/2023 | Tue | 2023-05-30 09:09:00 | 50664.0 | ... | 17.12 | 54.0 | 30 | May |

5 rows × 20 columns
X_train, X_test, y_train, y_test = train_test_split(df.drop('minutes', axis=1), df['minutes'], random_state=23)
Ordinary least squares for commute times¶
- Since we'll do the feature transformations repeatedly, we'll save them as a single pipeline.
week_converter = FunctionTransformer(lambda s: 'Week ' + ((s - 1) // 7 + 1).astype(str),
feature_names_out='one-to-one')
day_of_month_transformer = make_pipeline(week_converter, OneHotEncoder(drop='first'))
# Note the include_bias=False once again!
commute_feature_pipe = make_pipeline(
make_column_transformer(
(PolynomialFeatures(3, include_bias=False), ['departure_hour']),
(OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']),
(day_of_month_transformer, ['day_of_month']),
)
)
- First, we'll fit a "vanilla" linear regression model, i.e. one that just minimizes mean squared error, with no regularization.
commute_model_ols = make_pipeline(commute_feature_pipe, LinearRegression())
commute_model_ols
Pipeline(steps=[('pipeline', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False), ['departure_hour']), ('onehotencoder', OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']), ('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(feature_names_out='one-to-one', func=<function <lambda> at 0x15ba2dea0>)), ('onehotencoder', OneHotEncoder(drop='first'))]), ['day_of_month'])]))])), ('linearregression', LinearRegression())])
- There are no hyperparameters to grid search for here, so we'll just fit the model directly.
commute_model_ols.fit(X_train, y_train)
Pipeline(steps=[('pipeline', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False), ['departure_hour']), ('onehotencoder', OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']), ('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(feature_names_out='one-to-one', func=<function <lambda> at 0x15ba2dea0>)), ('onehotencoder', OneHotEncoder(drop='first'))]), ['day_of_month'])]))])), ('linearregression', LinearRegression())])
- We'll keep commute_model_ols aside for now, and compare its performance to the fit regularized models in a few moments.
Ridge regression for commute times¶
- Again, let's instantiate a Pipeline for the steps we want to execute.
commute_pipe_ridge = make_pipeline(commute_feature_pipe, Ridge())
commute_pipe_ridge
Pipeline(steps=[('pipeline', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False), ['departure_hour']), ('onehotencoder', OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']), ('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(feature_names_out='one-to-one', func=<function <lambda> at 0x15ba2dea0>)), ('onehotencoder', OneHotEncoder(drop='first'))]), ['day_of_month'])]))])), ('ridge', Ridge())])
- Now, since we need to choose the regularization penalty, $\lambda$, we'll fit a GridSearchCV instance with a hyperparameter grid.
lambdas = 10.0 ** np.arange(-10, 15)
hyperparams = {
'ridge__alpha': lambdas
}
commute_model_ridge = GridSearchCV(
commute_pipe_ridge,
param_grid = hyperparams,
scoring='neg_mean_squared_error',
cv=10
)
commute_model_ridge.fit(X_train, y_train)
GridSearchCV(cv=10, estimator=Pipeline(steps=[('pipeline', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False), ['departure_hour']), ('onehotencoder', OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']), ('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(feature_names_out='one-to-one', func=<function <lambda> at 0x15ba2dea0>)), ('onehotencoder', OneHotEncoder(drop='first'))]), ['day_of_month'])]))])), ('ridge', Ridge())]), param_grid={'ridge__alpha': array([1.e-10, 1.e-09, 1.e-08, ..., 1.e+12, 1.e+13, 1.e+14])}, scoring='neg_mean_squared_error')
- Which $\lambda$ did it choose?
On its own, this value of $\lambda$ doesn't really tell us anything.
commute_model_ridge.best_params_
{'ridge__alpha': 1.0}
Aside: average validation error vs. $\lambda$¶
- How did the average validation MSE change with $\lambda$?
Here, large values of $\lambda$ mean less complex models, not more complex.
(
pd.Series(-commute_model_ridge.cv_results_['mean_test_score'],
index=np.log10(lambdas))
.to_frame()
.reset_index()
.plot(kind='line', x='index', y=0)
.update_layout(xaxis_title='$\log(\lambda)$', yaxis_title='Average Validation MSE')
)
LASSO for commute times¶
- Let's instantiate a third Pipeline for the steps we want to execute.
commute_pipe_lasso = make_pipeline(commute_feature_pipe, Lasso())
commute_pipe_lasso
Pipeline(steps=[('pipeline', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False), ['departure_hour']), ('onehotencoder', OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']), ('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(feature_names_out='one-to-one', func=<function <lambda> at 0x15ba2dea0>)), ('onehotencoder', OneHotEncoder(drop='first'))]), ['day_of_month'])]))])), ('lasso', Lasso())])
- Now, since we need to choose the regularization penalty, $\lambda$, we'll fit a GridSearchCV instance with a hyperparameter grid.
lambdas = 10.0 ** np.arange(-10, 15)
hyperparams = {
'lasso__alpha': lambdas
}
commute_model_lasso = GridSearchCV(
commute_pipe_lasso,
param_grid = hyperparams,
scoring='neg_mean_squared_error',
cv=10
)
commute_model_lasso.fit(X_train, y_train)
GridSearchCV(cv=10, estimator=Pipeline(steps=[('pipeline', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False), ['departure_hour']), ('onehotencoder', OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']), ('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(feature_names_out='one-to-one', func=<function <lambda> at 0x15ba2dea0>)), ('onehotencoder', OneHotEncoder(drop='first'))]), ['day_of_month'])]))])), ('lasso', Lasso())]), param_grid={'lasso__alpha': array([1.e-10, 1.e-09, 1.e-08, ..., 1.e+12, 1.e+13, 1.e+14])}, scoring='neg_mean_squared_error')
- Which $\lambda$ did it choose?
On its own, this value of $\lambda$ doesn't really tell us anything.
commute_model_lasso.best_params_
{'lasso__alpha': 0.1}
Run the cell below to set up the next slide.
commute_results = pd.concat([
util.display_commute_coefs(commute_model_ols),
util.display_commute_coefs(commute_model_ridge.best_estimator_),
util.display_commute_coefs(commute_model_lasso.best_estimator_)
], axis=1)
commute_results.columns = ['ols', 'ridge', 'lasso']
Comparing coefficients across models¶
- What do the resulting coefficients look like in all three models?
display_df(commute_results, rows=22)
| feature | ols | ridge | lasso |
|---|---|---|---|
| intercept | 460.31 | 214.15 | 2.54e+02 |
| polynomialfeatures__departure_hour | -94.79 | -0.71 | -2.10e+01 |
| polynomialfeatures__departure_hour^2 | 6.80 | -4.63 | -1.70e+00 |
| polynomialfeatures__departure_hour^3 | -0.14 | 0.31 | 1.81e-01 |
| onehotencoder__day_Mon | -0.61 | -5.74 | -2.70e+00 |
| onehotencoder__day_Thu | 13.30 | 6.04 | 9.00e+00 |
| onehotencoder__day_Tue | 11.19 | 5.52 | 8.68e+00 |
| onehotencoder__day_Wed | 5.73 | -0.46 | 0.00e+00 |
| onehotencoder__month_December | 8.90 | 2.82 | 4.06e+00 |
| onehotencoder__month_February | -5.33 | -7.14 | -5.81e+00 |
| onehotencoder__month_January | 1.93 | 0.39 | 0.00e+00 |
| onehotencoder__month_July | 2.46 | 0.44 | 0.00e+00 |
| onehotencoder__month_June | 6.28 | 4.45 | 5.14e+00 |
| onehotencoder__month_March | -0.76 | -1.70 | -8.17e-01 |
| onehotencoder__month_May | 9.36 | 4.95 | 5.57e+00 |
| onehotencoder__month_November | 1.40 | -1.81 | -0.00e+00 |
| onehotencoder__month_October | 2.06 | 0.22 | 0.00e+00 |
| onehotencoder__month_September | -3.20 | 0.05 | -0.00e+00 |
| pipeline__day_of_month_Week 2 | 0.91 | 1.39 | 3.23e-01 |
| pipeline__day_of_month_Week 3 | 6.30 | 4.70 | 4.57e+00 |
| pipeline__day_of_month_Week 4 | 0.28 | -0.20 | -0.00e+00 |
| pipeline__day_of_month_Week 5 | 2.09 | 0.76 | 4.78e-03 |
- The coefficients in the OLS model tend to be the largest in magnitude.
- In the ridge model, the coefficients are all generally small, but none are 0.
- In the LASSO model, many coefficients are 0 exactly.
Comparing training and test errors across models¶
model_dict = {'ols': commute_model_ols, 'ridge': commute_model_ridge, 'lasso': commute_model_lasso}
for model in model_dict:
display(Markdown(f'#### {model.upper()} model for commute times'))
display(Markdown(f'Training error: {mean_squared_error(y_train, model_dict[model].predict(X_train))}<br>Test error: {mean_squared_error(y_test, model_dict[model].predict(X_test))}'))
- The best-fitting LASSO model seems to have a lower training and testing MSE than the best-fitting ridge model.
- But, in general, sometimes LASSO performs better on unseen data, and sometimes ridge does. Cross-validate!
Sometimes, machine learning practitioners say "there's no free lunch" – there's no universal, always-best technique for making predictions; it always depends on the specific data you have.
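- To follow our own advice, here's a sketch that cross-validates the two tuned pipelines on the training set, rather than relying on a single test-set comparison. It assumes the fitted commute_model_ridge and commute_model_lasso objects from above.
# A sketch: comparing the tuned ridge and LASSO pipelines via 5-fold cross-validation.
from sklearn.model_selection import cross_val_score
for name, grid in [('ridge', commute_model_ridge), ('lasso', commute_model_lasso)]:
    mses = -cross_val_score(grid.best_estimator_, X_train, y_train,
                            cv=5, scoring='neg_mean_squared_error')
    print(f'{name}: average validation MSE = {mses.mean():.2f}')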
Standardize when regularizing!¶
- As we discussed a few lectures ago, by standardizing our features, we bring them all to the same scale.
- Standardizing features in ordinary least squares doesn't change our model's performance; rather, it impacts the interpretability of the coefficients.
- But, when regularizing, we're penalizing the sizes of the coefficients, which can be on wildly different scales if the features are on different scales.
- So, when regularizing a linear model, you should standardize the features first, so the coefficients for all features are on the same scale, and are penalized equally.
# In other words, commute_feature_pipe should've been this!
make_pipeline(commute_feature_pipe, StandardScaler())
Pipeline(steps=[('pipeline', Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('polynomialfeatures', PolynomialFeatures(degree=3, include_bias=False), ['departure_hour']), ('onehotencoder', OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']), ('pipeline', Pipeline(steps=[('functiontransformer', FunctionTransformer(feature_names_out='one-to-one', func=<function <lambda> at 0x15ba2dea0>)), ('onehotencoder', OneHotEncoder(drop='first'))]), ['day_of_month'])]))])), ('standardscaler', StandardScaler())])
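- Concretely, a regularized commute-time pipeline with standardization might look like the sketch below; 'ridge__alpha' still works as the hyperparameter name, since make_pipeline names the Ridge step 'ridge'. This is only a sketch: depending on your encoder settings, you may need OneHotEncoder(sparse_output=False) or StandardScaler(with_mean=False), since StandardScaler can't center sparse matrices.
# A sketch of a ridge pipeline that standardizes the features before regularizing.
commute_pipe_ridge_std = make_pipeline(commute_feature_pipe, StandardScaler(), Ridge())
commute_model_ridge_std = GridSearchCV(
    commute_pipe_ridge_std,
    param_grid={'ridge__alpha': 10.0 ** np.arange(-10, 15)},
    scoring='neg_mean_squared_error',
    cv=10
)
# commute_model_ridge_std.fit(X_train, y_train) would then tune lambda as before.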
Question 🤔 (Answer at practicaldsc.org/q)
What questions do you have about regularization in general?
Gradient descent intuition¶
Minimizing empirical risk¶
- Repeatedly, we've been tasked with minimizing the value of empirical risk functions.
Why? To help us find the best model parameters, $h^*$ or $w^*$, which help us make the best predictions!
- We've minimized empirical risk functions in various ways.
Minimizing arbitrary functions¶
- Assume $f(w)$ is some differentiable function.
For now, we'll assume $f$ takes in a single number, $w$, as input and returns a single number as its output.
- When tasked with minimizing $f(w)$, our general strategy has been to:
- Find $\frac{df}{dw}(w)$, the derivative of $f$.
- Find the input $w^*$ such that $\frac{df}{dw}(w^*) = 0$.
- However, there are cases where we can find $\frac{df}{dw}(w)$, but it is either difficult or impossible to solve $\frac{df}{dw}(w^*) = 0$.
- Then what?
util.draw_f()
What does the derivative of a function tell us?¶
- Goal: Given a differentiable function $f(w)$, find the input $w^*$ that minimizes $f(w)$.
- What does $\frac{d}{dw} f(w)$ mean?
from ipywidgets import interact
interact(util.show_tangent, w0=(-1.5, 1.5));
Let's go hiking!¶
- Suppose you're at the top of a mountain 🏔️ and need to get to the bottom.
- Further, suppose it's really cloudy ☁️, meaning you can only see a few feet around you.
- How would you get to the bottom?
Searching for the minimum¶
- Suppose we're given an initial guess for a value of $w$ that minimizes $f(w)$.
- If the slope of the tangent line at $f(w)$ is positive 📈:
- Increasing $w$ increases $f$.
- This means the minimum must be to the left of the point $(w, f(w))$.
- Solution: Decrease $w$ ⬇️.
- The steeper the slope is, the further we must be from the minimum – so, the steeper the slope, the quicker we should decrease $w$!
Searching for the minimum¶
- Suppose we're given an initial guess for a value of $w$ that minimizes $f(w)$.
- If the slope of the tangent line at $f(w)$ is negative 📉:
- Increasing $w$ decreases $f$.
- This means the minimum must be to the right of the point $(w, f(w))$.
- Solution: Increase $w$ ⬆️.
- The steeper the slope is, the further we must be from the minimum – so, the steeper the slope, the quicker we should increase $w$!
Intuition¶
- To minimize $f(w)$, start with an initial guess for the minimizing input, $w^{(0)}$.
- Where do we go next?
- If $\frac{df}{dw}(w^{(0)}) > 0$, decrease $w^{(0)}$.
- If $\frac{df}{dw}(w^{(0)}) < 0$, increase $w^{(0)}$.
- One way to accomplish this is the update rule:
$$w^{(1)} = w^{(0)} - \frac{df}{dw}\left(w^{(0)}\right)$$
- A consequence of the above update rule: the larger $\frac{df}{dw}$ is, the bigger a step we take!
This matches our intuition from the previous few slides – the further we are from the minimum, the bigger of a step we should take!
Gradient descent¶
- To minimize a differentiable function $f$:
- Pick a positive number, $\alpha$. This number is called the learning rate, or step size.
Think of $\alpha$ as a hyperparameter of the minimization process.
- Pick an initial guess, $w^{(0)}$.
- Then, repeatedly update your guess using the update rule:
$$w^{(t+1)} = w^{(t)} - \alpha \frac{df}{dw}\left(w^{(t)}\right)$$
- Repeat this process until convergence – that is, when $w$ doesn't change much from iteration to iteration.
- This procedure is called gradient descent.
What is gradient descent?¶
- Gradient descent is a numerical method for finding the input to a function $f$ that minimizes the function.
- It is called gradient descent because the gradient is the extension of the derivative to functions of multiple variables.
- A numerical method is a technique for approximating the solution to a mathematical problem, often by using the computer.
- Gradient descent is widely used in machine learning to train models, from linear regression to neural networks and transformers (including ChatGPT)!
In machine learning, we use gradient descent to minimize empirical risk when we can't minimize it by hand, which is the case for most more sophisticated models.
Implementing gradient descent¶
- In practice, we typically don't implement gradient descent ourselves – we rely on existing implementations of it. But, we'll implement it here ourselves to understand what's going on.
- Let's start with an initial guess $w^{(0)} = 0$ and a learning rate $\alpha = 0.01$.
w = 0
for i in range(50):
print(round(w, 4), round(util.f(w), 4))
w = w - 0.01 * util.df(w)
0 -9 -0.02 -9.042 -0.042 -9.0927 -0.0661 -9.1537 -0.0925 -9.2267 -0.1214 -9.3135 -0.1527 -9.4158 -0.1866 -9.5347 -0.2229 -9.6708 -0.2615 -9.8235 -0.302 -9.9909 -0.344 -10.1687 -0.3867 -10.3513 -0.4293 -10.5311 -0.4709 -10.7001 -0.5104 -10.8511 -0.547 -10.9789 -0.58 -11.0811 -0.6089 -11.1586 -0.6335 -11.2141 -0.654 -11.2521 -0.6706 -11.277 -0.6839 -11.2927 -0.6943 -11.3023 -0.7023 -11.308 -0.7085 -11.3113 -0.7131 -11.3132 -0.7166 -11.3143 -0.7193 -11.3149 -0.7213 -11.3153 -0.7227 -11.3155 -0.7238 -11.3156 -0.7247 -11.3156 -0.7253 -11.3157 -0.7257 -11.3157 -0.726 -11.3157 -0.7263 -11.3157 -0.7265 -11.3157 -0.7266 -11.3157 -0.7267 -11.3157 -0.7268 -11.3157 -0.7268 -11.3157 -0.7269 -11.3157 -0.7269 -11.3157 -0.7269 -11.3157 -0.7269 -11.3157 -0.727 -11.3157 -0.727 -11.3157 -0.727 -11.3157 -0.727 -11.3157
- We see that pretty quickly, $w^{(i)}$ converges to $-0.727$!
Visualizing $w^{(0)} = 0, \alpha = 0.01$¶
util.minimizing_animation(w0=0, alpha=0.01)
Visualizing $w^{(0)} = 1.1, \alpha = 0.01$¶
What if we start with a different initial guess?
util.minimizing_animation(w0=1.1, alpha=0.01)
Visualizing $w^{(0)} = 0, \alpha = 0.1$¶
What if we use a different learning rate?
util.minimizing_animation(w0=0, alpha=0.1)
Visualizing $w^{(0)} = 0, \alpha = 1$¶
Some learning rates are so large that the values of $w^{(t)}$ explode towards infinity! Watch what happens when we use a learning rate of 1:
w = 0
for i in range(50):
    print(round(w, 4), round(util.f(w), 4))  # Current guess and f evaluated there.
    w = w - 1 * util.df(w)                   # The same update rule, now with alpha = 1.
0 -9
-2 55
148 2395575055
-64768502 87988399093209215258221002525055
...
[output truncated: the values of w grow astronomically large until Python refuses to print them]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[59], line 3
      1 w = 0
      2 for i in range(50):
----> 3     print(round(w, 4), round(util.f(w), 4))
      4     w = w - 1 * util.df(w)

ValueError: Exceeds the limit (4300) for integer string conversion; use sys.set_int_max_str_digits() to increase the limit
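When the iterates explode like this, the usual fix is simply to pick a smaller learning rate. If you're experimenting, one simple, purely illustrative safeguard is to bail out as soon as the guesses leave a reasonable range; the `1e6` threshold below is an arbitrary choice, not something from the lecture:
w = 0
for i in range(50):
    if abs(w) > 1e6:  # The guesses are exploding: the learning rate is too large.
        print(f'Diverged after {i} iterations; try a smaller learning rate.')
        break
    w = w - 1 * util.df(w)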
Gradient descent and empirical risk minimization¶
- While gradient descent can minimize other kinds of differentiable functions, its most common use case is in minimizing empirical risk.
- For example, consider:
- The constant model, $H(x) = h$.
- The dataset $-4, -2, 2, 4$.
- The initial guess $h_0 = 4$ and the learning rate $\alpha = \frac{1}{4}$.
- Exercise: Find $h_1$ and $h_2$.
- See the annotated slides for the solution, or check your work against the sketch below!
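Here's a minimal sketch for checking your work, assuming the empirical risk being minimized is mean squared error, $R(h) = \frac{1}{n}\sum_{i=1}^n (y_i - h)^2$ (the annotated slides are the authoritative source for the setup; the derivative below follows from that assumption):
import numpy as np

y = np.array([-4, -2, 2, 4])

def dR(h):
    # Derivative of mean squared error for the constant model:
    # d/dh [(1/n) * sum((y_i - h)^2)] = -(2/n) * sum(y_i - h).
    return -2 * np.mean(y - h)

h = 4          # Initial guess, h_0.
alpha = 1 / 4  # Learning rate.
for i in [1, 2]:
    h = h - alpha * dR(h)  # Gradient descent update.
    print(f'h_{i} = {h}')
Under this assumed loss, $\frac{dR}{dh}(h) = 2(h - \bar{y})$, and since the mean of this dataset is $0$, each update with $\alpha = \frac{1}{4}$ simply halves the current guess.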
Lingering questions¶
- When is gradient descent guaranteed to converge to a global minimum? What kinds of functions work well with gradient descent?
- How do we choose a step size?
- How do we use gradient descent to minimize functions of multiple variables, e.g. mean squared error in multiple linear regression,
$$R_\text{sq}(\vec{w}) = \frac{1}{n} \lVert \vec{y} - X\vec{w} \rVert^2?$$
A small sketch of this appears after this list.
- Question: Why can't we use gradient descent to find $\vec{w}_\text{LASSO}^*$?
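As a preview of that multivariable case, here's a minimal sketch, assuming we minimize mean squared error, whose gradient is $\nabla R_\text{sq}(\vec{w}) = -\frac{2}{n} X^T (\vec{y} - X\vec{w})$. The synthetic `X` and `y`, and the choices of `alpha` and the iteration count, are made up for illustration:
import numpy as np

np.random.seed(42)
X = np.random.normal(size=(100, 3))                  # Hypothetical design matrix.
w_true = np.array([2.0, -1.0, 0.5])
y = X @ w_true + np.random.normal(0, 0.1, size=100)  # Hypothetical targets.

def grad_R(w):
    # Gradient of mean squared error: -(2/n) X^T (y - Xw).
    return -(2 / len(y)) * X.T @ (y - X @ w)

w = np.zeros(3)  # Initial guess, w^(0).
alpha = 0.1      # Learning rate.
for _ in range(1000):
    w = w - alpha * grad_R(w)  # Same update rule, now applied to a whole vector of weights.
print(w)                       # Should end up close to w_true.
The update rule is exactly the same as before; it just moves an entire vector of weights at once, in the direction opposite the gradient.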