In [1]:
from lec_utils import *
import lec23_util as util
from IPython.display import YouTubeVideo
from ipywidgets import interact

Lecture 23

Classification

EECS 398-003: Practical Data Science, Fall 2024

practicaldsc.org • github.com/practicaldsc/fa24

Announcements 📣

  • The Portfolio Homework's checkpoint is due on Monday, November 25th – no slip days allowed!
    The full homework is due on Saturday, December 7th (no slip days!).
  • Homework 10 will be out by tomorrow – sorry for the delay!
    We'll adjust the deadline accordingly.
  • The Grade Report now includes scores and slip days through Homework 9 – make sure it's accurate!

Agenda

  • Recap: Gradient descent for multivariate functions.
  • Classification overview.
  • Survey of classification methods.
    • $k$-Nearest Neighbors 🏡🏠.
    • Decision trees 🎄.
    • Logistic regression 📈.
  • Evaluating classifiers.

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Recap: Gradient descent for multivariate functions


Example: Gradient descent for simple linear regression

  • To find optimal model parameters for the model $H(x) = w_0 + w_1 x$ and squared loss, we minimized empirical risk:
$$R_\text{sq}(w_0, w_1) = R_\text{sq}(\vec{w}) = \frac{1}{n} \sum_{i = 1}^n ( y_i - (w_0 + w_1 x_i ))^2$$
  • This is a function of multiple variables, and is differentiable, so it has a gradient!
$$\nabla R(\vec{w}) = \begin{bmatrix} \displaystyle -\frac{2}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1 x_i)) \\ \displaystyle -\frac{2}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1x_i))x_i \end{bmatrix}$$
  • Key idea: To find $\vec{w}^* = \begin{bmatrix} w_0^* \\ w_1^* \end{bmatrix}$, we could use gradient descent!
  • Why would we, when closed-form solutions exist?
At any point, there are many directions in which you can go "up", but there's only one "steepest direction up", and that's the direction of the gradient!

Gradient descent for simple linear regression, visualized

In [2]:
YouTubeVideo('oMk6sP7hrbk')
Out[2]:

Gradient descent for simple linear regression, implemented

  • Let's use gradient descent to fit a simple linear regression model to predict commute time in 'minutes' from 'departure_hour'.
In [3]:
df = pd.read_csv('data/commute-times.csv')
df[['departure_hour', 'minutes']]
util.make_scatter(df)
In [4]:
x = df['departure_hour']
y = df['minutes']
  • First, let's remind ourselves what $w_0^*$ and $w_1^*$ are supposed to be.
In [5]:
slope = np.corrcoef(x, y)[0, 1] * np.std(y) / np.std(x)
slope
Out[5]:
-8.186941724265557
In [6]:
intercept = np.mean(y) - slope * np.mean(x)
intercept
Out[6]:
142.44824158772875

Implementing partial derivatives

$$R_\text{sq}(\vec{w}) = \frac{1}{n} \sum_{i = 1}^n ( y_i - (w_0 + w_1 x_i ))^2$$
$$\nabla R(\vec{w}) = \begin{bmatrix} \displaystyle -\frac{2}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1 x_i)) \\ \displaystyle -\frac{2}{n} \sum_{i = 1}^n (y_i - (w_0 + w_1x_i))x_i \end{bmatrix}$$
In [7]:
def dR_w0(w0, w1):
    return -2 * np.mean(y - (w0 + w1 * x))
def dR_w1(w0, w1):
    return -2 * np.mean((y - (w0 + w1 * x)) * x)

Implementing gradient descent

  • The update rule we'll follow is:
$$\vec{w}^{(t+1)} = \vec{w}^{(t)} - \alpha \nabla R(\vec{w}^{(t)})$$
  • We can treat this as two separate update equations:
$$w_0^{(t+1)} = w_0^{(t)} - \alpha \frac{\partial R}{\partial w_0} (\vec{w}^{(t)}) \\ w_1^{(t+1)} = w_1^{(t)} - \alpha \frac{\partial R}{\partial w_1} (\vec{w}^{(t)})$$
  • Let's initialize $w_0^{(0)} = 0$ and $w_1^{(0)} = 0$, and choose the step size $\alpha = 0.01$.
    The initial guess is arbitrary – gradient descent will iteratively refine it.
In [8]:
# We'll store our guesses so far, so we can look at them later.
def gradient_descent_for_regression(w0_initial, w1_initial, alpha, threshold=0.0001):
    w0, w1 = w0_initial, w1_initial
    w0_history = [w0]
    w1_history = [w1]
    while True:
        w0 = w0 - alpha * dR_w0(w0, w1)
        w1 = w1 - alpha * dR_w1(w0, w1)
        w0_history.append(w0)
        w1_history.append(w1)
        if np.abs(w0_history[-1] - w0_history[-2]) <= threshold:
            break
    return w0_history, w1_history
In [9]:
w0_history, w1_history = gradient_descent_for_regression(0, 0, 0.01)
In [10]:
w0_history[-1]
Out[10]:
142.1051891023626
In [11]:
w1_history[-1]
Out[11]:
-8.146983792459055
  • It seems that we converge to (approximately) the right values! But how many iterations did it take? What could we do to speed it up?
In [12]:
len(w0_history)
Out[12]:
20664
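  • One way to speed things up is to experiment with the step size $\alpha$ (standardizing the feature also tends to help). Below is a minimal sketch: the same loop as above, but with a cap on the number of iterations, so that an overly large $\alpha$ (which can cause the iterates to diverge) doesn't loop forever.

def gradient_descent_capped(w0_initial, w1_initial, alpha, threshold=0.0001, max_iter=100_000):
    # Same idea as gradient_descent_for_regression, but stops after max_iter steps.
    w0, w1 = w0_initial, w1_initial
    w0_history, w1_history = [w0], [w1]
    for _ in range(max_iter):
        # Evaluate both partial derivatives at the current point, then update both parameters.
        grad_w0, grad_w1 = dR_w0(w0, w1), dR_w1(w0, w1)
        w0, w1 = w0 - alpha * grad_w0, w1 - alpha * grad_w1
        w0_history.append(w0)
        w1_history.append(w1)
        if np.abs(w0_history[-1] - w0_history[-2]) <= threshold:
            break
    return w0_history, w1_history

# For instance, compare the number of iterations for a few (hypothetical) step sizes:
# for alpha in [0.001, 0.005, 0.01]:
#     print(alpha, len(gradient_descent_capped(0, 0, alpha)[0]))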

Classification overview


The taxonomy of machine learning

  • So far, we've focused on building regression models.
  • Regression is a form of supervised learning, in which the target variable (i.e., the $y$-values we're trying to predict) is numerical.
    For example, a predicted commute time could technically be any real number.
  • Next, we'll focus on classification, a form of supervised learning in which the target variable is categorical.

Example classification problems

  • Does this person have diabetes?
    This is an example of binary classification – there are only two possible classes, or categories. In binary classification, the two classes are typically 1 (yes) and 0 (no).
  • Is this digit a 0, 1, 2, 3, 4, 5, 6, 7, 8, or 9?
    This is an example of multi-class classification, where there are multiple possible classes.
  • Will Michigan win this week?
  • Is this picture of a dog, cat, zebra, or hamster?

The plan

  • When we introduced regression, we started by understanding the theoretical foundations on paper, and then learned how to build models in sklearn.
  • This time, we'll do the reverse: we'll start by learning how to use classifiers in sklearn, and then over the next few lectures, we'll dive deeper into the internals of a few.
    • $k$-Nearest Neighbors.
    • Decision trees.
    • Logistic regression.

Loading the data

  • Our first classification example will involve predicting whether or not a patient has diabetes, given other information about their health.
In [13]:
diabetes = pd.read_csv('data/diabetes.csv')
display_df(diabetes, cols=9)
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.63 50 1
1 1 85 66 29 0 26.6 0.35 31 0
2 8 183 64 0 0 23.3 0.67 32 1
... ... ... ... ... ... ... ... ... ...
765 5 121 72 23 112 26.2 0.24 30 0
766 1 126 60 0 0 30.1 0.35 47 1
767 1 93 70 31 0 30.4 0.32 23 0

768 rows × 9 columns

In [14]:
# 0 means no diabetes, 1 means yes diabetes.
diabetes['Outcome'].value_counts()
Out[14]:
Outcome
0    500
1    268
Name: count, dtype: int64
  • 'Glucose' is measured in mg/dL (milligrams per deciliter).
  • 'BMI' is calculated as $\text{BMI} = \frac{\text{weight (kg)}}{\left[ \text{height (m)} \right]^2}$.
  • Let's start by using 'Glucose' and 'BMI' to predict whether or not a patient has diabetes ('Outcome').
  • But first, a train-test split:
In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = (
    train_test_split(diabetes[['Glucose', 'BMI']], diabetes['Outcome'], random_state=1)
)

Visualizing the data

  • Let's visualize the relationship between X_train and y_train. There are three numeric variables at play here – 'Glucose', 'BMI', and 'Outcome' – so we can use a 3D scatter plot.
In [16]:
px.scatter_3d(X_train.assign(Outcome=y_train), 
              x='Glucose', y='BMI', z='Outcome', 
              title='Relationship between Glucose, BMI, and Diabetes',
              width=800, height=600)
  • Since there are only two possible 'Outcome's, we can draw a 2D scatter plot of 'BMI' vs. 'Glucose' and color each point by 'Outcome'. Below, class 0 (orange) is "no diabetes" and class 1 (blue) is "diabetes".
In [17]:
fig = (
    X_train.assign(Outcome=y_train.astype(str).replace({'0': 'no diabetes', '1': 'yes diabetes'}))
            .plot(kind='scatter', x='Glucose', y='BMI', color='Outcome', 
                  color_discrete_map={'no diabetes': 'orange', 'yes diabetes': 'blue'},
                  title='Relationship between Glucose, BMI, and Diabetes')
            .update_layout(width=800)
)
fig
  • Using this dataset, how can we classify whether someone (not already in the dataset) has diabetes, given their 'Glucose' and 'BMI'?
  • Intuition: If a new person's feature vector is close to the blue points, we'll predict blue (diabetes); if they're close to the orange points, we'll predict orange (no diabetes).

Classifier 1: $k$-Nearest Neighbors 🏡🏠


$k$-Nearest Neighbors 🏡🏠

  • Suppose we're given a new individual, $\vec{x}_\text{new} = \begin{bmatrix} \text{Glucose}_\text{new} \\ \text{BMI}_\text{new} \end{bmatrix}$.
  • The $k$-Nearest Neighbors classifier ($k$-NN for short) classifies $\vec{x}_\text{new}$ by:
    1. Finding the $k$ closest points in the training set to $\vec{x}_\text{new}$.
    2. Predicting that $\vec{x}_\text{new}$ belongs to the most common class among those $k$ closest points.
In [18]:
fig
  • Example: Suppose $k = 6$. If, among the 6 closest points to $\vec{x}_\text{new}$, there are 4 blue and 2 orange points, we'd predict blue (diabetes).


What if there are ties? Read here.

  • $k$ is a hyperparameter that should be chosen through cross-validation.
    As we've seen in Homework 9 (and 10!) in the context of $k$-NN regression, smaller values of $k$ tend to overfit significantly.
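  • To make the two steps above concrete, here's a minimal from-scratch sketch of the $k$-NN prediction rule, assuming the training features are stored in a 2D numpy array X and the labels in an array y:

def knn_predict(X, y, x_new, k=6):
    # Step 1: find the k closest training points to x_new (Euclidean distance).
    dists = np.linalg.norm(X - x_new, axis=1)
    nearest_labels = y[np.argsort(dists)[:k]]
    # Step 2: predict the most common class among those k neighbors.
    values, counts = np.unique(nearest_labels, return_counts=True)
    return values[np.argmax(counts)]

# Example usage, with hypothetical feature values:
# knn_predict(X_train.to_numpy(), y_train.to_numpy(), np.array([125, 40]), k=6)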

KNeighborsClassifier in sklearn

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
  • Let's fit a KNeighborsClassifier by using cross-validation to choose a value of $k$ from 1 through 50.
    Note that KNeighborsClassifiers have several other hyperparameters. One of them is the metric used to measure distances; the default is the standard Euclidean (Pythagorean) distance, e.g. $\text{dist}(\vec u, \vec v) = \sqrt{(u_1 - v_1)^2 + (u_2 - v_2)^2 + ... + (u_d - v_d)^2}$.
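    For instance, the Euclidean distance between the feature vectors of the first two patients in the dataset (Glucose 148, BMI 33.6 and Glucose 85, BMI 26.6) can be computed with numpy:

u = np.array([148, 33.6])
v = np.array([85, 26.6])
np.sqrt(np.sum((u - v) ** 2)), np.linalg.norm(u - v)  # two equivalent ways to compute the same distance

    Notice that this distance is dominated by the difference in 'Glucose', since 'Glucose' is on a much larger scale than 'BMI' – this foreshadows the standardization discussion later in the lecture.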
In [20]:
model_knn = GridSearchCV(
    KNeighborsClassifier(),
    param_grid = {'n_neighbors': range(1, 51)}
)
model_knn.fit(X_train, y_train)
Out[20]:
GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 51)})
In [21]:
model_knn.best_params_
Out[21]:
{'n_neighbors': 28}
  • Cross-validation chose $k = 28$. With the resulting model, we can make predictions using the predict method, just like with regressors.
    Note that all of the work in making the prediction – finding the 28 nearest neighbors, for instance – is done when we call predict. "Training" does very little.
In [22]:
# To know what reasonable values for 'Glucose' and 'BMI' might be, let's look at the plot again.
fig
In [23]:
model_knn.predict(pd.DataFrame([{
    'Glucose': 125,
    'BMI': 40
}]))
Out[23]:
array([0])
  • What does the resulting model look like? Can we visualize it?

Decision boundaries

  • The decision boundaries of a classifier visualize the regions in the feature space that separate different predicted classes.
  • The decision boundaries for model_knn are visualized below.
    If a new person's feature vector lies in the blue region, we'd predict they do have diabetes; otherwise, we'd predict they don't.
In [24]:
util.show_decision_boundary(model_knn, X_train, y_train, title='Decision Boundary when $k = 28$')
  • What would the decision boundaries look like if $k$ increased or decreased?
    Play with the slider below to find out!
In [25]:
from ipywidgets import interact
interact(lambda k: util.visualize_k(k, X_train, y_train), k=(1, 51));
  • What if $k = n$, the number of points in the training set?
In [26]:
util.visualize_k(576, X_train, y_train)

Quantifying the performance of a classifier

  • For regression models, our default evaluation metric was mean squared error.
    Error is bad, so lower values indicate better model performance.
  • The most common evaluation metric in classification is accuracy:

    $$\text{accuracy} = \frac{\text{# data points classified correctly}}{\text{# data points}}$$

    Accuracy ranges from 0 to 1, i.e. 0% to 100%. Higher values indicate better model performance.

In [27]:
# Equivalent to 75%.
(model_knn.predict(X_test) == y_test).mean() 
Out[27]:
0.75
  • This is the default metric that the score method of a classifier computes, too.
In [28]:
model_knn.score(X_test, y_test) 
Out[28]:
0.75
In [29]:
# For future reference.
test_scores = pd.Series()
test_scores['knn with k = 28'] = model_knn.score(X_test, y_test) 
test_scores
Out[29]:
knn with k = 28    0.75
dtype: float64
  • Accuracy is not the only metric we care about, and can sometimes be misleading. More on this soon!

Activity

It seems that a $k$-NN classifier that uses $k = 1$ should achieve 100% training accuracy. Why doesn't the model defined below have 100% training accuracy?

In [30]:
model_k1 = KNeighborsClassifier(n_neighbors=1)
model_k1.fit(X_train, y_train)
Out[30]:
KNeighborsClassifier(n_neighbors=1)
In [31]:
# Training accuracy – high, but not 100%.
model_k1.score(X_train, y_train)
Out[31]:
0.9913194444444444
In [32]:
# Accuracy on test set is lower than when k = 28!
model_k1.score(X_test, y_test)
Out[32]:
0.6822916666666666
In [33]:
test_scores['knn with k = 1'] = model_k1.score(X_test, y_test)
test_scores
Out[33]:
knn with k = 28    0.75
knn with k = 1     0.68
dtype: float64

Discussion

Why should we generally standardize features before using a $k$-NN classifier?

In [34]:
X_train_scaled = X_train.copy()
X_train_scaled['Glucose * 2'] = X_train_scaled['Glucose'] * 2
(
    X_train_scaled.assign(Outcome=y_train.astype(str).replace({'0': 'no diabetes', '1': 'yes diabetes'}))
    .plot(kind='scatter', x='Glucose * 2', y='BMI', color='Outcome', 
                  color_discrete_map={'no diabetes': 'orange', 'yes diabetes': 'blue'},
                  title='Relationship between Glucose * 2, BMI, and Diabetes')
            .update_layout(width=1300)
            .update_xaxes(tickvals=np.arange(0, 500, 100))
)
In [35]:
fig
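  • Because 'Glucose' is on a much larger scale than 'BMI', it dominates the distances that $k$-NN computes – and doubling it (as above) changes which neighbors are "nearest", even though no new information was added. A common remedy, sketched below, is to standardize the features before computing distances by placing a StandardScaler in front of the classifier in a Pipeline:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# In a Pipeline built with make_pipeline, hyperparameters are addressed as '<step name>__<parameter>'.
model_knn_scaled = GridSearchCV(
    make_pipeline(StandardScaler(), KNeighborsClassifier()),
    param_grid={'kneighborsclassifier__n_neighbors': range(1, 51)}
)
model_knn_scaled.fit(X_train, y_train)
model_knn_scaled.score(X_test, y_test)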

Parametric vs. non-parametric models

  • The $k$-Nearest Neighbors classifier is an example of a non-parametric machine learning method.
  • Linear regression, on the other hand, is parametric.
  • Some differences between parametric and non-parametric models:
Parametric:
  • There's a fixed set of coefficients (parameters), $w_0, w_1, ..., w_d$, that we'll use for making predictions, and the number of coefficients is independent of the training set size.
  • Parametric methods make assumptions about the shape of the data and/or its underlying probability distribution.
    For instance, linear models assume a linear relationship between the features $X$ and target $\vec{y}$. There's a connection between the squared loss function and maximum likelihood estimation, too.
Non-Parametric:
  • No fixed set of parameters; model complexity increases as the training set size increases.
  • Non-parametric methods make no assumptions about the shape of the data.

Classifier 2: Decision trees 🎄


Decision trees 🎄

  • Suppose we're given a new individual, $\vec{x}_\text{new} = \begin{bmatrix} \text{Glucose}_\text{new} \\ \text{BMI}_\text{new} \end{bmatrix}$.
  • The decision tree classifier classifies $\vec{x}_\text{new}$ by:
    1. Asking a series of yes/no questions about $\text{Glucose}_\text{new}$ and $\text{BMI}_\text{new}$, e.g.:

    Is $\text{Glucose}_\text{new} \leq 129.5$?
    If so, is $\text{BMI}_\text{new} \leq 26.3$?
    If not, is $\text{BMI}_\text{new} \leq 29.95$?
    $\vdots$
    2. Once it runs out of questions to ask, it predicts that $\vec{x}_\text{new}$ belongs to the most common class among training set points that had the same answers as $\vec{x}_\text{new}$.
  • Visually, a fit decision tree may look like:
  • Decision trees are also non-parametric!

DecisionTreeClassifier in sklearn

In [36]:
from sklearn.tree import DecisionTreeClassifier
  • Let's fit a DecisionTreeClassifier.
    One of the main hyperparameters is max_depth, the number of questions to ask before making a prediction. Typically, we fit this with cross-validation, but for now we'll hard-code it.
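    If we did want to cross-validate max_depth, the recipe would look just like it did for $k$-NN; a sketch (with an arbitrary grid of depths):

model_tree_cv = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid={'max_depth': range(1, 16)}
)
model_tree_cv.fit(X_train, y_train)
model_tree_cv.best_params_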
In [37]:
model_tree = DecisionTreeClassifier(max_depth=3)
model_tree.fit(X_train, y_train)
Out[37]:
DecisionTreeClassifier(max_depth=3)
  • The decision tree achieves a slightly higher test set accuracy than the cross-validated $k$-NN model.
In [38]:
model_tree.score(X_test, y_test)
Out[38]:
0.7708333333333334
In [39]:
test_scores['decision tree with depth = 3'] = model_tree.score(X_test, y_test)
test_scores
Out[39]:
knn with k = 28                 0.75
knn with k = 1                  0.68
decision tree with depth = 3    0.77
dtype: float64
  • But what does it look like?

Decision boundaries for a decision tree classifier

In [40]:
util.show_decision_boundary(model_tree, X_train, y_train, title='Decision Boundary for a Tree of Depth 3')
  • Observe that the decision boundaries – at least when we set max_depth to 3 – look less "jagged" than with the $k$-NN classifier.

Visualizing decision trees

  • Our fit decision tree is like a "flowchart", made up of a series of questions.
    It turns out sklearn provides us with a convenient way of visualizing this flowchart.
  • As before, orange is "no diabetes" and blue is "diabetes".
In [41]:
from sklearn.tree import plot_tree
plt.figure(figsize=(13, 5))
plot_tree(model_tree, feature_names=X_train.columns, class_names=['no db', 'yes db'], 
          filled=True, fontsize=10, impurity=False);
  • To classify a new data point, we start at the top and answer the first question (i.e. "Glucose <= 129.5").
  • If the answer is "Yes", we move to the left branch, otherwise we move to the right branch.
  • We repeat this process until we end up at a leaf node, at which point we predict the most common class in that node.
    Note that each node has a value attribute, which describes the number of training individuals of each class that fell in that node.
In [42]:
y_train[X_train[X_train['Glucose'] <= 129.5].index].value_counts() 
Out[42]:
Outcome
0    304
1     78
Name: count, dtype: int64
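  • To see the flowchart in action, we can ask the fitted tree to classify a hypothetical patient, and check that tracing the questions above by hand leads to the same prediction:

model_tree.predict(pd.DataFrame([{
    'Glucose': 125,
    'BMI': 40
}]))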

Increasing tree depth

  • One of the many hyperparameters we can tune is tree depth.
  • What happens to the decision boundary of the resulting classifier if we increase max_depth?
In [43]:
interact(lambda depth: util.visualize_depth(depth, X_train, y_train), depth=(1, 51));
  • What happens to the flowchart representation of the resulting classifier if we increase max_depth?
In [44]:
# By default, there is no pre-specified maximum depth.
# The training algorithm keeps splitting until every leaf is pure (or can't be split any further).
model_tree_no_max = DecisionTreeClassifier()
model_tree_no_max.fit(X_train, y_train)
Out[44]:
DecisionTreeClassifier()
In [45]:
# Uncomment this!
# plt.figure(figsize=(5, 5))
# plot_tree(model_tree_no_max, feature_names=X_train.columns, class_names=['no db', 'yes db'], 
#           filled=True, fontsize=10, impurity=False);
  • The tree is extremely overfit to the training set, and very deep!
In [46]:
# Training accuracy. This number should look familiar!
model_tree_no_max.score(X_train, y_train)
Out[46]:
0.9913194444444444
In [47]:
model_tree_no_max.tree_.max_depth
Out[47]:
21
In [48]:
# Worse test set performance than when we used max_depth = 3!
test_scores['decision tree with no specified max depth'] = model_tree_no_max.score(X_test, y_test)
test_scores
Out[48]:
knn with k = 28                              0.75
knn with k = 1                               0.68
decision tree with depth = 3                 0.77
decision tree with no specified max depth    0.71
dtype: float64

Lingering questions about decision trees

  • How are decision trees fit – that is, how do they decide what questions to ask?
  • Other than limiting the depth of a decision tree, how else can we scale back the complexity of a decision tree (to hopefully help with model generalizability)? What is a random forest?
  • Can they be used for regression, too?
  • We'll address these ideas next week.

Activity



Classifier 3: Logistic regression 📈


Logistic regression 📈

  • Logistic regression is a linear classification technique that builds upon linear regression.
  • It models the probability of belonging to class 1, given a feature vector:
$$P(y = 1 | \vec{x}) = \sigma (\underbrace{w_0 + w_1 x^{(1)} + w_2 x^{(2)} + ... + w_d x^{(d)}}_{\text{linear regression model}}) = \sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}) \right)$$
  • Here, $\sigma(t) = \frac{1}{1 + e^{-t}}$ is the sigmoid function; its outputs are between 0 and 1, which means they can be interpreted as probabilities.
    The predictions of a "regular" linear regression model can be anything from $-\infty$ to $+\infty$, meaning they can't be interpreted as probabilities.
  • Note that the existence of coefficients, $w_0, w_1, ... w_d$, that we need to learn from the data, tells us that logistic regression is a parametric method!

Predicting probabilities vs. predicting classes

$$P(y = 1 | \vec{x}) = \sigma (\underbrace{w_0 + w_1 x^{(1)} + w_2 x^{(2)} + ... + w_d x^{(d)}}_{\text{linear regression model}}) = \sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}) \right)$$
  • 🤔 Question: Suppose our logistic regression model predicts the probability that someone has diabetes is 0.75. What do we predict – diabetes or no diabetes? What if the predicted probability is 0.3?
  • 🙋 Answer: We have to pick a threshold (for example, 0.5)!
    • If the predicted probability is above the threshold, we predict diabetes (1).
    • Otherwise, we predict no diabetes (0).

The sigmoid function

  • The sigmoid function, also known as the logistic function, resembles an $S$-shape.
$$\sigma(t) = \frac{1}{1 + e^{-t}} = \frac{1}{1 + \text{exp}(-t)}$$
  • Below, we'll look at the shape of $y = \sigma(w_0 + w_1 x)$ for different values of $w_0$ and $w_1$.
    • $w_0$ controls the position of the curve on the $x$-axis.
    • $w_1$ controls the "steepness" of the curve.
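  • The sigmoid itself is a one-liner in numpy; a quick sketch, evaluated at a few points:

def sigmoid(t):
    return 1 / (1 + np.exp(-t))

# sigmoid(0) is exactly 0.5; large positive inputs approach 1, large negative inputs approach 0.
sigmoid(0), sigmoid(5), sigmoid(-5)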
In [62]:
# If this doesn't render in the HTML, see the notebook on GitHub or the recording!
util.show_three_sigmoids()
  • Below, interact with the sliders to change the values of $w_0$ and $w_1$.
In [63]:
# If this doesn't render in the HTML, see the notebook on GitHub or the recording!
interact(util.plot_sigmoid, w0=(-15, 15), w1=(-15, 15));

LogisticRegression in sklearn

In [51]:
from sklearn.linear_model import LogisticRegression
  • Let's fit a LogisticRegression classifier. Specifically, this means we're asking sklearn to learn the optimal parameters $w_0^*$, $w_1^*$, and $w_2^*$ in:
$$P(y = 1 | \vec{x}) = \sigma \left( w_0 + w_1 \cdot \text{Glucose} + w_2 \cdot \text{BMI} \right)$$
  • The most common loss function for logistic regression isn't squared loss; rather, it's cross-entropy loss (also known as log loss). More on this next class.
  • By default, sklearn uses $L_2$ regularization for logistic regression. It doesn't cross-validate for $\lambda$ unless we tell it to; by default, it uses $\lambda = 1$.
In [52]:
model_logistic = LogisticRegression()
model_logistic.fit(X_train, y_train)
Out[52]:
LogisticRegression()
  • The test accuracy, without any cross-validation for the regularization hyperparameter, is about the same as the other not-super-overfit models.
In [53]:
model_logistic.score(X_test, y_test)
Out[53]:
0.765625
In [54]:
test_scores['logistic regression'] = model_logistic.score(X_test, y_test)
test_scores.to_frame()
Out[54]:
0
knn with k = 28 0.75
knn with k = 1 0.68
decision tree with depth = 3 0.77
decision tree with no specified max depth 0.71
logistic regression 0.77
  • But again, we should ask, what does it look like?

Predicting probabilities vs. predicting classes, revisited

  • By default, the predict method of a fit LogisticRegression model predicts a class.
In [55]:
model_logistic.predict(pd.DataFrame([{
    'Glucose': 125,
    'BMI': 40
}]))
Out[55]:
array([0])
  • But, logistic regression is designed to predict probabilities. We can access these predicted probabilities using the predict_proba method.
In [56]:
model_logistic.predict_proba(pd.DataFrame([{
    'Glucose': 125,
    'BMI': 40
}]))
Out[56]:
array([[0.52, 0.48]])
  • The above is telling us that the model thinks this person has:
    • A 52% chance of belonging to class 0 (no diabetes).
    • A 48% chance of belonging to class 1 (diabetes).
  • By default, it uses a threshold of 0.5, i.e. it predicts the larger probability. As we'll soon discuss, this may not be what we want!
    Unfortunately, sklearn doesn't let us change the threshold ourselves. If we want a different threshold, we need to manually implement it using the results of predict_proba.
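    For example, a sketch of manual thresholding with an arbitrarily chosen threshold:

threshold = 0.4  # hypothetical choice; we'll discuss how to choose thresholds next class
prob_of_diabetes = model_logistic.predict_proba(X_test)[:, 1]  # column 1 = predicted probability of class 1
manual_predictions = (prob_of_diabetes >= threshold).astype(int)
manual_predictions[:10]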

Decision boundaries for logistic regression

In [57]:
util.show_decision_boundary(model_logistic, X_train, y_train, title='Decision Boundary for Logistic Regression')
  • Unlike the decision boundaries of our other classifiers, this one is linear! Next class, we'll see how to find the equation of the line that separates the two classes, and how it relates to the coefficients and intercept below.
In [58]:
model_logistic.intercept_, model_logistic.coef_
Out[58]:
(array([-7.62]), array([[0.04, 0.08]]))
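  • As a sanity check (a sketch), plugging the fitted intercept and coefficients into the sigmoid by hand should reproduce, up to rounding, the probability of class 1 that predict_proba gave us earlier:

w0 = model_logistic.intercept_[0]
w1, w2 = model_logistic.coef_[0]
# Predicted probability of diabetes for the same hypothetical person as before (Glucose 125, BMI 40).
1 / (1 + np.exp(-(w0 + w1 * 125 + w2 * 40)))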
  • But where does the sigmoid curve $\sigma(t)$ appear, in the context of making predictions?

Visualizing the probability of belonging to class 1

  • Recall, the logistic regression model is trained to predict the probability of class 1 (diabetes).
$$P(y = 1 | \vec{x}) = \sigma \left( w_0^* + w_1^* \cdot \text{Glucose} + w_2^* \cdot \text{BMI} \right)$$
  • The graph below shows the predicted probabilities of class 1 (diabetes) for different combinations of features.
In [59]:
util.show_logistic(model_logistic, X_train, y_train)
  • Play with the slider below to change the threshold that's used to classify an individual as class 1 (diabetes) or class 0 (no diabetes)!
In [60]:
interact(lambda t: util.show_logistic(model_logistic, X_train, y_train, show_threshold=True, t=t), t=(0, 1, 0.05));
  • The introduction of the threshold creates a decision boundary, which is reflected in the 2D plot we saw a few slides ago.
In [61]:
util.show_decision_boundary(model_logistic, X_train, y_train, title='Decision Boundary for Logistic Regression')

Lingering questions about logistic regression

  • What loss function do we use to find optimal model parameters for logistic regression?
  • How do we interpret the resulting coefficients?
  • What assumptions does the logistic regression model make?
    The linear regression model assumes the output is a linear combination of features. Part of the logistic regression model resembles the linear regression model, so presumably, something in logistic regression is a linear combination of features, but what?
  • We'll address these ideas next class.

Classifier evaluation


Accuracy isn't everything!

$$ \text{accuracy} = \frac{\text{# data points classified correctly}}{\text{# data points}} $$
  • Accuracy is defined as the proportion of predictions that are correct.
  • It weighs all correct predictions the same, and weighs all incorrect predictions the same.
  • But some incorrect predictions may be worse than others!
    • Suppose you take a COVID test 🦠. Which is worse:
      • The test saying you have COVID, when you really don't, or
      • The test saying you don't have COVID, when you really do?

Outcomes in binary classification

  • When performing binary classification, there are four possible outcomes.
    Note: A "positive prediction" is a prediction of 1, and a "negative prediction" is a prediction of 0.
Outcome of Prediction Definition True Class
True positive (TP) ✅ The predictor correctly predicts the positive class. P
False negative (FN) ❌ The predictor incorrectly predicts the negative class. P
True negative (TN) ✅ The predictor correctly predicts the negative class. N
False positive (FP) ❌ The predictor incorrectly predicts the positive class. N
  • We typically organize the four quantities above into a confusion matrix.
Predicted Negative Predicted Positive
Actually Negative TN ✅ FP ❌
Actually Positive FN ❌ TP ✅
  • Note that in the four acronyms – TP, FN, TN, FP – the first letter is whether the prediction is correct, and the second letter is what the prediction is.
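  • In code, sklearn.metrics.confusion_matrix arranges these counts the same way as the table above: rows are actual classes and columns are predicted classes, with class 0 (negative) first. A sketch, using our logistic regression model's test set predictions:

from sklearn.metrics import confusion_matrix

# Layout: [[TN, FP],
#          [FN, TP]]
confusion_matrix(y_test, model_logistic.predict(X_test))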

Example: COVID testing 🦠

  • Michigan Medicine administers hundreds of COVID tests a day. The tests are not fully accurate.
  • Each test comes back either:
    • positive, indicating that the individual has COVID, or
    • negative, indicating that the individual does not have COVID.
  • Question: What is a TP in this scenario? FP? TN? FN?
  • TP: The test predicted that the individual has COVID, and they do ✅.
  • FP: The test predicted that the individual has COVID, but they don't ❌.
  • TN: The test predicted that the individual doesn't have COVID, and they don't ✅.
  • FN: The test predicted that the individual doesn't have COVID, but they do ❌.

Accuracy of COVID tests

  • The results of 100 Michigan Medicine COVID tests are given below.
Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅
Michigan Medicine test results.
  • 🤔 Question: What is the accuracy of the test?
  • 🙋 Answer: $$\text{accuracy} = \frac{TP + TN}{TP + FP + FN + TN} = \frac{1 + 90}{100} = 0.91$$
  • Followup: At first, the test seems good. But, suppose we build a classifier that predicts that nobody has COVID. What would its accuracy be?
  • Answer to followup: Also 0.91! There is severe class imbalance in the dataset, meaning that most of the data points are in the same class (no COVID). Accuracy doesn't tell the full story.

Recall

Predicted Negative Predicted Positive
Actually Negative TN = 90 ✅ FP = 1 ❌
Actually Positive FN = 8 ❌ TP = 1 ✅
Michigan Medicine test results
  • 🤔 Question: What proportion of individuals who actually have COVID did the test identify?
  • 🙋 Answer: $\frac{1}{1 + 8} = \frac{1}{9} \approx 0.11$.
  • More generally, the recall of a binary classifier is the proportion of actually positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.
$$\text{recall} = \frac{TP}{\text{# actually positive}} = \frac{TP}{TP + FN}$$
  • To compute recall, look at the bottom (positive) row of the above confusion matrix.

Recall isn't everything, either!

$$\text{recall} = \frac{TP}{TP + FN}$$
  • 🤔 Question: Can you design a "COVID test" with perfect recall?
  • 🙋 Answer: Yes – just predict that everyone has COVID!
Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-COVID classifier
$$\text{recall} = \frac{TP}{TP + FN} = \frac{9}{9 + 0} = 1$$
  • Like accuracy, recall on its own is not a perfect metric. Even though the classifier we just created has perfect recall, it has 91 false positives!

Precision

Predicted Negative Predicted Positive
Actually Negative TN = 0 ✅ FP = 91 ❌
Actually Positive FN = 0 ❌ TP = 9 ✅
everyone-has-COVID classifier
  • The precision of a binary classifier is the proportion of predicted positive instances that are correctly classified. We'd like this number to be as close to 1 (100%) as possible.
$$\text{precision} = \frac{TP}{\text{# predicted positive}} = \frac{TP}{TP + FP}$$
  • To compute precision, look at the right (positive) column of the above confusion matrix.
    Tip: A good way to remember the difference between precision and recall is that in the denominator for 🅿️recision, both terms have 🅿️ in them (TP and FP).
  • Note that the "everyone-has-COVID" classifier has perfect recall, but a precision of $\frac{9}{9 + 91} = 0.09$, which is quite low.
  • 🚨 Key idea: There is a "tradeoff" between precision and recall. Ideally, you want both to be high. For a particular prediction task, one may be more important than the other.
  • Next class, we'll see how to weigh this tradeoff in the context of selecting a threshold for classification in logistic regression.
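  • For the classifiers we trained earlier, precision and recall are one import away; a sketch, again using the logistic regression model's test set predictions:

from sklearn.metrics import precision_score, recall_score

predictions = model_logistic.predict(X_test)
precision_score(y_test, predictions), recall_score(y_test, predictions)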

Precision and recall

(Figure: a visual summary of precision and recall.)

Discussion

$$\text{precision} = \frac{TP}{TP + FP} \: \: \: \: \: \: \: \: \text{recall} = \frac{TP}{TP + FN}$$
  • 🤔 When might high precision be more important than high recall?

  • 🤔 When might high recall be more important than high precision?

Activity

Consider the confusion matrix shown below.

Predicted Negative Predicted Positive
Actually Negative TN = 22 ✅ FP = 2 ❌
Actually Positive FN = 23 ❌ TP = 18 ✅

What is the accuracy of the above classifier? The precision? The recall?


After calculating all three on your own, click below to see the answers.

👉 Accuracy: (22 + 18) / (22 + 2 + 23 + 18) = 40 / 65
👉 Precision: 18 / (18 + 2) = 9 / 10
👉 Recall: 18 / (18 + 23) = 18 / 41

Activity

After fitting a BillyClassifier, we use it to make predictions on an unseen test set. Our results are summarized in the following confusion matrix.

Predicted Negative Predicted Positive
Actually Negative ??? 30
Actually Positive 66 105
  • Part 1: What is the recall of our classifier? Give your answer as a fraction (it does not need to be simplified).

  • Part 2: The accuracy of our classifier is $\frac{69}{117}$. How many true negatives did our classifier have? Give your answer as an integer.

  • Part 3: True or False: In order for a binary classifier's precision and recall to be equal, the number of mistakes it makes must be an even number.

  • Part 4: Suppose we are building a classifier that listens to an audio source (say, from your phone's microphone) and predicts whether or not it is Soulja Boy's 2008 classic "Kiss Me thru the Phone." Our classifier is pretty good at detecting when the input stream is "Kiss Me thru the Phone", but it often incorrectly predicts that similar sounding songs are also "Kiss Me thru the Phone."

Complete the sentence: Our classifier has...

  • low precision and low recall.
  • low precision and high recall.
  • high precision and low recall.
  • high precision and high recall.

Combining precision and recall

  • If we care equally about a model's precision $PR$ and recall $RE$, we can combine the two using a single metric called the F1-score:
$$\text{F1-score} = \text{harmonic mean}(PR, RE) = 2\frac{PR \cdot RE}{PR + RE}$$
  • Both F1-score and accuracy are overall measures of a binary classifier's performance. But remember, accuracy is misleading in the presence of class imbalance, and doesn't take into account the kinds of errors the classifier makes.
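  • A quick sketch: computing the F1-score both by hand and with sklearn.metrics.f1_score for our logistic regression model; the two should agree.

from sklearn.metrics import precision_score, recall_score, f1_score

predictions = model_logistic.predict(X_test)
pr = precision_score(y_test, predictions)
re = recall_score(y_test, predictions)
2 * pr * re / (pr + re), f1_score(y_test, predictions)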

Other evaluation metrics for binary classifiers

  • We just scratched the surface! This excellent table from Wikipedia summarizes the many other metrics that exist.
  • If you're interested in exploring further, a good next metric to look at is true negative rate (i.e. specificity), which is the analogue of recall for true negatives.