# Run this cell to get everything set up.
from lec_utils import *
import lec23_util as util
diabetes = pd.read_csv('data/diabetes.csv')
from sklearn.model_selection import train_test_split
diabetes = diabetes[(diabetes['Glucose'] > 0) & (diabetes['BMI'] > 0)]
X_train, X_test, y_train, y_test = (
train_test_split(diabetes[['Glucose', 'BMI']], diabetes['Outcome'], random_state=1)
)
Lecture 23¶
Logistic Regression, Continued¶
EECS 398: Practical Data Science, Winter 2025¶
practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed
Agenda¶
- Recap: Logistic regression.
- Choosing a threshold.
- Linear separability.
- Softmax regression.
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Recap: Logistic regression¶
Logistic regression¶
- Logistic regression is a linear classification technique that builds upon linear regression.
- It models the probability of belonging to class 1, given a feature vector:
$$P(y_i = 1 | \vec{x}_i) = \sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}_i)\right) = \frac{1}{1 + e^{-\vec{w} \cdot \text{Aug}(\vec{x}_i)}}$$
- Suppose we train a logistic regression model to predict the probability a patient has diabetes ($y = 1$) given their 'Glucose' and 'BMI'.
If our optimal parameters end up being $\vec{w}^* = \begin{bmatrix} -7.85 & 0.04 & 0.08 \end{bmatrix}^T$, we then predict probabilities using:
$$P(y_i = 1 | \vec{x}_i) = \sigma\left(-7.85 + 0.04 \cdot \text{Glucose}_i + 0.08 \cdot \text{BMI}_i\right)$$
- To find the optimal parameters $\vec{w}^*$, we minimize mean cross-entropy loss:
$$-\frac{1}{n} \sum_{i = 1}^n \left[ y_i \log \left( \sigma(\vec{w} \cdot \text{Aug}(\vec{x}_i)) \right) + (1 - y_i) \log \left( 1 - \sigma(\vec{w} \cdot \text{Aug}(\vec{x}_i)) \right) \right]$$
There's no closed-form solution for $\vec{w}^*$, so we use some numerical method (or, rather, sklearn does).
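- To make this concrete, here's a minimal sketch of that numerical minimization, using scipy.optimize.minimize on the training data from the setup cell. (This ignores the regularization that sklearn applies by default, so the resulting parameters will differ slightly from sklearn's.)
from scipy.optimize import minimize
from scipy.special import expit  # numerically stable sigmoid
X_aug = np.column_stack([np.ones(len(X_train)), X_train])  # Aug(x_i) for every training point
y = y_train.to_numpy()
def mean_cross_entropy(w):
    # sigma(w . Aug(x_i)) for every i, clipped so that log(0) never occurs.
    p = np.clip(expit(X_aug @ w), 1e-9, 1 - 1e-9)
    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
w_star = minimize(mean_cross_entropy, x0=np.zeros(X_aug.shape[1])).x
w_star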
LogisticRegression in sklearn¶
- To illustrate, let's re-fit a model to predict diabetes from 'Glucose' and 'BMI' in sklearn.
from sklearn.linear_model import LogisticRegression
model_logistic_multiple = LogisticRegression()
model_logistic_multiple.fit(X_train, y_train)
LogisticRegression()
- By default, the predict method of a fit LogisticRegression model predicts a class; it applies a threshold of $T = 0.5$ to the predicted probability.
model_logistic_multiple.predict(pd.DataFrame([{
'Glucose': 150,
'BMI': 25,
}]))
array([0])
- We can access the predicted probabilities using the predict_proba method.
model_logistic_multiple.predict_proba(pd.DataFrame([{
'Glucose': 150,
'BMI': 25,
}]))
array([[0.58, 0.42]])
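- As a sanity check, we can reproduce the class-1 probability above directly from the fit model's intercept_ and coef_; this is just the $\sigma(\vec{w}^* \cdot \text{Aug}(\vec{x}_i))$ formula, written out by hand.
# w* . Aug(x_i) for Glucose = 150 and BMI = 25.
z = model_logistic_multiple.intercept_[0] + model_logistic_multiple.coef_[0] @ np.array([150, 25])
1 / (1 + np.e ** (-z))  # Matches the second entry of predict_proba above, ~0.42.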
The decision boundary in the feature space¶
- After choosing $T = 0.5$, what does the resulting decision boundary look like, in a $d = 2$ dimensional plot?
util.show_decision_boundary(model_logistic_multiple, X_train, y_train, title='Logistic Regression Decision Boundary (T = 0.5)')
- Note that unlike the decision boundaries for $k$-Nearest Neighbors and decision trees, this decision boundary is linear. Specifically, it is the line:
$$\sigma\left(w_0^* + w_1^* \cdot \text{Glucose} + w_2^* \cdot \text{BMI}\right) = 0.5$$
- Important: Since $\sigma(0) = 0.5$, we can write the above as:
$$w_0^* + w_1^* \cdot \text{Glucose} + w_2^* \cdot \text{BMI} = 0$$
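- For instance, we can read the slope and intercept of this line (in the Glucose-BMI plane) off of the fit model, by solving the equation above for BMI. A quick sketch:
# The T = 0.5 boundary satisfies w0 + w1 * Glucose + w2 * BMI = 0,
# i.e. BMI = (-w1 / w2) * Glucose + (-w0 / w2).
w0 = model_logistic_multiple.intercept_[0]
w1, w2 = model_logistic_multiple.coef_[0]
slope, intercept = -w1 / w2, -w0 / w2
slope, intercept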
Question 🤔 (Answer at practicaldsc.org/q)
Which expression describes the odds ratio, $$\frac{P(y_i = 1 | \vec{x}_i)}{P(y_i = 0 | \vec{x}_i)}$$
in the logistic regression model?
- A. $\vec{w} \cdot \text{Aug}(\vec{x}_i)$
- B. $-\vec{w} \cdot \text{Aug}(\vec{x}_i)$
- C. $e^{\vec{w} \cdot \text{Aug}(\vec{x}_i)}$
- D. $\sigma(\vec{w} \cdot \text{Aug}(\vec{x}_i))$
- E. None of the above.
Question 🤔 (Answer at practicaldsc.org/q)
Which expression describes $P(y_i = \mathbf{0} | \vec{x}_i)$ in the logistic regression model?
- A. $\sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}_i) \right)$
- B. $-\sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}_i) \right)$
- C. $\sigma\left(- \vec{w} \cdot \text{Aug}(\vec{x}_i) \right)$
- D. $1 - \log \left( 1 + e^{\vec{w} \cdot \text{Aug}(\vec{x}_i)} \right)$
- E. $1 + \log \left( 1 + e^{- \vec{w} \cdot \text{Aug}(\vec{x}_i)} \right)$
Choosing a threshold¶
Thresholding¶
- As we've seen, in order to classify $\vec{x}_i$ as either yes ($y_i = 1$) or no ($y_i = 0$), we apply a threshold $T$ to the predicted probability.
For example, under a threshold of, say, $T = 0.6$, a predicted probability of 0.75 is classified as diabetes (class 1), and a predicted probability of 0.55 is classified as no diabetes (class 0).
More generally, if we pick a threshold of $T$, then any feature vector $\vec{x}_i$ such that:
$$\sigma(\vec{w}^* \cdot \text{Aug}(\vec{x}_i)) \geq T$$
is classified as class 1.
- Question: How do we choose the "right" threshold?
sklearn's default threshold of $T = 0.5$ is not guaranteed to yield the highest accuracy!
Remember, to find $\vec{w}^*$, we minimized mean cross-entropy loss (that is, we didn't "maximize" accuracy), and mean cross-entropy loss doesn't involve our threshold.
Choosing a custom threshold¶
- If we want to use a custom threshold, we'll need to implement the logic ourselves.
def predict_thresholded(X, T):
'''Calls model_logistic_multiple.predict_proba.
For each P(y_i = 1 | x_i), returns 1 if >= T and 0 if < T.'''
probs = model_logistic_multiple.predict_proba(X)[:, 1]
return (probs >= T).astype(int)
- Now, we can choose any threshold we'd like, and compute the accuracy of the resulting predictions.
predict_thresholded([[150, 25]], 0.5)
array([0])
predict_thresholded([[150, 25]], 0.4)
array([1])
predict_thresholded(X_train, 0.4)
array([0, 0, 1, ..., 0, 0, 0])
# Training accuracy for the threshold T = 0.4.
(predict_thresholded(X_train, 0.4) == y_train).mean()
0.7588652482269503
Accuracy vs. threshold¶
- Accuracy is defined as:
$$\text{accuracy} = \frac{\text{# points classified correctly}}{\text{# points total}}$$
- How does the model's training accuracy change as the threshold changes?
Note that we'd see a similar trend with test accuracy, too.
util.plot_vs_threshold(X_train, y_train, 'Accuracy')
- The threshold with the best training accuracy (among the thresholds we tried) is $T = 0.465$, which has a training accuracy of 77.3%.
- Remember that 64% of people in the training set don't have diabetes, so we can achieve a 64% training accuracy just by always predicting "no diabetes"! This means that a good model's accuracy should be much higher than 64%.
pd.Series(y_train).value_counts(normalize=True)
Outcome
0    0.64
1    0.36
Name: proportion, dtype: float64
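- To see where the $T = 0.465$ figure above comes from, here's a minimal sketch that sweeps a grid of candidate thresholds and keeps the one with the highest training accuracy. (The exact winner depends on the grid of thresholds tried.)
thresholds = np.arange(0, 1.001, 0.005)
accs = np.array([(predict_thresholded(X_train, t) == y_train).mean() for t in thresholds])
best = np.argmax(accs)
thresholds[best], accs[best]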
Metrics for binary classification¶
- A few lectures ago, we introduced other metrics for measuring the quality of a binary classifier's predictions.
- A binary classifier's confusion matrix displays its number of true positives ($TP$), false positives ($FP$), true negatives ($TN$), and false negatives ($FN$).
util.show_confusion(X_train, y_train, T=0.5)
- Remember, we're predicting whether or not patients have diabetes. Which is worse: a false positive or a false negative?
Observe how the values in the confusion matrix change as the threshold changes!
interact(lambda T: util.show_confusion(X_train, y_train, T), T=(0, 1, 0.01));
Precision vs. threshold¶
Precision is defined as:
$$\text{precision} = \frac{TP}{\text{# predicted positive}} = \frac{TP}{TP + FP}$$
Here, a false positive ($FP$) is when we predict that someone has diabetes when they do not.
- How does the model's training precision change as the threshold changes?
util.plot_vs_threshold(X_train, y_train, 'Precision')
- If the "bar" is higher to predict 1, then we will have fewer positives in general, and thus fewer false positives.
- As the threshold increases ⬆️, the denominator in $\text{precision} = \frac{TP}{TP + FP}$ will decrease, and so precision tends to increase ⬆️.
There are some cases where a slightly higher threshold led to a slightly lower precision; why?
Recall vs. threshold¶
Recall is defined as:
$$\text{recall} = \frac{TP}{\text{# actually positive}} = \frac{TP}{TP + FN}$$
Here, a false negative ($FN$) is when we predict that someone does not have diabetes, when they really do.
- How does the model's training recall change as the threshold changes?
util.plot_vs_threshold(X_train, y_train, 'Recall')
- Note that the denominator in $\text{recall} = \frac{TP}{\text{# actually positive}}$ is constant. As the threshold increases ⬆️:
    - true positives get converted to false negatives, so
    - the numerator of recall ($TP$) decreases, and so
    - recall decreases ⬇️.
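- We don't have to read precision and recall off of the plots; sklearn.metrics can compute them for any threshold's predictions. A quick sketch at a few thresholds:
from sklearn.metrics import precision_score, recall_score
for t in [0.3, 0.5, 0.7]:
    preds = predict_thresholded(X_train, t)
    print(f'T = {t}: precision = {precision_score(y_train, preds):.2f}, recall = {recall_score(y_train, preds):.2f}')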
Precision vs. recall¶
- We can visualize how precision and recall vary together.
util.pr_curve(X_train, y_train)
- The curve above is called a PR curve.
- Question: Given the information above, what threshold would you choose?
- Answer: The threshold whose point is closest to the top right corner of the plot above.
Why? The top right corner is where precision = 1 and recall = 1, and we want both to be high.
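- One way to operationalize "closest to the top right corner" is to measure each threshold's distance from the point where precision = 1 and recall = 1. Here's a sketch using sklearn.metrics.precision_recall_curve, which may evaluate a slightly different set of thresholds than util.pr_curve does:
from sklearn.metrics import precision_recall_curve
probs = model_logistic_multiple.predict_proba(X_train)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_train, probs)
# precision and recall have one more entry than thresholds, so drop the last point.
dists = np.sqrt((1 - precision[:-1]) ** 2 + (1 - recall[:-1]) ** 2)
thresholds[np.argmin(dists)]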
ROC curves¶
- A more popular variant of the PR curve is the ROC curve.
ROC stands for "receiver operating characteristic."
See here for a good discussion on the differences between PR curves and ROC curves.
- A ROC curve plots true positive rate (TPR) vs. false positive rate (FPR) for all possible thresholds, where:
$$\text{TPR} = \frac{TP}{TP + FN} \qquad \text{FPR} = \frac{FP}{FP + TN}$$
The ROC curve for our classifier looks like:
util.draw_roc_curve(X_train, y_train)
- If we care about TPR and FPR equally, the best threshold is the one whose point is closest to the top left corner in the plot above.
Why? The top left corner is where $TPR = 1$ and $FPR = 0$, and we want $TPR$ to be high and $FPR$ to be low.
- A common metric for the quality of a binary classifier is the area under the curve (AUC) for the ROC curve.
Larger values are better!
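- Here's a sketch of computing the ROC curve's ingredients, and its AUC, directly with sklearn.metrics:
from sklearn.metrics import roc_curve, roc_auc_score
probs = model_logistic_multiple.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, probs)
roc_auc_score(y_train, probs)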
Question 🤔 (Answer at practicaldsc.org/q)
What questions do you have about thresholds and logistic regression?
Linear separability¶
Feature space¶
- Suppose we're using $d$ features as inputs to our classifier. Consider a visualization of the features in $d$-dimensional space.
- Example: $d = 1$.
util.show_one_feature_plot_in_1D(X_train, y_train, thres=False)
- Example: $d = 2$.
util.create_base_scatter(X_train, y_train)
- Note that in both plots above, there are orange points mixed in with the blue points!
Linear separability¶
- A dataset is linearly separable if a line, plane, or hyperplane can be drawn in $d$-dimensional space that perfectly separates the two classes.
- Example: $d = 1$.
util.lin_sep_1D()
util.non_lin_sep_1D()
- Example: $d = 2$.
util.lin_sep_2D()
util.non_lin_sep_2D()
- Why is the dataset below not linearly separable?
util.bad_example_1D()
Linear separability and decision boundaries¶
- By definition, if a dataset is linearly separable, then there exists a linear decision boundary that achieves 100% training accuracy.
util.lin_sep_1D()
- Above, any value of $c$ in $(120, 150)$ would make the decision boundary $$\text{Glucose} = c$$
achieve 100% training accuracy.
- Question: How do we find this decision boundary?
Logistic regression and linear separability¶
- Logistic regression, without regularization, fails to converge on linearly separable data!
- Let's re-draw the plot below, but with diabetes status drawn on the $y$-axis.
util.lin_sep_1D()
- Why would the optimal $w_1^*$ below tend to $\infty$?
See the annotated slides for more details.
util.lin_sep_1D_elevated()
- To prevent this case, logistic regression should generally be regularized.
This is exactly why sklearn regularizes logistic regression by default.
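- To see this effect, here's a sketch on a tiny, made-up, linearly separable 1D dataset (not the diabetes data): as we weaken the regularization by increasing C, the fit coefficient keeps growing.
# A hypothetical, linearly separable 1D dataset: all 0s are left of all 1s.
X_toy = np.array([[1.], [2.], [3.], [5.], [6.], [7.]])
y_toy = np.array([0, 0, 0, 1, 1, 1])
for C in [1, 100, 10_000]:
    m = LogisticRegression(C=C, max_iter=10_000).fit(X_toy, y_toy)
    print(f'C = {C}: w1* = {m.coef_[0][0]:.2f}')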
Logistic regression for multiclass classification¶
From binary to multiclass classification¶
- In binary classification, there are only two possible classes, typically either 0 or 1.
- In multiclass classification, there can be any finite number of classes, or labels. They need not be numbers, either.
- Important: Let $C$ be the set of possible classes for our classification problem, and let $|C|$ be the number of classes total.
Loading the data 🐧¶
import seaborn as sns
penguins = sns.load_dataset('penguins').dropna().reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(penguins[['bill_length_mm', 'body_mass_g']],
penguins['species'],
random_state=26)
display(X_train, y_train)
| | bill_length_mm | body_mass_g |
|---|---|---|
| 93 | 43.2 | 4100.0 |
| 103 | 43.2 | 4775.0 |
| 274 | 46.2 | 5300.0 |
| ... | ... | ... |
| 262 | 45.2 | 5300.0 |
| 318 | 53.4 | 5500.0 |
| 309 | 46.9 | 4875.0 |
249 rows × 2 columns
93     Adelie
103    Adelie
274    Gentoo
...
262    Gentoo
318    Gentoo
309    Gentoo
Name: species, Length: 249, dtype: object
- As we did two lectures ago, we'll aim to predict the 'species' of a penguin given their 'bill_length_mm' and 'body_mass_g'.
util.penguin_scatter_2d(X_train, y_train)
Recap: $k$-nearest neighbors¶
- Let's fit a $k$-NN classifier with $k=5$ to the training data.
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)
util.penguin_decision_boundary(model_knn, X_train, y_train, title="k-NN Decision Boundary when k = 5")
- Notice the vastly different scales of the features! What happens if we standardize?
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
model_knn_standardized = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
model_knn_standardized.fit(X_train, y_train)
util.penguin_decision_boundary(model_knn_standardized, X_train, y_train, title="k-NN Decision Boundary when k = 5 and with Standardization")
Recap: Decision trees¶
- Let's fit a decision tree classifier with a maximum depth of 3 to the training data.
from sklearn.tree import DecisionTreeClassifier
model_tree = DecisionTreeClassifier(max_depth=3)
model_tree.fit(X_train, y_train)
util.penguin_decision_boundary(model_tree, X_train, y_train, title="Decision Boundary for a Decision Tree of Depth 3")
What about logistic regression?¶
- As we've seen, in binary classification, logistic regression models the probability of belonging to class 1, given a feature vector $\vec{x}_i$:
$$P(y_i = 1 | \vec{x}_i) = \sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}_i)\right)$$
- In logistic regression, $C = \{0, 1\}$. But, in our current penguin classification problem, $C = \{ \text{Adelie}, \text{Chinstrap}, \text{Gentoo} \}$, so we can't use logistic regression directly.
- One idea: one-vs-rest. Fit $|C| = 3$ separate logistic regression models, one per class, and predict the class that has the highest probability.
- Penguin is Adelie vs. penguin is not Adelie.
- Penguin is Chinstrap vs. penguin is not Chinstrap.
- Penguin is Gentoo vs. penguin is not Gentoo.
- Another idea: one-vs-one. Fit ${3 \choose 2} = 3$ separate logistic regression models, one per pair of classes, and predict the class that wins the most pairwise matchups; both schemes are sketched in code after this list.
- Penguin is Adelie vs. penguin is Chinstrap.
- Penguin is Adelie vs. penguin is Gentoo.
- Penguin is Chinstrap vs. penguin is Gentoo.
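- For reference, sklearn can fit both of these schemes for us by wrapping a binary classifier. A minimal sketch (the max_iter bump just avoids convergence warnings on these unscaled features):
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
model_ovr = OneVsRestClassifier(LogisticRegression(max_iter=10_000)).fit(X_train, y_train)
model_ovo = OneVsOneClassifier(LogisticRegression(max_iter=10_000)).fit(X_train, y_train)
model_ovr.score(X_test, y_test), model_ovo.score(X_test, y_test)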
- Let's try something slightly different than what's listed above.
Multinomial logistic regression¶
- Multinomial logistic regression, also known as softmax regression, models the probability of belonging to any class, given a feature vector $\vec x_i$.
Think of it as a generalization of logistic regression.
- Instead of a single parameter vector $\vec{w}$, there are $|C|$ parameter vectors, one per class!
- Multinomial logistic regression models the probability of each class directly, and then predicts the most likely class.
Aside: The softmax function¶
- The softmax function is a generalization of the logistic function to multiple dimensions.
Suppose $\vec z \in \mathbb{R}^d$. Then, the softmax of $\vec z$ is defined element-wise as follows:
$$\sigma(\vec z)_i = \frac{e^{z_i}}{\sum_{j = 1}^d e^{z_j}}$$
- For example, suppose $\vec{z} = \begin{bmatrix} -5 \\ 2 \\ 4 \end{bmatrix}$. Then:
$$\sigma(\vec z) = \frac{1}{e^{-5} + e^{2} + e^{4}} \begin{bmatrix} e^{-5} \\ e^{2} \\ e^{4} \end{bmatrix} \approx \begin{bmatrix} 0.0001 \\ 0.12 \\ 0.88 \end{bmatrix}$$
- Why is it defined this way? It maps a vector of real numbers to a vector of probabilities!
Note that the denominator, $\sum_{j=1}^d e^{z_j}$, normalizes the $e^{z_i}$ terms so that the results sum to 1.
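- Here's a quick check of the example above, computed with numpy and with scipy.special.softmax (which implements the same formula):
from scipy.special import softmax
z = np.array([-5, 2, 4])
np.exp(z) / np.exp(z).sum(), softmax(z)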
Multinomial logistic regression, i.e. softmax regression, trains $|C|$ linear models of the form $\boxed{\vec w_k \cdot \text{Aug}(\vec x_i)}$, one per class $k$, and feeds the output of each through the softmax function, so the results can be interpreted as probabilities.
$$p_j = P(y_i = j | \vec{x}_i) = \frac{e^{\vec{w}_j \cdot \text{Aug}(\vec{x}_i)}}{\sum_{k \in C} e^{\vec w_k \cdot \text{Aug}(\vec x_i)}}$$
The $|C|$ optimal parameter vectors, $\vec w_\text{Adelie}^*$, $\vec w_\text{Chinstrap}^*$, and $\vec w_\text{Gentoo}^*$ in our case, are chosen to minimize mean cross-entropy loss, just like before!
Multinomial logistic regression in sklearn¶
- The LogisticRegression class supports multinomial logistic regression.
model_log = LogisticRegression(multi_class='multinomial')
model_log.fit(X_train, y_train)
LogisticRegression(multi_class='multinomial')
- In total, the fit model has $3 \times 2 = 6$ coefficients and $3 \times 1 = 3$ intercepts.
model_log.coef_
array([[-0.85, 0. ], [ 0.84, -0.01], [ 0.02, 0.01]])
model_log.intercept_
array([ 36.4 , -10.96, -25.43])
model_log.classes_
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
- When calling model_log.predict_proba, we get back an array of three predicted probabilities.
model_log.predict_proba(pd.DataFrame([{
'bill_length_mm': 45,
'body_mass_g': 4500
}]))
array([[0.14, 0.01, 0.85]])
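- And predict returns whichever class has the largest predicted probability; a quick sketch verifying this for the same penguin:
new_penguin = pd.DataFrame([{
    'bill_length_mm': 45,
    'body_mass_g': 4500
}])
# Both of these should be 'Gentoo', the class with probability 0.85 above.
model_log.predict(new_penguin)[0], model_log.classes_[np.argmax(model_log.predict_proba(new_penguin))]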
What does this model look like?¶
util.penguin_decision_boundary(model_log, X_train, y_train, title="Softmax Regression Decision Boundary")
Neural networks 🧠¶
- Softmax regression is an example of a neural network.
Our brains are made up of neurons connected by "links", called synapses. The model diagram below loosely resembles this structure, which is why the model is called a neural network.
- Each of the 9 diagonal lines connecting a value in the input layer with a value in the output layer represents a parameter, $w^*$.
model_log.intercept_
array([ 36.4 , -10.96, -25.43])
model_log.coef_
array([[-0.85, 0. ], [ 0.84, -0.01], [ 0.02, 0.01]])
- We can use the nine parameter values above to reproduce the network's calculations ourselves.
# Same values as shown by model_log.predict_proba, two slides ago!
softmax = lambda z: np.e ** z / sum(np.e ** z)
softmax(model_log.intercept_.reshape(-1, 1) + model_log.coef_ @ np.array([[45], [4500]]))
array([[0.14], [0.01], [0.85]])