# Run this cell to get everything set up.
from lec_utils import *
import lec25_util as util
diabetes = pd.read_csv('data/diabetes.csv')
from sklearn.model_selection import train_test_split
diabetes = diabetes[(diabetes['Glucose'] > 0) & (diabetes['BMI'] > 0)]
X_train, X_test, y_train, y_test = (
train_test_split(diabetes[['Glucose', 'BMI']], diabetes['Outcome'], random_state=11)
)
from ipywidgets import interact
import warnings
warnings.simplefilter('ignore')
Announcements 📣¶
- Homework 10 is out, and is due on Monday, December 2nd.
- Homework 11 will be out soon, and will be due on Thursday, December 5th. It'll be fully autograded, with no hidden tests, and have just three questions.
- The Portfolio Homework is due on Saturday, December 7th – no slip days allowed!
- We'll release checkpoint grades today; thanks for getting those in on time.
- Consider entering the Big Ten Data Viz Championship. Submissions are due on January 15th. Read more here. Help Michigan defend its title!
- Some suggested courses for next semester can be found in #306 on Ed.
- And please help spread the word about 398!
Agenda¶
- Logistic regression.
- Recap.
- Choosing a threshold.
- Linear separability.
- Multiclass classification.
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Recap: Logistic regression¶
Logistic regression¶
- Logistic regression is a linear classification technique that builds upon linear regression.
- It models the probability of belonging to class 1, given a feature vector:
$$P(y = 1 | \vec{x}) = \sigma \left( \vec{w} \cdot \text{Aug}(\vec{x}) \right) = \frac{1}{1 + e^{-\vec{w} \cdot \text{Aug}(\vec{x})}}$$
- Suppose we train a logistic regression model to predict the probability a patient has diabetes ($y = 1$) given their 'Glucose' and 'BMI'. The optimal parameters we found were $\vec{w}^* = \begin{bmatrix} -8.1697 & 0.0394 & 0.0802 \end{bmatrix}^T$. To predict probabilities, then, we use:
$$P(y = 1 | \vec{x}) = \sigma \left( -8.1697 + 0.0394 \cdot \text{Glucose} + 0.0802 \cdot \text{BMI} \right)$$
- For instance, if someone has a 'Glucose' level of 150 and 'BMI' of 25, their predicted probability of diabetes is 43.67%, since $-8.1697 + 0.0394 \cdot 150 + 0.0802 \cdot 25 = -0.2547$ and $\sigma(-0.2547) \approx 0.4367$:
# The sigmoid (logistic) function, sigma(t) = 1 / (1 + e^(-t)).
sigma = lambda t: 1 / (1 + np.e ** (-t))
sigma(-0.2547)
0.4366670090148261
- To find the optimal parameters $\vec{w}^*$, we minimize mean cross-entropy loss:
$$R(\vec{w}) = -\frac{1}{n} \sum_{i = 1}^n \left( y_i \log \left( \sigma(\vec{w} \cdot \text{Aug}(\vec{x}_i)) \right) + (1 - y_i) \log \left( 1 - \sigma(\vec{w} \cdot \text{Aug}(\vec{x}_i)) \right) \right)$$
There's no closed-form solution for $\vec{w}^*$, so we use some numerical method (or, rather, sklearn does).
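- As a rough sketch (not from lecture), we can minimize mean cross-entropy loss directly with a generic numerical optimizer. This assumes scipy is installed, clips probabilities to avoid $\log(0)$, and is not the exact procedure sklearn uses internally; since sklearn's LogisticRegression also applies L2 regularization by default, its parameters won't match these exactly.
# Not from lecture: numerically minimizing mean cross-entropy loss with scipy.
from scipy.optimize import minimize
# Aug(x): prepend a column of 1s for the intercept term.
X_aug = np.column_stack([np.ones(len(X_train)), X_train])
def mean_cross_entropy(w):
    p = 1 / (1 + np.exp(-X_aug @ w))
    p = np.clip(p, 1e-9, 1 - 1e-9)  # avoid log(0)
    return -np.mean(y_train * np.log(p) + (1 - y_train) * np.log(1 - p))
# Start at the zero vector and let the optimizer search for w*.
minimize(mean_cross_entropy, x0=np.zeros(X_aug.shape[1])).x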
LogisticRegression in sklearn¶
- To illustrate, let's re-fit a model to predict diabetes from 'Glucose' and 'BMI' in sklearn.
from sklearn.linear_model import LogisticRegression
model_logistic_multiple = LogisticRegression()
model_logistic_multiple.fit(X_train, y_train)
LogisticRegression()
- By default, the predict method of a fit LogisticRegression model predicts a class; it applies a threshold $T = 0.5$ to the predicted probability.
model_logistic_multiple.predict(pd.DataFrame([{
'Glucose': 150,
'BMI': 25,
}]))
array([0])
- We can access the predicted probabilities using the predict_proba method.
model_logistic_multiple.predict_proba(pd.DataFrame([{
'Glucose': 150,
'BMI': 25,
}]))
array([[0.56, 0.44]])
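- As a side note (not from lecture), the columns of predict_proba line up with the model's classes_ attribute, so the second column is $P(y = 1 | \vec{x})$, the probability of diabetes.
# Not from lecture: classes_ gives the order of the predict_proba columns,
# so [:, 1] extracts P(y = 1 | x).
model_logistic_multiple.classes_, model_logistic_multiple.predict_proba(pd.DataFrame([{
    'Glucose': 150,
    'BMI': 25,
}]))[:, 1]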
The decision boundary in the feature space¶
- After choosing $T = 0.5$, what does the resulting decision boundary look like, in a $d = 2$ dimensional plot?
util.show_decision_boundary(model_logistic_multiple, X_train, y_train, title='Decision Boundary when Using Both Glucose and BMI \n and T = 0.5 (the default)')
- Note that unlike the decision boundaries for $k$-Nearest Neighbors and decision trees, this decision boundary is linear. Specifically, it is the line:
$$\sigma \left( w_0^* + w_1^* \cdot \text{Glucose} + w_2^* \cdot \text{BMI} \right) = 0.5$$
- Important: Since $\sigma(0) = 0.5$, we can write the above as:
$$w_0^* + w_1^* \cdot \text{Glucose} + w_2^* \cdot \text{BMI} = 0$$
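- As a rough sketch (not from lecture), we can recover this line from the fitted model's intercept_ and coef_ attributes; the grid of 'Glucose' values below is arbitrary.
# Not from lecture: solve w0 + w1 * Glucose + w2 * BMI = 0 for BMI.
w0 = model_logistic_multiple.intercept_[0]
w1, w2 = model_logistic_multiple.coef_[0]
glucose_grid = np.linspace(X_train['Glucose'].min(), X_train['Glucose'].max(), 100)
boundary_bmi = -(w0 + w1 * glucose_grid) / w2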
Question 🤔 (Answer at practicaldsc.org/q)
Which expression describes the odds ratio, $$\frac{P(y = 1 | \vec{x})}{P(y = 0 | \vec{x})}$$
in the logistic regression model?
- A. $\vec{w} \cdot \text{Aug}(\vec{x})$
- B. $-\vec{w} \cdot \text{Aug}(\vec{x})$
- C. $e^{\vec{w} \cdot \text{Aug}(\vec{x})}$
- D. $\sigma(\vec{w} \cdot \text{Aug}(\vec{x}))$
- E. None of the above.
Question 🤔 (Answer at practicaldsc.org/q)
Which expression describes $P(y = \mathbf{0} | \vec{x})$ in the logistic regression model?
- A. $\sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}) \right)$
- B. $-\sigma\left(\vec{w} \cdot \text{Aug}(\vec{x}) \right)$
- C. $\sigma\left(- \vec{w} \cdot \text{Aug}(\vec{x}) \right)$
- D. $1 - \log \left( 1 + e^{\vec{w} \cdot \text{Aug}(\vec{x})} \right)$
- E. $1 + \log \left( 1 + e^{- \vec{w} \cdot \text{Aug}(\vec{x})} \right)$
Choosing a threshold¶
Thresholding¶
- As we've seen, in order to classify $\vec{x}$ as either yes ($y = 1$) or no ($y = 0$), we apply a threshold $T$ to the predicted probability. For example, with any threshold above 0.55 (such as $T = 0.6$), a predicted probability of 0.55 is classified as no diabetes (class 0).
More generally, if we pick a threshold of $T$, then any feature vector $\vec{x}$ such that:
$$\sigma(\vec{w}^* \cdot \text{Aug}(\vec{x})) \geq T$$
is classified as class 1.
- Question: How do we choose the "right" threshold?
sklearn's default threshold of $T = 0.5$ is not guaranteed to yield the highest accuracy!
Remember, to find $\vec{w}^*$, we minimized mean cross-entropy loss (that is, we didn't "maximize" accuracy), and mean cross-entropy loss doesn't involve our threshold.
Choosing a custom threshold¶
- If we want to use a custom threshold, we'll need to implement the logic ourselves.
def predict_thresholded(X, T):
'''Calls model_logistic_multiple.predict_proba.
For each P(y = 1 | x), returns 1 if >= T and 0 if < T.'''
probs = model_logistic_multiple.predict_proba(X)[:, 1]
return (probs >= T).astype(int)
- Now, we can choose any threshold we'd like, and compute the accuracy of the resulting predictions.
predict_thresholded([[150, 25]], 0.5)
array([0])
predict_thresholded([[150, 25]], 0.4)
array([1])
predict_thresholded(X_train, 0.4)
array([1, 1, 1, ..., 0, 0, 0])
# Training accuracy for the threshold T = 0.4.
(predict_thresholded(X_train, 0.4) == y_train).mean()
0.7659574468085106
Accuracy vs. threshold¶
- Accuracy is defined as:
$$\text{accuracy} = \frac{\text{# predictions that are correct}}{\text{# predictions}}$$
- How does the model's training accuracy change as the threshold changes?
Note that we'd see a similar trend with test accuracy, too.
util.plot_vs_threshold(X_train, y_train, 'Accuracy')
- The threshold with the best training accuracy (among the thresholds we tried) is $T = 0.485$, which has a training accuracy of 77.8%.
- Remember that 65% of people in the dataset don't have diabetes, so we can achieve a 65% training accuracy just by always predicting "no diabetes"! This means that a good model's accuracy should be much higher than 65%.
pd.Series(y_train).value_counts(normalize=True)
Outcome
0    0.65
1    0.35
Name: proportion, dtype: float64
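- As a rough sketch (not from lecture), here's how one might search for the best-accuracy threshold using the predict_thresholded function defined earlier; the grid of thresholds below is an assumption, so the winning value may differ slightly from $T = 0.485$.
# Not from lecture: sweep a grid of thresholds and track training accuracy.
thresholds = np.arange(0.01, 1.0, 0.005)
accuracies = [(predict_thresholded(X_train, T) == y_train).mean() for T in thresholds]
thresholds[np.argmax(accuracies)], max(accuracies)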
Metrics for binary classification¶
- A few lectures ago, we introduced other metrics for measuring the quality of a binary classifier's predictions.
- A binary classifier's confusion matrix displays its number of true positives ($TP$), false positives ($FP$), true negatives ($TN$), and false negatives ($FN$).
util.show_confusion(X_train, y_train, T=0.5)
- Remember, we're predicting whether or not patients have diabetes. Which is worse: a false positive or a false negative?
- Observe how the values in the confusion matrix change as the threshold changes!
interact(lambda T: util.show_confusion(X_train, y_train, T), T=(0, 1, 0.01));
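- As a rough sketch (not from lecture), the same counts can be computed with sklearn.metrics.confusion_matrix; note that sklearn's layout puts true negatives in the top-left.
# Not from lecture: confusion matrix for the threshold T = 0.5.
# For labels [0, 1], sklearn's layout is:
# [[TN, FP],
#  [FN, TP]]
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, predict_thresholded(X_train, 0.5))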
Precision vs. threshold¶
Precision is defined as:
$$\text{precision} = \frac{TP}{\text{# predicted positive}} = \frac{TP}{TP + FP}$$
Here, a false positive ($FP$) is when we predict that someone has diabetes when they do not.
- How does the model's training precision change as the threshold changes?
util.plot_vs_threshold(X_train, y_train, 'Precision')
- If the "bar" is higher to predict 1, then we will have fewer positives in general, and thus fewer false positives.
- As the threshold increases ⬆️, the denominator in $\text{precision} = \frac{TP}{TP + FP}$ will decrease, and so precision tends to increase ⬆️.
There are some cases where a slightly higher threshold led to a slightly lower precision; why?
Recall vs. threshold¶
Recall is defined as:
$$\text{recall} = \frac{TP}{\text{# actually positive}} = \frac{TP}{TP + FN}$$
Here, a false negative ($FN$) is when we predict that someone does not have diabetes, when they really do.
- How does the model's training recall change as the threshold changes?
util.plot_vs_threshold(X_train, y_train, 'Recall')
- Note that the denominator in $\text{recall} = \frac{TP}{\text{# actually positive}}$ is constant. As the threshold increases ⬆️:
- true positives get converted to false negatives, so
- the numerator of recall ($TP$) decreases, and so
- recall decreases ⬇️.
Precision vs. recall¶
- We can visualize how precision and recall vary together.
util.pr_curve(X_train, y_train)
- The curve above is called a PR curve.
- Question: Given the information above, what threshold would you choose?
- Answer: The threshold whose point is closest to the top right corner of the plot above.
Why? The top right corner is where precision = 1 and recall = 1, and we want both to be high.
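- As a rough sketch (not from lecture), sklearn.metrics.precision_recall_curve computes the coordinates of a PR curve directly from predicted probabilities, which you could use to re-create a plot like the one above.
# Not from lecture: PR curve coordinates from sklearn.
from sklearn.metrics import precision_recall_curve
probs_train = model_logistic_multiple.predict_proba(X_train)[:, 1]
precision, recall, pr_thresholds = precision_recall_curve(y_train, probs_train)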
ROC curves¶
- A more popular variant of the PR curve is the ROC curve.
ROC stands for "receiver operating characteristic."
See here for a good discussion on the differences between PR curves and ROC curves.
- A ROC curve plots true positive rate (TPR) vs. false positive rate (FPR) for all possible thresholds, where:
$$\text{TPR} = \frac{TP}{TP + FN} \qquad \text{FPR} = \frac{FP}{FP + TN}$$
Note that TPR is the same as recall.
- The ROC curve for our classifier looks like:
util.draw_roc_curve(X_train, y_train)
- If we care about TPR and FPR equally, the best threshold is the one whose point is closest to the top left corner in the plot above.
Why? The top left corner is where $TPR = 1$ and $FPR = 0$, and we want $TPR$ to be high and $FPR$ to be low.
- A common metric for the quality of a binary classifier is the area under curve (AUC) for the ROC curve.
Larger values are better!
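- As a rough sketch (not from lecture), both the ROC curve's coordinates and its AUC are available in sklearn.metrics.
# Not from lecture: ROC curve coordinates and AUC from sklearn.
from sklearn.metrics import roc_curve, roc_auc_score
probs_train = model_logistic_multiple.predict_proba(X_train)[:, 1]
fpr, tpr, roc_thresholds = roc_curve(y_train, probs_train)
roc_auc_score(y_train, probs_train)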
Question 🤔 (Answer at practicaldsc.org/q)
What questions do you have about thresholds and logistic regression?
Linear separability¶
Feature space¶
- Suppose we're using $d$ features as inputs to our classifier. Consider a visualization of the features in $d$-dimensional space.
- Example: $d = 1$.
util.show_one_feature_plot_in_1D(X_train, y_train, thres=False)
- Example: $d = 2$.
util.make_two_feature_scatter(X_train, y_train)
- Note that in both plots above, there are orange points mixed in with the blue points!
Linear separability¶
- A dataset is linearly separable if a line, plane, or hyperplane can be drawn in $d$-dimensional space that perfectly separates the two classes.
- Example: $d = 1$.
util.lin_sep_1D()
util.non_lin_sep_1D()
- Example: $d = 2$.
util.lin_sep_2D()
util.non_lin_sep_2D()
- Why is the dataset below not linearly separable?
util.bad_example_1D()
Linear separability and decision boundaries¶
- By definition, if a dataset is linearly separable, then there exists a linear decision boundary that achieves 100% training accuracy.
util.lin_sep_1D()
- Above, any value of $c$ in $(120, 150)$ would make the decision boundary $$\text{Glucose} = c$$
achieve 100% training accuracy.
- Question: How do we find this decision boundary?
Logistic regression and linear separability¶
- Logistic regression, without regularization, fails to converge on linearly separable data!
- Let's re-draw the plot below, but with diabetes status drawn on the $y$-axis.
util.lin_sep_1D()
- Why would the optimal $w_1^*$ below tend to $\infty$?
See the annotated slides for more details.
util.lin_sep_1D_elevated()
- To prevent this, logistic regression should always be regularized.
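- As a rough sketch (not from lecture), here's a toy, linearly separable 1D dataset (the values are made up) illustrating the issue: with regularization turned off, the fitted coefficient grows far larger than with sklearn's default L2 penalty.
# Not from lecture: a made-up, linearly separable 1D dataset.
X_toy = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
y_toy = np.array([0, 0, 0, 1, 1, 1])
# penalty=None turns off regularization (sklearn >= 1.2); on separable data the
# optimizer keeps pushing the coefficient larger, so it only stops at max_iter
# or its gradient tolerance.
unregularized = LogisticRegression(penalty=None, max_iter=10_000).fit(X_toy, y_toy)
regularized = LogisticRegression().fit(X_toy, y_toy)
unregularized.coef_, regularized.coef_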
Multiclass classification¶
From binary to multiclass classification¶
- In binary classification, there are only two possible classes, typically either 0 or 1.
- In multiclass classification, there can be any finite number of classes, or labels. They need not be numbers, either.
Return of the penguins!¶
To illustrate multiclass classification, we'll revisit the Palmer Penguins dataset we saw earlier in the semester.
Loading the data¶
penguins = sns.load_dataset('penguins').dropna().reset_index(drop=True)
penguins
| | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
|---|---|---|---|---|---|---|---|
| 0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
| 1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
| 2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 330 | Gentoo | Biscoe | 50.4 | 15.7 | 222.0 | 5750.0 | Male |
| 331 | Gentoo | Biscoe | 45.2 | 14.8 | 212.0 | 5200.0 | Female |
| 332 | Gentoo | Biscoe | 49.9 | 16.1 | 213.0 | 5400.0 | Male |
333 rows × 7 columns
- Here, each row corresponds to a single penguin.
- There are three 'species' of penguin: Adelie, Chinstrap, and Gentoo.
penguins['species'].value_counts(normalize=True)
species
Adelie       0.44
Gentoo       0.36
Chinstrap    0.20
Name: proportion, dtype: float64
- Question: What accuracy would the best "constant" classifier achieve on this data?
Visualizing the data¶
- Visually, it seems that the 'species' of penguins are mostly separated based on their physical characteristics ('bill_depth_mm', 'bill_length_mm', and 'body_mass_g').
util.penguin_scatter_3d(penguins)
- For simplicity, we'll work with just two features: 'bill_length_mm' and 'body_mass_g'.
fig = util.penguin_scatter_2d(penguins)
fig
- But first, a train-test split.
X_train, X_test, y_train, y_test = train_test_split(penguins[['bill_length_mm', 'body_mass_g']],
penguins['species'],
random_state=26)
Classifier 1: $k$-Nearest Neighbors 🏡🏠¶
- Recall, suppose we're given a new penguin, $\vec{x}_\text{new} = \begin{bmatrix} \text{Bill Length}_\text{new} \\ \text{Body Mass}_\text{new} \end{bmatrix}$.
- The $k$-Nearest Neighbors classifier ($k$-NN for short) classifies $\vec{x}_\text{new}$ by:
- Finding the $k$ closest points in the training set to $\vec{x}_\text{new}$.
- Predicting that $\vec{x}_\text{new}$ belongs to the most common class among those $k$ closest points.
- This approach doesn't depend on the number of classes, meaning that we can directly use a $k$-NN classifier for multiclass problems.
fig
KNeighborsClassifier in sklearn¶
from sklearn.neighbors import KNeighborsClassifier
- Let's use the default of $k = 5$.
Of course, in practice, we should cross-validate.
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)
KNeighborsClassifier()
- There are now three colors in the decision boundaries.
util.penguin_decision_boundary(model_knn, X_train, y_train, title="Decision Boundary when k = 5")
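- For example, we can classify a hypothetical new penguin; the feature values below are made up for illustration (not from lecture).
# Not from lecture: the feature values here are made up for illustration.
model_knn.predict(pd.DataFrame([{
    'bill_length_mm': 45.0,
    'body_mass_g': 4500.0,
}]))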
Classifier 2: Decision trees 🎄¶
- Recall, suppose we're given a new penguin, $\vec{x}_\text{new} = \begin{bmatrix} \text{Bill Length}_\text{new} \\ \text{Body Mass}_\text{new} \end{bmatrix}$.
- The decision tree classifier classifies $\vec{x}_\text{new}$ by:
- Asking a series of yes/no questions about $\text{Bill Length}_\text{new}$ and $\text{Body Mass}_\text{new}$, e.g.:
    Is $\text{Bill Length}_\text{new} \leq 43.25$?
    If so, is $\text{Bill Length}_\text{new} \leq 41.55$?
    If not, is $\text{Body Mass}_\text{new} \leq 4125$?
    $\vdots$
- Once it runs out of questions to ask, it predicts that $\vec{x}_\text{new}$ belongs to the **most common class** among training set points that had the same answers as $\vec{x}_\text{new}$.
- This approach also doesn't depend on the number of classes, meaning that we can directly use a decision tree classifier for multiclass problems.
DecisionTreeClassifier in sklearn¶
from sklearn.tree import DecisionTreeClassifier
- Let's fix max_depth=3 so that we can visualize the resulting tree.
Again, in practice, we should cross-validate.
model_tree = DecisionTreeClassifier(max_depth=3)
model_tree.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=3)
- Note that the colors below don't directly match the colors in the scatter plot earlier.
from sklearn.tree import plot_tree
plt.figure(figsize=(13, 5))
plot_tree(model_tree, feature_names=X_train.columns,
class_names=['Adelie', 'Chinstrap', 'Gentoo'],
filled=True, fontsize=10, impurity=False);
util.penguin_decision_boundary(model_tree, X_train, y_train, title="Decision Boundary for a Decision Tree\nwith Depth = 3")
Classifier 3: Logistic regression 📈¶
- Logistic regression models the probability of belonging to class 1, given a feature vector:
$$P(y = 1 | \vec{x}) = \sigma \left( \vec{w} \cdot \text{Aug}(\vec{x}) \right) = \frac{1}{1 + e^{-\vec{w} \cdot \text{Aug}(\vec{x})}}$$
- Our formulation of logistic regression only worked in the context of binary classification, i.e. when $y_i \in \{0, 1\}$.
- Can we still use logistic regression somehow, now that we have three classes?
"One vs. rest" logistic regression¶
- One approach: Build three separate logistic regression models, each of which treats the problem as binary.
    - One that predicts the probability of class Adelie, vs. not Adelie. Here, $y = 1$ means "Adelie" and $y = 0$ means "not Adelie".
    - One that predicts the probability of class Chinstrap, vs. not Chinstrap.
    - One that predicts the probability of class Gentoo, vs. not Gentoo.
- For a new penguin $\vec{x}_\text{new}$, compute all three probabilities, and predict the class with the highest predicted probability!
- This technique is called one-vs-rest.
from sklearn.multiclass import OneVsRestClassifier
model_logistic_ovr = OneVsRestClassifier(LogisticRegression())
model_logistic_ovr.fit(X_train, y_train)
OneVsRestClassifier(estimator=LogisticRegression())
util.penguin_decision_boundary(model_logistic_ovr, X_train, y_train, title="Decision Boundary for One-vs-Rest\nLogistic Regression")
- Note that the resulting decision boundaries are still linear!
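- As a rough sketch (not from lecture), we can inspect the three per-class probabilities the one-vs-rest model computes for a hypothetical new penguin (the feature values are made up); predict returns the class with the highest predicted probability.
# Not from lecture: the feature values here are made up for illustration.
new_penguin = pd.DataFrame([{
    'bill_length_mm': 45.0,
    'body_mass_g': 4500.0,
}])
model_logistic_ovr.classes_, model_logistic_ovr.predict_proba(new_penguin), model_logistic_ovr.predict(new_penguin)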