from lec_utils import *
import lec18_util as util
def show_cv_slides():
    src = "https://docs.google.com/presentation/d/e/2PACX-1vTydTrLDr-y4nxQu1OMsaoqO5EnPEISz2VYmM6pd83ke8YnnTBJlp40NfNLI1HMgoaKx6GBKXYE4UcA/embed?start=false&loop=false&delayms=60000&rm=minimal"
    display(IFrame(src, width=900, height=361))
Lecture 18¶
Generalization and Cross-Validation¶
EECS 398: Practical Data Science, Winter 2025¶
practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed
Agenda 📆¶
- Generalization 🔭.
- Hyperparameters and train-test splits 🎛️.
- Cross-validation.
For additional reading, take a look at mlu-explain.github.io, a site with interactive explanations for a lot of core machine learning topics, like:
- Linear Regression.
- The Bias-Variance Tradeoff.
- Train, Test, and Validation Sets.
- Cross-Validation.
- and other ideas we'll see later in the semester!
We've linked these articles in the Resources tab of the course website, too.
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Generalization 🔭¶
Motivation¶
- You and Billy are studying for an upcoming exam. You both decide to test your understanding by taking a practice exam.
Your logic: If you do well on the practice exam, you should do well on the real exam.
- You each take the practice exam once and look at the solutions afterwards.
- Your strategy: Memorize the answers to all practice exam questions, e.g. "Question 1: A; Question 2: C; Question 3: A."
- Billy's strategy: Learn high-level concepts from the solutions, e.g. "the TF-IDF of term $t$ in document $d$ is large when $t$ occurs often in $d$ but rarely overall."
- Who will do better on the practice exam? Who will probably do better on the real exam? 🧐

Evaluating the quality of a model¶
- So far, we've computed the MSE of our fit regression models on the data that we used to fit them, i.e. the training data.
This mean squared error is called the training MSE, or training error.
- We've said that Model A is better than Model B if Model A's MSE is lower than Model B's MSE.
- Remember, our training data is a sample from some population.
- Just because a model fits the training data well doesn't mean it will generalize and work well on similar, unseen samples from the same population!
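- As a concrete illustration (a minimal sketch, not one of the lecture's own examples), here's how we might compute a training MSE with sklearn. The tiny DataFrame sample below is made up purely for this sketch.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Hypothetical training data, just for illustration.
sample = pd.DataFrame({'x': [1, 2, 3, 4, 5], 'y': [2.1, 3.9, 6.2, 8.1, 9.8]})
model = LinearRegression()
model.fit(sample[['x']], sample['y'])                  # Fit on the training data.
# Training MSE: the error on the same data we used to fit the model.
mean_squared_error(sample['y'], model.predict(sample[['x']]))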
Overfitting and underfitting¶
- Let's collect two samples $\{(x_i, y_i)\}$ from the same population.
np.random.seed(23) # For reproducibility.
def sample_from_pop(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + (np.random.normal(0, 3, size=n))
    return pd.DataFrame({'x': x, 'y': y})
sample_1 = sample_from_pop()
sample_2 = sample_from_pop()
- For now, let's just look at Sample 1. The relationship between $x$ and $y$ is roughly cubic; that is, $y \approx x^3$.
Remember, in reality, you won't get to see the population distribution. If you could, there'd be no need to build a model!
px.scatter(sample_1, x='x', y='y', title='Sample 1')
Polynomial regression¶
- Let's fit three polynomial models on Sample 1: degree 1, degree 3, and degree 25.
We'll use the PolynomialFeatures transformer, which was part of one of our Pipelines from last class.
from sklearn.preprocessing import PolynomialFeatures
# fit_transform fits and transforms the same input.
# We tell it not to add a column of 1s, because
# LinearRegression() does this automatically later on.
d2 = PolynomialFeatures(3, include_bias=False)
d2.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))
array([[ 1.,  1.,  1.],
       [ 2.,  4.,  8.],
       [ 3.,  9., 27.],
       [ 4., 16., 64.],
       [-2.,  4., -8.]])
- Below, we look at our three models' predictions on Sample 1, which they were trained on.
# Look at the definition of train_and_plot in lec18_util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25], data_name='Sample 1')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')
- The degree 25 polynomial has the lowest MSE on Sample 1.
- How do the same fit polynomials look on Sample 2?
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')
- The degree 3 polynomial has the lowest MSE on Sample 2.
- Note that we didn't get to see Sample 2 when fitting our models!
- As such, it seems that the degree 3 polynomial generalizes better to unseen data than the degree 25 polynomial does.
- What if we fit a degree 1, degree 3, and degree 25 polynomial on Sample 2 as well?
fig = util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])
fig
- Key idea: Degree 25 polynomials seem to vary more when trained on different samples than degree 3 and 1 polynomials do.
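- To put numbers to the pictures above, here's a rough sketch (separate from the plotting helpers in lec18_util) that fits each degree on Sample 1 and computes its MSE on both samples. Expect the degree 25 model to have the lowest MSE on Sample 1 but not on Sample 2, consistent with the plots above.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
for d in [1, 3, 25]:
    pl = make_pipeline(PolynomialFeatures(d, include_bias=False), LinearRegression())
    pl.fit(sample_1[['x']], sample_1['y'])
    mse_1 = mean_squared_error(sample_1['y'], pl.predict(sample_1[['x']]))
    mse_2 = mean_squared_error(sample_2['y'], pl.predict(sample_2[['x']]))
    print(f'Degree {d}: MSE on Sample 1 = {mse_1:.2f}, MSE on Sample 2 = {mse_2:.2f}')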
Bias and variance¶
- The training data we have access to is a sample from the population. We are concerned with our model's ability to generalize and work well on different datasets drawn from the same population.
- Suppose we fit a model $H^*$ (e.g. a degree 3 polynomial) on several different datasets from a population.
- There are three main sources of error in our model's predictions:
- Model bias: Two slides from now.
- Model variance: Next slide.
- Observation error: The error due to the random noise in the process we are trying to model (e.g. measurement error). We can't control this, without collecting more data!
Model variance¶
- Model variance: The variance of a model's predictions, across all datasets.
- In other words, for a given $\vec x_i$, how much does $H^*(\vec x_i)$ vary across all datasets?
fig
- Low model variance is good! ✅
- High model variance is a sign of overfitting, i.e. that our model is too complicated and is prone to fitting to the noise in our training data.
Model bias¶
- Model bias: The averaged deviation between a predicted value and an actual value, across all datasets.
- In other words, for a given $\vec x_i$, how far is $H^*(\vec x_i)$ from the true $y_i$, on average?
fig = util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25], data=True)
fig
- Low bias is good! ✅
- High bias is a sign of underfitting, i.e. that our model is too basic to capture the relationship between our features and response.

- Here, suppose:
- The red bulls-eye represents your true weight and height 🧍.
- The dark blue darts represent predictions of your weight and height using different models that were fit using different samples drawn from the same population.
- We'd like our models to be in the top left, but in practice that's hard to achieve!
Risk vs. empirical risk¶
- Since Lecture 11, we've minimized empirical risk to find optimal model parameters $\vec{w}^*$: $$R_\text{sq}(\vec w) = \frac{1}{n} \sum_{i = 1}^n \left( y_i - H(\vec x_i) \right)^2$$
- Key idea: A model that works well on past data should work well on future data, if future data looks like past data.
- What we really want is for the expected loss on a new data point $(\vec x_{\text{new}}, y_{\text{new}})$, drawn from the same population as the training set, to be small.
That is, we want to minimize risk: $$\text{risk} = \mathbb{E}\left[ \left( y_{\text{new}} - H(\vec x_{\text{new}}) \right)^2 \right]$$
- In general, we don't know the entire population distribution of $x$s and $y$s, so we can't compute risk exactly.
That's why we compute empirical risk!
The bias-variance decomposition¶
- Risk can be decomposed as follows: $$\text{risk} = \text{model variance} + (\text{model bias})^2 + \text{observation variance}$$
Remember, this expectation $\mathbb{E}$ is over the entire population of $x$s and $y$s. In real life, we don't know what this population distribution is, so we can't put actual numbers to this.
- We won't cover the proof of the decomposition here – read this for more – but note that in Homework 6, Question 3, you proved a related formula for $R_\text{sq}(h)$:
- Key takeaway: If we care about minimizing (empirical) risk, we can equivalently try to minimize both model bias and model variance.
- As model variance increases, model bias tends to decrease, and vice versa.
That is, there is a tradeoff between bias and variance:
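- To see this tradeoff numerically, here's a rough simulation sketch (not from the lecture code): repeatedly draw samples from the population with sample_from_pop, fit a polynomial of a given degree on each, and look at the predictions at a single point. Since the population here is $y = x^3 + \text{noise}$, we can compare the average prediction at $x_0$ to $x_0^3$.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
def bias_and_variance_at_point(degree, x0=2, n_datasets=100):
    preds = []
    for _ in range(n_datasets):
        samp = sample_from_pop()
        pl = make_pipeline(PolynomialFeatures(degree, include_bias=False), LinearRegression())
        pl.fit(samp[['x']], samp['y'])
        preds.append(pl.predict(pd.DataFrame({'x': [x0]}))[0])
    preds = np.array(preds)
    # Bias: average deviation of predictions from the true value x0 ** 3.
    # Variance: spread of the predictions across datasets.
    return {'bias': preds.mean() - x0 ** 3, 'variance': preds.var()}
# Degree 25 should show much higher variance than degree 3.
bias_and_variance_at_point(3), bias_and_variance_at_point(25)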
Hyperparameters and train-test splits 🎛️¶
Example: Polynomial regression¶
- We recently looked at an example of polynomial regression.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25], data_name='Sample 2')
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')
- When building these models:
- We got to choose the degree of the polynomials – we chose 1, 3, and 25.
- We didn't get to choose the exact formulas for the three polynomials – their formulas were learned from data.
No matter what the data looked like, the left-most model had to look like a line, because we chose its degree in advance.
Hyperparameters¶
A parameter defines the relationship between variables in a model. We learn parameters from data.
- For instance, suppose we fit a degree 3 polynomial to data, and end up with:
$$H^*(x_i) = 1 - 2x_i + 13x_i^2 - 4x_i^3$$
- 1, -2, 13, and -4 are parameters.
- A hyperparameter is a parameter that we choose before our model is fit to the data.
- Think of hyperparameters as knobs 🎛 – we get to pick and tune them!
- Polynomial degree was a hyperparameter in the previous example, and we tried three different values: 1, 3, and 25.
- Question: How do we choose the "right" hyperparameter(s)?
Degree 3 was a better choice than degree 25, for example – but how do we systematically choose?
Train-test splits 🚆¶
- Suppose we're choosing between many different models.
Here, by "model" we really mean "hyperparameter value", e.g. one "model" is a degree 3 polynomial, while another is a degree 4 polynomial.
- We won't know whether a model has overfit to our sample unless we get to see how well it performs on a new sample from the same population.
- 💡Idea: Split our dataset into a training set and test set.

- For each model we're considering (e.g. each polynomial degree):
- Use only the training set to fit that model (i.e. find $\vec{w}^*$).
- Use the test set to evaluate that model's error (e.g. compute its MSE).
- Pick the model with the lowest test error.
- Why? The test set is like a new sample of data from the same population as the training data!
Train-test split¶
- sklearn.model_selection.train_test_split implements a train-test split for us! 🙏🏼
- If X is an array/DataFrame of features and y is an array/Series of responses, then
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
randomly splits the features and responses into training and test sets, such that the test set contains 25% of the full dataset.
from sklearn.model_selection import train_test_split
# Read the documentation!
train_test_split?
- Let's perform a train/test split on sample_1, since we'll need it to find the optimal polynomial degree.
sample_1
|     | x     | y     |
|-----|-------|-------|
| 0   | -2.00 | -6.00 |
| 1   | -1.95 | -7.33 |
| 2   | -1.90 | -9.18 |
| ... | ...   | ...   |
| 97  | 2.90  | 25.75 |
| 98  | 2.95  | 22.40 |
| 99  | 3.00  | 32.47 |

100 rows × 2 columns
X = sample_1[['x']] # DataFrame.
y = sample_1['y'] # Series.
# We don't have to choose 0.25.
# We also don't have to set a random_state;
# we've done this so that we get the same results in lecture every time.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)
- Before proceeding, let's check the sizes of X_train and X_test.
print('Rows in X_train:', X_train.shape[0])
display(X_train.head())
print('Rows in X_test:', X_test.shape[0])
display(X_test.head())
Rows in X_train: 80
|    | x     |
|----|-------|
| 85 | 2.29  |
| 28 | -0.59 |
| 8  | -1.60 |
| 11 | -1.44 |
| 63 | 1.18  |
Rows in X_test: 20
|    | x     |
|----|-------|
| 26 | -0.69 |
| 80 | 2.04  |
| 82 | 2.14  |
| 68 | 1.43  |
| 77 | 1.89  |
Remember: Train only using the training data!¶
- Now that we've performed a train/test split of Sample 1, we'll create models with degree 1 through 25 polynomial features and compute their train and test errors.
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
train_errs = []
test_errs = []
for d in range(1, 26):
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    pl.fit(X_train, y_train)
    train_errs.append(mean_squared_error(y_train, pl.predict(X_train)))
    test_errs.append(mean_squared_error(y_test, pl.predict(X_test)))
errs = pd.DataFrame({'Train Error': train_errs, 'Test Error': test_errs})
errs
|     | Train Error | Test Error |
|-----|-------------|------------|
| 0   | 20.86       | 20.79      |
| 1   | 13.08       | 14.31      |
| 2   | 7.01        | 12.10      |
| ... | ...         | ...        |
| 22  | 5.27        | 14.26      |
| 23  | 5.10        | 14.64      |
| 24  | 65.22       | 63.25      |

25 rows × 2 columns
- Notice that we only call pl.fit on the training data!
Polynomial degree vs. train/test error¶
- Let's look at the plots of training error vs. degree and test error vs. degree.
fig = px.line(errs.iloc[:-1])
fig.update_layout(showlegend=True, xaxis_title='Polynomial Degree', yaxis_title='Mean Squared Error')
- Training error appears to decrease as polynomial degree increases.
- Test error appears to decrease until a "valley", and then increases again.
- Here, we'd choose a degree of 3, since that degree has the lowest test error.
Training error vs. test error¶
- The pattern we saw in the previous example is true more generally.

- We pick the hyperparameter(s) at the "valley" of test error.
- Note that training error tends to underestimate test error, but it doesn't have to – i.e., it is possible for test error to be lower than training error (say, if the test set is "easier" to predict than the training set).
- The results – and the bias-variance tradeoff more generally – hold true for "classic" machine learning models, like the ones we're studying here.
But in deep neural networks, this pattern is often violated; extremely complex models can have low test error as well.
This phenomenon is known as "double descent"; learn more here.
Conducting train-test splits¶
- Recall, training data is used to fit our model, and test data is used to evaluate our model.

- Question: How should we split?
    - sklearn's train_test_split splits randomly, which usually works well.
    - However, if there is some element of time in the training data (say, when predicting the future price of a stock), a better split is "past" and "future" (see the sketch at the end of this slide).
- Question: How large should the split be, e.g. 90%-10% vs. 75%-25%?
- There's a tradeoff – a larger training set should lead to a "better" model, while a larger test set should lead to a better estimate of our model's ability to generalize.
- There's no "right" choice, but we usually choose between 10% to 25% for the test set.
But wait...¶
- With our current strategy, we are choosing the hyperparameter that creates the model that performs best on the test set.
- As such, we are overfitting to the test set – the best hyperparameter for the test set might not be the best hyperparameter for a totally unseen dataset!
- It seems like we need another split.
Cross-validation¶
Idea: A single validation set¶

- Split the data into three sets: training, validation, and test.
- For each hyperparameter choice, train the model only on the training set, and evaluate the model's performance on the validation set.
- Find the hyperparameter with the best validation performance.
- Retrain the final model on the training and validation sets, and report its performance on the test set.
- Issue: This strategy is too dependent on the validation set, which may be small and/or not a representative sample of the data. We're not going to do this. ❌
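- Just to make the picture concrete, here's a rough sketch (again, not a strategy we'll actually use) of carving out a single validation set with two calls to train_test_split. The variable names are just for illustration.
# 20% of the data for the test set...
X_remaining, X_test_set, y_remaining, y_test_set = train_test_split(X, y, test_size=0.2, random_state=23)
# ...then 25% of the remaining 80% (i.e. 20% overall) for the validation set.
X_train_set, X_val_set, y_train_set, y_val_set = train_test_split(X_remaining, y_remaining, test_size=0.25, random_state=23)
X_train_set.shape[0], X_val_set.shape[0], X_test_set.shape[0]   # 60, 20, and 20 rows of sample_1.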
A better idea: $k$-fold cross-validation¶
- Instead of relying on a single validation set, we can create $k$ validation sets, where $k$ is some positive integer (5 in the example below).

- Since each data point is used for training $k-1$ times and validation once, the (averaged) validation performance should be a good metric of a model's ability to generalize to unseen data.
- $k$-fold cross-validation (or simply "cross-validation") is the technique we will use for finding hyperparameters, or more generally, for choosing between different possible models. It's what you should use in your Final Project! ✅
Illustrating $k$-fold cross-validation¶
- To illustrate $k$-fold cross-validation, let's use the following example dataset with $n = 12$ rows.
Suppose this dataset represents our training set, i.e. suppose we already performed a train-test split.
training_data = pd.DataFrame().assign(x=range(0, 120, 10),
y=[9, 1, 58, 3, 6, 4, -2, 8, 1, 10, 1.1, -45])
display_df(training_data, rows=12)
|    | x   | y     |
|----|-----|-------|
| 0  | 0   | 9.0   |
| 1  | 10  | 1.0   |
| 2  | 20  | 58.0  |
| 3  | 30  | 3.0   |
| 4  | 40  | 6.0   |
| 5  | 50  | 4.0   |
| 6  | 60  | -2.0  |
| 7  | 70  | 8.0   |
| 8  | 80  | 1.0   |
| 9  | 90  | 10.0  |
| 10 | 100 | 1.1   |
| 11 | 110 | -45.0 |
- Suppose we choose $k = 4$. Then, each fold has $\frac{12}{4} = 3$ rows.
show_cv_slides()
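- For reference, here's a short sketch of how sklearn's KFold would carve this 12-row training set into $k = 4$ folds (shown without shuffling, just to make the indices easy to read).
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
for i, (train_idx, val_idx) in enumerate(kf.split(training_data), start=1):
    print(f'Fold {i}: validation rows {list(val_idx)}, training rows {list(train_idx)}')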
$k$-fold cross-validation, in general¶
- First, shuffle the entire training set randomly and split it into $k$ disjoint folds, or "slices". Then:
- For each hyperparameter:
    - For each slice:
        - Let the slice be the "validation set", $V$.
        - Let the rest of the data be the "training set", $T$.
        - Train a model using the selected hyperparameter on the training set $T$.
        - Evaluate the model on the validation set $V$.
    - Compute the average validation error (e.g. MSE) for the particular hyperparameter.
- Choose the hyperparameter with the lowest average validation error.
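- Before we hand this procedure off to GridSearchCV (next), here's a rough sketch of carrying it out with cross_val_score, just to make the loop structure above concrete. The scores it returns are negative MSEs (more on that shortly), so we negate them.
from sklearn.model_selection import cross_val_score
avg_val_mses = {}
for d in range(1, 26):
    pl = make_pipeline(PolynomialFeatures(d), LinearRegression())
    # One score per fold; cv=5 gives 5 validation MSEs for this degree.
    scores = cross_val_score(pl, X_train, y_train, cv=5,
                             scoring='neg_mean_squared_error')
    avg_val_mses[d] = -scores.mean()
min(avg_val_mses, key=avg_val_mses.get)    # Degree with the lowest average validation MSE.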
GridSearchCV¶
- Let's use $k$-fold cross-validation to choose a polynomial degree that best generalizes to unseen data.
As before, we'll choose our polynomial degree from the list [1, 2, ..., 25].
- GridSearchCV takes in:
    - an un-fit instance of an estimator, and
    - a dictionary of hyperparameter values to try,
and performs $k$-fold cross-validation to find the combination of hyperparameters with the best average validation performance.
from sklearn.model_selection import GridSearchCV
GridSearchCV?
- Why do you think it's called "grid search"?
Grid searching for the best polynomial degree¶
- Here, we want to try values of degree from 1 through 25, so we'll need to specify these values in a dictionary.
# The key names in this dictionary are chosen very carefully.
# They need to be of the format pipelinestep__hyperparametername,
# where pipelinestep is a lowercase version of the step in the pipeline
# that we want to tune, and hyperparametername is the formal name
# of the hyperparameter (see the documentation).
hyperparams = {
'polynomialfeatures__degree': range(1, 26)
}
- The scoring metric we need to provide is 'neg_mean_squared_error'.
The scoring argument is used to specify that we want to compute the MSE; by default, it computes the $R^2$. It's called "neg" MSE because, by default, sklearn likes to "maximize" scores, and maximizing -MSE is the same as minimizing MSE.
searcher = GridSearchCV(
make_pipeline(PolynomialFeatures(), LinearRegression()),
param_grid=hyperparams,
cv=5, # k = 5.
scoring='neg_mean_squared_error'
)
searcher
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                                       ('linearregression', LinearRegression())]),
             param_grid={'polynomialfeatures__degree': range(1, 26)},
             scoring='neg_mean_squared_error')
- Like any other estimator, GridSearchCV instances need to be fit.
Again, notice that we're only fitting it with our training data.
searcher.fit(X_train, y_train)
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('polynomialfeatures', PolynomialFeatures()),
                                       ('linearregression', LinearRegression())]),
             param_grid={'polynomialfeatures__degree': range(1, 26)},
             scoring='neg_mean_squared_error')
- Once fit, searcher can tell us what it found!
searcher.best_params_
{'polynomialfeatures__degree': 3}
- searcher is now a fit regression model. There's no need to refit it on the entire training set ourselves; GridSearchCV automatically refit the best model on all of the training data.
searcher.predict([[4],
[-1],
[0]])
array([64.62, -1.22, 0.32])
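- As an aside, the refit best pipeline itself is available as searcher.best_estimator_, if you'd rather work with it directly.
# The best pipeline (already refit on all of X_train and y_train).
searcher.best_estimator_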
Interpreting the results of $k$-fold cross-validation¶
- Let's peek under the hood.
errs_df = util.format_results(searcher)
errs_df
|        | Degree 1 | Degree 2 | Degree 3 | Degree 4 | ... | Degree 22 | Degree 23 | Degree 24 | Degree 25 |
|--------|----------|----------|----------|----------|-----|-----------|-----------|-----------|-----------|
| Fold 1 | 22.54    | 15.35    | 6.93     | 7.15     | ... | 46.38     | 64.23     | 35.50     | 102.93    |
| Fold 2 | 30.24    | 18.46    | 8.42     | 12.15    | ... | 560.61    | 630.02    | 1596.78   | 4589.27   |
| Fold 3 | 15.93    | 11.42    | 4.47     | 4.62     | ... | 304.46    | 198.77    | 643.80    | 31566.25  |
| Fold 4 | 26.99    | 17.79    | 12.99    | 13.02    | ... | 15.29     | 15.76     | 15.24     | 28.90     |
| Fold 5 | 11.24    | 4.58     | 4.66     | 5.16     | ... | 8.39      | 7.95      | 8.18      | 64.57     |

5 rows × 25 columns
- Note that for each choice of degree (our hyperparameter), we have five MSEs, one for each "fold" of the data. This means that in total, $5 \cdot 25 = 125$ models were trained!
errs_df
|        | Degree 1 | Degree 2 | Degree 3 | Degree 4 | ... | Degree 22 | Degree 23 | Degree 24 | Degree 25 |
|--------|----------|----------|----------|----------|-----|-----------|-----------|-----------|-----------|
| Fold 1 | 22.54    | 15.35    | 6.93     | 7.15     | ... | 46.38     | 64.23     | 35.50     | 102.93    |
| Fold 2 | 30.24    | 18.46    | 8.42     | 12.15    | ... | 560.61    | 630.02    | 1596.78   | 4589.27   |
| Fold 3 | 15.93    | 11.42    | 4.47     | 4.62     | ... | 304.46    | 198.77    | 643.80    | 31566.25  |
| Fold 4 | 26.99    | 17.79    | 12.99    | 13.02    | ... | 15.29     | 15.76     | 15.24     | 28.90     |
| Fold 5 | 11.24    | 4.58     | 4.66     | 5.16     | ... | 8.39      | 7.95      | 8.18      | 64.57     |

5 rows × 25 columns
- Remember, our goal is to choose the degree with the lowest average validation error.
errs_df.mean(axis=0)
Degree 1       21.39
Degree 2       13.52
Degree 3        7.49
              ...
Degree 23     183.34
Degree 24     459.90
Degree 25    7270.38
Length: 25, dtype: float64
fig = errs_df.mean(axis=0).iloc[:18].plot(kind='line', title='Average Validation Error')
fig.update_layout(xaxis_title='Degree', yaxis_title='Average Validation MSE', showlegend=False)
# Chosen automatically by sklearn.
errs_df.mean(axis=0).idxmin()
'Degree 3'
- Note that if we didn't perform $k$-fold cross-validation, but instead just used a single validation set, we may have ended up with a different result:
errs_df.idxmin(axis=1)
Fold 1     Degree 6
Fold 2     Degree 3
Fold 3    Degree 12
Fold 4    Degree 10
Fold 5     Degree 2
dtype: object
Question 🤔 (Answer at practicaldsc.org/q)
- Suppose you have a training dataset with 1000 rows.
- You want to decide between 20 hyperparameters for a particular model.
- To do so, you perform 10-fold cross-validation.
- How many times is the first row in the training dataset (X.iloc[0]) used for training a model?
Another example: Commute times¶
- We can also use $k$-fold cross-validation to determine which subset of features to use in a linear model that predicts commute times!
df = pd.read_csv('data/commute-times.csv')
df['day_of_month'] = pd.to_datetime(df['date']).dt.day
df['month'] = pd.to_datetime(df['date']).dt.month_name()
df.head()
|   | date      | day | home_departure_time | home_departure_mileage | ... | work_departure_time_hr | mileage_to_home | day_of_month | month |
|---|-----------|-----|---------------------|------------------------|-----|------------------------|-----------------|--------------|-------|
| 0 | 5/15/2023 | Mon | 2023-05-15 10:49:00 | 15873.0                | ... | 17.17                  | 53.0            | 15           | May   |
| 1 | 5/16/2023 | Tue | 2023-05-16 07:45:00 | 15979.0                | ... | NaN                    | NaN             | 16           | May   |
| 2 | 5/22/2023 | Mon | 2023-05-22 08:27:00 | 50407.0                | ... | 15.90                  | 54.0            | 22           | May   |
| 3 | 5/23/2023 | Tue | 2023-05-23 07:08:00 | 50535.0                | ... | NaN                    | NaN             | 23           | May   |
| 4 | 5/30/2023 | Tue | 2023-05-30 09:09:00 | 50664.0                | ... | 17.12                  | 54.0            | 30           | May   |

5 rows × 20 columns
- Let's make several candidate pipelines. But first, as always, a train-test split.
# Here, we're letting X_train and X_test keep all of the columns in the DataFrame
# OTHER than 'minutes'.
X_train, X_test, y_train, y_test = train_test_split(df.drop('minutes', axis=1), df['minutes'], random_state=23)
Creating many pipelines¶
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
selecter = FunctionTransformer(lambda x: x) # Shortcut to say "keep just these columns."
week_converter = FunctionTransformer(lambda s: 'Week ' + ((s - 1) // 7 + 1).astype(str))
day_of_month_transformer = make_pipeline(week_converter, OneHotEncoder(drop='first')) # From last class.
pipes = {
    'departure_hour only': make_pipeline(
        make_column_transformer((selecter, ['departure_hour'])),
        LinearRegression()
    ),
    'departure_hour + day_of_month': make_pipeline(
        make_column_transformer((selecter, ['departure_hour', 'day_of_month'])),
        LinearRegression()
    ),
    'departure_hour + day OHE': make_pipeline(
        make_column_transformer(
            (selecter, ['departure_hour']),
            (OneHotEncoder(drop='first', handle_unknown='ignore'), ['day'])
        ),
        LinearRegression()
    ),
    'departure_hour + day OHE + month OHE': make_pipeline(
        make_column_transformer(
            (selecter, ['departure_hour']),
            (OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month'])
        ),
        LinearRegression()
    ),
    'departure_hour with poly features + day OHE + month OHE + week': make_pipeline(
        make_column_transformer(
            (PolynomialFeatures(3), ['departure_hour']),
            (OneHotEncoder(drop='first', handle_unknown='ignore'), ['day', 'month']),
            (day_of_month_transformer, ['day_of_month']),
        ),
        LinearRegression())
}
- Here, we will have to call GridSearchCV multiple times, since we're choosing between many different pipelines, not between hyperparameter values for a single pipeline.
results = pd.DataFrame(columns=['Average Training MSE', 'Average Validation MSE'])
for pipe in pipes:
    fitted = GridSearchCV(
        pipes[pipe],
        param_grid={}, # No hyperparameters, but we could have them.
        scoring='neg_mean_squared_error',
        cv=10, # Change this and see what happens!
        return_train_score=True # So that we can compute training MSEs, too.
    )
    fitted.fit(X_train, y_train)
    results.loc[pipe] = [-fitted.cv_results_['mean_train_score'][0], -fitted.cv_results_['mean_test_score'][0]]
commute_models_summarized = (
results
.sort_values('Average Training MSE')
.plot(kind='barh', barmode='group', width=1000)
.update_layout(xaxis_title='Mean Squared Error', yaxis_title='Model')
)
commute_models_summarized
- What's the "right" combination of features to choose?
Summary¶
- We care about how well our models generalize to unseen data.
- The more complex a model is, the more it will overfit to the noise in the training data, and have high model variance.
- The less complex a model is, the more it will underfit the training data, and have high bias.
- To navigate this bias-variance tradeoff, we choose model complexity by choosing the model with the lowest error on unseen data.

- To do so, use cross-validation:
    1. Split the data into two sets: training and test.
    2. Use only the training data when designing, training, and tuning the model.
        - Use $k$-fold cross-validation to choose hyperparameters and estimate the model's ability to generalize.
        - Do not ❌ look at the test data in this step!
    3. Commit to your final model and train it using the entire training set.
    4. Test the model using the test data. If the performance (e.g. MSE) is not acceptable, return to step 2.
    5. Finally, train on all available data and ship the model to production! 🛳
- 🚨 This is the process you should always use! 🚨
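- As a recap, here's a compact sketch of that process on the polynomial example from earlier in this lecture, reusing sample_1 and the tools we've already imported.
# 1. Train-test split.
X_tr, X_te, y_tr, y_te = train_test_split(sample_1[['x']], sample_1['y'],
                                          test_size=0.2, random_state=23)
# 2. Use k-fold cross-validation on the training set to choose hyperparameters.
final_searcher = GridSearchCV(
    make_pipeline(PolynomialFeatures(), LinearRegression()),
    param_grid={'polynomialfeatures__degree': range(1, 26)},
    cv=5,
    scoring='neg_mean_squared_error'
)
final_searcher.fit(X_tr, y_tr)
# 3. Evaluate the chosen model once on the test set.
test_mse = mean_squared_error(y_te, final_searcher.predict(X_te))
# 4. If the test MSE is acceptable, refit on all available data before shipping.
final_model = final_searcher.best_estimator_.fit(sample_1[['x']], sample_1['y'])
final_searcher.best_params_, test_mse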
Question 🤔 (Answer at practicaldsc.org/q)
What questions do you have?