In [1]:
from lec_utils import *

Lecture 16¶

Feature Engineering¶

EECS 398: Practical Data Science, Winter 2025¶

practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed

Agenda 📆¶

  • Recap: Multiple linear regression.
  • Feature engineering.
  • Numerical-to-numerical transformations 🧬.
  • The modeling recipe, revisited.
  • StandardScaler and standardized regression coefficients.

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Recap: Multiple linear regression¶


The general problem¶

  • We have $n$ data points, $\left({ \vec x_1}, {y_1}\right), \left({ \vec x_2}, {y_2}\right), \ldots, \left({ \vec x_n}, {y_n}\right)$,

where each $ \vec x_i$ is a feature vector of $d$ features: $${\vec{x_i}} = \begin{bmatrix} {x^{(1)}_i} \\ {x^{(2)}_i} \\ \vdots \\ {x^{(d)}_i} \end{bmatrix}$$

  • We want to find a good linear hypothesis function:
$$H({\vec x_i}) = w_0 + w_1 { x_i^{(1)}} + w_2 { x_i^{(2)}} + \ldots + w_d { x_i^{(d)}} = \vec w \cdot \text{Aug}({ \vec x_i})$$
  • Specifically, we want to find the optimal parameters, $w_0^*$, $w_1^*$, ..., $w_d^*$ that minimize mean squared error:
$$\begin{align*} R_\text{sq}(\vec w) &= \frac{1}{n} \sum_{i = 1}^n (y_i - H(\vec x_i))^2 \\ &= \frac{1}{n} \sum_{i = 1}^n \left( y_i - (w_0 + w_1 { x_i^{(1)}} + w_2 { x_i^{(2)}} + \ldots + w_d { x_i^{(d)}})\right)^2 \\ &= \frac{1}{n} \sum_{i = 1}^n \left(y_i - \text{Aug}(\vec x_i) \cdot \vec{w} \right)^2 \\ &= \frac{1}{n} \lVert \vec y - X \vec w \rVert^2 \end{align*}$$

The general solution¶

  • Define the design matrix $ X \in \mathbb{R}^{n \times (d + 1)}$ and observation vector $\vec y \in \mathbb{R}^n$:
$${ X= \begin{bmatrix} {1} & { x^{(1)}_1} & { x^{(2)}_1} & \dots & { x^{(d)}_1} \\\\ { 1} & { x^{(1)}_2} & { x^{(2)}_2} & \dots & { x^{(d)}_2} \\\\ \vdots & \vdots & \vdots & & \vdots \\\\ { 1} & { x^{(1)}_n} & { x^{(2)}_n} & \dots & { x^{(d)}_n} \end{bmatrix} = \begin{bmatrix} \text{Aug}({\vec{x_1}})^T \\\\ \text{Aug}({\vec{x_2}})^T \\\\ \vdots \\\\ \text{Aug}({\vec{x_n}})^T \end{bmatrix}} \qquad { \vec y = \begin{bmatrix} { y_1} \\ { y_2} \\ \vdots \\ { y_n} \end{bmatrix}}$$
  • Then, solve the normal equations to find the optimal parameter vector, $\vec{w}^*$:
$$X^TX \vec w^* = X^T \vec y$$
  • The $\vec w^*$ that satisfies the equations above minimizes mean squared error, $R_\text{sq}(\vec w)$.
  • If $X^TX$ is invertible, then:
$$\boxed{\vec w^* = (X^TX)^{-1}X^T \vec y}$$
  • sklearn can compute $\vec w^*$ automatically, as we will see again shortly.
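  • As a quick illustration, here's a minimal NumPy sketch of solving the normal equations directly, using a small hypothetical design matrix and observation vector. (np.linalg.solve is preferred over explicitly computing $(X^TX)^{-1}$.)

import numpy as np

# Hypothetical design matrix (column of 1s plus one feature) and observation vector.
X = np.array([[1.0, 2.0],
              [1.0, 3.0],
              [1.0, 5.0]])
y = np.array([4.0, 5.5, 8.0])

# Solve X^T X w* = X^T y without explicitly inverting X^T X.
w_star = np.linalg.solve(X.T @ X, X.T @ y)

# sklearn finds the same w*; fit_intercept=False since X already contains a column of 1s.
from sklearn.linear_model import LinearRegression
LinearRegression(fit_intercept=False).fit(X, y).coef_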

Feature engineering ⚙️¶


The goal of feature engineering¶

  • Feature engineering is the act of finding transformations that turn raw data into effective quantitative variables.
    Put simply: feature engineering is creating new features using existing features.
  • Example: One hot encoding.
  • Example: Numerical-to-numerical transformations.

One hot encoding¶

  • One hot encoding is a transformation that turns a categorical feature into several binary features.
  • Suppose a column has $N$ unique values, $A_1$, $A_2$, ..., $A_N$. For each unique value $A_i$, we define the following feature function:
$$\phi_i(x) = \left\{\begin{array}{ll}1 & {\rm if\ } x == A_i \\ 0 & {\rm if\ } x\neq A_i \\ \end{array}\right. $$
  • Note that 1 means "yes" and 0 means "no".
  • One hot encoding is also called "dummy encoding", and $\phi_i(x)$ may also be referred to as an "indicator variable".

Run the cells below to set up the next slide.

In [2]:
df = pd.read_csv('data/commute-times.csv')
df['day_of_month'] = pd.to_datetime(df['date']).dt.day
df.head()
Out[2]:
date day home_departure_time home_departure_mileage ... minutes_to_home work_departure_time_hr mileage_to_home day_of_month
0 5/15/2023 Mon 2023-05-15 10:49:00 15873.0 ... 72.0 17.17 53.0 15
1 5/16/2023 Tue 2023-05-16 07:45:00 15979.0 ... NaN NaN NaN 16
2 5/22/2023 Mon 2023-05-22 08:27:00 50407.0 ... 82.0 15.90 54.0 22
3 5/23/2023 Tue 2023-05-23 07:08:00 50535.0 ... NaN NaN NaN 23
4 5/30/2023 Tue 2023-05-30 09:09:00 50664.0 ... 76.0 17.12 54.0 30

5 rows × 19 columns

Example: One hot encoding 'day'¶

  • For each unique value of 'day' in our dataset, we must create a column for just that 'day'.
In [3]:
df.head()
Out[3]:
date day home_departure_time home_departure_mileage ... minutes_to_home work_departure_time_hr mileage_to_home day_of_month
0 5/15/2023 Mon 2023-05-15 10:49:00 15873.0 ... 72.0 17.17 53.0 15
1 5/16/2023 Tue 2023-05-16 07:45:00 15979.0 ... NaN NaN NaN 16
2 5/22/2023 Mon 2023-05-22 08:27:00 50407.0 ... 82.0 15.90 54.0 22
3 5/23/2023 Tue 2023-05-23 07:08:00 50535.0 ... NaN NaN NaN 23
4 5/30/2023 Tue 2023-05-30 09:09:00 50664.0 ... 76.0 17.12 54.0 30

5 rows × 19 columns

In [4]:
df['day'].value_counts()
Out[4]:
day
Tue    25
Mon    20
Thu    15
Wed     3
Fri     2
Name: count, dtype: int64
In [5]:
(df['day'] == 'Tue').astype(int) 
Out[5]:
0     0
1     1
2     0
     ..
62    0
63    1
64    0
Name: day, Length: 65, dtype: int64
In [6]:
for val in df['day'].unique():
    df[f'day == {val}'] = (df['day'] == val).astype(int)
In [7]:
df.loc[:, df.columns.str.contains('day')] 
Out[7]:
day day_of_month day == Mon day == Tue day == Wed day == Thu day == Fri
0 Mon 15 1 0 0 0 0
1 Tue 16 0 1 0 0 0
2 Mon 22 1 0 0 0 0
... ... ... ... ... ... ... ...
62 Mon 4 1 0 0 0 0
63 Tue 5 0 1 0 0 0
64 Thu 7 0 0 0 1 0

65 rows × 7 columns

Using 'day' as a feature, along with 'departure_hour' and 'day_of_month'¶

  • Now that we've converted 'day' to a numerical variable, we can use it as input in a regression model. Here's the model we'll try to fit:
$$\begin{align*}\text{pred. commute time}_i = w_0 &+ w_1 \cdot \text{departure hour}_i \\ &+ w_2 \cdot \text{day of month}_i \\ &+ w_3 \cdot \text{day$_i$ == Mon} \\ &+ w_4 \cdot \text{day$_i$ == Tue} \\ &+ w_5 \cdot \text{day$_i$ == Wed} \\ &+ w_6 \cdot \text{day$_i$ == Thu} \end{align*}$$
  • Subtlety: Since there are only 5 values of 'day', we don't need to include $\text{day}_i \text{ == Fri}$ as a feature. We know it's Friday if $\text{day}_i \text{ == Mon}$, $\text{day}_i \text{ == Tue}$, ... are all 0.
    More on this next class!
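  • As an aside, here's a minimal sketch of the same idea using pandas' get_dummies; drop_first=True drops one of the resulting columns, since it's redundant given the others (here that happens to be 'Fri', the first category alphabetically). Below, we instead select the columns manually.

# One line, no for-loop; the 'day_' prefix is just illustrative.
pd.get_dummies(df['day'], prefix='day', drop_first=True)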
In [8]:
X_for_ohe = df[['departure_hour', 
                'day_of_month',
                'day == Mon',
                'day == Tue',
                'day == Wed',
                'day == Thu']]
X_for_ohe
Out[8]:
departure_hour day_of_month day == Mon day == Tue day == Wed day == Thu
0 10.82 15 1 0 0 0
1 7.75 16 0 1 0 0
2 8.45 22 1 0 0 0
... ... ... ... ... ... ...
62 7.58 4 1 0 0 0
63 7.45 5 0 1 0 0
64 7.60 7 0 0 0 1

65 rows × 6 columns

In [9]:
from sklearn.linear_model import LinearRegression
model_with_ohe = LinearRegression()
model_with_ohe.fit(X=X_for_ohe, y=df['minutes'])
Out[9]:
LinearRegression()
  • The following cell gives us our $w^*$s:
In [10]:
model_with_ohe.intercept_, model_with_ohe.coef_
Out[10]:
(134.0430659240799, array([-8.42, -0.03,  5.09, 16.38,  5.12, 11.5 ]))
  • Thus, our trained linear model to predict commute time given 'departure_hour', 'day_of_month', and 'day' (Mon, Tue, Wed, or Thu) is:
$$\begin{align*}\text{pred. commute time}_i = 134 &- 8.42 \cdot \text{departure hour}_i \\ &- 0.03 \cdot \text{day of month}_i \\ &+ 5.09 \cdot \text{day$_i$ == Mon} \\ &+ 16.38 \cdot \text{day$_i$ == Tue} \\ &+ 5.12 \cdot \text{day$_i$ == Wed} \\ &+ 11.5 \cdot \text{day$_i$ == Thu} \end{align*}$$

Visualizing our latest model¶

  • Our trained linear model to predict commute time given 'departure_hour', 'day_of_month', and 'day' (Mon, Tue, Wed, or Thu) is:
$$\begin{align*}\text{pred. commute time}_i = 134 &- 8.42 \cdot \text{departure hour}_i \\ &- 0.03 \cdot \text{day of month}_i \\ &+ 5.09 \cdot \text{day$_i$ == Mon} \\ &+ 16.38 \cdot \text{day$_i$ == Tue} \\ &+ 5.12 \cdot \text{day$_i$ == Wed} \\ &+ 11.5 \cdot \text{day$_i$ == Thu} \end{align*}$$
  • Since we have 6 features here, we'd need 7 dimensions to graph our model.
  • But, as we see in Homework 7, Question 5, our model is really a collection of five parallel planes in 3D, all with slightly different $z$-intercepts!
  • If we want to visualize in 2D, we need to pick a single feature to place on the $x$-axis.
In [11]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df['departure_hour'], y=df['minutes'], 
                         mode='markers', name='Original Data'))
fig.add_trace(go.Scatter(x=df['departure_hour'], y=model_with_ohe.predict(X_for_ohe), 
                         mode='markers', name='Predicted Commute Times using Departure Hour, <br>Day of Month, and Day of Week'))
fig.update_layout(showlegend=True, title='Commute Time vs. Departure Hour',
                  xaxis_title='Departure Hour', yaxis_title='Minutes', width=1000)
  • Our model is linear, so why don't its predictions fall on a single straight line?

Run the cells below to set up the next slide.

In [12]:
from sklearn.metrics import mean_squared_error
In [13]:
# Multiple linear regression model.
model_multiple = LinearRegression()
model_multiple.fit(X=df[['departure_hour', 'day_of_month']], y=df['minutes'])
mse_dict = {}
mse_dict['departure_hour + day_of_month'] = mean_squared_error(df['minutes'], model_multiple.predict(df[['departure_hour', 'day_of_month']]))
In [14]:
# Simple linear model.
model_simple = LinearRegression()
model_simple.fit(X=df[['departure_hour']], y=df['minutes'])
mse_dict['departure_hour'] = mean_squared_error(df['minutes'], model_simple.predict(df[['departure_hour']]))
In [15]:
# Constant model.
model_constant = df['minutes'].mean()
mse_dict['constant'] = mean_squared_error(df['minutes'], np.ones(df.shape[0]) * model_constant)

Comparing our latest model to earlier models¶

  • Let's see how the inclusion of the day of the week impacts the quality of our predictions.
In [16]:
mse_dict['departure_hour + day_of_month + ohe day'] = mean_squared_error(
    df['minutes'],
    model_with_ohe.predict(X_for_ohe)
)
pd.Series(mse_dict).plot(kind='barh', title='Mean Squared Error')
  • Adding the day of the week decreased our MSE significantly!

Reflection¶

In [17]:
df.head()
Out[17]:
date day home_departure_time home_departure_mileage ... day == Tue day == Wed day == Thu day == Fri
0 5/15/2023 Mon 2023-05-15 10:49:00 15873.0 ... 0 0 0 0
1 5/16/2023 Tue 2023-05-16 07:45:00 15979.0 ... 1 0 0 0
2 5/22/2023 Mon 2023-05-22 08:27:00 50407.0 ... 0 0 0 0
3 5/23/2023 Tue 2023-05-23 07:08:00 50535.0 ... 1 0 0 0
4 5/30/2023 Tue 2023-05-30 09:09:00 50664.0 ... 1 0 0 0

5 rows × 24 columns

  • We've one hot encoded 'day', but it required a for-loop.
  • Is there a way we could have encoded it without a for-loop?
  • Yes, using sklearn.preprocessing's OneHotEncoder. More on this soon!
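  • As a preview, here's a minimal sketch of what that could look like; the exact options we'll use are covered next class.

from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
# fit_transform learns the unique values of 'day' and returns one binary column per value
# (as a sparse matrix by default).
day_encoded = ohe.fit_transform(df[['day']])
ohe.get_feature_names_out()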

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Numerical-to-numerical transformations 🧬¶


Example: Horsepower 🚗¶

  • The following dataset, built into the seaborn plotting library, contains various information about (older) cars.
In [18]:
import seaborn as sns
mpg = sns.load_dataset('mpg').dropna()
mpg.head()
Out[18]:
mpg cylinders displacement horsepower ... acceleration model_year origin name
0 18.0 8 307.0 130.0 ... 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 ... 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 ... 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 ... 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 ... 10.5 70 usa ford torino

5 rows × 9 columns

  • We really do mean old:
In [19]:
mpg['model_year'].value_counts()
Out[19]:
model_year
73    40
78    36
76    34
      ..
71    27
80    27
74    26
Name: count, Length: 13, dtype: int64
  • Let's investigate the relationship between 'horsepower' and 'mpg'.

The relationship between 'horsepower' and 'mpg'¶

In [20]:
px.scatter(mpg, x='horsepower', y='mpg')
  • It appears that there is a negative association between 'horsepower' and 'mpg', though it's not quite linear.
  • Let's try and fit a simple linear model that uses 'horsepower' to predict 'mpg' and see what happens.

Predicting 'mpg' using 'horsepower'¶

In [21]:
car_model = LinearRegression()
car_model.fit(mpg[['horsepower']], mpg['mpg'])
Out[21]:
LinearRegression()
  • What do our predictions look like?
In [22]:
hp_points = pd.DataFrame({'horsepower': [25, 225]})
fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(go.Scatter(
    x=hp_points['horsepower'],
    y=car_model.predict(hp_points),
    mode='lines',
    name='Predicted MPG using Horsepower'
))
  • Our regression line doesn't capture the curvature in the relationship between 'horsepower' and 'mpg'.
In [23]:
# As a baseline:
mean_squared_error(mpg['mpg'], car_model.predict(mpg[['horsepower']]))
Out[23]:
23.943662938603108

Linear in the parameters¶

  • Using linear regression, we can fit hypothesis functions like: $$ H(x_i) = w_0 + w_1x_i+w_2x_i^2 \qquad \qquad H(\vec x_i) = w_1e^{-\left(x_i^{(1)}\right)^2} + w_2 \cos(x_i^{(2)}+\pi) +w_3 \frac{\log 2x_i^{(3)}}{x_i^{(2)}} $$


    This includes all polynomials, for example. These are all linear combinations of (just) features.
  • For any of the above examples, we could express our model as $\vec w \cdot \text{Aug} (\vec x_i)$, for some carefully chosen feature vector $\vec x_i$,
    and that's all that LinearRegression in sklearn needs.
    What we put in the X argument to model.fit is up to us!
  • Using linear regression, we can't fit hypothesis functions like: $$ H(x_i) = w_0 + e^{w_1 x_i} \qquad \qquad H(\vec x_i) = w_0 + \sin (w_1 x_i^{(1)} + w_2 x_i^{(2)}) $$
    These are not linear combinations of just features.
  • We can have any number of parameters, as long as our hypothesis function is linear in the parameters, or linear when we think of it as a function of the parameters.
$$H(\vec x_i) = w_0 + w_1 f_1(\vec x_i) + w_2 f_2(\vec x_i) + ... + w_d f_d(\vec x_i)$$
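  • For example, here's a minimal sketch of fitting the quadratic hypothesis function $H(x_i) = w_0 + w_1 x_i + w_2 x_i^2$ to the mpg data by engineering a squared feature ourselves (the column name 'hp^2' is just illustrative):

# Create a squared version of an existing feature, then fit LinearRegression as usual.
mpg['hp^2'] = mpg['horsepower'] ** 2

quadratic_model = LinearRegression()
quadratic_model.fit(X=mpg[['horsepower', 'hp^2']], y=mpg['mpg'])
quadratic_model.intercept_, quadratic_model.coef_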

Linearization¶

  • The Tukey Mosteller Bulge Diagram helps us pick which numerical-to-numerical transformations to apply to data in order to linearize it.
    Alternative interpretation: it helps us determine which features to create.
  • Why? We're working with linear models. The more linear our data looks in terms of its features, the better we'll be able to model the data.
In [24]:
fig
  • Here, the bottom-left quadrant appears to match the shape of the scatter plot between 'horsepower' and 'mpg' the best – let's try taking the log of 'horsepower' (the $x$ variable).
In [25]:
mpg['log hp'] = np.log(mpg['horsepower'])
  • What does our data look like now?
In [26]:
px.scatter(mpg, x='log hp', y='mpg')

Predicting 'mpg' using log('horsepower')¶

  • Let's fit another linear model.
In [27]:
car_model_log = LinearRegression()
car_model_log.fit(mpg[['log hp']], mpg['mpg'])
Out[27]:
LinearRegression()
  • Note that implicitly, we defined the following design matrix:
$$X = \begin{bmatrix} 1 & \log(x_1) \\ 1 & \log(x_2) \\ \vdots & \vdots \\ 1 & \log(x_n) \end{bmatrix}$$
  • What do our predictions look like now?
In [28]:
log_hp_points = pd.DataFrame({'log hp': [3.7, 5.5]})
fig = px.scatter(mpg, x='log hp', y='mpg')
fig.add_trace(go.Scatter(
    x=log_hp_points['log hp'],
    y=car_model_log.predict(log_hp_points),
    mode='lines',
    name='Predicted MPG using log(Horsepower)'
))
  • The fit looks a bit better! How about the MSE?
In [29]:
# Using log hp:
mean_squared_error(mpg['mpg'], car_model_log.predict(mpg[['log hp']]))
Out[29]:
20.152887978233167
In [30]:
# Using hp, from before:
mean_squared_error(mpg['mpg'], car_model.predict(mpg[['horsepower']]))
Out[30]:
23.943662938603108
  • Also a bit better!
  • What do our predictions look like on the original, non-transformed scatter plot? Let's see:
In [31]:
fig = px.scatter(mpg, x='horsepower', y='mpg')
fig.add_trace(
    go.Scatter(
        x=mpg['horsepower'], 
        y=car_model_log.intercept_ + car_model_log.coef_[0] * np.log(mpg['horsepower']),  
        mode='markers', name='Predicted MPG using log(Horsepower)'
    )
)
fig
  • Our predictions that used $\log(\text{Horsepower})$ as an input don't fall on a straight line. We shouldn't expect them to; the orange dots come from:
$$\text{predicted MPG}_i = 108.70 - 18.582 \cdot \log(\text{Horsepower}_i)$$
In [32]:
car_model_log.intercept_, car_model_log.coef_
Out[32]:
(108.69970699574483, array([-18.58]))

Question 🤔 (Answer at practicaldsc.org/q)

Which hypothesis function is not linear in the parameters?

  • A. $H(\vec{x}_i) = w_1 (x_i^{(1)} x_i^{(2)}) + \frac{w_2}{x_i^{(1)}} \sin \left( x_i^{(2)} \right)$
  • B. $H(\vec{x}_i) = 2^{w_1} x_i^{(1)}$
  • C. $H(\vec{x}_i) = \vec{w} \cdot \text{Aug}(\vec{x}_i)$
  • D. $H(\vec{x}_i) = w_1 \cos (x_i^{(1)}) + w_2 2^{x_i^{(2)} \log x_i^{(3)}}$
  • E. More than one of the above.

How do we fit hypothesis functions that aren't linear in the parameters?¶

  • Suppose we want to fit the hypothesis function:
$$H(x_i) = w_0 e^{w_1 x_i}$$
  • This is not linear in terms of $w_0$ and $w_1$, so our results for linear regression don't apply.
  • Possible solution: Try to transform the above equation so that it is linear in some other parameters, by applying an operation to both sides.
  • See the attached Reference Slide for more details.

Reference Slide¶

Transformations¶

$$H(x_i) = w_0 e^{w_1 x_i}$$
  • Suppose we take the $\log$ of both sides of the equation.
$$\log H(x_i) = \log (w_0 e^{w_1x_i})$$
  • Then, using properties of logarithms, we have:
$$\log H(x_i) = \underbrace{\log(w_0)}_{\text{this is just a constant!}} + w_1 x_i$$
  • Solution: Create a new hypothesis function, $T(x_i)$, with parameters $b_0$ and $b_1$, where $T(x_i) = b_0 + b_1 x_i$.
  • This hypothesis function is related to $H(x_i)$ by the relationship $T(x_i) = \log H(x_i)$.
  • $\vec{b}$ is related to $\vec{w}$ by $b_0 = \log w_0$ and $b_1 = w_1$.
  • Our new observation vector, $\vec{z}$, is $\begin{bmatrix} \log y_1 \\ \log y_2 \\ \vdots \\ \log y_n \end{bmatrix}$.
  • $T(x_i) = b_0 + b_1x_i$ is linear in its parameters, $b_0$ and $b_1$.
  • Use the solution to the normal equations to find $\vec{b}^*$, and the relationship between $\vec{b}$ and $\vec{w}$ to find $\vec{w}^*$.
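  • Here's a minimal sketch of this strategy, illustratively using 'horsepower' as $x_i$ and 'mpg' as $y_i$:

# Fit T(x) = b0 + b1 x to the log of the observations, then undo the transformation.
exp_model = LinearRegression()
exp_model.fit(X=mpg[['horsepower']], y=np.log(mpg['mpg']))

b0, b1 = exp_model.intercept_, exp_model.coef_[0]
w0, w1 = np.exp(b0), b1      # since b0 = log(w0) and b1 = w1

# Predictions of H(x) = w0 * e^(w1 * x), back on the original scale.
preds = w0 * np.exp(w1 * mpg['horsepower'])

    Note that this procedure minimizes mean squared error on the log scale, which is not exactly the same as minimizing mean squared error on the original scale.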

The modeling recipe, revisited¶


The original modeling recipe, from Lecture 11¶

  1. Choose a model.
  2. Choose a loss function.
  3. Minimize average loss (empirical risk) to find optimal model parameters, $\vec w^*$.

The updated modeling recipe¶

  1. Create, or engineer, features to best reflect the "meaning" behind data.
    Recently, we've done this with one hot encoding and numerical-to-numerical transformations.
  2. Choose a model.
    Recently, we've used the simple/multiple linear regression model.
  3. Choose a loss function.
    Recently, we've mostly used squared loss.
  4. Minimize average loss (empirical risk) to find optimal model parameters, $\vec{w}^*$.
    Originally, we had to use calculus or linear algebra to minimize empirical risk, but more recently we've just used model.fit. This step is also called fitting the model to the data.
  5. Evaluate the performance of the model in relation to other models.
  • We can do all of the above directly in sklearn!

preprocessing and linear_model¶

  • For the feature engineering step of the modeling pipeline, we will use sklearn's preprocessing module.





  • For the model creation step of the modeling pipeline, we will use sklearn's linear_model module, as we've already seen. linear_model.LinearRegression is an example of an estimator class.





Transformer classes¶

  • Transformers take in "raw" data and output "processed" data. They are used for creating features.
    These are not directly related to "transformers" in large language models and neural networks.
  • Transformers, like most relevant features of sklearn, are classes, not functions, meaning you need to instantiate them and call their methods.
  • Today, we'll introduce one transformer class, StandardScaler. We'll look at how to write code to use it, and also discuss some of the underlying statistical nuances.
  • Next class, we'll learn about another transformer class, OneHotEncoder, and we'll see how to chain transformers and estimators together into larger Pipelines.

StandardScaler and standardized regression coefficients¶


Example: Predicting sales 📈¶

  • To illustrate our first transformer class, we'll introduce a new dataset.
In [33]:
sales = pd.read_csv('data/sales.csv')
sales.head()
Out[33]:
net_sales sq_ft inventory advertising district_size competing_stores
0 231.0 3.0 294 8.2 8.2 11
1 156.0 2.2 232 6.9 4.1 12
2 10.0 0.5 149 3.0 4.3 15
3 519.0 5.5 600 12.0 16.1 1
4 437.0 4.4 567 10.6 14.1 5
  • For each of 27 stores, we have:
    • net sales,
    • square feet,
    • inventory,
    • advertising expenditure,
    • district size, and
    • number of competing stores.
  • Our goal is to predict 'net_sales' as a function of other features.

An initial model¶

In [34]:
sales.head()
Out[34]:
net_sales sq_ft inventory advertising district_size competing_stores
0 231.0 3.0 294 8.2 8.2 11
1 156.0 2.2 232 6.9 4.1 12
2 10.0 0.5 149 3.0 4.3 15
3 519.0 5.5 600 12.0 16.1 1
4 437.0 4.4 567 10.6 14.1 5
  • All of the features are already numerical, so no transformations are needed to predict 'net_sales'.
In [35]:
sales_model = LinearRegression()
sales_model.fit(X=sales.iloc[:, 1:], y=sales.iloc[:, 0])
Out[35]:
LinearRegression()
  • Suppose we're interested in learning how the various features impact 'net_sales', rather than just predicting 'net_sales' for a new store. We'd then look at the coefficients.
In [36]:
sales_model.coef_
Out[36]:
array([16.2 ,  0.17, 11.53, 13.58, -5.31])
In [37]:
coefs = pd.DataFrame().assign(
    column=sales.columns[1:],
    original_coef=sales_model.coef_,
).set_index('column')
coefs.plot(kind='barh', title='Original Coefficients')
  • What do you notice?
In [38]:
sales.iloc[:, 1:]
Out[38]:
sq_ft inventory advertising district_size competing_stores
0 3.0 294 8.2 8.2 11
1 2.2 232 6.9 4.1 12
2 0.5 149 3.0 4.3 15
... ... ... ... ... ...
24 3.5 382 9.8 11.5 5
25 5.1 590 12.0 15.7 0
26 8.6 517 7.0 12.0 8

27 rows × 5 columns

Thought experiment¶

  • Consider the white point in the scatter plot below.
  • Which class is it more "similar" to – blue or orange?
  • Intuitively, the answer may be blue, but take a close look at the scale of the axes!
    The orange point is much closer to the white point than the blue points are.

Standardization¶

  • When we standardize two or more features, we bring them to the same scale.
  • Recall: to standardize a feature $x_1, x_2, ..., x_n$, we use the formula: $$z(x_i) = \frac{x_i - \bar{x}}{\sigma_x}$$
  • Example: 1, 7, 7, 9.

    • Mean: $\frac{1 + 7 + 7 + 9}{4} = \frac{24}{4} = 6$.
    • Standard deviation:

    $$\text{SD} = \sqrt{\frac{1}{4} \left( (1-6)^2 + (7-6)^2 + (7-6)^2 + (9-6)^2 \right)} = \sqrt{\frac{1}{4} \cdot 36} = 3$$

    • Standardized data:

    $$1 \mapsto \frac{1-6}{3} = \boxed{-\frac{5}{3}} \qquad 7 \mapsto \frac{7-6}{3} = \boxed{\frac{1}{3}} \qquad 7 \mapsto \boxed{\frac{1}{3}} \qquad 9 \mapsto \frac{9-6}{3} = \boxed{1}$$
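  • We can check this by hand with NumPy; note that np.std computes the population standard deviation (dividing by $n$) by default, matching the formula above.

x = np.array([1, 7, 7, 9])
# Roughly array([-1.67, 0.33, 0.33, 1.]).
(x - x.mean()) / x.std()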

Pre- and post- standardization¶

  • Before we standardize both axes:
  • After we standardize both axes:
  • After we standardize, our features are measured on the same scales, so they can be compared directly.

Which features are most "important"?¶

  • The most important feature is not necessarily the feature with largest magnitude coefficient, because different features may be on different scales.
In [39]:
coefs.plot(kind='barh', title='Original Coefficients')
In [40]:
sales.iloc[:, 1:]
Out[40]:
sq_ft inventory advertising district_size competing_stores
0 3.0 294 8.2 8.2 11
1 2.2 232 6.9 4.1 12
2 0.5 149 3.0 4.3 15
... ... ... ... ... ...
24 3.5 382 9.8 11.5 5
25 5.1 590 12.0 15.7 0
26 8.6 517 7.0 12.0 8

27 rows × 5 columns

  • Here, 'inventory' values are much larger than 'sq_ft' values, which means that the coefficient for 'inventory' will inherently be smaller, even if 'inventory' is a more important feature than 'sq_ft'.
    Intuition: if the values themselves are larger, you need to multiply them by smaller coefficients to get the same predictions!
  • Solution: If you care about the interpretability of the resulting coefficients, standardize each feature before performing regression.

Example transformer: StandardScaler¶

  • StandardScaler standardizes data using the mean and standard deviation of the data.
$$z(x_i) = \frac{x_i - \bar{x}}{\sigma_x}$$
  • First, we need to import the relevant class from sklearn.preprocessing.
    It's best practice to import just the relevant classes you need from sklearn.
In [41]:
from sklearn.preprocessing import StandardScaler
  • Like an estimator, we need to instantiate and fit our StandardScaler instance before it can transform anything.
    Here, "fitting" the transformer involves computing and saving the mean and SD of each column.
In [42]:
stdscaler = StandardScaler()
In [43]:
# Doesn't work! Need to fit first.
stdscaler.transform(sales.iloc[:, 1:])
---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
Cell In[43], line 2
      1 # Doesn't work! Need to fit first.
----> 2 stdscaler.transform(sales.iloc[:, 1:])

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/utils/_set_output.py:313, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    311 @wraps(f)
    312 def wrapped(self, X, *args, **kwargs):
--> 313     data_to_wrap = f(self, X, *args, **kwargs)
    314     if isinstance(data_to_wrap, tuple):
    315         # only wrap the first output for cross decomposition
    316         return_tuple = (
    317             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    318             *data_to_wrap[1:],
    319         )

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/preprocessing/_data.py:1042, in StandardScaler.transform(self, X, copy)
   1027 def transform(self, X, copy=None):
   1028     """Perform standardization by centering and scaling.
   1029 
   1030     Parameters
   (...)
   1040         Transformed array.
   1041     """
-> 1042     check_is_fitted(self)
   1044     copy = copy if copy is not None else self.copy
   1045     X = self._validate_data(
   1046         X,
   1047         reset=False,
   (...)
   1052         force_all_finite="allow-nan",
   1053     )

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/utils/validation.py:1661, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1658     raise TypeError("%s is not an estimator instance." % (estimator))
   1660 if not _is_fitted(estimator, attributes, all_or_any):
-> 1661     raise NotFittedError(msg % {"name": type(estimator).__name__})

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
In [44]:
# This is like saying "determine the mean and SD of each column in sales, 
# other than the 'net_sales' column".
stdscaler.fit(sales.iloc[:, 1:])
Out[44]:
StandardScaler()
  • Now, we can standardize any dataset, using the mean and standard deviation of the columns in sales.iloc[:, 1:]. Typical usage is to fit the transformer on a sample and use that already-fit transformer to transform future data.
In [45]:
stdscaler.transform([[5, 300, 10, 15, 6]])
Out[45]:
array([[ 0.85, -0.47,  0.51,  1.05, -0.36]])
In [46]:
stdscaler.transform(sales.iloc[:, 1:].tail(5))
Out[46]:
array([[-1.13, -1.31, -1.35, -1.6 ,  0.89],
       [ 0.14,  0.39,  0.4 ,  0.32, -0.36],
       [ 0.09, -0.03,  0.46,  0.36, -0.57],
       [ 0.9 ,  1.08,  1.05,  1.19, -1.61],
       [ 2.67,  0.69, -0.3 ,  0.46,  0.05]])
  • We can peek under the hood and see what it computed!
In [47]:
stdscaler.mean_
Out[47]:
array([  3.33, 387.48,   8.1 ,   9.69,   7.74])
In [48]:
stdscaler.var_
Out[48]:
array([    3.89, 35191.58,    13.72,    25.44,    23.08])
  • If needed, the fit_transform method will fit the transformer and then transform the data in one go.
In [49]:
new_scaler = StandardScaler()
In [50]:
new_scaler.fit_transform(sales.iloc[:, 1:].tail(5))
Out[50]:
array([[-1.33, -1.79, -1.71, -1.88,  1.48],
       [-0.32,  0.28,  0.43,  0.19, -0.05],
       [-0.36, -0.24,  0.49,  0.23, -0.31],
       [ 0.29,  1.11,  1.22,  1.13, -1.58],
       [ 1.71,  0.64, -0.43,  0.34,  0.46]])
  • Why are the values above different from the values in stdscaler.transform(sales.iloc[:, 1:].tail(5))?

Interpreting standardized regression coefficients¶

  • Now that we have a technique for standardizing the feature columns of sales, let's fit a new regression object.
In [51]:
sales_model_std = LinearRegression()
sales_model_std.fit(X=stdscaler.transform(sales.iloc[:, 1:]),
                    y=sales.iloc[:, 0])
Out[51]:
LinearRegression()
  • Let's now look at the resulting coefficients, and compare them to the coefficients before we standardized.
In [52]:
pd.DataFrame().assign(
    column=sales.columns[1:],
    original_coef=sales_model.coef_,
    standardized_coef=sales_model_std.coef_
).set_index('column').plot(kind='barh', barmode='group', title='Standardized and Original Coefficients')
  • Did the performance of the resulting model change?
In [53]:
mean_squared_error(sales.iloc[:, 0],
                   sales_model.predict(sales.iloc[:, 1:]))
Out[53]:
242.27445717154964
In [54]:
mean_squared_error(sales.iloc[:, 0],
                   sales_model_std.predict(stdscaler.transform(sales.iloc[:, 1:])))
Out[54]:
242.27445717154956
  • No!
    The span of the design matrix did not change, so the predictions did not change. It's just the coefficients that changed.

Key takeaways¶

  • The result of standardizing each feature (separately!) is that the units of each feature are on the same scale.
    • There's no need to standardize the outcome ('net_sales' here), since it's not being compared to anything.
    • Also, we can't standardize the column of all 1s.
  • Then, solve the normal equations. The resulting $w_0^*, w_1^*, \ldots, w_d^*$ are called the standardized regression coefficients.
  • Standardized regression coefficients can be directly compared to one another.
  • As we saw on the previous slide, standardizing each feature does not change the MSE of the resulting hypothesis function!

StandardScaler summary¶

  • Initialize with parameters: stdscaler = StandardScaler(). StandardScaler z-scores the data and takes no parameters.
  • Fit the transformer: stdscaler.fit(X). Computes and saves the mean and SD of each column of X.
  • Transform data in a dataset: feat = stdscaler.transform(X_new). z-scores X_new using the mean and SD of X.
  • Fit and transform in one step: stdscaler.fit_transform(X). Computes the mean and SD of X, then z-scores X.

What's next?¶

  • How does OneHotEncoder work?
  • Even though we have a StandardScaler transformer object, to actually standardize our features AND make predictions, we need to:
    • Manually instantiate a StandardScaler object, and then fit it.
    • Create a new design matrix by taking the result of calling transform on the StandardScaler object and concatenating other relevant numerical columns.
    • Manually instantiate a LinearRegression object, and then fit it using the result of the above step.
  • As we build more and more sophisticated models, it will be challenging to keep track of all of these individual steps ourselves.
  • As such, we often build Pipelines.
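  • As a preview, here's a minimal sketch of chaining these steps with sklearn.pipeline's make_pipeline; we'll introduce Pipelines properly next class.

from sklearn.pipeline import make_pipeline

# fit standardizes the features with StandardScaler, then fits the regression on the result.
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X=sales.iloc[:, 1:], y=sales.iloc[:, 0])

# predict applies the same already-fit StandardScaler before making predictions.
pipe.predict(sales.iloc[:, 1:].tail(5))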