In [1]:
from lec_utils import *
def sample_from_pop(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + (np.random.normal(0, 3, size=n))
    return pd.DataFrame({'x': x, 'y': y})
sample_1 = sample_from_pop()

Lecture 18¶

Feature Engineering, Continued¶

EECS 398-003: Practical Data Science, Fall 2024¶

practicaldsc.org • github.com/practicaldsc/fa24

Announcements 📣¶

  • Homework 8 is due on Friday (not today!).
  • Homework 7 solutions are available at #259 on Ed.
  • Check out the new FAQs page on the course website.
    It has answers to frequently-asked theoretical questions.
  • The IA application is out for next semester and is due on Monday! See #238 on Ed for more details.

Come say hi next Thursday!¶

A few other professors and I are hosting a faculty-student panel, where you can learn more about our career (and personal) paths. Come say hi – there will be pizza 🍕!


RSVP here.

Agenda¶

  • Recap: Multiple linear regression and feature engineering.
  • Numerical-to-numerical transformations.
  • The modeling recipe, revisited.
  • OneHotEncoder and multicollinearity.
  • StandardScaler and standardized regression coefficients.

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Recap: Multiple linear regression and feature engineering¶


The general problem¶

  • We have $n$ data points, $\left({ \vec x_1}, {y_1}\right), \left({ \vec x_2}, {y_2}\right), \ldots, \left({ \vec x_n}, {y_n}\right)$,

where each $ \vec x_i$ is a feature vector of $d$ features: $${\vec{x_i}} = \begin{bmatrix} {x^{(1)}_i} \\ {x^{(2)}_i} \\ \vdots \\ {x^{(d)}_i} \end{bmatrix}$$

  • We want to find a good linear hypothesis function:
$$\begin{align*} H({ \vec x}) &= w_0 + w_1 { x^{(1)}} + w_2 { x^{(2)}} + \ldots + w_d { x^{(d)}}\\ &= \vec w \cdot \text{Aug}({ \vec x}) \end{align*}$$

The general solution¶

  • Define the design matrix $ X \in \mathbb{R}^{n \times (d + 1)}$ and observation vector $ \vec y \in \mathbb{R}^n$:
$${ X= \begin{bmatrix} {1} & { x^{(1)}_1} & { x^{(2)}_1} & \dots & { x^{(d)}_1} \\\\ { 1} & { x^{(1)}_2} & { x^{(2)}_2} & \dots & { x^{(d)}_2} \\\\ \vdots & \vdots & \vdots & & \vdots \\\\ { 1} & { x^{(1)}_n} & { x^{(2)}_n} & \dots & { x^{(d)}_n} \end{bmatrix} = \begin{bmatrix} \text{Aug}({\vec{x_1}})^T \\\\ \text{Aug}({\vec{x_2}})^T \\\\ \vdots \\\\ \text{Aug}({\vec{x_n}})^T \end{bmatrix}} \qquad { \vec y = \begin{bmatrix} { y_1} \\ { y_2} \\ \vdots \\ { y_n} \end{bmatrix}}$$
  • Then, solve the normal equations to find the optimal parameter vector, $\vec{w}^*$:

    $${ X^TX} \vec{w}^* = { X^T} { \vec y}$$
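As a quick illustration (a minimal sketch, not from the original slides, using a tiny made-up dataset), we can solve the normal equations directly with numpy:

import numpy as np

# Tiny hypothetical example: n = 4 points, d = 1 feature, plus the intercept column.
X = np.array([[1, 0.0],
              [1, 1.0],
              [1, 2.0],
              [1, 3.0]])
y = np.array([1.0, 3.0, 5.0, 7.0])

# Solve (X^T X) w* = X^T y; np.linalg.solve avoids explicitly inverting X^T X.
w_star = np.linalg.solve(X.T @ X, X.T @ y)
w_star  # array([1., 2.]), since the data lie exactly on y = 1 + 2x.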

The goal of feature engineering¶

  • Feature engineering is the act of finding transformations that transform data into effective quantitative variables.
    Put simply: feature engineering is creating new features using existing features.
  • Example: One hot encoding.
  • Example: Numerical-to-numerical transformations.

Numerical-to-numerical transformations¶


¶

Linearization¶

  • The Tukey Mosteller Bulge Diagram helps us pick which numerical-to-numerical transformations to apply to data in order to linearize it.
    Alternative interpretation: it helps us determine which features to create.
  • Why? We're working with linear models. The more linear our data looks in terms of its features, the better we'll be able to model the data.

Example: Polynomial regression¶

  • Last class, we engineered a new feature by taking the $\log$ of an existing feature; this is a numerical-to-numerical transformation.
    This allowed us to create a curve of best fit.
  • Consider the dataset below.
    sample_1 is defined at the top of this notebook.
In [2]:
px.scatter(sample_1, x='x', y='y')
  • A simple linear regression line isn't sufficient to model the relationship between the two variables.

Example: Polynomial regression¶

  • The scatter plot appears to roughly resemble a degree 3 (cubic) polynomial, so let's try and fit a degree 3 polynomial. This will involve creating a design matrix with quadratic and cubic features.
$$H(x) = w_0 + w_1x + w_2x^2 + w_3x^3$$
In [3]:
X = sample_1[['x']].copy()
X
Out[3]:
x
0 -2.00
1 -1.95
2 -1.90
... ...
97 2.90
98 2.95
99 3.00

100 rows × 1 columns

In [4]:
# Note that X itself is not the design matrix;
# sklearn's LinearRegression object will create the needed design matrix
# by adding a column of 1s to the start of X.
X['x^2'] = X['x'] ** 2
X['x^3'] = X['x'] ** 3
X
Out[4]:
x x^2 x^3
0 -2.00 4.00 -8.00
1 -1.95 3.80 -7.41
2 -1.90 3.61 -6.85
... ... ... ...
97 2.90 8.40 24.36
98 2.95 8.70 25.66
99 3.00 9.00 27.00

100 rows × 3 columns

  • Now, let's fit a LinearRegression model from sklearn and look at the resulting predictions.
In [5]:
from sklearn.linear_model import LinearRegression
In [6]:
model = LinearRegression()
model.fit(X=X, y=sample_1['y'])
Out[6]:
LinearRegression()
In [7]:
model.predict(X)
Out[7]:
array([-9.72, -9.03, -8.38, ..., 24.1 , 25.47, 26.89])
In [8]:
fig = px.scatter(sample_1, x='x', y='y')
fig.add_trace(go.Scatter(
    x=X['x'],
    y=model.predict(X),
    mode='lines',
    line=dict(width=5),
    name='Degree 3 Polynomial of Best Fit'
))
  • The orange curve above is of the form:
$$H^*(x) = -0.52 - 0.31x - 0.21x^2 + 1.12x^3$$
In [9]:
model.intercept_
Out[9]:
-0.5198355347237897
In [10]:
model.coef_
Out[10]:
array([-0.31, -0.21,  1.12])
  • While the curve is non-linear, it is linear in the parameters.
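As an aside (not shown in lecture), sklearn's PolynomialFeatures transformer can build the quadratic and cubic columns for us instead of us adding them by hand. Here's a sketch, assuming sample_1 from above:

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# degree=3 creates x, x^2, and x^3; include_bias=False omits the column of 1s,
# since LinearRegression adds the intercept on its own.
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(sample_1[['x']])

model_poly = LinearRegression()
model_poly.fit(X_poly, sample_1['y'])
model_poly.intercept_, model_poly.coef_  # Should match the coefficients we found above.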

Reference Slide¶

Example: Amdahl's Law¶

  • Amdahl's Law relates the runtime of a program on $p$ processors to the time to do the sequential and nonsequential parts on one processor.
$$\displaystyle H(p) = t_\text{S} + \frac{t_\text{NS}}{p}$$
  • Collect data by timing a program with varying numbers of processors:
Processors Time (Hours)
1 8
2 4
4 3

  • To find $w_0^*$ and $w_1^*$ in the hypothesis function $H(x) = w_0 + w_1 \cdot \frac{1}{x}$, we need to create an appropriate design matrix:
$$X = \begin{bmatrix} 1 & \frac{1}{1} \\ 1 & \frac{1}{2} \\ 1 & \frac{1}{4} \end{bmatrix}$$
  • Then, the problem reduces to finding the parameter vector, $\vec{w}^*$, that solves the normal equations, $X^TX \vec{w}^* = X^T \vec{y}$.
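A rough sketch of this fit in code (assuming only the three timings from the table above):

import pandas as pd
from sklearn.linear_model import LinearRegression

timings = pd.DataFrame({'processors': [1, 2, 4], 'time_hours': [8, 4, 3]})

# The single engineered feature is 1/p; LinearRegression supplies the intercept,
# which is our estimate of t_S.
model_amdahl = LinearRegression()
model_amdahl.fit(1 / timings[['processors']], timings['time_hours'])
model_amdahl.intercept_, model_amdahl.coef_  # Estimates of t_S and t_NS, respectively.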

Question 🤔 (Answer at practicaldsc.org/q)

Which hypothesis function is not linear in the parameters?

  • A. $H(\vec{x}) = w_1 (x^{(1)} x^{(2)}) + \frac{w_2}{x^{(1)}} \sin \left( x^{(2)} \right)$
  • B. $H(\vec{x}) = 2^{w_1} x^{(1)}$
  • C. $H(\vec{x}) = \vec{w} \cdot \text{Aug}(\vec{x})$
  • D. $H(\vec{x}) = w_1 \cos (x^{(1)}) + w_2 2^{x^{(2)} \log x^{(3)}}$
  • E. More than one of the above.

How do we fit hypothesis functions that aren't linear in the parameters?¶

  • Suppose we want to fit the hypothesis function:
$$H(x) = w_0 e^{w_1 x}$$
  • This is not linear in terms of $w_0$ and $w_1$, so our results for linear regression don't apply.
  • Possible solution: Try to transform the above equation so that it is linear in some other parameters, by applying an operation to both sides.
$$H(x) = w_0 e^{w_1 x}$$
  • Suppose we take the $\log$ of both sides of the equation.
$$\log H(x) = \log (w_0 e^{w_1x})$$
  • Then, using properties of logarithms, we have:
$$\log H(x) = \underbrace{\log(w_0)}_{\text{this is just a constant!}} + w_1 x$$
  • Solution: Create a new hypothesis function, $T(x)$, with parameters $b_0$ and $b_1$, where $T(x) = b_0 + b_1 x$.
  • This hypothesis function is related to $H(x)$ by the relationship $T(x) = \log H(x)$.
  • $\vec{b}$ is related to $\vec{w}$ by $b_0 = \log w_0$ and $b_1 = w_1$.
  • Our new observation vector, $\vec{z}$, is $\begin{bmatrix} \log y_1 \\ \log y_2 \\ ... \\ \log y_n \end{bmatrix}$.
  • $T(x) = b_0 + b_1x$ is linear in its parameters, $b_0$ and $b_1$.
  • Use the solution to the normal equations to find $\vec{b}^*$, and the relationship between $\vec{b}$ and $\vec{w}$ to find $\vec{w}^*$.
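Here's a hedged end-to-end sketch of this procedure on synthetic data (the data-generating constants 2 and 0.5 are made up purely for illustration):

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Hypothetical data drawn from y ≈ 2 e^{0.5 x}, with multiplicative noise.
rng = np.random.default_rng(398)
x = np.linspace(0, 4, 50)
y = 2 * np.exp(0.5 * x) * np.exp(rng.normal(0, 0.1, size=50))
exp_data = pd.DataFrame({'x': x, 'y': y})

# Step 1: Fit T(x) = b_0 + b_1 x to the transformed observations z_i = log(y_i).
log_model = LinearRegression()
log_model.fit(exp_data[['x']], np.log(exp_data['y']))

# Step 2: Undo the transformation: w_0* = e^{b_0*} and w_1* = b_1*.
w0_star = np.exp(log_model.intercept_)
w1_star = log_model.coef_[0]
w0_star, w1_star  # Should be close to 2 and 0.5.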

The modeling recipe, revisited¶


The original modeling recipe, from Lecture 14¶

  1. Choose a model.
  2. Choose a loss function.
  3. Minimize average loss to find optimal model parameters.

The updated modeling recipe¶

  1. Create, or engineer, features to best reflect the "meaning" behind data.
    Recently, we've done this with one hot encoding and numerical-to-numerical transformations.
  2. Choose a model.
    Recently, we've used the simple/multiple linear regression model.
  3. Choose a loss function.
    Recently, we've mostly used squared loss.
  4. Minimize average loss (empirical risk) to find optimal model parameters, $\vec{w}^*$.
    Originally, we had to use calculus or linear algebra to minimize empirical risk, but more recently we've just used model.fit. This step is also called fitting the model to the data.
  5. Evaluate the performance of the model in relation to other models, using MSE or $R^2$.
  • We can do all of the above directly in sklearn!

preprocessing and linear_models¶

  • For the feature engineering step of the modeling pipeline, we will use sklearn's preprocessing module.
  • For the model creation step of the modeling pipeline, we will use sklearn's linear_model module, as we've already seen. linear_model.LinearRegression is an example of an estimator class.

Transformer classes¶

  • Transformers take in "raw" data and output "processed" data. They are used for creating features.
  • Transformers, like most of the tools we use from sklearn, are classes, not functions, meaning you need to instantiate them and call their methods.
  • Today, we'll introduce two transformer classes. We'll look at how to write code to use each one, but also discuss some of the underlying statistical nuances.
  • Next class, we'll see how to chain transformers and estimators together into larger Pipelines.

OneHotEncoder and multicollinearity¶


Example: Commute times 🚗¶

  • For this first example, we'll continue working with our commute times dataset.
In [11]:
df = pd.read_csv('data/commute-times.csv')
df['day_of_month'] = pd.to_datetime(df['date']).dt.day
df['month'] = pd.to_datetime(df['date']).dt.month_name()
df.head()
Out[11]:
date day home_departure_time home_departure_mileage ... work_departure_time_hr mileage_to_home day_of_month month
0 5/15/2023 Mon 2023-05-15 10:49:00 15873.0 ... 17.17 53.0 15 May
1 5/16/2023 Tue 2023-05-16 07:45:00 15979.0 ... NaN NaN 16 May
2 5/22/2023 Mon 2023-05-22 08:27:00 50407.0 ... 15.90 54.0 22 May
3 5/23/2023 Tue 2023-05-23 07:08:00 50535.0 ... NaN NaN 23 May
4 5/30/2023 Tue 2023-05-30 09:09:00 50664.0 ... 17.12 54.0 30 May

5 rows × 20 columns

  • We'll focus specifically on the 'day' and 'month' columns.
In [12]:
df[['day', 'month']]
Out[12]:
day month
0 Mon May
1 Tue May
2 Mon May
... ... ...
62 Mon March
63 Tue March
64 Thu March

65 rows × 2 columns

Example transformer: OneHotEncoder¶

  • Last class, we had to manually one hot encode the 'day' column. Let's figure out how to one hot encode it automatically, along with the new 'month' column.
In [13]:
df[['day', 'month']]
Out[13]:
day month
0 Mon May
1 Tue May
2 Mon May
... ... ...
62 Mon March
63 Tue March
64 Thu March

65 rows × 2 columns

  • First, we need to import the relevant class from sklearn.preprocessing.
    It's best practice to import just the relevant classes you need from sklearn.
In [14]:
from sklearn.preprocessing import OneHotEncoder
  • Like an estimator, we need to instantiate and fit our OneHotEncoder instance before it can transform anything.
In [15]:
ohe = OneHotEncoder()
In [16]:
# Error!
ohe.transform(df[['day', 'month']])
---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
Cell In[16], line 2
      1 # Error!
----> 2 ohe.transform(df[['day', 'month']])

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/utils/_set_output.py:313, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
    311 @wraps(f)
    312 def wrapped(self, X, *args, **kwargs):
--> 313     data_to_wrap = f(self, X, *args, **kwargs)
    314     if isinstance(data_to_wrap, tuple):
    315         # only wrap the first output for cross decomposition
    316         return_tuple = (
    317             _wrap_data_with_container(method, data_to_wrap[0], X, self),
    318             *data_to_wrap[1:],
    319         )

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:1008, in OneHotEncoder.transform(self, X)
    985 def transform(self, X):
    986     """
    987     Transform X using one-hot encoding.
    988 
   (...)
   1006         returned.
   1007     """
-> 1008     check_is_fitted(self)
   1009     transform_output = _get_output_config("transform", estimator=self)["dense"]
   1010     if transform_output != "default" and self.sparse_output:

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/utils/validation.py:1661, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1658     raise TypeError("%s is not an estimator instance." % (estimator))
   1660 if not _is_fitted(estimator, attributes, all_or_any):
-> 1661     raise NotFittedError(msg % {"name": type(estimator).__name__})

NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
In [17]:
# Need to fit first.
ohe.fit(df[['day', 'month']])
Out[17]:
OneHotEncoder()
  • Once we've fit the encoder, using the transform method gives a result we might not expect.
In [18]:
ohe.transform(df[['day', 'month']])
Out[18]:
<65x16 sparse matrix of type '<class 'numpy.float64'>'
	with 130 stored elements in Compressed Sparse Row format>
  • Since the resulting matrix is sparse – most of its elements are 0 – sklearn uses a more efficient representation than a regular numpy array. We can convert to a regular (dense) array:
In [19]:
ohe.transform(df[['day', 'month']]).toarray()
Out[19]:
array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])
  • The column names from df[['day', 'month']] don't appear in the output above. We can use the get_feature_names_out method on ohe to access the names and order of the one hot encoded columns, though:
In [20]:
ohe.get_feature_names_out()
Out[20]:
array(['day_Fri', 'day_Mon', 'day_Thu', 'day_Tue', 'day_Wed',
       'month_August', 'month_December', 'month_February',
       'month_January', 'month_July', 'month_June', 'month_March',
       'month_May', 'month_November', 'month_October', 'month_September'],
      dtype=object)
In [21]:
pd.DataFrame(ohe.transform(df[['day', 'month']]).toarray(), 
             columns=ohe.get_feature_names_out()) # If we need a DataFrame back, for some reason.
Out[21]:
day_Fri day_Mon day_Thu day_Tue ... month_May month_November month_October month_September
0 0.0 1.0 0.0 0.0 ... 1.0 0.0 0.0 0.0
1 0.0 0.0 0.0 1.0 ... 1.0 0.0 0.0 0.0
2 0.0 1.0 0.0 0.0 ... 1.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ...
62 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
63 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0
64 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0

65 rows × 16 columns

  • Usually, we won't perform all of these intermediate steps, since the OneHotEncoder will be part of a larger Pipeline.

Example: Heights and weights¶

  • We now know how to use OneHotEncoder.
  • To illustrate a mathematical issue involving one hot encoding, let's load in another dataset, this time containing the weights and heights of 25,000 18-year-olds, taken from here.
In [22]:
people = pd.read_csv('data/heights-weights.csv').drop(columns=['Index'])
people.head()
Out[22]:
Height (Inches) Weight (Pounds)
0 65.78 112.99
1 71.52 136.49
2 69.40 153.03
3 68.22 142.34
4 67.79 144.30
In [23]:
people.plot(kind='scatter', x='Height (Inches)', y='Weight (Pounds)', 
            title='Weight vs. Height for 25,000 18 Year Olds')

Motivating example¶

  • Suppose we fit a simple linear regression model that uses height in inches to predict weight in pounds.
$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)}$$
In [24]:
X = people[['Height (Inches)']]
y = people['Weight (Pounds)']
In [25]:
people_one_feat = LinearRegression()
people_one_feat.fit(X, y)
Out[25]:
LinearRegression()
  • $w_0^*$ and $w_1^*$ are shown below, along with the model's MSE on the data we used to train it.
    We call this the model's training MSE.
In [26]:
people_one_feat.intercept_, people_one_feat.coef_
Out[26]:
(-82.57574306454093, array([3.08]))
In [27]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y, people_one_feat.predict(X))
Out[27]:
101.58853248632849

An added feature¶

  • Now, suppose we fit another regression model that uses height in inches AND height in centimeters to predict weight.
$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)} + w_2 \cdot \text{height (cm)}$$
In [28]:
people['Height (cm)'] = people['Height (Inches)'] * 2.54 # 1 inch = 2.54 cm.
In [29]:
X2 = people[['Height (Inches)', 'Height (cm)']]
In [30]:
people_two_feat = LinearRegression()
people_two_feat.fit(X2, y)
Out[30]:
LinearRegression()
  • What are $w_0^*$, $w_1^*$, $w_2^*$, and the model's MSE?
In [31]:
people_two_feat.intercept_, people_two_feat.coef_
Out[31]:
(-82.57525639659859, array([ 3.38e+10, -1.33e+10]))
In [32]:
mean_squared_error(y, people_two_feat.predict(X2))
Out[32]:
101.58853251653949
  • Observation: The intercept is the same as before (roughly -82.57), as is the MSE. However, the coefficients on 'Height (Inches)' and 'Height (cm)' are massive in size!
  • It should be unsurprising that the MSE is the same, because the span of the design matrix is the same. So, the best predictions should be the same, too.
  • But what's going on with the coefficients?

Redundant features¶

  • Let's use simpler numbers for illustration. Suppose in the first model, $w_0^* = -80$ and $w_1^* = 3$.
$$\text{predicted weight (pounds)} = -80 + 3 \cdot \text{height (inches)}$$
  • In the second model, we have:
$$\begin{align*}\text{predicted weight (pounds)} &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \text{height (cm)} \\ &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \big( 2.54 \cdot \text{height (inches)} \big) \\ &= w_0^* + \left(w_1^* + 2.54 \cdot w_2^* \right) \cdot \text{height (inches)} \end{align*}$$
  • In the first model, we already found the "best" intercept ($-80$) and slope ($3$) in a linear model that uses height in inches to predict weight.
  • So, as long as $w_1^* + 2.54 \cdot w_2^* = 3$ in the second model, the second model's predictions will be the same as the first, and hence they will also minimize MSE.

Infinitely many parameter choices¶

  • Issue: There are infinitely many choices of $w_1^*$ and $w_2^*$ that satisfy $w_1^* + 2.54 \cdot w_2^* = 3$!
$$\text{predicted weight} = -80 - 10 \cdot \text{height (inches)} + \frac{13}{2.54} \cdot \text{height (cm)}$$
$$\text{predicted weight} = -80 + 10 \cdot \text{height (inches)} - \frac{7}{2.54} \cdot \text{height (cm)}$$
  • The two hypothesis functions look very different, but they make the same predictions.
  • model.coef_ could return either set of coefficients, or any of the infinitely many other options.
  • But neither set of coefficients has any meaning!
In [33]:
(-80 - 10 * people.iloc[:, 0] + (13 / 2.54) * people.iloc[:, 2]).head()
Out[33]:
0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64
In [34]:
(-80 + 10 * people.iloc[:, 0] - (7 / 2.54) * people.iloc[:, 2]).head()
Out[34]:
0    117.35
1    134.55
2    128.20
3    124.65
4    123.36
dtype: float64

Multicollinearity¶

  • Multicollinearity occurs when features in a regression model are highly correlated with one another.
    In other words, multicollinearity occurs when a feature can be predicted fairly accurately using a linear combination of the other features (see the short numerical check after this list).
  • When multicollinearity is present in the features, the coefficients in the model are uninterpretable – they have no meaning.
    A "slope" represents "the rate of change of $y$ with respect to a feature", when all other features are held constant – but if there's multicollinearity, you can't hold other features constant.
  • Note: Multicollinearity doesn't impact a model's predictions!
    • It doesn't impact a model's ability to generalize to unseen data.
    • If features are multicollinear in the data we've seen, they will probably be multicollinear in the data we haven't seen, drawn from the same distribution.
  • Solutions:
    • Manually remove highly correlated features.
    • Use a dimensionality reduction technique (such as PCA) to automatically reduce dimensions.
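As a small numerical check (not from the slides), we can see the problem directly: when one feature is an exact multiple of another, $X^TX$ loses rank, so the normal equations no longer pin down a unique $\vec{w}^*$. Here's a sketch using the first five heights from the people table above:

import numpy as np

inches = np.array([65.78, 71.52, 69.40, 68.22, 67.79])
cm = inches * 2.54  # Exactly determined by the first feature.

# Design matrix with an intercept column, height in inches, and height in cm.
X = np.column_stack([np.ones(5), inches, cm])

np.linalg.matrix_rank(X.T @ X)  # 2, not 3: X^T X is singular, so w* is not unique.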

One hot encoding and multicollinearity¶

  • One hot encoding will result in multicollinearity unless you drop one of the one hot encoded features.
  • Suppose we have the following fitted model:
    For illustration, assume 'weekend' was originally a categorical feature with two possible values, 'Yes' or 'No'.
$$ \begin{aligned} H(x) = 1 + 2 \cdot (\text{weekend==Yes}) - 2 \cdot (\text{weekend==No}) \end{aligned} $$
  • This is equivalent to:
$$ \begin{aligned} H(x) = 10 - 7 \cdot (\text{weekend==Yes}) - 11 \cdot (\text{weekend==No}) \end{aligned} $$
  • Note that for a particular row in the dataset, $\text{weekend==Yes} + \text{weekend==No}$ is always equal to 1.
  • This means that the columns of the design matrix, $X$, for this model are not linearly independent, since the column of all 1s can be written as a linear combination of the $\text{weekend==Yes}$ and $\text{weekend==No}$ columns. (The quick check after this list shows the same thing for our one hot encoded day columns.)
  • This means that the design matrix is not full rank, which means that $X^TX$ is not invertible.
  • This means that there are infinitely many possible solutions $\vec{w}^*$ to the normal equations, $(X^TX) \vec{w} = X^T\vec{y}$!
    That's a problem, because we don't know which of these infinitely many solutions model.coef_ will find for us, and it's impossible to interpret the resulting coefficients, as we saw on the last slide.
  • Solution: Drop one of the one hot encoded columns. OneHotEncoder has an option to do this.
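A quick sanity check of the redundancy described above, using the ohe we fit earlier (per get_feature_names_out, the first five output columns are the day_ columns):

# Each row has exactly one day, so the five one hot encoded day columns always sum to 1,
# duplicating the intercept column of the design matrix.
day_columns = ohe.transform(df[['day', 'month']]).toarray()[:, :5]
day_columns.sum(axis=1)  # An array of all 1s.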

OneHotEncoder returns¶

  • Let's switch back to the commute times dataset, df.
In [35]:
df[['day', 'month']]
Out[35]:
day month
0 Mon May
1 Tue May
2 Mon May
... ... ...
62 Mon March
63 Tue March
64 Thu March

65 rows × 2 columns

  • Let's try using drop='first' when instantiating a OneHotEncoder.
In [36]:
ohe_drop_one = OneHotEncoder(drop='first')
In [37]:
ohe_drop_one.fit(df[['day', 'month']])
Out[37]:
OneHotEncoder(drop='first')
  • How many features did the resulting transformer create?
In [38]:
len(ohe_drop_one.get_feature_names_out())
Out[38]:
14
  • Where did this number come from?
In [39]:
df['day'].nunique()
Out[39]:
5
In [40]:
df['month'].nunique()
Out[40]:
11

Key takeaways¶

  • Multicollinearity is present in a linear model when one feature can be accurately predicted using one or more other features.
    In other words, it is present when a feature is redundant.
  • Multicollinearity doesn't pose an issue for prediction; it doesn't hinder a model's ability to generalize. Instead, it renders the coefficients of a linear model meaningless.

StandardScaler and standardized regression coefficients¶


Example: Predicting sales 📈¶

  • To illustrate the next transformer class, we'll introduce a new dataset.
In [41]:
sales = pd.read_csv('data/sales.csv')
sales.head()
Out[41]:
net_sales sq_ft inventory advertising district_size competing_stores
0 231.0 3.0 294 8.2 8.2 11
1 156.0 2.2 232 6.9 4.1 12
2 10.0 0.5 149 3.0 4.3 15
3 519.0 5.5 600 12.0 16.1 1
4 437.0 4.4 567 10.6 14.1 5
  • For each of 27 stores, we have:
    • net sales,
    • square feet,
    • inventory,
    • advertising expenditure,
    • district size, and
    • number of competing stores.
  • Our goal is to predict 'net_sales' as a function of other features.

An initial model¶

In [42]:
sales.head()
Out[42]:
net_sales sq_ft inventory advertising district_size competing_stores
0 231.0 3.0 294 8.2 8.2 11
1 156.0 2.2 232 6.9 4.1 12
2 10.0 0.5 149 3.0 4.3 15
3 519.0 5.5 600 12.0 16.1 1
4 437.0 4.4 567 10.6 14.1 5
  • No transformations are needed to predict 'net_sales'.
In [43]:
sales_model = LinearRegression()
sales_model.fit(X=sales.iloc[:, 1:], y=sales.iloc[:, 0])
Out[43]:
LinearRegression()
  • Suppose we're interested in learning how the various features impact 'net_sales', rather than just predicting 'net_sales' for a new store. We'd then look at the coefficients.
In [44]:
sales_model.coef_
Out[44]:
array([16.2 ,  0.17, 11.53, 13.58, -5.31])
In [45]:
pd.DataFrame().assign(
    column=sales.columns[1:],
    original_coef=sales_model.coef_,
).set_index('column')
Out[45]:
original_coef
column
sq_ft 16.20
inventory 0.17
advertising 11.53
district_size 13.58
competing_stores -5.31
  • What do you notice?
In [46]:
sales.iloc[:, 1:]
Out[46]:
sq_ft inventory advertising district_size competing_stores
0 3.0 294 8.2 8.2 11
1 2.2 232 6.9 4.1 12
2 0.5 149 3.0 4.3 15
... ... ... ... ... ...
24 3.5 382 9.8 11.5 5
25 5.1 590 12.0 15.7 0
26 8.6 517 7.0 12.0 8

27 rows × 5 columns

Which features are most "important"?¶

  • The most important feature is not necessarily the feature with the largest-magnitude coefficient, because different features may be on different scales.
  • Suppose I fit two hypothesis functions:
    • $H_1$ has store size measured in square feet.
    • $H_2$ has store size measured in square meters.
  • Store size is just as important in both hypothesis functions.
  • But 1 square meter $\approx 10.76$ square feet, so the sizes in square meters will be 10.76x smaller.
  • So, the coefficient of store size in $H_2$ will be 10.76 times larger than the coefficient of store size in $H_1$.
    Intuition: if the values themselves are smaller, you need to multiply them by bigger coefficients to get the same predictions!
  • Solution: If you care about the interpretability of the resulting coefficients, standardize each feature before performing regression.
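To make the units issue concrete, here's a quick sketch (not from the slides) that refits the sales model with store size converted from square feet to square meters; only the size coefficient changes, growing by roughly the 10.76 conversion factor:

from sklearn.linear_model import LinearRegression

sales_features_m2 = sales.iloc[:, 1:].copy()
sales_features_m2['sq_ft'] = sales_features_m2['sq_ft'] / 10.76  # Now roughly square meters.

sales_model_m2 = LinearRegression()
sales_model_m2.fit(sales_features_m2, sales.iloc[:, 0])
sales_model_m2.coef_  # The first coefficient is about 10.76x larger; the rest are unchanged.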

Standardization¶

  • Recall: to standardize a feature $x_1, x_2, ..., x_n$, we use the formula: $$z(x_i) = \frac{x_i - \bar{x}}{\sigma_x}$$
  • Example: 1, 7, 7, 9.

    • Mean: $\frac{1 + 7 + 7 + 9}{4} = \frac{24}{4} = 6$.
    • Standard deviation:

    $$\text{SD} = \sqrt{\frac{1}{4} \left( (1-6)^2 + (7-6)^2 + (7-6)^2 + (9-6)^2 \right)} = \sqrt{\frac{1}{4} \cdot 36} = 3$$

    • Standardized data:

    $$1 \mapsto \frac{1-6}{3} = \boxed{-\frac{5}{3}} \qquad 7 \mapsto \frac{7-6}{3} = \boxed{\frac{1}{3}} \qquad 7 \mapsto \boxed{\frac{1}{3}} \qquad 9 \mapsto \frac{9-6}{3} = \boxed{1}$$
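A quick check of this arithmetic in numpy (np.std divides by $n$ by default, which matches the standard deviation StandardScaler uses):

import numpy as np

data = np.array([1, 7, 7, 9])
(data - data.mean()) / data.std()  # array([-1.67, 0.33, 0.33, 1.]), i.e. -5/3, 1/3, 1/3, 1.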

Example transformer: StandardScaler¶

  • StandardScaler standardizes data using the mean and standard deviation of the data, as described on the previous slide.
  • Like OneHotEncoder, StandardScaler requires some knowledge (the mean and SD) of the dataset before transforming, so we need to fit a StandardScaler transformer before we can use its transform method.
In [47]:
from sklearn.preprocessing import StandardScaler
stdscaler = StandardScaler()
In [48]:
# This is like saying "determine the mean and SD of each column in sales, 
# other than the 'net_sales' column".
stdscaler.fit(sales.iloc[:, 1:])
Out[48]:
StandardScaler()
  • Now, we can standardize any dataset, using the mean and standard deviation of the columns in sales.iloc[:, 1:]. Typical usage is to fit the transformer on one dataset and then use that already-fit transformer to transform future data.
In [49]:
stdscaler.transform([[5, 300, 10, 15, 6]])
/Users/surajrampure/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/base.py:493: UserWarning:

X does not have valid feature names, but StandardScaler was fitted with feature names

Out[49]:
array([[ 0.85, -0.47,  0.51,  1.05, -0.36]])
In [50]:
stdscaler.transform(sales.iloc[:, 1:].tail(5))
Out[50]:
array([[-1.13, -1.31, -1.35, -1.6 ,  0.89],
       [ 0.14,  0.39,  0.4 ,  0.32, -0.36],
       [ 0.09, -0.03,  0.46,  0.36, -0.57],
       [ 0.9 ,  1.08,  1.05,  1.19, -1.61],
       [ 2.67,  0.69, -0.3 ,  0.46,  0.05]])
  • We can peek under the hood and see what it computed!
In [51]:
stdscaler.mean_
Out[51]:
array([  3.33, 387.48,   8.1 ,   9.69,   7.74])
In [52]:
stdscaler.var_
Out[52]:
array([    3.89, 35191.58,    13.72,    25.44,    23.08])
  • If needed, the fit_transform method will fit the transformer and then transform the data in one go.
In [53]:
new_scaler = StandardScaler()
In [54]:
new_scaler.fit_transform(sales.iloc[:, 1:].tail(5))
Out[54]:
array([[-1.33, -1.79, -1.71, -1.88,  1.48],
       [-0.32,  0.28,  0.43,  0.19, -0.05],
       [-0.36, -0.24,  0.49,  0.23, -0.31],
       [ 0.29,  1.11,  1.22,  1.13, -1.58],
       [ 1.71,  0.64, -0.43,  0.34,  0.46]])
  • Why are the values above different from the values in stdscaler.transform(sales.iloc[:, 1:].tail(5))?

Interpreting standardized regression coefficients¶

  • Now that we have a technique for standardizing the feature columns of sales, let's fit a new regression object.
In [55]:
sales_model_std = LinearRegression()
sales_model_std.fit(X=stdscaler.transform(sales.iloc[:, 1:]),
                    y=sales.iloc[:, 0])
Out[55]:
LinearRegression()
  • Let's now look at the resulting coefficients, and compare them to the coefficients before we standardized.
In [56]:
pd.DataFrame().assign(
    column=sales.columns[1:],
    original_coef=sales_model.coef_,
    standardized_coef=sales_model_std.coef_
).set_index('column')
Out[56]:
original_coef standardized_coef
column
sq_ft 16.20 31.97
inventory 0.17 32.76
advertising 11.53 42.69
district_size 13.58 68.50
competing_stores -5.31 -25.52
  • Did the performance of the resulting model change?
In [57]:
mean_squared_error(sales.iloc[:, 0],
                   sales_model.predict(sales.iloc[:, 1:]))
Out[57]:
242.27445717154964
In [58]:
mean_squared_error(sales.iloc[:, 0],
                   sales_model_std.predict(stdscaler.transform(sales.iloc[:, 1:])))
Out[58]:
242.27445717154956
  • No!
    The span of the design matrix did not change, so the predictions did not change. It's just the coefficients that changed.

Key takeaways¶

  • The result of standardizing each feature (separately!) is that the units of each feature are on the same scale.
    • There's no need to standardize the outcome ('net_sales' here), since it's not being compared to anything.
    • Also, we can't standardize the column of all 1s.
  • Then, solve the normal equations. The resulting $w_0^*, w_1^*, \ldots, w_d^*$ are called the standardized regression coefficients.
  • Standardized regression coefficients can be directly compared to one another.
  • As we saw on the previous slide, standardizing each feature does not change the MSE of the resulting hypothesis function!

StandardScaler summary¶

Property | Example | Description
Initialize with parameters | stdscaler = StandardScaler() | z-score the data (no parameters)
Fit the transformer | stdscaler.fit(X) | Compute the mean and SD of X
Transform data in a dataset | feat = stdscaler.transform(X_new) | z-score X_new with the mean and SD of X
Fit and transform | stdscaler.fit_transform(X) | Compute the mean and SD of X, then z-score X

What's next?¶

  • Even though we have a OneHotEncoder transformer object, to actually use one hot encoding to make predictions, we need to:
    • Manually instantiate a OneHotEncoder object, and then fit it.
    • Create a new design matrix by taking the result of calling transform on the OneHotEncoder object and concatenating other relevant numerical columns.
    • Manually instantiate a LinearRegression object, and then fit it using the result of the above step.
  • As we build more and more sophisticated models, it will be challenging to keep track of all of these individual steps ourselves.
  • As such, we often build Pipelines.
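As a preview of next class, here's a hedged sketch of what that looks like: a Pipeline bundles a transformer and an estimator into one object whose fit and predict handle both steps, shown here with the sales data and StandardScaler since we just used them:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# One object that standardizes the features, then fits a linear model.
sales_pipeline = make_pipeline(StandardScaler(), LinearRegression())
sales_pipeline.fit(sales.iloc[:, 1:], sales.iloc[:, 0])

# Predictions (and hence the MSE) match the two-step approach from earlier.
sales_pipeline.predict(sales.iloc[:, 1:])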