from lec_utils import *
Lecture 14 Supplementary Notebook
Regression using Linear Algebra
EECS 398: Practical Data Science, Winter 2025
practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed
Note: This notebook is only a supplement to the main lecture slides, which are in PDF form.
The main lecture slides can be found at practicaldsc.org under Lecture 14. After the live lecture, an annotated version of the slides will be made available as well.
Understanding the Data
Let's load in the commute times dataset as a DataFrame.
df = pd.read_csv('data/commute-times.csv')
df.head()
|   | date | day | home_departure_time | home_departure_mileage | ... | mileage_to_work | minutes_to_home | work_departure_time_hr | mileage_to_home |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 5/15/2023 | Mon | 2023-05-15 10:49:00 | 15873.0 | ... | 53.0 | 72.0 | 17.17 | 53.0 |
| 1 | 5/16/2023 | Tue | 2023-05-16 07:45:00 | 15979.0 | ... | 54.0 | NaN | NaN | NaN |
| 2 | 5/22/2023 | Mon | 2023-05-22 08:27:00 | 50407.0 | ... | 54.0 | 82.0 | 15.90 | 54.0 |
| 3 | 5/23/2023 | Tue | 2023-05-23 07:08:00 | 50535.0 | ... | 54.0 | NaN | NaN | NaN |
| 4 | 5/30/2023 | Tue | 2023-05-30 09:09:00 | 50664.0 | ... | 54.0 | 76.0 | 17.12 | 54.0 |
5 rows × 18 columns
There are many columns in here, but the only ones we're interested in for now are `'departure_hour'` and `'minutes'`.
df[['departure_hour', 'minutes']]
|   | departure_hour | minutes |
|---|---|---|
| 0 | 10.82 | 68.0 |
| 1 | 7.75 | 94.0 |
| 2 | 8.45 | 63.0 |
| ... | ... | ... |
| 62 | 7.58 | 68.0 |
| 63 | 7.45 | 90.0 |
| 64 | 7.60 | 83.0 |
65 rows × 2 columns
fig = px.scatter(df,
x='departure_hour',
y='minutes',
size=np.ones(len(df)) * 50,
size_max=8)
fig.update_xaxes(title='Home Departure Time (AM)')
fig.update_yaxes(title='Minutes')
fig.update_layout(title='Commuting Time vs. Home Departure Time')
fig.update_layout(width=700)
Implementing $w_0^*$ and $w_1^*$
Let's implement the formulas for the best slope, $w_1^*$, and intercept, $w_0^*$, we found in the lecture slides:
$$\begin{align*} w_1^* &= \frac{ \displaystyle \sum_{i=1}^n (x_i - \bar x)(y_i - \bar y) }{ \displaystyle \sum_{i=1}^n (x_i - \bar x)^2 } \qquad \qquad w_0^* = \bar y - w_1^* \bar x \end{align*}$$

def slope(x, y):
# Assume x and y are two Series.
numerator = ((x - np.mean(x)) * (y - np.mean(y))).sum()
denominator = ((x - np.mean(x)) ** 2).sum()
return numerator / denominator
def intercept(x, y):
return y.mean() - slope(x, y) * x.mean()
w1_star = slope(df['departure_hour'], df['minutes'])
w1_star
-8.186941724265552
w0_star = intercept(df['departure_hour'], df['minutes'])
w0_star
142.4482415877287
The results above tell us that the linear hypothesis function with the lowest mean squared error on our dataset is:
$$\text{predicted commute time (minutes)} = 142.45 - 8.19 \cdot \text{departure hour}$$

We can use it to make predictions:
def predict_commute(x_new):
return w0_star + w1_star * x_new
What if I leave at 8AM? 10:45AM?
predict_commute(8)
76.95270779360428
predict_commute(10 + 45 / 60)
54.438618051874016
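Since `predict_commute` is just arithmetic, numpy broadcasting means it also works on a whole array (or Series) of departure times at once. A quick sketch, using an illustrative pair of times:

# Hypothetical example: predictions for several departure times at once,
# thanks to numpy broadcasting.
predict_commute(np.array([8, 10 + 45 / 60]))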
What do all of our predictions look like?
hline = px.line(x=[5.5, 11.5], y=[predict_commute(5.5), predict_commute(11.5)]).update_traces(line={'color': 'red', 'width': 4})
fline1 = go.Figure(fig.data + hline.data)
fline1.update_xaxes(title='Home Departure Time (AM)')
fline1.update_yaxes(title='Minutes')
fline1.update_layout(title='<span style="color:red">Predicted Commute Time</span> = 142.45 - 8.19 * Departure Hour')
fline1.update_layout(width=700, margin={'t': 60})
Aside: What does $R_{\text{sq}}(w_0, w_1)$ look like?
Let's draw a plot of $R_{\text{sq}}(w_0, w_1)$, the empirical risk that we're trying to minimize.
- When we only had a single parameter, $h$, $R(h)$ was in 2D.
- One axis for $h$, one axis for $R(h)$.
- Now that we have two parameters, $w_0$ and $w_1$, $R(w_0, w_1)$ will be in 3D!
- One axis for $w_0$, one axis for $w_1$, one axis for $R(w_0, w_1)$.
- The bottom plane consists of all possible combinations of slope and intercept.
- The height of the function above any pair of points on the bottom plane represents the MSE for that combination of slope and intercept.
def mse(y_actual, y_pred):
return np.mean((y_actual - y_pred)**2)
def mse_for_departure_model(w):
w0, w1 = w
return mse(df['minutes'], w0 + w1 * df['departure_hour'])
num_points = 50 # increase for better resolution, but it will run more slowly.
uvalues = np.linspace(90, 190, num_points)
vvalues = np.linspace(-13, -3, num_points)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
MSE = np.array([mse_for_departure_model(t) for t in thetas.T])
loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))
minimizer = go.Scatter3d(x=[w0_star], y=[w1_star], z=[mse_for_departure_model([w0_star, w1_star])],
mode='markers', name='optimal parameters',
marker=dict(size=10, color='gold'))
fig = go.Figure(data=[loss_surface, minimizer])
fig.update_layout(title='Loss Surface', scene = dict(
xaxis_title = "w0",
yaxis_title = "w1",
zaxis_title = r"R(w0, w1)"))
fig.show()
We used partial derivatives to minimize the function graphed above!
Here's another demo website that graphs $R_\text{sq}(w_0, w_1)$; check it out.
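As a sanity check, we can also minimize $R_\text{sq}(w_0, w_1)$ numerically and confirm that it lands at the same parameters we derived with partial derivatives. A minimal sketch, assuming `scipy` is installed (it isn't otherwise used in this notebook):

from scipy.optimize import minimize

# Numerically minimize the MSE, starting from an arbitrary initial guess.
# The result should be very close to (w0_star, w1_star).
minimize(mse_for_departure_model, x0=[100, -5]).x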
Correlation
$$\begin{align*} r &= \text{the average of the product of $x$ and $y$, when both are standardized} \\ &= \frac{1}{n} \sum_{i = 1}^n \left( \frac{x_i - \bar{x}}{\sigma_x} \right) \left( \frac{y_i - \bar{y}}{\sigma_y} \right) \end{align*}$$
def correlation(x, y):
x_su = (x - np.mean(x)) / np.std(x)
y_su = (y - np.mean(y)) / np.std(y)
return np.mean(x_su * y_su)
correlation(df['departure_hour'], df['minutes'])
-0.6486426165832002
# Symmetric!
correlation(df['minutes'], df['departure_hour'])
-0.6486426165832002
# Doesn't change if we multiply x or y by positive constants!
correlation(df['departure_hour'] * 1000, df['minutes'] * 545)
-0.6486426165832
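One caveat: the sign of $r$ does flip if we multiply $x$ or $y$ by a negative constant, since standardizing then reverses the ordering of that variable. A quick check:

# Multiplying by a negative constant flips the sign of the correlation.
correlation(df['departure_hour'] * -1000, df['minutes'])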
# DataFrames have a built-in correlation method.
df[['departure_hour', 'minutes']].corr()
|   | departure_hour | minutes |
|---|---|---|
| departure_hour | 1.00 | -0.65 |
| minutes | -0.65 | 1.00 |
# numpy has a built-in corrcoef function.
np.corrcoef(df['departure_hour'], df['minutes'])
array([[ 1.  , -0.65],
       [-0.65,  1.  ]])
Implementing $w_0^*$ and $w_1^*$, Again
Recall, the formulas for the optimal intercept and slope are:
$$w_1^* = r \frac{\sigma_y}{\sigma_x}$$

$$w_0^* = \bar{y} - w_1^* \bar{x}$$

Let's define two new functions, `slope_again` and `intercept_again`, which use these slightly updated formulas. (Really, only the formula for $w_1^*$ has changed.)
def slope_again(x, y):
return correlation(x, y) * np.std(y) / np.std(x)
def intercept_again(x, y):
return y.mean() - slope_again(x, y) * x.mean()
w1_star_again = slope_again(df['departure_hour'], df['minutes'])
w1_star_again
-8.186941724265553
w0_star_again = intercept_again(df['departure_hour'], df['minutes'])
w0_star_again
142.44824158772872
We get the same optimal intercept and slope as before!
# From before:
(w1_star, w0_star)
(-8.186941724265552, 142.4482415877287)
# Now:
(w1_star_again, w0_star_again)
(-8.186941724265553, 142.44824158772872)
Implementing $w_0^*$ and $w_1^*$ using `sklearn`
In practice, you wouldn't manually implement formulas for $w_0^*$ and $w_1^*$. Instead, you'd use a pre-built implementation.
The Python package we'll use for machine learning is `sklearn`. We'll start seeing it more in lectures next week.
from sklearn.linear_model import LinearRegression
To build a linear regression model that we can use for prediction, we first need to instantiate a `LinearRegression` object.
model = LinearRegression()
Then, we need to `fit` the model by telling it what our $x$'s and $y$'s are.
model.fit(X=df[['departure_hour']], y=df['minutes'])
LinearRegression()
Once the model is fit, we can look at its `intercept_` and `coef_` attributes to see $w_0^*$ and $w_1^*$, respectively.
model.intercept_
142.4482415877287
model.coef_
array([-8.19])
These are exactly the same as we found with our manual calculations! This means that `sklearn` is doing the same three-step modeling process that we outlined in lecture.
Now that `model` is fit, we can use it for making predictions:
# We'll discuss this warning more in coming lectures.
model.predict([[8]])
array([76.95])
# Using our hand-built predict_commute function from earlier in the lecture:
predict_commute(8)
76.95270779360428
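The warning mentioned above appears because `model` was fit on a DataFrame with a named column, but we predicted with a plain list. A sketch of one way to avoid it: pass a DataFrame whose column name matches the one used in `fit`.

# Predicting with a DataFrame that has the same column name used in fit
# avoids the feature-names warning.
model.predict(pd.DataFrame({'departure_hour': [8]}))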
Finding the Regression Line, Using the Normal Equations
Using our linear algebraic formulation, the optimal intercept and slope are given by the vector $\vec{w}^*$, where:
$$\vec{w}^* = ({X^TX})^{-1} X^T\vec{y}$$

Here:

- $X$ is an $n \times 2$ matrix, called the design matrix, defined as $X = \begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{bmatrix}$.
- $\vec{y}$ is an $n$-dimensional vector, called the observation vector, defined as $\vec{y} = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}$.
Let's construct $X$ and $\vec{y}$ in code.
First, the design matrix.
# Create a new DataFrame by taking the 'departure_hour' column from df.
X = df[['departure_hour']].copy()
X
|   | departure_hour |
|---|---|
| 0 | 10.82 |
| 1 | 7.75 |
| 2 | 8.45 |
| ... | ... |
| 62 | 7.58 |
| 63 | 7.45 |
| 64 | 7.60 |
65 rows × 1 columns
# Add a column of all 1s to X.
X['1'] = 1
X
|   | departure_hour | 1 |
|---|---|---|
| 0 | 10.82 | 1 |
| 1 | 7.75 | 1 |
| 2 | 8.45 | 1 |
| ... | ... | ... |
| 62 | 7.58 | 1 |
| 63 | 7.45 | 1 |
| 64 | 7.60 | 1 |
65 rows × 2 columns
# Change the order of the columns and convert to an array.
X = X[['1', 'departure_hour']].to_numpy()
X
array([[ 1.  , 10.82],
       [ 1.  ,  7.75],
       [ 1.  ,  8.45],
       ...,
       [ 1.  ,  7.58],
       [ 1.  ,  7.45],
       [ 1.  ,  7.6 ]])
$\vec{y}$ is already created for us: it's just the `'minutes'` column in `df`.
y = df['minutes'].to_numpy()
y
array([68., 94., 63., ..., 68., 90., 83.])
Now, let's implement:
$$\vec{w}^* = ({X^TX})^{-1} X^T\vec{y}$$

# The @ symbol performs matrix multiplication!
w_star_linalg = np.linalg.inv(X.T @ X) @ X.T @ y
w_star_linalg
array([142.45, -8.19])
These numbers look familiar!
# Old formulas.
w0_star, w1_star
(142.4482415877287, -8.186941724265552)
Indeed, they're exactly the same as the `w0_star` and `w1_star` we found using our old formulas.
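An aside on numerics: explicitly inverting $X^TX$ works fine here, but in practice it's preferable to solve the normal equations directly with `np.linalg.solve`, or to use `np.linalg.lstsq`, which computes the least squares solution without forming an inverse at all. A sketch of both alternatives, which should agree with `w_star_linalg`:

# Solve (X^T X) w = X^T y without forming an explicit inverse.
np.linalg.solve(X.T @ X, X.T @ y)

# Or, ask numpy for the least squares solution to X w ≈ y directly.
np.linalg.lstsq(X, y, rcond=None)[0]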
# The predicted commute time if I leave at 8:30AM.
w0_star + w1_star * 8.5
72.8592369314715
How do we make predictions with the new formulas?
To find the predicted commute time for every departure hour in our dataset, we can multiply $X$ by the optimal parameter vector, $\vec{w}^*$.
$$\vec{h}^* = X \vec{w}^*$$

$\vec{h}^*$ above is the optimal hypothesis vector.
all_preds = X @ w_star_linalg
all_preds
array([53.89, 79. , 73.27, ..., 80.36, 81.46, 80.23])
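Since $\vec{h}^*$ contains a prediction for every row, we can compute the fitted line's mean squared error by comparing it to $\vec{y}$, reusing the `mse` function defined earlier:

# The MSE of the optimal line, computed from the hypothesis vector.
mse(y, all_preds)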
To make a prediction for a single data point, we must take the dot product of the optimal parameter vector, $\vec{w}^*$ (`w_star_linalg`), with a vector of the form $\begin{bmatrix} 1 & x_\text{new} \end{bmatrix}^T$, since this is what the rows of $X$ look like.
# Also the predicted commute time if I leave at 8:30AM.
np.dot(w_star_linalg, np.array([1, 8.5]))
72.85923693147129
This gives us the same prediction as before!
Multiple Linear Regression
Previously, with simple linear regression, our goal was to fit a hypothesis function of the form:
$$\begin{align*}\text{pred. commute} &= H(\text{departure hour}) \\ &= w_0 + w_1 \cdot \text{departure hour} \end{align*}$$

Now, we'll try to fit a linear regression model of the form:

$$\begin{align*}\text{pred. commute} &= H(\text{departure hour, day of month}) \\ &= w_0 + w_1 \cdot \text{departure hour} + w_2 \cdot \text{day of month} \end{align*}$$

df['date']
0     5/15/2023
1     5/16/2023
2     5/22/2023
        ...
62     3/4/2024
63     3/5/2024
64     3/7/2024
Name: date, Length: 65, dtype: object
df['day_of_month'] = pd.to_datetime(df['date']).dt.day
df[['departure_hour', 'day_of_month', 'minutes']]
|   | departure_hour | day_of_month | minutes |
|---|---|---|---|
| 0 | 10.82 | 15 | 68.0 |
| 1 | 7.75 | 16 | 94.0 |
| 2 | 8.45 | 22 | 63.0 |
| ... | ... | ... | ... |
| 62 | 7.58 | 4 | 68.0 |
| 63 | 7.45 | 5 | 90.0 |
| 64 | 7.60 | 7 | 83.0 |
65 rows × 3 columns
Let's create our new design matrix, $X$:
$$X = \begin{bmatrix} 1 & \text{departure hour}_1 & \text{day}_1 \\ 1 & \text{departure hour}_2 & \text{day}_2 \\ \vdots & \vdots & \vdots \\ 1 & \text{departure hour}_n & \text{day}_n \end{bmatrix}$$

X = df[['departure_hour', 'day_of_month']].copy()
X['1'] = 1
X = X[['1', 'departure_hour', 'day_of_month']].to_numpy()
X
array([[ 1.  , 10.82, 15.  ],
       [ 1.  ,  7.75, 16.  ],
       [ 1.  ,  8.45, 22.  ],
       ...,
       [ 1.  ,  7.58,  4.  ],
       [ 1.  ,  7.45,  5.  ],
       [ 1.  ,  7.6 ,  7.  ]])
w_star_multiple = np.linalg.inv(X.T @ X) @ X.T @ y
w_star_multiple
array([141.86, -8.22, 0.06])
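As a cross-check, `sklearn` should recover the same three parameters when fit on both features. A sketch using the `LinearRegression` class imported earlier:

# intercept_ and coef_ should match the entries of w_star_multiple.
model_multiple = LinearRegression()
model_multiple.fit(X=df[['departure_hour', 'day_of_month']], y=df['minutes'])
model_multiple.intercept_, model_multiple.coef_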
What do our predictions look like, for each row of the dataset?
XX, YY = np.mgrid[5:14:1, 0:31:1]
Z = w_star_multiple[0] + w_star_multiple[1] * XX + w_star_multiple[2] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Reds')
fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=df['departure_hour'],
y=df['day_of_month'],
z=df['minutes'], mode='markers', marker = {'color': '#656DF1'}))
fig.update_layout(scene=dict(xaxis_title='Departure Hour',
yaxis_title='Day of Month',
zaxis_title='Minutes'),
title='Commute Time vs. Departure Hour and Day of Month',
width=1000, height=500)
How do we make predictions for new datapoints?
# The predicted commute time if I leave at 8:30AM on the 15th of the month.
np.dot(w_star_multiple, np.array([1, 8.5, 15]))
72.80767679746616
# The predicted commute time if I leave at 8:30AM on the 30th of the month.
np.dot(w_star_multiple, np.array([1, 8.5, 30]))
73.65007448594321
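Did adding the day of month actually help? Since the simple model is the special case $w_2 = 0$, the multiple regression model's training MSE can never be higher; comparing the two tells us how much the extra feature mattered. A sketch using the `mse` helper from earlier (recall that `X` is now the three-column design matrix):

# The multiple regression MSE can't exceed the simple one,
# since setting w2 = 0 recovers the simple model.
mse_simple = mse(y, w0_star + w1_star * df['departure_hour'])
mse_multiple = mse(y, X @ w_star_multiple)
mse_simple, mse_multiple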