from lec_utils import *

df = pd.read_csv('data/commute-times.csv')
df.head()

df[['departure_hour', 'minutes']]

fig = px.scatter(df,
           x='departure_hour',
           y='minutes',
           size=np.ones(len(df)) * 50,
           size_max=8)
fig.update_xaxes(title='Home Departure Time (AM)')
fig.update_yaxes(title='Minutes')
fig.update_layout(title='Commuting Time vs. Home Departure Time')
fig.update_layout(width=700)

def slope(x, y):
    # Assume x and y are two Series.
    numerator = ((x - np.mean(x)) * (y - np.mean(y))).sum()
    denominator = ((x - np.mean(x)) ** 2).sum()
    return numerator / denominator
def intercept(x, y):
    return y.mean() - slope(x, y) * x.mean()

w1_star = slope(df['departure_hour'], df['minutes'])
w1_star

-8.186941724265552

w0_star = intercept(df['departure_hour'], df['minutes'])
w0_star

142.4482415877287

def predict_commute(x_new):
    return w0_star + w1_star * x_new

predict_commute(8)

76.95270779360428

predict_commute(10 + 45 / 60)

54.438618051874016

hline = px.line(x=[5.5, 11.5], y=[predict_commute(5.5), predict_commute(11.5)]).update_traces(line={'color': 'red', 'width': 4})
fline1 = go.Figure(fig.data + hline.data)
fline1.update_xaxes(title='Home Departure Time (AM)')
fline1.update_yaxes(title='Minutes')
fline1.update_layout(title='<span style="color:red">Predicted Commute Time</span> = 142.25 - 8.19 * Departure Hour')
fline1.update_layout(width=700, margin={'t': 60})

def mse(y_actual, y_pred):
    return np.mean((y_actual - y_pred)**2)
def mse_for_departure_model(w):
    w0, w1 = w
    return mse(df['minutes'], w0 + w1 * df['departure_hour'])
num_points = 50 # increase for better resolution, but it will run more slowly. 
# if (num_points <= 100):
uvalues = np.linspace(90, 190, num_points)
vvalues = np.linspace(-13, -3, num_points)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
MSE = np.array([mse_for_departure_model(t) for t in thetas.T])
loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))
minimizer = go.Scatter3d(x=[w0_star], y=[w1_star], z=[mse_for_departure_model([w0_star, w1_star])], 
                         mode='markers', name='optimal parameters',
                         marker=dict(size=10, color='gold'))
fig = go.Figure(data=[loss_surface, minimizer])
# fig.add_trace(opt_point)
fig.update_layout(title='Loss Surface', scene = dict(
    xaxis_title = "w0",
    yaxis_title = "w1",
    zaxis_title = r"R(w0, w1)"))
fig.show()
# else:
#     print("Picking num points > 100 can be really slow. If you really want to try, edit the code above so that this if statement doesn't trigger.")

def correlation(x, y): 
    x_su = (x - np.mean(x)) / np.std(x)
    y_su = (y - np.mean(y)) / np.std(y)
    return np.mean(x_su * y_su)

correlation(df['departure_hour'], df['minutes'])

-0.6486426165832002

# Symmetric!
correlation(df['minutes'], df['departure_hour'])

-0.6486426165832002

# Doesn't change if we multiply x or y by constants!
correlation(df['departure_hour'] * 1000, df['minutes'] * 545)

-0.6486426165832

# DataFrames have a built-in correlation method.
df[['departure_hour', 'minutes']].corr()

# numpy has a built-in corrcoef method.
np.corrcoef(df['departure_hour'], df['minutes'])

array([[ 1.  , -0.65],
       [-0.65,  1.  ]])

def slope_again(x, y):
    return correlation(x, y) * np.std(y) / np.std(x)

def intercept_again(x, y):
    return y.mean() - slope_again(x, y) * x.mean()

w1_star_again = slope_again(df['departure_hour'], df['minutes'])
w1_star_again

-8.186941724265553

w0_star_again = intercept_again(df['departure_hour'], df['minutes'])
w0_star_again

142.44824158772872

# From before:
(w1_star, w0_star)

(-8.186941724265552, 142.4482415877287)

# Now:
(w1_star_again, w0_star_again)

(-8.186941724265553, 142.44824158772872)

from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X=df[['departure_hour']], y=df['minutes'])

LinearRegression()

LinearRegression()

model.intercept_

142.4482415877287

model.coef_

array([-8.19])

# We'll discuss this warning more in coming lectures.
model.predict([[8]])

array([76.95])

# Using our hand-build predict_commute function from earlier in the lecture:
predict_commute(8)

76.95270779360428

# Create a new DataFrame by taking the 'departure_hour' column from df.
X = df[['departure_hour']].copy()
X

# Add a column of all 1s to X.
X['1'] = 1
X

# Change the order of the columns and convert to an array.
X = X[['1', 'departure_hour']].to_numpy()
X

array([[ 1.  , 10.82],
       [ 1.  ,  7.75],
       [ 1.  ,  8.45],
       ...,
       [ 1.  ,  7.58],
       [ 1.  ,  7.45],
       [ 1.  ,  7.6 ]])

y = df['minutes'].to_numpy()
y

array([68., 94., 63., ..., 68., 90., 83.])

# The @ symbol performs matrix multiplication!
w_star_linalg = np.linalg.inv(X.T @ X) @ X.T @ y 
w_star_linalg

array([142.45,  -8.19])

# Old formulas.
w0_star, w1_star

(142.4482415877287, -8.186941724265552)

# The predicted commute time if I leave at 8:30AM.
w0_star + w1_star * 8.5

72.8592369314715

all_preds = X @ w_star_linalg 
all_preds

array([53.89, 79.  , 73.27, ..., 80.36, 81.46, 80.23])

# Also the predicted commute time if I leave at 8:30AM.
np.dot(w_star_linalg, np.array([1, 8.5]))

72.85923693147129

df['date']

0     5/15/2023
1     5/16/2023
2     5/22/2023
        ...    
62     3/4/2024
63     3/5/2024
64     3/7/2024
Name: date, Length: 65, dtype: object

df['day_of_month'] = pd.to_datetime(df['date']).dt.day

df[['departure_hour', 'day_of_month', 'minutes']]

X = df[['departure_hour', 'day_of_month']].copy()
X['1'] = 1
X = X[['1', 'departure_hour', 'day_of_month']].to_numpy()
X

array([[ 1.  , 10.82, 15.  ],
       [ 1.  ,  7.75, 16.  ],
       [ 1.  ,  8.45, 22.  ],
       ...,
       [ 1.  ,  7.58,  4.  ],
       [ 1.  ,  7.45,  5.  ],
       [ 1.  ,  7.6 ,  7.  ]])

w_star_multiple = np.linalg.inv(X.T @ X) @ X.T @ y 
w_star_multiple

array([141.86,  -8.22,   0.06])

XX, YY = np.mgrid[5:14:1, 0:31:1]
Z = w_star_multiple[0] + w_star_multiple[1] * XX + w_star_multiple[2] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Reds')
fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=df['departure_hour'], 
                           y=df['day_of_month'], 
                           z=df['minutes'], mode='markers', marker = {'color': '#656DF1'}))
fig.update_layout(scene=dict(xaxis_title='Departure Hour',
                             yaxis_title='Day of Month',
                             zaxis_title='Minutes'),
                  title='Commute Time vs. Departure Hour and Day of Month',
                  width=1000, height=500)

# The predicted commute time if I leave at 8:30AM on the 15th of the month.
np.dot(w_star_multiple, np.array([1, 8.5, 15]))

72.80767679746616

# The predicted commute time if I leave at 8:30AM on the 30th of the month.
np.dot(w_star_multiple, np.array([1, 8.5, 30]))

73.65007448594321

	date	day	home_departure_time	home_departure_mileage	...	mileage_to_work	minutes_to_home	work_departure_time_hr	mileage_to_home
0	5/15/2023	Mon	2023-05-15 10:49:00	15873.0	...	53.0	72.0	17.17	53.0
1	5/16/2023	Tue	2023-05-16 07:45:00	15979.0	...	54.0	NaN	NaN	NaN
2	5/22/2023	Mon	2023-05-22 08:27:00	50407.0	...	54.0	82.0	15.90	54.0
3	5/23/2023	Tue	2023-05-23 07:08:00	50535.0	...	54.0	NaN	NaN	NaN
4	5/30/2023	Tue	2023-05-30 09:09:00	50664.0	...	54.0	76.0	17.12	54.0

Lecture 14 Supplementary Notebook¶

Regression using Linear Algebra¶

EECS 398: Practical Data Science, Winter 2025¶

Understanding the Data¶

Implementing $w_0^$ and $w_1^$¶

Aside: What does $R_{\text{sq}}(w_0, w_1)$ look like?¶

Correlation¶

Implementing $w_0^$ and $w_1^$, Again¶

Implementing $w_0^$ and $w_1^$ using `sklearn`¶

Finding the Regression Line, Using the Normal Equations¶

Making Predictions¶

Multiple Linear Regression¶

	departure_hour	minutes
0	10.82	68.0
1	7.75	94.0
2	8.45	63.0
...	...	...
62	7.58	68.0
63	7.45	90.0
64	7.60	83.0

	departure_hour	minutes
departure_hour	1.00	-0.65
minutes	-0.65	1.00

Lecture 14 Supplementary Notebook¶

Regression using Linear Algebra¶

EECS 398: Practical Data Science, Winter 2025¶

Understanding the Data¶

Implementing $w_0^*$ and $w_1^*$¶

Aside: What does $R_{\text{sq}}(w_0, w_1)$ look like?¶

Correlation¶

Implementing $w_0^*$ and $w_1^*$, Again¶

Implementing $w_0^*$ and $w_1^*$ using sklearn¶

Finding the Regression Line, Using the Normal Equations¶

Making Predictions¶

Multiple Linear Regression¶

Implementing $w_0^$ and $w_1^$¶

Implementing $w_0^$ and $w_1^$, Again¶

Implementing $w_0^$ and $w_1^$ using `sklearn`¶