from lec_utils import *
Lecture 16 Supplementary Notebook¶
Regression using Linear Algebra¶
EECS 398-003: Practical Data Science, Fall 2024¶
Note: This notebook is only a supplementary notebook to the main lecture slides, which are in PDF form.
The main lecture slides can be found at practicaldsc.org under Lecture 16. (After the live lecture, an annotated version of the slides will be made available as well.)
Let's once again load in the commute times dataset as a DataFrame.
df = pd.read_csv('data/commute-times.csv')
df.head()
 | date | day | home_departure_time | home_departure_mileage | ... | mileage_to_work | minutes_to_home | work_departure_time_hr | mileage_to_home
---|---|---|---|---|---|---|---|---|---
0 | 5/15/2023 | Mon | 2023-05-15 10:49:00 | 15873.0 | ... | 53.0 | 72.0 | 17.17 | 53.0 |
1 | 5/16/2023 | Tue | 2023-05-16 07:45:00 | 15979.0 | ... | 54.0 | NaN | NaN | NaN |
2 | 5/22/2023 | Mon | 2023-05-22 08:27:00 | 50407.0 | ... | 54.0 | 82.0 | 15.90 | 54.0 |
3 | 5/23/2023 | Tue | 2023-05-23 07:08:00 | 50535.0 | ... | 54.0 | NaN | NaN | NaN |
4 | 5/30/2023 | Tue | 2023-05-30 09:09:00 | 50664.0 | ... | 54.0 | 76.0 | 17.12 | 54.0 |
5 rows × 18 columns
There are many columns in here, but the only ones we're interested in for now are 'departure_hour' and 'minutes'.
df[['departure_hour', 'minutes']]
 | departure_hour | minutes
---|---|---
0 | 10.82 | 68.0 |
1 | 7.75 | 94.0 |
2 | 8.45 | 63.0 |
... | ... | ... |
62 | 7.58 | 68.0 |
63 | 7.45 | 90.0 |
64 | 7.60 | 83.0 |
65 rows × 2 columns
fig = px.scatter(df,
x='departure_hour',
y='minutes',
size=np.ones(len(df)) * 50,
size_max=8)
fig.update_xaxes(title='Home Departure Time (AM)')
fig.update_yaxes(title='Minutes')
fig.update_layout(title='Commuting Time vs. Home Departure Time')
fig.update_layout(width=700)
Finding the Regression Line, Using the Old Formulas¶
Recall, the formulas for the optimal intercept ($w_0^*$) and slope ($w_1^*$) are:
$$w_1^* = r \frac{\sigma_y}{\sigma_x}$$
$$w_0^* = \bar{y} - w_1^* \bar{x}$$
def slope(x, y):
    return np.corrcoef(x, y)[0, 1] * np.std(y) / np.std(x)

def intercept(x, y):
    return np.mean(y) - slope(x, y) * np.mean(x)
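As a sanity check (not in the original lecture), these formulas can be verified against `np.polyfit`, which fits a degree-1 least-squares polynomial and returns its coefficients highest-degree first. The synthetic `x` and `y` below are made up purely for illustration:

```python
import numpy as np

def slope(x, y):
    # r * (sigma_y / sigma_x), matching the formula above.
    return np.corrcoef(x, y)[0, 1] * np.std(y) / np.std(x)

def intercept(x, y):
    return np.mean(y) - slope(x, y) * np.mean(x)

# Made-up data roughly shaped like the commute-times example.
rng = np.random.default_rng(42)
x = rng.uniform(6, 11, size=50)
y = 140 - 8 * x + rng.normal(0, 5, size=50)

# np.polyfit(x, y, 1) returns [slope, intercept] for the least-squares line.
w1, w0 = np.polyfit(x, y, 1)
print(np.isclose(w1, slope(x, y)), np.isclose(w0, intercept(x, y)))
```

Both checks come out `True`: the correlation-based formulas and the least-squares polynomial fit describe the exact same line.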
w0_star = intercept(df['departure_hour'], df['minutes'])
w1_star = slope(df['departure_hour'], df['minutes'])
# Just fancy printing – ignore these next two lines.
rule_string = ('$$\\text{Predicted Commute Time (in Minutes)} = ' +
f'{round(w0_star, 2)} + {round(w1_star, 2)}' +
'\\cdot \\left( \\text{Departure Hour} \\right)$$')
display(HTML(f'<h4>The best linear predictor for this dataset is</h4><br><center>{rule_string}</center>'))
The best linear predictor for this dataset is
$$\text{Predicted Commute Time (in Minutes)} = 142.45 - 8.19 \cdot \left( \text{Departure Hour} \right)$$
hline = px.line(x=[5.5, 11.5], y=[97.405, 48.265]).update_traces(line={'color': 'red', 'width': 4})
fline1 = go.Figure(fig.data + hline.data)
fline1.update_xaxes(title='Home Departure Time (AM)')
fline1.update_yaxes(title='Minutes')
fline1.update_layout(title='<span style="color:red">Predicted Commute Time</span> = 142.45 - 8.19 * Departure Hour')
fline1.update_layout(width=700, margin={'t': 60})
Now that we have $w_0^*$ and $w_1^*$, we can use them to make predictions.
# The predicted commute time if I leave at 8:30AM.
w0_star + w1_star * 8.5
72.85923693147151
Finding the Regression Line, Using the Normal Equations¶
Using our linear algebraic formulation, the optimal intercept and slope are given by the vector $\vec{w}^*$, where:
$$\vec{w}^* = ({X^TX})^{-1} X^T\vec{y}$$
Here:
- $X$ is an $n \times 2$ matrix, called the design matrix, defined as:
$$X = \begin{bmatrix} 1 & x_1 \\ 1 & x_2 \\ \vdots & \vdots \\ 1 & x_n \end{bmatrix}$$
- $\vec{y}$ is an $n$-dimensional vector, called the observation vector, defined as:
$$\vec{y} = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix}$$
Let's construct $X$ and $\vec{y}$ in code.
First, the design matrix.
# Create a new DataFrame by taking the 'departure_hour' column from df.
X = df[['departure_hour']].copy()
X
 | departure_hour
---|---
0 | 10.82 |
1 | 7.75 |
2 | 8.45 |
... | ... |
62 | 7.58 |
63 | 7.45 |
64 | 7.60 |
65 rows × 1 columns
# Add a column of all 1s to X.
X['1'] = 1
X
 | departure_hour | 1
---|---|---
0 | 10.82 | 1 |
1 | 7.75 | 1 |
2 | 8.45 | 1 |
... | ... | ... |
62 | 7.58 | 1 |
63 | 7.45 | 1 |
64 | 7.60 | 1 |
65 rows × 2 columns
# Change the order of the columns and convert to an array.
X = X[['1', 'departure_hour']].to_numpy()
X
array([[ 1.  , 10.82],
       [ 1.  ,  7.75],
       [ 1.  ,  8.45],
       ...,
       [ 1.  ,  7.58],
       [ 1.  ,  7.45],
       [ 1.  ,  7.6 ]])
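As an aside (not in the original lecture), the same design matrix can be built in one step with `np.column_stack`, which avoids the copy-add-reorder dance. The three-value `departure_hour` Series below is a made-up stand-in for `df['departure_hour']` so the sketch runs on its own:

```python
import numpy as np
import pandas as pd

# Made-up stand-in for df['departure_hour'].
departure_hour = pd.Series([10.82, 7.75, 8.45])

# Glue a column of 1s onto the feature column in one step;
# the result is an (n x 2) array with the intercept column first.
X_alt = np.column_stack([np.ones(len(departure_hour)), departure_hour])
print(X_alt.shape)  # (3, 2)
```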
$\vec{y}$ is already created for us: it's just the 'minutes' column in df.
y = df['minutes'].to_numpy()
y
array([68., 94., 63., ..., 68., 90., 83.])
Now, let's implement:
$$\vec{w}^* = ({X^TX})^{-1} X^T\vec{y}$$
# The @ symbol performs matrix multiplication!
w_star_linalg = np.linalg.inv(X.T @ X) @ X.T @ y
w_star_linalg
array([142.45, -8.19])
These numbers look familiar!
# Old formulas.
w0_star, w1_star
(142.44824158772875, -8.186941724265557)
Indeed, they're exactly the same as the w0_star and w1_star we found using our old formulas.
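A note on numerics (beyond the scope of the lecture): explicitly inverting $X^TX$ works fine here, but `np.linalg.solve` and `np.linalg.lstsq` are the standard, more numerically stable ways to get the same $\vec{w}^*$. A self-contained sketch on made-up data:

```python
import numpy as np

# Made-up data in the same shape as the commute-times problem.
rng = np.random.default_rng(0)
X = np.column_stack([np.ones(40), rng.uniform(6, 11, size=40)])
y = X @ np.array([142.45, -8.19]) + rng.normal(0, 4, size=40)

# 1. Explicit inverse, as in the cell above.
w_inv = np.linalg.inv(X.T @ X) @ X.T @ y

# 2. Solve the linear system (X^T X) w = X^T y without forming an inverse.
w_solve = np.linalg.solve(X.T @ X, X.T @ y)

# 3. Least squares directly on X, without forming X^T X at all.
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)

print(np.allclose(w_inv, w_solve), np.allclose(w_inv, w_lstsq))
```

All three agree here; the differences only matter when $X^TX$ is ill-conditioned.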
# The predicted commute time if I leave at 8:30AM.
w0_star + w1_star * 8.5
72.85923693147151
How do we make predictions with the new formulas?
To find the predicted commute time for every departure hour in our dataset, we can multiply $X$ by the optimal parameter vector, $\vec{w}^*$.
$$\vec{h}^* = X \vec{w}^*$$
$\vec{h}^*$ above is the optimal hypothesis vector.
all_preds = X @ w_star_linalg
all_preds
array([53.89, 79. , 73.27, ..., 80.36, 81.46, 80.23])
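Once we have the prediction vector, a natural follow-up (not shown in the lecture cell above) is to measure the quality of the fit with mean squared error. A self-contained sketch, using the first three rows of the dataset as made-up stand-ins for the full `X`, `y`, and `w_star_linalg`:

```python
import numpy as np

# Made-up stand-ins: first three rows of the commute-times data.
X = np.array([[1., 10.82], [1., 7.75], [1., 8.45]])
y = np.array([68., 94., 63.])
w_star = np.array([142.45, -8.19])

all_preds = X @ w_star           # h* = X w*
residuals = y - all_preds        # errors of each prediction
mse = np.mean(residuals ** 2)    # mean squared error of the fit
print(mse)
```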
To make a prediction for a single data point, we must take the dot product of the optimal parameter vector, $\vec{w}^*$ (w_star_linalg), with a vector of the form $\begin{bmatrix} 1 & x_\text{new} \end{bmatrix}^T$, since this is what the rows of $X$ look like.
# Also the predicted commute time if I leave at 8:30AM.
np.dot(w_star_linalg, np.array([1, 8.5]))
72.85923693147129
This gives us the same prediction as before!
Multiple Linear Regression¶
Previously, with simple linear regression, our goal was to fit a hypothesis function of the form:
$$\begin{align*}\text{pred. commute} &= H(\text{departure hour}) \\ &= w_0 + w_1 \cdot \text{departure hour} \end{align*}$$
Now, we'll try to fit a multiple linear regression model of the form:
$$\begin{align*}\text{pred. commute} &= H(\text{departure hour}, \text{day of month}) \\ &= w_0 + w_1 \cdot \text{departure hour} + w_2 \cdot \text{day of month} \end{align*}$$
df['date']
0     5/15/2023
1     5/16/2023
2     5/22/2023
        ...
62     3/4/2024
63     3/5/2024
64     3/7/2024
Name: date, Length: 65, dtype: object
df['day_of_month'] = pd.to_datetime(df['date']).dt.day
df[['departure_hour', 'day_of_month', 'minutes']]
 | departure_hour | day_of_month | minutes
---|---|---|---
0 | 10.82 | 15 | 68.0 |
1 | 7.75 | 16 | 94.0 |
2 | 8.45 | 22 | 63.0 |
... | ... | ... | ... |
62 | 7.58 | 4 | 68.0 |
63 | 7.45 | 5 | 90.0 |
64 | 7.60 | 7 | 83.0 |
65 rows × 3 columns
Let's create our new design matrix, $X$:
$$X = \begin{bmatrix} 1 & \text{departure hour}_1 & \text{day}_1 \\ 1 & \text{departure hour}_2 & \text{day}_2 \\ \vdots & \vdots & \vdots \\ 1 & \text{departure hour}_n & \text{day}_n \end{bmatrix}$$
X = df[['departure_hour', 'day_of_month']].copy()
X['1'] = 1
X = X[['1', 'departure_hour', 'day_of_month']].to_numpy()
X
array([[ 1.  , 10.82, 15.  ],
       [ 1.  ,  7.75, 16.  ],
       [ 1.  ,  8.45, 22.  ],
       ...,
       [ 1.  ,  7.58,  4.  ],
       [ 1.  ,  7.45,  5.  ],
       [ 1.  ,  7.6 ,  7.  ]])
w_star_multiple = np.linalg.inv(X.T @ X) @ X.T @ y
w_star_multiple
array([141.86, -8.22, 0.06])
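One way to read the tiny $w_2^*$: adding a column to the design matrix can never increase the training error, because the simpler model is just the special case $w_2 = 0$. So even a nearly useless feature gets a small nonzero weight. A self-contained sketch on made-up data (not the actual commute-times numbers):

```python
import numpy as np

# Made-up data resembling the commute-times setup.
rng = np.random.default_rng(1)
n = 60
hour = rng.uniform(6, 11, size=n)
day = rng.integers(1, 31, size=n).astype(float)
y = 142 - 8 * hour + 0.05 * day + rng.normal(0, 5, size=n)

X_simple = np.column_stack([np.ones(n), hour])          # intercept + hour
X_multi = np.column_stack([np.ones(n), hour, day])      # ... + day of month

def mse_of_fit(X, y):
    # Least-squares fit, then mean squared error on the training data.
    w, *_ = np.linalg.lstsq(X, y, rcond=None)
    return np.mean((y - X @ w) ** 2)

# The richer model's training MSE is at most the simpler model's.
print(mse_of_fit(X_multi, y) <= mse_of_fit(X_simple, y) + 1e-9)
```

Lower training error doesn't mean the extra feature helps on unseen data, of course; that question is left for later lectures on generalization.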
What do our predictions look like, for each row of the dataset?
XX, YY = np.mgrid[5:14:1, 0:31:1]
Z = w_star_multiple[0] + w_star_multiple[1] * XX + w_star_multiple[2] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Reds')
fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=df['departure_hour'],
y=df['day_of_month'],
z=df['minutes'], mode='markers', marker = {'color': '#656DF1'}))
fig.update_layout(scene=dict(xaxis_title='Departure Hour',
yaxis_title='Day of Month',
zaxis_title='Minutes'),
title='Commute Time vs. Departure Hour and Day of Month',
width=1000, height=500)
How do we make predictions for new datapoints?
# The predicted commute time if I leave at 8:30AM on the 15th of the month.
np.dot(w_star_multiple, np.array([1, 8.5, 15]))
72.80767679746616
# The predicted commute time if I leave at 8:30AM on the 30th of the month.
np.dot(w_star_multiple, np.array([1, 8.5, 30]))
73.65007448594321