In [1]:
from lec_utils import *

Lecture 15 Supplementary Notebook

Simple Linear Regression

EECS 398-003: Practical Data Science, Fall 2024

practicaldsc.org • github.com/practicaldsc/fa24

Note: This notebook is a supplement to the main lecture slides, which are in PDF form.

The main lecture slides can be found at practicaldsc.org under Lecture 15. (After the live lecture, an annotated version of the slides will be made available as well.)

Understanding the Data


Let's load in the commute times dataset as a DataFrame.

In [2]:
df = pd.read_csv('data/commute-times.csv')
df.head()
Out[2]:
date day home_departure_time home_departure_mileage ... mileage_to_work minutes_to_home work_departure_time_hr mileage_to_home
0 5/15/2023 Mon 2023-05-15 10:49:00 15873.0 ... 53.0 72.0 17.17 53.0
1 5/16/2023 Tue 2023-05-16 07:45:00 15979.0 ... 54.0 NaN NaN NaN
2 5/22/2023 Mon 2023-05-22 08:27:00 50407.0 ... 54.0 82.0 15.90 54.0
3 5/23/2023 Tue 2023-05-23 07:08:00 50535.0 ... 54.0 NaN NaN NaN
4 5/30/2023 Tue 2023-05-30 09:09:00 50664.0 ... 54.0 76.0 17.12 54.0

5 rows × 18 columns

There are many columns in here, but the only ones we're interested in for now are 'departure_hour' and 'minutes'.

In [3]:
df[['departure_hour', 'minutes']]
Out[3]:
departure_hour minutes
0 10.82 68.0
1 7.75 94.0
2 8.45 63.0
... ... ...
62 7.58 68.0
63 7.45 90.0
64 7.60 83.0

65 rows × 2 columns

In [4]:
fig = px.scatter(df,
           x='departure_hour',
           y='minutes',
           size=np.ones(len(df)) * 50,
           size_max=8)
fig.update_xaxes(title='Home Departure Time (AM)')
fig.update_yaxes(title='Minutes')
fig.update_layout(title='Commuting Time vs. Home Departure Time')
fig.update_layout(width=700)

Implementing $w_0^*$ and $w_1^*$


Let's implement the formulas for the best slope, $w_1^*$, and intercept, $w_0^*$, we found in the lecture slides:

\begin{align*} w_1^* &= \frac{ \displaystyle \sum_{i=1}^n (x_i - \bar x)(y_i - \bar y) }{ \displaystyle \sum_{i=1}^n (x_i - \bar x)^2 } \qquad \qquad w_0^* = \bar y - w_1^* \bar x \end{align*}
In [5]:
def slope(x, y):
    # Assume x and y are two Series.
    numerator = ((x - np.mean(x)) * (y - np.mean(y))).sum()
    denominator = ((x - np.mean(x)) ** 2).sum()
    return numerator / denominator
def intercept(x, y):
    return y.mean() - slope(x, y) * x.mean()
In [6]:
w1_star = slope(df['departure_hour'], df['minutes'])
w1_star
Out[6]:
-8.186941724265552
In [7]:
w0_star = intercept(df['departure_hour'], df['minutes'])
w0_star
Out[7]:
142.4482415877287

The results above tell us that the linear hypothesis function with the lowest mean squared error on our dataset is:

$$\text{predicted commute time (minutes)} = 142.45 - 8.19 \cdot \text{departure hour}$$

We can use it to make predictions:

In [8]:
def predict_commute(x_new):
    return w0_star + w1_star * x_new

What if I leave at 8AM? 10:45AM?

In [9]:
predict_commute(8)
Out[9]:
76.95270779360428
In [10]:
predict_commute(10 + 45 / 60)
Out[10]:
54.438618051874016

What do all of our predictions look like?

In [11]:
hline = px.line(x=[5.5, 11.5], y=[predict_commute(5.5), predict_commute(11.5)]).update_traces(line={'color': 'red', 'width': 4})
fline1 = go.Figure(fig.data + hline.data)
fline1.update_xaxes(title='Home Departure Time (AM)')
fline1.update_yaxes(title='Minutes')
fline1.update_layout(title='<span style="color:red">Predicted Commute Time</span> = 142.45 - 8.19 * Departure Hour')
fline1.update_layout(width=700, margin={'t': 60})

Aside: What does $R_{\text{sq}}(w_0, w_1)$ look like?

Let's draw a plot of $R_{\text{sq}}(w_0, w_1)$, the empirical risk that we're trying to minimize.

  • When we only had a single parameter, $h$, $R(h)$ was in 2D.
    • One axis for $h$, one axis for $R(h)$.
  • Now that we have two parameters, $w_0$ and $w_1$, $R(w_0, w_1)$ will be in 3D!
    • One axis for $w_0$, one axis for $w_1$, one axis for $R(w_0, w_1)$.
    • The bottom plane consists of all possible combinations of slope and intercept.
    • The height of the function above any pair of points on the bottom plane represents the MSE for that combination of slope and intercept.
In [12]:
def mse(y_actual, y_pred):
    return np.mean((y_actual - y_pred)**2)
def mse_for_departure_model(w):
    w0, w1 = w
    return mse(df['minutes'], w0 + w1 * df['departure_hour'])
num_points = 50  # Increase for better resolution, but the plot will render more slowly.

# Evaluate the MSE at every (w0, w1) combination on a grid.
uvalues = np.linspace(90, 190, num_points)
vvalues = np.linspace(-13, -3, num_points)
(u, v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(), v.flatten()))
MSE = np.array([mse_for_departure_model(t) for t in thetas.T])

# Draw the loss surface, along with a gold marker at the optimal (w0*, w1*).
loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))
minimizer = go.Scatter3d(x=[w0_star], y=[w1_star], z=[mse_for_departure_model([w0_star, w1_star])],
                         mode='markers', name='optimal parameters',
                         marker=dict(size=10, color='gold'))
fig = go.Figure(data=[loss_surface, minimizer])
fig.update_layout(title='Loss Surface', scene=dict(
    xaxis_title='w0',
    yaxis_title='w1',
    zaxis_title='R(w0, w1)'))
fig.show()

We used partial derivatives to minimize the graph above!
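As a quick numerical check (a small sketch, not from the lecture slides), we can estimate both partial derivatives of $R_{\text{sq}}$ at $(w_0^*, w_1^*)$ with finite differences. Since the optimal parameters sit at the bottom of the bowl above, both estimates should come out approximately 0.

In [ ]:
# Central finite-difference estimates of the partial derivatives of R_sq at (w0*, w1*).
# Both should be very close to 0, up to floating point error.
eps = 1e-6
dR_dw0 = (mse_for_departure_model([w0_star + eps, w1_star]) -
          mse_for_departure_model([w0_star - eps, w1_star])) / (2 * eps)
dR_dw1 = (mse_for_departure_model([w0_star, w1_star + eps]) -
          mse_for_departure_model([w0_star, w1_star - eps])) / (2 * eps)
dR_dw0, dR_dw1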

Correlation


$$\begin{align*} r &= \text{the average of the product of $x$ and $y$, when both are standardized} \\ &= \frac{1}{n} \sum_{i = 1}^n \left( \frac{x_i - \bar{x}}{\sigma_x} \right) \left( \frac{y_i - \bar{y}}{\sigma_y} \right) \end{align*}$$
In [13]:
def correlation(x, y): 
    x_su = (x - np.mean(x)) / np.std(x)
    y_su = (y - np.mean(y)) / np.std(y)
    return np.mean(x_su * y_su)
In [14]:
correlation(df['departure_hour'], df['minutes'])
Out[14]:
-0.6486426165832002
In [15]:
# Symmetric!
correlation(df['minutes'], df['departure_hour'])
Out[15]:
-0.6486426165832002
In [16]:
# Doesn't change if we multiply x or y by positive constants!
correlation(df['departure_hour'] * 1000, df['minutes'] * 545)
Out[16]:
-0.6486426165832
In [17]:
# DataFrames have a built-in correlation method.
df[['departure_hour', 'minutes']].corr()
Out[17]:
departure_hour minutes
departure_hour 1.00 -0.65
minutes -0.65 1.00
In [18]:
# numpy also has a built-in corrcoef function.
np.corrcoef(df['departure_hour'], df['minutes'])
Out[18]:
array([[ 1.  , -0.65],
       [-0.65,  1.  ]])

Implementing $w_0^*$ and $w_1^*$, Again


Recall, the formulas for the optimal intercept and slope are:

$$w_1^* = r \frac{\sigma_y}{\sigma_x} \qquad \qquad w_0^* = \bar{y} - w_1^* \bar{x}$$

Let's define two new functions, slope_again and intercept_again, which use these slightly updated formulas. (Really, only the formula for $w_1^*$ has changed.)

In [19]:
def slope_again(x, y):
    return correlation(x, y) * np.std(y) / np.std(x)
In [20]:
def intercept_again(x, y):
    return y.mean() - slope_again(x, y) * x.mean()
In [21]:
w1_star_again = slope_again(df['departure_hour'], df['minutes'])
w1_star_again
Out[21]:
-8.186941724265553
In [22]:
w0_star_again = intercept_again(df['departure_hour'], df['minutes'])
w0_star_again
Out[22]:
142.44824158772872

We get the same optimal intercept and slope as before!

In [23]:
# From before:
(w1_star, w0_star)
Out[23]:
(-8.186941724265552, 142.4482415877287)
In [24]:
# Now:
(w1_star_again, w0_star_again)
Out[24]:
(-8.186941724265553, 142.44824158772872)

Implementing $w_0^*$ and $w_1^*$ using sklearn


In practice, you wouldn't manually implement formulas for $w_0^*$ and $w_1^*$. Instead, you'd use a pre-built implementation.

The Python package we'll use for machine learning is sklearn. We'll start seeing it more in lectures next week.

In [25]:
from sklearn.linear_model import LinearRegression

To build a linear regression model that we can use for prediction, we first need to instantiate a LinearRegression object.

In [26]:
model = LinearRegression()

Then, we need to fit the model by telling it what our $x$'s and $y$'s are.

In [27]:
model.fit(X=df[['departure_hour']], y=df['minutes'])
Out[27]:
LinearRegression()

Once the model is fit, we can look at its intercept_ and coef_ attributes to see $w_0^*$ and $w_1^*$, respectively.

In [28]:
model.intercept_
Out[28]:
142.4482415877287
In [29]:
model.coef_
Out[29]:
array([-8.19])

These are exactly the same values we found with our manual calculations! This means that sklearn is following the same three-step modeling process that we outlined in lecture.
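As a sanity check (a quick sketch, not in the original slides), we can verify programmatically that sklearn's fitted parameters match our manual ones:

In [ ]:
# These assertions should pass, since both approaches minimize the same mean squared error.
assert np.isclose(model.intercept_, w0_star)
assert np.isclose(model.coef_[0], w1_star)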

Now that model is fit, we can use it for making predictions:

In [30]:
# We'll discuss this warning more in coming lectures.
model.predict([[8]])
/Users/surajrampure/miniforge3/envs/pds/lib/python3.10/site-packages/sklearn/base.py:493: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Out[30]:
array([76.95])
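The warning above appears because the model was fit on a DataFrame with a named column ('departure_hour'), but we passed .predict a plain list. One way to avoid it (a sketch; we'll see cleaner patterns in coming lectures) is to pass a one-row DataFrame with the same column name:

In [ ]:
# Passing a DataFrame whose column name matches the one used when fitting
# avoids the "X does not have valid feature names" warning.
model.predict(pd.DataFrame({'departure_hour': [8]}))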
In [31]:
# Using our hand-built predict_commute function from earlier in the lecture:
predict_commute(8)
Out[31]:
76.95270779360428

Aside: Pitfalls of Correlation


In [32]:
anscombe = pd.read_csv('data/anscombe.csv')
In [33]:
plt.figure(figsize=(12, 10))
for i, n in enumerate(['I', 'II', 'III', 'IV']):
    rows = anscombe[anscombe.get('dataset') == n]
    x = rows['x']
    y = rows['y']
    plt.subplot(2, 2, i+1)
    plt.scatter(x, y, label=f'Dataset {n}', alpha=0.65, s=65)
    plt.title(f'Dataset {n}');
[Figure: a 2×2 grid of scatter plots, one panel per Anscombe dataset (I, II, III, IV).]

What do all four of these datasets have in common?

In [34]:
for i, n in enumerate(['I', 'II', 'III', 'IV']):
    rows = anscombe[anscombe.get('dataset') == n]
    x = rows['x']
    y = rows['y']
    r = correlation(x, y)
    outstr = f'''
    <b>Dataset {n}</b><br>
    $\\bar x$: {np.round(np.mean(x), 2)}<br>
    $\\bar y$: {np.round(np.mean(y), 2)}<br>
    $\\sigma_x$: {np.round(np.std(x), 2)}<br>
    $\\sigma_y$: {np.round(np.std(y), 2)}<br>
    $r$: {np.round(r, 2)}
    '''
    display(HTML(outstr))
Dataset I
$\bar x$: 9.0
$\bar y$: 7.5
$\sigma_x$: 3.16
$\sigma_y$: 1.94
$r$: 0.82
Dataset II
$\bar x$: 9.0
$\bar y$: 7.5
$\sigma_x$: 3.16
$\sigma_y$: 1.94
$r$: 0.82
Dataset III
$\bar x$: 9.0
$\bar y$: 7.5
$\sigma_x$: 3.16
$\sigma_y$: 1.94
$r$: 0.82
Dataset IV
$\bar x$: 9.0
$\bar y$: 7.5
$\sigma_x$: 3.16
$\sigma_y$: 1.94
$r$: 0.82

They all share the exact same mean and standard deviation of $x$ and $y$, and the same correlation coefficient $r$! This means they all have the same best linear hypothesis function, in the sense of minimizing squared loss.
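To see this numerically (a quick sketch, reusing the slope and intercept functions defined earlier), we can fit a line to each dataset separately and check that the fitted parameters are essentially identical:

In [ ]:
# The fitted slope and intercept are (essentially) the same for all four datasets.
for n in ['I', 'II', 'III', 'IV']:
    rows = anscombe[anscombe.get('dataset') == n]
    print(f'Dataset {n}: slope = {slope(rows["x"], rows["y"]):.2f}, '
          f'intercept = {intercept(rows["x"], rows["y"]):.2f}')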

However, that linear hypothesis function looks better for some datasets than it does for others:

In [35]:
plt.figure(figsize=(12, 10))
for i, n in enumerate(['I', 'II', 'III', 'IV']):
    rows = anscombe[anscombe.get('dataset') == n]
    x = rows['x']
    y = rows['y']
    w0_ans = intercept(x, y)
    w1_ans = slope(x, y)
    plt.subplot(2, 2, i+1)
    plt.scatter(x, y, label=f'Dataset {n}', alpha=0.65, s=65)
    plt.plot(x, w0_ans + w1_ans * x, color='red');
    plt.title(f'Dataset {n}');
[Figure: the same 2×2 grid of Anscombe scatter plots, each with its fitted regression line drawn in red.]

Moral of the story: visualize your data before trying to fit a prediction rule!

Another example of this phenomenon is the Datasaurus Dozen 🦕.
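The Datasaurus Dozen makes the same point: its member datasets have nearly identical summary statistics but wildly different shapes. A sketch of how you might check this yourself, assuming a hypothetical local copy at data/datasaurus.csv with 'dataset', 'x', and 'y' columns (this file is not part of the lecture materials):

In [ ]:
# Hypothetical: assumes the Datasaurus Dozen has been downloaded to data/datasaurus.csv,
# with the same layout as anscombe.csv ('dataset', 'x', 'y').
datasaurus = pd.read_csv('data/datasaurus.csv')
for name, rows in datasaurus.groupby('dataset'):
    print(name, round(correlation(rows['x'], rows['y']), 2))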