In [1]:
from lec_utils import *
def show_merging_animation():
    src = "https://docs.google.com/presentation/d/1HPJ7fiBLNEURsWYiY0qpqPR3qup68Mr0_B34GU99Y8Q/embed?start=false&loop=false&delayms=60000&rm=minimal"
    width = 865
    height = 509
    display(IFrame(src, width, height))

Lecture 6¶

Pivoting, Merging, and Transforming¶

EECS 398: Practical Data Science, Winter 2025¶

practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed

3blue1bron 🏀¶

  • 3blue1bron allows you to upload a PDF and it'll automatically generate a video of LeBron James summarizing the notes.
  • Listen up! 🐐
In [2]:
IFrame(
    src='https://www.3blue1bron.com/share/7c278b09-f703-454c-9e23-31564f38b040',
    width=800,
    height=700
)
Out[2]:

Agenda 📆¶

  • Recap: groupby.
  • Pivot tables using pivot_table.
  • Merging 🚗.
  • Transforming 🤖.

Recap: groupby¶


Loading the data 🐧¶

In [3]:
penguins = pd.read_csv('data/penguins.csv')
penguins
Out[3]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Dream 41.3 20.3 194.0 3550.0 Male
1 Adelie Torgersen 38.5 17.9 190.0 3325.0 Female
2 Adelie Dream 34.0 17.1 185.0 3400.0 Female
... ... ... ... ... ... ... ...
330 Chinstrap Dream 46.6 17.8 193.0 3800.0 Female
331 Adelie Dream 39.7 17.9 193.0 4250.0 Male
332 Gentoo Biscoe 45.1 14.5 207.0 5050.0 Female

333 rows × 7 columns

The groupby method¶

  • The groupby method helps us answer questions that involve performing some computation separately for each group.
  • Most commonly, we'll use groupby, select column(s) to operate on, and use a built-in aggregation method.
In [4]:
# The median 'bill_length_mm' of each 'species'.
penguins.groupby('species')['bill_length_mm'].median() 
Out[4]:
species
Adelie       38.85
Chinstrap    49.55
Gentoo       47.40
Name: bill_length_mm, dtype: float64
  • There are four other special "grouping methods" we learned about last class that allow for advanced behavior, namely agg, filter, transform, and apply.
    See "⭐️ The grouping method cheat sheet" from last lecture for examples.
In [5]:
# The most common 'island' per 'species'.
penguins.groupby('species')['island'].agg(lambda s: s.value_counts().idxmax()) 
Out[5]:
species
Adelie        Dream
Chinstrap     Dream
Gentoo       Biscoe
Name: island, dtype: object
In [6]:
# Keeps the 'species' with at least 100 penguins.
penguins.groupby('species').filter(lambda df: df.shape[0] >= 100) 
Out[6]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Dream 41.3 20.3 194.0 3550.0 Male
1 Adelie Torgersen 38.5 17.9 190.0 3325.0 Female
2 Adelie Dream 34.0 17.1 185.0 3400.0 Female
... ... ... ... ... ... ... ...
326 Adelie Dream 41.1 18.1 205.0 4300.0 Male
331 Adelie Dream 39.7 17.9 193.0 4250.0 Male
332 Gentoo Biscoe 45.1 14.5 207.0 5050.0 Female

265 rows × 7 columns

Grouping with multiple columns¶

  • When we group with multiple columns, one group is created for every unique combination of elements in the specified columns.
    In the output below, why are there only 5 rows, rather than $3 \times 3 = 9$ rows, when there are 3 unique 'species' and 3 unique 'island's?
In [7]:
# Read this as:
species_and_island = (
    penguins.groupby(['species', 'island'])         # for every combination of 'species' and 'island' in the DataFrame,
    [['bill_length_mm', 'bill_depth_mm']].mean()    # calculate the mean 'bill_length_mm' and the mean 'bill_depth_mm'.
)
species_and_island
Out[7]:
bill_length_mm bill_depth_mm
species island
Adelie Biscoe 38.98 18.37
Dream 38.52 18.24
Torgersen 39.04 18.45
Chinstrap Dream 48.83 18.42
Gentoo Biscoe 47.57 15.00
  • Advice: When grouping on multiple columns, the result usually has a MultiIndex; use reset_index or set as_index=False in groupby to avoid this.
In [8]:
# Now, this looks like a regular DataFrame!
species_and_island.reset_index() 
Out[8]:
species island bill_length_mm bill_depth_mm
0 Adelie Biscoe 38.98 18.37
1 Adelie Dream 38.52 18.24
2 Adelie Torgersen 39.04 18.45
3 Chinstrap Dream 48.83 18.42
4 Gentoo Biscoe 47.57 15.00
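  • Alternatively, a minimal sketch of the as_index=False approach mentioned in the advice above; it produces the same DataFrame as using reset_index:

(
    penguins
    .groupby(['species', 'island'], as_index=False)
    [['bill_length_mm', 'bill_depth_mm']]
    .mean()
)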

Pivot tables using pivot_table¶


Pivot tables: An extension of grouping¶

  • Pivot tables are a compact way to display the results of grouping, laid out for humans to read. For example:
sex Female Male
species
Adelie 3368.84 4043.49
Chinstrap 3527.21 3938.97
Gentoo 4679.74 5484.84
  • Notice that each value in the table is the average of 'body_mass_g' of penguins, for every combination of 'species' and 'sex'.
  • You can think of pivot tables as grouping using two columns, then "pivoting" one of the group labels into columns.

pivot_table¶

  • The pivot_table DataFrame method aggregates a DataFrame using two columns. To use it:



df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)
  • The resulting DataFrame will have:
    • One row for every unique value in index_col.
    • One column for every unique value in columns_col.
    • Values determined by applying func on values in values_col.
  • Example: Find the average 'body_mass_g' for every combination of 'species' and 'sex'.
In [9]:
penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='mean'
)
Out[9]:
sex Female Male
species
Adelie 3368.84 4043.49
Chinstrap 3527.21 3938.97
Gentoo 4679.74 5484.84
In [10]:
# Same information as above, but harder to read!
(
    penguins
    .groupby(['species', 'sex'])
    [['body_mass_g']]
    .mean()
)
Out[10]:
body_mass_g
species sex
Adelie Female 3368.84
Male 4043.49
Chinstrap Female 3527.21
Male 3938.97
Gentoo Female 4679.74
Male 5484.84

Example: Finding the number of penguins per 'island' and 'species'¶

In [11]:
penguins
Out[11]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Dream 41.3 20.3 194.0 3550.0 Male
1 Adelie Torgersen 38.5 17.9 190.0 3325.0 Female
2 Adelie Dream 34.0 17.1 185.0 3400.0 Female
... ... ... ... ... ... ... ...
330 Chinstrap Dream 46.6 17.8 193.0 3800.0 Female
331 Adelie Dream 39.7 17.9 193.0 4250.0 Male
332 Gentoo Biscoe 45.1 14.5 207.0 5050.0 Female

333 rows × 7 columns

  • Suppose we want to find the number of penguins in penguins per 'island' and 'species'. We can do so without pivot_table:
In [12]:
penguins.value_counts(['island', 'species']) 
Out[12]:
island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64
In [13]:
penguins.groupby(['island', 'species']).size() 
Out[13]:
island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64
  • But the data is arguably easier to interpret when we do use pivot_table:
In [14]:
penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', # Choice of column here doesn't actually matter! Why?
    aggfunc='count',
)
Out[14]:
island Biscoe Dream Torgersen
species
Adelie 44.0 55.0 47.0
Chinstrap NaN 68.0 NaN
Gentoo 119.0 NaN NaN
  • Note that there is a NaN at the intersection of 'Biscoe' and 'Chinstrap', because there were no Chinstrap penguins on Biscoe Island.
    NaN stands for "not a number." It is numpy and pandas' version of a null value (the regular Python null value is None). We'll learn more about how to deal with these soon.
  • We can either use the fillna method afterwards or the fill_value argument to fill in NaNs.
In [15]:
penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)
Out[15]:
island Biscoe Dream Torgersen
species
Adelie 44 55 47
Chinstrap 0 68 0
Gentoo 119 0 0
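  • For reference, a sketch of the fillna route, which should produce the same table as above:

(
    penguins
    .pivot_table(index='species', columns='island',
                 values='bill_length_mm', aggfunc='count')
    .fillna(0)
    .astype(int)
)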

Granularity¶

  • Each row of the original penguins DataFrame represented a single penguin, and each column represented features of the penguins.
In [16]:
penguins
Out[16]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Dream 41.3 20.3 194.0 3550.0 Male
1 Adelie Torgersen 38.5 17.9 190.0 3325.0 Female
2 Adelie Dream 34.0 17.1 185.0 3400.0 Female
... ... ... ... ... ... ... ...
330 Chinstrap Dream 46.6 17.8 193.0 3800.0 Female
331 Adelie Dream 39.7 17.9 193.0 4250.0 Male
332 Gentoo Biscoe 45.1 14.5 207.0 5050.0 Female

333 rows × 7 columns

  • What is the granularity of the DataFrame below?
    That is, what does each row represent?
In [17]:
penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)
Out[17]:
island Biscoe Dream Torgersen
species
Adelie 44 55 47
Chinstrap 0 68 0
Gentoo 119 0 0

Reshaping¶

  • pivot_table reshapes DataFrames from "long" to "wide".
  • Other DataFrame reshaping methods:
    • melt: Un-pivots a DataFrame. Very useful in data cleaning. See the reference slide!
    • pivot: Like pivot_table, but doesn't do aggregation.
    • stack: Pivots multi-level columns to multi-indices.
    • unstack: Pivots multi-indices to columns.
  • Google, the documentation, and ChatGPT are your friends!
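  • For instance, a small sketch (reusing the species_and_island DataFrame from earlier) of how unstack and pivot relate to pivot_table:

# unstack pivots the inner index level ('island') into columns.
species_and_island['bill_length_mm'].unstack()

# pivot reshapes like pivot_table, but raises an error if any (index, columns)
# pair appears more than once, since it doesn't aggregate.
species_and_island.reset_index().pivot(index='species', columns='island', values='bill_length_mm')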

Reference Slide¶

The melt method¶

  • The melt method is common enough that we'll give it a special mention.
  • We'll often encounter pivot tables (esp. from government data), which we call wide data.
  • The methods we've introduced work better with long-form data, or tidy data.
  • To go from wide to long, melt.
In [18]:
wide_example = pd.DataFrame({
    'Year': [2001, 2002],
    'Jan': [10, 130],
    'Feb': [20, 200],
    'Mar': [30, 340]
}).set_index('Year')
wide_example
Out[18]:
Jan Feb Mar
Year
2001 10 20 30
2002 130 200 340
In [19]:
wide_example.melt(ignore_index=False)
Out[19]:
variable value
Year
2001 Jan 10
2002 Jan 130
2001 Feb 20
2002 Feb 200
2001 Mar 30
2002 Mar 340
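  • A sketch with more descriptive column names, using melt's var_name and value_name arguments ('month' and 'amount' are hypothetical names, just for illustration):

wide_example.melt(ignore_index=False, var_name='month', value_name='amount')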

Merging 🚗¶


In [20]:
phones = pd.DataFrame().assign(
    Model=['iPhone 16', 'iPhone 16 Pro Max', 'Samsung Galaxy S24 Ultra', 'Pixel 9 Pro'],
    Price=[799, 1199, 1299, 999],
    Screen=[6.1, 6.9, 6.8, 6.3]
)
inventory = pd.DataFrame().assign(
    Handset=['iPhone 16 Pro Max', 'iPhone 16', 'Pixel 9 Pro', 'Pixel 9 Pro', 'iPhone 16', 'iPhone 15'],
    Units=[50, 40, 10, 15, 100, 5],
    Store=['Briarwood', 'Somerset', 'Arbor Hills', '12 Oaks', 'Briarwood', 'Oakland Mall']
)

Example: Phone sales 📱¶

In [21]:
# The DataFrame on the left contains information about phones on the market.
# The DataFrame on the right contains information about the stock I have in my stores.
dfs_side_by_side(phones, inventory)
Model Price Screen
0 iPhone 16 799 6.1
1 iPhone 16 Pro Max 1199 6.9
2 Samsung Galaxy S24 Ultra 1299 6.8
3 Pixel 9 Pro 999 6.3
Handset Units Store
0 iPhone 16 Pro Max 50 Briarwood
1 iPhone 16 40 Somerset
2 Pixel 9 Pro 10 Arbor Hills
3 Pixel 9 Pro 15 12 Oaks
4 iPhone 16 100 Briarwood
5 iPhone 15 5 Oakland Mall
  • Question: If I sell all of the phones in my inventory, how much will I make in revenue?
  • The information I need to answer the question is spread across multiple DataFrames.
  • The solution is to merge the two DataFrames together horizontally.
    The SQL term for merge is join.
  • A merge is appropriate when we have two sources of information about the same individuals that are linked by a common column or columns.
    The common column(s) are called the join key.

If I sell all of the phones in my inventory, how much will I make in revenue?¶

In [22]:
combined = phones.merge(inventory, left_on='Model', right_on='Handset') 
combined
Out[22]:
Model Price Screen Handset Units Store
0 iPhone 16 799 6.1 iPhone 16 40 Somerset
1 iPhone 16 799 6.1 iPhone 16 100 Briarwood
2 iPhone 16 Pro Max 1199 6.9 iPhone 16 Pro Max 50 Briarwood
3 Pixel 9 Pro 999 6.3 Pixel 9 Pro 10 Arbor Hills
4 Pixel 9 Pro 999 6.3 Pixel 9 Pro 15 12 Oaks
In [23]:
(combined['Price'] * combined['Units']).sum() 
Out[23]:
196785

What just happened!? 🤯¶

In [24]:
# Click through the presentation that appears.
show_merging_animation()

The merge method¶

  • The merge DataFrame method joins two DataFrames by columns or indexes.
    As mentioned before, "merge" is just the pandas word for "join."
  • When using the merge method, the DataFrame before merge is the "left" DataFrame, and the DataFrame passed into merge is the "right" DataFrame.
    In phones.merge(inventory), phones is considered the "left" DataFrame and inventory is the "right" DataFrame.
    The columns from the left DataFrame appear to the left of the columns from the right DataFrame.
  • By default:
    • If join keys are not specified, all shared columns between the two DataFrames are used.
    • The "type" of join performed is an inner join, which is just one of many types of joins.

Inner joins¶

  • The default type of join that merge performs is an inner join, which keeps the intersection of the join keys.
In [25]:
# The DataFrame on the far right is the merged DataFrame.
dfs_side_by_side(phones, inventory, phones.merge(inventory, left_on='Model', right_on='Handset'))
Model Price Screen
0 iPhone 16 799 6.1
1 iPhone 16 Pro Max 1199 6.9
2 Samsung Galaxy S24 Ultra 1299 6.8
3 Pixel 9 Pro 999 6.3
Handset Units Store
0 iPhone 16 Pro Max 50 Briarwood
1 iPhone 16 40 Somerset
2 Pixel 9 Pro 10 Arbor Hills
3 Pixel 9 Pro 15 12 Oaks
4 iPhone 16 100 Briarwood
5 iPhone 15 5 Oakland Mall
Model Price Screen Handset Units Store
0 iPhone 16 799 6.1 iPhone 16 40 Somerset
1 iPhone 16 799 6.1 iPhone 16 100 Briarwood
2 iPhone 16 Pro Max 1199 6.9 iPhone 16 Pro Max 50 Briarwood
3 Pixel 9 Pro 999 6.3 Pixel 9 Pro 10 Arbor Hills
4 Pixel 9 Pro 999 6.3 Pixel 9 Pro 15 12 Oaks
  • Note that 'Samsung Galaxy S24 Ultra' and 'iPhone 15' do not appear in the merged DataFrame.
  • That's because there is no 'Samsung Galaxy S24 Ultra' in the right DataFrame (inventory), and no 'iPhone 15' in the left DataFrame (phones).

Other join types¶

  • We can change the type of join performed by changing the how argument in merge.
In [26]:
phones.merge(inventory, left_on='Model', right_on='Handset', how='left')
Out[26]:
Model Price Screen Handset Units Store
0 iPhone 16 799 6.1 iPhone 16 40.0 Somerset
1 iPhone 16 799 6.1 iPhone 16 100.0 Briarwood
2 iPhone 16 Pro Max 1199 6.9 iPhone 16 Pro Max 50.0 Briarwood
3 Samsung Galaxy S24 Ultra 1299 6.8 NaN NaN NaN
4 Pixel 9 Pro 999 6.3 Pixel 9 Pro 10.0 Arbor Hills
5 Pixel 9 Pro 999 6.3 Pixel 9 Pro 15.0 12 Oaks
In [27]:
phones.merge(inventory, left_on='Model', right_on='Handset', how='right')
Out[27]:
Model Price Screen Handset Units Store
0 iPhone 16 Pro Max 1199.0 6.9 iPhone 16 Pro Max 50 Briarwood
1 iPhone 16 799.0 6.1 iPhone 16 40 Somerset
2 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 10 Arbor Hills
3 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 15 12 Oaks
4 iPhone 16 799.0 6.1 iPhone 16 100 Briarwood
5 NaN NaN NaN iPhone 15 5 Oakland Mall
In [28]:
phones.merge(inventory, left_on='Model', right_on='Handset', how='outer')
Out[28]:
Model Price Screen Handset Units Store
0 iPhone 16 799.0 6.1 iPhone 16 40.0 Somerset
1 iPhone 16 799.0 6.1 iPhone 16 100.0 Briarwood
2 iPhone 16 Pro Max 1199.0 6.9 iPhone 16 Pro Max 50.0 Briarwood
3 Samsung Galaxy S24 Ultra 1299.0 6.8 NaN NaN NaN
4 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 10.0 Arbor Hills
5 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 15.0 12 Oaks
6 NaN NaN NaN iPhone 15 5.0 Oakland Mall
  • The website pandastutor.com can help visualize DataFrame operations like these.
    Here's a direct link to this specific example.

Different join types handle mismatches differently¶

  • Inner join: Keep only the matching keys (intersection).
  • Outer join: Keep all keys in both DataFrames (union).
  • Left join: Keep all keys in the left DataFrame, whether or not they are in the right DataFrame.
  • Right join: Keep all keys in the right DataFrame, whether or not they are in the left DataFrame.
    Note that a.merge(b, how='left') contains the same information as b.merge(a, how='right'), just in a different order.
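  • To illustrate that last point, a quick sketch using phones and inventory from earlier; the two results contain the same rows, just ordered (and with columns arranged) differently:

left_version = phones.merge(inventory, left_on='Model', right_on='Handset', how='left')
right_version = inventory.merge(phones, left_on='Handset', right_on='Model', how='right')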

Tip: Set differences¶

  • A set in Python is a data structure containing unique, unordered elements.
In [29]:
phones['Model']
Out[29]:
0                   iPhone 16
1           iPhone 16 Pro Max
2    Samsung Galaxy S24 Ultra
3                 Pixel 9 Pro
Name: Model, dtype: object
In [30]:
left = set(phones['Model'])
left
Out[30]:
{'Pixel 9 Pro', 'Samsung Galaxy S24 Ultra', 'iPhone 16', 'iPhone 16 Pro Max'}
In [31]:
inventory['Handset']
Out[31]:
0    iPhone 16 Pro Max
1            iPhone 16
2          Pixel 9 Pro
3          Pixel 9 Pro
4            iPhone 16
5            iPhone 15
Name: Handset, dtype: object
In [32]:
right = set(inventory['Handset'])
right
Out[32]:
{'Pixel 9 Pro', 'iPhone 15', 'iPhone 16', 'iPhone 16 Pro Max'}
  • To quickly check which join key values are in the left DataFrame but not the right, or vice versa, create sets out of the join keys and use the difference method.
In [33]:
left.difference(right) 
Out[33]:
{'Samsung Galaxy S24 Ultra'}
In [34]:
right.difference(left) 
Out[34]:
{'iPhone 15'}

Reference Slide¶

Notes on the merge method¶

  • merge is flexible – you can merge using a combination of columns, or the index of the DataFrame.
  • If the two DataFrames share column names, pandas will add _x and _y to the duplicated column names to avoid having two columns with the same name (change these with the suffixes argument; see the sketch after this list).
  • There is, in fact, a join method, but it's actually a wrapper around merge with fewer options.
  • As always, the documentation is your friend!
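  • A couple of hedged sketches of the options above (the small left_df and right_df DataFrames are made up purely to illustrate suffixes):

# Merging using the index of the right DataFrame instead of one of its columns.
phones.merge(inventory.set_index('Handset'), left_on='Model', right_index=True)

# Controlling the suffixes added to overlapping column names.
left_df = pd.DataFrame({'key': [1, 2], 'value': ['a', 'b']})
right_df = pd.DataFrame({'key': [1, 2], 'value': ['x', 'y']})
left_df.merge(right_df, on='key', suffixes=('_left', '_right'))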

Reference Slide¶

Lots of pandas operations do an implicit outer join!¶

  • pandas will almost always try to match up index values using an outer join.
  • It won't tell you that it's doing an outer join, it'll just throw NaNs in your result!
In [35]:
df1 = pd.DataFrame({'a': [1, 2, 3]}, index=['hello', 'eecs398', 'students'])
df2 = pd.DataFrame({'b': [10, 20, 30]}, index=['eecs398', 'is', 'awesome'])
dfs_side_by_side(df1, df2)
a
hello 1
eecs398 2
students 3
b
eecs398 10
is 20
awesome 30
In [36]:
df1['a'] + df2['b']
Out[36]:
awesome      NaN
eecs398     12.0
hello        NaN
is           NaN
students     NaN
dtype: float64
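  • If you really do want positional, element-by-element addition, one sketch is to drop down to arrays, which don't carry an index:

df1['a'].to_numpy() + df2['b'].to_numpy()  # array([11, 22, 33]); no index alignment happens.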

Activity setup¶

In [37]:
midwest_cities = pd.DataFrame().assign(
    city=['Ann Arbor', 'Detroit', 'Chicago', 'East Lansing'],
    state=['Michigan', 'Michigan', 'Illinois', 'Michigan'],
    today_high_temp=['79', '83', '87', '87']
)
schools = pd.DataFrame().assign(
    name=['University of Michigan', 'University of Chicago', 'Wayne State University', 'Johns Hopkins University', 'UC San Diego', 'Concordia U-Ann Arbor', 'Michigan State University'], 
    city=['Ann Arbor', 'Chicago', 'Detroit', 'Baltimore', 'La Jolla', 'Ann Arbor', 'East Lansing'],
    state=['Michigan', 'Illinois', 'Michigan', 'Maryland', 'California', 'Michigan', 'Michigan'],
    graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91]
)

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Without writing code, how many rows are in midwest_cities.merge(schools, on='city')?


A. 4          B. 5          C. 6          D. 7          E. 8
In [38]:
dfs_side_by_side(midwest_cities, schools)
city state today_high_temp
0 Ann Arbor Michigan 79
1 Detroit Michigan 83
2 Chicago Illinois 87
3 East Lansing Michigan 87
name city state graduation_rate
0 University of Michigan Ann Arbor Michigan 0.87
1 University of Chicago Chicago Illinois 0.94
2 Wayne State University Detroit Michigan 0.78
3 Johns Hopkins University Baltimore Maryland 0.92
4 UC San Diego La Jolla California 0.81
5 Concordia U-Ann Arbor Ann Arbor Michigan 0.83
6 Michigan State University East Lansing Michigan 0.91
In [39]:
midwest_cities.merge(schools, on='city')
Out[39]:
city state_x today_high_temp name state_y graduation_rate
0 Ann Arbor Michigan 79 University of Michigan Michigan 0.87
1 Ann Arbor Michigan 79 Concordia U-Ann Arbor Michigan 0.83
2 Detroit Michigan 83 Wayne State University Michigan 0.78
3 Chicago Illinois 87 University of Chicago Illinois 0.94
4 East Lansing Michigan 87 Michigan State University Michigan 0.91
In [ ]:
 

Followup activity¶

Without writing code, how many rows are in midwest_cities.merge(schools, on='state')?

In [40]:
dfs_side_by_side(midwest_cities, schools)
city state today_high_temp
0 Ann Arbor Michigan 79
1 Detroit Michigan 83
2 Chicago Illinois 87
3 East Lansing Michigan 87
name city state graduation_rate
0 University of Michigan Ann Arbor Michigan 0.87
1 University of Chicago Chicago Illinois 0.94
2 Wayne State University Detroit Michigan 0.78
3 Johns Hopkins University Baltimore Maryland 0.92
4 UC San Diego La Jolla California 0.81
5 Concordia U-Ann Arbor Ann Arbor Michigan 0.83
6 Michigan State University East Lansing Michigan 0.91
In [41]:
midwest_cities.merge(schools, on='state')
Out[41]:
city_x state today_high_temp name city_y graduation_rate
0 Ann Arbor Michigan 79 University of Michigan Ann Arbor 0.87
1 Ann Arbor Michigan 79 Wayne State University Detroit 0.78
2 Ann Arbor Michigan 79 Concordia U-Ann Arbor Ann Arbor 0.83
... ... ... ... ... ... ...
10 East Lansing Michigan 87 Concordia U-Ann Arbor Ann Arbor 0.83
11 East Lansing Michigan 87 Michigan State University East Lansing 0.91
12 Chicago Illinois 87 University of Chicago Chicago 0.94

13 rows × 6 columns

In [ ]:
 

The butterfly method 🦋¶

  • A common exam-style question is to determine the number of rows that result from merging two DataFrames,
    especially when there are duplicate values in the join keys (the columns being merged).
  • In these questions, use the butterfly method, which says:

Suppose $x_1, x_2, ..., x_n$ are the overlapping values between the join keys of the left DataFrame, L, and the right DataFrame, R. Then, the number of rows in an inner merge between L and R is:


$$\text{count}_{L}(x_1) \cdot \text{count}_{R}(x_1) + \text{count}_{L}(x_2) \cdot \text{count}_{R}(x_2) + ... + \text{count}_{L}(x_n) \cdot \text{count}_{R}(x_n) \\ = \sum_{i=1}^n \text{count}_L(x_i) \cdot \text{count}_R(x_i)$$
  • We used this method to answer the question on the previous slide!
    It's called the butterfly method because when lines are drawn between the overlapping rows, the result resembles a butterfly (see the posted annotated slides).
  • The formula above may seem complicated, but it's a direct consequence of the merging animation we saw earlier.
In [42]:
show_merging_animation()
  • You'll get lots of practice with related problems in this week's discussion worksheet.
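  • As a sanity check, here's a small sketch that evaluates the formula directly for the midwest_cities/schools example above; it gives 5, matching the number of rows in the earlier merge on 'city'.

counts_left = midwest_cities['city'].value_counts()
counts_right = schools['city'].value_counts()
overlap = counts_left.index.intersection(counts_right.index)
(counts_left[overlap] * counts_right[overlap]).sum()  # 5, the size of the inner merge on 'city'.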

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

What questions do you have?

Transforming 🤖¶


Loading the data 🏦¶

  • LendingClub is a platform that allows individuals to borrow money – that is, take on loans.
  • For the next 1.5 lectures, we will work with data from their platform.
    Each row of the dataset corresponds to a different loan that the LendingClub approved and paid out.
    The full dataset is over 300 MB, so we've sampled a subset for this lecture.
In [43]:
loans = pd.read_csv('data/loans.csv')
In [44]:
# Each time you run this cell, you'll see a different random subset of the DataFrame.
loans.sample(5)
Out[44]:
id loan_amnt issue_d term ... fico_range_low fico_range_high hardship_flag mths_since_last_delinq
5953 77387565 12000.0 Apr-2016 36 months ... 725.0 729.0 N 16.0
2929 57314757 15600.0 Aug-2015 60 months ... 675.0 679.0 N NaN
1449 115254831 18000.0 Aug-2017 36 months ... 690.0 694.0 N NaN
6106 77358853 7000.0 Apr-2016 36 months ... 675.0 679.0 N 35.0
1389 130611066 30000.0 Apr-2018 36 months ... 710.0 714.0 N 57.0

5 rows × 20 columns

In [45]:
# When a DataFrame has more columns than you can see in its preview,
# it's a good idea to check the names of all columns.
loans.columns
Out[45]:
Index(['id', 'loan_amnt', 'issue_d', 'term', 'int_rate', 'grade', 'sub_grade',
       'emp_title', 'verification_status', 'home_ownership', 'annual_inc',
       'loan_status', 'purpose', 'desc', 'addr_state', 'dti', 'fico_range_low',
       'fico_range_high', 'hardship_flag', 'mths_since_last_delinq'],
      dtype='object')
  • Not all of the columns are necessarily interesting, but a few that might be are previewed in the cell below.


FICO scores refer to credit scores.

In [46]:
# Again, run this a few times to get a sense of the typical values.
loans[['loan_amnt', 'issue_d', 'term', 'int_rate', 'emp_title', 'fico_range_low']].sample(5)
Out[46]:
loan_amnt issue_d term int_rate emp_title fico_range_low
941 14000.0 Jun-2017 36 months 9.44 registered nurse 680.0
996 16000.0 Oct-2017 60 months 17.09 registered nurse-dept manager 685.0
3717 14600.0 Oct-2013 36 months 9.99 chemist 710.0
4494 14000.0 Nov-2017 60 months 12.62 high school teacher 700.0
5723 23000.0 Aug-2014 36 months 8.39 executive director 675.0

Transformations¶

  • A transformation results from performing some operation on every element in a sequence, e.g. a Series.
  • When cleaning data to prepare it for analysis, we often need to:
    • Perform type conversions (e.g. changing the string '$2.99' to the float 2.99).
    • Perform unit conversions (e.g. feet to meters).
    • Extract relevant information from strings.
  • For example, we can't currently use the 'term' column to do any calculations, since its values are stored as strings (despite being numerical).
In [47]:
loans['term']
Out[47]:
0        60 months
1        36 months
2        36 months
           ...    
6297     60 months
6298     60 months
6299     60 months
Name: term, Length: 6300, dtype: object
  • Many of the values in 'emp_title' are stored inconsistently, meaning they mean the same thing, but appear differently. Without further cleaning, this would make it harder to, for example, find the total number of nurses that were given loans.
In [48]:
(loans['emp_title'] == 'registered nurse').sum() 
Out[48]:
252
In [49]:
(loans['emp_title'] == 'nurse').sum() 
Out[49]:
101
In [50]:
(loans['emp_title'] == 'rn').sum() 
Out[50]:
35

One solution: The apply method¶

  • The Series apply method allows us to use a function on every element in a Series.
In [51]:
def clean_term(term_string):
    return int(term_string.split()[0])
In [52]:
loans['term'].apply(clean_term) 
Out[52]:
0       60
1       36
2       36
        ..
6297    60
6298    60
6299    60
Name: term, Length: 6300, dtype: int64
  • There is also an apply method for DataFrames, in which you can use a function on every row (if you set axis=1) or every column (if you set axis=0) of a DataFrame.
  • There is also an apply method for DataFrameGroupBy/SeriesGroupBy objects, as we discovered last class.
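  • For instance, a sketch of the DataFrame version with axis=1, computing the midpoint of each loan's FICO range (vectorized arithmetic on the two columns would be faster; this is just for illustration):

loans.apply(lambda row: (row['fico_range_low'] + row['fico_range_high']) / 2, axis=1)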

The price of apply¶

  • Unfortunately, apply runs really slowly – internally, it just runs a for-loop.
In [53]:
%%timeit
loans['term'].apply(clean_term)
1.05 ms ± 8.94 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
In [54]:
%%timeit
res = []
for term in loans['term']:
    res.append(clean_term(term))
911 μs ± 4.88 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
  • So, when possible – say, when applying arithmetic operations – we should work on Series objects directly and avoid apply.
In [55]:
%%timeit
loans['int_rate'] // 10 * 10 # Rounds down to the nearest multiple of 10.
53.3 μs ± 172 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
In [56]:
%%timeit
loans['int_rate'].apply(lambda y: y // 10 * 10)
500 μs ± 2.06 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
  • Above, the solution involving apply is ~10x slower than the one that uses direct vectorized operations.

The .str accessor¶

  • For string operations, pandas provides a convenient .str accessor.
    You've seen examples of it in practice already, with .str.contains.
  • Mental model: the operation that comes after .str is used on every element of the Series that comes before .str.

s.str.operation
In [57]:
# Here, we use .split() on every string in loans['term'].
loans['term'].str.split() 
Out[57]:
0       [60, months]
1       [36, months]
2       [36, months]
            ...     
6297    [60, months]
6298    [60, months]
6299    [60, months]
Name: term, Length: 6300, dtype: object
In [58]:
loans['term'].str.split().str[0].astype(int) 
Out[58]:
0       60
1       36
2       36
        ..
6297    60
6298    60
6299    60
Name: term, Length: 6300, dtype: int64
  • One might think that .str methods are quicker than apply, but they're often even slower.
    We still use them in practice, though, since they let us write concise code.
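  • Tying back to the inconsistent 'emp_title' values from earlier, one hedged sketch for counting nurse-related titles uses .str.lower and .str.contains (a simple substring check like this is only approximate; for instance, it misses abbreviations like 'rn'):

loans['emp_title'].str.lower().str.contains('nurse', na=False).sum()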

Creating timestamps ⏱️¶

  • When dealing with values containing dates and times, it's good practice to convert the values to "timestamp" objects.
In [59]:
# Stored as strings.
loans['issue_d']
Out[59]:
0       Jun-2014
1       Jun-2017
2       Dec-2016
          ...   
6297    Nov-2015
6298    Dec-2014
6299    Jun-2015
Name: issue_d, Length: 6300, dtype: object
  • To do so, we use the pd.to_datetime function.
    It takes in a date format string; you can see examples of how they work here.
In [60]:
pd.to_datetime(loans['issue_d'], format='%b-%Y') 
Out[60]:
0      2014-06-01
1      2017-06-01
2      2016-12-01
          ...    
6297   2015-11-01
6298   2014-12-01
6299   2015-06-01
Name: issue_d, Length: 6300, dtype: datetime64[ns]

Aside: The pipe method🚰¶

  • There are a few steps we've performed to clean up our dataset.
    • Convert loan 'term's to integers.
    • Convert loan issue dates, 'issue_d's, to timestamps.
  • When we manipulate DataFrames, it's best to define individual functions for each step, then use the pipe method to chain them all together.
    The pipe method takes in a function that maps $\texttt{DataFrame} \rightarrow \texttt{anything}$, but typically $\texttt{anything}$ is a $\texttt{DataFrame}$.
In [61]:
def clean_term_column(df):
    return df.assign(
        term=df['term'].str.split().str[0].astype(int)
    )
def clean_date_column(df):
    return (
        df
        .assign(date=pd.to_datetime(df['issue_d'], format='%b-%Y'))
        .drop(columns=['issue_d'])
    )
In [62]:
loans = (
    pd.read_csv('data/loans.csv')
    .pipe(clean_term_column)
    .pipe(clean_date_column)
)
loans
Out[62]:
id loan_amnt term int_rate ... fico_range_high hardship_flag mths_since_last_delinq date
0 17965023 18000.0 60 16.99 ... 704.0 N 72.0 2014-06-01
1 111414087 10000.0 36 16.02 ... 684.0 N 6.0 2017-06-01
2 95219557 12800.0 36 7.99 ... 709.0 N 66.0 2016-12-01
... ... ... ... ... ... ... ... ... ...
6297 63990101 10800.0 60 18.49 ... 679.0 N 39.0 2015-11-01
6298 37641672 15000.0 60 14.31 ... 664.0 N 22.0 2014-12-01
6299 50587446 14000.0 60 9.99 ... 689.0 N NaN 2015-06-01

6300 rows × 20 columns

In [63]:
# Same as above, just way harder to read and write.
clean_date_column(clean_term_column(pd.read_csv('data/loans.csv'))) 
Out[63]:
id loan_amnt term int_rate ... fico_range_high hardship_flag mths_since_last_delinq date
0 17965023 18000.0 60 16.99 ... 704.0 N 72.0 2014-06-01
1 111414087 10000.0 36 16.02 ... 684.0 N 6.0 2017-06-01
2 95219557 12800.0 36 7.99 ... 709.0 N 66.0 2016-12-01
... ... ... ... ... ... ... ... ... ...
6297 63990101 10800.0 60 18.49 ... 679.0 N 39.0 2015-11-01
6298 37641672 15000.0 60 14.31 ... 664.0 N 22.0 2014-12-01
6299 50587446 14000.0 60 9.99 ... 689.0 N NaN 2015-06-01

6300 rows × 20 columns

Working with timestamps¶

  • We often want to adjust the granularity of timestamps to see overall trends, or seasonality.
  • To do so, use the resample DataFrame method (documentation).
    Think of it like a version of groupby, but for timestamps.
In [64]:
# This shows us the average interest rate given out to loans in every 6 month interval.
loans.resample('6M', on='date')['int_rate'].mean() 
Out[64]:
date
2008-03-31    10.71
2008-09-30     8.63
2009-03-31    12.13
              ...  
2018-03-31    12.85
2018-09-30    12.72
2019-03-31    12.93
Freq: 6M, Name: int_rate, Length: 23, dtype: float64
  • We can also do arithmetic with timestamps.
In [65]:
# Not meaningful in this example, but possible.
loans['date'].diff() 
Out[65]:
0            NaT
1      1096 days
2      -182 days
          ...   
6297   -517 days
6298   -335 days
6299    182 days
Name: date, Length: 6300, dtype: timedelta64[ns]
In [66]:
# If each loan was for 60 months,
# this is a Series of when they'd end.
# Unfortunately, pd.DateOffset isn't vectorized, so
# if you'd want to use a different month offset for each row
# (like we'd need to, since some loans are 36 months
# and some are 60 months), you'd need to use `.apply`.
loans['date'] + pd.DateOffset(months=60) 
Out[66]:
0      2019-06-01
1      2022-06-01
2      2021-12-01
          ...    
6297   2020-11-01
6298   2019-12-01
6299   2020-06-01
Name: date, Length: 6300, dtype: datetime64[ns]
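  • A hedged sketch of the per-row version mentioned in the comment above, using apply since pd.DateOffset isn't vectorized:

# Each loan's end date, using its own 'term' (in months).
loans.apply(lambda row: row['date'] + pd.DateOffset(months=int(row['term'])), axis=1)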

The .dt accessor¶

  • Like with Series of strings, Series of timestamps have a .dt accessor for properties of timestamps (documentation).
In [67]:
loans['date'].dt.year
Out[67]:
0       2014
1       2017
2       2016
        ... 
6297    2015
6298    2014
6299    2015
Name: date, Length: 6300, dtype: int32
In [68]:
loans['date'].dt.month
Out[68]:
0        6
1        6
2       12
        ..
6297    11
6298    12
6299     6
Name: date, Length: 6300, dtype: int32
  • You'll use this in Homework 3!
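  • For example, a sketch combining .dt with groupby to look at the average interest rate per year:

loans.groupby(loans['date'].dt.year)['int_rate'].mean()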

What's next?¶

  • What is "exploratory data analysis" and how do we do it?
  • How do we deal with missing, or null, values?
  • How do we create data visualizations in Python?