In [1]:
from lec_utils import *
def show_grouping_animation():
    # Embeds the Google Slides animation that illustrates grouping.
    src = "https://docs.google.com/presentation/d/1tBaFyHseIGsX5wmE3BdNLeVHnKksQtpzLhHge8Tzly0/embed?start=false&loop=false&delayms=60000&rm=minimal"
    width = 960
    height = 509
    display(IFrame(src, width, height))

def show_merging_animation():
    # Embeds the Google Slides animation that illustrates merging.
    src = "https://docs.google.com/presentation/d/1HPJ7fiBLNEURsWYiY0qpqPR3qup68Mr0_B34GU99Y8Q/embed?start=false&loop=false&delayms=60000&rm=minimal"
    width = 865
    height = 509
    display(IFrame(src, width, height))

Lecture 6¶

Grouping, Pivoting, and Merging¶

EECS 398-003: Practical Data Science, Fall 2024¶

practicaldsc.org • github.com/practicaldsc/fa24

Announcements 📣¶

  • Homework 2 is due tomorrow night – we've provided a 24-hour extension to everyone since we released it a bit late. Don't be discouraged if you feel like there's a lot of reading or you have to do a lot of Googling – this is intentional, because this is how data scientists actually have to solve problems!


Post on Ed or come to Office Hours for help! We're using a queue for office hours now – access it from practicaldsc.org/calendar.

  • study.practicaldsc.org contains our discussion worksheets (and solutions), which are made up of old exam problems. Use these problems to build your theoretical understanding of the material!

  • Homework 1 scores are available on Gradescope.
    Now, you can even see which hidden tests you failed.

Agenda¶

  • Recap: groupby.
  • Advanced groupby usage.
  • Pivot tables using the pivot_table method.
  • Merging.

Remember to follow along in lecture by accessing the "blank" lecture notebook in our public GitHub repository.

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

We have 2 office hours on Central Campus each week, but not many students have attended. What should we do with them?

  • A. Keep them – I'll come at some point.
  • B. Keep them on Central Campus, but change them to a different time (let us know when in the free response box).
  • C. Replace them with more North Campus office hours (let us know when in the free response box).

Recap: groupby¶

Recall, we use groupby when we need to calculate something for each group.


In [2]:
show_grouping_animation()

Run the cell below to load in our dataset.

In [3]:
penguins = sns.load_dataset('penguins').dropna().reset_index(drop=True)
penguins
Out[3]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

Example: Finding the mean 'bill_length_mm' of each species¶

  • If you were asked to find the mean 'bill_length_mm' of 'Adelie' penguins, you could do so using a query.
In [4]:
penguins.loc[penguins['species'] == 'Adelie', 'bill_length_mm'].mean() 
Out[4]:
38.82397260273973
  • But, you can find the mean 'bill_length_mm' of all three species using groupby:
In [5]:
# Read this as:
#        for each 'species',  calculate the mean 'bill_length_mm'.
penguins.groupby('species')['bill_length_mm'].mean() 
Out[5]:
species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

Understanding the syntax of groupby¶

  • penguins.groupby('species')['bill_length_mm'].mean()
    First, tell pandas which column you want to group by. Since we're grouping by 'species', the remainder of the calculations will be done separately for each 'species'.
  • penguins.groupby('species')['bill_length_mm'].mean()
    Then, select the other column(s) that you want to aggregate. Here, we want to calculate mean 'bill_length_mm's, so that's what we select.
  • penguins.groupby('species')['bill_length_mm'].mean()
    Finally, we use an aggregation method. This is saying, for each 'species', compute the mean 'bill_length_mm'.
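
To make those three steps concrete, here is the same computation written one step at a time (a minimal sketch, using the penguins DataFrame loaded above):

grouped = penguins.groupby('species')   # DataFrameGroupBy: one group per 'species'.
lengths = grouped['bill_length_mm']     # SeriesGroupBy: just the column we want to aggregate.
lengths.mean()                          # Aggregate each group's values into a single number.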

Advanced groupby usage¶


Beyond default aggregation methods¶

  • There are many built-in aggregation methods.
  • What if you want to apply different aggregation methods to different columns?
  • What if the aggregation method you want to use doesn't already exist in pandas?

The aggregate method¶

  • DataFrameGroupBy and SeriesGroupBy objects have a general aggregate method, which aggregates using one or more operations.
    Remember, aggregation is the act of combining many values into a single value.
  • There are many ways of using aggregate; refer to the documentation for a comprehensive list.
    Per the documentation, agg is an alias for aggregate.
  • Example arguments:
    • A single function.
    • A list of functions.
    • A dictionary mapping column names to functions.
  • We've attached a Reference Slide with examples.
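
One form worth knowing beyond the reference slides is named aggregation, in which keyword arguments name the output columns (a sketch, using the penguins DataFrame from above):

penguins.groupby('species').agg(
    count=('body_mass_g', 'count'),      # Output column 'count': the number of penguins.
    mean_mass=('body_mass_g', 'mean'),   # Output column 'mean_mass': the mean 'body_mass_g'.
)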

Reference Slide¶

Examples¶

  • How many penguins are there of each 'species', and what is the mean 'body_mass_g' of each 'species'?
In [6]:
(
    penguins
    .groupby('species')
    ['body_mass_g']
    .aggregate(['count', 'mean'])
)
Out[6]:
count mean
species
Adelie 146 3706.16
Chinstrap 68 3733.09
Gentoo 119 5092.44
  • What is the maximum 'bill_length_mm' of each 'species', and which 'island's is each 'species' found on?
In [7]:
(
    penguins
    .groupby('species')
    .agg({'bill_length_mm': 'max', 'island': 'unique'})
)
Out[7]:
bill_length_mm island
species
Adelie 46.0 [Torgersen, Biscoe, Dream]
Chinstrap 58.0 [Dream]
Gentoo 59.6 [Biscoe]

Activity

What is the interquartile range of the 'body_mass_g' of each 'species'?

The interquartile range of a distribution is defined as:

$$\text{75th percentile} - \text{25th percentile}$$

Hint: Use np.percentile, and pass agg/aggregate a custom function.

In [8]:
# Here, the argument to agg is a function,
# which takes in a Series and returns a scalar.
def iqr(s):
    return np.percentile(s, 75) - np.percentile(s, 25)
(
    penguins
    .groupby('species')
    ['body_mass_g']
    .agg(iqr)
)
Out[8]:
species
Adelie       637.5
Chinstrap    462.5
Gentoo       800.0
Name: body_mass_g, dtype: float64

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

What questions do you have?

Split-apply-combine, revisited¶

  • When we introduced the split-apply-combine pattern, the "apply" step involved aggregation – our final DataFrame had one row for each group.
  • Instead of aggregating during the apply step, we could instead perform a filtration, in which we keep only the groups that satisfy some condition.
  • Or a transformation, in which we perform operations to every value within each group.

Grouping, then filtering¶

  • To keep only the groups that satisfy a particular condition, use the filter method on a DataFrameGroupBy/SeriesGroupBy object.
    The filter method takes in a function, which itself takes in a DataFrame/Series and returns a single Boolean. The result is a new DataFrame/Series with only the groups for which the filter function returned True.
  • For example, suppose we want only the 'species' whose average 'bill_length_mm' is above 39.
In [9]:
(
    penguins
    .groupby('species')
    .filter(lambda df: df['bill_length_mm'].mean() > 39)
)
Out[9]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
146 Chinstrap Dream 46.5 17.9 192.0 3500.0 Female
147 Chinstrap Dream 50.0 19.5 196.0 3900.0 Male
148 Chinstrap Dream 51.3 19.2 193.0 3650.0 Male
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

187 rows × 7 columns

  • No more 'Adelie's!

Activity

Create a new DataFrame with only the rows in penguins for popular 'species' – that is, 'species' with at least 100 penguins.

In [10]:
(
    penguins
    .groupby('species')
    .filter(lambda df: df.shape[0] >= 100)
)
Out[10]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

265 rows × 7 columns

In [11]:
# Note that to just find the 'species' with at least 100 penguins,
# we didn't need to group:
penguins['species'].value_counts()
Out[11]:
species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

Reference Slide¶

Example: Z-Scoring¶

  • Suppose we want to convert the 'body_mass_g' column to z-scores (i.e. standard units):
$$z(x_i) = \frac{x_i - \text{mean of } x}{\text{SD of } x}$$
In [12]:
def z_score(x):
    return (x - x.mean()) / x.std(ddof=0)
In [13]:
z_score(penguins['body_mass_g'])
Out[13]:
0     -0.57
1     -0.51
2     -1.19
       ... 
330    1.92
331    1.23
332    1.48
Name: body_mass_g, Length: 333, dtype: float64

Reference Slide¶

Grouping, then transforming¶

  • Now, what if we wanted the z-score within each group?
  • To do so, we can use the transform method on a DataFrameGroupBy object. The transform method takes in a function, which itself takes in a Series and returns a new Series.
  • A transformation produces a DataFrame or Series of the same size – it is not an aggregation!
In [14]:
z_mass = (penguins
          .groupby('species')
          ['body_mass_g']
          .transform(z_score))
z_mass
Out[14]:
0      0.10
1      0.21
2     -1.00
       ... 
330    1.32
331    0.22
332    0.62
Name: body_mass_g, Length: 333, dtype: float64
In [15]:
penguins.assign(z_mass=z_mass)
Out[15]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex z_mass
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 0.10
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 0.21
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female -1.00
... ... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male 1.32
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female 0.22
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male 0.62

333 rows × 8 columns

In [16]:
display_df(penguins.assign(z_mass=z_mass), rows=8)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex z_mass
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 0.10
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 0.21
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female -1.00
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female -0.56
... ... ... ... ... ... ... ... ...
329 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female -0.49
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male 1.32
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female 0.22
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male 0.62

333 rows × 8 columns

  • Note that above, penguin 329 has a larger 'body_mass_g' than penguin 0, but a lower 'z_mass'.
    • Penguin 0 has an above average 'body_mass_g' among 'Adelie' penguins.
    • Penguin 329 has a below average 'body_mass_g' among 'Gentoo' penguins. Remember from earlier that the average 'body_mass_g' of 'Gentoo' penguins is much higher than for other species.

Grouping with multiple columns¶

  • When we group with multiple columns, one group is created for every unique combination of elements in the specified columns.
    In the output below, why are there only 5 rows, rather than $3 \times 3 = 9$ rows, when there are 3 unique 'species' and 3 unique 'island's?
In [17]:
# Read this as:
species_and_island = (
    penguins.groupby(['species', 'island'])         # for every combination of 'species' and 'island' in the DataFrame,
    [['bill_length_mm', 'bill_depth_mm']].mean()    # calculate the mean 'bill_length_mm' and the mean 'bill_depth_mm'.
)
species_and_island
Out[17]:
bill_length_mm bill_depth_mm
species island
Adelie Biscoe 38.98 18.37
Dream 38.52 18.24
Torgersen 39.04 18.45
Chinstrap Dream 48.83 18.42
Gentoo Biscoe 47.57 15.00
  • When grouping by multiple columns, the resulting DataFrame has a MultiIndex.
In [18]:
species_and_island['bill_length_mm'] 
Out[18]:
species    island   
Adelie     Biscoe       38.98
           Dream        38.52
           Torgersen    39.04
Chinstrap  Dream        48.83
Gentoo     Biscoe       47.57
Name: bill_length_mm, dtype: float64
In [19]:
species_and_island.loc['Adelie'] 
Out[19]:
bill_length_mm bill_depth_mm
island
Biscoe 38.98 18.37
Dream 38.52 18.24
Torgersen 39.04 18.45
In [20]:
species_and_island.loc[('Adelie', 'Torgersen')] 
Out[20]:
bill_length_mm    39.04
bill_depth_mm     18.45
Name: (Adelie, Torgersen), dtype: float64
  • Advice: When working with a MultiIndex, use reset_index or set as_index=False in groupby.
In [21]:
# Now, this looks like a regular DataFrame!
species_and_island.reset_index() 
Out[21]:
species island bill_length_mm bill_depth_mm
0 Adelie Biscoe 38.98 18.37
1 Adelie Dream 38.52 18.24
2 Adelie Torgersen 39.04 18.45
3 Chinstrap Dream 48.83 18.42
4 Gentoo Biscoe 47.57 15.00
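
The as_index=False route produces the same numbers as species_and_island.reset_index(); here is a minimal sketch:

(
    penguins
    .groupby(['species', 'island'], as_index=False)   # Group labels stay as regular columns.
    [['bill_length_mm', 'bill_depth_mm']]
    .mean()
)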

Activity

Find the most popular 'Male' and 'Female' baby 'Name' for each 'Year' in baby. Exclude 'Year's where there were fewer than 1 million births recorded.

In [22]:
baby = pd.read_csv('data/baby.csv')
baby
Out[22]:
Name Sex Count Year
0 Liam M 20456 2022
1 Noah M 18621 2022
2 Olivia F 16573 2022
... ... ... ... ...
2085155 Wright M 5 1880
2085156 York M 5 1880
2085157 Zachariah M 5 1880

2085158 rows × 4 columns

In [23]:
(
    baby
    .groupby('Year')
    .filter(lambda df: df['Count'].sum() >= 1_000_000) # Keeps only the 'Year's with at least 1,000,000 births.
    .sort_values('Count', ascending=False)             # Sorts by 'Count' in descending order, so the most popular 'Name's are always at the top.
    .groupby(['Year', 'Sex'])                          # Finds the first row for every combination of ('Year', 'Sex').
    .first()
)
Out[23]:
Name Count
Year Sex
1913 F Mary 36642
M John 29329
1914 F Mary 45346
... ... ... ...
2021 M Liam 20365
2022 F Olivia 16573
M Liam 20456

220 rows × 2 columns


Pivot tables using the pivot_table method¶


Pivot tables: An extension of grouping¶

  • Pivot tables are a compact way to display tables for humans to read:
Sex F M
Year
2018 1698373 1813377
2019 1675139 1790682
2020 1612393 1721588
2021 1635800 1743913
2022 1628730 1733166
  • Notice that each value in the table is a sum of the 'Count's, for different combinations of 'Year' and 'Sex'.
  • You can think of pivot tables as grouping using two columns, then "pivoting" one of the group labels into columns.
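
As a sketch of that mental model: grouping the baby DataFrame (loaded earlier in the lecture) by both columns and then unstacking the inner index level produces exactly the table above.

(
    baby[baby['Year'] >= 2018]
    .groupby(['Year', 'Sex'])
    ['Count']
    .sum()
    .unstack()   # Pivot the inner index level ('Sex') into columns.
)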

pivot_table¶

  • The pivot_table DataFrame method aggregates a DataFrame using two columns. To use it:

df.pivot_table(index=index_col,
               columns=columns_col,
               values=values_col,
               aggfunc=func)
  • The resulting DataFrame will have:
    • One row for every unique value in index_col.
    • One column for every unique value in columns_col.
    • Values determined by applying func on values in values_col.
In [24]:
last_5_years = baby[baby['Year'] >= 2018] 
last_5_years
Out[24]:
Name Sex Count Year
0 Liam M 20456 2022
1 Noah M 18621 2022
2 Olivia F 16573 2022
... ... ... ... ...
159444 Zyrie M 5 2018
159445 Zyron M 5 2018
159446 Zzyzx M 5 2018

159447 rows × 4 columns

In [25]:
last_5_years.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='sum',
)
Out[25]:
Sex F M
Year
2018 1698373 1813377
2019 1675139 1790682
2020 1612393 1721588
2021 1635800 1743913
2022 1628730 1733166
In [26]:
# Same information as above, but harder to read!
(
    last_5_years
    .groupby(['Year', 'Sex'])
    [['Count']]
    .sum()
)
Out[26]:
Count
Year Sex
2018 F 1698373
M 1813377
2019 F 1675139
... ... ...
2021 M 1743913
2022 F 1628730
M 1733166

10 rows × 1 columns

Example: Finding the number of penguins per 'island' and 'species'¶

  • As a refresher, the penguins DataFrame looks like this:
In [27]:
penguins
Out[27]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

  • Suppose we want to find the number of penguins in penguins per 'island' and 'species'. We can do so without pivot_table:
In [28]:
penguins.value_counts(['island', 'species']) 
Out[28]:
island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64
In [29]:
penguins.groupby(['island', 'species']).size() 
Out[29]:
island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64
  • But the data is arguably easier to interpret when we do use pivot_table:
In [30]:
penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', # Choice of column here doesn't actually matter! Why?
    aggfunc='count',
)
Out[30]:
island Biscoe Dream Torgersen
species
Adelie 44.0 55.0 47.0
Chinstrap NaN 68.0 NaN
Gentoo 119.0 NaN NaN
  • Note that there is a NaN at the intersection of 'Biscoe' and 'Chinstrap', because there were no Chinstrap penguins on Biscoe Island.
    NaN stands for "not a number." It is numpy and pandas' version of a null value (the regular Python null value is None). We'll learn more about how to deal with these soon.
  • We can either use the fillna method afterwards or the fill_value argument to fill in NaNs.
In [31]:
penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)
Out[31]:
island Biscoe Dream Torgersen
species
Adelie 44 55 47
Chinstrap 0 68 0
Gentoo 119 0 0
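
For completeness, the fillna route mentioned above, as a sketch:

counts = penguins.pivot_table(
    index='species',
    columns='island',
    values='bill_length_mm',
    aggfunc='count',
)
counts.fillna(0).astype(int)   # Fill the missing combinations, then restore integer counts.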

Granularity, revisited¶

  • Each row of the original penguins DataFrame represented a single penguin, and each column represented features of the penguins.
In [32]:
penguins
Out[32]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

  • What is the granularity of the DataFrame below?
    That is, what does each row represent?
In [33]:
penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)
Out[33]:
island Biscoe Dream Torgersen
species
Adelie 44 55 47
Chinstrap 0 68 0
Gentoo 119 0 0

Reshaping¶

  • pivot_table reshapes DataFrames from "long" to "wide".
  • Other DataFrame reshaping methods:
    • melt: Un-pivots a DataFrame. Very useful in data cleaning.
    • pivot: Like pivot_table, but doesn't do aggregation.
    • stack: Pivots multi-level columns to multi-indices.
    • unstack: Pivots multi-indices to columns.
  • Google, the documentation, and ChatGPT are your friends!
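
For instance, here is a sketch of melt, which reshapes in the opposite direction, from "wide" back to "long":

counts_wide = penguins.pivot_table(
    index='species', columns='island',
    values='bill_length_mm', aggfunc='count', fill_value=0,
).reset_index()
counts_wide.melt(id_vars='species', var_name='island', value_name='count')   # One row per ('species', 'island') pair.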

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

What questions do you have?

Merging¶


In [34]:
phones = pd.DataFrame().assign(
    Model=['iPhone 16', 'iPhone 16 Pro Max', 'Samsung Galaxy S24 Ultra', 'Pixel 9 Pro'],
    Price=[799, 1199, 1299, 999],
    Screen=[6.1, 6.9, 6.8, 6.3]
)
inventory = pd.DataFrame().assign(
    Handset=['iPhone 16 Pro Max', 'iPhone 16', 'Pixel 9 Pro', 'Pixel 9 Pro', 'iPhone 16', 'iPhone 15'],
    Units=[50, 40, 10, 15, 100, 5],
    Store=['Briarwood', 'Somerset', 'Arbor Hills', '12 Oaks', 'Briarwood', 'Oakland Mall']
)

Example: Phone sales 📱¶

In [35]:
# The DataFrame on the left contains information about phones on the market.
# The DataFrame on the right contains information about the stock I have in my stores.
dfs_side_by_side(phones, inventory)
Model Price Screen
0 iPhone 16 799 6.1
1 iPhone 16 Pro Max 1199 6.9
2 Samsung Galaxy S24 Ultra 1299 6.8
3 Pixel 9 Pro 999 6.3
Handset Units Store
0 iPhone 16 Pro Max 50 Briarwood
1 iPhone 16 40 Somerset
2 Pixel 9 Pro 10 Arbor Hills
3 Pixel 9 Pro 15 12 Oaks
4 iPhone 16 100 Briarwood
5 iPhone 15 5 Oakland Mall
  • Question: If I sell all of the phones in my inventory, how much will I make in revenue?
  • The information I need to answer the question is spread across multiple DataFrames.
  • The solution is to merge the two DataFrames together.
    The SQL term for merge is join.
  • A merge is appropriate when we have two sources of information about the same individuals that are linked by a common column or columns. The common column(s) are called the join key.

If I sell all of the phones in my inventory, how much will I make in revenue?¶

In [36]:
combined = phones.merge(inventory, left_on='Model', right_on='Handset') 
combined
Out[36]:
Model Price Screen Handset Units Store
0 iPhone 16 799 6.1 iPhone 16 40 Somerset
1 iPhone 16 799 6.1 iPhone 16 100 Briarwood
2 iPhone 16 Pro Max 1199 6.9 iPhone 16 Pro Max 50 Briarwood
3 Pixel 9 Pro 999 6.3 Pixel 9 Pro 10 Arbor Hills
4 Pixel 9 Pro 999 6.3 Pixel 9 Pro 15 12 Oaks
In [37]:
np.sum(combined['Price'] * combined['Units']) 
Out[37]:
196785

What just happened!? 🤯¶

In [38]:
# Click through the presentation that appears.
show_merging_animation()

The merge method¶

  • The merge DataFrame method joins two DataFrames by columns or indexes.
    As mentioned before, "merge" is just the pandas word for "join."
  • When using the merge method, the DataFrame whose merge method we call is the "left" DataFrame, and the DataFrame passed into merge is the "right" DataFrame.
    In phones.merge(inventory), phones is considered the "left" DataFrame and inventory is the "right" DataFrame.
    The columns from the left DataFrame appear to the left of the columns from the right DataFrame.
  • By default:
    • If join keys are not specified, all shared columns between the two DataFrames are used.
    • The "type" of join performed is an inner join, which is just one of many types of joins.

Inner joins¶

  • The default type of join that merge performs is an inner join, which keeps the intersection of the join keys.
In [39]:
# The DataFrame on the far right is the merged DataFrame.
dfs_side_by_side(phones, inventory, phones.merge(inventory, left_on='Model', right_on='Handset'))
Model Price Screen
0 iPhone 16 799 6.1
1 iPhone 16 Pro Max 1199 6.9
2 Samsung Galaxy S24 Ultra 1299 6.8
3 Pixel 9 Pro 999 6.3
Handset Units Store
0 iPhone 16 Pro Max 50 Briarwood
1 iPhone 16 40 Somerset
2 Pixel 9 Pro 10 Arbor Hills
3 Pixel 9 Pro 15 12 Oaks
4 iPhone 16 100 Briarwood
5 iPhone 15 5 Oakland Mall
Model Price Screen Handset Units Store
0 iPhone 16 799 6.1 iPhone 16 40 Somerset
1 iPhone 16 799 6.1 iPhone 16 100 Briarwood
2 iPhone 16 Pro Max 1199 6.9 iPhone 16 Pro Max 50 Briarwood
3 Pixel 9 Pro 999 6.3 Pixel 9 Pro 10 Arbor Hills
4 Pixel 9 Pro 999 6.3 Pixel 9 Pro 15 12 Oaks
  • Note that 'Samsung Galaxy S24 Ultra' and 'iPhone 15' do not appear in the merged DataFrame.
  • That's because there is no 'Samsung Galaxy S24 Ultra' in the right DataFrame (inventory), and no 'iPhone 15' in the left DataFrame (phones).

Other join types¶

  • We can change the type of join performed by changing the how argument in merge.
In [40]:
phones.merge(inventory, left_on='Model', right_on='Handset', how='left')
Out[40]:
Model Price Screen Handset Units Store
0 iPhone 16 799 6.1 iPhone 16 40.0 Somerset
1 iPhone 16 799 6.1 iPhone 16 100.0 Briarwood
2 iPhone 16 Pro Max 1199 6.9 iPhone 16 Pro Max 50.0 Briarwood
3 Samsung Galaxy S24 Ultra 1299 6.8 NaN NaN NaN
4 Pixel 9 Pro 999 6.3 Pixel 9 Pro 10.0 Arbor Hills
5 Pixel 9 Pro 999 6.3 Pixel 9 Pro 15.0 12 Oaks
In [41]:
phones.merge(inventory, left_on='Model', right_on='Handset', how='right')
Out[41]:
Model Price Screen Handset Units Store
0 iPhone 16 Pro Max 1199.0 6.9 iPhone 16 Pro Max 50 Briarwood
1 iPhone 16 799.0 6.1 iPhone 16 40 Somerset
2 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 10 Arbor Hills
3 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 15 12 Oaks
4 iPhone 16 799.0 6.1 iPhone 16 100 Briarwood
5 NaN NaN NaN iPhone 15 5 Oakland Mall
In [42]:
phones.merge(inventory, left_on='Model', right_on='Handset', how='outer')
Out[42]:
Model Price Screen Handset Units Store
0 iPhone 16 799.0 6.1 iPhone 16 40.0 Somerset
1 iPhone 16 799.0 6.1 iPhone 16 100.0 Briarwood
2 iPhone 16 Pro Max 1199.0 6.9 iPhone 16 Pro Max 50.0 Briarwood
3 Samsung Galaxy S24 Ultra 1299.0 6.8 NaN NaN NaN
4 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 10.0 Arbor Hills
5 Pixel 9 Pro 999.0 6.3 Pixel 9 Pro 15.0 12 Oaks
6 NaN NaN NaN iPhone 15 5.0 Oakland Mall
  • The website pandastutor.com can help visualize DataFrame operations like these. Here's a direct link to this specific example.

Different join types handle mismatches differently¶

  • Inner join: Keep only the matching keys (intersection).
  • Outer join: Keep all keys in both DataFrames (union).
  • Left join: Keep all keys in the left DataFrame, whether or not they are in the right DataFrame.
  • Right join: Keep all keys in the right DataFrame, whether or not they are in the left DataFrame.
    Note that a.merge(b, how='left') contains the same information as b.merge(a, how='right'), just in a different order.
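
Here is a sketch that checks the note above: after aligning the columns and sorting the rows, the two results are identical.

left = phones.merge(inventory, left_on='Model', right_on='Handset', how='left')
right = inventory.merge(phones, left_on='Handset', right_on='Model', how='right')
# equals treats NaNs in matching positions as equal, unlike ==.
(
    left.sort_values(['Model', 'Store']).reset_index(drop=True)
    .equals(right[left.columns].sort_values(['Model', 'Store']).reset_index(drop=True))
)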

Reference Slide¶

Notes on the merge method¶

  • merge is flexible – you can merge using a combination of columns, or the index of the DataFrame.
  • If the two DataFrames have other columns with the same names, pandas will add _x and _y to the duplicated column names to avoid having columns with the same name (change these with the suffixes argument).
  • There is, in fact, a join method, but it's actually a wrapper around merge with fewer options.
  • As always, the documentation is your friend!
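
As a sketch of the join method, which aligns on the index by default and performs a left join unless told otherwise:

phones.set_index('Model').join(inventory.set_index('Handset'))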

Reference Slide¶

Lots of pandas operations do an implicit outer join!¶

  • pandas will almost always try to match up index values using an outer join.
  • It won't tell you that it's doing an outer join, it'll just throw NaNs in your result!
In [43]:
df1 = pd.DataFrame({'a': [1, 2, 3]}, index=['hello', 'eecs398', 'students'])
df2 = pd.DataFrame({'b': [10, 20, 30]}, index=['eecs398', 'is', 'awesome'])
dfs_side_by_side(df1, df2)
a
hello 1
eecs398 2
students 3
b
eecs398 10
is 20
awesome 30
In [44]:
df1['a'] + df2['b']
Out[44]:
awesome      NaN
eecs398     12.0
hello        NaN
is           NaN
students     NaN
dtype: float64
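
If those NaNs aren't what you want, one option is the add method with a fill_value, which treats a label missing from one side as 0 (a sketch):

df1['a'].add(df2['b'], fill_value=0)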

Activity setup¶

In [45]:
midwest_cities = pd.DataFrame().assign(
    city=['Ann Arbor', 'Detroit', 'Chicago', 'East Lansing'],
    state=['Michigan', 'Michigan', 'Illinois', 'Michigan'],
    today_high_temp=['79', '83', '87', '87']
)
schools = pd.DataFrame().assign(
    name=['University of Michigan', 'University of Chicago', 'Wayne State University', 'Johns Hopkins University', 'UC San Diego', 'Concordia U-Ann Arbor', 'Michigan State University'], 
    city=['Ann Arbor', 'Chicago', 'Detroit', 'Baltimore', 'La Jolla', 'Ann Arbor', 'East Lansing'],
    state=['Michigan', 'Illinois', 'Michigan', 'Maryland', 'California', 'Michigan', 'Michigan'],
    graduation_rate=[0.87, 0.94, 0.78, 0.92, 0.81, 0.83, 0.91]
)

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Without writing code, how many rows are in midwest_cities.merge(schools, on='city')?


A. 4          B. 5          C. 6          D. 7          E. 8
In [46]:
dfs_side_by_side(midwest_cities, schools)
city state today_high_temp
0 Ann Arbor Michigan 79
1 Detroit Michigan 83
2 Chicago Illinois 87
3 East Lansing Michigan 87
name city state graduation_rate
0 University of Michigan Ann Arbor Michigan 0.87
1 University of Chicago Chicago Illinois 0.94
2 Wayne State University Detroit Michigan 0.78
3 Johns Hopkins University Baltimore Maryland 0.92
4 UC San Diego La Jolla California 0.81
5 Concordia U-Ann Arbor Ann Arbor Michigan 0.83
6 Michigan State University East Lansing Michigan 0.91
In [47]:
# Answer: 5.
midwest_cities.merge(schools, on='city')
Out[47]:
city state_x today_high_temp name state_y graduation_rate
0 Ann Arbor Michigan 79 University of Michigan Michigan 0.87
1 Ann Arbor Michigan 79 Concordia U-Ann Arbor Michigan 0.83
2 Detroit Michigan 83 Wayne State University Michigan 0.78
3 Chicago Illinois 87 University of Chicago Illinois 0.94
4 East Lansing Michigan 87 Michigan State University Michigan 0.91

Followup activity¶

Without writing code, how many rows are in midwest_cities.merge(schools, on='state')?

In [48]:
dfs_side_by_side(midwest_cities, schools)
city state today_high_temp
0 Ann Arbor Michigan 79
1 Detroit Michigan 83
2 Chicago Illinois 87
3 East Lansing Michigan 87
name city state graduation_rate
0 University of Michigan Ann Arbor Michigan 0.87
1 University of Chicago Chicago Illinois 0.94
2 Wayne State University Detroit Michigan 0.78
3 Johns Hopkins University Baltimore Maryland 0.92
4 UC San Diego La Jolla California 0.81
5 Concordia U-Ann Arbor Ann Arbor Michigan 0.83
6 Michigan State University East Lansing Michigan 0.91
In [49]:
# Answer: 13.
midwest_cities.merge(schools, on='state')
Out[49]:
city_x state today_high_temp name city_y graduation_rate
0 Ann Arbor Michigan 79 University of Michigan Ann Arbor 0.87
1 Ann Arbor Michigan 79 Wayne State University Detroit 0.78
2 Ann Arbor Michigan 79 Concordia U-Ann Arbor Ann Arbor 0.83
... ... ... ... ... ... ...
10 East Lansing Michigan 87 Concordia U-Ann Arbor Ann Arbor 0.83
11 East Lansing Michigan 87 Michigan State University East Lansing 0.91
12 Chicago Illinois 87 University of Chicago Chicago 0.94

13 rows × 6 columns


Activity

Fill in the blank so that the last statement evaluates to True.

df = midwest_cities.merge(schools, on='state')
df.shape[0] == (____).sum()

Don't use merge (or join) in your solution!

In [50]:
midwest_cities['state'].value_counts()
Out[50]:
state
Michigan    3
Illinois    1
Name: count, dtype: int64
In [51]:
schools['state'].value_counts()
Out[51]:
state
Michigan      4
Illinois      1
Maryland      1
California    1
Name: count, dtype: int64
In [52]:
# When we multiply the above two Series,
# the product is done by matching up the index.
midwest_cities['state'].value_counts() * schools['state'].value_counts()
Out[52]:
state
California     NaN
Illinois       1.0
Maryland       NaN
Michigan      12.0
Name: count, dtype: float64
In [53]:
# When we sum the resulting Series, the missing values are ignored.
# The expression below evaluates to 13, which is the answer to the previous slide's question.
(midwest_cities['state'].value_counts() * schools['state'].value_counts()).sum()
Out[53]:
13.0

What's next?¶

  • How do we decide which type of visualization to create?
  • How do we deal with missing values?

Reference Section¶

Another example¶

The rest of the notebook contains an additional example of how merge can be used to solve larger problems. It's a good idea to walk through it as if it's an exercise. Try to fill in the missing code here, and look at the posted HTML on the course website (or lec06-filled.ipynb) for solutions.


Name categories¶

  • This New York Times article claims that certain categories of names are becoming more popular. For example:

    • Forbidden names like Lucifer, Lilith, Kali, and Danger.

    • Evangelical names like Amen, Savior, Canaan, and Creed.

    • Mythological names.

    • It also claims that baby boomer names are becoming less popular.

  • Let's see if we can verify these claims using data!

Loading in the data¶

  • Our first DataFrame, baby, is the same as we saw earlier in the lecture. It has one row for every combination of 'Name', 'Sex', and 'Year'.
In [54]:
baby
Out[54]:
Name Sex Count Year
0 Liam M 20456 2022
1 Noah M 18621 2022
2 Olivia F 16573 2022
... ... ... ... ...
2085155 Wright M 5 1880
2085156 York M 5 1880
2085157 Zachariah M 5 1880

2085158 rows × 4 columns

  • Our second DataFrame, nyt, contains the New York Times' categorization of each of several names, based on the aforementioned article.
In [55]:
nyt = pd.read_csv('data/nyt_names.csv')
nyt
Out[55]:
nyt_name category
0 Lucifer forbidden
1 Lilith forbidden
2 Danger forbidden
... ... ...
20 Venus celestial
21 Celestia celestial
22 Skye celestial

23 rows × 2 columns

  • Issue: To find the number of babies born with (for example) forbidden names each year, we need to combine information from both baby and nyt.
  • Solution: merge the two DataFrames!

Returning back to our original question¶

Your Job: Assign category_counts to a DataFrame that contains one row for every combination of 'category' in nyt and 'Year' in baby. It should have three columns: 'category', 'Year', and 'Count', where 'Count' contains the total number of babies born with that name 'category' in that 'Year'. The first few rows of the DataFrame you're meant to create are given below.

In [56]:
category_counts = (
    baby
    .merge(nyt, left_on='Name', right_on='nyt_name')
    .groupby(['category', 'Year'])
    ['Count']
    .sum()
    .reset_index()
)
category_counts
Out[56]:
category Year Count
0 boomer 1880 292
1 boomer 1881 298
2 boomer 1882 326
... ... ... ...
659 mythology 2020 3516
660 mythology 2021 3895
661 mythology 2022 4049

662 rows × 3 columns

Once you've done that, run the cell below!

In [57]:
# We'll talk about plotting code soon!
import plotly.express as px
fig = px.line(category_counts, x='Year', y='Count',
              facet_col='category', facet_col_wrap=3,
              facet_row_spacing=0.15,
              width=600, height=400)
fig.update_yaxes(matches=None, showticklabels=False)