from lec_utils import *
def show_grouping_animation():
    """Display the embedded Google Slides grouping animation as a 960x509 IFrame."""
    slides_url = (
        "https://docs.google.com/presentation/d/"
        "1tBaFyHseIGsX5wmE3BdNLeVHnKksQtpzLhHge8Tzly0/embed"
        "?start=false&loop=false&delayms=60000&rm=minimal"
    )
    frame = IFrame(slides_url, 960, 509)
    display(frame)

IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&amp;start=11',
       width=560, height=315)

4207.057057057057

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

3706.1643835616437

3733.0882352941176

5092.436974789916

4207.057057057057

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

'Chinstrap'

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16a301420>

IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&amp;start=11',
       width=560, height=315)

penguins = pd.read_csv('data/penguins.csv')
penguins

penguins.plot(kind='scatter', 
              x='bill_length_mm', 
              y='body_mass_g', 
              color='species', 
              title='Body Mass vs. Bill Length')

penguins['body_mass_g'].mean()

4207.057057057057

penguins['species'].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

penguins.loc[penguins['species'] == 'Adelie', 'body_mass_g'].mean()

3706.1643835616437

penguins.loc[penguins['species'] == 'Chinstrap', 'body_mass_g'].mean()

3733.0882352941176

penguins.loc[penguins['species'] == 'Gentoo', 'body_mass_g'].mean()

5092.436974789916

# To find the overall mean 'body_mass_g':
penguins['body_mass_g'].mean()

4207.057057057057

# To find the mean 'body_mass_g' for each 'species':
penguins.groupby('species')['body_mass_g'].mean()

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

show_grouping_animation()

penguins

'Chinstrap'

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16a301420>

{'Adelie': [0, 1, 2, 4, 6, 11, 12, 15, 19, 20, 21, 23, 25, 26, 33, 34, 36, 37, 38, 42, 49, 51, 52, 54, 59, 60, 61, 63, 64, 69, 70, 71, 77, 78, 79, 81, 85, 87, 90, 91, 94, 95, 96, 102, 103, 106, 107, 111, 112, 115, 117, 118, 120, 121, 123, 126, 127, 131, 133, 136, 139, 141, 143, 147, 150, 153, 158, 160, 164, 165, 167, 168, 172, 173, 174, 177, 178, 184, 188, 193, 198, 201, 203, 207, 208, 210, 214, 216, 217, 218, 219, 220, 223, 227, 228, 229, 230, 232, 234, 237, ...], 'Chinstrap': [3, 5, 7, 8, 13, 18, 22, 24, 28, 30, 31, 39, 48, 50, 55, 58, 74, 84, 92, 93, 97, 99, 101, 105, 110, 125, 128, 130, 135, 140, 142, 146, 152, 162, 163, 176, 180, 183, 187, 190, 192, 195, 197, 206, 209, 211, 224, 235, 240, 241, 242, 246, 251, 260, 261, 265, 266, 271, 282, 302, 308, 311, 312, 317, 327, 328, 329, 330], 'Gentoo': [9, 10, 14, 16, 17, 27, 29, 32, 35, 40, 41, 43, 44, 45, 46, 47, 53, 56, 57, 62, 65, 66, 67, 68, 72, 73, 75, 76, 80, 82, 83, 86, 88, 89, 98, 100, 104, 108, 109, 113, 114, 116, 119, 122, 124, 129, 132, 134, 137, 138, 144, 145, 148, 149, 151, 154, 155, 156, 157, 159, 161, 166, 169, 170, 171, 175, 179, 181, 182, 185, 186, 189, 191, 194, 196, 199, 200, 202, 204, 205, 212, 213, 215, 221, 222, 225, 226, 231, 233, 236, 245, 247, 248, 252, 253, 257, 264, 267, 268, 275, ...]}

<pandas.core.groupby.generic.SeriesGroupBy object at 0x16a3015a0>

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1048fc280>

species
Adelie       146
Chinstrap     68
Gentoo       119
dtype: int64

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
Name: is_Dream, dtype: float64

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

penguins

(
    penguins
    .groupby('species')
    ['bill_length_mm']
    .median()
    .idxmax()
)

'Chinstrap'

(
    penguins
    .groupby('species')
    ['bill_length_mm']
    .median()
    .plot(kind='barh', title='Median Bill Length of Each Species')
)

penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

penguins.groupby('species')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16a301420>

# Creates one group for each unique value in the species column.
penguins.groupby('species').groups

{'Adelie': [0, 1, 2, 4, 6, 11, 12, 15, 19, 20, 21, 23, 25, 26, 33, 34, 36, 37, 38, 42, 49, 51, 52, 54, 59, 60, 61, 63, 64, 69, 70, 71, 77, 78, 79, 81, 85, 87, 90, 91, 94, 95, 96, 102, 103, 106, 107, 111, 112, 115, 117, 118, 120, 121, 123, 126, 127, 131, 133, 136, 139, 141, 143, 147, 150, 153, 158, 160, 164, 165, 167, 168, 172, 173, 174, 177, 178, 184, 188, 193, 198, 201, 203, 207, 208, 210, 214, 216, 217, 218, 219, 220, 223, 227, 228, 229, 230, 232, 234, 237, ...], 'Chinstrap': [3, 5, 7, 8, 13, 18, 22, 24, 28, 30, 31, 39, 48, 50, 55, 58, 74, 84, 92, 93, 97, 99, 101, 105, 110, 125, 128, 130, 135, 140, 142, 146, 152, 162, 163, 176, 180, 183, 187, 190, 192, 195, 197, 206, 209, 211, 224, 235, 240, 241, 242, 246, 251, 260, 261, 265, 266, 271, 282, 302, 308, 311, 312, 317, 327, 328, 329, 330], 'Gentoo': [9, 10, 14, 16, 17, 27, 29, 32, 35, 40, 41, 43, 44, 45, 46, 47, 53, 56, 57, 62, 65, 66, 67, 68, 72, 73, 75, 76, 80, 82, 83, 86, 88, 89, 98, 100, 104, 108, 109, 113, 114, 116, 119, 122, 124, 129, 132, 134, 137, 138, 144, 145, 148, 149, 151, 154, 155, 156, 157, 159, 161, 166, 169, 170, 171, 175, 179, 181, 182, 185, 186, 189, 191, 194, 196, 199, 200, 202, 204, 205, 212, 213, 215, 221, 222, 225, 226, 231, 233, 236, 245, 247, 248, 252, 253, 257, 264, 267, 268, 275, ...]}

penguins.groupby('species')['bill_length_mm']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x16a3015a0>

penguins.groupby('species')[['bill_length_mm', 'bill_depth_mm']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1048fc280>

# Note that this worked on the entire DataFrame!
# But, if all we wanted were the sums of 'body_mass_g'
# for each species, this is slower than
# penguins.groupby('species')['body_mass_g'].sum().
penguins.groupby('species').sum()

# Often used in conjunction with sort_values.
# Remember this when you work on the activity in a few slides!
penguins.groupby('species').last()

# Similar to value_counts, but not identical!
penguins.groupby('species').size()

species
Adelie       146
Chinstrap     68
Gentoo       119
dtype: int64

penguins['species'].value_counts()

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

penguins.groupby('species').max()

# This penguin lived on Biscoe island!
penguins.loc[(penguins['species'] == 'Adelie') & (penguins['body_mass_g'] == 4775.0)]

# General idea: Sort the penguins by mass in decreasing order.
# Then, the first male penguin that appears is the heaviest male penguin,
# and the first female penguin that appears is the heaviest female penguin.
# For each sex, take the first row.
(
    penguins
    .sort_values('body_mass_g', ascending=False)
    .groupby('sex')
    .first()
)

# Proportion of each species' penguins that were observed on 'Dream' island.
# `is_Dream` is a Boolean column, and the mean of Booleans is the fraction that are True.
(
    penguins
    .assign(is_Dream=penguins['island'] == 'Dream')
    .groupby('species')
    ['is_Dream']
    .mean()
)

species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
Name: is_Dream, dtype: float64

(
    penguins
    .groupby('species')
    ['body_mass_g']
    .agg(['count', 'mean'])
)

# Passing a dictionary applies a different aggregation to each column:
# here, the max 'bill_length_mm' and the unique 'island' values for each 'species'.
# (`aggregate` is the long-form name of `agg`.)
(
    penguins
    .groupby('species')
    .aggregate({'bill_length_mm': 'max', 'island': 'unique'})
)

# Here, the argument to agg is a function,
# which takes in a Series and returns a scalar.
def second_largest(s):
    """Return the second-largest value in the Series `s`.

    Missing values are dropped before ranking: `sort_values` places NaNs
    at the end by default, so without dropping them the value at position
    -2 would be the maximum (or NaN) rather than the second-largest value.
    Duplicates are not collapsed, so if the maximum appears twice, the
    maximum itself is returned.

    Raises:
        IndexError: if `s` has fewer than two non-missing values.
    """
    return s.dropna().sort_values().iloc[-2]
(
    penguins
    .groupby('species')
    ['body_mass_g']
    .agg(second_largest)
)

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

# This is a query, NOT a filter for the purposes of this slide.
penguins[penguins['bill_length_mm'] > 47]

# Unlike the query above, this keeps or drops entire groups:
# the lambda is called once per 'species' and returns a single Boolean,
# and every row of a species is kept only if that species' mean 'bill_length_mm' exceeds 47.
(
    penguins
    .groupby('species')
    .filter(lambda df: df['bill_length_mm'].mean() > 47)
)

# Since 'Adelie's have a mean 'bill_length_mm' below 47, they aren't included in the output above.
penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

(
    penguins
    .groupby('species')
    .filter(lambda df: (df.shape[0] >= 100) and ((df['sex'] == 'Female').sum() >= 60))
    ['species']
    .unique()
    [0]
)

'Adelie'

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

species
Adelie       73
Gentoo       58
Chinstrap    34
Name: count, dtype: int64

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

0      3550.0
1      3325.0
2      3400.0
        ...  
330    3800.0
331    4250.0
332    5050.0
Name: body_mass_g, Length: 333, dtype: float64

species
Adelie       197.0
Chinstrap    210.0
Gentoo       221.0
dtype: float64

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64

island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64

(
    penguins
    .groupby('species')
    .filter(lambda df: (df.shape[0] >= 100) and ((df['sex'] == 'Female').sum() >= 60))
    ['species']
    .unique()
    [0]
)

'Adelie'

# Note that to just find the 'species' with at least 100 penguins,
# we didn't need to group:
penguins['species'].value_counts()

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

penguins.loc[penguins['sex'] == 'Female', 'species'].value_counts()

species
Adelie       73
Gentoo       58
Chinstrap    34
Name: count, dtype: int64

penguins.groupby('species')['body_mass_g'].transform(lambda s: s - s.mean())

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

0      3550.0
1      3325.0
2      3400.0
        ...  
330    3800.0
331    4250.0
332    5050.0
Name: body_mass_g, Length: 333, dtype: float64

species
Adelie       197.0
Chinstrap    210.0
Gentoo       221.0
dtype: float64

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64

island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64

penguins.groupby('species')['body_mass_g'].transform(lambda s: s - s.mean())

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

penguins['body_mass_g']

0      3550.0
1      3325.0
2      3400.0
        ...  
330    3800.0
331    4250.0
332    5050.0
Name: body_mass_g, Length: 333, dtype: float64

penguins.groupby('species').apply(lambda df: df.sort_values('body_mass_g', ascending=False).head(2))

# For each 'species', find the 'flipper_length_mm' of its heaviest penguin:
# sort the group's rows by 'body_mass_g' in decreasing order, then take the first flipper length.
# Since the lambda returns a scalar for each group, the result is a Series indexed by 'species'.
(
    penguins
    .groupby('species')
    .apply(
        lambda df: df.sort_values('body_mass_g', ascending=False)['flipper_length_mm'].iloc[0]
    )
)

species
Adelie       197.0
Chinstrap    210.0
Gentoo       221.0
dtype: float64

penguins.groupby('species')['body_mass_g'].agg(lambda s: s.sort_values().iloc[-2])

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

(
    penguins
    .groupby('species')
    .filter(lambda df: (df.shape[0] >= 100) and ((df['sex'] == 'Female').sum() >= 60))
)

penguins.groupby('species')['body_mass_g'].transform(lambda s: s - s.mean())

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

penguins.groupby('species').apply(lambda df: df.sort_values('body_mass_g', ascending=False).head(2))

# Read this as:
species_and_island = (
    penguins.groupby(['species', 'island'])         # for every combination of 'species' and 'island' in the DataFrame,
    [['bill_length_mm', 'bill_depth_mm']].mean()    # calculate the mean 'bill_length_mm' and the mean 'bill_depth_mm'.
)
species_and_island

island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64

island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64

# Read this as:
species_and_island = (
    penguins.groupby(['species', 'island'])         # for every combination of 'species' and 'island' in the DataFrame,
    [['bill_length_mm', 'bill_depth_mm']].mean()    # calculate the mean 'bill_length_mm' and the mean 'bill_depth_mm'.
)
species_and_island

# Now, this looks like a regular DataFrame!
species_and_island.reset_index()

df.pivot_table(index=index_col,
                       columns=columns_col,
                       values=values_col,
                       aggfunc=func)

# Mean 'body_mass_g' for every combination of 'species' and 'sex':
# one row per 'species', one column per 'sex'.
penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='mean'
)

# Same information as above, but harder to read!
(
    penguins
    .groupby(['species', 'sex'])
    [['body_mass_g']]
    .mean()
)

penguins

penguins.value_counts(['island', 'species'])

island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64

penguins.groupby(['island', 'species']).size()

island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', # Choice of column here doesn't actually matter! Why?
    aggfunc='count',
)

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)

penguins

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)

	Species	Color	Weight	Age
0	dog	black	40	5.0
1	cat	golden	15	8.0
2	cat	black	20	9.0
3	dog	white	80	2.0
4	dog	golden	25	0.5
5	hamster	golden	1	3.0

	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
species
Adelie	DreamTorgersenDreamBiscoeTorgersenBiscoeTorger...	5668.3	2678.7	27755.0	541100.0	MaleFemaleFemaleMaleFemaleMaleFemaleMaleFemale...
Chinstrap	DreamDreamDreamDreamDreamDreamDreamDreamDreamD...	3320.7	1252.6	13316.0	253850.0	FemaleFemaleMaleMaleFemaleFemaleFemaleFemaleFe...
Gentoo	BiscoeBiscoeBiscoeBiscoeBiscoeBiscoeBiscoeBisc...	5660.6	1784.6	25851.0	606000.0	FemaleFemaleFemaleFemaleMaleFemaleFemaleMaleMa...

sex	Female	Male
species
Adelie	3368.84	4043.49
Chinstrap	3527.21	3938.97
Gentoo	4679.74	5484.84

sex	Female	Male
species
Adelie	3368.84	4043.49
Chinstrap	3527.21	3938.97
Gentoo	4679.74	5484.84

island	Biscoe	Dream	Torgersen
species
Adelie	44.0	55.0	47.0
Chinstrap	NaN	68.0	NaN
Gentoo	119.0	NaN	NaN

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Dream	41.3	20.3	194.0	3550.0	Male
1	Adelie	Torgersen	38.5	17.9	190.0	3325.0	Female
2	Adelie	Dream	34.0	17.1	185.0	3400.0	Female
...	...	...	...	...	...	...	...
330	Chinstrap	Dream	46.6	17.8	193.0	3800.0	Female
331	Adelie	Dream	39.7	17.9	193.0	4250.0	Male
332	Gentoo	Biscoe	45.1	14.5	207.0	5050.0	Female

	bill_length_mm	island
species
Adelie	46.0	[Dream, Torgersen, Biscoe]
Chinstrap	58.0	[Dream]
Gentoo	59.6	[Biscoe]

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
5	Chinstrap	Dream	50.1	17.9	190.0	3400.0	Female
7	Chinstrap	Dream	51.4	19.0	201.0	3950.0	Male
8	Chinstrap	Dream	51.3	19.2	193.0	3650.0	Male
...	...	...	...	...	...	...	...
327	Chinstrap	Dream	49.3	19.9	203.0	4050.0	Male
328	Chinstrap	Dream	50.7	19.7	203.0	4050.0	Male
329	Chinstrap	Dream	51.3	19.9	198.0	3700.0	Male

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
3	Chinstrap	Dream	45.5	17.0	196.0	3500.0	Female
5	Chinstrap	Dream	50.1	17.9	190.0	3400.0	Female
7	Chinstrap	Dream	51.4	19.0	201.0	3950.0	Male
...	...	...	...	...	...	...	...
329	Chinstrap	Dream	51.3	19.9	198.0	3700.0	Male
330	Chinstrap	Dream	46.6	17.8	193.0	3800.0	Female
332	Gentoo	Biscoe	45.1	14.5	207.0	5050.0	Female

	species	island	bill_length_mm	bill_depth_mm
0	Adelie	Biscoe	38.98	18.37
1	Adelie	Dream	38.52	18.24
2	Adelie	Torgersen	39.04	18.45
3	Chinstrap	Dream	48.83	18.42
4	Gentoo	Biscoe	47.57	15.00

Lecture 5¶

Aggregation: Grouping and Pivoting¶

EECS 398: Practical Data Science, Winter 2025¶

Agenda 📆¶

Read the guide!¶

Question 🤔 (Answer at practicaldsc.org/q)

Introduction to the groupby method¶

Example: Palmer Penguins¶

Loading the data¶

Visualizing the data¶

Aggregating¶

A naïve approach to finding the mean 'body_mass_g' per 'species'¶

The magic of groupby 🪄¶

An illustrative example: Pets 🐱 🐶🐹¶

"Split-apply-combine" paradigm¶

Activity

groupby's inner workings¶

How does groupby actually work?¶

DataFrameGroupBy objects¶

Column extraction¶

Aggregation¶

Reminder: Column independence¶

Activity

Activity

Advanced groupby usage¶

Beyond default aggregation methods¶

Grouping method 1: agg¶

agg with different aggregation methods for different columns¶

agg with a custom aggregation method¶

Grouping method 2: filter¶

Activity

Question 🤔 (Answer at practicaldsc.org/q)

Grouping method 3: transform¶

Grouping method 4: apply¶

⭐️ The grouping method cheat sheet: agg, filter, transform, and apply ⭐️¶

Question 🤔 (Answer at practicaldsc.org/q)

Grouping with multiple columns¶

Pivot tables using pivot_table¶

Pivot tables: An extension of grouping¶

pivot_table¶

Example: Finding the number of penguins per 'island' and 'species'¶

Granularity¶

Reshaping¶

Question 🤔 (Answer at practicaldsc.org/q)

Introduction to the `groupby` method¶

A naïve approach to finding the mean `'body_mass_g'` per `'species'`¶

The magic of `groupby` 🪄¶

`groupby`'s inner workings¶

How does `groupby` actually work?¶

`DataFrameGroupBy` objects¶

Advanced `groupby` usage¶

Grouping method 1: `agg`¶

`agg` with different aggregation methods for different columns¶

`agg` with a custom aggregation method¶

Grouping method 2: `filter`¶

Grouping method 3: `transform`¶

Grouping method 4: `apply`¶

⭐️ The grouping method cheat sheet: `agg`, `filter`, `transform`, and `apply` ⭐️¶

Pivot tables using `pivot_table`¶

`pivot_table`¶

Example: Finding the number of penguins per `'island'` and `'species'`¶