from lec_utils import *
def show_grouping_animation():
    """Display the embedded Google Slides deck used for the grouping animation.

    The presentation is rendered inline in a 960x509 frame.
    """
    slides_url = "https://docs.google.com/presentation/d/1tBaFyHseIGsX5wmE3BdNLeVHnKksQtpzLhHge8Tzly0/embed?start=false&loop=false&delayms=60000&rm=minimal"
    frame = IFrame(slides_url, width=960, height=509)
    display(frame)

YouTubeVideo('TLzUE-FyVcM')

# Embed a YouTube video through the youtube-nocookie.com domain in a 560x315 frame.
# NOTE(review): the query string uses the HTML-escaped separator `&amp;` before
# `start=11`; this presumably relies on the URL being inserted verbatim into an
# HTML attribute — confirm the video actually starts at 0:11 when rendered.
IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&amp;start=11',
       width=560, height=315)

penguins = pd.read_csv('data/penguins.csv')
penguins

penguins.plot(kind='scatter', 
              x='bill_length_mm', 
              y='body_mass_g', 
              color='species', 
              title='Body Mass vs. Bill Length')

penguins['body_mass_g'].mean()

4207.057057057057

penguins['species'].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

penguins.loc[penguins['species'] == 'Adelie', 'body_mass_g'].mean()

3706.1643835616437

penguins.loc[penguins['species'] == 'Chinstrap', 'body_mass_g'].mean()

3733.0882352941176

penguins.loc[penguins['species'] == 'Gentoo', 'body_mass_g'].mean()

5092.436974789916

# To find the overall mean 'body_mass_g':
penguins['body_mass_g'].mean()

4207.057057057057

# To find the mean 'body_mass_g' for each 'species':
penguins.groupby('species')['body_mass_g'].mean()

species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

show_grouping_animation()

penguins

'Chinstrap'

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1529bd8d0>

{'Adelie': [0, 1, 2, 4, 6, 11, 12, 15, 19, 20, 21, 23, 25, 26, 33, 34, 36, 37, 38, 42, 49, 51, 52, 54, 59, 60, 61, 63, 64, 69, 70, 71, 77, 78, 79, 81, 85, 87, 90, 91, 94, 95, 96, 102, 103, 106, 107, 111, 112, 115, 117, 118, 120, 121, 123, 126, 127, 131, 133, 136, 139, 141, 143, 147, 150, 153, 158, 160, 164, 165, 167, 168, 172, 173, 174, 177, 178, 184, 188, 193, 198, 201, 203, 207, 208, 210, 214, 216, 217, 218, 219, 220, 223, 227, 228, 229, 230, 232, 234, 237, ...], 'Chinstrap': [3, 5, 7, 8, 13, 18, 22, 24, 28, 30, 31, 39, 48, 50, 55, 58, 74, 84, 92, 93, 97, 99, 101, 105, 110, 125, 128, 130, 135, 140, 142, 146, 152, 162, 163, 176, 180, 183, 187, 190, 192, 195, 197, 206, 209, 211, 224, 235, 240, 241, 242, 246, 251, 260, 261, 265, 266, 271, 282, 302, 308, 311, 312, 317, 327, 328, 329, 330], 'Gentoo': [9, 10, 14, 16, 17, 27, 29, 32, 35, 40, 41, 43, 44, 45, 46, 47, 53, 56, 57, 62, 65, 66, 67, 68, 72, 73, 75, 76, 80, 82, 83, 86, 88, 89, 98, 100, 104, 108, 109, 113, 114, 116, 119, 122, 124, 129, 132, 134, 137, 138, 144, 145, 148, 149, 151, 154, 155, 156, 157, 159, 161, 166, 169, 170, 171, 175, 179, 181, 182, 185, 186, 189, 191, 194, 196, 199, 200, 202, 204, 205, 212, 213, 215, 221, 222, 225, 226, 231, 233, 236, 245, 247, 248, 252, 253, 257, 264, 267, 268, 275, ...]}

<pandas.core.groupby.generic.SeriesGroupBy object at 0x152a39330>

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x152a3a650>

species
Adelie       146
Chinstrap     68
Gentoo       119
dtype: int64

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
Name: is_Dream, dtype: float64

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

penguins

# Compute the median 'bill_length_mm' of each species, then use idxmax to get
# the index label (the species name) at which that median is largest.
(
    penguins
    .groupby('species')
    ['bill_length_mm']
    .median()
    .idxmax()
)

'Chinstrap'

(
    penguins
    .groupby('species')
    ['bill_length_mm']
    .median()
    .plot(kind='barh', title='Median Bill Length of Each Species')
)

penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

penguins.groupby('species')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1529bd8d0>

# Creates one group for each unique value in the species column.
penguins.groupby('species').groups

{'Adelie': [0, 1, 2, 4, 6, 11, 12, 15, 19, 20, 21, 23, 25, 26, 33, 34, 36, 37, 38, 42, 49, 51, 52, 54, 59, 60, 61, 63, 64, 69, 70, 71, 77, 78, 79, 81, 85, 87, 90, 91, 94, 95, 96, 102, 103, 106, 107, 111, 112, 115, 117, 118, 120, 121, 123, 126, 127, 131, 133, 136, 139, 141, 143, 147, 150, 153, 158, 160, 164, 165, 167, 168, 172, 173, 174, 177, 178, 184, 188, 193, 198, 201, 203, 207, 208, 210, 214, 216, 217, 218, 219, 220, 223, 227, 228, 229, 230, 232, 234, 237, ...], 'Chinstrap': [3, 5, 7, 8, 13, 18, 22, 24, 28, 30, 31, 39, 48, 50, 55, 58, 74, 84, 92, 93, 97, 99, 101, 105, 110, 125, 128, 130, 135, 140, 142, 146, 152, 162, 163, 176, 180, 183, 187, 190, 192, 195, 197, 206, 209, 211, 224, 235, 240, 241, 242, 246, 251, 260, 261, 265, 266, 271, 282, 302, 308, 311, 312, 317, 327, 328, 329, 330], 'Gentoo': [9, 10, 14, 16, 17, 27, 29, 32, 35, 40, 41, 43, 44, 45, 46, 47, 53, 56, 57, 62, 65, 66, 67, 68, 72, 73, 75, 76, 80, 82, 83, 86, 88, 89, 98, 100, 104, 108, 109, 113, 114, 116, 119, 122, 124, 129, 132, 134, 137, 138, 144, 145, 148, 149, 151, 154, 155, 156, 157, 159, 161, 166, 169, 170, 171, 175, 179, 181, 182, 185, 186, 189, 191, 194, 196, 199, 200, 202, 204, 205, 212, 213, 215, 221, 222, 225, 226, 231, 233, 236, 245, 247, 248, 252, 253, 257, 264, 267, 268, 275, ...]}

penguins.groupby('species')['bill_length_mm']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x152a39330>

penguins.groupby('species')[['bill_length_mm', 'bill_depth_mm']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x152a3a650>

# Note that this worked on the entire DataFrame!
# But, if all we wanted are the sums of 'body_mass_g'
# for each species, this is slower than
# penguins.groupby('species')['body_mass_g'].sum().
penguins.groupby('species').sum()

# Often used in conjunction with sort_values.
# Remember this when you work on the activity in a few slides!
penguins.groupby('species').last()

# Similar to value_counts, but not identical!
penguins.groupby('species').size()

species
Adelie       146
Chinstrap     68
Gentoo       119
dtype: int64

penguins['species'].value_counts()

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

penguins.groupby('species').max()

# This penguin lived on Biscoe island!
penguins.loc[(penguins['species'] == 'Adelie') & (penguins['body_mass_g'] == 4775.0)]

# General idea: Sort the penguins by mass in decreasing order.
# Then, the first male penguin that appears is the heaviest male penguin,
# and the first female penguin that appears is the heaviest female penguin.
# For each sex, take the first row.
# Order the rows from heaviest to lightest, then keep the leading entry for
# each sex. Note that GroupBy.first returns the first non-null entry of each
# column within a group, which matches "the first row" only when that row has
# no missing values.
penguins.sort_values(by='body_mass_g', ascending=False).groupby('sex').first()

# Averaging a Boolean column gives the proportion of True values, so this is
# the fraction of each species' penguins whose 'island' is 'Dream'.
(
    penguins
    .assign(is_Dream=lambda df: df['island'].eq('Dream'))
    .groupby('species')['is_Dream']
    .mean()
)

species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
Name: is_Dream, dtype: float64

# Passing a list of aggregation names to agg applies each of them to
# 'body_mass_g', giving one output column per aggregation ('count' and 'mean').
(
    penguins
    .groupby('species')
    ['body_mass_g']
    .agg(['count', 'mean'])
)

# Use a different aggregation for each column: the largest 'bill_length_mm'
# and the distinct 'island' values observed within each species.
penguins.groupby('species').agg({
    'bill_length_mm': 'max',
    'island': 'unique',
})

# Here, the argument to agg is a function,
# which takes in a Series and returns a scalar.
def second_largest(s):
    """Return the second-largest non-missing value in the Series `s`.

    Missing values are dropped before sorting. `sort_values` places NaN last
    by default, so without this step a Series containing a NaN would yield its
    largest value (or NaN) instead of the second-largest one.

    Duplicates are not collapsed: if the maximum occurs twice, it is returned.

    Raises:
        IndexError: if `s` has fewer than two non-missing values.
    """
    return s.dropna().sort_values().iloc[-2]
(
    penguins
    .groupby('species')
    ['body_mass_g']
    .agg(second_largest)
)

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

# This is a query, NOT a filter for the purposes of this slide.
penguins[penguins['bill_length_mm'] > 47]

(
    penguins
    .groupby('species')
    .filter(lambda df: df['bill_length_mm'].mean() > 47)
)

# Since 'Adelie's have a mean 'bill_length_mm' below 47, they aren't included in the output above.
penguins.groupby('species')['bill_length_mm'].mean()

species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

# Keep only the rows of species that have at least 100 penguins, at least 60
# of which are female, then pull out the first species name that remains.
(
    penguins
    .groupby('species')
    .filter(lambda grp: len(grp) >= 100 and grp['sex'].eq('Female').sum() >= 60)
    ['species']
    .unique()
    [0]
)

'Adelie'

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

species
Adelie       73
Gentoo       58
Chinstrap    34
Name: count, dtype: int64

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

0      3550.0
1      3325.0
2      3400.0
        ...  
330    3800.0
331    4250.0
332    5050.0
Name: body_mass_g, Length: 333, dtype: float64

species
Adelie       197.0
Chinstrap    210.0
Gentoo       221.0
dtype: float64

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64

island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64

# Find the species that has at least 100 penguins, at least 60 of which are
# female: filter keeps the rows of every group for which the lambda is true,
# and [0] takes the first species name that remains.
(
    penguins
    .groupby('species')
    .filter(lambda df: (df.shape[0] >= 100) and ((df['sex'] == 'Female').sum() >= 60))
    ['species']
    .unique()
    [0]
)

'Adelie'

# Note that to just find the 'species' with at least 100 penguins,
# we didn't need to group:
penguins['species'].value_counts()

species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

penguins.loc[penguins['sex'] == 'Female', 'species'].value_counts()

species
Adelie       73
Gentoo       58
Chinstrap    34
Name: count, dtype: int64

penguins.groupby('species')['body_mass_g'].transform(lambda s: s - s.mean())

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

penguins['body_mass_g']

0      3550.0
1      3325.0
2      3400.0
        ...  
330    3800.0
331    4250.0
332    5050.0
Name: body_mass_g, Length: 333, dtype: float64

penguins.groupby('species').apply(lambda df: df.sort_values('body_mass_g', ascending=False).head(2))

# For each species, sort that species' rows from heaviest to lightest and
# return the 'flipper_length_mm' of the first (heaviest) row, so the lambda
# produces one scalar per group.
(
    penguins
    .groupby('species')
    .apply(
        lambda df: df.sort_values('body_mass_g', ascending=False)['flipper_length_mm'].iloc[0]
    )
)

species
Adelie       197.0
Chinstrap    210.0
Gentoo       221.0
dtype: float64

penguins.groupby('species')['body_mass_g'].agg(lambda s: s.sort_values().iloc[-2])

species
Adelie       4725.0
Chinstrap    4550.0
Gentoo       6050.0
Name: body_mass_g, dtype: float64

(
    penguins
    .groupby('species')
    .filter(lambda df: (df.shape[0] >= 100) and ((df['sex'] == 'Female').sum() >= 60))
)

penguins.groupby('species')['body_mass_g'].transform(lambda s: s - s.mean())

0     -156.16
1     -381.16
2     -306.16
        ...  
330     66.91
331    543.84
332    -42.44
Name: body_mass_g, Length: 333, dtype: float64

penguins.groupby('species').apply(lambda df: df.sort_values('body_mass_g', ascending=False).head(2))

# Read this as:
species_and_island = (
    penguins.groupby(['species', 'island'])         # for every combination of 'species' and 'island' in the DataFrame,
    [['bill_length_mm', 'bill_depth_mm']].mean()    # calculate the mean 'bill_length_mm' and the mean 'bill_depth_mm'.
)
species_and_island

# Now, this looks like a regular DataFrame!
species_and_island.reset_index()

# General form of a pivot_table call. The names df, index_col, columns_col,
# values_col, and func are placeholders; none of them is defined in the
# visible code, so this is a template rather than a cell to run as-is.
df.pivot_table(index=index_col,
                       columns=columns_col,
                       values=values_col,
                       aggfunc=func)

penguins.pivot_table(
    index='species',
    columns='sex',
    values='body_mass_g',
    aggfunc='mean'
)

# Same information as above, but harder to read!
(
    penguins
    .groupby(['species', 'sex'])
    [['body_mass_g']]
    .mean()
)

penguins

penguins.value_counts(['island', 'species'])

island     species  
Biscoe     Gentoo       119
Dream      Chinstrap     68
           Adelie        55
Torgersen  Adelie        47
Biscoe     Adelie        44
Name: count, dtype: int64

penguins.groupby(['island', 'species']).size()

island     species  
Biscoe     Adelie        44
           Gentoo       119
Dream      Adelie        55
           Chinstrap     68
Torgersen  Adelie        47
dtype: int64

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', # Choice of column here doesn't actually matter! Why?
    aggfunc='count',
)

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)

penguins

penguins.pivot_table(
    index='species', 
    columns='island', 
    values='bill_length_mm', 
    aggfunc='count',
    fill_value=0,
)

	Species	Color	Weight	Age
0	dog	black	40	5.0
1	cat	golden	15	8.0
2	cat	black	20	9.0
3	dog	white	80	2.0
4	dog	golden	25	0.5
5	hamster	golden	1	3.0

	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
species
Adelie	DreamTorgersenDreamBiscoeTorgersenBiscoeTorger...	5668.3	2678.7	27755.0	541100.0	MaleFemaleFemaleMaleFemaleMaleFemaleMaleFemale...
Chinstrap	DreamDreamDreamDreamDreamDreamDreamDreamDreamD...	3320.7	1252.6	13316.0	253850.0	FemaleFemaleMaleMaleFemaleFemaleFemaleFemaleFe...
Gentoo	BiscoeBiscoeBiscoeBiscoeBiscoeBiscoeBiscoeBisc...	5660.6	1784.6	25851.0	606000.0	FemaleFemaleFemaleFemaleMaleFemaleFemaleMaleMa...

sex	Female	Male
species
Adelie	3368.84	4043.49
Chinstrap	3527.21	3938.97
Gentoo	4679.74	5484.84

sex	Female	Male
species
Adelie	3368.84	4043.49
Chinstrap	3527.21	3938.97
Gentoo	4679.74	5484.84

island	Biscoe	Dream	Torgersen
species
Adelie	44.0	55.0	47.0
Chinstrap	NaN	68.0	NaN
Gentoo	119.0	NaN	NaN

Lecture 5¶

Aggregation: Grouping and Pivoting¶

EECS 398: Practical Data Science, Spring 2025¶

Lecture Video¶

Agenda 📆¶

Introduction to the `groupby` method¶

Example: Palmer Penguins¶

Loading the data¶

Visualizing the data¶

Aggregating¶

A naïve approach to finding the mean `'body_mass_g'` per `'species'`¶

The magic of `groupby` 🪄¶

An illustrative example: Pets 🐱 🐶🐹¶

"Split-apply-combine" paradigm¶

Activity

`groupby`'s inner workings¶

How does `groupby` actually work?¶

`DataFrameGroupBy` objects¶

Column extraction¶

Aggregation¶

Reminder: Column independence¶

Activity

Activity

Advanced `groupby` usage¶

Beyond default aggregation methods¶

Grouping method 1: `agg`¶

`agg` with different aggregation methods for different columns¶

`agg` with a custom aggregation method¶

Grouping method 2: `filter`¶

Activity

Grouping method 3: `transform`¶

Grouping method 4: `apply`¶

⭐️ The grouping method cheat sheet: `agg`, `filter`, `transform`, and `apply` ⭐️¶

Grouping with multiple columns¶

Pivot tables using `pivot_table`¶

Pivot tables: An extension of grouping¶

`pivot_table`¶

Example: Finding the number of penguins per `'island'` and `'species'`¶

Granularity¶

Reshaping¶

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Dream	41.3	20.3	194.0	3550.0	Male
1	Adelie	Torgersen	38.5	17.9	190.0	3325.0	Female
2	Adelie	Dream	34.0	17.1	185.0	3400.0	Female
...	...	...	...	...	...	...	...
330	Chinstrap	Dream	46.6	17.8	193.0	3800.0	Female
331	Adelie	Dream	39.7	17.9	193.0	4250.0	Male
332	Gentoo	Biscoe	45.1	14.5	207.0	5050.0	Female

	bill_length_mm	island
species
Adelie	46.0	[Dream, Torgersen, Biscoe]
Chinstrap	58.0	[Dream]
Gentoo	59.6	[Biscoe]

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
5	Chinstrap	Dream	50.1	17.9	190.0	3400.0	Female
7	Chinstrap	Dream	51.4	19.0	201.0	3950.0	Male
8	Chinstrap	Dream	51.3	19.2	193.0	3650.0	Male
...	...	...	...	...	...	...	...
327	Chinstrap	Dream	49.3	19.9	203.0	4050.0	Male
328	Chinstrap	Dream	50.7	19.7	203.0	4050.0	Male
329	Chinstrap	Dream	51.3	19.9	198.0	3700.0	Male

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
3	Chinstrap	Dream	45.5	17.0	196.0	3500.0	Female
5	Chinstrap	Dream	50.1	17.9	190.0	3400.0	Female
7	Chinstrap	Dream	51.4	19.0	201.0	3950.0	Male
...	...	...	...	...	...	...	...
329	Chinstrap	Dream	51.3	19.9	198.0	3700.0	Male
330	Chinstrap	Dream	46.6	17.8	193.0	3800.0	Female
332	Gentoo	Biscoe	45.1	14.5	207.0	5050.0	Female

	species	island	bill_length_mm	bill_depth_mm
0	Adelie	Biscoe	38.98	18.37
1	Adelie	Dream	38.52	18.24
2	Adelie	Torgersen	39.04	18.45
3	Chinstrap	Dream	48.83	18.42
4	Gentoo	Biscoe	47.57	15.00

Lecture 5¶

Aggregation: Grouping and Pivoting¶

EECS 398: Practical Data Science, Spring 2025¶

Lecture Video¶

Agenda 📆¶

Introduction to the groupby method¶

Example: Palmer Penguins¶

Loading the data¶

Visualizing the data¶

Aggregating¶

A naïve approach to finding the mean 'body_mass_g' per 'species'¶

The magic of groupby 🪄¶

An illustrative example: Pets 🐱 🐶🐹¶

"Split-apply-combine" paradigm¶

Activity

groupby's inner workings¶

How does groupby actually work?¶

DataFrameGroupBy objects¶

Column extraction¶

Aggregation¶

Reminder: Column independence¶

Activity

Activity

Advanced groupby usage¶

Beyond default aggregation methods¶

Grouping method 1: agg¶

agg with different aggregation methods for different columns¶

agg with a custom aggregation method¶

Grouping method 2: filter¶

Activity

Grouping method 3: transform¶

Grouping method 4: apply¶

⭐️ The grouping method cheat sheet: agg, filter, transform, and apply ⭐️¶

Grouping with multiple columns¶

Pivot tables using pivot_table¶

Pivot tables: An extension of grouping¶

pivot_table¶

Example: Finding the number of penguins per 'island' and 'species'¶

Granularity¶

Reshaping¶

Introduction to the `groupby` method¶

A naïve approach to finding the mean `'body_mass_g'` per `'species'`¶

The magic of `groupby` 🪄¶

`groupby`'s inner workings¶

How does `groupby` actually work?¶

`DataFrameGroupBy` objects¶

Advanced `groupby` usage¶

Grouping method 1: `agg`¶

`agg` with different aggregation methods for different columns¶

`agg` with a custom aggregation method¶

Grouping method 2: `filter`¶

Grouping method 3: `transform`¶

Grouping method 4: `apply`¶

⭐️ The grouping method cheat sheet: `agg`, `filter`, `transform`, and `apply` ⭐️¶

Pivot tables using `pivot_table`¶

`pivot_table`¶

Example: Finding the number of penguins per `'island'` and `'species'`¶