In [1]:
from lec_utils import *
def show_grouping_animation():
    src = "https://docs.google.com/presentation/d/1tBaFyHseIGsX5wmE3BdNLeVHnKksQtpzLhHge8Tzly0/embed?start=false&loop=false&delayms=60000&rm=minimal"
    width = 960
    height = 509
    display(IFrame(src, width, height))

Lecture 5¶

Querying and Grouping¶

EECS 398-003: Practical Data Science, Fall 2024¶

practicaldsc.org • github.com/practicaldsc/fa24

Announcements 📣¶

  • Homework 2 is due on Thursday.


Post on Ed or come to Office Hours for help! We're using a queue for office hours now – access it from practicaldsc.org/calendar.

  • study.practicaldsc.org contains our discussion worksheets (and solutions), which are made up of old exam problems. Use these problems to build your theoretical understanding of the material!

  • Homework 1 scores are available on Gradescope.

Agenda¶

  • Recap: Querying.
  • Adding and modifying columns.
  • pandas and numpy.
  • Introduction to the groupby method.
  • groupby's inner workings.

Remember to follow along in lecture by accessing the "blank" lecture notebook in our public GitHub repository.

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Querying¶

How do we find rows that satisfy certain conditions?


Run the cell below to load in our dataset.

In [2]:
dogs = pd.read_csv('data/dogs43.csv').set_index('breed')
dogs.head()
Out[2]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
Cocker Spaniel sporting 24330.0 12.50 small 25.0 14.5
Shetland Sheepdog herding 21006.0 12.53 small 22.0 14.5

Recap: Querying¶

  • Querying is the act of selecting rows in a DataFrame that satisfy certain condition(s).
    We sometimes call this "filtering."
  • Example: How many breeds live to be over 10 years old?
In [3]:
dogs[dogs['longevity'] > 10].shape[0] 
Out[3]:
32
  • Example: Among all breeds with 'Retriever' in the name, which is the second tallest?
    Remember, we like to format our code this way when writing long, hard-to-read lines.
In [4]:
(
    dogs.loc[dogs.index.str.contains('Retriever'), 'height']
    .sort_values(ascending=False)
    .iloc[1]
)
Out[4]:
23.0
  • What is the distribution of 'size's among 'sporting' and 'working' breeds?
    Remember, you need parentheses around each condition. Also, you must use the bitwise operators & and | instead of the standard and and or keywords, as we saw in Lecture 3.
In [5]:
(
    dogs.loc[(dogs['kind'] == 'sporting') | (dogs['kind'] == 'working'), 'size']
    .value_counts()
)
Out[5]:
size
large     11
medium     7
small      1
Name: count, dtype: int64
In [6]:
# Equivalent to the above!
(
    dogs.loc[dogs['kind'].isin(['sporting', 'working']), 'size']
    .value_counts()
)
Out[6]:
size
large     11
medium     7
small      1
Name: count, dtype: int64
  • Show me all rows for 'medium'-sized dogs.
In [7]:
dogs[dogs['size'] == 'medium'] 
Out[7]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.00
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.00
Siberian Husky working 22049.0 12.58 medium 47.5 21.75
... ... ... ... ... ... ...
Clumber Spaniel sporting 18084.0 10.00 medium 70.0 18.50
Kerry Blue Terrier terrier 17240.0 9.40 medium 36.5 18.50
Bull Terrier terrier 18490.0 10.21 medium 60.0 21.50

12 rows × 6 columns

  • Show me all rows for 'Golden Retriever's.
    Note that because we set the index to 'breed' earlier, we can select rows based on dog breeds without having to query. If 'breed' was instead a column, then we'd need to query to access information about a particular breed.
In [8]:
dogs.loc['Golden Retriever'] 
Out[8]:
kind             sporting
lifetime_cost     21447.0
longevity           12.04
size               medium
weight               60.0
height              22.75
Name: Golden Retriever, dtype: object

Aside: Reference Slides

  • Moving forward, I'm going to try to spend a bit less time on syntax and a bit more time on conceptual problem-solving.
  • So, in each lecture, some slides will be called "Reference Slides".
  • We will skip these slides during live lecture, but they'll be present in the posted lecture notebooks, so that you can look at them when working on activities in class and on discussion and homework problems.
    The material in them is in-scope.
  • Reference slides will appear with red headers, like this one.
    There's a Reference Slide right after this slide, about the DataFrame query method!

Reference Slide¶

The query method¶

The DataFrame query method is a convenient way to query, since you don't need parentheses and you can use the and and or keywords.

In [9]:
dogs
Out[9]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5
Mastiff working 13581.0 6.50 large 175.0 30.0
Saint Bernard working 20022.0 7.78 large 155.0 26.5

43 rows × 6 columns

In [10]:
dogs.query('weight < 20 and kind == "terrier"')
Out[10]:
kind lifetime_cost longevity size weight height
breed
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
Miniature Schnauzer terrier 20087.0 11.81 small 15.5 13.0
Norfolk Terrier terrier 24308.0 13.07 small 12.0 9.5
In [11]:
dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')
Out[11]:
kind lifetime_cost longevity size weight height
breed
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
Chesapeake Bay Retriever sporting 16697.0 9.48 large 67.5 23.5
Gordon Setter sporting 19605.0 11.10 large 62.5 25.0
Clumber Spaniel sporting 18084.0 10.00 medium 70.0 18.5
Scottish Terrier terrier 17525.0 10.69 small 20.0 10.0
Kerry Blue Terrier terrier 17240.0 9.40 medium 36.5 18.5
Bull Terrier terrier 18490.0 10.21 medium 60.0 21.5

Reference Slide¶

More practice¶

In [12]:
jack = pd.DataFrame({1: ['fee', 'fi'], 
                     '1': ['fo', 'fum']})
jack
Out[12]:
1 1
0 fee fo
1 fi fum

For each of the following pieces of code, predict what the output will be. Then, uncomment the line of code and see for yourself. We won't cover these in class, but you should try them out yourself. Here's a Pandas Tutor link to visualize these!

In [13]:
# jack[1]
In [14]:
# jack[[1]]
In [15]:
# jack['1']
In [16]:
# jack[[1, 1]]
In [17]:
# jack.loc[1]
In [18]:
# jack.loc[jack[1] == 'fo']
In [19]:
# jack[1, ['1', 1]]
In [20]:
# jack.loc[1,1]

Adding and modifying columns¶


Adding and modifying columns, using a copy¶

  • To add a new column to a DataFrame, use the assign method.
    To change the values in a column, add a new column with the same name as the existing column.
  • Like most pandas methods, assign returns a new DataFrame.
    • Pro ✅: This doesn't inadvertently change any existing variables.
    • Con ❌: It is not very space efficient, as it creates a new copy each time it is called.
In [21]:
dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity']) 
Out[21]:
kind lifetime_cost longevity size weight height cost_per_year
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0 1748.37
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0 1589.02
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0 1628.90
... ... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5 1840.95
Mastiff working 13581.0 6.50 large 175.0 30.0 2089.38
Saint Bernard working 20022.0 7.78 large 155.0 26.5 2573.52

43 rows × 7 columns

In [22]:
dogs
Out[22]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5
Mastiff working 13581.0 6.50 large 175.0 30.0
Saint Bernard working 20022.0 7.78 large 155.0 26.5

43 rows × 6 columns
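The "same name" trick mentioned above can be seen on a toy DataFrame (hypothetical data, not our dogs dataset): passing an existing column name to assign replaces that column in the returned copy, while the original DataFrame stays unchanged.

```python
import pandas as pd

# A minimal sketch: assigning to an existing column name replaces
# that column in the *returned* DataFrame only.
df = pd.DataFrame({'cost': [100.0, 200.0]})
modified = df.assign(cost=df['cost'] / 100)

print(modified['cost'].tolist())  # [1.0, 2.0]
print(df['cost'].tolist())        # [100.0, 200.0] – original unchanged
```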

Reference Slide¶

assign for column names with special characters¶

You can also use assign when the desired column name has spaces (and other special characters) by unpacking a dictionary:

In [23]:
dogs.assign(**{'cost per year 💵': dogs['lifetime_cost'] / dogs['longevity']})
Out[23]:
kind lifetime_cost longevity size weight height cost per year 💵
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0 1748.37
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0 1589.02
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0 1628.90
... ... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5 1840.95
Mastiff working 13581.0 6.50 large 175.0 30.0 2089.38
Saint Bernard working 20022.0 7.78 large 155.0 26.5 2573.52

43 rows × 7 columns

Adding and modifying columns, in-place¶

  • You can assign a new column to a DataFrame in-place using [].
    This works like dictionary assignment. Using [] modifies the underlying DataFrame, unlike assign, which returns a new DataFrame.
  • This is the more "common" way of adding/modifying columns.
    Exercise caution when using this approach, since it is destructive – it changes the values of existing variables.
In [24]:
# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy, the original remains unmodified.
dogs_copy = dogs.copy() 
dogs_copy.head(2)
Out[24]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
In [25]:
dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy
Out[25]:
kind lifetime_cost longevity size weight height cost_per_year
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0 1748.37
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0 1589.02
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0 1628.90
... ... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5 1840.95
Mastiff working 13581.0 6.50 large 175.0 30.0 2089.38
Saint Bernard working 20022.0 7.78 large 155.0 26.5 2573.52

43 rows × 7 columns

  • Note that we never reassigned dogs_copy in the cell above – that is, we never wrote dogs_copy = ... – though it was still modified.

Mutability¶

  • DataFrames, like lists, arrays, and dictionaries, are mutable. As we saw in Lecture 2, this means they can be modified in-place after creation.
  • Not only does this explain the behavior on the previous slide, but it also explains the following:
In [26]:
dogs_copy
Out[26]:
kind lifetime_cost longevity size weight height cost_per_year
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0 1748.37
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0 1589.02
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0 1628.90
... ... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5 1840.95
Mastiff working 13581.0 6.50 large 175.0 30.0 2089.38
Saint Bernard working 20022.0 7.78 large 155.0 26.5 2573.52

43 rows × 7 columns

In [27]:
def cost_in_thousands():
    dogs_copy['lifetime_cost'] = dogs_copy['lifetime_cost'] / 1000
In [28]:
# What happens when we run this twice? Three times?
cost_in_thousands()
In [29]:
dogs_copy
Out[29]:
kind lifetime_cost longevity size weight height cost_per_year
breed
Brittany sporting 22.59 12.92 medium 35.0 19.0 1748.37
Cairn Terrier terrier 21.99 13.84 small 14.0 10.0 1589.02
English Cocker Spaniel sporting 18.99 11.66 medium 30.0 16.0 1628.90
... ... ... ... ... ... ... ...
Bullmastiff working 13.94 7.57 large 115.0 25.5 1840.95
Mastiff working 13.58 6.50 large 175.0 30.0 2089.38
Saint Bernard working 20.02 7.78 large 155.0 26.5 2573.52

43 rows × 7 columns

⚠️ Warning: Avoid mutation when possible!¶

  • Note that dogs_copy was modified, even though we didn't reassign it! These unintended consequences can influence the behavior of test cases on homeworks, among other things!
  • To avoid this, it's a good idea to avoid mutation when possible. If you must use mutation, include df = df.copy() as the first line in functions that take DataFrames as input.
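The df = df.copy() pattern described above might look like this in practice (a sketch; the function name and toy data are hypothetical, not from our dogs dataset):

```python
import pandas as pd

def cost_in_thousands_safe(df):
    # Work on a copy, so the caller's DataFrame is never mutated.
    df = df.copy()
    df['lifetime_cost'] = df['lifetime_cost'] / 1000
    return df

costs = pd.DataFrame({'lifetime_cost': [22589.0, 21992.0]})
result = cost_in_thousands_safe(costs)

print(result['lifetime_cost'].tolist())  # [22.589, 21.992]
print(costs['lifetime_cost'].tolist())   # original is unchanged
```

Unlike the cost_in_thousands function we'll see shortly, running this any number of times leaves the input untouched.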

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

pandas and numpy¶



pandas is built upon numpy!¶

  • A Series in pandas is a numpy array with an index.
  • A DataFrame is like a dictionary of columns, each of which is a numpy array.
  • Many operations in pandas are fast because they use numpy's implementations, which are written in fast, compiled languages like C.
  • If you need to access the array underlying a DataFrame or Series, use the to_numpy method.
In [30]:
dogs['lifetime_cost']
Out[30]:
breed
Brittany                  22589.0
Cairn Terrier             21992.0
English Cocker Spaniel    18993.0
                           ...   
Bullmastiff               13936.0
Mastiff                   13581.0
Saint Bernard             20022.0
Name: lifetime_cost, Length: 43, dtype: float64
In [31]:
dogs['lifetime_cost'].to_numpy() 
Out[31]:
array([22589., 21992., 18993., ..., 13936., 13581., 20022.])

Reference Slide¶

pandas data types¶

  • Each Series (column) has a numpy data type, which refers to the type of the values stored within. Access it using the dtypes attribute.
  • A column's data type determines which operations can be applied to it.
  • pandas tries to guess the correct data types for a given DataFrame, and is often wrong.
    • This can lead to incorrect calculations and poor memory/time performance.
  • As a result, you will often need to explicitly convert between data types.
In [32]:
dogs
Out[32]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5
Mastiff working 13581.0 6.50 large 175.0 30.0
Saint Bernard working 20022.0 7.78 large 155.0 26.5

43 rows × 6 columns

In [33]:
dogs.dtypes
Out[33]:
kind              object
lifetime_cost    float64
longevity        float64
size              object
weight           float64
height           float64
dtype: object

Reference Slide¶

pandas data types¶

  • Notice that Python str types are object types in numpy and pandas.
Pandas dtype | Python type | NumPy type | SQL type | Usage
int64 | int | int_, int8, ..., int64, uint8, ..., uint64 | INT, BIGINT | Integer numbers
float64 | float | float_, float16, float32, float64 | FLOAT | Floating point numbers
bool | bool | bool_ | BOOL | True/False values
datetime64 or Timestamp | datetime.datetime | datetime64 | DATETIME | Date and time values
timedelta64 or Timedelta | datetime.timedelta | timedelta64 | NA | Differences between two datetimes
category | NA | NA | ENUM | Finite list of text values
object | str | string, unicode | NA | Text
object | NA | object | NA | Mixed types
  • This article details how pandas stores different data types under the hood.
  • This article explains how numpy/pandas int64 operations differ from vanilla int operations.

Reference Slide¶

Type conversion¶

  • You can change the data type of a Series using the .astype Series method.
  • For example, we can change the data type of the 'lifetime_cost' column in dogs to be uint32:
In [34]:
dogs
Out[34]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589.0 12.92 medium 35.0 19.0
Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
... ... ... ... ... ... ...
Bullmastiff working 13936.0 7.57 large 115.0 25.5
Mastiff working 13581.0 6.50 large 175.0 30.0
Saint Bernard working 20022.0 7.78 large 155.0 26.5

43 rows × 6 columns

In [35]:
# Gives the types as well as the space taken up by the DataFrame.
dogs.info()
<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, Brittany to Saint Bernard
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   kind           43 non-null     object 
 1   lifetime_cost  43 non-null     float64
 2   longevity      43 non-null     float64
 3   size           43 non-null     object 
 4   weight         43 non-null     float64
 5   height         43 non-null     float64
dtypes: float64(4), object(2)
memory usage: 3.4+ KB
In [36]:
dogs['lifetime_cost'] = dogs['lifetime_cost'].astype('uint32')
  • Now, the DataFrame takes up less space! This may be insignificant in our DataFrame, but makes a difference when working with larger datasets.
In [37]:
dogs.info()
<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, Brittany to Saint Bernard
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   kind           43 non-null     object 
 1   lifetime_cost  43 non-null     uint32 
 2   longevity      43 non-null     float64
 3   size           43 non-null     object 
 4   weight         43 non-null     float64
 5   height         43 non-null     float64
dtypes: float64(3), object(2), uint32(1)
memory usage: 3.2+ KB

Reference Slide¶

Setting dtypes in read_csv¶

Usually, we prefer to set the correct dtypes in read_csv, since it can help pandas load in files more quickly:

In [38]:
dogs_new = pd.read_csv('data/dogs43.csv', dtype={'lifetime_cost': 'uint32'})
dogs_new
Out[38]:
breed kind lifetime_cost longevity size weight height
0 Brittany sporting 22589 12.92 medium 35.0 19.0
1 Cairn Terrier terrier 21992 13.84 small 14.0 10.0
2 English Cocker Spaniel sporting 18993 11.66 medium 30.0 16.0
... ... ... ... ... ... ... ...
40 Bullmastiff working 13936 7.57 large 115.0 25.5
41 Mastiff working 13581 6.50 large 175.0 30.0
42 Saint Bernard working 20022 7.78 large 155.0 26.5

43 rows × 7 columns

In [39]:
dogs_new.dtypes
Out[39]:
breed             object
kind              object
lifetime_cost     uint32
longevity        float64
size              object
weight           float64
height           float64
dtype: object

Axes¶

  • The rows and columns of a DataFrame are both stored as Series.
  • The axis specifies the direction of a slice of a DataFrame.
  • Axis 0 refers to the rows and axis 1 refers to the columns.
    These are the same axes definitions that 2D numpy arrays have!
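The numpy analogy can be checked directly; a minimal sketch on a small 2D array:

```python
import numpy as np

arr = np.array([[1, 2, 3],
                [4, 5, 6]])

# axis=0 aggregates *down* the rows, producing one value per column.
print(arr.sum(axis=0))  # [5 7 9]

# axis=1 aggregates *across* the columns, producing one value per row.
print(arr.sum(axis=1))  # [6 15]
```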

DataFrame methods with axis¶

  • Many Series methods work on DataFrames.
  • In such cases, the DataFrame method usually applies the Series method to every row or column.
  • Many of these methods accept an axis argument; the default is usually axis=0.
In [40]:
dogs
Out[40]:
kind lifetime_cost longevity size weight height
breed
Brittany sporting 22589 12.92 medium 35.0 19.0
Cairn Terrier terrier 21992 13.84 small 14.0 10.0
English Cocker Spaniel sporting 18993 11.66 medium 30.0 16.0
... ... ... ... ... ... ...
Bullmastiff working 13936 7.57 large 115.0 25.5
Mastiff working 13581 6.50 large 175.0 30.0
Saint Bernard working 20022 7.78 large 155.0 26.5

43 rows × 6 columns

In [41]:
# Max element in each column.
dogs.max() 
Out[41]:
kind             working
lifetime_cost      26686
longevity           16.5
size               small
weight             175.0
height              30.0
dtype: object
In [42]:
# Max element in each row – a little nonsensical, since the values in each column are on different scales.
# Note that we had to select the numeric columns first.
dogs[['lifetime_cost', 'longevity', 'weight', 'height']].max(axis=1)
Out[42]:
breed
Brittany                  22589.0
Cairn Terrier             21992.0
English Cocker Spaniel    18993.0
                           ...   
Bullmastiff               13936.0
Mastiff                   13581.0
Saint Bernard             20022.0
Length: 43, dtype: float64
In [43]:
# The number of unique values in each column.
dogs.nunique() 
Out[43]:
kind              7
lifetime_cost    43
longevity        40
size              3
weight           37
height           30
dtype: int64
In [44]:
# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe() 
Out[44]:
lifetime_cost longevity weight height
count 43.00 43.00 43.00 43.00
mean 20532.84 11.34 49.35 18.34
std 3290.78 2.05 39.42 6.83
... ... ... ... ...
50% 21006.00 11.81 36.50 18.50
75% 22072.50 12.52 67.50 25.00
max 26686.00 16.50 175.00 30.00

8 rows × 4 columns

Activity

Pick a dog breed that you personally like or know the name of. Then:

  • Try to find a few other dog breeds that are similar in weight to yours in all_dogs.
  • Which similar breeds have the lowest and highest 'lifetime_cost'? 'intelligence_rank'?
  • Are there any similar breeds that you haven't heard of before?

For fun, look up these dog breeds on the AKC website to see what they look like!
In [45]:
all_dogs = pd.read_csv('data/all_dogs.csv')
all_dogs
Out[45]:
breed group datadog popularity_all ... megarank size weight height
0 Border Collie herding 3.64 45 ... 29.0 medium NaN 20.0
1 Border Terrier terrier 3.61 80 ... 1.0 small 13.5 NaN
2 Brittany sporting 3.54 30 ... 11.0 medium 35.0 19.0
... ... ... ... ... ... ... ... ... ...
169 Wire Fox Terrier terrier NaN 100 ... NaN small 17.5 15.0
170 Wirehaired Pointing Griffon sporting NaN 92 ... NaN medium NaN 22.0
171 Xoloitzcuintli non-sporting NaN 155 ... NaN medium NaN 16.5

172 rows × 18 columns

In [46]:
# There's no "right answer" here; you're supposed to explore!
fav_weight = all_dogs.loc[all_dogs['breed'] == 'English Cocker Spaniel', 'weight'].iloc[0]
similar_weight = all_dogs[(all_dogs['weight'] >= fav_weight - 5) & (all_dogs['weight'] <= fav_weight + 5)]
similar_weight
Out[46]:
breed group datadog popularity_all ... megarank size weight height
2 Brittany sporting 3.54 30 ... 11.0 medium 35.0 19.00
5 English Cocker Spaniel sporting 3.33 63 ... 5.0 medium 30.0 16.00
6 Cocker Spaniel sporting 3.30 27 ... 6.0 small 25.0 14.50
... ... ... ... ... ... ... ... ... ...
125 Irish Terrier terrier NaN 132 ... NaN medium 26.0 18.00
138 Norwegian Buhund herding NaN 165 ... NaN medium 33.0 17.25
159 Soft-Coated Wheaten Terrier terrier NaN 52 ... NaN medium 35.0 18.00

12 rows × 18 columns

In [47]:
similar_weight.sort_values('intelligence_rank')[['breed', 'lifetime_cost', 'intelligence_rank']] 
Out[47]:
breed lifetime_cost intelligence_rank
57 Pembroke Welsh Corgi 23978.0 11.0
5 English Cocker Spaniel 18993.0 18.0
2 Brittany 22589.0 19.0
... ... ... ...
73 French Bulldog 17266.0 58.0
117 Glen of Imaal Terrier NaN NaN
138 Norwegian Buhund NaN NaN

12 rows × 3 columns

Introduction to the groupby method¶


Example: Palmer Penguins¶

Artwork by @allison_horst

The dataset we'll work with for the rest of the lecture involves various measurements taken of three species of penguins in Antarctica.

In [48]:
IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&start=11',
       width=560, height=315)
Out[48]:

Loading the data¶

In [49]:
penguins = sns.load_dataset('penguins').dropna().reset_index(drop=True)
penguins
Out[49]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

  • Here, each row corresponds to a single penguin, and each column corresponds to a different attribute (or feature) we have for each penguin.
  • Data formatted in this way is sometimes called tidy data.

Visualizing the data¶

In [50]:
penguins.plot(kind='scatter', 
              x='bill_length_mm', 
              y='body_mass_g', 
              color='species', 
              title='Body Mass vs. Bill Length')

Granularity¶

  • Granularity refers to what each observation in a dataset represents.
    • Fine: small details.
    • Coarse: bigger picture.
  • If you can control how your dataset is created, you should opt for finer granularity, i.e. for more detail.
    • You can always remove details, but it's difficult to add detail that isn't already there.
    • But obtaining fine-grained data can take more time/money.
  • Today, we'll focus on how to remove details from fine-grained data, in order to help us understand bigger-picture trends in our data.

Aggregating¶

  • Aggregating is the act of combining many values into a single value.
  • What is the mean 'body_mass_g' for all penguins?
In [51]:
penguins['body_mass_g'].mean() 
Out[51]:
4207.057057057057
  • What is the mean 'body_mass_g' for each 'species'?

A naïve approach to finding the mean 'body_mass_g' per 'species'¶

  • First, we could identify all unique values in the 'species' column.
In [52]:
penguins['species'].unique() 
Out[52]:
array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)
  • Then, for each 'species', we could:
    1. Query for just that 'species'.
    2. Extract the 'body_mass_g' column and use the mean method on it.
In [53]:
penguins.loc[penguins['species'] == 'Adelie', 'body_mass_g'].mean() 
Out[53]:
3706.1643835616437
In [54]:
penguins.loc[penguins['species'] == 'Chinstrap', 'body_mass_g'].mean() 
Out[54]:
3733.0882352941176
In [55]:
penguins.loc[penguins['species'] == 'Gentoo', 'body_mass_g'].mean() 
Out[55]:
5092.436974789916
  • We could use a for-loop, but remember, we want to avoid Python for-loops.
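For completeness, the loop-based version of the naïve approach might look like the sketch below (on hypothetical toy data, so it runs on its own). We'll prefer groupby over this pattern.

```python
import pandas as pd

# Toy stand-in for the penguins DataFrame.
penguins = pd.DataFrame({
    'species': ['Adelie', 'Adelie', 'Gentoo'],
    'body_mass_g': [3750.0, 3800.0, 5400.0],
})

# One query + one mean per unique species – a Python-level loop.
means = {}
for species in penguins['species'].unique():
    means[species] = penguins.loc[
        penguins['species'] == species, 'body_mass_g'
    ].mean()

print(means)  # {'Adelie': 3775.0, 'Gentoo': 5400.0}
```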

Grouping¶

  • A better solution is to use the groupby method.
In [56]:
# To find the overall mean 'body_mass_g':
penguins['body_mass_g'].mean() 
Out[56]:
4207.057057057057
In [57]:
# To find the mean 'body_mass_g' for each 'species':
penguins.groupby('species')['body_mass_g'].mean() 
Out[57]:
species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64
  • Somehow, the groupby method computes what we're looking for in just one line. How?
  • We'll work through the internals, but remember this: if you need to calculate something for each group, use groupby!

An illustrative example: Pets 🐱 🐶🐹¶

  • Consider the DataFrame pets, shown below.
Species Color Weight Age
0 dog black 40 5.0
1 cat golden 15 8.0
2 cat black 20 9.0
3 dog white 80 2.0
4 dog golden 25 0.5
5 hamster golden 1 3.0
  • Let's see what happens under the hood when we use the groupby method on pets.
In [58]:
show_grouping_animation()

Let's try it out!¶

In [59]:
pets = pd.DataFrame().assign(
    Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
    Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
    Weight=[40, 15, 20, 80, 25, 1],
    Age=[5, 8, 9, 2, 0.5, 3]
)
pets
Out[59]:
Species Color Weight Age
0 dog black 40 5.0
1 cat golden 15 8.0
2 cat black 20 9.0
3 dog white 80 2.0
4 dog golden 25 0.5
5 hamster golden 1 3.0
In [60]:
# Why does this error?
pets.groupby('Species').mean()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/groupby/groupby.py:1870, in GroupBy._agg_py_fallback(self, how, values, ndim, alt)
   1869 try:
-> 1870     res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
   1871 except Exception as err:

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/groupby/ops.py:850, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
    848     preserve_dtype = True
--> 850 result = self._aggregate_series_pure_python(obj, func)
    852 npvalues = lib.maybe_convert_objects(result, try_float=False)

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/groupby/ops.py:871, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
    870 for i, group in enumerate(splitter):
--> 871     res = func(group)
    872     res = extract_result(res)

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/groupby/groupby.py:2376, in GroupBy.mean.<locals>.<lambda>(x)
   2373 else:
   2374     result = self._cython_agg_general(
   2375         "mean",
-> 2376         alt=lambda x: Series(x).mean(numeric_only=numeric_only),
   2377         numeric_only=numeric_only,
   2378     )
   2379     return result.__finalize__(self.obj, method="groupby")

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/series.py:6226, in Series.mean(self, axis, skipna, numeric_only, **kwargs)
   6218 @doc(make_doc("mean", ndim=1))
   6219 def mean(
   6220     self,
   (...)
   6224     **kwargs,
   6225 ):
-> 6226     return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/generic.py:11969, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
  11962 def mean(
  11963     self,
  11964     axis: Axis | None = 0,
   (...)
  11967     **kwargs,
  11968 ) -> Series | float:
> 11969     return self._stat_function(
  11970         "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
  11971     )

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/generic.py:11926, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  11924 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11926 return self._reduce(
  11927     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11928 )

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/series.py:6134, in Series._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   6130     raise TypeError(
   6131         f"Series.{name} does not allow {kwd_name}={numeric_only} "
   6132         "with non-numeric dtypes."
   6133     )
-> 6134 return op(delegate, skipna=skipna, **kwds)

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
    146 else:
--> 147     result = alt(values, axis=axis, skipna=skipna, **kwds)
    149 return result

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/nanops.py:404, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
    402     mask = isna(values)
--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
    406 if datetimelike:

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/nanops.py:720, in nanmean(values, axis, skipna, mask)
    719 the_sum = values.sum(axis, dtype=dtype_sum)
--> 720 the_sum = _ensure_numeric(the_sum)
    722 if axis is not None and getattr(the_sum, "ndim", False):

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/nanops.py:1693, in _ensure_numeric(x)
   1691 if isinstance(x, str):
   1692     # GH#44008, GH#36703 avoid casting e.g. strings to numeric
-> 1693     raise TypeError(f"Could not convert string '{x}' to numeric")
   1694 try:

TypeError: Could not convert string 'goldenblack' to numeric

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[60], line 2
      1 # Why does this error?
----> 2 pets.groupby('Species').mean()

(... traceback through pandas internals omitted ...)
TypeError: agg function failed [how->mean,dtype->object]
In [61]:
pets.groupby('Species')[['Weight', 'Age']].mean() 
Out[61]:
Weight Age
Species
cat 17.50 8.5
dog 48.33 2.5
hamster 1.00 3.0
In [62]:
pets.groupby('Species').max() 
Out[62]:
Color Weight Age
Species
cat golden 20 9.0
dog white 80 5.0
hamster golden 1 3.0

It takes several steps to go from the original pets DataFrame to this grouped DataFrame, but we don't get to see any of Python's inner workings, just the final output.

"Split-apply-combine" paradigm¶

  • The groupby method involves three steps: split, apply, and combine.
    This is the same terminology that the pandas documentation uses.
  • Split breaks up and "groups" the rows of a DataFrame according to the specified key.
    There is one "group" for every unique value of the key.
  • Apply uses a function (e.g. aggregation, transformation, filtration) within the individual groups.
  • Combine stitches the results of these operations into an output DataFrame.
  • The split-apply-combine pattern can be parallelized to work on multiple computers or threads, by sending computations for each group to different processors.
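The three steps can also be sketched by hand. Below is a minimal, purely conceptual illustration (on a hypothetical toy DataFrame, not one from lecture) of what groupby does for you automatically:

```python
import pandas as pd

# Toy DataFrame, for illustration only.
df = pd.DataFrame({'kind': ['a', 'b', 'a', 'b'],
                   'value': [1, 2, 3, 4]})

# Split: one sub-DataFrame per unique value of the key.
groups = {key: sub_df for key, sub_df in df.groupby('kind')}

# Apply: aggregate within each group separately.
means = {key: sub_df['value'].mean() for key, sub_df in groups.items()}

# Combine: stitch the per-group results into one output Series.
result = pd.Series(means)
```

In practice, `df.groupby('kind')['value'].mean()` performs all three steps at once (and much faster); the dictionary version is just to make the split and combine steps visible.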

More examples¶

Before we dive into the internals, let's look at a few more examples.

Activity

Which 'species' has the highest median 'bill_length_mm'?

In [63]:
(
    penguins
    .groupby('species')
    ['bill_length_mm']
    .median()
    .idxmax()
)
Out[63]:
'Chinstrap'
In [64]:
(
    penguins
    .groupby('species')
    ['bill_length_mm']
    .median()
    .plot(kind='barh', title='Median Bill Length of Each Species')
)
In [ ]:
 

Activity

What proportion of penguins of each 'species' live on 'Dream' island?

In [65]:
(
    penguins
    .assign(is_Dream=penguins['island'] == 'Dream')
    .groupby('species')
    ['is_Dream']
    .mean()
)
Out[65]:
species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
Name: is_Dream, dtype: float64
In [ ]:
 

groupby's inner workings¶


How does groupby actually work?¶

  • We've just evaluated a few expressions of the following form.
In [66]:
penguins.groupby('species')['bill_length_mm'].mean()
Out[66]:
species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64
  • There are three "building blocks" in the above expression:
    1. penguins.groupby('species'): first, we specify which column we want to group on.
    2. ['bill_length_mm']: then, we select the other relevant columns for our calculations.
    3. .mean(): finally, we use an aggregation method.

  • Let's see what each block contributes to the output.

DataFrameGroupBy objects¶


penguins.groupby('species')['bill_length_mm'].mean()

  • If df is a DataFrame, then df.groupby(key) returns a DataFrameGroupBy object.
    This object represents the "split" in "split-apply-combine".
In [67]:
# Simplified DataFrame for demonstration:
penguins_small = penguins.iloc[[0, 150, 300, 1, 251, 151, 301], [0, 5, 6]]
penguins_small
Out[67]:
species body_mass_g sex
0 Adelie 3750.0 Male
150 Chinstrap 3725.0 Male
300 Gentoo 4875.0 Female
1 Adelie 3800.0 Female
251 Gentoo 4350.0 Female
151 Chinstrap 3950.0 Female
301 Gentoo 5550.0 Male
In [68]:
# Creates one group for each unique value in the species column.
penguins_small.groupby('species')
Out[68]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x309c06740>
  • DataFrameGroupBy objects have a groups attribute, which is a dictionary in which the keys are group names and the values are lists of row labels.
    We won't actually use this, but it's helpful in understanding how groupby works under the hood.
In [69]:
penguins_small.groupby('species').groups
Out[69]:
{'Adelie': [0, 1], 'Chinstrap': [150, 151], 'Gentoo': [300, 251, 301]}
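Relatedly, if you want to look at one group's rows directly, DataFrameGroupBy objects have a get_group method. A quick sketch, on a hypothetical mini-DataFrame standing in for penguins_small:

```python
import pandas as pd

# Hypothetical stand-in for penguins_small.
demo = pd.DataFrame({'species': ['Adelie', 'Gentoo', 'Adelie'],
                     'body_mass_g': [3750.0, 4875.0, 3800.0]})

# get_group returns the sub-DataFrame for a single key,
# i.e. one piece of the "split" step.
adelie = demo.groupby('species').get_group('Adelie')
```

Like .groups, this is mostly useful for inspection and debugging, not for everyday analysis.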

Column extraction¶


penguins.groupby('species')['bill_length_mm'].mean()

  • After creating a DataFrameGroupBy object, we typically select the relevant column(s) that we want to aggregate.
  • The result is either a SeriesGroupBy or DataFrameGroupBy object, depending on what's passed in.
In [70]:
penguins.groupby('species')['bill_length_mm'] 
Out[70]:
<pandas.core.groupby.generic.SeriesGroupBy object at 0x168927400>
In [71]:
penguins.groupby('species')[['bill_length_mm', 'bill_depth_mm']] 
Out[71]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x309bc25c0>
  • As we've seen already, you should select the columns you want to aggregate before using your aggregation method, or you may run into errors!
In [72]:
# This errors, because there are non-numeric columns in penguins
# that it's trying to take the "mean" of, like 'island'.
penguins.groupby('species').mean() 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
(... traceback through pandas internals omitted ...)

TypeError: Could not convert string 'TorgersenTorgersen...Dream' to numeric

The above exception was the direct cause of the following exception:

TypeError                                 Traceback (most recent call last)
Cell In[72], line 3
      1 # This errors, because there are non-numeric columns in penguins
      2 # that it's trying to take the "mean" of, like 'island'.
----> 3 penguins.groupby('species').mean()

(... traceback through pandas internals omitted ...)
TypeError: agg function failed [how->mean,dtype->object]
In [73]:
penguins.groupby('species')[['bill_length_mm', 'bill_depth_mm']].mean() 
Out[73]:
bill_length_mm bill_depth_mm
species
Adelie 38.82 18.35
Chinstrap 48.83 18.42
Gentoo 47.57 15.00
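Another way to avoid the error above is the numeric_only=True argument, which tells the aggregation method to silently skip non-numeric columns. A sketch, on a hypothetical toy DataFrame (selecting columns explicitly, as above, is usually clearer about your intent):

```python
import pandas as pd

# Toy stand-in with one non-numeric column.
demo = pd.DataFrame({'species': ['Adelie', 'Adelie', 'Gentoo'],
                     'island': ['Torgersen', 'Biscoe', 'Biscoe'],
                     'body_mass_g': [3750.0, 3800.0, 4875.0]})

# numeric_only=True drops 'island' before averaging,
# so no TypeError is raised.
out = demo.groupby('species').mean(numeric_only=True)
```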

Aggregation¶


penguins.groupby('species')['bill_length_mm'].mean()

  • Once we create a DataFrameGroupBy or SeriesGroupBy object, we need to apply some function to each group, and combine the results.
  • The most common operation we apply to each group is an aggregation, but we'll see examples of filtrations and transformations soon.
    Remember, aggregation is the act of combining many values into a single value.
  • To perform an aggregation, use an aggregation method on the DataFrameGroupBy or SeriesGroupBy object, e.g. .mean(), .max(), or .median().

Let's look at some examples.

In [74]:
penguins_small
Out[74]:
species body_mass_g sex
0 Adelie 3750.0 Male
150 Chinstrap 3725.0 Male
300 Gentoo 4875.0 Female
1 Adelie 3800.0 Female
251 Gentoo 4350.0 Female
151 Chinstrap 3950.0 Female
301 Gentoo 5550.0 Male
In [75]:
penguins_small.groupby('species')['body_mass_g'].mean() 
Out[75]:
species
Adelie       3775.0
Chinstrap    3837.5
Gentoo       4925.0
Name: body_mass_g, dtype: float64
In [76]:
# Note that this worked on the entire DataFrame!
# But, if all we wanted were the sums of 'body_mass_g'
# for each species, this is slower than
# penguins_small.groupby('species')['body_mass_g'].sum().
penguins_small.groupby('species').sum() 
Out[76]:
body_mass_g sex
species
Adelie 7550.0 MaleFemale
Chinstrap 7675.0 MaleFemale
Gentoo 14775.0 FemaleFemaleMale
In [77]:
# Often used in conjunction with sort_values.
# Remember this when you work on the activity in a few slides!
penguins_small.groupby('species').last() 
Out[77]:
body_mass_g sex
species
Adelie 3800.0 Female
Chinstrap 3950.0 Female
Gentoo 5550.0 Male
In [78]:
penguins_small.groupby('species').max() 
Out[78]:
body_mass_g sex
species
Adelie 3800.0 Male
Chinstrap 3950.0 Male
Gentoo 5550.0 Male

Column independence¶

  • Within each group, the aggregation method is applied to each column independently.
In [79]:
penguins_small.groupby('species').max()
Out[79]:
body_mass_g sex
species
Adelie 3800.0 Male
Chinstrap 3950.0 Male
Gentoo 5550.0 Male
  • The above result is not telling us that there is a 'Male' 'Adelie' penguin with a 'body_mass_g' of 3800.0!
In [80]:
# This penguin is Female!
penguins_small.loc[(penguins['species'] == 'Adelie') & (penguins['body_mass_g'] == 3800.0)]
Out[80]:
species body_mass_g sex
1 Adelie 3800.0 Female
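If what you actually want is the entire row achieving the maximum within each group (rather than independent per-column maxes), one common pattern is groupby + idxmax, then .loc. A sketch on a hypothetical toy DataFrame:

```python
import pandas as pd

demo = pd.DataFrame({'species': ['Adelie', 'Adelie', 'Gentoo'],
                     'body_mass_g': [3750.0, 3800.0, 4875.0],
                     'sex': ['Male', 'Female', 'Female']})

# idxmax gives the row label of each group's maximum,
# so .loc recovers full rows, keeping columns consistent.
heaviest = demo.loc[demo.groupby('species')['body_mass_g'].idxmax()]
```

Here, unlike with .max(), each output row really exists in the original DataFrame.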

Activity

Find the 'species', 'island', and 'body_mass_g' of the heaviest 'Male' and 'Female' penguins in penguins (not penguins_small).

In [81]:
# General idea: Sort the penguins by mass in decreasing order.
# Then, the first male penguin that appears is the heaviest male penguin,
# and the first female penguin that appears is the heaviest female penguin.
# For each sex, take the first row.
(
    penguins
    .sort_values('body_mass_g', ascending=False)
    .groupby('sex')
    .first()
)
Out[81]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
sex
Female Gentoo Biscoe 45.2 14.8 212.0 5200.0
Male Gentoo Biscoe 49.2 15.2 221.0 6300.0

Beyond default aggregation methods¶

  • There are many built-in aggregation methods.
  • What if you want to apply different aggregation methods to different columns?
  • What if the aggregation method you want to use doesn't already exist in pandas?

The aggregate method¶

  • DataFrameGroupBy and SeriesGroupBy objects have a general aggregate method, which aggregates using one or more operations.
    Remember, aggregation is the act of combining many values into a single value.
  • There are many ways of using aggregate; refer to the documentation for a comprehensive list.
    Per the documentation, agg is an alias for aggregate.
  • Example arguments:
    • A single function.
    • A list of functions.
    • A dictionary mapping column names to functions.
  • We've attached a Reference Slide with examples.

Reference Slide¶

Examples¶

  • How many penguins are there of each 'species', and what is the mean 'body_mass_g' of each 'species'?
In [82]:
(
    penguins
    .groupby('species')
    ['body_mass_g']
    .aggregate(['count', 'mean'])
)
Out[82]:
count mean
species
Adelie 146 3706.16
Chinstrap 68 3733.09
Gentoo 119 5092.44
  • What is the maximum 'bill_length_mm' of each 'species', and which 'island'(s) is each 'species' found on?
In [83]:
(
    penguins
    .groupby('species')
    .agg({'bill_length_mm': 'max', 'island': 'unique'})
)
Out[83]:
bill_length_mm island
species
Adelie 46.0 [Torgersen, Biscoe, Dream]
Chinstrap 58.0 [Dream]
Gentoo 59.6 [Biscoe]
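agg also supports "named aggregation," in which each keyword argument names an output column and maps to a (column, aggregation) pair. A sketch on a hypothetical toy DataFrame:

```python
import pandas as pd

demo = pd.DataFrame({'species': ['Adelie', 'Adelie', 'Gentoo'],
                     'body_mass_g': [3750.0, 3800.0, 4875.0]})

# Each keyword becomes an output column name; its value is a
# (column to aggregate, aggregation method) pair.
summary = demo.groupby('species').agg(
    n=('body_mass_g', 'count'),
    mean_mass=('body_mass_g', 'mean'),
)
```

This is handy when you apply several aggregations to the same column and want readable output column names.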

Activity

What is the interquartile range of the 'body_mass_g' of each 'species'?

The interquartile range of a distribution is defined as:

$$\text{75th percentile} - \text{25th percentile}$$

*Hint*: Use np.percentile, and pass agg/aggregate a custom function.

In [84]:
# Here, the argument to agg is a function,
# which takes in a Series and returns a scalar.
def iqr(s):
    return np.percentile(s, 75) - np.percentile(s, 25)
(
    penguins
    .groupby('species')
    ['body_mass_g']
    .agg(iqr)
)
Out[84]:
species
Adelie       637.5
Chinstrap    462.5
Gentoo       800.0
Name: body_mass_g, dtype: float64

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

What questions do you have?

Split-apply-combine, revisited¶

  • When we introduced the split-apply-combine pattern, the "apply" step involved aggregation – our final DataFrame had one row for each group.
  • Instead of aggregating during the apply step, we could instead perform a filtration, in which we keep only the groups that satisfy some condition.
  • Or a transformation, in which we perform operations to every value within each group.

Grouping, then filtering¶

  • To keep only the groups that satisfy a particular condition, use the filter method on a DataFrameGroupBy/SeriesGroupBy object.
    The filter method takes in a function, which itself takes in a DataFrame/Series and returns a single Boolean. The result is a new DataFrame/Series with only the groups for which the filter function returned True.
  • For example, suppose we want only the 'species' whose average 'bill_length_mm' is above 39.
In [85]:
(
    penguins
    .groupby('species')
    .filter(lambda df: df['bill_length_mm'].mean() > 39)
)
Out[85]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
146 Chinstrap Dream 46.5 17.9 192.0 3500.0 Female
147 Chinstrap Dream 50.0 19.5 196.0 3900.0 Male
148 Chinstrap Dream 51.3 19.2 193.0 3650.0 Male
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

187 rows × 7 columns

  • No more 'Adelie's!

Activity

Create a new DataFrame with only the rows in penguins for popular 'species' – that is, 'species' with at least 100 penguins.

In [86]:
(
    penguins
    .groupby('species')
    .filter(lambda df: df.shape[0] >= 100)
)
Out[86]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

265 rows × 7 columns

In [87]:
# Note that to just find the 'species' with at least 100 penguins,
# we didn't need to group:
penguins['species'].value_counts()
Out[87]:
species
Adelie       146
Gentoo       119
Chinstrap     68
Name: count, dtype: int64

Reference Slide¶

Example: Z-Scoring¶

  • Suppose we want to convert the 'body_mass_g' column to z-scores (i.e. standard units):
$$z(x_i) = \frac{x_i - \text{mean of } x}{\text{SD of } x}$$
In [88]:
def z_score(x):
    return (x - x.mean()) / x.std(ddof=0)
In [89]:
z_score(penguins['body_mass_g'])
Out[89]:
0     -0.57
1     -0.51
2     -1.19
       ... 
330    1.92
331    1.23
332    1.48
Name: body_mass_g, Length: 333, dtype: float64

Reference Slide¶

Grouping, then transforming¶

  • Now, what if we wanted the z-score within each group?
  • To do so, we can use the transform method on a DataFrameGroupBy object. The transform method takes in a function, which itself takes in a Series and returns a new Series.
  • A transformation produces a DataFrame or Series of the same size – it is not an aggregation!
In [90]:
z_mass = (penguins
          .groupby('species')
          ['body_mass_g']
          .transform(z_score))
z_mass
Out[90]:
0      0.10
1      0.21
2     -1.00
       ... 
330    1.32
331    0.22
332    0.62
Name: body_mass_g, Length: 333, dtype: float64
In [91]:
penguins.assign(z_mass=z_mass)
Out[91]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex z_mass
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 0.10
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 0.21
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female -1.00
... ... ... ... ... ... ... ... ...
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male 1.32
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female 0.22
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male 0.62

333 rows × 8 columns

In [92]:
display_df(penguins.assign(z_mass=z_mass), rows=8)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex z_mass
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male 0.10
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female 0.21
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female -1.00
3 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female -0.56
... ... ... ... ... ... ... ... ...
329 Gentoo Biscoe 46.8 14.3 215.0 4850.0 Female -0.49
330 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male 1.32
331 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female 0.22
332 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male 0.62

333 rows × 8 columns

  • Note that above, penguin 329 has a larger 'body_mass_g' than penguin 0, but a lower 'z_mass'.
    • Penguin 0 has an above average 'body_mass_g' among 'Adelie' penguins.
    • Penguin 329 has a below average 'body_mass_g' among 'Gentoo' penguins. Remember from earlier that the average 'body_mass_g' of 'Gentoo' penguins is much higher than for other species.

What's next?¶

  • Can we group on multiple columns at once?
  • What does pivot_table do?
  • How do we combine two DataFrames with information about similar individuals?
  • How do we deal with missing values?
  • How do we decide which type of visualization to create?