from lec_utils import *
def show_chaining_slides():
    """Render the chaining slide deck inline as an embedded frame.

    Builds an ``IFrame`` (1000 wide by 590 tall) pointing at a published
    Google Slides embed URL and passes it to ``display``. Both names are
    expected to be provided by the ``lec_utils`` star import at the top of
    the file.
    """
    slides_url = 'https://docs.google.com/presentation/d/e/2PACX-1vRYAERthJoyVnD1JymDK6JAtufCJmA5AYb5H2NwlegJHm04WhBfxnA0zQO3vKbEYQbqOJ8XJPZtfoxF/embed?start=false&loop=false&rm=minimal'
    frame_size = (1000, 590)  # (width, height)
    display(IFrame(slides_url, *frame_size))

import pandas as pd
import numpy as np

breed,kind,lifetime_cost,longevity,size,weight,height
Brittany,sporting,22589.0,12.92,medium,35.0,19.0
Cairn Terrier,terrier,21992.0,13.84,small,14.0,10.0
English Cocker Spaniel,sporting,18993.0,11.66,medium,30.0,16.0
Cocker Spaniel,sporting,24330.0,12.5,small,25.0,14.5
Shetland Sheepdog,herding,21006.0,12.53,small,22.0,14.5
Siberian Husky,working,22049.0,12.58,medium,47.5,21.75
Miniature Schnauzer,terrier,20087.0,11.81,small,15.5,13.0
Chihuahua,toy,26250.0,16.5,small,5.5,5.0
English Springer Spaniel,sporting,21946.0,12.54,medium,45.0,19.5
German Shorthaired Pointer,sporting,25842.0,11.46,large,62.5,24.0
Pointer,sporting,24445.0,12.42,large,59.5,25.5
Tibetan Spaniel,non-sporting,25549.0,14.42,small,12.0,10.0
Labrador Retriever,sporting,21299.0,12.04,medium,67.5,23.0
Maltese,toy,19084.0,12.25,small,5.0,9.0
Shih Tzu,toy,21152.0,13.2,small,12.5,9.75
Irish Setter,sporting,20323.0,11.63,large,65.0,26.0
Golden Retriever,sporting,21447.0,12.04,medium,60.0,22.75
Chesapeake Bay Retriever,sporting,16697.0,9.48,large,67.5,23.5
Tibetan Terrier,non-sporting,20336.0,12.31,small,24.0,15.5
Gordon Setter,sporting,19605.0,11.1,large,62.5,25.0
Pug,toy,18527.0,11.0,medium,16.0,16.0
Norfolk Terrier,terrier,24308.0,13.07,small,12.0,9.5
English Toy Spaniel,toy,17521.0,10.1,small,11.0,10.0
Cavalier King Charles Spaniel,toy,18639.0,11.29,small,15.5,12.5
Basenji,hound,22096.0,13.58,medium,23.0,16.5
Staffordshire Bull Terrier,terrier,21650.0,12.05,medium,31.0,15.0
Pembroke Welsh Corgi,herding,23978.0,12.25,small,26.0,11.0
Clumber Spaniel,sporting,18084.0,10.0,medium,70.0,18.5
Dandie Dinmont Terrier,terrier,21633.0,12.17,small,21.0,9.0
Giant Schnauzer,working,26686.0,10.0,large,77.5,25.5
Scottish Terrier,terrier,17525.0,10.69,small,20.0,10.0
Kerry Blue Terrier,terrier,17240.0,9.4,medium,36.5,18.5
Afghan Hound,hound,24077.0,11.92,large,55.0,26.0
Newfoundland,working,19351.0,9.32,large,125.0,27.0
Rhodesian Ridgeback,hound,16530.0,9.1,large,77.5,25.5
Borzoi,hound,16176.0,9.08,large,82.5,28.0
Bull Terrier,terrier,18490.0,10.21,medium,60.0,21.5
Alaskan Malamute,working,21986.0,10.67,large,80.0,24.0
Bloodhound,hound,13824.0,6.75,large,85.0,25.0
Bullmastiff,working,13936.0,7.57,large,115.0,25.5
Mastiff,working,13581.0,6.5,large,175.0,30.0
Saint Bernard,working,20022.0,7.78,large,155.0,26.5

(42, 7)

RangeIndex(start=0, stop=42, step=1)

(42, 6)

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Miniature Schnauzer',
       'Chihuahua', 'English Springer Spaniel', 'German Shorthaired Pointer',
       'Pointer', 'Tibetan Spaniel', 'Labrador Retriever', 'Maltese',
       'Shih Tzu', 'Irish Setter', 'Golden Retriever',
       'Chesapeake Bay Retriever', 'Tibetan Terrier', 'Gordon Setter', 'Pug',
       'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

'Mastiff'

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 42, dtype: object

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[24], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797, in Index.get_loc(self, key)
   3792     if isinstance(casted_key, slice) or (
   3793         isinstance(casted_key, abc.Iterable)
   3794         and any(isinstance(x, slice) for x in casted_key)
   3795     ):
   3796         raise InvalidIndexError(key)
-> 3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
   3802     self._check_indexing_error(key)

KeyError: 'breed'

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Miniature Schnauzer',
       'Chihuahua', 'English Springer Spaniel', 'German Shorthaired Pointer',
       'Pointer', 'Tibetan Spaniel', 'Labrador Retriever', 'Maltese',
       'Shih Tzu', 'Irish Setter', 'Golden Retriever',
       'Chesapeake Bay Retriever', 'Tibetan Terrier', 'Gordon Setter', 'Pug',
       'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

array(['sporting', 'terrier', 'herding', 'working', 'toy', 'non-sporting',
       'hound'], dtype=object)

import pandas as pd
import numpy as np

# The "cat" shell command shows you the contents of a file.
!cat data/dogs42.csv

breed,kind,lifetime_cost,longevity,size,weight,height
Brittany,sporting,22589.0,12.92,medium,35.0,19.0
Cairn Terrier,terrier,21992.0,13.84,small,14.0,10.0
English Cocker Spaniel,sporting,18993.0,11.66,medium,30.0,16.0
Cocker Spaniel,sporting,24330.0,12.5,small,25.0,14.5
Shetland Sheepdog,herding,21006.0,12.53,small,22.0,14.5
Siberian Husky,working,22049.0,12.58,medium,47.5,21.75
Miniature Schnauzer,terrier,20087.0,11.81,small,15.5,13.0
Chihuahua,toy,26250.0,16.5,small,5.5,5.0
English Springer Spaniel,sporting,21946.0,12.54,medium,45.0,19.5
German Shorthaired Pointer,sporting,25842.0,11.46,large,62.5,24.0
Pointer,sporting,24445.0,12.42,large,59.5,25.5
Tibetan Spaniel,non-sporting,25549.0,14.42,small,12.0,10.0
Labrador Retriever,sporting,21299.0,12.04,medium,67.5,23.0
Maltese,toy,19084.0,12.25,small,5.0,9.0
Shih Tzu,toy,21152.0,13.2,small,12.5,9.75
Irish Setter,sporting,20323.0,11.63,large,65.0,26.0
Golden Retriever,sporting,21447.0,12.04,medium,60.0,22.75
Chesapeake Bay Retriever,sporting,16697.0,9.48,large,67.5,23.5
Tibetan Terrier,non-sporting,20336.0,12.31,small,24.0,15.5
Gordon Setter,sporting,19605.0,11.1,large,62.5,25.0
Pug,toy,18527.0,11.0,medium,16.0,16.0
Norfolk Terrier,terrier,24308.0,13.07,small,12.0,9.5
English Toy Spaniel,toy,17521.0,10.1,small,11.0,10.0
Cavalier King Charles Spaniel,toy,18639.0,11.29,small,15.5,12.5
Basenji,hound,22096.0,13.58,medium,23.0,16.5
Staffordshire Bull Terrier,terrier,21650.0,12.05,medium,31.0,15.0
Pembroke Welsh Corgi,herding,23978.0,12.25,small,26.0,11.0
Clumber Spaniel,sporting,18084.0,10.0,medium,70.0,18.5
Dandie Dinmont Terrier,terrier,21633.0,12.17,small,21.0,9.0
Giant Schnauzer,working,26686.0,10.0,large,77.5,25.5
Scottish Terrier,terrier,17525.0,10.69,small,20.0,10.0
Kerry Blue Terrier,terrier,17240.0,9.4,medium,36.5,18.5
Afghan Hound,hound,24077.0,11.92,large,55.0,26.0
Newfoundland,working,19351.0,9.32,large,125.0,27.0
Rhodesian Ridgeback,hound,16530.0,9.1,large,77.5,25.5
Borzoi,hound,16176.0,9.08,large,82.5,28.0
Bull Terrier,terrier,18490.0,10.21,medium,60.0,21.5
Alaskan Malamute,working,21986.0,10.67,large,80.0,24.0
Bloodhound,hound,13824.0,6.75,large,85.0,25.0
Bullmastiff,working,13936.0,7.57,large,115.0,25.5
Mastiff,working,13581.0,6.5,large,175.0,30.0
Saint Bernard,working,20022.0,7.78,large,155.0,26.5

# Load the CSV into a DataFrame. No index column is specified, so the rows
# get the default 0, 1, 2, ... index.
dogs = pd.read_csv('data/dogs42.csv') 
dogs

dogs

# Note that the index – 0, 1, 2, ... – does **not** count as a column!
dogs.shape

(42, 7)

dogs.head(3)

dogs.tail(2)

# Note that the index is no longer 0, 1, 2, ...!
dogs.sort_values('height', ascending=False)

dogs

dogs.index

RangeIndex(start=0, stop=42, step=1)

dogs.set_index('breed')

# The above cell didn't involve an assignment statement, so dogs was unchanged.
dogs

# By reassigning dogs, our changes will persist.
# Note that we can't run this cell twice! Try it and see what happens.
dogs = dogs.set_index('breed')
dogs

# There used to be 7 columns, but now there are only 6!
# The index is **not** a column!
dogs.shape

(42, 6)

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Miniature Schnauzer',
       'Chihuahua', 'English Springer Spaniel', 'German Shorthaired Pointer',
       'Pointer', 'Tibetan Spaniel', 'Labrador Retriever', 'Maltese',
       'Shih Tzu', 'Irish Setter', 'Golden Retriever',
       'Chesapeake Bay Retriever', 'Tibetan Terrier', 'Gordon Setter', 'Pug',
       'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

# Sort rows from tallest to shortest, then take the first index label.
# Breeds are stored in the index, so that label is the breed name.
tallest_breed = dogs.sort_values('height', ascending=False).index[0] 
tallest_breed

'Mastiff'

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 42, dtype: object

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[24], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797, in Index.get_loc(self, key)
   3792     if isinstance(casted_key, slice) or (
   3793         isinstance(casted_key, abc.Iterable)
   3794         and any(isinstance(x, slice) for x in casted_key)
   3795     ):
   3796         raise InvalidIndexError(key)
-> 3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
   3802     self._check_indexing_error(key)

KeyError: 'breed'

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Miniature Schnauzer',
       'Chihuahua', 'English Springer Spaniel', 'German Shorthaired Pointer',
       'Pointer', 'Tibetan Spaniel', 'Labrador Retriever', 'Maltese',
       'Shih Tzu', 'Irish Setter', 'Golden Retriever',
       'Chesapeake Bay Retriever', 'Tibetan Terrier', 'Gordon Setter', 'Pug',
       'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

array(['sporting', 'terrier', 'herding', 'working', 'toy', 'non-sporting',
       'hound'], dtype=object)

7

kind
sporting        12
terrier          8
working          7
toy              6
hound            5
herding          2
non-sporting     2
Name: count, dtype: int64

11.279285714285715

count     42.00
mean      50.17
std       39.52
          ...  
50%       40.75
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 42, dtype: float64

# Sort rows from tallest to shortest, then take the first index label.
# Breeds are stored in the index, so that label is the breed name.
tallest_breed = dogs.sort_values('height', ascending=False).index[0] 
tallest_breed

'Mastiff'

def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display df with temporary limits on how many rows and columns are shown.

    rows and cols are applied as the pandas "display.max_rows" and
    "display.max_columns" options for the duration of the display call only.
    Their default values are read from pd.options once, at the moment this
    function is defined.
    """
    temporary_options = ("display.max_rows", rows, "display.max_columns", cols)
    with pd.option_context(*temporary_options):
        display(df)

display_df(dogs.sort_values('weight', ascending=False), rows=42)

dogs

# Returns a Series. Note the index appears again on the left!
dogs['kind']

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 42, dtype: object

# Returns a DataFrame.
dogs[['kind', 'size']]

# 🤔 A list with a single label still returns a DataFrame (with one column), not a Series.
dogs[['kind']]

# Breeds are stored in the index, which is not a column!
dogs['breed']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[24], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797, in Index.get_loc(self, key)
   3792     if isinstance(casted_key, slice) or (
   3793         isinstance(casted_key, abc.Iterable)
   3794         and any(isinstance(x, slice) for x in casted_key)
   3795     ):
   3796         raise InvalidIndexError(key)
-> 3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
   3802     self._check_indexing_error(key)

KeyError: 'breed'

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Miniature Schnauzer',
       'Chihuahua', 'English Springer Spaniel', 'German Shorthaired Pointer',
       'Pointer', 'Tibetan Spaniel', 'Labrador Retriever', 'Maltese',
       'Shih Tzu', 'Irish Setter', 'Golden Retriever',
       'Chesapeake Bay Retriever', 'Tibetan Terrier', 'Gordon Setter', 'Pug',
       'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

dogs

# What are the unique kinds of dogs?
dogs['kind'].unique()

array(['sporting', 'terrier', 'herding', 'working', 'toy', 'non-sporting',
       'hound'], dtype=object)

# How many unique kinds of dogs are there?
dogs['kind'].nunique()

7

# What's the distribution of kinds?
# value_counts is super useful – and I love asking exam questions about it!
dogs['kind'].value_counts()

kind
sporting        12
terrier          8
working          7
toy              6
hound            5
herding          2
non-sporting     2
Name: count, dtype: int64

# What's the mean of the 'longevity' column?
dogs['longevity'].mean()

11.279285714285715

# Tell me more about the 'weight' column.
dogs['weight'].describe()

count     42.00
mean      50.17
std       39.52
          ...  
50%       40.75
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

# Sort the 'lifetime_cost' column. Note that here we're using sort_values on a Series, not a DataFrame!
dogs['lifetime_cost'].sort_values()

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 42, dtype: float64

# Gives us the index of the largest value, not the largest value itself.
# Note that this makes our Activity from a few slides ago way easier!
dogs['height'].idxmax()

'Mastiff'

# Most common 'size' among the 10 heaviest breeds, written as a single-line chain.
dogs.sort_values('weight', ascending=False).head(10)['size'].value_counts().idxmax()

'large'

show_chaining_slides()

# Most common 'size' among the 10 heaviest breeds, with one step per line.
# The outer parentheses let a single expression span multiple lines.
(
    dogs
    .sort_values('weight', ascending=False)  # heaviest breeds first
    .head(10)                                # keep the first 10 rows
    ['size']                                 # Series of their sizes
    .value_counts()                          # frequency of each size
    .idxmax()                                # size with the highest count
)

'large'

x = pd.Series({'a': 1, 'b': 2})
x

a    1
b    2
dtype: int64

x * 5

a     5
b    10
dtype: int64

y = pd.Series({'b': 5, 'c': -1, 'a': 10})
y

b     5
c    -1
a    10
dtype: int64

# If x and y were regular numpy arrays, this would error because of the size mismatch.
x + y

a    11.0
b     7.0
c     NaN
dtype: float64

dogs

2025 + dogs['longevity']

breed
Brittany                  2037.92
Cairn Terrier             2038.84
English Cocker Spaniel    2036.66
                           ...   
Bullmastiff               2032.57
Mastiff                   2031.50
Saint Bernard             2032.78
Name: longevity, Length: 42, dtype: float64

dogs

# BMI-style ratio for each breed: mass divided by height squared.
# All arithmetic is element-wise, so the result is a Series indexed by breed.
# NOTE(review): the conversion factors suggest 'weight' is in pounds and
# 'height' is in inches; confirm against the data source.
weight_kg = dogs['weight'] / 2.2          # presumably pounds -> kilograms
height_m = dogs['height'] * 2.54 / 100    # presumably inches -> centimeters -> meters
bmis = weight_kg / (height_m ** 2)
bmis

breed
Brittany                   68.31
Cairn Terrier              98.64
English Cocker Spaniel     82.56
                           ...  
Bullmastiff               124.60
Mastiff                   137.00
Saint Bernard             155.51
Length: 42, dtype: float64

dogs.plot(kind='scatter', x='weight', y='longevity')

# Hover over a point and see what happens!
(
    dogs
    .reset_index()
    .plot(kind='scatter', x='weight', y='longevity', color='size', hover_name='breed',
          title='Longevity vs. Weight for 42 Dog Breeds')
)

# Horizontal bar chart of how many breeds fall into each kind.
(
    dogs['kind']
    .value_counts()   # number of breeds per kind
    .sort_values()    # re-sort the counts in ascending order
    .plot(kind='barh', title='Distribution of Dog Kinds')
)

dogs

# The first argument is the row label, i.e. the index value.
#        ↓
dogs.loc['Pug', 'longevity']
#                  ↑
# The second argument is the column label.

11.0

type(dogs.loc)

pandas.core.indexing._LocIndexer

type(dogs.sort_values)

method

dogs

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'size']

breed
Cocker Spaniel         small
Labrador Retriever    medium
Name: size, dtype: object

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], ['kind', 'size', 'height']]

# Note that the 'weight' column is included!
# loc, per the pandas documentation, is inclusive of both slicer endpoints.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'lifetime_cost': 'weight']

dogs.loc[['Cocker Spaniel', 'Labrador Retriever']]

dogs

# Try removing the iloc and see what happens!
dogs.iloc[1:15, :-2]

# Weight of the breed with the greatest longevity: sort the rows, pull out the
# 'weight' column, then take the first value by integer position.
dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]

5.5

# Finding the breed itself involves sorting, but not iloc, since breeds are stored in the index.
dogs.sort_values('longevity', ascending=False).index[0]

'Chihuahua'

# Among four chosen breeds, find the second-largest height.
second_tallest_height = (
    dogs
    # Select the four rows by label, keeping only the 'height' column (a Series).
    .loc[['Cocker Spaniel', 'Labrador Retriever', 'Newfoundland', 'Irish Setter'], 'height']
    .sort_values(ascending=False)  # tallest first
    .iloc[1]                       # integer position 1 is the second entry
)
second_tallest_height

26.0

breed
Brittany                  False
Cairn Terrier              True
English Cocker Spaniel    False
                          ...  
Bullmastiff               False
Mastiff                   False
Saint Bernard             False
Name: kind, Length: 42, dtype: bool

33

'Labrador Retriever'

kind             sporting
lifetime_cost     21447.0
longevity           12.04
size               medium
weight               60.0
height              22.75
Name: Golden Retriever, dtype: object

0    fee
1     fi
Name: 1, dtype: object

0     fo
1    fum
Name: 1, dtype: object

1     fi
1    fum
Name: 1, dtype: object

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:158, in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(1, ['1', 1])' is an invalid key

During handling of the above exception, another exception occurred:

InvalidIndexError                         Traceback (most recent call last)
Cell In[82], line 1
----> 1 jack[1, ['1', 1]]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key)
   3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
-> 3802     self._check_indexing_error(key)
   3803     raise

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:5974, in Index._check_indexing_error(self, key)
   5970 def _check_indexing_error(self, key):
   5971     if not is_scalar(key):
   5972         # if key is not a scalar, directly raise an error (the code below
   5973         # would convert to numpy arrays and raise later any way) - GH29926
-> 5974         raise InvalidIndexError(key)

InvalidIndexError: (1, ['1', 1])

'fi'

# Among four chosen breeds, find the second-largest height.
second_tallest_height = (
    dogs
    # Select the four rows by label, keeping only the 'height' column (a Series).
    .loc[['Cocker Spaniel', 'Labrador Retriever', 'Newfoundland', 'Irish Setter'], 'height']
    .sort_values(ascending=False)  # tallest first
    .iloc[1]                       # integer position 1 is the second entry
)
second_tallest_height

26.0

dogs

dogs['kind'] == 'terrier'

breed
Brittany                  False
Cairn Terrier              True
English Cocker Spaniel    False
                          ...  
Bullmastiff               False
Mastiff                   False
Saint Bernard             False
Name: kind, Length: 42, dtype: bool

dogs.loc[dogs['kind'] == 'terrier']

# Number of breeds whose longevity is at least 10: filter the rows with a
# Boolean Series, then read the row count from shape[0].
dogs.loc[dogs['longevity'] >= 10].shape[0]

33

dogs[dogs['longevity'] >= 10]

# Since we're selecting both rows AND columns, we do need loc here.
(
    dogs.loc[dogs.index.str.contains('Retriever'), 'height']
    .sort_values(ascending=False)
    .index[1]
)

'Labrador Retriever'

# Empty DataFrame – not an error!
dogs[dogs['kind'] == 'beaver']

dogs.loc['Golden Retriever']

kind             sporting
lifetime_cost     21447.0
longevity           12.04
size               medium
weight               60.0
height              22.75
Name: Golden Retriever, dtype: object

dogs[(dogs['kind'] == 'sporting') | (dogs['kind'] == 'working')]

# Equivalent to the above!
dogs[dogs['kind'].isin(['sporting', 'working'])]

dogs

0    fee
1     fi
Name: 1, dtype: object

0     fo
1    fum
Name: 1, dtype: object

1     fi
1    fum
Name: 1, dtype: object

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:158, in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(1, ['1', 1])' is an invalid key

During handling of the above exception, another exception occurred:

InvalidIndexError                         Traceback (most recent call last)
Cell In[82], line 1
----> 1 jack[1, ['1', 1]]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key)
   3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
-> 3802     self._check_indexing_error(key)
   3803     raise

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:5974, in Index._check_indexing_error(self, key)
   5970 def _check_indexing_error(self, key):
   5971     if not is_scalar(key):
   5972         # if key is not a scalar, directly raise an error (the code below
   5973         # would convert to numpy arrays and raise later any way) - GH29926
-> 5974         raise InvalidIndexError(key)

InvalidIndexError: (1, ['1', 1])

'fi'

array(['sporting', 'terrier', 'herding', 'working', 'toy', 'non-sporting',
       'hound'], dtype=object)

11.649166666666668

11.655

dogs

# query takes the filter as a string: column names are written bare, and
# string values are quoted inside the expression.
dogs.query('weight < 20 and kind == "terrier"')

# "in" inside a query expression tests membership in a list of values.
dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')

# A DataFrame with two distinct column labels: the integer 1 and the string '1'.
jack = pd.DataFrame({1: ['fee', 'fi'], 
                     '1': ['fo', 'fum']})
jack

0    fee
1     fi
Name: 1, dtype: object

0     fo
1    fum
Name: 1, dtype: object

1     fi
1    fum
Name: 1, dtype: object

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:158, in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(1, ['1', 1])' is an invalid key

During handling of the above exception, another exception occurred:

InvalidIndexError                         Traceback (most recent call last)
Cell In[82], line 1
----> 1 jack[1, ['1', 1]]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key)
   3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
-> 3802     self._check_indexing_error(key)
   3803     raise

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:5974, in Index._check_indexing_error(self, key)
   5970 def _check_indexing_error(self, key):
   5971     if not is_scalar(key):
   5972         # if key is not a scalar, directly raise an error (the code below
   5973         # would convert to numpy arrays and raise later any way) - GH29926
-> 5974         raise InvalidIndexError(key)

InvalidIndexError: (1, ['1', 1])

'fi'

array(['sporting', 'terrier', 'herding', 'working', 'toy', 'non-sporting',
       'hound'], dtype=object)

11.649166666666668

11.655

# A DataFrame with two distinct column labels: the integer 1 and the string '1'.
jack = pd.DataFrame({1: ['fee', 'fi'], 
                     '1': ['fo', 'fum']})
jack

jack[1]

0    fee
1     fi
Name: 1, dtype: object

jack[[1]]

jack['1']

0     fo
1    fum
Name: 1, dtype: object

jack[[1, 1]]

jack.loc[1]

1     fi
1    fum
Name: 1, dtype: object

jack.loc[jack[1] == 'fo']

jack[1, ['1', 1]]

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:158, in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(1, ['1', 1])' is an invalid key

During handling of the above exception, another exception occurred:

InvalidIndexError                         Traceback (most recent call last)
Cell In[82], line 1
----> 1 jack[1, ['1', 1]]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key)
   3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
-> 3802     self._check_indexing_error(key)
   3803     raise

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:5974, in Index._check_indexing_error(self, key)
   5970 def _check_indexing_error(self, key):
   5971     if not is_scalar(key):
   5972         # if key is not a scalar, directly raise an error (the code below
   5973         # would convert to numpy arrays and raise later any way) - GH29926
-> 5974         raise InvalidIndexError(key)

InvalidIndexError: (1, ['1', 1])

# .loc[row_label, column_label]: row with index label 1, column labelled with
# the integer 1. Evaluates to the single value 'fi'.
jack.loc[1, 1]

'fi'

# Distinct values of the 'kind' column, returned as a NumPy array in order of
# first appearance.
dogs['kind'].unique()

array(['sporting', 'terrier', 'herding', 'working', 'toy', 'non-sporting',
       'hound'], dtype=object)

# Mean longevity of sporting breeds: the boolean mask picks the rows, the
# 'longevity' label picks the column, and .mean() averages the resulting Series.
dogs.loc[dogs['kind'] == 'sporting', 'longevity'].mean()

11.649166666666668

# Same query for terriers, so the two group means can be compared directly.
dogs.loc[dogs['kind'] == 'terrier', 'longevity'].mean()

11.655

Lecture 4¶

DataFrame Fundamentals¶

EECS 398: Practical Data Science, Spring 2025¶

Agenda 📆¶

Question 🤔 (Answer at practicaldsc.org/q)

Introduction to `pandas` DataFrames¶

`pandas`¶

`pandas` data structures¶

Example: Dog breeds 🐶¶

Exploring our first DataFrame¶

Setting the index¶

Activity

Reference Slide¶

Displaying more rows/columns¶

Selecting columns¶

Selecting columns with `[]`¶

Useful Series methods¶

Method chaining 🔗¶

Series support vectorized operations¶

Aside: Visualization 📊¶

Selecting slices 🍰¶

Use `loc` to slice rows and columns using labels¶

`loc` is flexible 🧘¶

Use `iloc` to slice rows and columns using positions¶

Activity

Querying 🔎¶

Reflection¶

Querying¶

Reference Slide¶

The `query` method¶

Reference Slide¶

More practice¶

What's next?¶

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589.0	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
2	English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...	...
39	Bullmastiff	working	13936.0	7.57	large	115.0	25.5
40	Mastiff	working	13581.0	6.50	large	175.0	30.0
41	Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Brittany	sporting	22589.0	12.92	medium	35.0	19.0
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	size	height
breed
Cocker Spaniel	sporting	small	14.5
Labrador Retriever	sporting	medium	23.0

	lifetime_cost	longevity	size	weight
breed
Cocker Spaniel	24330.0	12.50	small	25.0
Labrador Retriever	21299.0	12.04	medium	67.5

Lecture 4¶

DataFrame Fundamentals¶

EECS 398: Practical Data Science, Spring 2025¶

Agenda 📆¶

Question 🤔 (Answer at practicaldsc.org/q)

Introduction to pandas DataFrames¶

pandas¶

pandas data structures¶

Example: Dog breeds 🐶¶

Exploring our first DataFrame¶

Setting the index¶

Activity

Reference Slide¶

Displaying more rows/columns¶

Selecting columns¶

Selecting columns with []¶

Useful Series methods¶

Method chaining 🔗¶

Series support vectorized operations¶

Aside: Visualization 📊¶

Selecting slices 🍰¶

Use loc to slice rows and columns using labels¶

loc is flexible 🧘¶

Use iloc to slice rows and columns using positions¶

Activity

Querying 🔎¶

Reflection¶

Querying¶

Reference Slide¶

The query method¶

Reference Slide¶

More practice¶

What's next?¶

Introduction to `pandas` DataFrames¶

`pandas`¶

`pandas` data structures¶

Selecting columns with `[]`¶

Use `loc` to slice rows and columns using labels¶

`loc` is flexible 🧘¶

Use `iloc` to slice rows and columns using positions¶

The `query` method¶