from lec_utils import *

# Run this cell multiple times!
# Returns a random integer between 1 and 6, inclusive.
np.random.randint(1, 7)

3

0.24976347964756174

array(['T', 'H', 'T', 'T', 'T'], dtype='<U1')

array([3, 9])

48

55

49

0.5207

0.5221945185847372

0.9999

# Run this cell multiple times!
# Returns a random integer between 1 and 6, inclusive.
np.random.randint(1, 7)

3

# Returns a random real number in the half-open interval [0, 1): 0 is possible, 1 is not.
np.random.random()

0.24976347964756174

# Returns a randomly selected element from the provided list, 5 times.
np.random.choice(['H', 'T'], 5)

array(['T', 'H', 'T', 'T', 'T'], dtype='<U1')

# Returns the number of occurrences of each outcome
# in 12 trials of an experiment in which
# outcome 1 happens 60% of the time and
# outcome 2 happens 40% of the time.
np.random.multinomial(12, [0.6, 0.4])

array([3, 9])

(np.random.choice(['H', 'T'], 100) == 'H').sum()

48

np.random.multinomial(100, [0.5, 0.5])[0]

55

def num_heads():
    """Simulate 100 fair coin flips and return the number of heads."""
    # multinomial returns one count per outcome; the first outcome is treated as heads.
    heads, _tails = np.random.multinomial(100, [0.5, 0.5])
    return heads
num_heads()

49

# Repeat the experiment 10,000 times, recording the result of each num_heads() call.
# np.array([]) starts out as an empty float array, so the counts are stored as floats.
outcomes = np.array([]) 
for _ in range(10_000):
    # Note that with arrays, append is a FUNCTION,
    # not a METHOD, and is NOT destructive, 
    # unlike with lists! np.append returns a brand-new array,
    # which is why the result must be assigned back to outcomes.
    outcomes = np.append(outcomes, num_heads())

px.histogram(outcomes)

((outcomes >= 40) & (outcomes <= 50)).mean()

0.5207

from scipy.stats import binom
binom.cdf(50, 100, 0.5) - binom.cdf(39, 100, 0.5)

0.5221945185847372

def simulate_classroom(n):
    """Simulate one classroom of n students and check for a shared birthday.

    Each student's birthday is drawn at random, with replacement, from the
    days numbered 1 through 365.

    Returns True if at least 2 of the n students have the same birthday,
    and False otherwise.

    This is not the most efficient solution, but works for now.
    """
    birthdays = np.random.choice(np.arange(1, 366), n, replace=True)
    # Removing duplicates shrinks the array exactly when some birthday repeats.
    distinct_birthdays = np.unique(birthdays)
    return len(distinct_birthdays) != len(birthdays)
def estimated_probability(n, trials=10_000):
    """Estimate the probability that a class of n students has a shared birthday.

    Calls simulate_classroom(n) `trials` times and returns the fraction of
    those simulated classrooms that came back True.

    Parameters:
        n: class size, passed straight to simulate_classroom.
        trials: number of simulated classrooms to average over (default 10,000).

    Returns:
        The proportion of the simulated classrooms in which
        simulate_classroom returned True.

    Raises:
        ValueError: if trials is less than 1, since there is nothing to average.
    """
    if trials < 1:
        raise ValueError(f"trials must be at least 1, got {trials}")
    # Booleans average as 0/1, so the mean is the fraction of True results.
    return np.mean([simulate_classroom(n) for _ in range(trials)])

0.9999

breed,kind,lifetime_cost,longevity,size,weight,height
Brittany,sporting,22589.0,12.92,medium,35.0,19.0
Cairn Terrier,terrier,21992.0,13.84,small,14.0,10.0
English Cocker Spaniel,sporting,18993.0,11.66,medium,30.0,16.0
Cocker Spaniel,sporting,24330.0,12.5,small,25.0,14.5
Shetland Sheepdog,herding,21006.0,12.53,small,22.0,14.5
Siberian Husky,working,22049.0,12.58,medium,47.5,21.75
Lhasa Apso,non-sporting,22031.0,13.92,small,15.0,10.5
Miniature Schnauzer,terrier,20087.0,11.81,small,15.5,13.0
Chihuahua,toy,26250.0,16.5,small,5.5,5.0
English Springer Spaniel,sporting,21946.0,12.54,medium,45.0,19.5
German Shorthaired Pointer,sporting,25842.0,11.46,large,62.5,24.0
Pointer,sporting,24445.0,12.42,large,59.5,25.5
Tibetan Spaniel,non-sporting,25549.0,14.42,small,12.0,10.0
Labrador Retriever,sporting,21299.0,12.04,medium,67.5,23.0
Maltese,toy,19084.0,12.25,small,5.0,9.0
Shih Tzu,toy,21152.0,13.2,small,12.5,9.75
Irish Setter,sporting,20323.0,11.63,large,65.0,26.0
Golden Retriever,sporting,21447.0,12.04,medium,60.0,22.75
Chesapeake Bay Retriever,sporting,16697.0,9.48,large,67.5,23.5
Tibetan Terrier,non-sporting,20336.0,12.31,small,24.0,15.5
Gordon Setter,sporting,19605.0,11.1,large,62.5,25.0
Pug,toy,18527.0,11.0,medium,16.0,16.0
Norfolk Terrier,terrier,24308.0,13.07,small,12.0,9.5
English Toy Spaniel,toy,17521.0,10.1,small,11.0,10.0
Cavalier King Charles Spaniel,toy,18639.0,11.29,small,15.5,12.5
Basenji,hound,22096.0,13.58,medium,23.0,16.5
Staffordshire Bull Terrier,terrier,21650.0,12.05,medium,31.0,15.0
Pembroke Welsh Corgi,herding,23978.0,12.25,small,26.0,11.0
Clumber Spaniel,sporting,18084.0,10.0,medium,70.0,18.5
Dandie Dinmont Terrier,terrier,21633.0,12.17,small,21.0,9.0
Giant Schnauzer,working,26686.0,10.0,large,77.5,25.5
Scottish Terrier,terrier,17525.0,10.69,small,20.0,10.0
Kerry Blue Terrier,terrier,17240.0,9.4,medium,36.5,18.5
Afghan Hound,hound,24077.0,11.92,large,55.0,26.0
Newfoundland,working,19351.0,9.32,large,125.0,27.0
Rhodesian Ridgeback,hound,16530.0,9.1,large,77.5,25.5
Borzoi,hound,16176.0,9.08,large,82.5,28.0
Bull Terrier,terrier,18490.0,10.21,medium,60.0,21.5
Alaskan Malamute,working,21986.0,10.67,large,80.0,24.0
Bloodhound,hound,13824.0,6.75,large,85.0,25.0
Bullmastiff,working,13936.0,7.57,large,115.0,25.5
Mastiff,working,13581.0,6.5,large,175.0,30.0
Saint Bernard,working,20022.0,7.78,large,155.0,26.5

(43, 7)

RangeIndex(start=0, stop=43, step=1)

(43, 6)

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

'Mastiff'

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[40], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797, in Index.get_loc(self, key)
   3792     if isinstance(casted_key, slice) or (
   3793         isinstance(casted_key, abc.Iterable)
   3794         and any(isinstance(x, slice) for x in casted_key)
   3795     ):
   3796         raise InvalidIndexError(key)
-> 3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
   3802     self._check_indexing_error(key)

KeyError: 'breed'

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

def simulate_classroom(n):
    """Simulate one classroom of n students and check for a shared birthday.

    Each student's birthday is drawn at random, with replacement, from the
    days numbered 1 through 365.

    Returns True if at least 2 of the n students have the same birthday,
    and False otherwise.

    This is not the most efficient solution, but works for now.
    """
    birthdays = np.random.choice(np.arange(1, 366), n, replace=True)
    # Removing duplicates shrinks the array exactly when some birthday repeats.
    distinct_birthdays = np.unique(birthdays)
    return len(distinct_birthdays) != len(birthdays)
def estimated_probability(n, trials=10_000):
    """Estimate the probability that a class of n students has a shared birthday.

    Calls simulate_classroom(n) `trials` times and returns the fraction of
    those simulated classrooms that came back True.

    Parameters:
        n: class size, passed straight to simulate_classroom.
        trials: number of simulated classrooms to average over (default 10,000).

    Returns:
        The proportion of the simulated classrooms in which
        simulate_classroom returned True.

    Raises:
        ValueError: if trials is less than 1, since there is nothing to average.
    """
    if trials < 1:
        raise ValueError(f"trials must be at least 1, got {trials}")
    # Booleans average as 0/1, so the mean is the fraction of True results.
    return np.mean([simulate_classroom(n) for _ in range(trials)])

estimated_probability(80)

0.9999

probs = [estimated_probability(n) for n in range(1, 51)]

(
    px
    .bar(x=range(1, 51), 
         y=probs,
         title='Probability that at least 2 students share the<br>same birthday in a class of n students')
    .update_xaxes(title='$n$')
    .update_yaxes(title='Probability')
)

import pandas as pd
import numpy as np

# The "cat" shell command shows you the contents of a file.
!cat data/dogs43.csv

breed,kind,lifetime_cost,longevity,size,weight,height
Brittany,sporting,22589.0,12.92,medium,35.0,19.0
Cairn Terrier,terrier,21992.0,13.84,small,14.0,10.0
English Cocker Spaniel,sporting,18993.0,11.66,medium,30.0,16.0
Cocker Spaniel,sporting,24330.0,12.5,small,25.0,14.5
Shetland Sheepdog,herding,21006.0,12.53,small,22.0,14.5
Siberian Husky,working,22049.0,12.58,medium,47.5,21.75
Lhasa Apso,non-sporting,22031.0,13.92,small,15.0,10.5
Miniature Schnauzer,terrier,20087.0,11.81,small,15.5,13.0
Chihuahua,toy,26250.0,16.5,small,5.5,5.0
English Springer Spaniel,sporting,21946.0,12.54,medium,45.0,19.5
German Shorthaired Pointer,sporting,25842.0,11.46,large,62.5,24.0
Pointer,sporting,24445.0,12.42,large,59.5,25.5
Tibetan Spaniel,non-sporting,25549.0,14.42,small,12.0,10.0
Labrador Retriever,sporting,21299.0,12.04,medium,67.5,23.0
Maltese,toy,19084.0,12.25,small,5.0,9.0
Shih Tzu,toy,21152.0,13.2,small,12.5,9.75
Irish Setter,sporting,20323.0,11.63,large,65.0,26.0
Golden Retriever,sporting,21447.0,12.04,medium,60.0,22.75
Chesapeake Bay Retriever,sporting,16697.0,9.48,large,67.5,23.5
Tibetan Terrier,non-sporting,20336.0,12.31,small,24.0,15.5
Gordon Setter,sporting,19605.0,11.1,large,62.5,25.0
Pug,toy,18527.0,11.0,medium,16.0,16.0
Norfolk Terrier,terrier,24308.0,13.07,small,12.0,9.5
English Toy Spaniel,toy,17521.0,10.1,small,11.0,10.0
Cavalier King Charles Spaniel,toy,18639.0,11.29,small,15.5,12.5
Basenji,hound,22096.0,13.58,medium,23.0,16.5
Staffordshire Bull Terrier,terrier,21650.0,12.05,medium,31.0,15.0
Pembroke Welsh Corgi,herding,23978.0,12.25,small,26.0,11.0
Clumber Spaniel,sporting,18084.0,10.0,medium,70.0,18.5
Dandie Dinmont Terrier,terrier,21633.0,12.17,small,21.0,9.0
Giant Schnauzer,working,26686.0,10.0,large,77.5,25.5
Scottish Terrier,terrier,17525.0,10.69,small,20.0,10.0
Kerry Blue Terrier,terrier,17240.0,9.4,medium,36.5,18.5
Afghan Hound,hound,24077.0,11.92,large,55.0,26.0
Newfoundland,working,19351.0,9.32,large,125.0,27.0
Rhodesian Ridgeback,hound,16530.0,9.1,large,77.5,25.5
Borzoi,hound,16176.0,9.08,large,82.5,28.0
Bull Terrier,terrier,18490.0,10.21,medium,60.0,21.5
Alaskan Malamute,working,21986.0,10.67,large,80.0,24.0
Bloodhound,hound,13824.0,6.75,large,85.0,25.0
Bullmastiff,working,13936.0,7.57,large,115.0,25.5
Mastiff,working,13581.0,6.5,large,175.0,30.0
Saint Bernard,working,20022.0,7.78,large,155.0,26.5

dogs = pd.read_csv('data/dogs43.csv') 
dogs

dogs.head(3)

dogs.tail(2)

dogs

# Note that the index – 0, 1, 2, ... – does **not** count as a column!
dogs.shape

(43, 7)

# Note that the index is no longer 0, 1, 2, ...!
dogs.sort_values('height', ascending=False)

dogs.sort_values(['height', 'longevity'], ascending=False)

dogs

dogs.index

RangeIndex(start=0, stop=43, step=1)

dogs.set_index('breed')

# The above cell didn't involve an assignment statement, so dogs was unchanged.
dogs

# By reassigning dogs, our changes will persist.
# Note that we can't run this cell twice! Try it and see what happens.
dogs = dogs.set_index('breed')
dogs

# There used to be 7 columns, but now there are only 6!
# The index is **not** a column!
dogs.shape

(43, 6)

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

tallest_breed = dogs.sort_values('height', ascending=False).index[0] 
tallest_breed

'Mastiff'

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[40], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797, in Index.get_loc(self, key)
   3792     if isinstance(casted_key, slice) or (
   3793         isinstance(casted_key, abc.Iterable)
   3794         and any(isinstance(x, slice) for x in casted_key)
   3795     ):
   3796         raise InvalidIndexError(key)
-> 3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
   3802     self._check_indexing_error(key)

KeyError: 'breed'

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

array(['sporting', 'terrier', 'herding', 'working', 'non-sporting', 'toy',
       'hound'], dtype=object)

7

kind
sporting        12
terrier          8
working          7
toy              6
hound            5
non-sporting     3
herding          2
Name: count, dtype: int64

11.340697674418605

count     43.00
mean      49.35
std       39.42
          ...  
50%       36.50
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 43, dtype: float64

tallest_breed = dogs.sort_values('height', ascending=False).index[0] 
tallest_breed

'Mastiff'

from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display df, showing at most `rows` rows and `cols` columns.

    The limits apply only to this one display call; pandas' display options
    are restored afterwards. The defaults are whatever pandas' max_rows and
    max_columns options were when this function was defined.
    """
    display_limits = ("display.max_rows", rows, "display.max_columns", cols)
    with pd.option_context(*display_limits):
        display(df)

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[40], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797, in Index.get_loc(self, key)
   3792     if isinstance(casted_key, slice) or (
   3793         isinstance(casted_key, abc.Iterable)
   3794         and any(isinstance(x, slice) for x in casted_key)
   3795     ):
   3796         raise InvalidIndexError(key)
-> 3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
   3802     self._check_indexing_error(key)

KeyError: 'breed'

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

array(['sporting', 'terrier', 'herding', 'working', 'non-sporting', 'toy',
       'hound'], dtype=object)

7

kind
sporting        12
terrier          8
working          7
toy              6
hound            5
non-sporting     3
herding          2
Name: count, dtype: int64

11.340697674418605

count     43.00
mean      49.35
std       39.42
          ...  
50%       36.50
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 43, dtype: float64

'Mastiff'

from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display df, showing at most `rows` rows and `cols` columns.

    The limits apply only to this one display call; pandas' display options
    are restored afterwards. The defaults are whatever pandas' max_rows and
    max_columns options were when this function was defined.
    """
    display_limits = ("display.max_rows", rows, "display.max_columns", cols)
    with pd.option_context(*display_limits):
        display(df)

display_df(dogs.sort_values('weight', ascending=False), rows=43)

dogs

# Returns a Series.
dogs['kind']

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

# Returns a DataFrame.
dogs[['kind', 'size']]

# 🤔
dogs[['kind']]

# Breeds are stored in the index, which is not a column!
dogs['breed']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3790, in Index.get_loc(self, key)
   3789 try:
-> 3790     return self._engine.get_loc(casted_key)
   3791 except KeyError as err:

File index.pyx:152, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:181, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[40], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/frame.py:3896, in DataFrame.__getitem__(self, key)
   3894 if self.columns.nlevels > 1:
   3895     return self._getitem_multilevel(key)
-> 3896 indexer = self.columns.get_loc(key)
   3897 if is_integer(indexer):
   3898     indexer = [indexer]

File ~/miniforge3/envs/pds/lib/python3.10/site-packages/pandas/core/indexes/base.py:3797, in Index.get_loc(self, key)
   3792     if isinstance(casted_key, slice) or (
   3793         isinstance(casted_key, abc.Iterable)
   3794         and any(isinstance(x, slice) for x in casted_key)
   3795     ):
   3796         raise InvalidIndexError(key)
-> 3797     raise KeyError(key) from err
   3798 except TypeError:
   3799     # If we have a listlike key, _check_indexing_error will raise
   3800     #  InvalidIndexError. Otherwise we fall through and re-raise
   3801     #  the TypeError.
   3802     self._check_indexing_error(key)

KeyError: 'breed'

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

dogs

# What are the unique kinds of dogs?
dogs['kind'].unique()

array(['sporting', 'terrier', 'herding', 'working', 'non-sporting', 'toy',
       'hound'], dtype=object)

# How many unique kinds of dogs are there?
dogs['kind'].nunique()

7

# What's the distribution of kinds?
# value_counts is super useful – and I love asking exam questions about it!
dogs['kind'].value_counts()

kind
sporting        12
terrier          8
working          7
toy              6
hound            5
non-sporting     3
herding          2
Name: count, dtype: int64

# What's the mean of the 'longevity' column?
dogs['longevity'].mean()

11.340697674418605

# Tell me more about the 'weight' column.
dogs['weight'].describe()

count     43.00
mean      49.35
std       39.42
          ...  
50%       36.50
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

# Sort the 'lifetime_cost' column. Note that here we're using sort_values on a Series, not a DataFrame!
dogs['lifetime_cost'].sort_values()

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 43, dtype: float64

# Gives us the index of the largest value, not the largest value itself.
# Note that this makes our Activity from a few slides ago way easier!
dogs['height'].idxmax()

'Mastiff'

>>> average_heaviest(5)
16142.8

>>> average_heaviest(1)
13581.0

16142.8

13581.0

breed
Brittany                  2037.92
Cairn Terrier             2038.84
English Cocker Spaniel    2036.66
                           ...   
Bullmastiff               2032.57
Mastiff                   2031.50
Saint Bernard             2032.78
Name: longevity, Length: 43, dtype: float64

breed
Brittany                  1748.37
Cairn Terrier             1589.02
English Cocker Spaniel    1628.90
                           ...   
Bullmastiff               1840.95
Mastiff                   2089.38
Saint Bernard             2573.52
Length: 43, dtype: float64

breed
Maltese                       1557.88
Lhasa Apso                    1582.69
Cairn Terrier                 1589.02
                               ...   
German Shorthaired Pointer    2254.97
Saint Bernard                 2573.52
Giant Schnauzer               2668.60
Length: 43, dtype: float64

'Giant Schnauzer'

breed
Brittany                   68.31
Cairn Terrier              98.64
English Cocker Spaniel     82.56
                           ...  
Bullmastiff               124.60
Mastiff                   137.00
Saint Bernard             155.51
Length: 43, dtype: float64

11.0

pandas.core.indexing._LocIndexer

method

def average_heaviest(n):
    """Return the mean 'lifetime_cost' of the n heaviest rows of `dogs`.

    The module-level `dogs` DataFrame is ordered by 'weight' from largest
    to smallest, the first n rows are kept, and the 'lifetime_cost' values
    of those rows are averaged.
    """
    # Heaviest rows first.
    by_weight_desc = dogs.sort_values('weight', ascending=False)
    # Keep only the top n rows of that ordering.
    heaviest_n = by_weight_desc.head(n)
    return heaviest_n['lifetime_cost'].mean()

average_heaviest(5)

16142.8

average_heaviest(1)

13581.0

dogs

2025 + dogs['longevity']

breed
Brittany                  2037.92
Cairn Terrier             2038.84
English Cocker Spaniel    2036.66
                           ...   
Bullmastiff               2032.57
Mastiff                   2031.50
Saint Bernard             2032.78
Name: longevity, Length: 43, dtype: float64

dogs['lifetime_cost'] / dogs['longevity']

breed
Brittany                  1748.37
Cairn Terrier             1589.02
English Cocker Spaniel    1628.90
                           ...   
Bullmastiff               1840.95
Mastiff                   2089.38
Saint Bernard             2573.52
Length: 43, dtype: float64

(dogs['lifetime_cost'] / dogs['longevity']).sort_values()

breed
Maltese                       1557.88
Lhasa Apso                    1582.69
Cairn Terrier                 1589.02
                               ...   
German Shorthaired Pointer    2254.97
Saint Bernard                 2573.52
Giant Schnauzer               2668.60
Length: 43, dtype: float64

(dogs['lifetime_cost'] / dogs['longevity']).idxmax()

'Giant Schnauzer'

# Compute a BMI-style ratio for every breed: weight (kg) / height (m) squared.
# NOTE(review): the conversion factors below imply 'weight' is stored in pounds
# and 'height' in inches — confirm against the dataset.
weight_kg = dogs['weight'] / 2.2
height_m = dogs['height'] * 2.54 / 100
# Series arithmetic is element-wise, so this yields one value per breed.
bmis = weight_kg / (height_m ** 2)
bmis

breed
Brittany                   68.31
Cairn Terrier              98.64
English Cocker Spaniel     82.56
                           ...  
Bullmastiff               124.60
Mastiff                   137.00
Saint Bernard             155.51
Length: 43, dtype: float64

11.0

pandas.core.indexing._LocIndexer

method

breed
Cocker Spaniel         small
Labrador Retriever    medium
Name: size, dtype: object

5.5

'Chihuahua'

breed
Brittany                  False
Cairn Terrier              True
English Cocker Spaniel    False
                          ...  
Bullmastiff               False
Mastiff                   False
Saint Bernard             False
Name: kind, Length: 43, dtype: bool

8

'large'

# Compute a BMI-style ratio for every breed: weight (kg) / height (m) squared.
# NOTE(review): the conversion factors below imply 'weight' is stored in pounds
# and 'height' in inches — confirm against the dataset.
weight_kg = dogs['weight'] / 2.2
height_m = dogs['height'] * 2.54 / 100
# Series arithmetic is element-wise, so this yields one value per breed.
bmis = weight_kg / (height_m ** 2)
bmis

breed
Brittany                   68.31
Cairn Terrier              98.64
English Cocker Spaniel     82.56
                           ...  
Bullmastiff               124.60
Mastiff                   137.00
Saint Bernard             155.51
Length: 43, dtype: float64

dogs.plot(kind='scatter', x='weight', y='longevity')

# Hover over a point and see what happens!
# reset_index() moves 'breed' out of the index and back into a regular column,
# so that it can be referenced by name in hover_name below.
(
    dogs
    .reset_index()
    .plot(kind='scatter', x='weight', y='longevity', color='size', hover_name='breed',
          title='Longevity vs. Weight for 43 Dog Breeds')
)

# Count how many breeds belong to each 'kind', order the counts from smallest
# to largest, and draw them as a horizontal bar chart.
(
    dogs['kind']
    .value_counts()
    .sort_values()
    .plot(kind='barh', title='Distribution of Dog Kinds')
)

dogs

# The first argument is the row label, i.e. the index value.
#        ↓
dogs.loc['Pug', 'longevity']
#                  ↑
# The second argument is the column label.

11.0

type(dogs.loc)

pandas.core.indexing._LocIndexer

type(dogs.sort_values)

method

dogs

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'size']

breed
Cocker Spaniel         small
Labrador Retriever    medium
Name: size, dtype: object

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], ['kind', 'size', 'height']]

# Note that the 'weight' column is included!
# Per the pandas documentation, label-based slices with loc include both endpoints.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'lifetime_cost': 'weight']

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], :]

# Shortcut for the line above.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever']]

dogs

# Try removing the iloc and see what happens!
dogs.iloc[1:15, :-2]

dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]

5.5

# Finding the breed itself involves sorting, but not iloc, since breeds are stored in the index.
dogs.sort_values('longevity', ascending=False).index[0]

'Chihuahua'

# Height of the second-tallest breed among the four listed below:
# select their 'height' values by label, sort from tallest to shortest,
# and take the value at position 1 (position 0 would be the tallest).
second_tallest_height = (
    dogs
    .loc[['Cocker Spaniel', 'Labrador Retriever', 'Newfoundland', 'Irish Setter'], 'height']
    .sort_values(ascending=False)
    .iloc[1]
)

breed
Brittany                  False
Cairn Terrier              True
English Cocker Spaniel    False
                          ...  
Bullmastiff               False
Mastiff                   False
Saint Bernard             False
Name: kind, Length: 43, dtype: bool

8

'large'

# Height of the second-tallest breed among the four listed below:
# select their 'height' values by label, sort from tallest to shortest,
# and take the value at position 1 (position 0 would be the tallest).
second_tallest_height = (
    dogs
    .loc[['Cocker Spaniel', 'Labrador Retriever', 'Newfoundland', 'Irish Setter'], 'height']
    .sort_values(ascending=False)
    .iloc[1]
)

dogs

dogs['kind'] == 'terrier'

breed
Brittany                  False
Cairn Terrier              True
English Cocker Spaniel    False
                          ...  
Bullmastiff               False
Mastiff                   False
Saint Bernard             False
Name: kind, Length: 43, dtype: bool

dogs.loc[dogs['kind'] == 'terrier']

# This gives us the number of terriers in the dataset.
dogs.loc[dogs['kind'] == 'terrier'].shape[0]

8

dogs.loc[dogs['weight'] >= 50]

# .str.contains is very useful!
dogs.loc[dogs.index.str.contains('Retriever')]

# Because querying is so common, there's a shortcut:
dogs[dogs.index.str.contains('Retriever')]

# Empty DataFrame – not an error!
dogs.loc[dogs['kind'] == 'beaver']

# 'size' of the second-tallest sporting breed:
# keep only rows whose 'kind' is 'sporting', sort them from tallest to shortest,
# pull out the 'size' column, and take the value at position 1.
second_tallest = (
    dogs[dogs['kind'] == 'sporting']
    .sort_values('height', ascending=False)
    ['size']
    .iloc[1]
)
second_tallest

'large'

# 'size' of the second-tallest sporting breed:
# keep only rows whose 'kind' is 'sporting', sort them from tallest to shortest,
# pull out the 'size' column, and take the value at position 1.
second_tallest = (
    dogs[dogs['kind'] == 'sporting']
    .sort_values('height', ascending=False)
    ['size']
    .iloc[1]
)
second_tallest

'large'

Lecture 4¶

Simulation, DataFrame Fundamentals¶

EECS 398-003: Practical Data Science, Fall 2024¶

Announcements 📣¶

Agenda¶

Question 🤔 (Answer at practicaldsc.org/q)

Randomness and simulation¶

`np.random`¶

Simulations¶

Example: Coin flipping¶

Question 🤔 (Answer at practicaldsc.org/q)

Example: The Birthday Paradox¶

Introduction to `pandas` DataFrames¶

`pandas`¶

`pandas` data structures¶

Example: Dog Breeds 🐶¶

Exploring our first DataFrame¶

Sorting¶

Setting the index¶

Activity

Question 🤔 (Answer at practicaldsc.org/q)

💡 Pro-Tip: Displaying more rows/columns¶

Selecting columns from a DataFrame¶

Selecting columns with `[]`¶

Useful Series methods¶

Activity

Series support vectorized operations¶

Activity

Aside: Visualization¶

Selecting slices of a DataFrame¶

Use `loc` to slice rows and columns using labels¶

`loc` is flexible 🧘¶

Use `iloc` to slice rows and columns using positions¶

Activity

Querying¶

Reflection¶

Querying¶

Activity

Lingering questions¶

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589.0	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
2	English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...	...
40	Bullmastiff	working	13936.0	7.57	large	115.0	25.5
41	Mastiff	working	13581.0	6.50	large	175.0	30.0
42	Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Brittany	sporting	22589.0	12.92	medium	35.0	19.0
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	size	height
breed
Cocker Spaniel	sporting	small	14.5
Labrador Retriever	sporting	medium	23.0

	lifetime_cost	longevity	size	weight
breed
Cocker Spaniel	24330.0	12.50	small	25.0
Labrador Retriever	21299.0	12.04	medium	67.5

	kind	lifetime_cost	longevity	size	weight	height
breed
German Shorthaired Pointer	sporting	25842.0	11.46	large	62.5	24.0
Pointer	sporting	24445.0	12.42	large	59.5	25.5
Labrador Retriever	sporting	21299.0	12.04	medium	67.5	23.0
...	...	...	...	...	...	...
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

Lecture 4¶

Simulation, DataFrame Fundamentals¶

EECS 398-003: Practical Data Science, Fall 2024¶

Announcements 📣¶

Agenda¶

Question 🤔 (Answer at practicaldsc.org/q)

Randomness and simulation¶

np.random¶

Simulations¶

Example: Coin flipping¶

Question 🤔 (Answer at practicaldsc.org/q)

Example: The Birthday Paradox¶

Introduction to pandas DataFrames¶

pandas¶

Importing pandas and related libraries¶

pandas data structures¶

Example: Dog Breeds 🐶¶

Exploring our first DataFrame¶

Sorting¶

Setting the index¶

Activity

Question 🤔 (Answer at practicaldsc.org/q)

💡 Pro-Tip: Displaying more rows/columns¶

Selecting columns from a DataFrame¶

Selecting columns with []¶

Useful Series methods¶

Activity

Series support vectorized operations¶

Activity

Aside: Visualization¶

Selecting slices of a DataFrame¶

Use loc to slice rows and columns using labels¶

loc is flexible 🧘¶

Use iloc to slice rows and columns using positions¶

Activity

Querying¶

Reflection¶

Querying¶

Activity

Lingering questions¶

`np.random`¶

Introduction to `pandas` DataFrames¶

`pandas`¶

Importing `pandas` and related libraries¶

`pandas` data structures¶

Selecting columns with `[]`¶

Use `loc` to slice rows and columns using labels¶

`loc` is flexible 🧘¶

Use `iloc` to slice rows and columns using positions¶