from lec_utils import *
import matplotlib.pyplot as plt
from PIL import Image
Lecture 3¶
NumPy and Random Simulations¶
EECS 398: Practical Data Science, Winter 2025¶
practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed
Agenda 📆¶
- numpy arrays.
- Multidimensional arrays and linear algebra.
- Randomness and simulations.
See the end of Lecture 2 for a walkthrough video of the final area codes example; we won't touch on it today.
numpy arrays¶
Import statements¶
- We use import statements to add the objects (values, functions, classes) defined in other modules to our programs. There are a few different ways to import.
Other terms I'll use for "module" are "library" and "package".
- Option 1: import module.
Now, every time we want to use a name in module, we must write module.<name>.
import math
math.sqrt(15)
3.872983346207417
sqrt(15)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 1 ----> 1 sqrt(15) NameError: name 'sqrt' is not defined
- Option 2: import module as m.
Now, every time we want to use a name in module, we can write m.<name> instead of module.<name>.
# This is the standard way that we will import numpy.
import numpy as np
np.pi
3.141592653589793
np.linalg.inv([[2, 1],
[3, 4]])
array([[ 0.8, -0.2], [-0.6, 0.4]])
- Option 3: from module import ...
This way, we explicitly state the names we want to import from module.
To import everything, write from module import *.
# Importing a particular function from the requests module.
from requests import get
# This typically fills up the namespace with a lot of unnecessary names, so use sparingly.
from math import *
sqrt
<function math.sqrt(x, /)>
NumPy¶
- NumPy (pronounced "num pie") is a Python library (module) that provides support for arrays and operations on them.
- The pandas library, which we will use for tabular data manipulation, works in conjunction with numpy.
- To use numpy, we need to import it. It's usually imported as np (but doesn't have to be!)
We also had to install it on your computer first, but you already did that when you set up your environment.
import numpy as np
Arrays¶
- The core data structure in numpy is the array. Moving forward, "array" will always refer to a numpy array.
- One way to instantiate an array is to pass a list as an argument to the function np.array.
np.array([4, 9, 1, 2])
array([4, 9, 1, 2])
- Arrays, unlike lists, must be homogeneous – all elements must be of the same type.
# All elements are converted to strings!
np.array([1961, 'michigan'])
array(['1961', 'michigan'], dtype='<U21')
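- (Aside, a minimal sketch not from the lecture: the dtype attribute tells us what type an array's elements were coerced to, and astype converts an array to a different type, creating a new array.)
# Mixing ints and a float: everything is upcast to float64 so the array stays homogeneous.
np.array([1, 2, 3.5]).dtype
# astype returns a new array with the requested type (here, truncating back to ints).
np.array([1, 2, 3.5]).astype(int)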
Array-number arithmetic¶
- Arrays make it easy to perform the same operation on every element without a for-loop.
- This behavior is formally known as "broadcasting", but we often say these operations are vectorized.
temps = [68, 72, 65, 64, 62, 61, 59, 64, 64, 63, 65, 62]
temps
[68, 72, 65, 64, 62, 61, 59, 64, 64, 63, 65, 62]
temp_array = np.array(temps)
# Increase all temperatures by 3 degrees.
temp_array + 3
array([71, 75, 68, 67, 65, 64, 62, 67, 67, 66, 68, 65])
# Halve all temperatures.
temp_array / 2
array([34. , 36. , 32.5, 32. , 31. , 30.5, 29.5, 32. , 32. , 31.5, 32.5, 31. ])
# Convert all temperatures to Celsius.
(5 / 9) * (temp_array - 32)
array([20. , 22.22, 18.33, 17.78, 16.67, 16.11, 15. , 17.78, 17.78, 17.22, 18.33, 16.67])
- Note: In none of the above cells did we actually modify temp_array! Each of those expressions created a new array. To actually change temp_array, we need to reassign it to a new array.
temp_array
array([68, 72, 65, 64, 62, 61, 59, 64, 64, 63, 65, 62])
temp_array = (5 / 9) * (temp_array - 32)
# Now in Celsius!
temp_array
array([20. , 22.22, 18.33, 17.78, 16.67, 16.11, 15. , 17.78, 17.78, 17.22, 18.33, 16.67])
⚠️ The dangers of unnecessary for-loops¶
- Under the hood, numpy is implemented in C and Fortran, which are compiled languages that are much faster than Python. As a result, these vectorized operations are much quicker than if we used a vanilla Python for-loop.
Also, the fact that arrays must be homogeneous lends itself to a more efficient representation in memory.
- We can time code in a Jupyter Notebook. Let's try and square a long sequence of integers and see how long it takes with a Python loop:
%%timeit
squares = []
for i in range(1_000_000):
squares.append(i * i)
29.1 ms ± 270 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
- In vanilla Python, this takes about 0.03 seconds per loop.
In numpy:
%%timeit
squares = np.arange(1_000_000) ** 2
808 μs ± 20.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
- The numpy version only takes about 0.0008 seconds per loop – roughly 35x faster!
Element-wise arithmetic¶
- We can apply arithmetic operations to multiple arrays, provided they have the same length.
- The result is computed element-wise, which means that the arithmetic operation is applied to one pair of elements from each array at a time.
a = np.array([4, 5, -1])
b = np.array([2, 3, 2])
a + b
array([6, 8, 1])
a / b
array([ 2. , 1.67, -0.5 ])
a ** 2 + b ** 2
array([20, 34, 5])
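- If the arrays don't have the same length (and their shapes can't be broadcast together), numpy raises an error rather than guessing – a quick sketch, with the failing line commented out:
# a has length 3; adding an array of length 2 raises a ValueError
# ("operands could not be broadcast together").
# a + np.array([2, 3])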
Array methods¶
- Arrays come equipped with several handy methods; some examples are below, but you can read about them all here.
arr = np.array([3, 8, 4, -3.2])
(2 ** arr).sum()
280.108818820412
(2 ** arr).mean()
70.027204705103
(2 ** arr).max()
256.0
(2 ** arr).argmax()
1
# An attribute, not a method.
arr.shape
(4,)
Question 🤔 (Answer at practicaldsc.org/q)
What questions do we have about arrays so far?
Activity
🎉 Congrats! 🎉 You won the lottery 💰. Here's how your payout works: on the first day of September, you are paid \$0.01. Every day thereafter, your pay doubles, so on the second day you're paid \$0.02, on the third day you're paid \$0.04, on the fourth day you're paid \$0.08, and so on.
September has 30 days.
Write a one-line expression that uses the numbers 2 and 30, along with the function np.arange and at least one array method, that computes the total amount in dollars you will be paid in September. No for-loops or list comprehensions allowed!
We have a 🎥 walkthrough video of this problem, but don't watch it until you've tried it yourself!
(2 ** np.arange(30) / 100).sum()
10737418.23
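- (A quick check, not part of the original activity: the total is a geometric series, $2^0 + 2^1 + \dots + 2^{29} = 2^{30} - 1$ cents, so the closed form agrees with the array-based answer.)
# The closed-form total, converted from cents to dollars.
(2 ** 30 - 1) / 100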
Boolean filtering¶
- Comparisons with arrays yield Boolean arrays! These can be used to answer questions about the values in an array.
temp_array
array([20. , 22.22, 18.33, 17.78, 16.67, 16.11, 15. , 17.78, 17.78, 17.22, 18.33, 16.67])
temp_array >= 18
array([ True, True, True, False, False, False, False, False, False, False, True, False])
- How many values are greater than or equal to 18?
(temp_array >= 18).sum()
4
np.count_nonzero(temp_array >= 18)
4
- What fraction of values are greater than or equal to 18?
(temp_array >= 18).mean()
0.3333333333333333
- Which values are greater than or equal to 18?
temp_array[temp_array >= 18]
array([20. , 22.22, 18.33, 18.33])
- Which values are between 18 and 20?
# Note the parentheses!
temp_array[(temp_array >= 18) & (temp_array <= 20)]
array([20. , 18.33, 18.33])
# WRONG!
temp_array[(temp_array >= 18) and (temp_array <= 20)]
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[42], line 2 1 # WRONG! ----> 2 temp_array[(temp_array >= 18) and (temp_array <= 20)] ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Note: & and | vs. and and or¶
- In Python, the standard symbols for "and" and "or" are, literally, and and or.
if (5 > 3 and 'h' + 'i' == 'hi') or (-2 > 0):
print('success')
success
- But, when taking the element-wise and/or of two arrays, the standard operators don't work.
Instead, use the bitwise operators: & for "and", | for "or".
temp_array
array([20. , 22.22, 18.33, 17.78, 16.67, 16.11, 15. , 17.78, 17.78, 17.22, 18.33, 16.67])
# Don't forget parentheses when using multiple conditions!
temp_array[(temp_array % 2 == 0) | (temp_array == temp_array.min())]
array([20., 15.])
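- Similarly, to take the element-wise "not" of a Boolean array, use ~ instead of Python's not keyword. A small sketch:
# ~ flips each Boolean. Which temperatures are NOT at least 18 degrees?
temp_array[~(temp_array >= 18)]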
- Read more about these differences here.
Multidimensional arrays and linear algebra¶
Multidimensional arrays¶
- A matrix can be represented in code using a two-dimensional (2D) array.
- 2D arrays also resemble tables, or DataFrames, so it's worthwhile to study how they work.
nums = np.array([
[5, 1, 9, 7],
[9, 8, 2, 3],
[2, 5, 0, 4]
])
nums
array([[5, 1, 9, 7], [9, 8, 2, 3], [2, 5, 0, 4]])
# nums has 3 rows and 4 columns.
nums.shape
(3, 4)
- In addition to creating 2D arrays from scratch, we can also create 2D arrays by reshaping other arrays.
# Here, we're asking to reshape np.arange(1, 7)
# so that it has 2 rows and 3 columns.
a = np.arange(1, 7).reshape((2, 3))
a
array([[1, 2, 3], [4, 5, 6]])
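- (A small aside: if you pass -1 as one of the dimensions to reshape, numpy infers it from the total number of elements. A quick sketch:)
# numpy works out that the second dimension must be 3, since there are 6 elements.
np.arange(1, 7).reshape((2, -1))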
Operations along axes¶
- In 2D arrays (and DataFrames), axis 0 refers to the rows (up and down) and axis 1 refers to the columns (left and right).
a
array([[1, 2, 3], [4, 5, 6]])
- If we specify axis=0, a.sum will "compress" along axis 0.
a.sum(axis=0)
array([5, 7, 9])
- If we specify axis=1, a.sum will "compress" along axis 1.
a.sum(axis=1)
array([ 6, 15])
Selecting rows and columns from 2D arrays¶
- You can use [square brackets] to slice rows and columns out of an array, too.
- The general convention is:
array[<row positions>, <column positions>]
a
array([[1, 2, 3], [4, 5, 6]])
# Accesses row 0 and all columns.
a[0, :]
array([1, 2, 3])
# Same as the above.
a[0]
array([1, 2, 3])
# Accesses all rows and column 1.
a[:, 1]
array([2, 5])
# Access all rows and columns 0 and 2.
a[:, [0, 2]]
array([[1, 3], [4, 6]])
# Accesses row 0 and columns 1 and onwards.
a[0, 1:]
array([2, 3])
Activity
Suppose we run the cell below.
s = (5, 3)
grid = np.ones(s) * 2 * np.arange(1, 16).reshape(s)
grid[-1, 1:].sum()
What is the output of the cell? Try and answer without writing any code. See the annotated slides for the solution.
Example: Image processing¶
- Images can be represented as 3D numpy arrays.
- The color of each pixel can be described with three numbers under the RGB model – a red value, green value, and blue value. Each of these can vary from 0 (least intensity) to 255 (most intensity).
Experiment with RGB colors here.
img = np.asarray(Image.open('imgs/junior.jpeg'))
img
array([[[ 98, 62, 46], [ 89, 56, 39], [ 88, 56, 41], ..., [123, 78, 59], [125, 78, 60], [128, 81, 63]], [[ 96, 60, 44], [ 89, 56, 39], [ 89, 57, 42], ..., [124, 79, 60], [125, 78, 60], [127, 80, 62]], [[ 94, 58, 42], [ 89, 56, 39], [ 89, 57, 42], ..., [125, 78, 60], [125, 78, 60], [126, 79, 61]], ..., [[ 89, 50, 11], [ 85, 46, 7], [ 81, 42, 3], ..., [ 94, 68, 33], [100, 74, 39], [108, 82, 47]], [[ 86, 48, 11], [ 84, 46, 9], [ 84, 46, 9], ..., [ 92, 66, 31], [100, 74, 39], [111, 85, 50]], [[ 94, 56, 19], [ 93, 55, 18], [ 95, 57, 21], ..., [ 90, 64, 29], [ 98, 72, 37], [112, 86, 51]]], dtype=uint8)
img.shape
(2048, 1536, 3)
plt.imshow(img)
plt.axis('off');
Applying a grayscale filter¶
- One way to convert an image to grayscale is to average its red, green, and blue values.
mean_2d = img.mean(axis=2)
mean_2d
array([[68.67, 61.33, 61.67, ..., 86.67, 87.67, 90.67], [66.67, 61.33, 62.67, ..., 87.67, 87.67, 89.67], [64.67, 61.33, 62.67, ..., 87.67, 87.67, 88.67], ..., [50. , 46. , 42. , ..., 65. , 71. , 79. ], [48.33, 46.33, 46.33, ..., 63. , 71. , 82. ], [56.33, 55.33, 57.67, ..., 61. , 69. , 83. ]])
# This is just a single channel, so matplotlib applies a default colormap rather than showing grayscale!
plt.imshow(mean_2d.astype(int))
plt.axis('off');
- We need to repeat mean_2d three times along axis 2, to use the same values for the red, green, and blue channels. np.repeat will help us here.
# np.newaxis is an alias for None.
# It helps us introduce an additional axis.
np.arange(5)[:, np.newaxis]
array([[0], [1], [2], [3], [4]])
np.repeat(np.arange(5)[:, np.newaxis], 3, axis=1)
array([[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]])
mean_3d = np.repeat(mean_2d[:, :, np.newaxis], 3, axis=2).astype(int)
mean_3d
array([[[68, 68, 68], [61, 61, 61], [61, 61, 61], ..., [86, 86, 86], [87, 87, 87], [90, 90, 90]], [[66, 66, 66], [61, 61, 61], [62, 62, 62], ..., [87, 87, 87], [87, 87, 87], [89, 89, 89]], [[64, 64, 64], [61, 61, 61], [62, 62, 62], ..., [87, 87, 87], [87, 87, 87], [88, 88, 88]], ..., [[50, 50, 50], [46, 46, 46], [42, 42, 42], ..., [65, 65, 65], [71, 71, 71], [79, 79, 79]], [[48, 48, 48], [46, 46, 46], [46, 46, 46], ..., [63, 63, 63], [71, 71, 71], [82, 82, 82]], [[56, 56, 56], [55, 55, 55], [57, 57, 57], ..., [61, 61, 61], [69, 69, 69], [83, 83, 83]]])
plt.imshow(mean_3d)
plt.axis('off');
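- (Aside, a hedged sketch: averaging the three channels equally is just one way to produce grayscale. A common alternative is a weighted average that reflects how sensitive our eyes are to each color – roughly 0.299 red, 0.587 green, 0.114 blue. The names luma_2d and luma_3d below are just for illustration; this isn't the conversion we used above.)
# Weighted grayscale: 0.299 R + 0.587 G + 0.114 B for each pixel.
luma_2d = img @ np.array([0.299, 0.587, 0.114])
luma_3d = np.repeat(luma_2d[:, :, np.newaxis], 3, axis=2).astype(int)
# plt.imshow(luma_3d); plt.axis('off')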
- Let's sepia-fy Junior! To apply a sepia filter, each pixel's new $(R, G, B)$ values are a fixed linear combination of its old values:
$$\begin{bmatrix} R_{\text{new}} \\ G_{\text{new}} \\ B_{\text{new}} \end{bmatrix} = \begin{bmatrix} 0.393 & 0.769 & 0.189 \\ 0.349 & 0.686 & 0.168 \\ 0.272 & 0.534 & 0.131 \end{bmatrix} \begin{bmatrix} R \\ G \\ B \end{bmatrix}$$
- From here, we can apply this conversion to each pixel.
sepia_filter = np.array([
[0.393, 0.769, 0.189],
[0.349, 0.686, 0.168],
[0.272, 0.534, 0.131]
])
# Multiplies each pixel by the sepia_filter matrix.
# Then, clips each RGB value to be between 0 and 255.
filtered = (img @ sepia_filter.T).clip(0, 255).astype(int)
filtered
array([[[ 94, 84, 65], [ 85, 76, 59], [ 85, 76, 59], ..., [119, 106, 82], [120, 107, 83], [124, 110, 86]], [[ 92, 82, 63], [ 85, 76, 59], [ 86, 77, 60], ..., [120, 107, 83], [120, 107, 83], [123, 109, 85]], [[ 89, 79, 62], [ 85, 76, 59], [ 86, 77, 60], ..., [120, 107, 83], [120, 107, 83], [121, 108, 84]], ..., [[ 75, 67, 52], [ 70, 62, 48], [ 64, 57, 44], ..., [ 95, 84, 66], [103, 92, 71], [114, 101, 79]], [[ 72, 64, 50], [ 70, 62, 48], [ 70, 62, 48], ..., [ 92, 82, 64], [103, 92, 71], [118, 105, 82]], [[ 83, 74, 57], [ 82, 73, 57], [ 85, 75, 59], ..., [ 90, 80, 62], [100, 89, 69], [119, 106, 83]]])
plt.imshow(filtered)
plt.axis('off');
Matrix multiplication¶
- In the coming weeks, we'll start to rely more and more on tools from linear algebra.
You'll need this in Homework 2!
- Suppose the matrix $A$ and vectors $\vec x$ and $\vec y$ are defined as follows:
A = np.array([[2, -5, 1],
[0, 3, 2]])
x = np.array([[1],
[-1],
[4]])
y = np.array([[3],
[-2]])
- We can use numpy to compute various quantities involving $A$, $\vec x$, and $\vec y$.
For instance, what is the result of the product $A \vec x$?
See the annotated slides for the math worked out.
A @ x
array([[11], [ 5]])
- What is the result of the product $A^T \vec y$?
A.T @ y
array([[ 6], [-21], [ -1]])
- What is the result of the product $\vec x^T A^T A \vec x$?
x.T @ A.T @ A @ x
array([[146]])
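- (Aside: we defined $\vec x$ and $\vec y$ as 2D arrays with a single column, which is why the results above are also 2D. 1D arrays work with @ too, and the product of two 1D arrays is their dot product. A quick sketch; x_flat is just an illustrative name:)
x_flat = np.array([1, -1, 4])
# A is (2, 3) and x_flat is (3,), so A @ x_flat is a 1D array of length 2.
A @ x_flat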
Example: Fibonacci numbers¶
The sequence of Fibonacci numbers,
$$1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, ...$$
can be computed using matrix multiplication!
- It can be shown (with induction!) that if $f_1 = 1$, $f_2 = 1$, and $f_{n} = f_{n - 1} + f_{n - 2}$ for $n \geq 3$, then:
$$\begin{bmatrix} 1 & 1 \\ 1 & 0 \end{bmatrix}^n = \begin{bmatrix} f_{n+1} & f_n \\ f_n & f_{n-1} \end{bmatrix}$$
fib = np.array([[1, 1],
[1, 0]])
np.linalg.matrix_power(fib, 8)
array([[34, 21], [21, 13]])
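- So, to read off a particular Fibonacci number, we can look at a single entry of the matrix power. A quick sketch, using the identity above:
# The top-right entry of fib raised to the 8th power is f_8 = 21.
np.linalg.matrix_power(fib, 8)[0, 1]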
Key takeaway: avoid for-loops whenever possible!¶
You can do a lot without for-loops, both in numpy and in pandas.
Randomness and simulations¶
np.random¶
- The submodule np.random contains various functions that produce random results.
These use pseudo-random number generators to generate random-seeming sequences of results.
- np.random.random returns a real number drawn at random from the interval [0, 1).
np.random.random()
0.004201413524692388
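- (Aside, a minimal sketch: because these generators are pseudo-random, we can make "random" results reproducible by setting a seed – rerunning the cell below produces the same number each time.)
# 23 is an arbitrary choice of seed; any fixed integer works.
np.random.seed(23)
np.random.random()
# Newer numpy code often uses a generator object instead, e.g. np.random.default_rng(23).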
- np.random.randint returns a random integer in the specified range.
In the example below, between 1 and 6, inclusive.
# Run this cell multiple times!
np.random.randint(1, 7)
2
Random sampling¶
- np.random.choice and np.random.multinomial allow us to draw random samples.
- np.random.choice returns randomly selected element(s) from the provided sequence.
By default, this is done with replacement, but it can be done without replacement, too.
unique_names = np.load('data/wn25-names.npy')
unique_names
array(['Aayush', 'Abby', 'Abhinav', ..., 'Yun', 'Zhicong', 'Zijie'], dtype='<U10')
# Returns a randomly selected element from the provided array, 5 times, with replacement.
# The resulting array COULD have duplicates.
np.random.choice(unique_names, 5)
array(['Tanish', 'Sanya', 'Xingzhi', 'Haoyan', 'Kovidh'], dtype='<U10')
# Returns a randomly selected element from the provided array, 5 times, without replacement.
# The resulting array CANNOT have duplicates.
np.random.choice(unique_names, 5, replace=False)
array(['Amy', 'In', 'Kiran', 'Xingzhi', 'Caleb'], dtype='<U10')
- np.random.multinomial returns the result of drawing a sample from a multinomial distribution.
For instance, the cell below simulates drawing 15 marbles, with replacement, from a bag in which 30% of the marbles are blue, 50% are orange, and 20% are purple, and counts how many marbles of each color we draw.
np.random.multinomial(15, [0.3, 0.5, 0.2])
array([6, 6, 3])
Simulations¶
- Often, we'll want to estimate the probability of an event, but it may not be possible – or we may not know how – to calculate the probability exactly.
e.g., the probability that I see between 40 and 50 heads when I flip a fair coin 100 times.
- Or, we may have a theoretical answer, and want to validate it using another approach.
- In such cases, we can use the power of simulation.
We can repeat the experiment many, many times, compute the fraction of experiments in which our event occurs, and use this fraction as an estimate of the probability of our event.
This is the basis of Monte Carlo Methods.
- Theory tells us that the more repetitions we perform of our experiment, the closer our fraction will be to the true probability of the event!
Specifically, the Law of Large Numbers tells us this.
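- In code, these simulations usually follow the same recipe: simulate the experiment, repeat it many times, then take the proportion of repetitions in which the event occurred. A minimal sketch, using a made-up event (rolling at least one 6 in four rolls of a fair die):
# Each row is one repetition of the experiment: four die rolls.
rolls = np.random.randint(1, 7, size=(100_000, 4))
# The fraction of repetitions with at least one 6 estimates the probability (roughly 0.52).
(rolls == 6).any(axis=1).mean()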
Example: Coin flipping¶
- Question: What is the probability that I see between 40 and 50 heads, inclusive, when I flip a fair coin 100 times?
- To estimate this probability, we need to:
- Flip 100 fair coins and write down the number of heads,
- and repeat that process many, many times.
np.random.multinomial(100, [0.5, 0.5])
array([54, 46])
np.random.multinomial(100, [0.5, 0.5], 100_000)
array([[55, 45], [50, 50], [53, 47], ..., [40, 60], [46, 54], [57, 43]])
# outcomes is an array with 100,000 elements,
# each of which is the number of heads in 100 simulated flips of a fair coin.
outcomes = np.random.multinomial(100, [0.5, 0.5], 100_000)[:, 0]
outcomes
array([51, 61, 53, ..., 53, 52, 52])
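- (Aside: since each flip is heads with probability 0.5, we could also simulate the head counts directly with np.random.binomial, which should produce an equivalent array of outcomes.)
# 100,000 repetitions of counting heads in 100 flips of a fair coin.
outcomes_alt = np.random.binomial(100, 0.5, size=100_000)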
Estimating a probability from empirical results¶
px.histogram(outcomes, title='Number of Heads in 100 Simulated Flips of a Fair Coin')
- Our estimated probability of seeing between 40 and 50 heads is the fraction (proportion) of experiments in which we saw between 40 and 50 heads:
((outcomes >= 40) & (outcomes <= 50)).mean()
0.52122
- This is remarkably close to the true, theoretical answer, which can be computed using the binomial distribution.
from scipy.special import comb
sum([comb(100, k) for k in range(40, 51)]) / (2 ** 100)
0.5221945185847369
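- (Another hedged way to compute the same theoretical answer, using scipy.stats rather than a sum of combinations:)
from scipy.stats import binom
# P(40 <= X <= 50) for X ~ Binomial(100, 0.5).
binom.cdf(50, 100, 0.5) - binom.cdf(39, 100, 0.5)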
Question 🤔 (Answer at practicaldsc.org/q)
What questions do you have about our coin flipping simulation?
Example: Airplane seats ✈️¶
- A permutation of a sequence is a reshuffling of its elements.
np.random.permutation returns a permutation of the specified sequence.
np.random.permutation(['A', 'B', 'C'])
array(['B', 'C', 'A'], dtype='<U1')
- Suppose a flight on Wolverine Airlines is scheduled for $n$ passengers, all of whom have an assigned seat.
- The airline loses track of seat assignments and everyone sits in a random seat.
What is the probability that nobody is in their originally assigned seat?
Simulating airplane seats¶
- Let's first simulate one hypothetical "plane".
def simulate_one_plane(n, display=False):
"""Simulates one plane of n people.
Returns True if nobody is in their originally assigned seat,
or False if at least one person is.
"""
poss = np.arange(1, n + 1)
shuffled = np.random.permutation(poss)
if display:
print('Real plane: ', poss)
print('Simulated plane:', shuffled)
return (poss != shuffled).all()
simulate_one_plane(10, display=True)
Real plane: [ 1 2 3 4 5 6 7 8 9 10] Simulated plane: [10 2 6 3 1 9 5 8 4 7]
False
- Now, let's call simulate_one_plane many, many times and compute the proportion of calls that return True.
We'll arbitrarily start with $n = 50$, but change it and see what happens to the answer!
n = 50
prob = np.mean([simulate_one_plane(n) for _ in range(100_000)])
prob
0.36961
Airplane seats, solved¶
- The probability that nobody is in their originally assigned seat is the probability that a random permutation of $n$ seats has no fixed points (a "derangement"), which works out to:
$$\sum_{k = 0}^{n} \frac{(-1)^k}{k!} = 1 - \frac{1}{1!} + \frac{1}{2!} - \frac{1}{3!} + ... \pm \frac{1}{n!}$$
- Fact: As $n \rightarrow \infty$, this approaches $\frac{1}{e} \approx 0.368$.
1 / np.e
0.36787944117144233
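- (A quick check of this fact in code, as an aside: the partial sums of the series above approach $\frac{1}{e}$ very quickly.)
from math import factorial
# The exact probability for a plane with n = 10 seats; already very close to 1 / e.
sum((-1) ** k / factorial(k) for k in range(11))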
Curious to learn more?¶
- Watch the following video by Numberphile, a popular YouTube channel for math-related content.
YouTubeVideo('pbXg5EI5t4c')
- Simulations are powerful!
What's next?¶
- There is no lecture on Monday, due to MLK Day.
- In Lecture 4 next Wednesday, we'll do a deep dive into pandas DataFrames.
- Homework 2, which will be released by Saturday, will contain some content from today's lecture and some content from Lecture 4.