from lec_utils import *

from IPython.display import IFrame
IFrame(src='https://www.loom.com/embed/eb06b185428542c391f21e55480a0d2d?sid=3891cb7f-a4c9-4a34-8211-0347a283d413',
       width=400, height=300)

email = '''
Thank you for buying our expensive product!
If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.
If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!
Due to high demand, please allow one-hundred (100) business days for a response.
'''

print(email)

Thank you for buying our expensive product!
If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.
If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!
Due to high demand, please allow one-hundred (100) business days for a response.

def is_possibly_area_code(s):
    '''Does `s` look like (678)?

    Returns True only when `s` is exactly five characters long: an opening
    parenthesis, three decimal digits, and a closing parenthesis.
    '''
    return (len(s) == 5 and
            s.startswith('(') and
            s.endswith(')') and
            # isdecimal() rather than isnumeric(): isnumeric() also accepts
            # characters such as fractions and superscripts, which are not
            # digits of a phone number.
            s[1:4].isdecimal())

is_possibly_area_code('(123)')

True

is_possibly_area_code('(99)')

False

def is_last_7_phone_number(s):
    '''Does `s` look like 999-8212?

    Returns True only when `s` is exactly eight characters long: three
    decimal digits, a hyphen, then four decimal digits.
    '''
    if len(s) != 8 or s[3] != '-':
        return False
    # isdecimal() rather than isnumeric(): isnumeric() also accepts
    # characters such as fractions and superscripts, which are not digits
    # of a phone number.
    return s[:3].isdecimal() and s[4:].isdecimal()

is_last_7_phone_number('999-8212')

True

is_last_7_phone_number('534 1100')

False

print(email)

Thank you for buying our expensive product!
If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.
If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!
Due to high demand, please allow one-hundred (100) business days for a response.

# Split the email on whitespace and strip trailing punctuation from each token.
pieces = [word.rstrip('.,?;"\'') for word in email.split()]
# Walk over adjacent token pairs: an area code immediately followed by a
# 7-digit number is printed as one phone number.
for first, second in zip(pieces, pieces[1:]):
    if is_possibly_area_code(first) and is_last_7_phone_number(second):
        print(first, second)

(800) 867-5309
(800) 123-4567

print(email)

Thank you for buying our expensive product!
If you have a complaint, please send it to complaints@compuserve.com or call (800) 867-5309.
If you are happy with your purchase, please call us at (800) 123-4567; we'd love to hear from you!
Due to high demand, please allow one-hundred (100) business days for a response.

import re
re.findall(r'\(\d{3}\) \d{3}-\d{4}', email)

['(800) 867-5309', '(800) 123-4567']

import re

['ABBBA', 'ABBBBBBBA']

'here is a string for you: billy. here is another: billy'

[]

['cat']

cat

['notumich', 'umich']

['billy@notumich.edu', 'notbilly@umich.edu']

[('oo', '124')]

(90000, 4)

'The Best Exercise To Lose Belly Fat In 2 weeks  https://t.co/oHFToG7rh6 #Exercise #LoseBellyFat #CatTV #TeenWolf… https://t.co/b4pr9gEx38'

import re

re.findall('AB*A', 
           'here is a string for you: ABBBA. here is another: ABBBBBBBA')

['ABBBA', 'ABBBBBBBA']

re.sub('AB*A', 
       'billy', 
       'here is a string for you: ABBBA. here is another: ABBBBBBBA')

'here is a string for you: billy. here is another: billy'

re.findall('\bcat\b', 'my cat is hungry')

[]

re.findall(r'\bcat\b', 'my cat is hungry')

['cat']

# Huh? In a regular (non-raw) string literal, Python reads '\b' as the
# backspace character, so the first pattern above never contained the regex
# word-boundary token. Printing the string shows only 'cat'.
print('\bcat\b')

cat

re.findall(r'\w+@(\w+)\.edu', 
           'my old email was billy@notumich.edu, my new email is notbilly@umich.edu')

['notumich', 'umich']

re.findall(r'\w+@\w+\.edu', 
           'my old email was billy@notumich.edu, my new email is notbilly@umich.edu')

['billy@notumich.edu', 'notbilly@umich.edu']

# A regex that matches strings with two of the same vowel followed by 3 digits.
# We only want to capture the digits, but...
re.findall(r'(aa|ee|ii|oo|uu)(\d{3})', 'eeoo124')

[('oo', '124')]

tweets = pd.read_csv('data/ira.csv', names=['id', 'user', 'time', 'text'])
tweets.head()

tweets.shape

(90000, 4)

example_tweet = tweets['text'].iloc[0]
example_tweet

'The Best Exercise To Lose Belly Fat In 2 weeks  https://t.co/oHFToG7rh6 #Exercise #LoseBellyFat #CatTV #TeenWolf… https://t.co/b4pr9gEx38'

re.findall(r'#(\w+)', example_tweet)

['Exercise', 'LoseBellyFat', 'CatTV', 'TeenWolf']

re.findall(r'#(\w+)', 'hey there, no hashtags here')

[]

tags = tweets['text'].str.findall(r'#(\w+)') 
tags.head()

0    [Exercise, LoseBellyFat, CatTV, TeenWolf]
1                                           []
2                                       [tech]
3                                       [news]
4       [IHatePokemonGoBecause, PokesAreJokes]
Name: text, dtype: object

# Flatten the per-tweet hashtag lists into one Series, then plot the 15 most
# frequent hashtags as a horizontal bar chart.
# A single flattening pass replaces `tags.sum()`, which adds the lists
# together one at a time and builds a new, longer list at every step.
# dropna() skips rows where `.str.findall` produced a missing value instead
# of a list.
all_tags = pd.Series([tag for tag_list in tags.dropna() for tag in tag_list])
(
    all_tags
    .value_counts()
    .head(15)
    .sort_values()
    .plot(kind='barh', title='Most Common Hashtags in IRA Tweets')
)

s = '''132.249.20.188 - - [01/Oct/2024:2:36:15 -0400] "GET /my/home/ HTTP/1.1" 200 2585'''

[('01', 'Oct', '2024', '2', '36', '15')]

[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]

'132.249.20.188 - - [01/Oct/2024:2:36:15 -0400] "GET /my/home/ HTTP/1.1" 200 2585'

[]

'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'

[]

s = '''132.249.20.188 - - [01/Oct/2024:2:36:15 -0400] "GET /my/home/ HTTP/1.1" 200 2585'''

# Capture the six fields inside the square brackets, separated by two slashes
# and then three colons. The trailing ` .+` consumes whatever follows the
# space before the closing bracket (here `-0400`).
# Written as a raw string: `\[`, `\/` and `\]` are not valid Python string
# escapes, so a plain literal triggers an invalid-escape warning on recent
# Python versions. The pattern text itself is unchanged.
exp = r'\[(.+)\/(.+)\/(.+):(.+):(.+):(.+) .+\]'
re.findall(exp, s)

[('01', 'Oct', '2024', '2', '36', '15')]

other_s = '[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(exp, other_s)

[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]

s

'132.249.20.188 - - [01/Oct/2024:2:36:15 -0400] "GET /my/home/ HTTP/1.1" 200 2585'

[]

'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'

[]

s

'132.249.20.188 - - [01/Oct/2024:2:36:15 -0400] "GET /my/home/ HTTP/1.1" 200 2585'

# Stricter timestamp pattern: two-digit day, one uppercase plus two lowercase
# letters for the month, four-digit year, two digits each for hour, minute
# and second, then ` -` and a four-digit offset.
# Written as a raw string: `\[`, `\d`, `\/` and `\]` are not valid Python
# string escapes, so a plain literal triggers an invalid-escape warning on
# recent Python versions. The pattern text itself is unchanged.
# NOTE(review): the hour in `s` is the single digit `2`, so the `(\d{2})`
# hour group cannot match it and `re.findall(new_exp, s)` returns [].
# Confirm whether that is the intended demonstration.
new_exp = r'\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]'
re.findall(new_exp, s)

[]

other_s

'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'

re.findall(new_exp, other_s)

[]

operation	order of op.	example	matches ✅	does not match ❌
concatenation	3	`AABAAB`	`'AABAAB'`	every other string
or	4	`AA|BAAB`	`'AA'`, `'BAAB'`	every other string
closure (zero or more)	2	`AB*A`	`'AA'`, `'ABBBBBBA'`	`'AB'`, `'ABABA'`
parentheses	1	`A(A|B)AAB` `(AB)*A`	`'AAAAB'`, `'ABAAB'` `'A'`, `'ABABABABA'`	every other string `'AA'`, `'ABBA'`

operation	example	matches ✅	does not match ❌
wildcard	`.U.U.U.`	`'CUMULUS'` `'JUGULUM'`	`'SUCCUBUS'` `'TUMULTUOUS'`
character class	`[A-Za-z][a-z]*`	`'word'` `'Capitalized'`	`'camelCase'` `'4illegal'`
at least one	`bi(ll)+y`	`'billy'` `'billlllly'`	`'biy'` `'bily'`
between $i$ and $j$ occurrences	`m[aeiou]{1,2}m`	`'mem'` `'maam'` `'miem'`	`'mm'` `'mooom'` `'meme'`

operation	example	matches ✅	does not match ❌
escape character	`umich\.edu`	`'umich.edu'`	`'umich!edu'`
beginning of line	`^ark`	`'ark two'` `'ark o ark'`	`'dark'`
end of line	`ark$`	`'dark'` `'ark o ark'`	`'ark two'`
zero or one	`cat?`	`'ca'` `'cat'`	`'cart'` (matches `'ca'` only)
built-in character classes*	`\w+` `\d+`	`'billy'` `'231231'`	`'this person'` `'858 people'`
character class negation	`[^a-z]+`	`'WOLVERINE551'` `'1721$$'`	`'porch'` `'billy.edu'`

	id	user	time	text
0	3906258	ea85ac8be1e8ab479064ca4c0fe3ac6587f76b1ef97452...	2016-11-16 09:04	The Best Exercise To Lose Belly Fat In 2 weeks...
1	1051443	8e58ab0f46d273103d9e71aa92cdaffb6e330ec7d15ae5...	2016-12-24 04:31	RT @Philanthropy: Dozens of ‘hate groups’ have...
2	2823399	Room Of Rumor	2016-08-18 20:26	Artificial intelligence can find, map poverty,...
3	272878	San Francisco Daily	2016-03-18 19:28	Uber balks at rules proposed by world’s busies...
4	7697802	41bb9ae5991f53996752a0ab8dd36b543821abca8d5aed...	2016-07-30 15:44	RT @dirtroaddiva1: #IHatePokemonGoBecause he ...

Lecture 11¶

Regular Expressions¶

EECS 398-003: Practical Data Science, Fall 2024¶

Announcements 📣¶

Aside: Spreadsheets¶

Agenda¶

Motivation¶

Who called? 📞¶

Checking formatting¶

Is there a better way?¶

🤯

Basic regular expressions¶

Regular expressions¶

Writing regular expressions¶

Literals¶

Regex building blocks 🧱¶

Activity

Activity

Intermediate regex¶

More regex syntax¶

Activity

Activity

Even more regex syntax¶

Activity

Activity

Regex in Python¶

`re` in Python¶

Raw strings¶

Capture groups¶

Example: Extracting hashtags¶

Extracting hashtags¶

Followup questions¶

Reference Slide¶

Example: Log parsing¶

Reference Slide¶

The more specific, the better!¶

Lecture 11¶

Regular Expressions¶

EECS 398-003: Practical Data Science, Fall 2024¶

Announcements 📣¶

Aside: Spreadsheets¶

Agenda¶

Motivation¶

Who called? 📞¶

Checking formatting¶

Is there a better way?¶

🤯

Basic regular expressions¶

Regular expressions¶

Writing regular expressions¶

Literals¶

Regex building blocks 🧱¶

Activity

Activity

Intermediate regex¶

More regex syntax¶

Activity

Activity

Even more regex syntax¶

Activity

Activity

Regex in Python¶

re in Python¶

Raw strings¶

Capture groups¶

Example: Extracting hashtags¶

Extracting hashtags¶

Followup questions¶

Reference Slide¶

Example: Log parsing¶

Reference Slide¶

The more specific, the better!¶

`re` in Python¶