In [1]:
from lec_utils import *

Lecture 7¶

Exploratory Data Analysis, Data Cleaning, and Visualization¶

EECS 398-003: Practical Data Science, Fall 2024¶

practicaldsc.org • github.com/practicaldsc/fa24

Announcements 📣¶

  • Homework 3 is due on Thursday. See this post on Ed for an important clarification.

  • We've slightly adjusted the Office Hours schedule – take a look, and please come by.
    I have office hours after lecture on both Tuesday and Thursday now!

  • study.practicaldsc.org contains our discussion worksheets (and solutions), which are made up of old exam problems. Use these problems to build your theoretical understanding of the material!

Agenda¶

  • Merging practice problem.
  • Exploratory data analysis.
  • Data cleaning.
  • Visualization.

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Consider the DataFrames div_one (left) and coach (right), shown below.


What should we fill the blank in the 'Region' column of coach with so that the DataFrame div_one.merge(coach, on='Region') has 9 rows?

  • A. 'South'.
  • B. 'West'.
  • C. 'East'.
  • D. 'Midwest'.

After you've voted, try it out yourself below.
ChatGPT doesn't even know the right answer – try asking it!

In [2]:
div_one = pd.DataFrame().assign(
    Team=['Triton Circus', 'Wolverines', 'Golden Bears', 'Sooners', 'Patriots', 'Bruins'],
    Region=['Midwest', 'Midwest', 'East', 'Midwest', 'West', 'West']
)
coach = pd.DataFrame().assign(
    Team=['Triton Circus', 'Wolverines', 'Golden Bears', 'Sooners', 'Patriots', 'Bruins'],
    Coach=['Coach Jason', 'Coach Jack', 'Coach Jason', 'Coach Ashley', 'Coach Nick', 'Coach Zoe'],
    Region=['____', 'Midwest', 'Midwest', 'East', 'East', 'South'] # Test this out once you've guessed!
)
# div_one.merge(coach, on='Region').shape[0]
In [ ]:
 
In [ ]:
 

Exploratory data analysis¶


Dataset overview¶

  • LendingClub is a platform that allows individuals to borrow money – that is, take on loans.
  • Each row of the dataset corresponds to a different loan that the LendingClub approved and paid out.
    The full dataset is over 300 MB, so we've sampled a subset for this lecture.
In [3]:
loans = pd.read_csv('data/loans.csv')
In [4]:
# Each time you run this cell, you'll see a different random subset of the DataFrame.
loans.sample(5)
Out[4]:
id loan_amnt issue_d term ... fico_range_low fico_range_high hardship_flag mths_since_last_delinq
1345 120022770 20000.0 Sep-2017 36 months ... 680.0 684.0 N 10.0
1659 129128743 14000.0 Feb-2018 36 months ... 800.0 804.0 N NaN
5866 143962984 15000.0 Nov-2018 60 months ... 765.0 769.0 N NaN
5828 145216436 15000.0 Dec-2018 36 months ... 700.0 704.0 N NaN
1012 66485750 10000.0 Dec-2015 36 months ... 830.0 834.0 N NaN

5 rows × 20 columns

In [5]:
# When a DataFrame has more columns than you can see in its preview,
# it's a good idea to check the names of all columns.
loans.columns
Out[5]:
Index(['id', 'loan_amnt', 'issue_d', 'term', 'int_rate', 'grade', 'sub_grade',
       'emp_title', 'verification_status', 'home_ownership', 'annual_inc',
       'loan_status', 'purpose', 'desc', 'addr_state', 'dti', 'fico_range_low',
       'fico_range_high', 'hardship_flag', 'mths_since_last_delinq'],
      dtype='object')
  • Not all of the columns are necessarily interesting, but a few that might be are previewed below.
    FICO scores refer to credit scores.

In [6]:
# Again, run this a few times to get a sense of the typical values.
loans[['loan_amnt', 'issue_d', 'term', 'int_rate', 'emp_title', 'fico_range_low']].sample(5)
Out[6]:
loan_amnt issue_d term int_rate emp_title fico_range_low
2110 29600.0 Aug-2011 60 months 19.69 cbs corporation 690.0
272 29725.0 Aug-2018 60 months 18.94 account executive 725.0
805 5400.0 Oct-2017 36 months 10.91 auto body technician 710.0
1029 24000.0 Dec-2015 60 months 13.67 enterprise architect 670.0
3886 17225.0 Apr-2016 36 months 16.29 research project manager 700.0

Lender decision-making¶

  • Higher interest rates make the loan more expensive for the borrower – as a borrower, you want a lower interest rate!
  • Even for the same loan amount, different borrowers were approved for different terms and interest rates:
In [7]:
display_df(loans.loc[loans['loan_amnt'] == 3600, ['loan_amnt', 'term', 'int_rate']], rows=17)
loan_amnt term int_rate
249 3600.0 36 months 24.50
626 3600.0 36 months 13.99
1020 3600.0 36 months 11.49
2141 3600.0 36 months 10.08
2145 3600.0 36 months 5.32
2584 3600.0 36 months 16.29
2739 3600.0 36 months 8.24
3845 3600.0 36 months 13.66
4153 3600.0 36 months 12.59
4368 3600.0 36 months 14.08
4575 3600.0 36 months 16.46
4959 3600.0 36 months 10.75
4984 3600.0 36 months 13.99
5478 3600.0 60 months 10.59
5560 3600.0 36 months 19.99
5693 3600.0 36 months 13.99
6113 3600.0 36 months 15.59
  • Why do different borrowers receive different terms and interest rates?

Exploratory data analysis (EDA)¶

  • Historically, data analysis was dominated by formal statistics, including tools like confidence intervals, hypothesis tests, and statistical modeling.
  • In 1977, John Tukey defined the term exploratory data analysis, which describes a philosophy for how to approach data analysis:

    Exploratory data analysis is actively incisive, rather than passively descriptive, with real emphasis on the discovery of the unexpected.

  • Practically, EDA involves, among other things, computing summary statistics and drawing plots to understand the nature of the data at hand.

    The greatest gains from data come from surprises… The unexpected is best brought to our attention by pictures.

  • We'll discuss specific visualization techniques towards the end of the lecture.

Terminology¶

  • Individual (row): Person/place/thing for which data is recorded. Also called an observation.
  • Feature (column): Something that is recorded for each individual. Also called a variable or attribute.
    Here, "variable" doesn't mean Python variable!
  • There are two key types of features:
    • Numerical features: It makes sense to do arithmetic with the values.
    • Categorical features: Values fall into categories that may or may not have an inherent order.

Feature types¶

(Numerical features can be discrete or continuous; categorical features can be ordinal, meaning their categories have an order, or nominal, meaning they don't.)

Question 🤔 (Answer at practicaldsc.org/q)

Remember that you can always ask questions anonymously at the link above!

Which of these is not a numerical feature?

  • A. Fuel economy in miles per gallon.
  • B. Zip codes.
  • C. Number of semesters at Michigan.
  • D. Bank account number.
  • E. More than one of these are not numerical features.

Feature types vs. data types¶

  • The data type pandas uses is not the same as the "feature type" we talked about just now!
    There's a difference between feature type (categorical, numerical) and computational data type (string, int, float, etc.).
  • Be careful: sometimes numerical features are stored as strings, and categorical features are stored as numbers.
In [8]:
# The 'id's are stored as numbers, but are categorical (nominal).
# Are these loan 'id's (unique to each loan) or customer 'id's (which could be duplicated)?
# We'll investigate soon!
loans['id']
Out[8]:
0        17965023
1       111414087
2        95219557
          ...    
6297     63990101
6298     37641672
6299     50587446
Name: id, Length: 6300, dtype: int64
In [9]:
# Loan 'term's are stored as strings, but are actually numerical (discrete).
loans['term']
Out[9]:
0        60 months
1        36 months
2        36 months
           ...    
6297     60 months
6298     60 months
6299     60 months
Name: term, Length: 6300, dtype: object

Data cleaning¶


Four pillars of data cleaning¶

When loading in a dataset, to clean the data – that is, to prepare it for further analysis – we will:

  1. Perform data quality checks.
  2. Identify and handle missing values.
  3. Perform transformations, including converting time series data to timestamps.
  4. Modify structure as necessary.

Data cleaning: Data quality checks¶


Data quality checks¶

We often start an analysis by checking the quality of the data.

  • Scope: Do the data match your understanding of the population?
  • Measurements and values: Are the values reasonable?
  • Relationships: Are related features in agreement?
  • Analysis: Which features might be useful in a future analysis?

Scope: Do the data match your understanding of the population?¶

  • Who does the data represent?
  • We were told that we're only looking at approved loans. What's the distribution of 'loan_status'es?
In [10]:
loans.columns 
Out[10]:
Index(['id', 'loan_amnt', 'issue_d', 'term', 'int_rate', 'grade', 'sub_grade',
       'emp_title', 'verification_status', 'home_ownership', 'annual_inc',
       'loan_status', 'purpose', 'desc', 'addr_state', 'dti', 'fico_range_low',
       'fico_range_high', 'hardship_flag', 'mths_since_last_delinq'],
      dtype='object')
In [11]:
loans['loan_status'].value_counts() 
Out[11]:
loan_status
Fully Paid                                            3025
Current                                               2367
Charged Off                                            814
Late (31-120 days)                                      59
In Grace Period                                         23
Late (16-30 days)                                       11
Does not meet the credit policy. Status:Fully Paid       1
Name: count, dtype: int64
  • Are the loans in our dataset specific to any particular state?
In [12]:
loans['addr_state'].value_counts().head() 
Out[12]:
addr_state
CA    905
TX    553
NY    498
FL    420
IL    261
Name: count, dtype: int64
  • Were loans only given to applicants with excellent credit? Or were applicants with average credit approved for loans too?
In [13]:
loans['fico_range_low'].describe() 
Out[13]:
count    6300.00
mean      698.18
std        32.50
          ...   
50%       690.00
75%       715.00
max       845.00
Name: fico_range_low, Length: 8, dtype: float64

Measurements and values: Are the values reasonable?¶

  • What's the distribution of 'loan_amnt's?
    Run the cell below, then read this article by the LendingClub.
In [14]:
# It seems like no loans were above $40,000.
loans['loan_amnt'].agg(['min', 'median', 'mean', 'max']) 
Out[14]:
min        1000.0
median    15000.0
mean      15568.8
max       40000.0
Name: loan_amnt, dtype: float64
  • What kinds of information does the loans DataFrame even hold?
In [15]:
# The "object" dtype in pandas refers to anything that is not numeric/Boolean/time-related,
# including strings.
loans.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6300 entries, 0 to 6299
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      6300 non-null   int64  
 1   loan_amnt               6300 non-null   float64
 2   issue_d                 6300 non-null   object 
 3   term                    6300 non-null   object 
 4   int_rate                6300 non-null   float64
 5   grade                   6300 non-null   object 
 6   sub_grade               6300 non-null   object 
 7   emp_title               6300 non-null   object 
 8   verification_status     6300 non-null   object 
 9   home_ownership          6300 non-null   object 
 10  annual_inc              6300 non-null   float64
 11  loan_status             6300 non-null   object 
 12  purpose                 6300 non-null   object 
 13  desc                    324 non-null    object 
 14  addr_state              6300 non-null   object 
 15  dti                     6299 non-null   float64
 16  fico_range_low          6300 non-null   float64
 17  fico_range_high         6300 non-null   float64
 18  hardship_flag           6300 non-null   object 
 19  mths_since_last_delinq  3120 non-null   float64
dtypes: float64(7), int64(1), object(12)
memory usage: 984.5+ KB
  • What's going on in the 'id' column of loans?
In [16]:
# Are there multiple rows with the same 'id'?
# That is, are they person 'id's or loan 'id's?
loans['id'].value_counts().max() 
Out[16]:
1

Relationships: Are related features in agreement?¶

  • Why are there two columns with credit scores, 'fico_range_low' and 'fico_range_high'? What do they both mean?
In [17]:
loans[['fico_range_low', 'fico_range_high']] 
Out[17]:
fico_range_low fico_range_high
0 700.0 704.0
1 680.0 684.0
2 705.0 709.0
... ... ...
6297 675.0 679.0
6298 660.0 664.0
6299 685.0 689.0

6300 rows × 2 columns

In [18]:
(loans['fico_range_high'] - loans['fico_range_low']).value_counts() 
Out[18]:
4.0    6298
5.0       2
Name: count, dtype: int64
  • Does every 'sub_grade' align with its related 'grade'?
In [19]:
loans[['grade', 'sub_grade']] 
Out[19]:
grade sub_grade
0 D D3
1 C C5
2 A A5
... ... ...
6297 E E2
6298 C C4
6299 B B3

6300 rows × 2 columns

In [20]:
# Turns out, the answer is yes!
# The .str accessor allows us to use the [0] operation
# on every string in loans['sub_grade'].
(loans['sub_grade'].str[0] == loans['grade']).all() 
Out[20]:
True

Data cleaning: Missing values¶


Missing values¶

  • Next, it's important to check for and handle missing values, i.e. null values, as they can have a big effect on your analysis.
In [21]:
# Note that very few of the 'desc' (description) values are non-null!
loans.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6300 entries, 0 to 6299
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      6300 non-null   int64  
 1   loan_amnt               6300 non-null   float64
 2   issue_d                 6300 non-null   object 
 3   term                    6300 non-null   object 
 4   int_rate                6300 non-null   float64
 5   grade                   6300 non-null   object 
 6   sub_grade               6300 non-null   object 
 7   emp_title               6300 non-null   object 
 8   verification_status     6300 non-null   object 
 9   home_ownership          6300 non-null   object 
 10  annual_inc              6300 non-null   float64
 11  loan_status             6300 non-null   object 
 12  purpose                 6300 non-null   object 
 13  desc                    324 non-null    object 
 14  addr_state              6300 non-null   object 
 15  dti                     6299 non-null   float64
 16  fico_range_low          6300 non-null   float64
 17  fico_range_high         6300 non-null   float64
 18  hardship_flag           6300 non-null   object 
 19  mths_since_last_delinq  3120 non-null   float64
dtypes: float64(7), int64(1), object(12)
memory usage: 984.5+ KB
In [22]:
# Run this repeatedly to read a random sample of loan descriptions.
for desc in loans.loc[loans['desc'].notna(), 'desc'].sample(3):
    print(desc + '\n')
  Borrower added on 04/24/13 > To pay off credit card debt.<br>

  Borrower added on 04/19/13 > Have three credit cards that have teaser rates of zero percent APR's that are about to expire.<br>

  Borrower added on 05/16/13 > for credit cards and personal things<br><br> Borrower added on 05/16/13 > I need it for my credit cards and personal items<br><br> Borrower added on 05/16/13 > I need it for my credit cards and would like to use it for things that I would like to get.<br><br> Borrower added on 05/16/13 > I would like it for credit cards and personal things I would like to get for myself.  I work hard and think it would be nice to get a few things I would like to get at one time instead of waiting until I can save enough money that never seems to work.  I save a little but not alot in good time......<br>

  • The .isna() Series method checks whether each element in a Series is missing (True) or present (False). .notna() does the opposite.
In [23]:
# The percentage of values in each column that are missing.
loans.isna().mean().sort_values(ascending=False) * 100 
Out[23]:
desc                      94.86
mths_since_last_delinq    50.48
dti                        0.02
                          ...  
term                       0.00
issue_d                    0.00
annual_inc                 0.00
Length: 20, dtype: float64
  • It appears that applicants who submitted descriptions with their loan applications were given higher interest rates on average than those who didn't submit descriptions.
    But, this could've happened for a variety of reasons, not just because they submitted a description.
In [24]:
(
    loans
    .assign(submitted_description=loans['desc'].notna())
    .groupby('submitted_description')
    ['int_rate']
    .agg(['mean', 'median'])
)
Out[24]:
mean median
submitted_description
False 13.09 12.62
True 13.68 13.68
  • There are many ways of handling missing values, which we'll discuss more in a future lecture. But a good first step is to check how many there are!
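  • For concreteness, below is a minimal sketch (not from the original notebook) of two common strategies: dropping rows with missing values, and filling them in with a constant. Whether either is appropriate depends on why the values are missing.
In [ ]:
# Drops the single row in which 'dti' is missing; loans itself is unchanged.
loans.dropna(subset=['dti'])
# Fills the missing 'mths_since_last_delinq' values with 0 – simple, but often misleading!
loans['mths_since_last_delinq'].fillna(0)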

Aside: Series operations with null values¶

  • The numpy/pandas null value, np.nan, is typically ignored when using pandas operations, like the Series .sum() and .mean() methods.
In [25]:
# Note the NaN at the very bottom.
loans['mths_since_last_delinq']
Out[25]:
0       72.0
1        6.0
2       66.0
        ... 
6297    39.0
6298    22.0
6299     NaN
Name: mths_since_last_delinq, Length: 6300, dtype: float64
In [26]:
loans['mths_since_last_delinq'].sum() 
Out[26]:
106428.0
  • But, np.nans typically aren't ignored when using regular Python operations.
In [27]:
sum(loans['mths_since_last_delinq']) 
Out[27]:
nan
  • As an aside, the regular Python null value is None.
In [28]:
None
In [29]:
np.nan
Out[29]:
nan

Data cleaning: Transformations and timestamps¶


Transformations¶

  • A transformation results from performing some operation on every element in a sequence, e.g. a Series.
  • When preparing data for analysis, we often need to:
    • Perform type conversions (e.g. changing the string '$2.99' to the float 2.99).
    • Perform unit conversions (e.g. feet to meters).
    • Extract relevant information from strings.
  • For example, we can't currently use the 'term' column to do any calculations, since its values are stored as strings (despite being numerical).
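  • Before turning to 'term', here's a minimal sketch of the first kind of transformation listed above – converting a price string like '$2.99' to a float. The prices Series below is hypothetical, not part of the loans dataset.
In [ ]:
# A hypothetical Series of price strings.
prices = pd.Series(['$2.99', '$10.50', '$7.25'])
# Strip the '$' and convert the remaining digits to floats.
prices.str.replace('$', '', regex=False).astype(float)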
In [30]:
loans['term']
Out[30]:
0        60 months
1        36 months
2        36 months
           ...    
6297     60 months
6298     60 months
6299     60 months
Name: term, Length: 6300, dtype: object
  • Many of the values in 'emp_title' are stored inconsistently – values that mean the same thing appear in different forms. Without further cleaning, this makes it harder to, for example, find the total number of nurses who were given loans.
In [31]:
(loans['emp_title'] == 'registered nurse').sum() 
Out[31]:
252
In [32]:
(loans['emp_title'] == 'nurse').sum() 
Out[32]:
101
In [33]:
(loans['emp_title'] == 'rn').sum() 
Out[33]:
35

One solution: The apply method¶

  • The Series apply method allows us to use a function on every element in a Series.
In [34]:
def clean_term(term_string):
    return int(term_string.split()[0])
In [35]:
loans['term'].apply(clean_term) 
Out[35]:
0       60
1       36
2       36
        ..
6297    60
6298    60
6299    60
Name: term, Length: 6300, dtype: int64
  • There is also an apply method for DataFrames, in which you can use a function on every row (if you set axis=1) or every column (if you set axis=0) of a DataFrame.
  • There is also an apply method for DataFrameGroupBy/SeriesGroupBy objects, as you're seeing in Question 2.3 of Homework 3!
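  • For reference, here are minimal sketches (not from the original notebook) of the DataFrame and groupby versions of apply mentioned above, using the loans DataFrame from earlier.
In [ ]:
# With axis=1, the function is called once per row;
# here, each loan's amount as a fraction of the borrower's annual income.
loans.apply(lambda row: row['loan_amnt'] / row['annual_inc'], axis=1)
# On a GroupBy object, apply calls the function once per group's sub-DataFrame.
loans.groupby('grade').apply(lambda df: df['int_rate'].mean())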

The price of apply¶

  • Unfortunately, apply runs really slowly!
In [36]:
%%timeit
loans['term'].apply(clean_term)
1.67 ms ± 81.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
In [37]:
%%timeit
res = []
for term in loans['term']:
    res.append(clean_term(term))
1.36 ms ± 17.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
  • Internally, apply actually just runs a for-loop!
  • So, when possible – say, when applying arithmetic operations – we should work on Series objects directly and avoid apply!
In [38]:
%%timeit
loans['int_rate'] // 10 * 10 # Rounds down to the nearest multiple of 10.
89.8 μs ± 803 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
In [39]:
%%timeit
loans['int_rate'].apply(lambda y: y // 10 * 10)
716 μs ± 6.95 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
  • Above, the solution involving apply is ~10x slower than the one that uses direct vectorized operations.

The .str accessor¶

  • For string operations, pandas provides a convenient .str accessor.
    You've seen examples of it in practice already, with .str.contains, and also with .str[0] earlier in today's lecture.
  • Mental model: the operations that come after .str are used on every single element of the Series that comes before .str.
In [40]:
# Here, we use .split() on every string in loans['term'].
loans['term'].str.split() 
Out[40]:
0       [60, months]
1       [36, months]
2       [36, months]
            ...     
6297    [60, months]
6298    [60, months]
6299    [60, months]
Name: term, Length: 6300, dtype: object
In [41]:
loans['term'].str.split().str[0].astype(int) 
Out[41]:
0       60
1       36
2       36
        ..
6297    60
6298    60
6299    60
Name: term, Length: 6300, dtype: int64
  • One might think that the .str accessor is quicker than apply, but it's actually even slower. Still, we use it in practice since it allows us to write concise code.
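  • As an example that ties back to the inconsistently-stored 'emp_title' values from earlier, here's a minimal sketch (not from the original notebook) of a first pass at standardizing them; the synonym mapping is illustrative, not exhaustive.
In [ ]:
# Lowercase and strip whitespace from every title, then map a couple of known synonyms.
(
    loans['emp_title']
    .str.strip()
    .str.lower()
    .replace({'rn': 'registered nurse', 'nurse': 'registered nurse'})
    .value_counts()
    .head()
)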

Creating timestamps ⏱️¶

  • When dealing with values containing dates and times, it's good practice to convert the values to "timestamp" objects.
In [42]:
# Stored as strings.
loans['issue_d']
Out[42]:
0       Jun-2014
1       Jun-2017
2       Dec-2016
          ...   
6297    Nov-2015
6298    Dec-2014
6299    Jun-2015
Name: issue_d, Length: 6300, dtype: object
  • To do so, we use the pd.to_datetime function.
    It takes in a date format string; you can see examples of how they work here.
In [43]:
pd.to_datetime(loans['issue_d'], format='%b-%Y') 
Out[43]:
0      2014-06-01
1      2017-06-01
2      2016-12-01
          ...    
6297   2015-11-01
6298   2014-12-01
6299   2015-06-01
Name: issue_d, Length: 6300, dtype: datetime64[ns]

Aside: The pipe method🚰¶

  • There are a few steps we've performed to clean up our dataset.
    • Convert loan 'term's to integers.
    • Convert loan issue dates, 'issue_d's, to timestamps.
  • When we manipulate DataFrames, it's best to define individual functions for each step, then use the pipe method to chain them all together.
    The pipe method takes in a function, which itself takes in a DataFrame and returns a DataFrame.
In [44]:
def clean_term_column(df):
    return df.assign(
        term=df['term'].str.split().str[0].astype(int)
    )
def clean_date_column(df):
    return (
        df
        .assign(date=pd.to_datetime(df['issue_d'], format='%b-%Y'))
        .drop(columns=['issue_d'])
    )
In [45]:
loans = (
    pd.read_csv('data/loans.csv')
    .pipe(clean_term_column)
    .pipe(clean_date_column)
)
loans
Out[45]:
id loan_amnt term int_rate ... fico_range_high hardship_flag mths_since_last_delinq date
0 17965023 18000.0 60 16.99 ... 704.0 N 72.0 2014-06-01
1 111414087 10000.0 36 16.02 ... 684.0 N 6.0 2017-06-01
2 95219557 12800.0 36 7.99 ... 709.0 N 66.0 2016-12-01
... ... ... ... ... ... ... ... ... ...
6297 63990101 10800.0 60 18.49 ... 679.0 N 39.0 2015-11-01
6298 37641672 15000.0 60 14.31 ... 664.0 N 22.0 2014-12-01
6299 50587446 14000.0 60 9.99 ... 689.0 N NaN 2015-06-01

6300 rows × 20 columns

In [46]:
# Same as above, just way harder to read and write.
clean_date_column(clean_term_column(pd.read_csv('data/loans.csv'))) 
Out[46]:
id loan_amnt term int_rate ... fico_range_high hardship_flag mths_since_last_delinq date
0 17965023 18000.0 60 16.99 ... 704.0 N 72.0 2014-06-01
1 111414087 10000.0 36 16.02 ... 684.0 N 6.0 2017-06-01
2 95219557 12800.0 36 7.99 ... 709.0 N 66.0 2016-12-01
... ... ... ... ... ... ... ... ... ...
6297 63990101 10800.0 60 18.49 ... 679.0 N 39.0 2015-11-01
6298 37641672 15000.0 60 14.31 ... 664.0 N 22.0 2014-12-01
6299 50587446 14000.0 60 9.99 ... 689.0 N NaN 2015-06-01

6300 rows × 20 columns

Working with timestamps¶

  • We often want to adjust the granularity of timestamps to see overall trends, or seasonality.
  • To do so, use the resample DataFrame method (documentation).
    Think of it like a version of groupby, but for timestamps.
In [47]:
# This shows us the average interest rate given out to loans in every 6 month interval.
loans.resample('6M', on='date')['int_rate'].mean() 
Out[47]:
date
2008-03-31    10.71
2008-09-30     8.63
2009-03-31    12.13
              ...  
2018-03-31    12.85
2018-09-30    12.72
2019-03-31    12.93
Freq: 6M, Name: int_rate, Length: 23, dtype: float64
  • We can also do arithmetic with timestamps.
In [48]:
# Not meaningful in this example, but possible.
loans['date'].diff() 
Out[48]:
0            NaT
1      1096 days
2      -182 days
          ...   
6297   -517 days
6298   -335 days
6299    182 days
Name: date, Length: 6300, dtype: timedelta64[ns]
In [49]:
# If each loan was for 60 months,
# this is a Series of when they'd end.
# Unfortunately, pd.DateOffset isn't vectorized, so
# if you want to use a different month offset for each row
# (as we'd need to, since some loans are 36 months
# and some are 60 months), you'd need to use `.apply`.
loans['date'] + pd.DateOffset(months=60) 
Out[49]:
0      2019-06-01
1      2022-06-01
2      2021-12-01
          ...    
6297   2020-11-01
6298   2019-12-01
6299   2020-06-01
Name: date, Length: 6300, dtype: datetime64[ns]
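  • As the comment above hints, here's a minimal sketch (not from the original notebook) of offsetting each loan by its own 'term' using a row-wise apply – correct, but much slower than the vectorized operations above.
In [ ]:
# Each loan's projected end date: its start date plus its own term, in months.
loans.apply(lambda row: row['date'] + pd.DateOffset(months=int(row['term'])), axis=1)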

The .dt accessor¶

  • Like with Series of strings, pandas has a .dt accessor for properties of timestamps (documentation).
In [50]:
loans['date'].dt.year
Out[50]:
0       2014
1       2017
2       2016
        ... 
6297    2015
6298    2014
6299    2015
Name: date, Length: 6300, dtype: int32
In [51]:
loans['date'].dt.month
Out[51]:
0        6
1        6
2       12
        ..
6297    11
6298    12
6299     6
Name: date, Length: 6300, dtype: int32

Reference Section¶

Data cleaning: Modifying structure¶


See the posted lecture notebook for more details about another useful DataFrame method, melt.

Reshaping DataFrames¶

We often reshape the DataFrame's structure to make it more convenient for analysis. For example, we can:

  • Simplify structure by removing columns or taking a set of rows for a particular period of time or geographic area.
    We already did this!
  • Adjust granularity by aggregating rows together.
    To do this, use groupby (or resample, if working with timestamps).
  • Reshape structure, most commonly by using the DataFrame melt method to un-pivot a dataframe.

Using melt¶

  • The melt method is common enough that we'll give it a special mention.
  • We'll often encounter pivot tables (esp. from government data), which we call wide data.
  • The methods we've introduced work better with long-form data, or tidy data.
  • To go from wide to long, melt.

Example usage of melt¶

In [52]:
wide_example = pd.DataFrame({
    'Year': [2001, 2002],
    'Jan': [10, 130],
    'Feb': [20, 200],
    'Mar': [30, 340]
}).set_index('Year')
wide_example
Out[52]:
Jan Feb Mar
Year
2001 10 20 30
2002 130 200 340
In [53]:
wide_example.melt(ignore_index=False)
Out[53]:
variable value
Year
2001 Jan 10
2002 Jan 130
2001 Feb 20
2002 Feb 200
2001 Mar 30
2002 Mar 340
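  • If you'd like more descriptive names than 'variable' and 'value', melt accepts var_name and value_name arguments; a minimal sketch:
In [ ]:
wide_example.melt(ignore_index=False, var_name='Month', value_name='Count').reset_index()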

Visualization¶


Napoleon's March¶

"Probably the best statistical graphic ever drawn, this map by Charles Joseph Minard portrays the losses suffered by Napoleon's army in the Russian campaign of 1812." (source)


Why visualize?¶

  • Computers are better than humans at crunching numbers, but humans are better at identifying visual patterns.
  • Visualizations allow us to understand lots of data quickly – they make it easier to spot trends and communicate our results with others.
  • There are many types of visualizations; the right choice depends on the type of data at hand.
    In this class, we'll look at scatter plots, line plots, bar charts, histograms, choropleths, and boxplots.

Choosing the correct type of visualization¶

  • The type of visualization we create depends on the types of features we're visualizing.
  • We'll directly learn how to produce several of the visualizations below, but the others are also options.
    See more examples here.
  • Options by feature type:
    • Single categorical feature: bar charts, pie charts, dot plots.
    • Single numerical feature: histograms, box plots, density curves, rug plots, violin plots.
    • Two numerical features: scatter plots, line plots, heat maps, contour plots.
    • One categorical and one numerical feature: side-by-side histograms, box plots, or bar charts; overlaid line plots or density curves. (It really depends on the nature of the features themselves!)
  • Note that we use the words "plot", "chart", and "graph" to mean the same thing.

plotly¶

  • We've used plotly in lecture briefly, and you've even used it in Homework 1 and Homework 3, but we've never formally discussed it.
  • It's a visualization library that enables interactive visualizations.

Using plotly¶

  • We can use plotly using the plotly.express syntax.
    • plotly is very flexible, but it can be verbose; plotly.express allows us to make plots quickly.
    • See the documentation here – it's very rich.
      There are good examples for almost everything!
In [54]:
import plotly.express as px
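  • For instance, here's a minimal sketch (not from the original notebook) of the plotly.express syntax; px functions take in a DataFrame plus column names.
In [ ]:
# The same kind of histogram we'll build later with the .plot method.
px.histogram(loans, x='int_rate', title='Distribution of Interest Rates')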
  • Alternatively, we can use plotly by setting pandas plotting backend to 'plotly' and using the DataFrame/Series plot method.
    By default, the plotting backend is matplotlib, which creates non-interactive visualizations.
In [55]:
pd.options.plotting.backend = 'plotly'
  • Now, we're going to look at several examples. Focus on what is being visualized and why; read the notebook later for the how.

Bar charts¶

  • Bar charts are used to show:
    • The distribution of a single categorical feature, or
    • The relationship between one categorical feature and one numerical feature.
  • Usage: px.bar / px.barh or df.plot(kind='bar') / df.plot(kind='barh').
    'h' stands for "horizontal."
  • Example: What is the distribution of 'addr_state's in loans?
In [56]:
# Here, we're using the .plot method on loans['addr_state'], which is a Series.
# We prefer horizontal bar charts, since they're easier to read.
(
    loans['addr_state']
    .value_counts()
    .plot(kind='barh')
)
In [57]:
# A little formatting goes a long way!
(
    loans['addr_state']
    .value_counts()
    .sort_values()
    .head(10)
    .plot(kind='barh', title='States of Residence for Successful Loan Applicants')
    .update_layout()
)
  • Example: What is the average 'int_rate' for each 'home_ownership' status?
In [58]:
(
    loans
    .groupby('home_ownership')
    ['int_rate']
    .mean()
    .plot(kind='barh', title='Average Interest Rate by Home Ownership Status')
)
In [59]:
# The "ANY" category seems to be an outlier.
loans['home_ownership'].value_counts()
Out[59]:
home_ownership
MORTGAGE    2810
RENT        2539
OWN          950
ANY            1
Name: count, dtype: int64

Side-by-side bar charts¶

  • Instead of just looking at 'int_rate's for different 'home_ownership' statuses, we could also group by loan 'term'. As we'll see, 'term' impacts 'int_rate' far more than 'home_ownership'.
In [60]:
(
    loans
    .groupby('home_ownership')
    .filter(lambda df: df.shape[0] > 1) # Gets rid of the "ANY" category.
    .groupby(['home_ownership', 'term'])
    [['int_rate']]
    .mean()
)
Out[60]:
int_rate
home_ownership term
MORTGAGE 36 11.42
60 15.27
OWN 36 11.75
60 16.14
RENT 36 12.23
60 16.43
  • A side-by-side bar chart, which we can create by setting the color and barmode arguments, makes the pattern clear:
In [61]:
# Annoyingly, the side-by-side bar chart doesn't work properly
# if the column that separates colors (here, 'term')
# isn't made up of strings.
(
    loans
    .assign(term=loans['term'].astype(str) + ' months')
    .groupby('home_ownership')
    .filter(lambda df: df.shape[0] > 1)
    .groupby(['home_ownership', 'term'])
    [['int_rate']]
    .mean()
    .reset_index()
    .plot(kind='bar', 
          y='int_rate', 
          x='home_ownership', 
          color='term', 
          barmode='group',
          title='Average Interest Rate by Home Ownership Status and Loan Term',
          width=800)
)
  • Why do longer loans have higher 'int_rate's on average?

Histograms¶

  • The previous slide showed the average 'int_rate' for different combinations of 'home_ownership' status and 'term'. But what if we want to visualize more about 'int_rate's than just their average?
  • Histograms are used to show the distribution of a single numerical feature.
  • Usage: px.histogram or df.plot(kind='hist').
  • Example: What is the distribution of 'int_rate'?
In [62]:
(
    loans
    .plot(kind='hist', x='int_rate', title='Distribution of Interest Rates')
)
  • With fewer bins, we see less detail (and less noise) in the shape of the distribution.
    Play with the slider that appears when you run the cell below!
In [63]:
from ipywidgets import interact
def hist_bins(nbins):
    (
        loans
        .plot(kind='hist', x='int_rate', nbins=nbins, title='Distribution of Interest Rates')
        .show()
    )
interact(hist_bins, nbins=(1, 51));

Box plots and violin plots¶

  • Box plots and violin plots are alternatives to histograms, in that they also are used to show the distribution of quantitative features.
    Learn more about box plots here.
  • The benefit to them is that they're easily stacked side-by-side to compare distributions.
  • Example: What is the distribution of 'int_rate'?
In [64]:
(
    loans
    .plot(kind='box', y='int_rate', title='Distribution of Interest Rates')
)
  • Example: What is the distribution of 'int_rate', separately for each loan 'term'?
In [65]:
(
    loans
    .plot(kind='box', y='int_rate', color='term', orientation='v', 
          title='Distribution of Interest Rates by Loan Term')
)
In [66]:
(
    loans
    .plot(kind='violin', y='int_rate', color='term', orientation='v', 
          title='Distribution of Interest Rates by Loan Term')
)
  • Overlaid histograms can also be used to compare the distribution of a numerical feature across groups.
In [67]:
(
    loans
    .plot(kind='hist', x='int_rate', color='term', marginal='box', nbins=20,
          title='Distribution of Interest Rates by Loan Term')
)

Scatter plots¶

  • Scatter plots are used to show the relationship between two quantitative features.
  • Usage: px.scatter or df.plot(kind='scatter').
  • Example: What is the relationship between 'int_rate' and debt-to-income ratio, 'dti'?
In [68]:
(
    loans
    .sample(200, random_state=23)
    .plot(kind='scatter', x='dti', y='int_rate', title='Interest Rate vs. Debt-to-Income Ratio')
)
  • There are a multitude of ways that scatter plots can be customized. We can color points based on groups, we can resize points based on another numeric column, we can give them hover labels, etc.
In [69]:
(
    loans
    .assign(term=loans['term'].astype(str))
    .sample(200, random_state=23)
    .plot(kind='scatter', x='dti', y='int_rate', color='term',
          hover_name='id', size='loan_amnt',
          title='Interest Rate vs. Debt-to-Income Ratio')
)

Line charts¶

  • Line charts are used to show how one quantitative feature changes over time.
  • Usage: px.line or df.plot(kind='line').
  • Example: How many loans were given out each year in our dataset?
    This is likely not true of the market in general, or even LendingClub in general, but just a consequence of where our dataset came from.
In [70]:
(
    loans
    .assign(year=loans['date'].dt.year)
    ['year']
    .value_counts()
    .sort_index()
    .plot(kind='line', title='Number of Loans Given Per Year')
)
  • Example: How has the average 'int_rate' changed over time?
In [71]:
(
    loans
    .resample('6M', on='date')
    ['int_rate']
    .mean()
    .plot(kind='line', title='Average Interest Rate over Time')
)
  • Example: How has the average 'int_rate' changed over time, separately for 36 month and 60 month loans?
In [72]:
(
    loans
    .groupby('term')
    .resample('6M', on='date')
    ['int_rate']
    .mean()
    .reset_index()
    .plot(kind='line', x='date', y='int_rate', color='term',
          title='Average Interest Rate over Time')
)