from lec_utils import *
Lecture 7¶
EDA, Visualization, and Missing Value Imputation¶
EECS 398: Practical Data Science, Winter 2025¶
practicaldsc.org • github.com/practicaldsc/wn25 • 📣 See latest announcements here on Ed
Agenda 📆¶
- Exploratory data analysis 🔎.
- Visualization 📊.
- Missing value imputation 🕳️.
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
How long does Homework 3 feel compared to Homework 2?
- A. Way shorter.
- B. Shorter.
- C. About the same.
- D. Longer.
- E. Way longer.
Exploratory data analysis 🔎¶
Loading the data 🏦¶
- Recall from last class, LendingClub is a platform that allows individuals to borrow money – that is, take on loans.
- Each row of our dataset contains information about a loan holder, i.e. someone who was already approved for a loan.
# Run this cell to perform the data cleaning steps we implemented last lecture.
def clean_term_column(df):
return df.assign(
term=df['term'].str.split().str[0].astype(int)
)
def clean_date_column(df):
return (
df
.assign(date=pd.to_datetime(df['issue_d'], format='%b-%Y'))
.drop(columns=['issue_d'])
)
loans = (
pd.read_csv('data/loans.csv')
.pipe(clean_term_column)
.pipe(clean_date_column)
)
loans
id | loan_amnt | term | int_rate | ... | fico_range_high | hardship_flag | mths_since_last_delinq | date | |
---|---|---|---|---|---|---|---|---|---|
0 | 17965023 | 18000.0 | 60 | 16.99 | ... | 704.0 | N | 72.0 | 2014-06-01 |
1 | 111414087 | 10000.0 | 36 | 16.02 | ... | 684.0 | N | 6.0 | 2017-06-01 |
2 | 95219557 | 12800.0 | 36 | 7.99 | ... | 709.0 | N | 66.0 | 2016-12-01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6297 | 63990101 | 10800.0 | 60 | 18.49 | ... | 679.0 | N | 39.0 | 2015-11-01 |
6298 | 37641672 | 15000.0 | 60 | 14.31 | ... | 664.0 | N | 22.0 | 2014-12-01 |
6299 | 50587446 | 14000.0 | 60 | 9.99 | ... | 689.0 | N | NaN | 2015-06-01 |
6300 rows × 20 columns
Lender decision-making¶
- Higher interest rates make the loan more expensive for the borrower – as a borrower, you want a lower interest rate!
- Even for the same loan amount, different borrowers were approved for different terms and interest rates:
display_df(loans.loc[loans['loan_amnt'] == 3600, ['loan_amnt', 'term', 'int_rate']], rows=17)
loan_amnt | term | int_rate | |
---|---|---|---|
249 | 3600.0 | 36 | 24.50 |
626 | 3600.0 | 36 | 13.99 |
1020 | 3600.0 | 36 | 11.49 |
2141 | 3600.0 | 36 | 10.08 |
2145 | 3600.0 | 36 | 5.32 |
2584 | 3600.0 | 36 | 16.29 |
2739 | 3600.0 | 36 | 8.24 |
3845 | 3600.0 | 36 | 13.66 |
4153 | 3600.0 | 36 | 12.59 |
4368 | 3600.0 | 36 | 14.08 |
4575 | 3600.0 | 36 | 16.46 |
4959 | 3600.0 | 36 | 10.75 |
4984 | 3600.0 | 36 | 13.99 |
5478 | 3600.0 | 60 | 10.59 |
5560 | 3600.0 | 36 | 19.99 |
5693 | 3600.0 | 36 | 13.99 |
6113 | 3600.0 | 36 | 15.59 |
- Why do different borrowers receive different terms and interest rates?
Exploratory data analysis (EDA)¶
- Historically, data analysis was dominated by formal statistics, including tools like confidence intervals, hypothesis tests, and statistical modeling.
In 1977, John Tukey defined the term exploratory data analysis, which describes a philosophy for how to approach data analysis:
Exploratory data analysis is actively incisive, rather than passively descriptive, with real emphasis on the discovery of the unexpected.
Practically, EDA involves, among other things, computing summary statistics and drawing plots to understand the nature of the data at hand.
The greatest gains from data come from surprises… The unexpected is best brought to our attention by pictures.
- Today, we'll learn how to ask questions, visualize, and deal with missing values.
Individuals and features¶

- Individual (row): Person/place/thing for which data is recorded. Also called an observation.
- Feature (column): Something that is recorded for each individual. Also called a variable or attribute.
Here, "variable" doesn't mean Python variable!
- There are two key types of features:
- Numerical features: It makes sense to do arithmetic with the values.
- Categorical features: Values fall into categories that may or may not have an order to them.
- Be careful: sometimes numerical features are stored as strings, and categorical features are stored as numbers.
The transformations we discussed last class can help fix these issues; a quick dtype check is sketched below.
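As a quick check (this cell isn't in the original notebook), a column's dtype can reveal mis-stored features: a numerical feature stored as strings shows up with an object dtype, like 'term' did before we cleaned it with .astype(int).
# Sketch: inspect each column's dtype to spot mis-stored features.
loans.dtypes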
Feature types¶

Asking questions 🙋¶
- EDA involves asking questions about the data without any preconceived notions of what the answers may be.
- With practice, you'll learn which questions to ask; we'll demonstrate some of that here.
Who do we have data for?¶
- We were told that we're only looking at approved loans. What's the distribution of 'loan_status'es?
loans.columns
Index(['id', 'loan_amnt', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'verification_status', 'home_ownership', 'annual_inc', 'loan_status', 'purpose', 'desc', 'addr_state', 'dti', 'fico_range_low', 'fico_range_high', 'hardship_flag', 'mths_since_last_delinq', 'date'], dtype='object')
loans['loan_status'].value_counts()
loan_status
Fully Paid                                            3025
Current                                               2367
Charged Off                                            814
Late (31-120 days)                                      59
In Grace Period                                         23
Late (16-30 days)                                       11
Does not meet the credit policy. Status:Fully Paid       1
Name: count, dtype: int64
- Are the loans in our dataset specific to any particular state?
loans['addr_state'].value_counts().head()
addr_state CA 905 TX 553 NY 498 FL 420 IL 261 Name: count, dtype: int64
What is the distribution of loan amounts?¶
loans['loan_amnt'].describe()
count 6300.00 mean 15568.80 std 7398.97 ... 50% 15000.00 75% 19125.00 max 40000.00 Name: loan_amnt, Length: 8, dtype: float64
loans['loan_amnt'].plot(kind='hist', nbins=10)
Are related features in agreement?¶
- Why are there two columns with credit scores, 'fico_range_low' and 'fico_range_high'? What do they both mean?
loans[['fico_range_low', 'fico_range_high']]
fico_range_low | fico_range_high | |
---|---|---|
0 | 700.0 | 704.0 |
1 | 680.0 | 684.0 |
2 | 705.0 | 709.0 |
... | ... | ... |
6297 | 675.0 | 679.0 |
6298 | 660.0 | 664.0 |
6299 | 685.0 | 689.0 |
6300 rows × 2 columns
(loans['fico_range_high'] - loans['fico_range_low']).value_counts()
4.0 6298 5.0 2 Name: count, dtype: int64
- Does every 'sub_grade' align with its related 'grade'?
loans[['grade', 'sub_grade']]
grade | sub_grade | |
---|---|---|
0 | D | D3 |
1 | C | C5 |
2 | A | A5 |
... | ... | ... |
6297 | E | E2 |
6298 | C | C4 |
6299 | B | B3 |
6300 rows × 2 columns
# Turns out, the answer is yes!
# The .str accessor allows us to use the [0] operation
# on every string in loans['sub_grade'].
(loans['sub_grade'].str[0] == loans['grade']).all()
True
Visualization 📊¶
Why visualize?¶
- In this lecture, we will create several visualizations using just a single dataset.

- The visualizations we want to create will often dictate the data cleaning steps we take.
For example, we can't plot the average interest rate over time without converting dates to timestamp objects!
- One reason to create visualizations is for us to better understand our data.
- Another reason is to accurately communicate a message to other people!
plotly¶
- We've used plotly in lecture briefly, and you've even used it in Homework 1 and Homework 3, but we've never formally discussed it.
- It's a visualization library that enables interactive visualizations.

- We can use plotly through the plotly.express syntax. plotly is very flexible, but it can be verbose; plotly.express allows us to make plots quickly.
- See the documentation here – it's very rich. There are good examples for almost everything!
import plotly.express as px
- Alternatively, we can use plotly by setting pandas' plotting backend to 'plotly' and using the DataFrame/Series plot method (a short example of the equivalence follows the cell below).
By default, the plotting backend is matplotlib, which creates non-interactive visualizations.
pd.options.plotting.backend = 'plotly'
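For instance (a sketch that isn't in the original notebook), once the backend is set, the two syntaxes below produce equivalent interactive figures:
# plotly.express syntax:
px.histogram(loans, x='int_rate')
# Equivalent pandas syntax, using the plotly backend set above:
# loans.plot(kind='hist', x='int_rate')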
Choosing the correct type of visualization¶
- The type of visualization we create depends on the types of features we're visualizing.
- We'll directly learn how to produce the bolded visualizations below, but the others are also options.
See more examples of visualization types here.
Feature types | Options |
---|---|
Single categorical feature | Bar charts, pie charts, dot plots |
Single numerical feature | Histograms, box plots, density curves, rug plots, violin plots |
Two numerical features | Scatter plots, line plots, heat maps, contour plots |
One categorical and one numerical feature | Side-by-side histograms, box plots, or bar charts; overlaid line plots or density curves. It really depends on the nature of the features themselves! |
- Note that we use the words "plot", "chart", and "graph" to mean the same thing.
- Now, we're going to look at several examples. Focus on what is being visualized and why; read the notebook later for the how.
Bar charts¶
- Bar charts are used to show:
- The distribution of a single categorical feature, or
- The relationship between one categorical feature and one numerical feature.
- Usage: px.bar / px.barh, or df.plot(kind='bar') / df.plot(kind='barh').
The 'h' stands for "horizontal."
- Example: What is the distribution of 'addr_state's in loans?
# Here, we're using the .plot method on loans['addr_state'], which is a Series.
# We prefer horizontal bar charts, since they're easier to read.
(
loans['addr_state']
.value_counts()
.plot(kind='barh')
)
# A little formatting goes a long way!
(
loans['addr_state']
.value_counts(normalize=True)
.head(10)
.sort_values()
.plot(kind='barh', title='States of Residence for Successful Loan Applicants')
)
- Example: What is the average 'int_rate' for each 'home_ownership' status?
(
loans
.groupby('home_ownership')
['int_rate']
.mean()
.plot(kind='barh', title='Average Interest Rate by Home Ownership Status')
)
# The "ANY" category seems to be an outlier.
loans['home_ownership'].value_counts()
home_ownership MORTGAGE 2810 RENT 2539 OWN 950 ANY 1 Name: count, dtype: int64
Side-by-side bar charts¶
- Instead of just looking at 'int_rate's for different 'home_ownership' statuses, we could also group by loan 'term'. As we'll see, 'term' impacts 'int_rate' far more than 'home_ownership' does.
(
loans
.groupby('home_ownership')
.filter(lambda df: df.shape[0] > 1) # Gets rid of the "ANY" category.
.groupby(['home_ownership', 'term'])
[['int_rate']]
.mean()
)
int_rate | ||
---|---|---|
home_ownership | term | |
MORTGAGE | 36 | 11.42 |
60 | 15.27 | |
OWN | 36 | 11.75 |
60 | 16.14 | |
RENT | 36 | 12.23 |
60 | 16.43 |
- A side-by-side bar chart, which we can create by setting the color and barmode arguments, makes the pattern clear:
# Annoyingly, the side-by-side bar chart doesn't work properly
# if the column that separates colors (here, 'term')
# isn't made up of strings.
(
loans
.assign(term=loans['term'].astype(str) + ' months')
.groupby('home_ownership')
.filter(lambda df: df.shape[0] > 1)
.groupby(['home_ownership', 'term'])
[['int_rate']]
.mean()
.reset_index()
.plot(kind='bar',
y='int_rate',
x='home_ownership',
color='term',
barmode='group',
title='Average Interest Rate by Home Ownership Status and Loan Term',
width=800)
)
- Why do longer loans have higher 'int_rate's on average?
Histograms¶
- The previous slide showed the average 'int_rate' for different combinations of 'home_ownership' status and 'term'.
- But, the average of a numerical feature is just a single number, and can be misleading!
- Histograms are used to show the distribution of a single numerical feature.
- Usage: px.histogram or df.plot(kind='hist').
- Example: What is the distribution of 'int_rate'?
(
loans
.plot(kind='hist', x='int_rate', title='Distribution of Interest Rates')
)
- With fewer bins, we see less detail (and less noise) in the shape of the distribution.
Play with the slider that appears when you run the cell below!
def hist_bins(nbins):
(
loans
.plot(kind='hist', x='int_rate', nbins=nbins, title='Distribution of Interest Rates')
.show()
)
interact(hist_bins, nbins=(1, 51));
Question 🤔 (Answer at practicaldsc.org/q)
Remember that you can always ask questions anonymously at the link above!
Based on the histogram below, what is the relationship between the mean and median interest rate?
- A. Mean > median.
- B. Mean $\approx$ median.
- C. Mean < median.
(
loans
.plot(kind='hist', x='int_rate', title='Distribution of Interest Rates', nbins=20)
)
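One way to check your answer afterwards (this cell isn't in the original notebook) is to compute both statistics directly:
# Compare the mean and median of 'int_rate'.
loans['int_rate'].agg(['mean', 'median'])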
Box plots and violin plots¶
- Box plots and violin plots are alternatives to histograms, in that they also are used to show the distribution of a quantitative feature.
Learn more about box plots here.
- One benefit is that they're easy to stack side-by-side to compare distributions.
- Example: What is the distribution of 'int_rate'?
(
loans
.plot(kind='box', x='int_rate', title='Distribution of Interest Rates')
)
- Example: What is the distribution of 'int_rate', separately for each loan 'term'?
(
loans
.plot(kind='box', y='int_rate', color='term', orientation='v',
title='Distribution of Interest Rates by Loan Term')
)
(
loans
.plot(kind='violin', y='int_rate', color='term', orientation='v',
title='Distribution of Interest Rates by Loan Term')
)
- Overlaid histograms can also be used to compare the distribution of a numerical feature across groups, as we do below with 'term'.
(
loans
.plot(kind='hist', x='int_rate', color='term', marginal='box', nbins=20,
title='Distribution of Interest Rates by Loan Term')
)
Scatter plots¶
- Scatter plots are used to show the relationship between two quantitative features.
- Usage: px.scatter or df.plot(kind='scatter').
- Example: What is the relationship between 'int_rate' and debt-to-income ratio, 'dti'?
(
loans
.sample(200, random_state=23)
.plot(kind='scatter', x='dti', y='int_rate', title='Interest Rate vs. Debt-to-Income Ratio')
)
- There are a multitude of ways that scatter plots can be customized. We can color points based on groups, we can resize points based on another numeric column, we can give them hover labels, etc.
(
loans
.assign(term=loans['term'].astype(str))
.sample(200, random_state=23)
.plot(kind='scatter', x='dti', y='int_rate', color='term',
hover_name='id', size='loan_amnt',
title='Interest Rate vs. Debt-to-Income Ratio')
)
Line charts¶
- Line charts are used to show how one quantitative feature changes over time.
- Usage: px.line or df.plot(kind='line').
- Example: How many loans were given out each year in our dataset?
This is likely not true of the market in general, or even LendingClub in general, but just a consequence of where our dataset came from.
(
loans
.assign(year=loans['date'].dt.year)
['year']
.value_counts()
.sort_index()
.plot(kind='line', title='Number of Loans Given Per Year')
)
- Example: How has the average 'int_rate' changed over time?
(
loans
.resample('6M', on='date')
['int_rate']
.mean()
.plot(kind='line', title='Average Interest Rate over Time')
)
- Example: How has the average 'int_rate' changed over time, separately for 36-month and 60-month loans?
(
loans
.groupby('term')
.resample('6M', on='date')
['int_rate']
.mean()
.reset_index()
.plot(kind='line', x='date', y='int_rate', color='term',
title='Average Interest Rate over Time')
)
Missing value imputation 🕳️¶
loans.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6300 entries, 0 to 6299
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   id                      6300 non-null   int64
 1   loan_amnt               6300 non-null   float64
 2   term                    6300 non-null   int64
 3   int_rate                6300 non-null   float64
 4   grade                   6300 non-null   object
 5   sub_grade               6300 non-null   object
 6   emp_title               6300 non-null   object
 7   verification_status     6300 non-null   object
 8   home_ownership          6300 non-null   object
 9   annual_inc              6300 non-null   float64
 10  loan_status             6300 non-null   object
 11  purpose                 6300 non-null   object
 12  desc                    324 non-null    object
 13  addr_state              6300 non-null   object
 14  dti                     6299 non-null   float64
 15  fico_range_low          6300 non-null   float64
 16  fico_range_high         6300 non-null   float64
 17  hardship_flag           6300 non-null   object
 18  mths_since_last_delinq  3120 non-null   float64
 19  date                    6300 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(7), int64(2), object(10)
memory usage: 984.5+ KB
- Two columns have a substantial number of missing values: 'mths_since_last_delinq' (3120 non-null out of 6300) and 'desc' (just 324 non-null).
Who provided loan descriptions?¶
- The Series isna method shows True for null values and False for non-null values; notna does the opposite.
loans['desc'].isna().sum()
5976
# Run this repeatedly to read a random sample of loan descriptions.
for desc in loans.loc[loans['desc'].notna(), 'desc'].sample(3):
print(desc + '\n')
Borrower added on 03/03/14 > Consolidate outstanding credit cards into one monthly payment.

Borrower added on 08/28/11 > The purpose for the loan was not home buying... But home renovations... I am not sure where the buying part came into play...

Borrower added on 07/25/12 > Debt consolidation. I want to pay off my credit card accounts.
- It appears that applicants who submitted descriptions with their loan applications were given higher interest rates on average than those who didn't submit descriptions.
But, this could've happened for a variety of reasons, not just because they submitted a description.
(
loans
.assign(submitted_description=loans['desc'].notna())
.groupby('submitted_description')
['int_rate']
.agg(['mean', 'median'])
)
mean | median | |
---|---|---|
submitted_description | ||
False | 13.09 | 12.62 |
True | 13.68 | 13.68 |
- Key idea: The fact that some values are missing is, itself, information!
- The numpy/pandas null value, np.nan, is typically ignored when using numpy/pandas operations.
# Note the NaN at the very bottom.
loans['mths_since_last_delinq']
0 72.0 1 6.0 2 66.0 ... 6297 39.0 6298 22.0 6299 NaN Name: mths_since_last_delinq, Length: 6300, dtype: float64
loans['mths_since_last_delinq'].sum()
106428.0
- But, np.nans typically aren't ignored when using regular Python operations.
sum(loans['mths_since_last_delinq'])
nan
- As an aside, the regular Python null value is None.
None
np.nan
nan
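As a quick check (not in the original notebook), pandas' missingness functions treat both kinds of null values the same way, and np.nan isn't even equal to itself:
# np.nan fails equality with itself, which is why we use isna/notna instead of ==.
np.nan == np.nan, pd.isna(None), pd.isna(np.nan)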
Intentionally missing values and default replacements¶
- Sometimes, values are missing intentionally, or by design. In these cases, we can't fill in the missing values.
For instance, if we survey students and ask "if you're from Michigan, what high school did you go to?", the students not from Michigan will have missing responses. But, there's nothing to fill in, since they're not from Michigan!
- Other times, missing values have a default replacement.
For instance, you automatically get a 0 for all assignments you don't submit in this class. So, when calculating your grades, I'll need to fill in all of your NaNs with 0. The DataFrame/Series fillna method helps with this (a small sketch follows this list).
- Most situations are more complicated than this, though!
Don't get in the habit of just automatically filling all null values with 0.
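Here's the small sketch mentioned above, using hypothetical homework scores (not data from the lecture):
# Hypothetical scores; a missing value means the assignment wasn't submitted,
# so the appropriate default replacement is 0.
scores = pd.Series([95, np.nan, 88, np.nan, 100])
scores.fillna(0)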
Generally, what do we do with missing data?¶
- Consider a feature, $Y$.
Imagine $Y$ is a column in a DataFrame.
- Some of its values, $Y_\text{present}$, are present, while others, $Y_\text{missing}$, are missing.
- Issue: $Y_\text{present}$ may look different than the full dataset, $Y$.
Remember, we don't get to see $Y$.
- That is, the mean, median, and variance of $Y_\text{present}$ may be different than that of $Y$.
- Furthermore, the correlations between $Y_\text{present}$ and other features may be different than the correlations between $Y$ and other features.
Example: Heights¶
- Below, we load in a dataset containing the heights of parents and their children. Some of the 'child' heights are missing.
- Aside: The dataset was collected by Sir Francis Galton, who developed many key ideas in statistics (including correlation and regression) for the purposes of eugenics, a field he also originated.
heights = pd.read_csv('data/heights-missing.csv')
heights.head()
father | mother | gender | child | |
---|---|---|---|---|
0 | 78.5 | 67.0 | male | NaN |
1 | 78.5 | 67.0 | female | 69.2 |
2 | 78.5 | 67.0 | female | 69.0 |
3 | 78.5 | 67.0 | female | 69.0 |
4 | 75.5 | 66.5 | male | NaN |
heights['child'].isna().sum()
169
- Goal: Try to fill in the missing values in heights['child'] using the information we do have.
- Plan: Discuss several ideas on how to solve this problem.
In practice, the approach you use depends on the situation.
Aside: Kernel density estimates¶
- In this section, we'll need to visualize the distributions of many numerical features.
- To do so, we'll use yet another visualization, a kernel density estimate (KDE).
Think of a KDE as a smoothed version of a histogram.
heights['child'].plot(kind='hist', nbins=30)
def multiple_kdes(ser_map, title=""):
values = [ser_map[key].dropna() for key in ser_map]
labels = list(ser_map.keys())
fig = ff.create_distplot(
hist_data=values,
group_labels=labels,
show_rug=False,
show_hist=False,
colors=px.colors.qualitative.Dark2[: len(ser_map)],
)
return fig.update_layout(title=title, width=1000).update_xaxes(title="child")
multiple_kdes({'Before Imputation': heights['child']})
Idea: Dropping missing values¶
- One solution is to "drop" all rows with missing values, and do calculations with just the values that we have.
- This is called listwise deletion.
heights
father | mother | gender | child | |
---|---|---|---|---|
0 | 78.5 | 67.0 | male | NaN |
1 | 78.5 | 67.0 | female | 69.2 |
2 | 78.5 | 67.0 | female | 69.0 |
... | ... | ... | ... | ... |
931 | 62.0 | 66.0 | female | 61.0 |
932 | 62.5 | 63.0 | male | 66.5 |
933 | 62.5 | 63.0 | female | 57.0 |
934 rows × 4 columns
heights.dropna()
father | mother | gender | child | |
---|---|---|---|---|
1 | 78.5 | 67.0 | female | 69.2 |
2 | 78.5 | 67.0 | female | 69.0 |
3 | 78.5 | 67.0 | female | 69.0 |
... | ... | ... | ... | ... |
931 | 62.0 | 66.0 | female | 61.0 |
932 | 62.5 | 63.0 | male | 66.5 |
933 | 62.5 | 63.0 | female | 57.0 |
765 rows × 4 columns
- Issue: We went from 934 to 765 rows, which means we lost 18% of rows for all columns, even columns in which no values were originally missing.
- Most numpy/pandas methods already ignore missing values when performing calculations, so we don't need to do anything explicit to ignore the missing values when calculating the mean and standard deviation.
heights['child'].mean()
67.10339869281046
heights['child'].std()
3.5227776335950374
Idea: Mean imputation¶
- Suppose we need all of the missing values to be filled in, or imputed, for our future analyses, meaning we can't just drop them.
- A terrible idea would be to impute all of the missing values with 0. Why?
heights['child']
0 NaN 1 69.2 2 69.0 ... 931 61.0 932 66.5 933 57.0 Name: child, Length: 934, dtype: float64
# DON'T do this!
heights['child'].fillna(0)
0 0.0 1 69.2 2 69.0 ... 931 61.0 932 66.5 933 57.0 Name: child, Length: 934, dtype: float64
- A better idea is to impute missing values with the mean of the observed values.
heights['child']
0 NaN 1 69.2 2 69.0 ... 931 61.0 932 66.5 933 57.0 Name: child, Length: 934, dtype: float64
heights['child'].mean()
67.10339869281046
mean_imputed = heights['child'].fillna(heights['child'].mean())
mean_imputed
0 67.1 1 69.2 2 69.0 ... 931 61.0 932 66.5 933 57.0 Name: child, Length: 934, dtype: float64
- The mean of mean_imputed is the same as the mean of 'child' before we imputed.
You proved this in Homework 1!
# Mean before imputation:
heights['child'].mean()
67.10339869281046
# Mean after imputation:
mean_imputed.mean()
67.10339869281046
- What do you think a histogram of mean_imputed would look like?
mean_imputed.value_counts()
child 67.1 169 70.0 54 68.0 50 ... 63.2 1 62.2 1 59.0 1 Name: count, Length: 64, dtype: int64
Mean imputation destroys spread!¶
- Let's look at the distribution of heights['child'] before we filled in missing values, along with the distribution of mean_imputed, after we filled in missing values.
multiple_kdes({'Before Imputation': heights['child'],
'After Mean Imputation': mean_imputed})
- The standard deviation after imputing with the mean is much lower! The true distribution of 'child' likely does not look like the mean-imputed distribution.
In Homework 1, you found the relationship between the new standard deviation and the old one! (A quick numerical check follows the two cells below.)
heights['child'].std()
3.5227776335950374
mean_imputed.std()
3.187800157298298
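As a quick numerical check (this cell isn't in the original notebook): imputing with the mean leaves the sum of squared deviations and the mean unchanged, but the count used in the sample standard deviation grows from the 765 observed values to all 934 rows, so the standard deviation shrinks by a factor of $\sqrt{\frac{765 - 1}{934 - 1}}$.
n = len(heights['child'])                   # 934 rows in total
n_present = heights['child'].notna().sum()  # 765 observed values
heights['child'].std() * np.sqrt((n_present - 1) / (n - 1))  # matches mean_imputed.std()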
- This makes it harder to use the imputed 'child' column in analyses with other columns.
Mean imputation and listwise deletion introduce bias!¶
- What if the values that are missing, $Y_\text{missing}$, are not a representative sample of the full dataset, $Y$?
Equivalently, what if the values that are present are not a representative sample of the full dataset?
For example, if shorter heights are more likely to be missing than taller heights, then:
- The mean of the present values will be too big:
$$\text{mean}(Y_\text{present}) > \text{mean}(Y)$$
- So, by replacing missing values with the mean, our estimates will all be too big. (A tiny simulation after this list illustrates the effect.)
- Instead of filling all missing values with the same one value, can we do something to prevent this added bias?
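To make the bias concrete, here's the tiny simulation mentioned above (hypothetical data, not from the lecture): shorter values are deleted more often, and the mean of the values that remain is too big.
np.random.seed(23)
true_heights = pd.Series(np.random.normal(67, 3.5, size=10_000))
# In this toy example, heights below 67 inches go missing 50% of the time,
# while taller heights go missing only 10% of the time.
p_missing = np.where(true_heights < 67, 0.5, 0.1)
present = true_heights[np.random.random(10_000) > p_missing]
true_heights.mean(), present.mean()  # the mean of the present values is too big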
Idea: Conditional mean imputation¶
- If we have reason to believe the chance that a 'child' height is missing depends on another feature, we can use that other feature to inform how we fill in the missing value!
- For example, if we have reason to believe that heights are more likely to be missing for 'female' children than 'male' children, we could fill in the missing 'female' and 'male' heights separately.
# Here, we're computing the proportion of 'child' heights that are missing per gender.
(
heights
.groupby('gender')
['child']
.agg(lambda s: s.isna().mean())
)
gender female 0.33 male 0.04 Name: child, dtype: float64
- That seems to be the case here, so let's try it. We can use the groupby transform method.
# The mean 'female' observed 'child' height is 64.03, while
# the mean 'male' observed 'child' height is 69.13.
heights.groupby('gender')['child'].mean()
gender female 64.03 male 69.13 Name: child, dtype: float64
heights
father | mother | gender | child | |
---|---|---|---|---|
0 | 78.5 | 67.0 | male | NaN |
1 | 78.5 | 67.0 | female | 69.2 |
2 | 78.5 | 67.0 | female | 69.0 |
... | ... | ... | ... | ... |
931 | 62.0 | 66.0 | female | 61.0 |
932 | 62.5 | 63.0 | male | 66.5 |
933 | 62.5 | 63.0 | female | 57.0 |
934 rows × 4 columns
# Note the first missing 'child' height is filled in with
# 69.13, the mean of the observed 'male' heights, since
# they are a 'male' child!
conditional_mean_imputed = ...
conditional_mean_imputed = (
heights
.groupby('gender')
['child']
.transform(lambda s: s.fillna(s.mean()))
)
conditional_mean_imputed
0 69.13 1 69.20 2 69.00 ... 931 61.00 932 66.50 933 57.00 Name: child, Length: 934, dtype: float64
Pros and cons of conditional mean imputation¶
- Instead of having a single "spike", the conditionally-imputed distribution has two smaller "spikes".
In this case, one at the observed 'female' mean and one at the observed 'male' mean.
multiple_kdes({'Before Imputation': heights['child'],
'After Mean Imputation': mean_imputed,
'After Conditional Mean Imputation': conditional_mean_imputed})
- Pro ✅: The conditionally-imputed column's mean is likely to be closer to the true mean than if we just dropped all missing values, since we attempted to account for the imbalance in missingness.
# The mean of just our present values.
heights['child'].mean()
67.10339869281046
# Lower than above, reflecting the fact that we are missing
# more 'female' heights and 'female' heights
# tend to be lower.
conditional_mean_imputed.mean()
66.65591770566121
- Con ❌: The conditionally-imputed column likely still has a lower standard deviation than the true 'child' column.
The true 'child' column likely doesn't look like the conditionally-imputed distribution above.
- Con ❌: The chance that 'child' heights are missing may depend on other columns, too, and we didn't account for those. There may still be bias.
Idea: Regression imputation¶
- A common solution is to fill in missing values by using other features to predict what the missing value would have been.
# There's nothing special about the values passed into .iloc below;
# they're just for illustration.
heights.iloc[[0, 2, 919, 11, 4, 8, 9]]
father | mother | gender | child | |
---|---|---|---|---|
0 | 78.5 | 67.0 | male | NaN |
2 | 78.5 | 67.0 | female | 69.0 |
919 | 64.0 | 64.0 | female | NaN |
11 | 75.0 | 64.0 | male | 68.5 |
4 | 75.5 | 66.5 | male | NaN |
8 | 75.0 | 64.0 | male | 71.0 |
9 | 75.0 | 64.0 | female | 68.0 |
- We'll learn how to make such predictions in the coming weeks.
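As an illustration only (not the lecture's approach, and using scikit-learn, which we haven't covered yet), a minimal sketch of regression imputation might look like this; a fuller version would also use 'gender'.
from sklearn.linear_model import LinearRegression

# Fit a line to the rows where 'child' is observed...
observed = heights.dropna(subset=['child'])
model = LinearRegression().fit(observed[['father', 'mother']], observed['child'])

# ...then predict 'child' heights for the rows where they're missing.
regression_imputed = heights['child'].copy()
is_missing = regression_imputed.isna()
regression_imputed[is_missing] = model.predict(heights.loc[is_missing, ['father', 'mother']])
regression_imputed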
Idea: Probabilistic imputation¶
- Since we don't know what the missing values would have been, one could argue our technique for filling in missing values should incorporate this uncertainty.
- We could fill in missing values using a random sample of observed values.
This avoids the key issue with mean imputation, where we fill all missing values with the same one value. It also limits the bias present if the missing values weren't a representative sample, since we're filling them in with a range of different values.
# impute_prob should take in a Series with missing values and return an imputed Series.
def impute_prob(s):
s = s.copy()
# Find the number of missing values.
num_missing = s.isna().sum()
# Take a sample of size num_missing from the present values.
sample = np.random.choice(s.dropna(), num_missing)
# Fill in the missing values with our random sample.
s.loc[s.isna()] = sample
return s
- Each time we run the cell below, the missing values in heights['child'] are filled in with a different sample of the present values in heights['child']!
# The number at the very top is constantly changing!
prob_imputed = impute_prob(heights['child'])
print('Mean:', prob_imputed.mean())
prob_imputed
Mean: 67.03565310492506
0 73.0 1 69.2 2 69.0 ... 931 61.0 932 66.5 933 57.0 Name: child, Length: 934, dtype: float64
- To account for the fact that each run is slightly different, a common strategy is multiple imputation.
This involves performing probabilistic imputation many (> 5) times, performing further analysis on each new dataset (e.g. building a regression model), and aggregating the results. (A short sketch of this idea appears after the next code cell.)
- Probabilistic imputation can even be done conditionally!
Now, missing 'male' heights are filled in using a sample of observed 'male' heights, and missing 'female' heights are filled in using a sample of observed 'female' heights!
conditional_prob_imputed = ...
conditional_prob_imputed = (
heights
.groupby('gender')
['child']
.transform(impute_prob)
)
conditional_prob_imputed
0 68.0 1 69.2 2 69.0 ... 931 61.0 932 66.5 933 57.0 Name: child, Length: 934, dtype: float64
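Here's the short sketch of multiple imputation mentioned above (not in the original notebook): probabilistically impute several times, then aggregate the statistic of interest across the imputed datasets.
# Each call to impute_prob fills the missing values with a different random
# sample, so each imputed dataset has a slightly different mean.
many_means = pd.Series([impute_prob(heights['child']).mean() for _ in range(10)])
many_means.mean(), many_means.std()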
Visualizing imputation strategies¶
multiple_kdes({'Before Imputation': heights['child'],
'After Mean Imputation': mean_imputed,
'After Conditional Mean Imputation': conditional_mean_imputed,
'After Probabilistic Imputation': prob_imputed,
'After Conditional Probabilistic Imputation': conditional_prob_imputed})
Missingness mechanisms¶
- There are three key missingness mechanisms, which describe how data in a column can be missing.
- Missing completely at random (MCAR): Data are MCAR if the chance that a value is missing is completely independent of other columns and the actual missing value.
Example: Suppose that after the Midterm Exam, I randomly choose 5 scores to delete on Gradescope, meaning that 5 students have missing grades. MCAR is ideal, but rare!
- Missing at random (MAR): Data are MAR if the chance that a value is missing depends on other columns.
Example: Suppose that after the Midterm Exam, I randomly choose 5 scores to delete on Gradescope among sophomore students. Now, scores are missing at random dependent on class standing.
- Not missing at random (NMAR): Data are NMAR if the chance that a value is missing depends on the actual missing value itself.
Example: Suppose that after the Midterm Exam, I randomly delete 5 of the 10 lowest scores on Gradescope. Now, scores are not missing at random, since the chance a value is missing depends on how large it is.
- Statistical imputation packages usually assume data are MAR.
MCAR is usually unrealistic to assume. If data are NMAR, you can't impute missing values, since the other features in your data can't explain the missingness. (A brief sketch using one such package appears at the end of this list.)
- It seems that if our data are MCAR, there is no risk to dropping missing values.
In the MCAR setting, just imagine we're being given a large, random sample of the true dataset.
- If the data are not MCAR, though, then dropping the missing values will introduce bias.
For instance, suppose we asked people "How much do you give to charity?" People who give little are less likely to respond, so the average response is biased high.
- There is no perfect procedure for determining if our data are MCAR, MAR, or NMAR; we mostly have to use our understanding of how the data is generated.
- But, we can try to determine whether $Y_\text{missing}$ is similar to $Y$, using the information we do have in other columns.
We did this earlier, when looking at the proportion of missing 'child' heights for each 'gender'.
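As a brief illustration of such a package (this cell isn't in the original notebook), scikit-learn's IterativeImputer fills in each feature's missing values using the other features, in the spirit of regression imputation. It only handles numerical columns, so we drop 'gender' in this sketch.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, enables the import below
from sklearn.impute import IterativeImputer

numeric_cols = heights[['father', 'mother', 'child']]
imputed_values = IterativeImputer(random_state=23).fit_transform(numeric_cols)
package_imputed = pd.DataFrame(imputed_values, columns=numeric_cols.columns)
package_imputed['child'].describe()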
Summary of imputation techniques¶
- Consider whether values are missing intentionally, or whether there's a default replacement.
- Listwise deletion.
Drop, or ignore, missing values.
- (Conditional) mean imputation.
Fill in missing values with the mean of observed values. If there's a reason to believe the missingness depends on another categorical column, fill in missing values with the observed mean separately for each category.
- (Conditional) probabilistic imputation.
Fill in missing values with a random sample of observed values. If there's a reason to believe the missingness depends on another categorical column, fill in missing values with a random sample drawn separately for each category.
- Regression imputation.
Predict missing values using other features.
What's next?¶
- So far, our data has just been given to us as a CSV.
Sometimes it's messy, and we need to clean it.
- But, what if the data we want is somewhere on the internet?