Practical DSC Midterm Review

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
In [2]:
# Load the airline reviews; the CSV's first column becomes the row index.
reviews = pd.read_csv("reviews.csv", index_col=0)
reviews
Out[2]:
airline author date content cabin overall
0 klm-royal-dutch-airlines A Sahni 2015-07-24 KLM Business Class from Barcelona via AMS to D... Business 8
1 hawaiian-airlines Peter Pomeranze 2015-07-27 Flying Hawaiians seasonal non stop was a pleas... First 8
2 tap-portugal David Olvierk 2015-07-21 I fly up to Lisbon on a near weekly basis gen... Economy 1
3 spirit-airlines Narges Noori 2015-06-22 I've been using Spirit for 2years because my h... Economy 1
4 precision Joe Rossi 2015-07-17 Dar Es Salaam to Arusha. One day delay, landin... Economy 1
... ... ... ... ... ... ...
1892 spirit-airlines Teri Hake 2015-06-17 Worst flight experience. Arrived at 6.00am for... Economy 1
1893 hawaiian-airlines Brett Kero 2015-06-11 Flight from Hilo to Honolulu, new cabin, seat ... Economy 9
1894 korean-air Josiah Peachey 2015-07-27 I flew Korean Air for the first time. The A380... Economy 8
1895 emirates Anthony Innarelli 2015-07-28 Great flight from JFK to Milan. The aircraft i... Economy 9
1896 china-airlines C McPhaden 2015-07-07 Left on time and arrived early on both flights... Economy 7

1897 rows × 6 columns

In [ ]:
 

Problem 2

In [3]:
# `.str.len()` works element-wise on lists as well as strings: the last entry
# is a two-element list, and its length comes back as 2.
pd.Series(["aasdf", "b", ["a", "b"]]).str.len()
Out[3]:
0    5
1    1
2    2
dtype: int64
In [4]:
def operate(df):
    """Replace each review's text in ``df["content"]`` with its word count.

    Works in place: the ``content`` column of the caller's DataFrame is
    overwritten, and there is no return statement, so a call evaluates to
    ``None``. Once the column holds counts instead of strings, calling this
    again on the same DataFrame raises ``AttributeError`` from the ``.str``
    accessor.
    """
    # Split every review on whitespace, giving one list of words per row.
    tokens = df["content"].str.split()
    # `.str.len()` on a Series of lists yields the length of each list.
    df["content"] = tokens.str.len()
In [5]:
# Fresh load, then a single call: `operate` overwrites reviews["content"] in place.
reviews = pd.read_csv("reviews.csv", index_col=0)
operate(reviews)
In [6]:
# Calling `operate` twice on the same DataFrame: after the first call "content"
# holds integers, so the second call's `.str` access raises AttributeError.
reviews = pd.read_csv("reviews.csv", index_col=0)
operate(reviews)
operate(reviews)
reviews
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[6], line 3
      1 reviews = pd.read_csv("reviews.csv", index_col=0)
      2 operate(reviews)
----> 3 operate(reviews)
      4 reviews

Cell In[4], line 2, in operate(df)
      1 def operate(df):
----> 2     df["content"] = df["content"].str.split().str.len()

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/generic.py:5902, in NDFrame.__getattr__(self, name)
   5895 if (
   5896     name not in self._internal_names_set
   5897     and name not in self._metadata
   5898     and name not in self._accessors
   5899     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5900 ):
   5901     return self[name]
-> 5902 return object.__getattribute__(self, name)

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/accessor.py:182, in CachedAccessor.__get__(self, obj, cls)
    179 if obj is None:
    180     # we're accessing the attribute of the class, i.e., Dataset.geo
    181     return self._accessor
--> 182 accessor_obj = self._accessor(obj)
    183 # Replace the property with the accessor object. Inspired by:
    184 # https://www.pydanny.com/cached-property.html
    185 # We need to use object.__setattr__ because we overwrite __setattr__ on
    186 # NDFrame
    187 object.__setattr__(obj, self._name, accessor_obj)

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:181, in StringMethods.__init__(self, data)
    178 def __init__(self, data) -> None:
    179     from pandas.core.arrays.string_ import StringDtype
--> 181     self._inferred_dtype = self._validate(data)
    182     self._is_categorical = is_categorical_dtype(data.dtype)
    183     self._is_string = isinstance(data.dtype, StringDtype)

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:235, in StringMethods._validate(data)
    232 inferred_dtype = lib.infer_dtype(values, skipna=True)
    234 if inferred_dtype not in allowed_types:
--> 235     raise AttributeError("Can only use .str accessor with string values!")
    236 return inferred_dtype

AttributeError: Can only use .str accessor with string values!
In [7]:
# `operate` has no return statement, so this assignment rebinds `reviews` to None.
reviews = pd.read_csv("reviews.csv", index_col=0)
reviews = operate(reviews)
In [8]:
# Nothing is displayed here: `reviews` is None after the previous cell.
reviews
In [9]:
# The first reassignment leaves `reviews` as None, so the second call fails
# with TypeError as soon as `operate` tries to index `df["content"]`.
reviews = pd.read_csv("reviews.csv", index_col=0)
reviews = operate(reviews)
reviews = operate(reviews)
reviews
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[9], line 3
      1 reviews = pd.read_csv("reviews.csv", index_col=0)
      2 reviews = operate(reviews)
----> 3 reviews = operate(reviews)
      4 reviews

Cell In[4], line 2, in operate(df)
      1 def operate(df):
----> 2     df["content"] = df["content"].str.split().str.len()

TypeError: 'NoneType' object is not subscriptable

Problem 3

In [19]:
# Reload the data and take 5 random rows, relabelled 0..4, to experiment on.
# NOTE(review): `sample` is called without `random_state`, so which rows are
# picked depends on the RNG state at run time and may differ between runs.
reviews = pd.read_csv("reviews.csv", index_col=0)
littlereviews = reviews.sample(5).reset_index(drop=True)
littlereviews
Out[19]:
airline author date content cabin overall
0 blue-air Stelios Papadopoulos 2015-07-06 First time flying with a low cost airline so I... Economy 7
1 etihad-airways Jean-Pierre Ardinois 2015-07-20 Flight EY6298 - flight operated by South Afric... Economy 3
2 sun-country-airlines Javier Lopez 2015-06-17 Our travel experience was very good. Sun Count... Economy 10
3 air-canada-rouge D Routledge 2015-06-30 Boeing 767 on the way to Vegas and A319 on the... Economy 2
4 virgin-australia Suzanne Warmerdam 2015-07-09 Check in at Brisbane was poor, machines didn’t... Economy 9
In [20]:
# A boolean list passed to `.loc` acts as a row mask: only the row marked True is kept.
littlereviews.loc[[True, False, False, False, False]]
Out[20]:
airline author date content cabin overall
0 blue-air Stelios Papadopoulos 2015-07-06 First time flying with a low cost airline so I... Economy 7
In [21]:
# Number of rows in the small sample.
n = littlereviews.shape[0]
n
Out[21]:
5
In [22]:
# Seeded draw of n booleans, each sampled with replacement from [True, False].
np.random.seed(23)
np.random.choice([True, False], size=n, replace=True)
Out[22]:
array([False,  True,  True, False,  True])
In [23]:
# Re-seeding reproduces the same boolean mask; used with `.loc` it keeps a
# random subset of rows, each at most once and in their original order.
np.random.seed(23)
littlereviews.loc[np.random.choice([True, False], size=n, replace=True)]
Out[23]:
airline author date content cabin overall
1 etihad-airways Jean-Pierre Ardinois 2015-07-20 Flight EY6298 - flight operated by South Afric... Economy 3
2 sun-country-airlines Javier Lopez 2015-06-17 Our travel experience was very good. Sun Count... Economy 10
4 virgin-australia Suzanne Warmerdam 2015-07-09 Check in at Brisbane was poor, machines didn’t... Economy 9
In [24]:
# Integers 0..n-1; these match littlereviews' index labels after reset_index(drop=True).
np.arange(n)
Out[24]:
array([0, 1, 2, 3, 4])
In [25]:
# Seeded random reordering of 0..n-1: every value appears exactly once.
np.random.seed(23)
np.random.permutation(np.arange(n))
Out[25]:
array([4, 1, 0, 2, 3])
In [26]:
# Indexing with a permutation of the labels returns all n rows in shuffled order.
np.random.seed(23)
littlereviews.loc[np.random.permutation(np.arange(n))]
Out[26]:
airline author date content cabin overall
4 virgin-australia Suzanne Warmerdam 2015-07-09 Check in at Brisbane was poor, machines didn’t... Economy 9
1 etihad-airways Jean-Pierre Ardinois 2015-07-20 Flight EY6298 - flight operated by South Afric... Economy 3
0 blue-air Stelios Papadopoulos 2015-07-06 First time flying with a low cost airline so I... Economy 7
2 sun-country-airlines Javier Lopez 2015-06-17 Our travel experience was very good. Sun Count... Economy 10
3 air-canada-rouge D Routledge 2015-06-30 Boeing 767 on the way to Vegas and A319 on the... Economy 2
In [27]:
# Labels drawn WITH replacement: rows can repeat and others can be left out
# (with this seed, row 0 appears twice and row 2 is missing).
np.random.seed(23)
littlereviews.loc[np.random.choice(np.arange(n), size=n, replace=True)]
Out[27]:
airline author date content cabin overall
3 air-canada-rouge D Routledge 2015-06-30 Boeing 767 on the way to Vegas and A319 on the... Economy 2
0 blue-air Stelios Papadopoulos 2015-07-06 First time flying with a low cost airline so I... Economy 7
1 etihad-airways Jean-Pierre Ardinois 2015-07-20 Flight EY6298 - flight operated by South Afric... Economy 3
0 blue-air Stelios Papadopoulos 2015-07-06 First time flying with a low cost airline so I... Economy 7
4 virgin-australia Suzanne Warmerdam 2015-07-09 Check in at Brisbane was poor, machines didn’t... Economy 9
In [28]:
# Drawing n labels WITHOUT replacement yields every row exactly once in random
# order; with this seed the order matches the np.random.permutation result.
np.random.seed(23)
littlereviews.loc[np.random.choice(np.arange(n), size=n, replace=False)]
Out[28]:
airline author date content cabin overall
4 virgin-australia Suzanne Warmerdam 2015-07-09 Check in at Brisbane was poor, machines didn’t... Economy 9
1 etihad-airways Jean-Pierre Ardinois 2015-07-20 Flight EY6298 - flight operated by South Afric... Economy 3
0 blue-air Stelios Papadopoulos 2015-07-06 First time flying with a low cost airline so I... Economy 7
2 sun-country-airlines Javier Lopez 2015-06-17 Our travel experience was very good. Sun Count... Economy 10
3 air-canada-rouge D Routledge 2015-06-30 Boeing 767 on the way to Vegas and A319 on the... Economy 2

Problem 4

In [29]:
# Series of reviewer names, one entry per review.
authors = reviews["author"]
authors
Out[29]:
0                 A Sahni
1         Peter Pomeranze
2           David Olvierk
3            Narges Noori
4               Joe Rossi
              ...        
1892            Teri Hake
1893           Brett Kero
1894       Josiah Peachey
1895    Anthony Innarelli
1896           C McPhaden
Name: author, Length: 1897, dtype: object
In [30]:
# Number of reviews written by each author, most prolific first.
authors.value_counts()
Out[30]:
Bojan Tercon        8
D Brown             7
A Ray               7
Bob Motto           6
Rob Bowden          6
                   ..
Javier Lopez        1
L Laplanche         1
Andrew Exelby       1
Shokhzod Yakubov    1
C McPhaden          1
Name: author, Length: 1610, dtype: int64
In [31]:
# `.iloc[-2]` is positional: the second-to-last entry of the doubly-counted
# Series, i.e. the number of authors (2) who wrote 7 reviews.
authors.value_counts().value_counts().iloc[-2]
Out[31]:
2
In [32]:
# `.iloc[7]` is positional: the 8th (here, last) entry, i.e. the single author
# who wrote 8 reviews.
authors.value_counts().value_counts().iloc[7]
Out[32]:
1
In [33]:
# `.loc[7]` is label-based: the entry whose index label is 7, i.e. how many
# authors wrote exactly 7 reviews.
authors.value_counts().value_counts().loc[7]
Out[33]:
2
In [34]:
# `.index[-2]` returns the second-to-last index label (a review count), not a value.
authors.value_counts().value_counts().index[-2]
Out[34]:
7
In [35]:
# Distribution of review counts: the index is "number of reviews an author
# wrote" and the value is "number of authors with that many reviews".
double = authors.value_counts().value_counts()
double
Out[35]:
1    1434
2     119
3      33
5       8
4       7
6       6
7       2
8       1
Name: author, dtype: int64
In [36]:
# Bar chart of the review-count distribution: `double.index` holds the number
# of reviews an author wrote, `double.values` the number of such authors.
fig = px.bar(
    x=double.index,
    y=double.values,
    title="Number of authors by number of reviews written",
)
# Name the axes explicitly so the figure can be read on its own.
fig.update_layout(
    xaxis_title="Reviews written by an author",
    yaxis_title="Number of authors",
)
fig.show()

Problem 8

In [37]:
# One row per airline, one column per cabin; each cell holds the largest
# author name (string ordering, via `max`) among that airline/cabin's reviews.
# Cells shown as NaN presumably correspond to combinations with no reviews.
framer = reviews.pivot_table(index="airline",
                                columns="cabin",
                                values="author",
                                aggfunc="max")
In [38]:
# Display the pivoted table (airlines as rows, cabins as columns).
framer
Out[38]:
cabin Business Economy First Premium Economy
airline
aegean-airlines P Vlogianitis Teresa McCann NaN NaN
aer-lingus S Whyte L Pulliam NaN NaN
aeroflot-russian-airlines H Dalem Vsevolod Andreev NaN Dan Korn
aerolineas-argentinas M Studzinski Hilarion Martinez NaN NaN
aeromexico NaN Marco Flores Jose Granados NaN
... ... ... ... ...
vueling-airlines Paul Cartwright Widya Gunawan S Becker NaN
westjet NaN William McDonough Steve Hughes R Sawyer
wizz-air NaN Urooj Qureshi NaN NaN
wow-air NaN Rio Prariyadi NaN NaN
xl-airways-france NaN Yleana Castillo NaN NaN

188 rows × 4 columns

In [39]:
# Look up a single cell by row label (airline) and column label (cabin).
framer.loc["emirates", "Economy"]
Out[39]:
'Y Areemitr'
In [40]:
# Boolean table: True wherever the pivot table has a missing value.
framer.isna()
Out[40]:
cabin Business Economy First Premium Economy
airline
aegean-airlines False False True True
aer-lingus False False True True
aeroflot-russian-airlines False False True False
aerolineas-argentinas False False True True
aeromexico True False False True
... ... ... ... ...
vueling-airlines False False False True
westjet True False False False
wizz-air True False True True
wow-air True False True True
xl-airways-france True False True True

188 rows × 4 columns

In [41]:
# axis=0 sums down each column: the number of airlines with a missing value per cabin.
framer.isna().sum(axis=0)
Out[41]:
cabin
Business           102
Economy              6
First              152
Premium Economy    162
dtype: int64
In [42]:
# Repeats the previous cell's expression; the result is a Series with one entry per cabin.
framer.isna().sum(axis=0)
Out[42]:
cabin
Business           102
Economy              6
First              152
Premium Economy    162
dtype: int64
In [43]:
# A Series is one-dimensional, so `.shape` is a 1-tuple: (4,).
framer.isna().sum(axis=0).shape
Out[43]:
(4,)
In [44]:
# Raises IndexError: the 1-tuple shape has no element at position 1
# (the length of the Series is at shape[0]).
framer.isna().sum(axis=0).shape[1]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[44], line 1
----> 1 framer.isna().sum(axis=0).shape[1]

IndexError: tuple index out of range
In [45]:
# Raises AttributeError: the summed counts are integers, and the `.str`
# accessor only works on string values.
framer.isna().sum(axis=0).str.len()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[45], line 1
----> 1 framer.isna().sum(axis=0).str.len()

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/generic.py:5902, in NDFrame.__getattr__(self, name)
   5895 if (
   5896     name not in self._internal_names_set
   5897     and name not in self._metadata
   5898     and name not in self._accessors
   5899     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5900 ):
   5901     return self[name]
-> 5902 return object.__getattribute__(self, name)

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/accessor.py:182, in CachedAccessor.__get__(self, obj, cls)
    179 if obj is None:
    180     # we're accessing the attribute of the class, i.e., Dataset.geo
    181     return self._accessor
--> 182 accessor_obj = self._accessor(obj)
    183 # Replace the property with the accessor object. Inspired by:
    184 # https://www.pydanny.com/cached-property.html
    185 # We need to use object.__setattr__ because we overwrite __setattr__ on
    186 # NDFrame
    187 object.__setattr__(obj, self._name, accessor_obj)

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:181, in StringMethods.__init__(self, data)
    178 def __init__(self, data) -> None:
    179     from pandas.core.arrays.string_ import StringDtype
--> 181     self._inferred_dtype = self._validate(data)
    182     self._is_categorical = is_categorical_dtype(data.dtype)
    183     self._is_string = isinstance(data.dtype, StringDtype)

File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:235, in StringMethods._validate(data)
    232 inferred_dtype = lib.infer_dtype(values, skipna=True)
    234 if inferred_dtype not in allowed_types:
--> 235     raise AttributeError("Can only use .str accessor with string values!")
    236 return inferred_dtype

AttributeError: Can only use .str accessor with string values!