Practical DSC Midterm Review¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
In [2]:
# Load the airline reviews; index_col=0 uses the first CSV column as the row index.
reviews = pd.read_csv("reviews.csv", index_col=0)
# Display the frame (the rendered output reports 1897 rows × 6 columns).
reviews
Out[2]:
airline | author | date | content | cabin | overall | |
---|---|---|---|---|---|---|
0 | klm-royal-dutch-airlines | A Sahni | 2015-07-24 | KLM Business Class from Barcelona via AMS to D... | Business | 8 |
1 | hawaiian-airlines | Peter Pomeranze | 2015-07-27 | Flying Hawaiians seasonal non stop was a pleas... | First | 8 |
2 | tap-portugal | David Olvierk | 2015-07-21 | I fly up to Lisbon on a near weekly basis gen... | Economy | 1 |
3 | spirit-airlines | Narges Noori | 2015-06-22 | I've been using Spirit for 2years because my h... | Economy | 1 |
4 | precision | Joe Rossi | 2015-07-17 | Dar Es Salaam to Arusha. One day delay, landin... | Economy | 1 |
... | ... | ... | ... | ... | ... | ... |
1892 | spirit-airlines | Teri Hake | 2015-06-17 | Worst flight experience. Arrived at 6.00am for... | Economy | 1 |
1893 | hawaiian-airlines | Brett Kero | 2015-06-11 | Flight from Hilo to Honolulu, new cabin, seat ... | Economy | 9 |
1894 | korean-air | Josiah Peachey | 2015-07-27 | I flew Korean Air for the first time. The A380... | Economy | 8 |
1895 | emirates | Anthony Innarelli | 2015-07-28 | Great flight from JFK to Milan. The aircraft i... | Economy | 9 |
1896 | china-airlines | C McPhaden | 2015-07-07 | Left on time and arrived early on both flights... | Economy | 7 |
1897 rows × 6 columns
In [ ]:
Problem 2¶
In [3]:
# .str.len() is applied element by element, and it also accepts the list
# element ["a", "b"], which reports length 2 in the output.
pd.Series(["aasdf", "b", ["a", "b"]]).str.len()
Out[3]:
0 5 1 1 2 2 dtype: int64
In [4]:
def operate(df):
    """Overwrite ``df["content"]`` with the number of whitespace-separated tokens.

    The frame is modified IN PLACE and nothing is returned (the call evaluates
    to ``None``), so ``df = operate(df)`` discards the frame.

    The function is not idempotent: after one call the column holds integers,
    so a second call on the same frame raises ``AttributeError`` from the
    ``.str`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``"content"`` column.
    """
    # Split each review on whitespace, then count the resulting tokens.
    tokens_per_review = df["content"].str.split()
    df["content"] = tokens_per_review.str.len()
In [5]:
# Reload a fresh copy, then call operate once: "content" is replaced in place
# by the per-review token count.
reviews = pd.read_csv("reviews.csv", index_col=0)
operate(reviews)
In [6]:
# Intentional failure: the first call turns "content" into integers, so the
# second call's .str accessor raises AttributeError (see the traceback).
reviews = pd.read_csv("reviews.csv", index_col=0)
operate(reviews)
operate(reviews)
reviews
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[6], line 3 1 reviews = pd.read_csv("reviews.csv", index_col=0) 2 operate(reviews) ----> 3 operate(reviews) 4 reviews Cell In[4], line 2, in operate(df) 1 def operate(df): ----> 2 df["content"] = df["content"].str.split().str.len() File ~/miniforge3/lib/python3.11/site-packages/pandas/core/generic.py:5902, in NDFrame.__getattr__(self, name) 5895 if ( 5896 name not in self._internal_names_set 5897 and name not in self._metadata 5898 and name not in self._accessors 5899 and self._info_axis._can_hold_identifiers_and_holds_name(name) 5900 ): 5901 return self[name] -> 5902 return object.__getattribute__(self, name) File ~/miniforge3/lib/python3.11/site-packages/pandas/core/accessor.py:182, in CachedAccessor.__get__(self, obj, cls) 179 if obj is None: 180 # we're accessing the attribute of the class, i.e., Dataset.geo 181 return self._accessor --> 182 accessor_obj = self._accessor(obj) 183 # Replace the property with the accessor object. 
Inspired by: 184 # https://www.pydanny.com/cached-property.html 185 # We need to use object.__setattr__ because we overwrite __setattr__ on 186 # NDFrame 187 object.__setattr__(obj, self._name, accessor_obj) File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:181, in StringMethods.__init__(self, data) 178 def __init__(self, data) -> None: 179 from pandas.core.arrays.string_ import StringDtype --> 181 self._inferred_dtype = self._validate(data) 182 self._is_categorical = is_categorical_dtype(data.dtype) 183 self._is_string = isinstance(data.dtype, StringDtype) File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:235, in StringMethods._validate(data) 232 inferred_dtype = lib.infer_dtype(values, skipna=True) 234 if inferred_dtype not in allowed_types: --> 235 raise AttributeError("Can only use .str accessor with string values!") 236 return inferred_dtype AttributeError: Can only use .str accessor with string values!
In [7]:
# operate has no return statement, so this assignment rebinds reviews to None.
reviews = pd.read_csv("reviews.csv", index_col=0)
reviews = operate(reviews)
In [8]:
# reviews was rebound to operate's return value in the previous cell; the cell
# renders no output.
reviews
In [9]:
# Intentional failure: the first assignment sets reviews to None, so the second
# call evaluates None["content"] and raises TypeError (see the traceback).
reviews = pd.read_csv("reviews.csv", index_col=0)
reviews = operate(reviews)
reviews = operate(reviews)
reviews
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[9], line 3 1 reviews = pd.read_csv("reviews.csv", index_col=0) 2 reviews = operate(reviews) ----> 3 reviews = operate(reviews) 4 reviews Cell In[4], line 2, in operate(df) 1 def operate(df): ----> 2 df["content"] = df["content"].str.split().str.len() TypeError: 'NoneType' object is not subscriptable
Problem 3¶
In [19]:
# Reload the reviews (earlier cells overwrote or rebound the name).
reviews = pd.read_csv("reviews.csv", index_col=0)
# Take 5 random rows and relabel them 0..4.
# NOTE(review): no random_state is passed to sample(), so the rows shown in the
# outputs below presumably change between runs; confirm whether that is intended.
littlereviews = reviews.sample(5).reset_index(drop=True)
littlereviews
Out[19]:
airline | author | date | content | cabin | overall | |
---|---|---|---|---|---|---|
0 | blue-air | Stelios Papadopoulos | 2015-07-06 | First time flying with a low cost airline so I... | Economy | 7 |
1 | etihad-airways | Jean-Pierre Ardinois | 2015-07-20 | Flight EY6298 - flight operated by South Afric... | Economy | 3 |
2 | sun-country-airlines | Javier Lopez | 2015-06-17 | Our travel experience was very good. Sun Count... | Economy | 10 |
3 | air-canada-rouge | D Routledge | 2015-06-30 | Boeing 767 on the way to Vegas and A319 on the... | Economy | 2 |
4 | virgin-australia | Suzanne Warmerdam | 2015-07-09 | Check in at Brisbane was poor, machines didn’t... | Economy | 9 |
In [20]:
# A boolean list passed to .loc acts as a mask: only the row whose entry is True
# (row 0 here) is kept.
littlereviews.loc[[True, False, False, False, False]]
Out[20]:
airline | author | date | content | cabin | overall | |
---|---|---|---|---|---|---|
0 | blue-air | Stelios Papadopoulos | 2015-07-06 | First time flying with a low cost airline so I... | Economy | 7 |
In [21]:
# n is the number of rows in littlereviews (5).
n = littlereviews.shape[0]
n
Out[21]:
5
In [22]:
# Seed the global NumPy RNG so the draw is repeatable, then draw n independent
# True/False values.
np.random.seed(23)
np.random.choice([True, False], size=n, replace=True)
Out[22]:
array([False, True, True, False, True])
In [23]:
# Same seed, so the same mask as the previous cell. Used with .loc it keeps rows
# 1, 2 and 4 in their original order; a mask can never repeat a row, and the
# number of rows kept depends on how many True values were drawn.
np.random.seed(23)
littlereviews.loc[np.random.choice([True, False], size=n, replace=True)]
Out[23]:
airline | author | date | content | cabin | overall | |
---|---|---|---|---|---|---|
1 | etihad-airways | Jean-Pierre Ardinois | 2015-07-20 | Flight EY6298 - flight operated by South Afric... | Economy | 3 |
2 | sun-country-airlines | Javier Lopez | 2015-06-17 | Our travel experience was very good. Sun Count... | Economy | 10 |
4 | virgin-australia | Suzanne Warmerdam | 2015-07-09 | Check in at Brisbane was poor, machines didn’t... | Economy | 9 |
In [24]:
# The integer labels 0..n-1, matching littlereviews' reset index.
np.arange(n)
Out[24]:
array([0, 1, 2, 3, 4])
In [25]:
# A random reordering of the labels: every label appears exactly once.
np.random.seed(23)
np.random.permutation(np.arange(n))
Out[25]:
array([4, 1, 0, 2, 3])
In [26]:
# Indexing with the permuted labels returns all n rows, shuffled
# (order 4, 1, 0, 2, 3 with this seed).
np.random.seed(23)
littlereviews.loc[np.random.permutation(np.arange(n))]
Out[26]:
airline | author | date | content | cabin | overall | |
---|---|---|---|---|---|---|
4 | virgin-australia | Suzanne Warmerdam | 2015-07-09 | Check in at Brisbane was poor, machines didn’t... | Economy | 9 |
1 | etihad-airways | Jean-Pierre Ardinois | 2015-07-20 | Flight EY6298 - flight operated by South Afric... | Economy | 3 |
0 | blue-air | Stelios Papadopoulos | 2015-07-06 | First time flying with a low cost airline so I... | Economy | 7 |
2 | sun-country-airlines | Javier Lopez | 2015-06-17 | Our travel experience was very good. Sun Count... | Economy | 10 |
3 | air-canada-rouge | D Routledge | 2015-06-30 | Boeing 767 on the way to Vegas and A319 on the... | Economy | 2 |
In [27]:
# Drawing n labels WITH replacement: labels may repeat or be missing
# (row 0 appears twice and row 2 is absent in the output).
np.random.seed(23)
littlereviews.loc[np.random.choice(np.arange(n), size=n, replace=True)]
Out[27]:
airline | author | date | content | cabin | overall | |
---|---|---|---|---|---|---|
3 | air-canada-rouge | D Routledge | 2015-06-30 | Boeing 767 on the way to Vegas and A319 on the... | Economy | 2 |
0 | blue-air | Stelios Papadopoulos | 2015-07-06 | First time flying with a low cost airline so I... | Economy | 7 |
1 | etihad-airways | Jean-Pierre Ardinois | 2015-07-20 | Flight EY6298 - flight operated by South Afric... | Economy | 3 |
0 | blue-air | Stelios Papadopoulos | 2015-07-06 | First time flying with a low cost airline so I... | Economy | 7 |
4 | virgin-australia | Suzanne Warmerdam | 2015-07-09 | Check in at Brisbane was poor, machines didn’t... | Economy | 9 |
In [28]:
# Drawing all n labels WITHOUT replacement yields each row exactly once, i.e. a
# shuffle; with this seed the order matches the np.random.permutation cell.
np.random.seed(23)
littlereviews.loc[np.random.choice(np.arange(n), size=n, replace=False)]
Out[28]:
airline | author | date | content | cabin | overall | |
---|---|---|---|---|---|---|
4 | virgin-australia | Suzanne Warmerdam | 2015-07-09 | Check in at Brisbane was poor, machines didn’t... | Economy | 9 |
1 | etihad-airways | Jean-Pierre Ardinois | 2015-07-20 | Flight EY6298 - flight operated by South Afric... | Economy | 3 |
0 | blue-air | Stelios Papadopoulos | 2015-07-06 | First time flying with a low cost airline so I... | Economy | 7 |
2 | sun-country-airlines | Javier Lopez | 2015-06-17 | Our travel experience was very good. Sun Count... | Economy | 10 |
3 | air-canada-rouge | D Routledge | 2015-06-30 | Boeing 767 on the way to Vegas and A319 on the... | Economy | 2 |
Problem 4¶
In [29]:
# Pull out the author column as a Series (one entry per review).
authors = reviews["author"]
authors
Out[29]:
0 A Sahni 1 Peter Pomeranze 2 David Olvierk 3 Narges Noori 4 Joe Rossi ... 1892 Teri Hake 1893 Brett Kero 1894 Josiah Peachey 1895 Anthony Innarelli 1896 C McPhaden Name: author, Length: 1897, dtype: object
In [30]:
# Number of reviews per author, most prolific first (1610 distinct authors).
authors.value_counts()
Out[30]:
Bojan Tercon 8 D Brown 7 A Ray 7 Bob Motto 6 Rob Bowden 6 .. Javier Lopez 1 L Laplanche 1 Andrew Exelby 1 Shokhzod Yakubov 1 C McPhaden 1 Name: author, Length: 1610, dtype: int64
In [31]:
# value_counts of value_counts: index = reviews written, value = number of
# authors who wrote that many. .iloc[-2] is the second-to-last VALUE by position.
authors.value_counts().value_counts().iloc[-2]
Out[31]:
2
In [32]:
# .iloc[7] is positional: the eighth entry, which is the last one here (value 1).
authors.value_counts().value_counts().iloc[7]
Out[32]:
1
In [33]:
# .loc[7] is label-based: the number of authors who wrote exactly 7 reviews (2).
authors.value_counts().value_counts().loc[7]
Out[33]:
2
In [34]:
# .index[-2] returns the LABEL (a review count) of the second-to-last entry, not
# its value.
authors.value_counts().value_counts().index[-2]
Out[34]:
7
In [35]:
# Distribution of reviews-per-author: index = number of reviews written,
# value = number of authors with that many reviews.
double = authors.value_counts().value_counts()
double
Out[35]:
1 1434 2 119 3 33 5 8 4 7 6 6 7 2 8 1 Name: author, dtype: int64
In [36]:
# Bar chart of the reviews-per-author distribution held in `double`
# (index = reviews written, values = number of authors).
# When raw arrays are passed, plotly express names the axes "x" and "y";
# `labels` replaces those with readable names so the figure stands alone.
fig = px.bar(
    x=double.index,
    y=double.values,
    labels={"x": "Reviews written by an author", "y": "Number of authors"},
    title="Number of authors by number of reviews written",
)
fig.show()
Problem 8¶
In [37]:
# One row per airline, one column per cabin; each cell holds the "max" of the
# author names for that airline/cabin pair (NaN where the pair has no reviews).
framer = reviews.pivot_table(
    values="author",
    index="airline",
    columns="cabin",
    aggfunc="max",
)
In [38]:
# Display the pivot table (188 airlines × 4 cabins).
framer
Out[38]:
cabin | Business | Economy | First | Premium Economy |
---|---|---|---|---|
airline | ||||
aegean-airlines | P Vlogianitis | Teresa McCann | NaN | NaN |
aer-lingus | S Whyte | L Pulliam | NaN | NaN |
aeroflot-russian-airlines | H Dalem | Vsevolod Andreev | NaN | Dan Korn |
aerolineas-argentinas | M Studzinski | Hilarion Martinez | NaN | NaN |
aeromexico | NaN | Marco Flores | Jose Granados | NaN |
... | ... | ... | ... | ... |
vueling-airlines | Paul Cartwright | Widya Gunawan | S Becker | NaN |
westjet | NaN | William McDonough | Steve Hughes | R Sawyer |
wizz-air | NaN | Urooj Qureshi | NaN | NaN |
wow-air | NaN | Rio Prariyadi | NaN | NaN |
xl-airways-france | NaN | Yleana Castillo | NaN | NaN |
188 rows × 4 columns
In [39]:
# Single-cell lookup by row label and column label: the aggregated author name
# for Emirates economy reviews.
framer.loc["emirates", "Economy"]
Out[39]:
'Y Areemitr'
In [40]:
# Boolean frame of the same shape: True where the airline/cabin cell is NaN.
framer.isna()
Out[40]:
cabin | Business | Economy | First | Premium Economy |
---|---|---|---|---|
airline | ||||
aegean-airlines | False | False | True | True |
aer-lingus | False | False | True | True |
aeroflot-russian-airlines | False | False | True | False |
aerolineas-argentinas | False | False | True | True |
aeromexico | True | False | False | True |
... | ... | ... | ... | ... |
vueling-airlines | False | False | False | True |
westjet | True | False | False | False |
wizz-air | True | False | True | True |
wow-air | True | False | True | True |
xl-airways-france | True | False | True | True |
188 rows × 4 columns
In [41]:
# Summing the booleans down the rows (axis=0) counts the NaN cells per cabin
# column; the result is a Series indexed by cabin.
framer.isna().sum(axis=0)
Out[41]:
cabin Business 102 Economy 6 First 152 Premium Economy 162 dtype: int64
In [42]:
# Same expression as the previous cell: NaN count per cabin column.
framer.isna().sum(axis=0)
Out[42]:
cabin Business 102 Economy 6 First 152 Premium Economy 162 dtype: int64
In [43]:
# The result is one-dimensional, so .shape is the 1-tuple (4,).
framer.isna().sum(axis=0).shape
Out[43]:
(4,)
In [44]:
# Intentional failure: the shape tuple has a single entry, so index 1 is out of
# range and raises IndexError.
framer.isna().sum(axis=0).shape[1]
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[44], line 1 ----> 1 framer.isna().sum(axis=0).shape[1] IndexError: tuple index out of range
In [45]:
# Intentional failure: the Series holds integers (dtype int64), so the .str
# accessor raises AttributeError.
framer.isna().sum(axis=0).str.len()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[45], line 1 ----> 1 framer.isna().sum(axis=0).str.len() File ~/miniforge3/lib/python3.11/site-packages/pandas/core/generic.py:5902, in NDFrame.__getattr__(self, name) 5895 if ( 5896 name not in self._internal_names_set 5897 and name not in self._metadata 5898 and name not in self._accessors 5899 and self._info_axis._can_hold_identifiers_and_holds_name(name) 5900 ): 5901 return self[name] -> 5902 return object.__getattribute__(self, name) File ~/miniforge3/lib/python3.11/site-packages/pandas/core/accessor.py:182, in CachedAccessor.__get__(self, obj, cls) 179 if obj is None: 180 # we're accessing the attribute of the class, i.e., Dataset.geo 181 return self._accessor --> 182 accessor_obj = self._accessor(obj) 183 # Replace the property with the accessor object. Inspired by: 184 # https://www.pydanny.com/cached-property.html 185 # We need to use object.__setattr__ because we overwrite __setattr__ on 186 # NDFrame 187 object.__setattr__(obj, self._name, accessor_obj) File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:181, in StringMethods.__init__(self, data) 178 def __init__(self, data) -> None: 179 from pandas.core.arrays.string_ import StringDtype --> 181 self._inferred_dtype = self._validate(data) 182 self._is_categorical = is_categorical_dtype(data.dtype) 183 self._is_string = isinstance(data.dtype, StringDtype) File ~/miniforge3/lib/python3.11/site-packages/pandas/core/strings/accessor.py:235, in StringMethods._validate(data) 232 inferred_dtype = lib.infer_dtype(values, skipna=True) 234 if inferred_dtype not in allowed_types: --> 235 raise AttributeError("Can only use .str accessor with string values!") 236 return inferred_dtype AttributeError: Can only use .str accessor with string values!