Exploratory Data Analysis and Visualizations of the Titanic Dataset¶

The Titanic Dataset on Kaggle is likely the most popular dataset for newbies in Data Science, and for good reasons too. One of these reasons is, of course, its pop culture appeal thanks to the 1997 movie Titanic, directed by James Cameron. Also, as an introduction to machine learning, it gives beginners the opportunity to build models that predict whether a given passenger on the Titanic shipwreck survived (classification), based on features in the data such as age, sex and the fare paid for the trip.

This field of Machine Learning is broadly labelled Supervised Learning. Supervised Learning is the branch of machine learning (ML) that involves predicting labels, such as 'Survived' or 'Not Survived'. Such models:

  • learn from labelled data, e.g. data that includes whether a passenger survived (called model training).
  • and then predict on unlabelled data (see the minimal sketch just below).
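As a minimal sketch of that workflow (an illustration only: it assumes scikit-learn, which isn't used elsewhere in this notebook, and a tiny made-up two-column frame):

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Labelled data: a feature column plus the label we want to learn ('Survived')
labelled = pd.DataFrame({'Fare': [7.25, 71.28, 7.92, 53.10],
                         'Survived': [0, 1, 1, 1]})

model = LogisticRegression()
model.fit(labelled[['Fare']], labelled['Survived'])    # model training on labelled data

# Unlabelled data: the trained model predicts the missing label
unlabelled = pd.DataFrame({'Fare': [8.05, 80.00]})
print(model.predict(unlabelled))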

But before building your machine learning model, it's important to get familiar with the dataset, make some assumptions about correlations between features and identify any hidden patterns within the data. It is also important to identify any outliers or missing data before feeding the data into the model. This process is called Exploratory Data Analysis.

Approach¶

Here's a simplified approach to supervised learning:

  • Exploratory Data Analysis (EDA) & cleaning;
  • Feature engineering;
  • Model building;
  • Iterate to get a model that performs better.

This project will focus only on the Exploratory Data Analysis, using Plotly Express to visualize the data!

Import your data and check it out¶

In [1]:
# Import libraries to be used
import pandas as pd
import plotly.express as px
In [2]:
# Import test and train datasets
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# View first lines of training data
df_train.head()
Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
  • What are all these features? Check out the Kaggle data documentation here for a deeper understanding of the columns.

Important note on terminology:

  • variables/columns are known as features (or predictor variables).
In [3]:
# View first lines of test data
df_test.head()
Out[3]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
  • Use the DataFrame .info() method to check out datatypes, missing values and more (of df_train).
In [4]:
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
  • Use the DataFrame .describe() method to check out summary statistics of numeric columns (of df_train).
In [5]:
df_train.describe()
Out[5]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

Recap:

  • The data was loaded successfully and inspected. When using this data to build a machine learning model in the future, it's worth bearing in mind that there are a number of missing (null) values in the dataset that will need to be treated; a quick way to count them is shown below.
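A small sketch, using the df_train already loaded, to count those missing values per column:

# Number of missing values in each column of the training set
df_train.isnull().sum()

From the .info() output above, the columns with gaps are Age (177 missing), Cabin (687 missing) and Embarked (2 missing).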

Up next: Explore the data visually and draw inferences!

Visual exploratory data analysis¶

  • Use plotly to build a histogram plot, showing the count of Titanic survivors.
In [6]:
fig = px.histogram(data_frame=df_train, x=df_train['Survived'].astype('string'), color='Survived')
fig.show()
# The values in the 'Survived' column are converted to strings so the feature is treated as categorical rather than numeric

Take-away: In the training set, fewer people survived than didn't.
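A quick non-visual check of the same counts (a small pandas sketch):

# Count of non-survivors (0) vs. survivors (1) in the training set
df_train['Survived'].value_counts()

This should return 549 non-survivors against 342 survivors, matching the bar heights above.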

Up next: More EDA.

EDA on feature variables¶

  • Use plotly to build a histogram plot of the Titanic dataset feature 'Sex' (of df_train).
In [7]:
fig = px.histogram(data_frame=df_train, x ='Sex', color='Sex')
fig.show()
  • Use plotly to build histogram plots of the Titanic dataset feature 'Sex', first split (faceted) over the feature 'Survived' and then coloured by it.
In [8]:
fig = px.histogram(data_frame=df_train, x ='Sex', facet_col='Survived', color='Sex')
fig.show()
In [9]:
fig = px.histogram(data_frame=df_train, x ='Sex', color='Survived')
fig.show()

Take-away: Distribution of passengers according to gender shows women were more likely to survive than men.

  • Use pandas to calculate the number of women and men who survived or not.
In [10]:
sex_group = df_train.groupby('Sex')
sex_group['Survived'].value_counts()
Out[10]:
Sex     Survived
female  1           233
        0            81
male    0           468
        1           109
Name: Survived, dtype: int64
In [11]:
sex_group['Survived'].sum()
# An alternative way to calculate the number of survivors by gender, since the 'Survived' column is encoded as 1s and 0s.
Out[11]:
Sex
female    233
male      109
Name: Survived, dtype: int64
  • Use pandas to figure out the proportion of women that survived, along with the proportion of men:
In [12]:
print(df_train[df_train.Sex == 'female'].Survived.sum()/df_train[df_train.Sex == 'female'].Survived.count())
print(df_train[df_train.Sex == 'male'].Survived.sum()/df_train[df_train.Sex == 'male'].Survived.count())
0.7420382165605095
0.18890814558058924

~74% of women survived, while ~19% of men survived.
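A more concise way to get the same proportions (a sketch; the cell above uses boolean filtering instead) relies on the fact that the mean of a 0/1 column is the proportion of 1s:

# Survival rate by sex
df_train.groupby('Sex')['Survived'].mean()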

Recap:

  • loaded in the data and had a look at it.
  • explored the 'Survived' variable visually.

Up next: EDA of other feature variables, categorical and numeric.

Explore your data more!¶

  • Use plotly to build histogram plots of the Titanic dataset feature 'Survived' split (faceted) over the feature 'Pclass'.
In [13]:
fig = px.histogram(data_frame=df_train, x =df_train['Survived'].astype('string'), facet_col='Pclass', color='Survived')
fig.show()

Take-away: The chart suggests that, tragically, passengers in the lower classes of the ship were less likely to have survived the wreck. Conversely, it appears that passengers in first class were more likely to survive, perhaps by virtue of their social status.
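One quick numeric check of this (a sketch, not part of the original cells) is the survival rate within each passenger class:

# Proportion of survivors in each passenger class
df_train.groupby('Pclass')['Survived'].mean()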

  • Use plotly to build bar plots of the Titanic dataset feature 'Survived' split (faceted) over the feature 'Embarked'.
In [14]:
fig = px.histogram(data_frame=df_train, x =df_train['Survived'].astype('string'), color='Embarked')
fig.show()
# returns an error
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Input In [14], in <cell line: 1>()
----> 1 fig = px.histogram(data_frame=df_train, x =df_train['Survived'].astype('string'), color='Embarked')
      2 fig.show()

File ~\anaconda3\lib\site-packages\plotly\express\_chart_types.py:472, in histogram(data_frame, x, y, color, pattern_shape, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, hover_name, hover_data, animation_frame, animation_group, category_orders, labels, color_discrete_sequence, color_discrete_map, pattern_shape_sequence, pattern_shape_map, marginal, opacity, orientation, barmode, barnorm, histnorm, log_x, log_y, range_x, range_y, histfunc, cumulative, nbins, text_auto, title, template, width, height)
    426 def histogram(
    427     data_frame=None,
    428     x=None,
   (...)
    464     height=None,
    465 ):
    466     """
    467     In a histogram, rows of `data_frame` are grouped together into a
    468     rectangular mark to visualize the 1D distribution of an aggregate
    469     function `histfunc` (e.g. the count or sum) of the value `y` (or `x` if
    470     `orientation` is `'h'`).
    471     """
--> 472     return make_figure(
    473         args=locals(),
    474         constructor=go.Histogram,
    475         trace_patch=dict(
    476             histnorm=histnorm, histfunc=histfunc, cumulative=dict(enabled=cumulative),
    477         ),
    478         layout_patch=dict(barmode=barmode, barnorm=barnorm),
    479     )

File ~\anaconda3\lib\site-packages\plotly\express\_core.py:1988, in make_figure(args, constructor, trace_patch, layout_patch)
   1986 facet_col_wrap = args.get("facet_col_wrap", 0)
   1987 for group_name in sorted_group_names:
-> 1988     group = grouped.get_group(group_name if len(group_name) > 1 else group_name[0])
   1989     mapping_labels = OrderedDict()
   1990     trace_name_labels = OrderedDict()

File ~\anaconda3\lib\site-packages\pandas\core\groupby\groupby.py:747, in BaseGroupBy.get_group(self, name, obj)
    745 inds = self._get_index(name)
    746 if not len(inds):
--> 747     raise KeyError(name)
    749 return obj._take_with_is_copy(inds, axis=self.axis)

KeyError: (nan, '', '', '', '')
In [15]:
# returns an error because there are NaN values in the 'Embarked' column. Let's take a look at the count.
In [16]:
df_train['Embarked'].value_counts(dropna=False)
# 2 NaN values. Could be stowaways.
Out[16]:
S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64
In [17]:
# Makes sense to fill the NaN values with 'S' as that is the most common port of embarkation
fig = px.histogram(data_frame=df_train, x =df_train['Survived'].astype('string'), facet_col=df_train['Embarked'].fillna('S'), color='Survived')
fig.show()

Take-away: The chart suggests that passengers who embarked at Southampton were the least likely to survive the wreck. Perhaps there is a correlation between the port of embarkation and the class of ticket bought.
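That hypothesis could be checked with a cross-tabulation of port against ticket class; a small sketch:

# How ticket classes are distributed across the ports of embarkation
pd.crosstab(df_train['Embarked'], df_train['Pclass'])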

EDA with numeric variables¶

  • Use plotly to plot a histogram of the 'Fare' column of df_train.
In [18]:
fig = px.histogram(data_frame=df_train, x='Fare')
fig.show()

Take-away: The distribution of fares paid is skewed to the right. The majority of tickets bought for the trip cost less than 100 (currency not specified).
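With a right-skewed distribution like this, the long tail is easier to read with more bins and a log-scaled count axis; a quick sketch:

# Same histogram, with more bins and a logarithmic y-axis to expose the right tail
fig = px.histogram(data_frame=df_train, x='Fare', nbins=50, log_y=True)
fig.show()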

  • Use plotly to plot a histogram of the 'Age' column of df_train. Hint: Before plotting, check whether there are any null values; you may need to drop them first.
In [19]:
df_train['Age'].isna().sum()
# 177 records without age. These records will be dropped for visualizing, but during cleaning we would fill them with either the mean, mode or median age.
Out[19]:
177
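For that later cleaning step, filling the missing ages might look like this (a sketch kept in a separate variable so the EDA below is unaffected; the choice between mean, mode and median would be made during cleaning):

# Example: impute missing ages with the median age (illustration only)
age_filled = df_train['Age'].fillna(df_train['Age'].median())
age_filled.isna().sum()    # 0 missing values after imputation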
In [20]:
fig = px.histogram(data_frame=df_train, x=df_train['Age'].dropna())
fig.show()

Take-away: Age distribution of passengers looks slightly skewed to the right.

  • Plot a strip plot of 'Fare' with 'Survived' on the x-axis.
In [21]:
fig = px.strip(df_train, x=df_train["Survived"], y="Fare")
fig.show()

Take-away: With the strip plot, we can easily identify the outliers in the 'Fare' feature.
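A box plot gives a complementary view, since it draws the points beyond the whiskers individually as outliers; a sketch with Plotly Express:

# Box plot of Fare by survival; outliers appear as individual points beyond the whiskers
fig = px.box(data_frame=df_train, x=df_train['Survived'].astype('string'), y='Fare')
fig.show()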

  • Use the DataFrame method .describe() to check out summary statistics of 'Fare' as a function of survival.
In [22]:
not_survived = df_train['Survived'] == 0
df_train.loc[not_survived, 'Fare'].describe()
Out[22]:
count    549.000000
mean      22.117887
std       31.388207
min        0.000000
25%        7.854200
50%       10.500000
75%       26.000000
max      263.000000
Name: Fare, dtype: float64
In [23]:
survived = df_train['Survived'] == 1
df_train.loc[survived, 'Fare'].describe()
Out[23]:
count    342.000000
mean      48.395408
std       66.596998
min        0.000000
25%       12.475000
50%       26.000000
75%       57.000000
max      512.329200
Name: Fare, dtype: float64
  • Use plotly to plot a scatter plot of 'Age' against 'Fare', colored by 'Survived'.
In [24]:
fig = px.scatter(data_frame=df_train, x='Age', y='Fare', color='Survived')
fig.show()

Take-away: Another interesting detail that pops up is that many of the survivors were young people between the ages of 0 and ~18. If true, this is unsurprising taking into account the Birkenhead drill, a code of conduct whereby the lives of women and children were to be saved first in a life-threatening situation, typically when abandoning ship with limited survival resources such as lifeboats.
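That impression can be sanity-checked directly (a quick sketch; the cut-off of 18 is an assumption):

# Survival rate for passengers aged 18 or under, compared with the overall rate
print(df_train.loc[df_train['Age'] <= 18, 'Survived'].mean())
print(df_train['Survived'].mean())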

Recap:

  • Successfully loaded in the data and had a look at it.
  • Explored the target variable ('Survived') visually.
  • Explored the feature variables visually, both categorical and numeric.