The purpose of this notebook is to study the Titanic dataset and select relevant features in order to predict whether a passenger survived the shipwreck.
This notebook will use functions from the _titanic module to preprocess the data.
# Adding relative path for imports
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import numpy as np
import _titanic
train = pd.read_csv('../_data/titanic_train.csv')
train.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
As you can see above, the dataset has 11 features and the target feature "Survived". The 11 features are as follows:

- PassengerId: an ID ranging from one to the number of passengers
- Pclass: the ticket class
- Name: the name of the passenger
- Sex: the sex of the passenger
- Age: the age in years
- SibSp: the number of siblings/spouses aboard the Titanic
- Parch: the number of parents/children aboard the Titanic
- Ticket: the ticket number
- Fare: the price of the ticket
- Cabin: the cabin number
- Embarked: the port of embarkation

We can make a first prediction using three of these features: SibSp, Parch, and Fare. For the first two, we can assume that the more family relations a passenger had on the ship, the more likely they were to survive. For Fare, we can assume that the more expensive the ticket, the wealthier the passenger and the higher their probability of surviving.
model1_cols = ['SibSp', 'Parch', 'Fare']
X, y = _titanic.parse_model(train.copy(), name_Y='Survived', use_columns=model1_cols)
_titanic.logmodel_prediction(X, y, 0.3, 42)
              precision    recall  f1-score   support

           0       0.65      0.94      0.77       157
           1       0.76      0.28      0.41       111

    accuracy                           0.66       268
   macro avg       0.70      0.61      0.59       268
weighted avg       0.69      0.66      0.62       268

score : 0.664179104477612
This prediction is far from accurate, but it gives us a baseline model to which we can add features.
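For reference, here is a minimal sketch of what the two _titanic helpers used above might do, assuming they are thin wrappers around scikit-learn (the actual module may differ):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def parse_model(df, name_Y, use_columns):
    # Split a dataframe into a feature matrix X and a target vector y.
    return df[use_columns], df[name_Y]

def logmodel_prediction(X, y, test_size, random_state):
    # Hold out a test set, fit a logistic regression, and report metrics.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    print(classification_report(y_test, model.predict(X_test)))
    print('score :', model.score(X_test, y_test))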
In order to choose additional features, one method is to use a correlation matrix to see which features are most correlated with survival.
import seaborn as sn
sn.heatmap(train.corr(numeric_only=True), annot=True)  # numeric_only avoids errors on string columns in recent pandas
Among the features not yet selected, the two with the highest absolute correlation with Survived are Pclass and Age.
Let us study these two features and the impact they have on our current model.
dead = train[train['Survived']==0]
survived = train[train['Survived']==1]
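The figures themselves are not reproduced here; as a rough idea, _titanic.plot_hist presumably overlays the distribution of one feature for the two subsets, along these lines (an assumption, not the module's actual code):

import matplotlib.pyplot as plt

def plot_hist(col, label_a, label_b, df_a, df_b, bins=20):
    # Overlay two histograms of the same column for easy comparison.
    plt.hist(df_a[col].dropna(), bins=bins, alpha=0.5, label=label_a)
    plt.hist(df_b[col].dropna(), bins=bins, alpha=0.5, label=label_b)
    plt.xlabel(col)
    plt.legend()
    plt.show()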
_titanic.plot_hist('Pclass', 'Dead', 'Survived', dead, survived)
As seen in the previous plot, the Pclass feature has a strong impact on survival: passengers in first class were more likely to survive than those in third class. Thus, we add Pclass to our model.
model2 = train[model1_cols+['Survived', 'Pclass']]
model2_cols = model1_cols + ['Pclass']
X2, y2 = _titanic.parse_model(model2, name_Y='Survived', use_columns=model2_cols)
_titanic.logmodel_prediction(X2, y2, 0.3, 101)
              precision    recall  f1-score   support

           0       0.67      0.84      0.74       154
           1       0.67      0.43      0.52       114

    accuracy                           0.67       268
   macro avg       0.67      0.64      0.63       268
weighted avg       0.67      0.67      0.65       268

score : 0.667910447761194
Adding the Pclass feature only slightly increases the score.
_titanic.plot_hist('Age', 'Dead', 'Survived', dead, survived)
First, we check whether there are missing values.
total = train['Age'].isnull().sum()
percent = total / len(train) * 100  # share of missing Age values
print(percent)
19.865319865319865
Since approximately a fifth of the values are missing, the first step in using the Age feature is to fill them in. As seen in the correlation matrix, Age is most correlated with Pclass. Thus, we fill the missing Age values with the median age of each Pclass.
train2 = train.copy()
_titanic.fill_with_median(train2, 'Age', 'Pclass')
train2['Age'] = train2['Age'].astype('int')
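A plausible implementation of _titanic.fill_with_median, assuming it fills in place using the per-group median (the actual helper may differ):

def fill_with_median(df, col, by):
    # Replace missing values in `col` with the median of `col`
    # computed within each value of `by` (e.g. each Pclass).
    df[col] = df[col].fillna(df.groupby(by)[col].transform('median'))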
We can now add the Age feature to the model.
model3_cols = model2_cols+['Age']
model3 = model2.join(train2['Age'])
X3, y3 = _titanic.parse_model(model3, name_Y="Survived", use_columns=model3_cols)
_titanic.logmodel_prediction(X3, y3, 0.3, 42)
              precision    recall  f1-score   support

           0       0.71      0.90      0.79       157
           1       0.77      0.48      0.59       111

    accuracy                           0.72       268
   macro avg       0.74      0.69      0.69       268
weighted avg       0.73      0.72      0.71       268

score : 0.7238805970149254
Adding both features significantly improved the score. However, the distribution of the Age feature suggests that age categories would be more appropriate than the raw ages.
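The categorization below relies on _titanic.div_cat; a plausible sketch, assuming equal-width binning with pd.cut (the actual helper may differ):

def div_cat(series, n):
    # Bin a numeric series into n equal-width categories labelled 0..n-1.
    return pd.cut(series, bins=n, labels=False)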
ageCat = pd.Series(_titanic.div_cat(train2['Age'], 5), name="ageCat")
model4 = model2.join(ageCat)
model4_cols = model2_cols + ['ageCat']
X4, y4 = _titanic.parse_model(model4, name_Y="Survived", use_columns=model4_cols)
_titanic.logmodel_prediction(X4, y4, 0.3, 101)
              precision    recall  f1-score   support

           0       0.69      0.86      0.76       154
           1       0.72      0.46      0.56       114

    accuracy                           0.69       268
   macro avg       0.70      0.66      0.66       268
weighted avg       0.70      0.69      0.68       268

score : 0.6940298507462687
Another supposition is that the sex of a passenger had an impact on whether they survived: women would have been rescued more frequently than men. Let us check this assumption.
_titanic.plot_hist('Sex', 'Dead', 'Survived', dead, survived)
Considering the disparity in the proportion of survivors between the two sexes, this feature will be added to the model. However, the current Sex feature is not numerical. We must create a feature is_male with value 0 if the passenger is female and 1 if male.
is_male = pd.get_dummies(train2['Sex'], drop_first=True)
model5 = model4.join(is_male)
model5_cols = model4_cols + ['male']
X5, y5 = _titanic.parse_model(model5, name_Y="Survived", use_columns=model5_cols)
_titanic.logmodel_prediction(X5, y5, 0.3, 101)
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       154
           1       0.77      0.65      0.70       114

    accuracy                           0.77       268
   macro avg       0.77      0.75      0.76       268
weighted avg       0.77      0.77      0.77       268

score : 0.7686567164179104
Among the remaining unused features, let us examine the Name feature, and more particularly, the title of each passenger.
titles = []
for i in range(len(train2)):
    # Extract the title between the comma and the period in each name.
    titles.append(train['Name'][i].split(',')[1].split('.')[0].strip())
titles = np.array(titles)
np.unique(titles)
array(['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess'], dtype='<U12')
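As an aside, the same titles can be extracted without an explicit loop, using pandas string methods (an equivalent alternative, not the original code):

titles_vec = train['Name'].str.extract(r',\s*([^.]+)\.', expand=False).str.strip()
np.unique(titles_vec)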
Based on these titles, we can assume that passengers titled "Dr", "Master", or "the Countess" were more likely to be saved because they were deemed more important.
important = np.zeros(len(train))
for i in range(len(train2)):
    name = train2['Name'][i]
    title = name.split(',')[1].split('.')[0].strip()
    # Flag passengers whose title suggests higher status.
    if title == 'Dr' or title == 'Master' or title == 'the Countess':
        important[i] = 1
is_important = pd.Series(important, name='is_important')
model6 = model5.join(is_important)
model6_cols = model5_cols + ['is_important']
X6, y6 = _titanic.parse_model(model6, name_Y="Survived", use_columns=model6_cols)
_titanic.logmodel_prediction(X6, y6, 0.3, 102)
              precision    recall  f1-score   support

           0       0.87      0.88      0.87       167
           1       0.80      0.78      0.79       101

    accuracy                           0.84       268
   macro avg       0.83      0.83      0.83       268
weighted avg       0.84      0.84      0.84       268

score : 0.8432835820895522
_titanic.random_forest_prediction(X6, y6, 0.3, 404, 100)
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       153
           1       0.80      0.70      0.75       115

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268

score: 0.7985074626865671
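A minimal sketch of what _titanic.random_forest_prediction might do, assuming it mirrors logmodel_prediction with a random forest (the actual module may differ):

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def random_forest_prediction(X, y, test_size, random_state, n_estimators):
    # Same train/test protocol as before, with a random forest instead.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
    model.fit(X_train, y_train)
    print(classification_report(y_test, model.predict(X_test)))
    print('score:', model.score(X_test, y_test))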
_titanic.RFE_predicion(X6, y6, 0.3, 101, 8)
              precision    recall  f1-score   support

           0       0.81      0.84      0.83       154
           1       0.78      0.73      0.75       114

    accuracy                           0.79       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.79      0.79      0.79       268

score: 0.7947761194029851
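A plausible sketch of _titanic.RFE_predicion, assuming recursive feature elimination around a logistic regression that keeps the given number of features (an assumption; the real helper may differ):

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def RFE_predicion(X, y, test_size, random_state, n_features):
    # Recursively drop the weakest features, then evaluate on the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    selector = RFE(LogisticRegression(max_iter=1000),
                   n_features_to_select=n_features)
    selector.fit(X_train, y_train)
    print(classification_report(y_test, selector.predict(X_test)))
    print('score:', selector.score(X_test, y_test))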
_titanic.GSCV_prediction(X6, y6, 0.3, 64)
Parameters chosen: {'max_depth': 8, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 150}

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       165
           1       0.87      0.67      0.76       103

    accuracy                           0.84       268
   macro avg       0.85      0.80      0.82       268
weighted avg       0.84      0.84      0.83       268

score: 0.835820895522388
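A plausible sketch of _titanic.GSCV_prediction, assuming a grid search over random-forest hyper-parameters; the grid itself is a guess based on the parameters reported above:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

def GSCV_prediction(X, y, test_size, random_state):
    # Cross-validated grid search, then evaluation on a held-out test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    param_grid = {'n_estimators': [100, 150],
                  'max_depth': [8, 10],
                  'min_samples_split': [10],
                  'min_samples_leaf': [5]}
    grid = GridSearchCV(RandomForestClassifier(random_state=random_state),
                        param_grid, cv=5)
    grid.fit(X_train, y_train)
    print('Parameters chosen:', grid.best_params_)
    print(classification_report(y_test, grid.predict(X_test)))
    print('score:', grid.score(X_test, y_test))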
Lastly, let us dummify the Embarked feature and add the resulting port columns (C, Q, S) to the model.
embark = pd.get_dummies(train2['Embarked'])
model7 = model6.join(embark)
model7_cols = model6_cols+['C', 'Q', 'S']
X7, y7 = _titanic.parse_model(model7, name_Y="Survived", use_columns=model7_cols)
_titanic.logmodel_prediction(X7, y7, 0.3, 102)
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       167
           1       0.80      0.77      0.78       101

    accuracy                           0.84       268
   macro avg       0.83      0.83      0.83       268
weighted avg       0.84      0.84      0.84       268

score : 0.8395522388059702
_titanic.GSCV_prediction(X7, y7, 0.3, 64)
Parameters chosen: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       165
           1       0.88      0.71      0.78       103

    accuracy                           0.85       268
   macro avg       0.86      0.82      0.84       268
weighted avg       0.85      0.85      0.85       268

score: 0.8507462686567164
After numerous further attempts, in which we dummified the selected features and added others, we were unable to achieve a higher score.
The study of this dataset allowed us to choose features in order to predict the survival of a passenger.
In this study, the following features were chosen:

- SibSp
- Parch
- Fare
- Pclass
- Age
- Sex
- Name
- Embarked
Among these features, we dummified Sex and Embarked, and we built categorical features from Age and Name. Our study led to a prediction with a score of 85%.