diff --git a/README.md b/README.md
index 295a252..dc8fbad 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+[](https://classroom.github.com/a/yYhm1UCY)
# Introduction to Data Processing
## Introduction
diff --git a/titanic.csv b/titanic.csv
new file mode 100644
index 0000000..00dd924
--- /dev/null
+++ b/titanic.csv
@@ -0,0 +1,419 @@
+PassengerId,Survived
+892,0.0
+893,0.0
+894,0.0
+895,0.0
+896,0.0
+897,1.0
+898,0.0
+899,0.0
+900,1.0
+901,0.0
+902,0.0
+903,0.0
+904,0.0
+905,0.0
+906,1.0
+907,0.0
+908,1.0
+909,0.0
+910,0.0
+911,0.0
+912,0.0
+913,1.0
+914,0.0
+915,1.0
+916,1.0
+917,1.0
+918,1.0
+919,0.0
+920,1.0
+921,0.0
+922,0.0
+923,0.0
+924,0.0
+925,0.0
+926,0.0
+927,0.0
+928,0.0
+929,1.0
+930,0.0
+931,0.0
+932,0.0
+933,0.0
+934,0.0
+935,1.0
+936,0.0
+937,1.0
+938,0.0
+939,0.0
+940,1.0
+941,1.0
+942,0.0
+943,1.0
+944,1.0
+945,0.0
+946,0.0
+947,0.0
+948,0.0
+949,0.0
+950,0.0
+951,0.0
+952,0.0
+953,0.0
+954,0.0
+955,0.0
+956,1.0
+957,0.0
+958,1.0
+959,0.0
+960,0.0
+961,0.0
+962,0.0
+963,0.0
+964,1.0
+965,0.0
+966,0.0
+967,0.0
+968,0.0
+969,1.0
+970,0.0
+971,0.0
+972,0.0
+973,1.0
+974,0.0
+975,1.0
+976,1.0
+977,0.0
+978,0.0
+979,0.0
+980,0.0
+981,1.0
+982,0.0
+983,1.0
+984,0.0
+985,0.0
+986,1.0
+987,1.0
+988,0.0
+989,0.0
+990,0.0
+991,0.0
+992,0.0
+993,0.0
+994,0.0
+995,0.0
+996,0.0
+997,1.0
+998,1.0
+999,0.0
+1000,0.0
+1001,0.0
+1002,0.0
+1003,0.0
+1004,1.0
+1005,1.0
+1006,0.0
+1007,0.0
+1008,0.0
+1009,1.0
+1010,0.0
+1011,0.0
+1012,0.0
+1013,0.0
+1014,0.0
+1015,0.0
+1016,0.0
+1017,0.0
+1018,0.0
+1019,0.0
+1020,0.0
+1021,0.0
+1022,0.0
+1023,0.0
+1024,1.0
+1025,1.0
+1026,0.0
+1027,0.0
+1028,0.0
+1029,0.0
+1030,1.0
+1031,0.0
+1032,0.0
+1033,1.0
+1034,1.0
+1035,0.0
+1036,0.0
+1037,0.0
+1038,0.0
+1039,1.0
+1040,0.0
+1041,1.0
+1042,1.0
+1043,0.0
+1044,0.0
+1045,0.0
+1046,0.0
+1047,0.0
+1048,0.0
+1049,0.0
+1050,0.0
+1051,0.0
+1052,0.0
+1053,0.0
+1054,1.0
+1055,0.0
+1056,0.0
+1057,0.0
+1058,0.0
+1059,0.0
+1060,0.0
+1061,0.0
+1062,0.0
+1063,0.0
+1064,1.0
+1065,0.0
+1066,0.0
+1067,0.0
+1068,1.0
+1069,1.0
+1070,0.0
+1071,0.0
+1072,0.0
+1073,0.0
+1074,0.0
+1075,0.0
+1076,0.0
+1077,1.0
+1078,0.0
+1079,1.0
+1080,0.0
+1081,1.0
+1082,0.0
+1083,0.0
+1084,0.0
+1085,1.0
+1086,0.0
+1087,1.0
+1088,1.0
+1089,1.0
+1090,1.0
+1091,0.0
+1092,1.0
+1093,1.0
+1094,1.0
+1095,0.0
+1096,0.0
+1097,0.0
+1098,0.0
+1099,1.0
+1100,0.0
+1101,1.0
+1102,0.0
+1103,1.0
+1104,0.0
+1105,0.0
+1106,0.0
+1107,0.0
+1108,0.0
+1109,1.0
+1110,0.0
+1111,0.0
+1112,1.0
+1113,1.0
+1114,0.0
+1115,1.0
+1116,0.0
+1117,0.0
+1118,1.0
+1119,0.0
+1120,0.0
+1121,0.0
+1122,0.0
+1123,0.0
+1124,0.0
+1125,0.0
+1126,1.0
+1127,0.0
+1128,0.0
+1129,0.0
+1130,0.0
+1131,0.0
+1132,1.0
+1133,1.0
+1134,1.0
+1135,0.0
+1136,1.0
+1137,0.0
+1138,1.0
+1139,0.0
+1140,0.0
+1141,0.0
+1142,0.0
+1143,0.0
+1144,1.0
+1145,0.0
+1146,0.0
+1147,0.0
+1148,0.0
+1149,0.0
+1150,0.0
+1151,1.0
+1152,0.0
+1153,0.0
+1154,0.0
+1155,1.0
+1156,0.0
+1157,0.0
+1158,0.0
+1159,0.0
+1160,1.0
+1161,0.0
+1162,1.0
+1163,0.0
+1164,0.0
+1165,0.0
+1166,0.0
+1167,1.0
+1168,0.0
+1169,0.0
+1170,0.0
+1171,0.0
+1172,0.0
+1173,0.0
+1174,0.0
+1175,0.0
+1176,0.0
+1177,0.0
+1178,0.0
+1179,0.0
+1180,0.0
+1181,1.0
+1182,0.0
+1183,0.0
+1184,1.0
+1185,1.0
+1186,1.0
+1187,0.0
+1188,0.0
+1189,
+1190,
+1191,
+1192,
+1193,
+1194,
+1195,
+1196,
+1197,
+1198,
+1199,
+1200,
+1201,
+1202,
+1203,
+1204,
+1205,
+1206,
+1207,
+1208,
+1209,
+1210,
+1211,
+1212,
+1213,
+1214,
+1215,
+1216,
+1217,
+1218,
+1219,
+1220,
+1221,
+1222,
+1223,
+1224,
+1225,
+1226,
+1227,
+1228,
+1229,
+1230,
+1231,
+1232,
+1233,
+1234,
+1235,
+1236,
+1237,
+1238,
+1239,
+1240,
+1241,
+1242,
+1243,
+1244,
+1245,
+1246,
+1247,
+1248,
+1249,
+1250,
+1251,
+1252,
+1253,
+1254,
+1255,
+1256,
+1257,
+1258,
+1259,
+1260,
+1261,
+1262,
+1263,
+1264,
+1265,
+1266,
+1267,
+1268,
+1269,
+1270,
+1271,
+1272,
+1273,
+1274,
+1275,
+1276,
+1277,
+1278,
+1279,
+1280,
+1281,
+1282,
+1283,
+1284,
+1285,
+1286,
+1287,
+1288,
+1289,
+1290,
+1291,
+1292,
+1293,
+1294,
+1295,
+1296,
+1297,
+1298,
+1299,
+1300,
+1301,
+1302,
+1303,
+1304,
+1305,
+1306,
+1307,
+1308,
+1309,
diff --git a/titanic.ipynb b/titanic.ipynb
index cde5079..d90c3a9 100644
--- a/titanic.ipynb
+++ b/titanic.ipynb
@@ -1 +1,4113 @@
-{"cells":[{"cell_type":"markdown","metadata":{},"source":["
\n","# Ignore this"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import numpy as np # linear algebra\n","import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n","import matplotlib.pyplot as plt\n","plt.style.use(\"seaborn-v0_8-whitegrid\")\n","\n","import seaborn as sns\n","\n","from collections import Counter\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")"]},{"cell_type":"markdown","metadata":{},"source":["
\n","# Load and Check Data"]},{"cell_type":"markdown","metadata":{},"source":["DataFrames hold the dataset in a tabular format for easy manipulation and analysis.
\n","CSV data is read into 'df' using Pandas' read_csv() function."]},{"cell_type":"code","execution_count":null,"metadata":{"_kg_hide-input":true,"execution":{"iopub.execute_input":"2024-04-01T06:45:27.416192Z","iopub.status.busy":"2024-04-01T06:45:27.415763Z","iopub.status.idle":"2024-04-01T06:45:27.433162Z","shell.execute_reply":"2024-04-01T06:45:27.431944Z","shell.execute_reply.started":"2024-04-01T06:45:27.416105Z"},"trusted":true},"outputs":[],"source":["train_df = pd.read_csv(\"./data/train.csv\")"]},{"cell_type":"markdown","metadata":{},"source":["### 1. Try to read the test .csv file into test_df"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.436611Z","iopub.status.busy":"2024-04-01T06:45:27.435916Z","iopub.status.idle":"2024-04-01T06:45:27.449974Z","shell.execute_reply":"2024-04-01T06:45:27.448230Z","shell.execute_reply.started":"2024-04-01T06:45:27.436517Z"},"trusted":true},"outputs":[],"source":["test_df = pd.read_csv(\"./data/test.csv\")\n","test_PassengerId = test_df[\"PassengerId\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","execution":{"iopub.execute_input":"2024-04-01T06:45:27.452397Z","iopub.status.busy":"2024-04-01T06:45:27.451949Z","iopub.status.idle":"2024-04-01T06:45:27.462622Z","shell.execute_reply":"2024-04-01T06:45:27.461859Z","shell.execute_reply.started":"2024-04-01T06:45:27.452348Z"},"trusted":true},"outputs":[],"source":["print(\"The Columns of train_df are: \")\n","train_df.columns"]},{"cell_type":"markdown","metadata":{},"source":["### We can use head() to see the first few rows in the 
dataframe"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.464289Z","iopub.status.busy":"2024-04-01T06:45:27.463866Z","iopub.status.idle":"2024-04-01T06:45:27.491984Z","shell.execute_reply":"2024-04-01T06:45:27.491110Z","shell.execute_reply.started":"2024-04-01T06:45:27.464242Z"},"trusted":true},"outputs":[],"source":["train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.494326Z","iopub.status.busy":"2024-04-01T06:45:27.493637Z","iopub.status.idle":"2024-04-01T06:45:27.541999Z","shell.execute_reply":"2024-04-01T06:45:27.541210Z","shell.execute_reply.started":"2024-04-01T06:45:27.494251Z"},"jupyter":{"source_hidden":true},"trusted":true},"outputs":[],"source":["train_df.describe()"]},{"cell_type":"markdown","metadata":{},"source":["### 2. Without looking above, try checking the first few rows of test_df"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### 3. Now try checking for a description of test_df's data"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["# Variable Description\n","
\n"," PassengerId: unique id number to each passenger \n"," Survived: passenger survive(1) or died(0) \n"," Pclass: passenger class \n"," Name: name \n"," Sex: gender of passenger \n"," Age: age of passenger \n"," SibSp: number of siblings/spouses \n"," Parch: number of parents/children \n"," Ticket: ticket number \n"," Fare: amount of money spent on ticket \n"," Cabin: cabin category \n"," Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton ) \n","
\n"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:45:27.544377Z","iopub.status.busy":"2024-04-01T06:45:27.543901Z","iopub.status.idle":"2024-04-01T06:45:27.557229Z","shell.execute_reply":"2024-04-01T06:45:27.555972Z","shell.execute_reply.started":"2024-04-01T06:45:27.544320Z"},"trusted":true},"outputs":[],"source":["train_df.info()"]},{"cell_type":"markdown","metadata":{},"source":["### Slice Rows and Columsn of DF (Assigmennt)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:53:12.214069Z","iopub.status.busy":"2024-04-01T06:53:12.213708Z","iopub.status.idle":"2024-04-01T06:53:12.223150Z","shell.execute_reply":"2024-04-01T06:53:12.222195Z","shell.execute_reply.started":"2024-04-01T06:53:12.214014Z"},"trusted":true},"outputs":[],"source":["# Printing the Second Row\n","train_df.iloc[2]"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Print the 5th Row"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:14.398373Z","iopub.status.busy":"2024-04-01T06:54:14.398006Z","iopub.status.idle":"2024-04-01T06:54:14.407886Z","shell.execute_reply":"2024-04-01T06:54:14.406590Z","shell.execute_reply.started":"2024-04-01T06:54:14.398326Z"},"trusted":true},"outputs":[],"source":["# Print the Sex Column\n","train_df['Sex']"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:54:24.550687Z","iopub.status.busy":"2024-04-01T06:54:24.550286Z","iopub.status.idle":"2024-04-01T06:54:24.555255Z","shell.execute_reply":"2024-04-01T06:54:24.553923Z","shell.execute_reply.started":"2024-04-01T06:54:24.550616Z"},"trusted":true},"outputs":[],"source":["# Print the Name Column"]},{"cell_type":"markdown","metadata":{},"source":["## Visualization (Assignment)"]},{"cell_type":"markdown","metadata":{},"source":["### Age -- 
Survived"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:13:34.450088Z","iopub.status.busy":"2024-04-01T07:13:34.449302Z","iopub.status.idle":"2024-04-01T07:13:34.932717Z","shell.execute_reply":"2024-04-01T07:13:34.930449Z","shell.execute_reply.started":"2024-04-01T07:13:34.450021Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","# Plot 1: Survivors vs Non Survivors\n","\n","# Creating a plot for the Survived Column\n","sns.countplot(x='Survived', data=train_df)\n","\n","plt.title('Survivors vs Non Survivors')\n","plt.xlabel('Survived')\n","plt.ylabel('Count')\n","plt.xticks([0, 1], ['No', 'Yes']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try Plotting Passenger Class"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:14:31.768779Z","iopub.status.busy":"2024-04-01T07:14:31.768341Z","iopub.status.idle":"2024-04-01T07:14:32.062495Z","shell.execute_reply":"2024-04-01T07:14:32.060660Z","shell.execute_reply.started":"2024-04-01T07:14:31.768690Z"},"trusted":true},"outputs":[],"source":["plt.figure(figsize=(8, 6))\n","\n","# Make the plot for Pclass here:\n","\n","\n","plt.title('Count of Passengers In each Passenger Class')\n","plt.xlabel('Passenger Class')\n","plt.ylabel('Count')\n","plt.xticks([0, 1, 2], ['1st', '2nd', '3rd']) # Setting custom tick labels\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Try it for \"Embarked\""]},{"cell_type":"code","execution_count":null,"metadata":{"trusted":true},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### Try Making a histogram for \"Fare\""]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["### Here is the distplot for \"Fare\", refer to it after you tried it 
yourself:"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:18:24.402882Z","iopub.status.busy":"2024-04-01T07:18:24.402274Z","iopub.status.idle":"2024-04-01T07:18:24.798062Z","shell.execute_reply":"2024-04-01T07:18:24.796669Z","shell.execute_reply.started":"2024-04-01T07:18:24.402828Z"},"trusted":true},"outputs":[],"source":["sns.histplot(train_df['Fare'], bins=20, color='orange')\n","plt.title('Distribution of Passenger Fares')\n","plt.xlabel('Fare')\n","plt.ylabel('Frequency')\n","plt.show()"]},{"cell_type":"markdown","metadata":{},"source":["### Make a histogram for \"Age\" (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:19:53.874413Z","iopub.status.busy":"2024-04-01T07:19:53.873686Z","iopub.status.idle":"2024-04-01T07:19:54.244996Z","shell.execute_reply":"2024-04-01T07:19:54.243521Z","shell.execute_reply.started":"2024-04-01T07:19:53.874351Z"},"trusted":true},"outputs":[],"source":["# Create the plot below"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Fill Missing: Age Feature"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:50.370496Z","iopub.status.busy":"2024-04-01T06:27:50.369419Z","iopub.status.idle":"2024-04-01T06:27:50.427731Z","shell.execute_reply":"2024-04-01T06:27:50.426655Z","shell.execute_reply.started":"2024-04-01T06:27:50.370387Z"},"trusted":true},"outputs":[],"source":["train_df[train_df[\"Age\"].isnull()]"]},{"cell_type":"markdown","metadata":{},"source":["### Try Checking for Null Values in Test Df"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df.isnull().sum()"]},{"cell_type":"markdown","metadata":{},"source":["Run this to fix the Null Values"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:21:48.194895Z","iopub.status.busy":"2024-04-01T07:21:48.194020Z","iopub.status.idle":"2024-04-01T07:21:49.449282Z","shell.execute_reply":"2024-04-01T07:21:49.447918Z","shell.execute_reply.started":"2024-04-01T07:21:48.194825Z"},"trusted":true},"outputs":[],"source":["index_nan_age = list(train_df[\"Age\"][train_df[\"Age\"].isnull()].index)\n","for i in index_nan_age:\n"," age_pred = train_df[\"Age\"][((train_df[\"SibSp\"] == train_df.iloc[i][\"SibSp\"]) &(train_df[\"Parch\"] == train_df.iloc[i][\"Parch\"])& (train_df[\"Pclass\"] == train_df.iloc[i][\"Pclass\"]))].median()\n"," age_med = train_df[\"Age\"].median()\n"," if not np.isnan(age_pred):\n"," train_df[\"Age\"].iloc[i] = age_pred\n"," else:\n"," train_df[\"Age\"].iloc[i] = age_med\n","\n","index_nan_age = list(test_df[\"Age\"][test_df[\"Age\"].isnull()].index)\n","for i in index_nan_age:\n"," age_pred = test_df[\"Age\"][((test_df[\"SibSp\"] == test_df.iloc[i][\"SibSp\"]) &(test_df[\"Parch\"] == test_df.iloc[i][\"Parch\"])& (test_df[\"Pclass\"] == test_df.iloc[i][\"Pclass\"]))].median()\n"," age_med = test_df[\"Age\"].median()\n"," if not np.isnan(age_pred):\n"," 
test_df[\"Age\"].iloc[i] = age_pred\n"," else:\n"," test_df[\"Age\"].iloc[i] = age_med"]},{"cell_type":"markdown","metadata":{},"source":["## Analysing the correlation between the different columns"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T07:24:33.644174Z","iopub.status.busy":"2024-04-01T07:24:33.643621Z","iopub.status.idle":"2024-04-01T07:24:34.404306Z","shell.execute_reply":"2024-04-01T07:24:34.402938Z","shell.execute_reply.started":"2024-04-01T07:24:33.643935Z"},"trusted":true},"outputs":[],"source":["numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n","sns.heatmap(train_df[numerical_columns].corr(), annot=True)"]},{"cell_type":"markdown","metadata":{},"source":["We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the higher the Fare, the lower the Class number (Remember that Class 1 < Class 2 < Class 3 in face value)."]},{"cell_type":"markdown","metadata":{},"source":["## Embarked"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.378151Z","iopub.status.busy":"2024-04-01T06:27:55.377756Z","iopub.status.idle":"2024-04-01T06:27:55.384785Z","shell.execute_reply":"2024-04-01T06:27:55.384101Z","shell.execute_reply.started":"2024-04-01T06:27:55.378107Z"},"trusted":true},"outputs":[],"source":["train_df[\"Embarked\"].head()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.386265Z","iopub.status.busy":"2024-04-01T06:27:55.385875Z","iopub.status.idle":"2024-04-01T06:27:55.635178Z","shell.execute_reply":"2024-04-01T06:27:55.633609Z","shell.execute_reply.started":"2024-04-01T06:27:55.386223Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Embarked\", data = 
train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.638242Z","iopub.status.busy":"2024-04-01T06:27:55.637447Z","iopub.status.idle":"2024-04-01T06:27:55.699106Z","shell.execute_reply":"2024-04-01T06:27:55.698208Z","shell.execute_reply.started":"2024-04-01T06:27:55.638150Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Ticket (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.700760Z","iopub.status.busy":"2024-04-01T06:27:55.700330Z","iopub.status.idle":"2024-04-01T06:27:55.708542Z","shell.execute_reply":"2024-04-01T06:27:55.707466Z","shell.execute_reply.started":"2024-04-01T06:27:55.700715Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.710291Z","iopub.status.busy":"2024-04-01T06:27:55.709980Z","iopub.status.idle":"2024-04-01T06:27:55.722810Z","shell.execute_reply":"2024-04-01T06:27:55.721839Z","shell.execute_reply.started":"2024-04-01T06:27:55.710231Z"},"trusted":true},"outputs":[],"source":["example_ticket = \"A/5. 
2151\"\n","example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.726116Z","iopub.status.busy":"2024-04-01T06:27:55.725689Z","iopub.status.idle":"2024-04-01T06:27:55.738095Z","shell.execute_reply":"2024-04-01T06:27:55.737043Z","shell.execute_reply.started":"2024-04-01T06:27:55.726039Z"},"trusted":true},"outputs":[],"source":["tickets = []\n","for i in list(train_df.Ticket):\n"," if not i.isdigit():\n"," tickets.append(i.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0])\n"," else:\n"," tickets.append(\"x\")\n","train_df[\"Ticket\"] = tickets\n","\n","# Do the same for the test set"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.740389Z","iopub.status.busy":"2024-04-01T06:27:55.739797Z","iopub.status.idle":"2024-04-01T06:27:55.755416Z","shell.execute_reply":"2024-04-01T06:27:55.754317Z","shell.execute_reply.started":"2024-04-01T06:27:55.740333Z"},"trusted":true},"outputs":[],"source":["train_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Ticket\"].head(20)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.790832Z","iopub.status.busy":"2024-04-01T06:27:55.790500Z","iopub.status.idle":"2024-04-01T06:27:55.841011Z","shell.execute_reply":"2024-04-01T06:27:55.839862Z","shell.execute_reply.started":"2024-04-01T06:27:55.790770Z"},"trusted":true},"outputs":[],"source":["train_df = pd.get_dummies(train_df, columns= [\"Ticket\"], prefix = \"TcktName\")\n","train_df.head(10)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df = pd.get_dummies(test_df, columns= [\"Ticket\"], prefix = \"TcktName\")\n","test_df.head(10)"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Pclass"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:55.843395Z","iopub.status.busy":"2024-04-01T06:27:55.842833Z","iopub.status.idle":"2024-04-01T06:27:56.089225Z","shell.execute_reply":"2024-04-01T06:27:56.087578Z","shell.execute_reply.started":"2024-04-01T06:27:55.843168Z"},"trusted":true},"outputs":[],"source":["sns.countplot(x = \"Pclass\", data = train_df)\n","plt.show()"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.092270Z","iopub.status.busy":"2024-04-01T06:27:56.091722Z","iopub.status.idle":"2024-04-01T06:27:56.162888Z","shell.execute_reply":"2024-04-01T06:27:56.161841Z","shell.execute_reply.started":"2024-04-01T06:27:56.092186Z"},"trusted":true},"outputs":[],"source":["train_df[\"Pclass\"] = train_df[\"Pclass\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns= [\"Pclass\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Pclass\"] = test_df[\"Pclass\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns= [\"Pclass\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Sex"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.164709Z","iopub.status.busy":"2024-04-01T06:27:56.164391Z","iopub.status.idle":"2024-04-01T06:27:56.205775Z","shell.execute_reply":"2024-04-01T06:27:56.204761Z","shell.execute_reply.started":"2024-04-01T06:27:56.164639Z"},"trusted":true},"outputs":[],"source":["train_df[\"Sex\"] = train_df[\"Sex\"].astype(\"category\")\n","train_df = pd.get_dummies(train_df, columns=[\"Sex\"])\n","train_df.head()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["test_df[\"Sex\"] = test_df[\"Sex\"].astype(\"category\")\n","test_df = pd.get_dummies(test_df, columns=[\"Sex\"])\n","test_df.head()"]},{"cell_type":"markdown","metadata":{},"source":["## Drop Passenger ID and Cabin (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.207602Z","iopub.status.busy":"2024-04-01T06:27:56.207299Z","iopub.status.idle":"2024-04-01T06:27:56.215886Z","shell.execute_reply":"2024-04-01T06:27:56.214401Z","shell.execute_reply.started":"2024-04-01T06:27:56.207550Z"},"trusted":true},"outputs":[],"source":["train_df.drop(labels = [\"PassengerId\", \"Cabin\"], axis = 1, inplace = True)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.217917Z","iopub.status.busy":"2024-04-01T06:27:56.217536Z","iopub.status.idle":"2024-04-01T06:27:56.228150Z","shell.execute_reply":"2024-04-01T06:27:56.227230Z","shell.execute_reply.started":"2024-04-01T06:27:56.217854Z"},"trusted":true},"outputs":[],"source":["train_df.columns"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the PassengerId and Cabin columns from the test set"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Print the columns of the test 
set"]},{"cell_type":"markdown","metadata":{},"source":["
\n","# Modeling"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.230086Z","iopub.status.busy":"2024-04-01T06:27:56.229809Z","iopub.status.idle":"2024-04-01T06:27:56.238557Z","shell.execute_reply":"2024-04-01T06:27:56.237679Z","shell.execute_reply.started":"2024-04-01T06:27:56.230040Z"},"trusted":true},"outputs":[],"source":["from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.svm import SVC\n","from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.metrics import accuracy_score"]},{"cell_type":"markdown","metadata":{},"source":["## Train - Test Split (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.240103Z","iopub.status.busy":"2024-04-01T06:27:56.239830Z","iopub.status.idle":"2024-04-01T06:27:56.256809Z","shell.execute_reply":"2024-04-01T06:27:56.255463Z","shell.execute_reply.started":"2024-04-01T06:27:56.240056Z"},"trusted":true},"outputs":[],"source":["train_df_len = len(train_df)\n","train_df_len"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.314730Z","iopub.status.busy":"2024-04-01T06:27:56.313986Z","iopub.status.idle":"2024-04-01T06:27:56.333564Z","shell.execute_reply":"2024-04-01T06:27:56.332507Z","shell.execute_reply.started":"2024-04-01T06:27:56.314635Z"},"trusted":true},"outputs":[],"source":["\n","train = train_df[:train_df_len]\n","test = test_df\n","\n","# Select all numerical values from train and test\n","numeric_train = train.select_dtypes(include=[np.number])\n","numeric_test = test.select_dtypes(include=[np.number]) \n","\n","\n","X_train = numeric_train.drop(labels=[\"Survived\",], axis=1)\n","y_train 
= numeric_train[\"Survived\"]\n","\n","# Split the train data into train and test sets with a 1/3 ratio\n","X_train, X_test, y_train, y_test = # Use the train_test_split function here\n","\n","\n","print(\"X_train\", len(X_train))\n","print(\"X_test\", len(X_test))\n","print(\"y_train\", len(y_train))\n","print(\"y_test\", len(y_test))\n","print(\"test\", len(numeric_test))"]},{"cell_type":"markdown","metadata":{},"source":["## Simple Logistic Regression (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.335970Z","iopub.status.busy":"2024-04-01T06:27:56.335281Z","iopub.status.idle":"2024-04-01T06:27:56.368083Z","shell.execute_reply":"2024-04-01T06:27:56.366489Z","shell.execute_reply.started":"2024-04-01T06:27:56.335561Z"},"trusted":true},"outputs":[],"source":["logreg = LogisticRegression()\n","logreg.fit(X_train, y_train)\n","acc_log_train = round(logreg.score(X_train, y_train)*100,2) \n","acc_log_test = round(logreg.score(X_test,y_test)*100,2)\n","# Print the accuracy on the training and test set"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Hyperparameter Tuning -- Grid Search -- Cross Validation\n","We will compare 5 ml classifier and evaluate mean accuracy of each of them by stratified cross validation.\n","\n","* Decision Tree\n","* SVM\n","* Random Forest\n","* KNN\n","* Logistic Regression"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.371066Z","iopub.status.busy":"2024-04-01T06:27:56.370400Z","iopub.status.idle":"2024-04-01T06:27:56.401742Z","shell.execute_reply":"2024-04-01T06:27:56.396867Z","shell.execute_reply.started":"2024-04-01T06:27:56.370802Z"},"trusted":true},"outputs":[],"source":["random_state = 42\n","classifier = [DecisionTreeClassifier(random_state = random_state),\n"," SVC(random_state = random_state),\n"," RandomForestClassifier(random_state = random_state),\n"," LogisticRegression(random_state = random_state),\n"," KNeighborsClassifier()]\n","\n","dt_param_grid = {\"min_samples_split\" : range(10,500,20),\n"," \"max_depth\": range(1,20,2)}\n","\n","svc_param_grid = {\"kernel\" : [\"rbf\"],\n"," \"gamma\": [0.001, 0.01, 0.1, 1],\n"," \"C\": [1,10,50,100,200,300,1000]}\n","\n","rf_param_grid = {\"max_features\": [1,3,10],\n"," \"min_samples_split\":[2,3,10],\n"," \"min_samples_leaf\":[1,3,10],\n"," \"bootstrap\":[False],\n"," \"n_estimators\":[100,300],\n"," \"criterion\":[\"gini\"]}\n","\n","logreg_param_grid = {\"C\":np.logspace(-3,3,7),\n"," \"penalty\": [\"l1\",\"l2\"]}\n","\n","knn_param_grid = {\"n_neighbors\": np.linspace(1,19,10, dtype = int).tolist(),\n"," \"weights\": [\"uniform\",\"distance\"],\n"," \"metric\":[\"euclidean\",\"manhattan\"]}\n","classifier_param = [dt_param_grid,\n"," svc_param_grid,\n"," rf_param_grid,\n"," logreg_param_grid,\n"," 
knn_param_grid]"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:27:56.413811Z","iopub.status.busy":"2024-04-01T06:27:56.404322Z","iopub.status.idle":"2024-04-01T06:29:38.718970Z","shell.execute_reply":"2024-04-01T06:29:38.717807Z","shell.execute_reply.started":"2024-04-01T06:27:56.413658Z"},"trusted":true},"outputs":[],"source":["cv_result = []\n","best_estimators = []\n","for i in range(len(classifier)):\n"," clf = GridSearchCV(classifier[i], param_grid=classifier_param[i], cv = StratifiedKFold(n_splits = 10), scoring = \"accuracy\", n_jobs = -1,verbose = 1)\n"," clf.fit(X_train,y_train)\n"," cv_result.append(clf.best_score_)\n"," best_estimators.append(clf.best_estimator_)\n"," print(cv_result[i])"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:38.722928Z","iopub.status.busy":"2024-04-01T06:29:38.722207Z","iopub.status.idle":"2024-04-01T06:29:39.075423Z","shell.execute_reply":"2024-04-01T06:29:39.073987Z","shell.execute_reply.started":"2024-04-01T06:29:38.722582Z"},"trusted":true},"outputs":[],"source":["cv_results = pd.DataFrame({\"Cross Validation Means\":cv_result, \"ML Models\":[\"DecisionTreeClassifier\", \"SVM\",\"RandomForestClassifier\",\n"," \"LogisticRegression\",\n"," \"KNeighborsClassifier\"]})\n","\n","g = sns.barplot(x=\"Cross Validation Means\",y= \"ML Models\", data=cv_results)\n","g.set_xlabel(\"Mean Accuracy\")\n","g.set_title(\"Cross Validation Scores\")"]},{"cell_type":"markdown","metadata":{},"source":["## Ensemble Modeling (Assignment)"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.078654Z","iopub.status.busy":"2024-04-01T06:29:39.077840Z","iopub.status.idle":"2024-04-01T06:29:39.862871Z","shell.execute_reply":"2024-04-01T06:29:39.860937Z","shell.execute_reply.started":"2024-04-01T06:29:39.078554Z"},"trusted":true},"outputs":[],"source":["votingC = 
VotingClassifier(estimators = [(\"dt\",best_estimators[0]),\n"," (\"rfc\",best_estimators[2]),\n"," (\"lr\",best_estimators[3])],\n"," voting = \"soft\", n_jobs = -1)\n","votingC = votingC.fit(X_train, y_train)\n","\n","# Print the accuracy score of the voting classifier"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Drop the null values which are going to cause you an error in the next cell"]},{"cell_type":"markdown","metadata":{},"source":["
\n","## Prediction and Submission"]},{"cell_type":"code","execution_count":null,"metadata":{"execution":{"iopub.execute_input":"2024-04-01T06:29:39.865981Z","iopub.status.busy":"2024-04-01T06:29:39.865330Z","iopub.status.idle":"2024-04-01T06:29:39.977357Z","shell.execute_reply":"2024-04-01T06:29:39.973301Z","shell.execute_reply.started":"2024-04-01T06:29:39.865906Z"},"trusted":true},"outputs":[],"source":["test_survived = pd.Series(votingC.predict(numeric_test), name=\"Survived\").astype(int)\n","results = pd.concat([test_PassengerId, test_survived], axis=1)\n","results.to_csv(\"titanic.csv\", index=False)\n","print(results)"]},{"cell_type":"markdown","metadata":{},"source":["# Congratulations on finishing the assignment!!\n","\n","### The submission is the titanic.csv which was just created, and this file which you have modified."]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"databundleVersionId":26502,"sourceId":3136,"sourceType":"competition"}],"dockerImageVersionId":29852,"isGpuEnabled":false,"isInternetEnabled":false,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.7"}},"nbformat":4,"nbformat_minor":4}
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "# Ignore this"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np # linear algebra\n",
+ "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
+ "import matplotlib.pyplot as plt\n",
+ "plt.style.use(\"seaborn-v0_8-whitegrid\")\n",
+ "\n",
+ "import seaborn as sns\n",
+ "\n",
+ "from collections import Counter\n",
+ "\n",
+ "import warnings\n",
+ "warnings.filterwarnings(\"ignore\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "# Load and Check Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "DataFrames hold the dataset in a tabular format for easy manipulation and analysis.
\n",
+ "CSV data is read into 'df' using Pandas' read_csv() function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "_kg_hide-input": true
+ },
+ "outputs": [],
+ "source": [
+ "train_df = pd.read_csv(\"/Downloads/train.csv\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Try to read the test .csv file into test_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_df = pd.read_csv(\"/Downloads/test.csv\")\n",
+ "test_PassengerId = test_df[\"PassengerId\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
+ "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The Columns of train_df are: \n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n",
+ " 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(\"The Columns of train_df are: \")\n",
+ "train_df.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### We can use head() to see the first few rows in the dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A/5 21171 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " PC 17599 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " STON/O2. 3101282 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 113803 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 373450 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Pclass \\\n",
+ "0 1 0 3 \n",
+ "1 2 1 1 \n",
+ "2 3 1 3 \n",
+ "3 4 1 1 \n",
+ "4 5 0 3 \n",
+ "\n",
+ " Name Sex Age SibSp \\\n",
+ "0 Braund, Mr. Owen Harris male 22.0 1 \n",
+ "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
+ "2 Heikkinen, Miss. Laina female 26.0 0 \n",
+ "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
+ "4 Allen, Mr. William Henry male 35.0 0 \n",
+ "\n",
+ " Parch Ticket Fare Cabin Embarked \n",
+ "0 0 A/5 21171 7.2500 NaN S \n",
+ "1 0 PC 17599 71.2833 C85 C \n",
+ "2 0 STON/O2. 3101282 7.9250 NaN S \n",
+ "3 0 113803 53.1000 C123 S \n",
+ "4 0 373450 8.0500 NaN S "
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 891.000000 | \n",
+ " 891.000000 | \n",
+ " 891.000000 | \n",
+ " 714.000000 | \n",
+ " 891.000000 | \n",
+ " 891.000000 | \n",
+ " 891.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 446.000000 | \n",
+ " 0.383838 | \n",
+ " 2.308642 | \n",
+ " 29.699118 | \n",
+ " 0.523008 | \n",
+ " 0.381594 | \n",
+ " 32.204208 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 257.353842 | \n",
+ " 0.486592 | \n",
+ " 0.836071 | \n",
+ " 14.526497 | \n",
+ " 1.102743 | \n",
+ " 0.806057 | \n",
+ " 49.693429 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.420000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 223.500000 | \n",
+ " 0.000000 | \n",
+ " 2.000000 | \n",
+ " 20.125000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 7.910400 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 446.000000 | \n",
+ " 0.000000 | \n",
+ " 3.000000 | \n",
+ " 28.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 14.454200 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 668.500000 | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 38.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 31.000000 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 891.000000 | \n",
+ " 1.000000 | \n",
+ " 3.000000 | \n",
+ " 80.000000 | \n",
+ " 8.000000 | \n",
+ " 6.000000 | \n",
+ " 512.329200 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Pclass Age SibSp \\\n",
+ "count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
+ "mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
+ "std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
+ "min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
+ "25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
+ "50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
+ "75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
+ "max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
+ "\n",
+ " Parch Fare \n",
+ "count 891.000000 891.000000 \n",
+ "mean 0.381594 32.204208 \n",
+ "std 0.806057 49.693429 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 7.910400 \n",
+ "50% 0.000000 14.454200 \n",
+ "75% 0.000000 31.000000 \n",
+ "max 6.000000 512.329200 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Without looking above, try checking the first few rows of test_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 892 | \n",
+ " 3 | \n",
+ " Kelly, Mr. James | \n",
+ " male | \n",
+ " 34.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 330911 | \n",
+ " 7.8292 | \n",
+ " NaN | \n",
+ " Q | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 893 | \n",
+ " 3 | \n",
+ " Wilkes, Mrs. James (Ellen Needs) | \n",
+ " female | \n",
+ " 47.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 363272 | \n",
+ " 7.0000 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 894 | \n",
+ " 2 | \n",
+ " Myles, Mr. Thomas Francis | \n",
+ " male | \n",
+ " 62.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 240276 | \n",
+ " 9.6875 | \n",
+ " NaN | \n",
+ " Q | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 895 | \n",
+ " 3 | \n",
+ " Wirz, Mr. Albert | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 315154 | \n",
+ " 8.6625 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 896 | \n",
+ " 3 | \n",
+ " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
+ " female | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3101298 | \n",
+ " 12.2875 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Pclass Name Sex \\\n",
+ "0 892 3 Kelly, Mr. James male \n",
+ "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
+ "2 894 2 Myles, Mr. Thomas Francis male \n",
+ "3 895 3 Wirz, Mr. Albert male \n",
+ "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
+ "\n",
+ " Age SibSp Parch Ticket Fare Cabin Embarked \n",
+ "0 34.5 0 0 330911 7.8292 NaN Q \n",
+ "1 47.0 1 0 363272 7.0000 NaN S \n",
+ "2 62.0 0 0 240276 9.6875 NaN Q \n",
+ "3 27.0 0 0 315154 8.6625 NaN S \n",
+ "4 22.0 1 1 3101298 12.2875 NaN S "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Now try checking for a description of test_df's data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Pclass | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | count | \n",
+ " 418.000000 | \n",
+ " 418.000000 | \n",
+ " 332.000000 | \n",
+ " 418.000000 | \n",
+ " 418.000000 | \n",
+ " 417.000000 | \n",
+ "
\n",
+ " \n",
+ " | mean | \n",
+ " 1100.500000 | \n",
+ " 2.265550 | \n",
+ " 30.272590 | \n",
+ " 0.447368 | \n",
+ " 0.392344 | \n",
+ " 35.627188 | \n",
+ "
\n",
+ " \n",
+ " | std | \n",
+ " 120.810458 | \n",
+ " 0.841838 | \n",
+ " 14.181209 | \n",
+ " 0.896760 | \n",
+ " 0.981429 | \n",
+ " 55.907576 | \n",
+ "
\n",
+ " \n",
+ " | min | \n",
+ " 892.000000 | \n",
+ " 1.000000 | \n",
+ " 0.170000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " | 25% | \n",
+ " 996.250000 | \n",
+ " 1.000000 | \n",
+ " 21.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 7.895800 | \n",
+ "
\n",
+ " \n",
+ " | 50% | \n",
+ " 1100.500000 | \n",
+ " 3.000000 | \n",
+ " 27.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 14.454200 | \n",
+ "
\n",
+ " \n",
+ " | 75% | \n",
+ " 1204.750000 | \n",
+ " 3.000000 | \n",
+ " 39.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 31.500000 | \n",
+ "
\n",
+ " \n",
+ " | max | \n",
+ " 1309.000000 | \n",
+ " 3.000000 | \n",
+ " 76.000000 | \n",
+ " 8.000000 | \n",
+ " 9.000000 | \n",
+ " 512.329200 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Pclass Age SibSp Parch Fare\n",
+ "count 418.000000 418.000000 332.000000 418.000000 418.000000 417.000000\n",
+ "mean 1100.500000 2.265550 30.272590 0.447368 0.392344 35.627188\n",
+ "std 120.810458 0.841838 14.181209 0.896760 0.981429 55.907576\n",
+ "min 892.000000 1.000000 0.170000 0.000000 0.000000 0.000000\n",
+ "25% 996.250000 1.000000 21.000000 0.000000 0.000000 7.895800\n",
+ "50% 1100.500000 3.000000 27.000000 0.000000 0.000000 14.454200\n",
+ "75% 1204.750000 3.000000 39.000000 1.000000 0.000000 31.500000\n",
+ "max 1309.000000 3.000000 76.000000 8.000000 9.000000 512.329200"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_df.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Variable Description\n",
+ "\n",
+ " PassengerId: unique id number to each passenger \n",
+ " Survived: passenger survive(1) or died(0) \n",
+ " Pclass: passenger class \n",
+ " Name: name \n",
+ " Sex: gender of passenger \n",
+ " Age: age of passenger \n",
+ " SibSp: number of siblings/spouses \n",
+ " Parch: number of parents/children \n",
+ " Ticket: ticket number \n",
+ " Fare: amount of money spent on ticket \n",
+ " Cabin: cabin category \n",
+ " Embarked: port where passenger embarked ( C = Cherbourg, Q = Queenstown, S = Southampton ) \n",
+ "
\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 891 entries, 0 to 890\n",
+ "Data columns (total 12 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 PassengerId 891 non-null int64 \n",
+ " 1 Survived 891 non-null int64 \n",
+ " 2 Pclass 891 non-null int64 \n",
+ " 3 Name 891 non-null object \n",
+ " 4 Sex 891 non-null object \n",
+ " 5 Age 714 non-null float64\n",
+ " 6 SibSp 891 non-null int64 \n",
+ " 7 Parch 891 non-null int64 \n",
+ " 8 Ticket 891 non-null object \n",
+ " 9 Fare 891 non-null float64\n",
+ " 10 Cabin 204 non-null object \n",
+ " 11 Embarked 889 non-null object \n",
+ "dtypes: float64(2), int64(5), object(5)\n",
+ "memory usage: 83.7+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_df.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Slice Rows and Columns of DF (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PassengerId 3\n",
+ "Survived 1\n",
+ "Pclass 3\n",
+ "Name Heikkinen, Miss. Laina\n",
+ "Sex female\n",
+ "Age 26.0\n",
+ "SibSp 0\n",
+ "Parch 0\n",
+ "Ticket STON/O2. 3101282\n",
+ "Fare 7.925\n",
+ "Cabin NaN\n",
+ "Embarked S\n",
+ "Name: 2, dtype: object"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Printing the row at position 2 (iloc is zero-based, so this is the third row)\n",
+ "train_df.iloc[2]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PassengerId 6\n",
+ "Survived 0\n",
+ "Pclass 3\n",
+ "Name Moran, Mr. James\n",
+ "Sex male\n",
+ "Age NaN\n",
+ "SibSp 0\n",
+ "Parch 0\n",
+ "Ticket 330877\n",
+ "Fare 8.4583\n",
+ "Cabin NaN\n",
+ "Embarked Q\n",
+ "Name: 5, dtype: object"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Print the row at position 5 (iloc is zero-based, so this is the sixth row)\n",
+ "train_df.iloc[5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 male\n",
+ "1 female\n",
+ "2 female\n",
+ "3 female\n",
+ "4 male\n",
+ " ... \n",
+ "886 male\n",
+ "887 female\n",
+ "888 female\n",
+ "889 male\n",
+ "890 male\n",
+ "Name: Sex, Length: 891, dtype: object"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Print the Sex Column\n",
+ "train_df['Sex']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 Braund, Mr. Owen Harris\n",
+ "1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n",
+ "2 Heikkinen, Miss. Laina\n",
+ "3 Futrelle, Mrs. Jacques Heath (Lily May Peel)\n",
+ "4 Allen, Mr. William Henry\n",
+ " ... \n",
+ "886 Montvila, Rev. Juozas\n",
+ "887 Graham, Miss. Margaret Edith\n",
+ "888 Johnston, Miss. Catherine Helen \"Carrie\"\n",
+ "889 Behr, Mr. Karl Howell\n",
+ "890 Dooley, Mr. Patrick\n",
+ "Name: Name, Length: 891, dtype: object"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Print the Name Column\n",
+ "train_df['Name']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualization (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Age -- Survived"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plot 1: Survivors vs Non Survivors\n",
+ "# Draw on an explicit Axes instead of relying on pyplot's current figure.\n",
+ "fig, ax = plt.subplots(figsize=(8, 6))\n",
+ "\n",
+ "# One bar per value of the Survived column\n",
+ "sns.countplot(data=train_df, x='Survived', ax=ax)\n",
+ "ax.set(title='Survivors vs Non Survivors', xlabel='Survived', ylabel='Count')\n",
+ "# Setting custom tick labels for the two bars\n",
+ "ax.set_xticks([0, 1])\n",
+ "ax.set_xticklabels(['No', 'Yes'])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Try Plotting Passenger Class"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig, ax = plt.subplots(figsize=(8, 6))\n",
+ "\n",
+ "# Count plot of Pclass, drawn on the explicit Axes created above\n",
+ "sns.countplot(data=train_df, x='Pclass', ax=ax)\n",
+ "\n",
+ "ax.set(title='Count of Passengers In each Passenger Class', xlabel='Passenger Class', ylabel='Count')\n",
+ "# Setting custom tick labels for the three bars\n",
+ "ax.set_xticks([0, 1, 2])\n",
+ "ax.set_xticklabels(['1st', '2nd', '3rd'])\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Try it for \"Embarked\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(8, 6))\n",
+ "\n",
+ "# order= pins the bars to S, C, Q and lets seaborn label them from the data (no hand-written tick labels).\n",
+ "sns.countplot(x='Embarked', data=train_df, order=['S', 'C', 'Q'])\n",
+ "\n",
+ "plt.title('Embarked Histogram')\n",
+ "plt.xlabel('S vs C vs Q')\n",
+ "plt.ylabel('Count')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Try Making a histogram for \"Fare\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# 20-bin histogram of the Fare column\n",
+ "fare_ax = sns.histplot(data=train_df, x='Fare', bins=20, color='green')\n",
+ "\n",
+ "# Title and axis labels set in one call on the returned Axes\n",
+ "fare_ax.set(title='Fare Histogram', xlabel='Fare', ylabel='Frequency')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Here is the histogram (histplot) for \"Fare\"; refer to it after you have tried it yourself:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Reference solution: 20-bin histogram of Fare, labelled through the returned Axes\n",
+ "ref_ax = sns.histplot(data=train_df, x='Fare', bins=20, color='orange')\n",
+ "\n",
+ "ref_ax.set(title='Distribution of Passenger Fares', xlabel='Fare', ylabel='Frequency')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Make a histogram for \"Age\" (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Create the plot below\n",
+ "# Create the plot below: 15-bin histogram of the Age column\n",
+ "age_ax = sns.histplot(data=train_df, x='Age', bins=15, color='blue')\n",
+ "\n",
+ "age_ax.set(title='Age Histogram', xlabel='Age', ylabel='Frequency')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "## Fill Missing: Age Feature"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Moran, Mr. James | \n",
+ " male | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 330877 | \n",
+ " 8.4583 | \n",
+ " NaN | \n",
+ " Q | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 18 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " Williams, Mr. Charles Eugene | \n",
+ " male | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 244373 | \n",
+ " 13.0000 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 20 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Masselmani, Mrs. Fatima | \n",
+ " female | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2649 | \n",
+ " 7.2250 | \n",
+ " NaN | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Emir, Mr. Farred Chehab | \n",
+ " male | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2631 | \n",
+ " 7.2250 | \n",
+ " NaN | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " 29 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " O'Dwyer, Miss. Ellen \"Nellie\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 330959 | \n",
+ " 7.8792 | \n",
+ " NaN | \n",
+ " Q | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 859 | \n",
+ " 860 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Razi, Mr. Raihed | \n",
+ " male | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2629 | \n",
+ " 7.2292 | \n",
+ " NaN | \n",
+ " C | \n",
+ "
\n",
+ " \n",
+ " | 863 | \n",
+ " 864 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Sage, Miss. Dorothy Edith \"Dolly\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 8 | \n",
+ " 2 | \n",
+ " CA. 2343 | \n",
+ " 69.5500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 868 | \n",
+ " 869 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " van Melkebeke, Mr. Philemon | \n",
+ " male | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 345777 | \n",
+ " 9.5000 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 878 | \n",
+ " 879 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Laleff, Mr. Kristo | \n",
+ " male | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 349217 | \n",
+ " 7.8958 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ " | 888 | \n",
+ " 889 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Johnston, Miss. Catherine Helen \"Carrie\" | \n",
+ " female | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " W./C. 6607 | \n",
+ " 23.4500 | \n",
+ " NaN | \n",
+ " S | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
177 rows × 12 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Pclass Name \\\n",
+ "5 6 0 3 Moran, Mr. James \n",
+ "17 18 1 2 Williams, Mr. Charles Eugene \n",
+ "19 20 1 3 Masselmani, Mrs. Fatima \n",
+ "26 27 0 3 Emir, Mr. Farred Chehab \n",
+ "28 29 1 3 O'Dwyer, Miss. Ellen \"Nellie\" \n",
+ ".. ... ... ... ... \n",
+ "859 860 0 3 Razi, Mr. Raihed \n",
+ "863 864 0 3 Sage, Miss. Dorothy Edith \"Dolly\" \n",
+ "868 869 0 3 van Melkebeke, Mr. Philemon \n",
+ "878 879 0 3 Laleff, Mr. Kristo \n",
+ "888 889 0 3 Johnston, Miss. Catherine Helen \"Carrie\" \n",
+ "\n",
+ " Sex Age SibSp Parch Ticket Fare Cabin Embarked \n",
+ "5 male NaN 0 0 330877 8.4583 NaN Q \n",
+ "17 male NaN 0 0 244373 13.0000 NaN S \n",
+ "19 female NaN 0 0 2649 7.2250 NaN C \n",
+ "26 male NaN 0 0 2631 7.2250 NaN C \n",
+ "28 female NaN 0 0 330959 7.8792 NaN Q \n",
+ ".. ... ... ... ... ... ... ... ... \n",
+ "859 male NaN 0 0 2629 7.2292 NaN C \n",
+ "863 female NaN 8 2 CA. 2343 69.5500 NaN S \n",
+ "868 male NaN 0 0 345777 9.5000 NaN S \n",
+ "878 male NaN 0 0 349217 7.8958 NaN S \n",
+ "888 female NaN 1 2 W./C. 6607 23.4500 NaN S \n",
+ "\n",
+ "[177 rows x 12 columns]"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df[train_df[\"Age\"].isnull()]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Try Checking for Null Values in Test Df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PassengerId 0\n",
+ "Pclass 0\n",
+ "Name 0\n",
+ "Sex 0\n",
+ "Age 86\n",
+ "SibSp 0\n",
+ "Parch 0\n",
+ "Ticket 0\n",
+ "Fare 1\n",
+ "Cabin 327\n",
+ "Embarked 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_df.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Run this to fill in the missing (null) Age values in both train_df and test_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fill_missing_age(df):\n",
+ "    \"\"\"Fill NaN ages in place: median Age of rows with the same SibSp/Parch/Pclass, else the overall median.\"\"\"\n",
+ "    for idx in df.index[df[\"Age\"].isnull()]:\n",
+ "        same_group = (\n",
+ "            (df[\"SibSp\"] == df.loc[idx, \"SibSp\"])\n",
+ "            & (df[\"Parch\"] == df.loc[idx, \"Parch\"])\n",
+ "            & (df[\"Pclass\"] == df.loc[idx, \"Pclass\"])\n",
+ "        )\n",
+ "        # Medians are taken inside the loop so earlier fills feed later ones, exactly as before.\n",
+ "        group_median = df.loc[same_group, \"Age\"].median()\n",
+ "        overall_median = df[\"Age\"].median()\n",
+ "        # One .loc write instead of chained df[\"Age\"].iloc[i] = ..., which may assign to a copy.\n",
+ "        df.loc[idx, \"Age\"] = overall_median if np.isnan(group_median) else group_median\n",
+ "\n",
+ "\n",
+ "fill_missing_age(train_df)\n",
+ "fill_missing_age(test_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analysing the correlation between the different columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "numerical_columns = train_df.select_dtypes(include=[np.number]).columns\n",
+ "sns.heatmap(train_df[numerical_columns].corr(), annot=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We see that Fare and Parch are positively correlated with Survived. Similarly, Fare and Class are negatively correlated, in the sense that the higher the Fare, the lower the Class number (remember that Class 1 < Class 2 < Class 3 at face value)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Embarked"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 S\n",
+ "1 C\n",
+ "2 S\n",
+ "3 S\n",
+ "4 S\n",
+ "Name: Embarked, dtype: object"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df[\"Embarked\"].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sns.countplot(x = \"Embarked\", data = train_df)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked_C | \n",
+ " Embarked_Q | \n",
+ " Embarked_S | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " A/5 21171 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " PC 17599 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " STON/O2. 3101282 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 113803 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 373450 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Pclass \\\n",
+ "0 1 0 3 \n",
+ "1 2 1 1 \n",
+ "2 3 1 3 \n",
+ "3 4 1 1 \n",
+ "4 5 0 3 \n",
+ "\n",
+ " Name Sex Age SibSp \\\n",
+ "0 Braund, Mr. Owen Harris male 22.0 1 \n",
+ "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
+ "2 Heikkinen, Miss. Laina female 26.0 0 \n",
+ "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
+ "4 Allen, Mr. William Henry male 35.0 0 \n",
+ "\n",
+ " Parch Ticket Fare Cabin Embarked_C Embarked_Q Embarked_S \n",
+ "0 0 A/5 21171 7.2500 NaN False False True \n",
+ "1 0 PC 17599 71.2833 C85 True False False \n",
+ "2 0 STON/O2. 3101282 7.9250 NaN False False True \n",
+ "3 0 113803 53.1000 C123 False False True \n",
+ "4 0 373450 8.0500 NaN False False True "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_df = pd.get_dummies(train_df, columns=[\"Embarked\"])\n",
+ "train_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Ticket | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked_C | \n",
+ " Embarked_Q | \n",
+ " Embarked_S | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 892 | \n",
+ " 3 | \n",
+ " Kelly, Mr. James | \n",
+ " male | \n",
+ " 34.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 330911 | \n",
+ " 7.8292 | \n",
+ " NaN | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 893 | \n",
+ " 3 | \n",
+ " Wilkes, Mrs. James (Ellen Needs) | \n",
+ " female | \n",
+ " 47.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 363272 | \n",
+ " 7.0000 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 894 | \n",
+ " 2 | \n",
+ " Myles, Mr. Thomas Francis | \n",
+ " male | \n",
+ " 62.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 240276 | \n",
+ " 9.6875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 895 | \n",
+ " 3 | \n",
+ " Wirz, Mr. Albert | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 315154 | \n",
+ " 8.6625 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 896 | \n",
+ " 3 | \n",
+ " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
+ " female | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3101298 | \n",
+ " 12.2875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Pclass Name Sex \\\n",
+ "0 892 3 Kelly, Mr. James male \n",
+ "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
+ "2 894 2 Myles, Mr. Thomas Francis male \n",
+ "3 895 3 Wirz, Mr. Albert male \n",
+ "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
+ "\n",
+ " Age SibSp Parch Ticket Fare Cabin Embarked_C Embarked_Q \\\n",
+ "0 34.5 0 0 330911 7.8292 NaN False True \n",
+ "1 47.0 1 0 363272 7.0000 NaN False False \n",
+ "2 62.0 0 0 240276 9.6875 NaN False True \n",
+ "3 27.0 0 0 315154 8.6625 NaN False False \n",
+ "4 22.0 1 1 3101298 12.2875 NaN False False \n",
+ "\n",
+ " Embarked_S \n",
+ "0 False \n",
+ "1 True \n",
+ "2 False \n",
+ "3 True \n",
+ "4 True "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_df = pd.get_dummies(test_df, columns=[\"Embarked\"])\n",
+ "test_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Ticket (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 330911\n",
+ "1 363272\n",
+ "2 240276\n",
+ "3 315154\n",
+ "4 3101298\n",
+ "Name: Ticket, dtype: object"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_df[\"Ticket\"].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'A5'"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "example_ticket = \"A/5. 2151\"\n",
+ "example_ticket.replace(\".\",\"\").replace(\"/\",\"\").strip().split(\" \")[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def ticket_prefix(ticket):\n",
+ "    \"\"\"Return the ticket's leading code, or \"x\" for an all-digit ticket.\n",
+ "\n",
+ "    \".\" and \"/\" are removed first, then the text before the first space is kept,\n",
+ "    e.g. \"A/5. 2151\" -> \"A5\".\n",
+ "    \"\"\"\n",
+ "    if ticket.isdigit():\n",
+ "        return \"x\"\n",
+ "    return ticket.replace(\".\", \"\").replace(\"/\", \"\").strip().split(\" \")[0]\n",
+ "\n",
+ "\n",
+ "# Replace the raw ticket strings with their prefix codes in the training set\n",
+ "train_df[\"Ticket\"] = [ticket_prefix(t) for t in train_df[\"Ticket\"]]\n",
+ "\n",
+ "# Do the same for the test set\n",
+ "test_df[\"Ticket\"] = [ticket_prefix(t) for t in test_df[\"Ticket\"]]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 A5\n",
+ "1 PC\n",
+ "2 STONO2\n",
+ "3 x\n",
+ "4 x\n",
+ "5 x\n",
+ "6 x\n",
+ "7 x\n",
+ "8 x\n",
+ "9 x\n",
+ "10 PP\n",
+ "11 x\n",
+ "12 A5\n",
+ "13 x\n",
+ "14 x\n",
+ "15 x\n",
+ "16 x\n",
+ "17 x\n",
+ "18 x\n",
+ "19 x\n",
+ "Name: Ticket, dtype: object"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check the extracted prefixes on the first 20 training rows.\n",
+ "train_df.Ticket.head(n=20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 x\n",
+ "1 x\n",
+ "2 x\n",
+ "3 x\n",
+ "4 x\n",
+ "5 x\n",
+ "6 x\n",
+ "7 x\n",
+ "8 x\n",
+ "9 A4\n",
+ "10 x\n",
+ "11 x\n",
+ "12 x\n",
+ "13 x\n",
+ "14 WEP\n",
+ "15 SCPARIS\n",
+ "16 x\n",
+ "17 x\n",
+ "18 STONO2\n",
+ "19 x\n",
+ "Name: Ticket, dtype: object"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Check the extracted prefixes on the first 20 test rows.\n",
+ "test_df.Ticket.head(n=20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " ... | \n",
+ " TcktName_SOPP | \n",
+ " TcktName_SOTONO2 | \n",
+ " TcktName_SOTONOQ | \n",
+ " TcktName_SP | \n",
+ " TcktName_STONO | \n",
+ " TcktName_STONO2 | \n",
+ " TcktName_SWPP | \n",
+ " TcktName_WC | \n",
+ " TcktName_WEP | \n",
+ " TcktName_x | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Moran, Mr. James | \n",
+ " male | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.4583 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " McCarthy, Mr. Timothy J | \n",
+ " male | \n",
+ " 54.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 51.8625 | \n",
+ " E46 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 8 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " Palsson, Master. Gosta Leonard | \n",
+ " male | \n",
+ " 2.0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 21.0750 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | \n",
+ " female | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 11.1333 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 10 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " Nasser, Mrs. Nicholas (Adele Achem) | \n",
+ " female | \n",
+ " 14.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 30.0708 | \n",
+ " NaN | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 44 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Pclass \\\n",
+ "0 1 0 3 \n",
+ "1 2 1 1 \n",
+ "2 3 1 3 \n",
+ "3 4 1 1 \n",
+ "4 5 0 3 \n",
+ "5 6 0 3 \n",
+ "6 7 0 1 \n",
+ "7 8 0 3 \n",
+ "8 9 1 3 \n",
+ "9 10 1 2 \n",
+ "\n",
+ " Name Sex Age SibSp \\\n",
+ "0 Braund, Mr. Owen Harris male 22.0 1 \n",
+ "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
+ "2 Heikkinen, Miss. Laina female 26.0 0 \n",
+ "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
+ "4 Allen, Mr. William Henry male 35.0 0 \n",
+ "5 Moran, Mr. James male 26.0 0 \n",
+ "6 McCarthy, Mr. Timothy J male 54.0 0 \n",
+ "7 Palsson, Master. Gosta Leonard male 2.0 3 \n",
+ "8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n",
+ "9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n",
+ "\n",
+ " Parch Fare Cabin ... TcktName_SOPP TcktName_SOTONO2 \\\n",
+ "0 0 7.2500 NaN ... False False \n",
+ "1 0 71.2833 C85 ... False False \n",
+ "2 0 7.9250 NaN ... False False \n",
+ "3 0 53.1000 C123 ... False False \n",
+ "4 0 8.0500 NaN ... False False \n",
+ "5 0 8.4583 NaN ... False False \n",
+ "6 0 51.8625 E46 ... False False \n",
+ "7 1 21.0750 NaN ... False False \n",
+ "8 2 11.1333 NaN ... False False \n",
+ "9 0 30.0708 NaN ... False False \n",
+ "\n",
+ " TcktName_SOTONOQ TcktName_SP TcktName_STONO TcktName_STONO2 \\\n",
+ "0 False False False False \n",
+ "1 False False False False \n",
+ "2 False False False True \n",
+ "3 False False False False \n",
+ "4 False False False False \n",
+ "5 False False False False \n",
+ "6 False False False False \n",
+ "7 False False False False \n",
+ "8 False False False False \n",
+ "9 False False False False \n",
+ "\n",
+ " TcktName_SWPP TcktName_WC TcktName_WEP TcktName_x \n",
+ "0 False False False False \n",
+ "1 False False False False \n",
+ "2 False False False False \n",
+ "3 False False False True \n",
+ "4 False False False True \n",
+ "5 False False False True \n",
+ "6 False False False True \n",
+ "7 False False False True \n",
+ "8 False False False True \n",
+ "9 False False False True \n",
+ "\n",
+ "[10 rows x 44 columns]"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Expand the ticket prefix into one TcktName_* indicator column per distinct prefix.\n",
+ "train_df = pd.get_dummies(data=train_df, prefix=\"TcktName\", columns=[\"Ticket\"])\n",
+ "train_df.head(n=10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Pclass | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked_C | \n",
+ " ... | \n",
+ " TcktName_SOC | \n",
+ " TcktName_SOPP | \n",
+ " TcktName_SOTONO2 | \n",
+ " TcktName_SOTONOQ | \n",
+ " TcktName_STONO | \n",
+ " TcktName_STONO2 | \n",
+ " TcktName_STONOQ | \n",
+ " TcktName_WC | \n",
+ " TcktName_WEP | \n",
+ " TcktName_x | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 892 | \n",
+ " 3 | \n",
+ " Kelly, Mr. James | \n",
+ " male | \n",
+ " 34.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.8292 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 893 | \n",
+ " 3 | \n",
+ " Wilkes, Mrs. James (Ellen Needs) | \n",
+ " female | \n",
+ " 47.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7.0000 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 894 | \n",
+ " 2 | \n",
+ " Myles, Mr. Thomas Francis | \n",
+ " male | \n",
+ " 62.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 9.6875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 895 | \n",
+ " 3 | \n",
+ " Wirz, Mr. Albert | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.6625 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 896 | \n",
+ " 3 | \n",
+ " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
+ " female | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 12.2875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 897 | \n",
+ " 3 | \n",
+ " Svensson, Mr. Johan Cervin | \n",
+ " male | \n",
+ " 14.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 9.2250 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 898 | \n",
+ " 3 | \n",
+ " Connolly, Miss. Kate | \n",
+ " female | \n",
+ " 30.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.6292 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 899 | \n",
+ " 2 | \n",
+ " Caldwell, Mr. Albert Francis | \n",
+ " male | \n",
+ " 26.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 29.0000 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 900 | \n",
+ " 3 | \n",
+ " Abrahim, Mrs. Joseph (Sophie Halaut Easu) | \n",
+ " female | \n",
+ " 18.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.2292 | \n",
+ " NaN | \n",
+ " True | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 901 | \n",
+ " 3 | \n",
+ " Davies, Mr. John Samuel | \n",
+ " male | \n",
+ " 21.0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 24.1500 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 40 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Pclass Name Sex \\\n",
+ "0 892 3 Kelly, Mr. James male \n",
+ "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
+ "2 894 2 Myles, Mr. Thomas Francis male \n",
+ "3 895 3 Wirz, Mr. Albert male \n",
+ "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
+ "5 897 3 Svensson, Mr. Johan Cervin male \n",
+ "6 898 3 Connolly, Miss. Kate female \n",
+ "7 899 2 Caldwell, Mr. Albert Francis male \n",
+ "8 900 3 Abrahim, Mrs. Joseph (Sophie Halaut Easu) female \n",
+ "9 901 3 Davies, Mr. John Samuel male \n",
+ "\n",
+ " Age SibSp Parch Fare Cabin Embarked_C ... TcktName_SOC \\\n",
+ "0 34.5 0 0 7.8292 NaN False ... False \n",
+ "1 47.0 1 0 7.0000 NaN False ... False \n",
+ "2 62.0 0 0 9.6875 NaN False ... False \n",
+ "3 27.0 0 0 8.6625 NaN False ... False \n",
+ "4 22.0 1 1 12.2875 NaN False ... False \n",
+ "5 14.0 0 0 9.2250 NaN False ... False \n",
+ "6 30.0 0 0 7.6292 NaN False ... False \n",
+ "7 26.0 1 1 29.0000 NaN False ... False \n",
+ "8 18.0 0 0 7.2292 NaN True ... False \n",
+ "9 21.0 2 0 24.1500 NaN False ... False \n",
+ "\n",
+ " TcktName_SOPP TcktName_SOTONO2 TcktName_SOTONOQ TcktName_STONO \\\n",
+ "0 False False False False \n",
+ "1 False False False False \n",
+ "2 False False False False \n",
+ "3 False False False False \n",
+ "4 False False False False \n",
+ "5 False False False False \n",
+ "6 False False False False \n",
+ "7 False False False False \n",
+ "8 False False False False \n",
+ "9 False False False False \n",
+ "\n",
+ " TcktName_STONO2 TcktName_STONOQ TcktName_WC TcktName_WEP TcktName_x \n",
+ "0 False False False False True \n",
+ "1 False False False False True \n",
+ "2 False False False False True \n",
+ "3 False False False False True \n",
+ "4 False False False False True \n",
+ "5 False False False False True \n",
+ "6 False False False False True \n",
+ "7 False False False False True \n",
+ "8 False False False False True \n",
+ "9 False False False False False \n",
+ "\n",
+ "[10 rows x 40 columns]"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Expand the ticket prefix into one TcktName_* indicator column per distinct prefix.\n",
+ "# NOTE(review): the test prefixes are encoded independently of train_df, so the two frames\n",
+ "# can end up with different TcktName_* columns -- confirm they are aligned before modelling.\n",
+ "test_df = pd.get_dummies(data=test_df, prefix=\"TcktName\", columns=[\"Ticket\"])\n",
+ "test_df.head(n=10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "## Pclass"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Bar chart of the number of training rows in each passenger class.\n",
+ "sns.countplot(data=train_df, x=\"Pclass\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked_C | \n",
+ " ... | \n",
+ " TcktName_SP | \n",
+ " TcktName_STONO | \n",
+ " TcktName_STONO2 | \n",
+ " TcktName_SWPP | \n",
+ " TcktName_WC | \n",
+ " TcktName_WEP | \n",
+ " TcktName_x | \n",
+ " Pclass_1 | \n",
+ " Pclass_2 | \n",
+ " Pclass_3 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " male | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " female | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " True | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " female | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " female | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " Allen, Mr. William Henry | \n",
+ " male | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 46 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Name \\\n",
+ "0 1 0 Braund, Mr. Owen Harris \n",
+ "1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
+ "2 3 1 Heikkinen, Miss. Laina \n",
+ "3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
+ "4 5 0 Allen, Mr. William Henry \n",
+ "\n",
+ " Sex Age SibSp Parch Fare Cabin Embarked_C ... TcktName_SP \\\n",
+ "0 male 22.0 1 0 7.2500 NaN False ... False \n",
+ "1 female 38.0 1 0 71.2833 C85 True ... False \n",
+ "2 female 26.0 0 0 7.9250 NaN False ... False \n",
+ "3 female 35.0 1 0 53.1000 C123 False ... False \n",
+ "4 male 35.0 0 0 8.0500 NaN False ... False \n",
+ "\n",
+ " TcktName_STONO TcktName_STONO2 TcktName_SWPP TcktName_WC TcktName_WEP \\\n",
+ "0 False False False False False \n",
+ "1 False False False False False \n",
+ "2 False True False False False \n",
+ "3 False False False False False \n",
+ "4 False False False False False \n",
+ "\n",
+ " TcktName_x Pclass_1 Pclass_2 Pclass_3 \n",
+ "0 False False False True \n",
+ "1 False True False False \n",
+ "2 False False False True \n",
+ "3 True True False False \n",
+ "4 True False False True \n",
+ "\n",
+ "[5 rows x 46 columns]"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Cast Pclass to categorical, then expand it into Pclass_* indicator columns.\n",
+ "train_df = pd.get_dummies(train_df.astype({\"Pclass\": \"category\"}), columns=[\"Pclass\"])\n",
+ "train_df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Name | \n",
+ " Sex | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked_C | \n",
+ " Embarked_Q | \n",
+ " ... | \n",
+ " TcktName_SOTONOQ | \n",
+ " TcktName_STONO | \n",
+ " TcktName_STONO2 | \n",
+ " TcktName_STONOQ | \n",
+ " TcktName_WC | \n",
+ " TcktName_WEP | \n",
+ " TcktName_x | \n",
+ " Pclass_1 | \n",
+ " Pclass_2 | \n",
+ " Pclass_3 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 892 | \n",
+ " Kelly, Mr. James | \n",
+ " male | \n",
+ " 34.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.8292 | \n",
+ " NaN | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 893 | \n",
+ " Wilkes, Mrs. James (Ellen Needs) | \n",
+ " female | \n",
+ " 47.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7.0000 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 894 | \n",
+ " Myles, Mr. Thomas Francis | \n",
+ " male | \n",
+ " 62.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 9.6875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 895 | \n",
+ " Wirz, Mr. Albert | \n",
+ " male | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.6625 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 896 | \n",
+ " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
+ " female | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 12.2875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 42 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Name Sex Age \\\n",
+ "0 892 Kelly, Mr. James male 34.5 \n",
+ "1 893 Wilkes, Mrs. James (Ellen Needs) female 47.0 \n",
+ "2 894 Myles, Mr. Thomas Francis male 62.0 \n",
+ "3 895 Wirz, Mr. Albert male 27.0 \n",
+ "4 896 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 \n",
+ "\n",
+ " SibSp Parch Fare Cabin Embarked_C Embarked_Q ... TcktName_SOTONOQ \\\n",
+ "0 0 0 7.8292 NaN False True ... False \n",
+ "1 1 0 7.0000 NaN False False ... False \n",
+ "2 0 0 9.6875 NaN False True ... False \n",
+ "3 0 0 8.6625 NaN False False ... False \n",
+ "4 1 1 12.2875 NaN False False ... False \n",
+ "\n",
+ " TcktName_STONO TcktName_STONO2 TcktName_STONOQ TcktName_WC \\\n",
+ "0 False False False False \n",
+ "1 False False False False \n",
+ "2 False False False False \n",
+ "3 False False False False \n",
+ "4 False False False False \n",
+ "\n",
+ " TcktName_WEP TcktName_x Pclass_1 Pclass_2 Pclass_3 \n",
+ "0 False True False False True \n",
+ "1 False True False False True \n",
+ "2 False True False True False \n",
+ "3 False True False False True \n",
+ "4 False True False False True \n",
+ "\n",
+ "[5 rows x 42 columns]"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Cast Pclass to categorical, then expand it into Pclass_* indicator columns.\n",
+ "test_df = pd.get_dummies(test_df.astype({\"Pclass\": \"category\"}), columns=[\"Pclass\"])\n",
+ "test_df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "## Sex"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Survived | \n",
+ " Name | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked_C | \n",
+ " Embarked_Q | \n",
+ " ... | \n",
+ " TcktName_STONO2 | \n",
+ " TcktName_SWPP | \n",
+ " TcktName_WC | \n",
+ " TcktName_WEP | \n",
+ " TcktName_x | \n",
+ " Pclass_1 | \n",
+ " Pclass_2 | \n",
+ " Pclass_3 | \n",
+ " Sex_female | \n",
+ " Sex_male | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " Braund, Mr. Owen Harris | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7.2500 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
+ " 38.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 71.2833 | \n",
+ " C85 | \n",
+ " True | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " Heikkinen, Miss. Laina | \n",
+ " 26.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.9250 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
+ " 35.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 53.1000 | \n",
+ " C123 | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " Allen, Mr. William Henry | \n",
+ " 35.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.0500 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 47 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Survived Name \\\n",
+ "0 1 0 Braund, Mr. Owen Harris \n",
+ "1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
+ "2 3 1 Heikkinen, Miss. Laina \n",
+ "3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) \n",
+ "4 5 0 Allen, Mr. William Henry \n",
+ "\n",
+ " Age SibSp Parch Fare Cabin Embarked_C Embarked_Q ... \\\n",
+ "0 22.0 1 0 7.2500 NaN False False ... \n",
+ "1 38.0 1 0 71.2833 C85 True False ... \n",
+ "2 26.0 0 0 7.9250 NaN False False ... \n",
+ "3 35.0 1 0 53.1000 C123 False False ... \n",
+ "4 35.0 0 0 8.0500 NaN False False ... \n",
+ "\n",
+ " TcktName_STONO2 TcktName_SWPP TcktName_WC TcktName_WEP TcktName_x \\\n",
+ "0 False False False False False \n",
+ "1 False False False False False \n",
+ "2 True False False False False \n",
+ "3 False False False False True \n",
+ "4 False False False False True \n",
+ "\n",
+ " Pclass_1 Pclass_2 Pclass_3 Sex_female Sex_male \n",
+ "0 False False True False True \n",
+ "1 True False False True False \n",
+ "2 False False True True False \n",
+ "3 True False False True False \n",
+ "4 False False True False True \n",
+ "\n",
+ "[5 rows x 47 columns]"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Encode Sex as a categorical, then split it into Sex_female / Sex_male indicators.\n",
+ "train_df[\"Sex\"] = train_df.Sex.astype(dtype=\"category\")\n",
+ "train_df = pd.get_dummies(data=train_df, columns=[\"Sex\"])\n",
+ "train_df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " PassengerId | \n",
+ " Name | \n",
+ " Age | \n",
+ " SibSp | \n",
+ " Parch | \n",
+ " Fare | \n",
+ " Cabin | \n",
+ " Embarked_C | \n",
+ " Embarked_Q | \n",
+ " Embarked_S | \n",
+ " ... | \n",
+ " TcktName_STONO2 | \n",
+ " TcktName_STONOQ | \n",
+ " TcktName_WC | \n",
+ " TcktName_WEP | \n",
+ " TcktName_x | \n",
+ " Pclass_1 | \n",
+ " Pclass_2 | \n",
+ " Pclass_3 | \n",
+ " Sex_female | \n",
+ " Sex_male | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 892 | \n",
+ " Kelly, Mr. James | \n",
+ " 34.5 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7.8292 | \n",
+ " NaN | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 893 | \n",
+ " Wilkes, Mrs. James (Ellen Needs) | \n",
+ " 47.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 7.0000 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 894 | \n",
+ " Myles, Mr. Thomas Francis | \n",
+ " 62.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 9.6875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 895 | \n",
+ " Wirz, Mr. Albert | \n",
+ " 27.0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 8.6625 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 896 | \n",
+ " Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
+ " 22.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 12.2875 | \n",
+ " NaN | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 43 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " PassengerId Name Age SibSp \\\n",
+ "0 892 Kelly, Mr. James 34.5 0 \n",
+ "1 893 Wilkes, Mrs. James (Ellen Needs) 47.0 1 \n",
+ "2 894 Myles, Mr. Thomas Francis 62.0 0 \n",
+ "3 895 Wirz, Mr. Albert 27.0 0 \n",
+ "4 896 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 22.0 1 \n",
+ "\n",
+ " Parch Fare Cabin Embarked_C Embarked_Q Embarked_S ... \\\n",
+ "0 0 7.8292 NaN False True False ... \n",
+ "1 0 7.0000 NaN False False True ... \n",
+ "2 0 9.6875 NaN False True False ... \n",
+ "3 0 8.6625 NaN False False True ... \n",
+ "4 1 12.2875 NaN False False True ... \n",
+ "\n",
+ " TcktName_STONO2 TcktName_STONOQ TcktName_WC TcktName_WEP TcktName_x \\\n",
+ "0 False False False False True \n",
+ "1 False False False False True \n",
+ "2 False False False False True \n",
+ "3 False False False False True \n",
+ "4 False False False False True \n",
+ "\n",
+ " Pclass_1 Pclass_2 Pclass_3 Sex_female Sex_male \n",
+ "0 False False True False True \n",
+ "1 False False True True False \n",
+ "2 False True False False True \n",
+ "3 False False True False True \n",
+ "4 False False True True False \n",
+ "\n",
+ "[5 rows x 43 columns]"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Encode Sex as a categorical, then split it into Sex_female / Sex_male indicators.\n",
+ "test_df[\"Sex\"] = test_df.Sex.astype(dtype=\"category\")\n",
+ "test_df = pd.get_dummies(data=test_df, columns=[\"Sex\"])\n",
+ "test_df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Drop Passenger ID and Cabin (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the PassengerId and Cabin columns from the training set (in place)\n",
+ "train_df.drop(columns=[\"PassengerId\", \"Cabin\"], inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C',\n",
+ " 'Embarked_Q', 'Embarked_S', 'TcktName_A4', 'TcktName_A5', 'TcktName_AS',\n",
+ " 'TcktName_C', 'TcktName_CA', 'TcktName_CASOTON', 'TcktName_FC',\n",
+ " 'TcktName_FCC', 'TcktName_Fa', 'TcktName_LINE', 'TcktName_PC',\n",
+ " 'TcktName_PP', 'TcktName_PPP', 'TcktName_SC', 'TcktName_SCA4',\n",
+ " 'TcktName_SCAH', 'TcktName_SCOW', 'TcktName_SCPARIS',\n",
+ " 'TcktName_SCParis', 'TcktName_SOC', 'TcktName_SOP', 'TcktName_SOPP',\n",
+ " 'TcktName_SOTONO2', 'TcktName_SOTONOQ', 'TcktName_SP', 'TcktName_STONO',\n",
+ " 'TcktName_STONO2', 'TcktName_SWPP', 'TcktName_WC', 'TcktName_WEP',\n",
+ " 'TcktName_x', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',\n",
+ " 'Sex_male'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show the columns remaining in the training set\n",
+ "train_df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Remove the PassengerId and Cabin columns from the test set (in place)\n",
+ "test_df.drop(columns=[\"PassengerId\", \"Cabin\"], inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q',\n",
+ " 'Embarked_S', 'TcktName_A', 'TcktName_A4', 'TcktName_A5',\n",
+ " 'TcktName_AQ3', 'TcktName_AQ4', 'TcktName_C', 'TcktName_CA',\n",
+ " 'TcktName_FC', 'TcktName_FCC', 'TcktName_LP', 'TcktName_PC',\n",
+ " 'TcktName_PP', 'TcktName_SC', 'TcktName_SCA3', 'TcktName_SCA4',\n",
+ " 'TcktName_SCAH', 'TcktName_SCPARIS', 'TcktName_SCParis', 'TcktName_SOC',\n",
+ " 'TcktName_SOPP', 'TcktName_SOTONO2', 'TcktName_SOTONOQ',\n",
+ " 'TcktName_STONO', 'TcktName_STONO2', 'TcktName_STONOQ', 'TcktName_WC',\n",
+ " 'TcktName_WEP', 'TcktName_x', 'Pclass_1', 'Pclass_2', 'Pclass_3',\n",
+ " 'Sex_female', 'Sex_male'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Print the columns of the test set\n",
+ "# NOTE(review): per the recorded outputs, the TcktName_* dummy columns differ between train_df and test_df\n",
+ "# (e.g. TcktName_AS appears only in train, TcktName_AQ3 only in test), so the two frames would need to be\n",
+ "# aligned before these columns could be used as model features.\n",
+ "test_df.columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "# Modeling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.svm import SVC\n",
+ "from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn.metrics import accuracy_score"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Train - Test Split (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "891"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of rows in the training set\n",
+ "train_df_len = train_df.shape[0]\n",
+ "train_df_len"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "X_train 594\n",
+ "X_test 297\n",
+ "y_train 594\n",
+ "y_test 297\n",
+ "test 418\n"
+ ]
+ }
+ ],
+ "source": [
+ "# train_df_len == len(train_df), so this slice keeps every training row\n",
+ "train = train_df[:train_df_len]\n",
+ "test = test_df\n",
+ "\n",
+ "# Select all numerical values from train and test\n",
+ "# NOTE(review): the one-hot columns are displayed as True/False in the outputs above, so they are presumably\n",
+ "# bool dtype and therefore excluded by include=[np.number] -- confirm that dropping them is intended.\n",
+ "numeric_train = train.select_dtypes(include=[np.number])\n",
+ "numeric_test = test.select_dtypes(include=[np.number])\n",
+ "\n",
+ "# Separate the features from the target before splitting, so re-running the cell never feeds\n",
+ "# an already-split X_train / y_train back into train_test_split.\n",
+ "features = numeric_train.drop(columns=[\"Survived\"])\n",
+ "target = numeric_train[\"Survived\"]\n",
+ "\n",
+ "# Split the train data into train and test sets with a 1/3 ratio.\n",
+ "# A fixed random_state makes the split (and every score computed from it) reproducible across runs.\n",
+ "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=1/3, random_state=42)\n",
+ "\n",
+ "\n",
+ "print(\"X_train\", len(X_train))\n",
+ "print(\"X_test\", len(X_test))\n",
+ "print(\"y_train\", len(y_train))\n",
+ "print(\"y_test\", len(y_test))\n",
+ "print(\"test\", len(numeric_test))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Simple Logistic Regression (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy on the training set: 70.03\n",
+ "Accuracy on the test set: 69.36\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fit a logistic regression with default settings on the training split\n",
+ "logreg = LogisticRegression().fit(X_train, y_train)\n",
+ "\n",
+ "# Mean accuracy expressed as a percentage, rounded to two decimals\n",
+ "acc_log_train = round(100 * logreg.score(X_train, y_train), 2)\n",
+ "acc_log_test = round(100 * logreg.score(X_test, y_test), 2)\n",
+ "\n",
+ "# Print the accuracy on the training and test set\n",
+ "print(\"Accuracy on the training set:\", acc_log_train)\n",
+ "print(\"Accuracy on the test set:\", acc_log_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "## Hyperparameter Tuning -- Grid Search -- Cross Validation\n",
+ "We will compare 5 ML classifiers and evaluate the mean accuracy of each using stratified cross-validation.\n",
+ "\n",
+ "* Decision Tree\n",
+ "* SVM\n",
+ "* Random Forest\n",
+ "* KNN\n",
+ "* Logistic Regression"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Candidate models and, in the same order, the hyperparameter grid searched for each one.\n",
+ "random_state = 42\n",
+ "classifier = [DecisionTreeClassifier(random_state = random_state),\n",
+ "              SVC(random_state = random_state),\n",
+ "              RandomForestClassifier(random_state = random_state),\n",
+ "              # liblinear supports both the l1 and l2 penalties searched below; the default lbfgs solver\n",
+ "              # rejects penalty=\"l1\", which would make half of the logistic-regression candidates fail to fit.\n",
+ "              LogisticRegression(random_state = random_state, solver = \"liblinear\"),\n",
+ "              KNeighborsClassifier()]\n",
+ "\n",
+ "# Decision tree: minimum samples needed to split a node and maximum tree depth\n",
+ "dt_param_grid = {\"min_samples_split\" : range(10,500,20),\n",
+ "                 \"max_depth\": range(1,20,2)}\n",
+ "\n",
+ "# SVM: RBF kernel only, varying kernel width (gamma) and regularisation strength (C)\n",
+ "svc_param_grid = {\"kernel\" : [\"rbf\"],\n",
+ "                  \"gamma\": [0.001, 0.01, 0.1, 1],\n",
+ "                  \"C\": [1,10,50,100,200,300,1000]}\n",
+ "\n",
+ "# Random forest: bootstrap disabled, gini criterion, varying tree count and split/leaf constraints\n",
+ "rf_param_grid = {\"max_features\": [1,3,10],\n",
+ "                 \"min_samples_split\":[2,3,10],\n",
+ "                 \"min_samples_leaf\":[1,3,10],\n",
+ "                 \"bootstrap\":[False],\n",
+ "                 \"n_estimators\":[100,300],\n",
+ "                 \"criterion\":[\"gini\"]}\n",
+ "\n",
+ "# Logistic regression: C from 1e-3 to 1e3 (7 log-spaced values) with l1 or l2 penalty\n",
+ "logreg_param_grid = {\"C\":np.logspace(-3,3,7),\n",
+ "                     \"penalty\": [\"l1\",\"l2\"]}\n",
+ "\n",
+ "# KNN: odd neighbour counts 1..19, uniform or distance weighting, two distance metrics\n",
+ "knn_param_grid = {\"n_neighbors\": np.linspace(1,19,10, dtype = int).tolist(),\n",
+ "                  \"weights\": [\"uniform\",\"distance\"],\n",
+ "                  \"metric\":[\"euclidean\",\"manhattan\"]}\n",
+ "\n",
+ "# Must stay in the same order as `classifier`: the grid search pairs the two lists by position.\n",
+ "classifier_param = [dt_param_grid,\n",
+ "                    svc_param_grid,\n",
+ "                    rf_param_grid,\n",
+ "                    logreg_param_grid,\n",
+ "                    knn_param_grid]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting 10 folds for each of 250 candidates, totalling 2500 fits\n",
+ "0.7035875706214688\n",
+ "Fitting 10 folds for each of 28 candidates, totalling 280 fits\n",
+ "0.720367231638418\n",
+ "Fitting 10 folds for each of 54 candidates, totalling 540 fits\n",
+ "0.7492090395480225\n",
+ "Fitting 10 folds for each of 14 candidates, totalling 140 fits\n",
+ "0.7019774011299434\n",
+ "Fitting 10 folds for each of 40 candidates, totalling 400 fits\n",
+ "0.7188983050847456\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Grid-search each classifier against its matching parameter grid using 10-fold stratified CV,\n",
+ "# recording the best mean accuracy and the corresponding fitted estimator.\n",
+ "cv_result = []\n",
+ "best_estimators = []\n",
+ "for estimator, grid in zip(classifier, classifier_param):\n",
+ "    clf = GridSearchCV(\n",
+ "        estimator,\n",
+ "        param_grid=grid,\n",
+ "        cv=StratifiedKFold(n_splits=10),\n",
+ "        scoring=\"accuracy\",\n",
+ "        n_jobs=-1,\n",
+ "        verbose=1,\n",
+ "    )\n",
+ "    clf.fit(X_train, y_train)\n",
+ "    cv_result.append(clf.best_score_)\n",
+ "    best_estimators.append(clf.best_estimator_)\n",
+ "    print(clf.best_score_)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Cross Validation Scores')"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Pair each best CV score with its model label (labels follow the order of the `classifier` list)\n",
+ "model_names = [\"DecisionTreeClassifier\", \"SVM\", \"RandomForestClassifier\", \"LogisticRegression\", \"KNeighborsClassifier\"]\n",
+ "cv_results = pd.DataFrame({\"Cross Validation Means\": cv_result, \"ML Models\": model_names})\n",
+ "\n",
+ "# Horizontal bar chart: one bar per model, bar length = best mean CV accuracy\n",
+ "g = sns.barplot(data=cv_results, x=\"Cross Validation Means\", y=\"ML Models\")\n",
+ "g.set_xlabel(\"Mean Accuracy\")\n",
+ "g.set_title(\"Cross Validation Scores\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Ensemble Modeling (Assignment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy Score: 0.7138047138047138\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Soft-voting ensemble built from the tuned decision tree, random forest and logistic regression\n",
+ "ensemble_members = [\n",
+ "    (\"dt\", best_estimators[0]),\n",
+ "    (\"rfc\", best_estimators[2]),\n",
+ "    (\"lr\", best_estimators[3]),\n",
+ "]\n",
+ "votingC = VotingClassifier(estimators=ensemble_members, voting=\"soft\", n_jobs=-1).fit(X_train, y_train)\n",
+ "\n",
+ "# Print the accuracy score of the voting classifier\n",
+ "y_pred = votingC.predict(X_test)\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(\"Accuracy Score:\", accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the null values which are going to cause you an error in the next cell.\n",
+ "# Features and labels are filtered with one shared mask so they keep exactly the same rows;\n",
+ "# dropping nulls from each object independently could leave X_train and y_train misaligned.\n",
+ "complete_rows = X_train.notna().all(axis=1) & y_train.notna()\n",
+ "X_train = X_train[complete_rows]\n",
+ "y_train = y_train[complete_rows]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "
\n",
+ "## Prediction and Submission"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " PassengerId Survived\n",
+ "0 892 0.0\n",
+ "1 893 0.0\n",
+ "2 894 0.0\n",
+ "3 895 0.0\n",
+ "4 896 0.0\n",
+ ".. ... ...\n",
+ "413 1305 NaN\n",
+ "414 1306 NaN\n",
+ "415 1307 NaN\n",
+ "416 1308 NaN\n",
+ "417 1309 NaN\n",
+ "\n",
+ "[418 rows x 2 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Predict on the real test set (numeric_test, 418 rows), not on X_test, which is only the validation\n",
+ "# split of the training data: using X_test left the last passengers with NaN and paired the remaining\n",
+ "# predictions with unrelated PassengerIds.\n",
+ "# Columns are taken in the order the model was trained on; any remaining nulls are filled with the\n",
+ "# training medians because every test passenger needs a prediction (rows cannot be dropped).\n",
+ "test_features = numeric_test[X_train.columns].fillna(X_train.median())\n",
+ "\n",
+ "# assumes numeric_test rows are in the same order as test_PassengerId -- TODO confirm\n",
+ "test_survived = pd.Series(votingC.predict(test_features), index=test_PassengerId.index, name=\"Survived\").astype(int)\n",
+ "results = pd.concat([test_PassengerId, test_survived], axis=1)\n",
+ "results.to_csv(\"titanic.csv\", index=False)\n",
+ "print(results)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Congratulations on finishing the assignment!!\n",
+ "\n",
+ "### The submission consists of the titanic.csv file that was just created and this notebook, which you have modified."
+ ]
+ }
+ ],
+ "metadata": {
+ "kaggle": {
+ "accelerator": "none",
+ "dataSources": [
+ {
+ "databundleVersionId": 26502,
+ "sourceId": 3136,
+ "sourceType": "competition"
+ }
+ ],
+ "dockerImageVersionId": 29852,
+ "isGpuEnabled": false,
+ "isInternetEnabled": false,
+ "language": "python",
+ "sourceType": "notebook"
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}