diff --git a/Titanic_Missing_Data_Handling.ipynb b/Titanic_Missing_Data_Handling.ipynb new file mode 100644 index 0000000..823c727 --- /dev/null +++ b/Titanic_Missing_Data_Handling.ipynb @@ -0,0 +1,703 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np ### importing numpy and pandas\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22.0 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", + "2 Heikkinen, Miss. Laina female 26.0 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", + "4 Allen, Mr. William Henry male 35.0 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = pd.read_csv(\"E:\\MAYUKH\\\\naive bayes\\\\titanic\\\\titanic.csv\") ### import the csv dataset\n", + "dataset.head() ### looking at the head of the csv dataset..." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassSexAgeFare
003male22.07.2500
111female38.071.2833
213female26.07.9250
311female35.053.1000
403male35.08.0500
\n", + "
" + ], + "text/plain": [ + " Survived Pclass Sex Age Fare\n", + "0 0 3 male 22.0 7.2500\n", + "1 1 1 female 38.0 71.2833\n", + "2 1 3 female 26.0 7.9250\n", + "3 1 1 female 35.0 53.1000\n", + "4 0 3 male 35.0 8.0500" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True) ###Dropping the unwanted columns...\n", + "dataset.head() ### After dropping the unwanted columns..." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "target=dataset.Survived ### Fixing Survived columns as target to predict the survivals...\n", + "inputs=dataset.drop('Survived',axis='columns') ### Columns other than \"Survived\" are taken as input..." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
femalemale
001
110
210
310
401
\n", + "
" + ], + "text/plain": [ + " female male\n", + "0 0 1\n", + "1 1 0\n", + "2 1 0\n", + "3 1 0\n", + "4 0 1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dummies=pd.get_dummies(inputs.Sex) ### Converting the Sex column to integer type from text...\n", + "dummies.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PclassSexAgeFarefemalemale
03male22.07.250001
11female38.071.283310
23female26.07.925010
31female35.053.100010
43male35.08.050001
\n", + "
" + ], + "text/plain": [ + " Pclass Sex Age Fare female male\n", + "0 3 male 22.0 7.2500 0 1\n", + "1 1 female 38.0 71.2833 1 0\n", + "2 3 female 26.0 7.9250 1 0\n", + "3 1 female 35.0 53.1000 1 0\n", + "4 3 male 35.0 8.0500 0 1" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = pd.concat([inputs,dummies],axis='columns') ### Appending the dummy columns replacing Sex columns in inputs...\n", + "inputs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PclassAgeFarefemalemale
0322.07.250001
1138.071.283310
2326.07.925010
3135.053.100010
4335.08.050001
\n", + "
" + ], + "text/plain": [ + " Pclass Age Fare female male\n", + "0 3 22.0 7.2500 0 1\n", + "1 1 38.0 71.2833 1 0\n", + "2 3 26.0 7.9250 1 0\n", + "3 1 35.0 53.1000 1 0\n", + "4 3 35.0 8.0500 0 1" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs.drop('Sex',axis='columns',inplace=True) ### Dropping the Sex column...\n", + "inputs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "### Looking for missing data in any of the attributes in the data..." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Age'], dtype='object')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs.columns[inputs.isna().any()] ### Searching for any NaN value in any column...found in Age column..." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 22.0\n", + "1 38.0\n", + "2 26.0\n", + "3 35.0\n", + "4 35.0\n", + "5 NaN\n", + "6 54.0\n", + "7 2.0\n", + "8 27.0\n", + "9 14.0\n", + "Name: Age, dtype: float64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs.Age[:10] ###Looking for the NaN values in Age column..." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 22.000000\n", + "1 38.000000\n", + "2 26.000000\n", + "3 35.000000\n", + "4 35.000000\n", + "5 29.699118\n", + "6 54.000000\n", + "7 2.000000\n", + "8 27.000000\n", + "9 14.000000\n", + "Name: Age, dtype: float64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs.Age = inputs.Age.fillna(inputs.Age.mean()) ### Filling the NaN values with the mean of the Age column... \n", + "inputs.Age[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "### After replacing the missing values with the mean value...\n", + "### Again checking for any missing data in any of the attributes..." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index([], dtype='object')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs.columns[inputs.isna().any()]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "### So there are no more missing values ini any of the attrinutes in the data..." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}