<!DOCTYPE html>
<html lang="en">
<head>
<base href="./" />
<link rel="shortcut icon" type="image/png" href="/assets/img/favicon.png" />
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="Site description">
<meta name="author" content="Biljana Zobenica">
<link rel="canonical" href="/">
<title>Biljana Zobenica</title>
<!-- Bootstrap core CSS -->
<link href="assets/css/bootstrap.min.css" rel="stylesheet">
<!-- Custom fonts for this template -->
<link href="assets/css/all.min.css" rel="stylesheet" type="text/css">
<link href="https://fonts.googleapis.com/css?family=Montserrat:400,700" rel="stylesheet" type="text/css">
<link href='https://fonts.googleapis.com/css?family=Kaushan+Script' rel='stylesheet' type='text/css'>
<link href='https://fonts.googleapis.com/css?family=Droid+Serif:400,700,400italic,700italic' rel='stylesheet'
type='text/css'>
<link href='https://fonts.googleapis.com/css?family=Roboto+Slab:400,100,300,700' rel='stylesheet' type='text/css'>
<!-- Custom styles for this theme -->
<link href="assets/css/agency.css" rel="stylesheet">
<!-- Page container change top padding when nav shrinks -->
</head>
<body id="page-top">
<div class="container">
<div class="row">
<div class="col-lg-12">
<!-- Navigation -->
<nav class="navbar navbar-expand-lg navbar-dark fixed-top" id="mainNav">
<div class="container">
<a class="navbar-brand js-scroll-trigger" href="#page-top">Biljana Zobenica</a>
<button class="navbar-toggler navbar-toggler-right" type="button" data-toggle="collapse"
data-target="#navbarResponsive" aria-controls="navbarResponsive" aria-expanded="false"
aria-label="Toggle navigation">
Menu
<i class="fas fa-bars"></i>
</button>
<div class="collapse navbar-collapse" id="navbarResponsive">
<ul class="navbar-nav text-uppercase ml-auto">
<li class="nav-item"><a class="nav-link js-scroll-trigger" href="#services">Likes</a>
</li>
<li class="nav-item"><a class="nav-link js-scroll-trigger" href="#portfolio">Portfolio</a>
</li>
<li class="nav-item"><a class="nav-link js-scroll-trigger" href="#timeline">About</a>
</li>
<li class="nav-item"><a class="nav-link js-scroll-trigger" href="#contact">Contact</a>
</li>
<li class="nav-item"><a class="nav-link js-scroll-trigger" href="./blog.html">Blog</a>
</li>
</ul>
</div>
</div>
</nav>
<section id="content" class="section-top-padding">
<article class="article-top-padding">
<h1>
<a href="https://www.kaggle.com/c/titanic" rel="bookmark"
title="Permalink to Kaggle's Titanic Competition" target="_blank">
Breaking the Ice with Titanic
</a>
</h1>
<i><time datetime="2021-06-03T00:00:00-04:00"> Thu 03 June 2021</time></i>
<div class="entry-content">
<div class="panel">
<br />
</div>
<div class="container">
<br />
<img class="img-responsive" style="width: 50%;"
src="/assets/img/blog/titanic_background.jpg">
</div>
<hr class="featurette-divider">
<h2>Kaggle Machine Learning Competition: <br> Predicting the Survival of Titanic Passengers
</h2>
<br>
<p>In this blog post, I will walk through the Titanic dataset. <strong style="color:#df2344;">As
opposed to its infamous destiny, this project is my way of
breaking the ice!</strong> <br>
All the credit goes to Abhishek Kumar and his course "Doing Data Science with Python"
on Pluralsight.
Thanks to his concise and clear mentorship, I have completed my first Kaggle submission
in the Titanic competition.
For the project structure, <a
href="https://github.com/biljana-zobenica/cookiecutter-data-science" target="_blank">the
cookiecutter template</a> has been applied.
Also, the data analysis steps are explained in detail in a
<a href="https://github.com/biljana-zobenica/titanic/tree/master/notebooks"
target="_blank">Jupyter notebook</a> and tracked with
<a href="https://github.com/biljana-zobenica/titanic" target="_blank">GitHub version
control</a>.
<br>
<ul>
<li>Environment</li>
<li>Extracting Data</li>
<li>Exploring and Processing Data</li>
<li>Building and Evaluating Predictive Model</li>
</ul>
<hr class="featurette-divider">
<h3>Setting up the Environment</h3>
<br>
<p>Before diving into the data, as mentioned above, I will go through the tools and setup
used to prepare the environment.
Firstly, regarding the Python distribution, <a href="https://www.anaconda.com/"
target="_blank">Anaconda</a> has been used: a specialized Python distribution that
comes with pre-installed and optimized Python packages.
For the project, the latest available version, <a href="https://www.python.org/"
target="_blank">Python 3.9</a>, has been used. All analyses have been
documented and showcased in <a href="https://jupyter.org/" target="_blank">Jupyter
notebooks</a>, in the notebooks folder,
as part of the common data science project template called the <a
href="https://cookiecutter.readthedocs.io/en/1.7.2/README.html"
target="_blank">cookiecutter template</a>. All changes and important insights have
been tracked with <a href="https://github.com/" target="_blank">GitHub</a> version
control.
</p>
<hr class="featurette-divider">
<h3>Extracting Data</h3>
<br>
<p>
For the challenge description, evaluation criteria and dataset, <a href="https://www.kaggle.com/c/titanic/overview"
target="_blank">the
Kaggle
platform</a> has been used.
A short description of the Titanic challenge: Titanic is one of the
most infamous shipwrecks in history.
On April 15, 1912, the "unsinkable" Titanic sank after colliding with an iceberg,
resulting in one of the deadliest shipwrecks ever.
While there was some element of luck involved in surviving, it is likely that some
groups of people were more likely to survive than others.
In this challenge, the main question is defined as follows: <strong
style="color:#df2344;">"What kind of people were
more likely to survive?"</strong>
The course also explains a couple of common data science practices for
extracting a dataset, using techniques such as extraction from databases (the sqlite3
library), through APIs (the requests library) and via web scraping (the requests and BeautifulSoup
libraries).
In <a
href="https://github.com/biljana-zobenica/titanic/blob/master/notebooks/1.0-bz-extract-titanic-data.ipynb"
target="_blank">the first Jupyter notebook</a>, an automated script for extracting the
data has been created.
</p>
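<p>As an illustration of the database route mentioned above, a minimal sqlite3-to-pandas extraction might look like the sketch below. This is not the course's actual extraction script; the table and column names are invented for the example.</p>

```python
import sqlite3

import pandas as pd

# build a small in-memory database standing in for a real data source
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE passengers (name TEXT, age REAL)")
conn.executemany(
    "INSERT INTO passengers VALUES (?, ?)",
    [("Braund, Mr. Owen Harris", 22.0), ("Heikkinen, Miss. Laina", 26.0)],
)

# extract the whole table straight into a dataframe
df = pd.read_sql_query("SELECT * FROM passengers", conn)
print(df.shape)  # (2, 2)
conn.close()
```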
<hr class="featurette-divider">
<h3>Exploring and Processing Data</h3>
<br>
<p>
This phase of the project covers some basic and advanced exploratory data
analysis techniques: basic data structure, summary statistics,
distributions, grouping, crosstabs and pivots. This is also where most of the time is invested,
in data cleaning, munging and visualization, using Python libraries
such as NumPy, Pandas and Matplotlib. All these steps are documented in
<a href="https://github.com/biljana-zobenica/titanic/blob/master/notebooks/2.0-bz-exploring-processing-data.ipynb"
target="_blank">the second Jupyter notebook</a>.
</p>
<p>Firstly, we import the Python libraries and, as is common practice, give them aliases. To
start the basic
exploratory
data analysis, we need to import the dataset, more precisely the train and test .csv files.
</p>
<div class="highlight">
<pre>
<span class="c"> # import python libraries</span>
<span class="s">import</span> pandas <span class="s">as</span> pd
<span class="s">import</span> numpy <span class="s">as</span> np
<span class="s">import</span> os
<span class="c"> # set the path of the raw data, in accordance with cookiecutter template</span>
raw_data_path <span class="s">=</span> os<span class="s">.</span>path<span class="s">.</span>join(os<span class="s">.</span>path<span class="s">.</span>pardir, <span class="f">'data'</span>, <span class="f">'raw'</span>)
train_file_path <span class="s">=</span> os<span class="s">.</span>path<span class="s">.</span>join(raw_data_path, <span class="f">'train.csv'</span>)
test_file_path <span class="s">=</span> os<span class="s">.</span>path<span class="s">.</span>join(raw_data_path, <span class="f">'test.csv'</span>)
<span class="c"> # read the data with .read_csv method</span>
train_df <span class="s">=</span> pd<span class="s">.</span>read_csv(train_file_path, index_col<span class="s">=</span><span class="f">'PassengerId'</span>)
test_df <span class="s">=</span> pd<span class="s">.</span>read_csv(test_file_path, index_col<span class="s">=</span><span class="f">'PassengerId'</span>)
</pre>
</div>
<p>Analysing the data structure,
we can see some basic information with the .info method: the number of entries, the columns,
the data types,
whether there are missing values, and the memory usage of the dataframe. Since the train and test data come separately, we
concatenate the
two dataframes into one dataset. After this, the data info looks as follows:
</p>
<div class="highlight">
<pre>
<span class="c"> # concatenate the train and test datasets with pandas .concat method</span>
df<span class="s"> = </span>pd<span class="s">.</span>concat((train_df, test_df), axis<span class="s">=</span><span class="n">0</span>)
<span class="c"> # get some basic information of the dataframe with .info method</span>
df<span class="s">.</span>info()
</pre>
<pre> <class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 1309 non-null int64
1 Pclass 1309 non-null int64
2 Name 1309 non-null object
3 Sex 1309 non-null object
4 Age 1046 non-null float64
5 SibSp 1309 non-null int64
6 Parch 1309 non-null int64
7 Ticket 1309 non-null object
8 Fare 1308 non-null float64
9 Cabin 295 non-null object
10 Embarked 1307 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 122.7+ KB </pre>
</div>
<p>So, in total, we have 1309 entries, 11 columns and the data type of each one. Apparently, we
have some missing data,
which we will analyse and resolve in the proper way. In this step, we just explore the data using simple
functions such as .head and .tail, slicing
techniques, and filtering with .loc. In this way, a basic overview of the data structure is gained.
</p>
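<p>Since only .head and .tail are shown below, here is a quick sketch of the slicing and .loc filtering mentioned above, applied to a toy dataframe (the values are invented, not real Titanic rows):</p>

```python
import pandas as pd

# a toy dataframe standing in for the Titanic data
df = pd.DataFrame({"Pclass": [1, 3, 3], "Age": [38.0, 26.0, 35.0]},
                  index=[2, 3, 5])

# label-based selection with .loc: rows with labels 2..3, one column
print(df.loc[2:3, "Age"].tolist())   # [38.0, 26.0]

# boolean filtering: third-class passengers only
third = df.loc[df.Pclass == 3]
print(len(third))                    # 2
```

Note that .loc slices by index label and is inclusive of both endpoints, unlike positional slicing.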
<div class="highlight">
<pre>
<span class="c"> # use .head to get top 5 rows</span>
df<span class="s">.</span>head()
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_head.png">
<br>
<div class="highlight">
<pre>
<span class="c"> # use .tail to get bottom 5 rows</span>
df<span class="s">.</span>tail()
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_tail.png">
<br>
<p>
Now that we have a general idea of the dataset contents, we can explore some specific features more deeply.
</p>
<br>
<h5>BASIC Exploratory Data Analysis (EDA)</h5>
<br>
<p>Next, we analyse the summary statistics, depending on the type of the feature: numerical or categorical.
For numerical features, we analyse the centrality measures (mean, median) and the dispersion measures (range,
percentiles, variance, standard deviation).
For categorical features, we analyse the total and unique counts, category counts and proportions, as well as per-category
statistics.
By using the .describe method with the include='all' argument, we get the summary statistics of
all features, as follows:
</p>
<div class="highlight">
<pre>
<span class="c"> # use .describe method (include='all' argument) to get a summary statistics</span>
df<span class="s">.</span>describe(include<span class="s">=</span><span class="f">'all'</span>)
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_summary_stat_all.jpg">
<br>
<p>
For numerical features, we can analyse the centrality and dispersion measures, whereas the categorical ones
need to be analysed in a different way. Take for example the Pclass feature, which represents
the class of the passenger. The graph below shows the number of passengers in each class: 1st, 2nd or 3rd.
</p>
<div class="highlight">
<pre>
<span class="c"> # use .value_counts for analysing categorical features and .plot for visualization</span>
df<span class="s">.</span>Pclass<span class="s">.</span>value_counts()<span class="s">.</span>plot(kind<span class="s">=</span><span class="f">'bar'</span>, rot<span class="s">=</span><span class="n">0</span>, title<span class="s">=</span><span class="f">'Class wise passenger count'</span>, color<span class="s">=</span><span class="f">'c'</span>);
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_class_bar.png">
<br>
<p>
Clearly, the largest number of passengers travelled in the lowest class.
</p>
<br>
<h5>ADVANCED Exploratory Data Analysis (EDA)</h5>
<br>
<p>
By creating a crosstab of the Pclass and Sex features,
we draw the additional conclusion that the majority of passengers in the third class were male,
493 of them, to be exact. This is a very handy EDA technique, and its extension is the pivot table. With pivots,
we can add a function argument, which is applied to a specific feature.
</p>
<div class="highlight">
<pre>
<span class="c"> # create crosstab for Sex and Pclass features to get insights, present with bar chart</span>
pd<span class="s">.</span>crosstab (df<span class="s">.</span>Sex, df<span class="s">.</span>Pclass)<span class="s">.</span>plot(kind<span class="s">=</span><span class="f">'bar'</span>);
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_crosstab.png">
<br>
<p>
For instance, using the same features, we could create a pivot table by defining the function argument to calculate mean of the value Age.
</p>
<div class="highlight">
<pre>
<span class="c"> # create pivot table by defining 4 arguments (rows, columns, values and function)</span>
df<span class="s">.</span>pivot_table (index<span class="s">=</span><span class="f">'Sex'</span>, columns<span class="s">=</span><span class="f">'Pclass'</span>, values<span class="s">=</span><span class="f">'Age'</span>, aggfunc<span class="s">=</span><span class="f">'mean'</span>)
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # or get the same result by using .groupby, .mean and .unstack methods</span>
df<span class="s">.</span>groupby ([<span class="f">'Sex'</span>, <span class="f">'Pclass'</span>])<span class="s">.</span>Age<span class="s">.</span>mean()<span class="s">.</span>unstack()
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_pivot.png">
<br>
<p> So, male passengers in the 3rd class were 25.96 years old on average.</p>
<p>
Furthermore, we would like to apply some visualization tools in order to analyse the distribution of the data.
Firstly, we make a distinction between a univariate distribution, where we use a histogram
and/or a kernel density estimation (KDE) plot, and a bivariate distribution, i.e. a distribution of two features,
where we use a scatter plot for visualization. When analysing a data distribution, we look into very important aspects of it, such as
skewness and kurtosis, and its deviations from the normal distribution, which serves as the standard.
</p>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_histogram_fare_age.png">
<p>Looking at the histograms of the Age and Fare features, we can see positively skewed distributions, i.e.
their mean values are higher than their medians.
In terms of Age, this means that 50% of the passengers are older than the median of 28 years and the other half is
younger. Also, the mean age is 29.88,
so the distribution is slightly skewed to the right: most ages lie around the median, but a longer tail of some very old passengers
shifts the mean age slightly above the median. The same logic applies to Fare, whose
distribution is even more positively skewed.
</p>
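<p>The mean-versus-median reasoning above can also be checked numerically; pandas exposes .skew() for exactly this. A sketch on a made-up right-skewed series standing in for the Age column:</p>

```python
import pandas as pd

# a right-skewed toy series: most values near the median, a long upper tail
age = pd.Series([20, 25, 27, 28, 28, 29, 31, 34, 70, 80])

print(age.median())    # 28.5
print(age.mean())      # 37.2 -> mean pulled above the median by the tail
print(age.skew() > 0)  # True -> positively (right) skewed
```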
<br>
<h5>DATA MUNGING</h5>
<br>
<p>
Data munging is a very important part of data analysis, which refers to dealing with missing values and outliers.
Using the .info method earlier, we already detected some features with missing values (Age, Fare and Embarked, whereas Cabin will be analysed in the feature engineering section).
Similarly, while plotting some features, especially Age, we have seen some extreme values.
</p>
<br>
<h6>WORKING WITH MISSING VALUES</h6>
<br>
<p>
As we realised earlier, the Titanic dataset has a couple of features with missing values. In terms of possible solutions,
we have these techniques at our disposal:
<ul>
<li>Deletion - only if few observations have a missing-value issue;</li>
<li>Imputation - replacing NaNs with plausible data, such as mean, median or mode imputation;</li>
<li>Forward/backward fill - used in case of time series or sequential data;</li>
<li>Predictive model;</li>
</ul>
The last two in the list represent advanced techniques for resolving a missing-value issue.
</p>
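<p>The first three techniques map directly onto pandas one-liners. A small sketch on a toy series with two NaNs:</p>

```python
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0, np.nan, 5.0])

# deletion: drop the rows with NaN
print(s.dropna().tolist())          # [1.0, 3.0, 5.0]

# imputation: replace NaNs with the mean of the observed values
print(s.fillna(s.mean()).tolist())  # [1.0, 3.0, 3.0, 3.0, 5.0]

# forward fill: carry the last observed value forward (sequential data)
print(s.ffill().tolist())           # [1.0, 1.0, 3.0, 3.0, 5.0]
```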
<br>
<h6>FEATURE: Embarked</h6>
<br>
<p>
From our previous EDA section, we see that there are two NaN values of this feature in the dataset.
</p>
<div class="highlight">
<pre>
<span class="c"> # use .isnull function to extract missing values</span>
df [df<span class="s">.</span>Embarked<span class="s">.</span>isnull()]
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_embarked_missing_value.png">
<br>
<div class="highlight">
<pre>
<span class="c"> # use .value_counts function to count the frequency of embarkment</span>
df<span class="s">.</span>Embarked<span class="s">.</span>value_counts()
</pre>
</div>
<div class="highlight">
<pre> S 914
C 270
Q 123
Name: Embarked, dtype: int64
</pre>
</div>
<p>
So, the highest number of embarkments happened at location S.
<br>But which embarkment point had the higher survival count? After all, both of the passengers with the missing values survived.
</p>
<div class="highlight">
<pre>
<span class="c"> # use .crosstab technique to discover which embarkment location had the highest survival rate</span>
pd<span class="s">.</span>crosstab (df[df<span class="s">.</span>Survived <span class="s">!=</span> -888]<span class="s">.</span>Survived, df[df.Survived <span class="s">!=</span> -888]<span class="s">.</span>Embarked)
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_embarked_survival_rate.png">
<br>
<p>
Here, we filter out the survival value -888, which marks the test data, where we do not have survival information.
So, in absolute terms, the embarkment point with the highest survival count was S. In relative terms, the result is different.
</p>
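<p>The relative comparison can be made explicit with the normalize argument of pd.crosstab. Sketched here on a tiny invented sample rather than the full dataset:</p>

```python
import pandas as pd

# invented sample: embarkment point and survival flag
sample = pd.DataFrame({
    "Embarked": ["S", "S", "S", "S", "C", "C"],
    "Survived": [0, 0, 0, 1, 1, 1],
})

# normalize='columns' turns counts into per-port survival proportions
rates = pd.crosstab(sample.Survived, sample.Embarked, normalize="columns")
print(rates.loc[1, "S"])  # 0.25 -> 1 of the 4 S passengers survived
print(rates.loc[1, "C"])  # 1.0  -> both C passengers survived
```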
<div class="highlight">
<pre>
<span class="c"> # explore Pclass and Fare for each Embarkment point</span>
df<span class="s">.</span>groupby ([<span class="f">'Pclass'</span>, <span class="f">'Embarked'</span>])<span class="s">.</span>Fare<span class="s">.</span>median()
</pre>
</div>
<p>
As both of these passengers survived, were in 1st class and paid a Fare of 80, let's try to use this information as well.
</p>
<div class="highlight">
<pre> Pclass Embarked
1 C 76.7292
Q 90.0000
S 52.0000
2 C 15.3146
Q 12.3500
S 15.3750
3 C 7.8958
Q 7.7500
S 8.0500
Name: Fare, dtype: float64
</pre>
</div>
<p>
From this point of view, it is most likely that these passengers had embarkment point C.
Finally, let's fill in the missing values with embarkment point C and check whether any null values remain afterwards.
</p>
<div class="highlight">
<pre>
<span class="c"> # replace the missing values with 'C' by using .fillna method</span>
df<span class="s">.</span>Embarked<span class="s">.</span>fillna (<span class="f">'C'</span>, inplace <span class="s">=</span> True)
<span class="c"> # check if any null values exist with .isnull, after .fillna was applied</span>
df [df<span class="s">.</span>Embarked<span class="s">.</span>isnull()]
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_embarked_no_null.png">
<br>
<p>
Great! We have solved the first feature with a missing-value issue. <br>Let's tackle the rest!
</p>
<br>
<h6>FEATURE: Fare</h6>
<br>
<p>
We will use a similar approach for the Fare feature as in the previous section: spot the missing values, use the existing
information we have and draw some conclusions.
</p>
<div class="highlight">
<pre>
<span class="c"> # check if any null values exist with .isnull</span>
df [df<span class="s">.</span>Fare<span class="s">.</span>isnull()]
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_fare_null.png">
<br>
<div class="highlight">
<pre>
<span class="c"> # filter the passengers with Pclass = 3, Embarked = S, and apply the .median function to the Fare value</span>
median_fare <span class="s">=</span> df<span class="s">.</span>loc [(df<span class="s">.</span>Pclass <span class="s">==</span> <span class="n">3</span>) & (df<span class="s">.</span>Embarked <span class="s">==</span> <span class="f">'S'</span>), <span class="f">'Fare'</span>]<span class="s">.</span>median()
<span class="n">print</span> (median_fare)
</pre>
</div>
<div class="highlight">
<pre>
8.5
</pre>
</div>
<p>
We will use the imputation method to deal with the missing Fare value, based on the median Fare of 3rd-class passengers with embarkment point S.
In the following step, we fill the NaN value with 8.5.
</p>
<div class="highlight">
<pre>
<span class="c"> # replace NaN Fare values with the computed median_fare value (3rd-class passengers from embarkment point S)</span>
df<span class="s">.</span>Fare<span class="s">.</span>fillna (median_fare, inplace <span class="s">=</span> True)
</pre>
</div>
<p>
After confirming with the .info method that no null Fare values remain, we can safely continue with the Age feature.
</p>
<br>
<h6>FEATURE: Age</h6>
<br>
<div class="highlight">
<pre>
<span class="c"> # check if any null values exist with .isnull</span>
df [df<span class="s">.</span>Age<span class="s">.</span>isnull()]
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_age_null.png">
<br>
<p>
We have exactly 263 rows with missing values of the Age feature. That is a lot of rows, so we should take a closer look at
the best way to deal with this issue: whether to apply the mean or median age of the passengers, or some more complex
logic. Let's find out!
</p>
<p>
Previously, we analysed the data distribution of the Age feature. We saw some very high values, with passengers
over 70 and 80 years old. Such extreme values could easily impact the mean Age. Still, we will check the mean
and median values, just to keep the figures in mind.
</p>
<div class="highlight">
<pre>
<span class="c"> # get the Age mean value</span>
df<span class="s">.</span>Age<span class="s">.</span>mean()
</pre>
</div>
<div class="highlight">
<pre> 29.881137667304014
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # get the Age median values, by Sex category</span>
df<span class="s">.</span>groupby (<span class="f">'Sex'</span>)<span class="s">.</span>Age<span class="s">.</span>median()
</pre>
</div>
<div class="highlight">
<pre> Sex
female 27.0
male 28.0
Name: Age, dtype: float64
</pre>
</div>
<p>
It is useful to apply some visual tools for further analysis. We will use boxplot technique to discover more details.
</p>
<div class="highlight">
<pre>
<span class="c"> # visualize Age notnull values by Sex category, using boxplot</span>
df [df<span class="s">.</span>Age<span class="s">.</span>notnull()]<span class="s">.</span>boxplot(<span class="f">'Age'</span>, <span class="f">'Sex'</span>);
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_age_sex_boxplot.png">
<br>
<p>
The plot shows a similar age distribution for both female and male passengers. So, we continue to dig further.
</p>
<div class="highlight">
<pre>
<span class="c"> # visualize Age notnull values by Pclass category, using boxplot</span>
df [df<span class="s">.</span>Age<span class="s">.</span>notnull()]<span class="s">.</span>boxplot(<span class="f">'Age'</span>, <span class="f">'Pclass'</span>);
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_age_pclass_boxplot.png">
<br>
<p>
Now we see some difference in the age levels between the passenger classes. But, at this point, we want to extract each
passenger's title and see how it corresponds with the age differences. Let's try to extract this insight from the data!
</p>
<p>
We will now explore the Name values, extract the title from each, make a dictionary of titles and group them into a couple of bins,
so we can more easily draw some new conclusions. If we find that age varies with the passengers' titles,
we are on the right path.
</p>
<div class="highlight">
<pre>
<span class="c"> # explore the Name feature</span>
df<span class="s">.</span>Name
</pre>
</div>
<div class="highlight">
<pre> PassengerId
1 Braund, Mr. Owen Harris
2 Cumings, Mrs. John Bradley (Florence Briggs Th...
3 Heikkinen, Miss. Laina
4 Futrelle, Mrs. Jacques Heath (Lily May Peel)
5 Allen, Mr. William Henry
...
1305 Spector, Mr. Woolf
1306 Oliva y Ocana, Dona. Fermina
1307 Saether, Mr. Simon Sivertsen
1308 Ware, Mr. Frederick
1309 Peter, Master. Michael J
Name: Name, Length: 1309, dtype: object
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # create the GetTitle function -> to extract the title info from the name</span>
<span class="n">def</span> GetTitle (name):
first_name_with_title <span class="s">=</span> name<span class="s">.</span>split (<span class="f">','</span>)[<span class="n">1</span>]
title <span class="s">=</span> first_name_with_title<span class="s">.</span>split (<span class="f">'.'</span>)[<span class="n">0</span>]
title <span class="s">=</span> title<span class="s">.</span>strip()<span class="s">.</span>lower()
<span class="n">return</span> title
</pre>
</div>
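<p>To show where this is heading, here is a sketch of applying GetTitle to a Name series and imputing missing ages with the per-title median. The title-grouping dictionary from the course is omitted here; this just uses the raw titles on a few invented rows:</p>

```python
import numpy as np
import pandas as pd

def GetTitle(name):
    # take the part after the surname, then the token before the dot
    first_name_with_title = name.split(',')[1]
    title = first_name_with_title.split('.')[0]
    return title.strip().lower()

df = pd.DataFrame({
    "Name": ["Braund, Mr. Owen Harris", "Heikkinen, Miss. Laina",
             "Spector, Mr. Woolf", "Peter, Master. Michael J"],
    "Age": [22.0, 26.0, np.nan, 4.0],
})

df["Title"] = df.Name.map(GetTitle)
print(df.Title.tolist())  # ['mr', 'miss', 'mr', 'master']

# impute each missing age with the median age of passengers sharing the title
df["Age"] = df.groupby("Title").Age.transform(lambda s: s.fillna(s.median()))
print(df.Age.tolist())    # [22.0, 26.0, 22.0, 4.0]
```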
<br>
<h6>WORKING WITH OUTLIERS</h6>
<br>
<p>
One more data quality issue is the presence of outliers, or extreme values. There are a couple of techniques that can
be used to deal with such values. We will take a closer look at the Age and Fare features, since we spotted some
high values of these variables earlier.
</p>
<br>
<h6>FEATURE: Fare</h6>
<br>
<p>
If we recall the earlier histogram of the Fare feature, we should keep in mind the existence of some extremely high
values. Let us pay more attention to those values, firstly by plotting a box plot:
</p>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_fare_boxplot.png">
<br>
<p>
We can see some really high fares, around the value of 500. To be exact, let's extract the top fares:
</p>
<div class="highlight">
<pre>
<span class="c"> # extract the Fare TOP outliers</span>
df<span class="s">.</span>loc [df<span class="s">.</span>Fare <span class="s">==</span> df<span class="s">.</span>Fare<span class="s">.</span>max()]
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_fare_top.png">
<br>
<p>
The highest fare is exactly 512.3292. Since a fare cannot be negative, we can apply a log transformation to make
the distribution less skewed. Let's apply the numpy log function to the passengers' fares.
</p>
<div class="highlight">
<pre>
<span class="c"> # apply log transformation to reduce the skewness, add 1 for zero fares</span>
LogFare <span class="s">=</span> np<span class="s">.</span>log (df<span class="s">.</span>Fare <span class="s">+</span> <span class="n">1.0</span>)
</pre>
<pre>
<span class="c"> # plot the LogFare to check the skewness</span>
LogFare<span class="s">.</span>plot (kind<span class="s">=</span><span class="f">'hist'</span>, color<span class="s">=</span><span class="f">'c'</span>, bins<span class="s">=</span><span class="n">20</span>);
</pre>
</div>
<br>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_fare_log.png">
<br>
<p>
The distribution is now less skewed. Furthermore, we will apply the binning technique to categorize the Fare
feature into 4 bins, so we can treat these outliers more conveniently. In pandas, we use the qcut function to achieve this. The name qcut
comes from 'quantile-based discretization', which means that it tries to divide the data into equally sized
bins.
</p>
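<p>
To illustrate the difference between equal-width and equal-frequency binning, here is a small sketch on a hypothetical toy series: pd.cut splits the value range into intervals of equal width, so a skewed series piles up in the first bin, while pd.qcut targets an equal number of observations per bin.
</p>

```python
import pandas as pd

# toy right-skewed values (hypothetical, for illustration only)
s = pd.Series([1, 2, 2, 3, 3, 4, 5, 8, 20, 100])

# equal-width bins: most observations land in the first interval
width_counts = pd.cut(s, 4).value_counts().sort_index().tolist()

# quantile-based bins: counts are (roughly) equal across bins
quant_counts = pd.qcut(s, 4).value_counts().sort_index().tolist()

print(width_counts)  # [9, 0, 0, 1]
print(quant_counts)  # [3, 2, 2, 3]
```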
<div class="highlight">
<pre>
<span class="c"> # apply the binning technique by using .qcut function</span>
pd<span class="s">.</span>qcut (df<span class="s">.</span>Fare, <span class="n">4</span>)
</pre>
</div>
<div class="highlight">
<pre> PassengerId
1 (-0.001, 7.896]
2 (31.275, 512.329]
3 (7.896, 14.454]
4 (31.275, 512.329]
5 (7.896, 14.454]
...
1305 (7.896, 14.454]
1306 (31.275, 512.329]
1307 (-0.001, 7.896]
1308 (7.896, 14.454]
1309 (14.454, 31.275]
Name: Fare, Length: 1309, dtype: category
Categories (4, interval[float64]): [(-0.001, 7.896] < (7.896, 14.454] < (14.454, 31.275] < (31.275, 512.329]]
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # add bins' labels or discretization = turn numerical into categorical feature</span>
pd<span class="s">.</span>qcut (df<span class="s">.</span>Fare, <span class="n">4</span>, labels<span class="s">=</span>[<span class="f">'very_low'</span>, <span class="f">'low'</span>, <span class="f">'high'</span>, <span class="f">'very_high'</span>])
</pre>
</div>
<div class="highlight">
<pre> PassengerId
1 very_low
2 very_high
3 low
4 very_high
5 low
...
1305 low
1306 very_high
1307 very_low
1308 low
1309 high
Name: Fare, Length: 1309, dtype: category
Categories (4, object): ['very_low' < 'low' < 'high' < 'very_high']
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # plot the labeled bins</span>
pd<span class="s">.</span>qcut (df<span class="s">.</span>Fare, <span class="n">4</span>, labels<span class="s">=</span>[<span class="f">'very_low'</span>, <span class="f">'low'</span>, <span class="f">'high'</span>, <span class="f">'very_high'</span>])<span class="s">.</span>value_counts()<span class="s">.</span>plot (kind<span class="s">=</span><span class="f">'bar'</span>, color<span class="s">=</span><span class="f">'c'</span>, rot<span class="s">=</span><span class="n">0</span>);
</pre>
</div>
<br>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_fare_bins.png">
<br>
<p>
By using the pandas qcut function, we categorized the numerical Fare values into 4 buckets and turned them into a categorical feature with
4 bins: 'very_low', 'low', 'high' and 'very_high'. Looking at the bar graph, we see a similar number of observations in
each bin.
Finally, we will create a new variable 'Fare_Bin' and store it in the dataframe for possible future analyses.
</p>
</div>
<div class="highlight">
<pre>
<span class="c"> # store new variable 'Fare_Bin'</span>
df[<span class="f">'Fare_Bin'</span>] <span class="s">=</span> pd<span class="s">.</span>qcut (df<span class="s">.</span>Fare, <span class="n">4</span>, labels<span class="s">=</span>[<span class="f">'very_low'</span>, <span class="f">'low'</span>, <span class="f">'high'</span>, <span class="f">'very_high'</span>])
</pre>
</div>
<br>
<h6>FEATURE: Age</h6>
<br>
<p>
As previously plotted, the feature Age shows that the majority of passengers are around 29 years old. On the other hand,
some of them are really old, so let's quickly check those who are older than 70.
</p>
<div class="highlight">
<pre>
<span class="c"> # extract the passengers older than 70 years</span>
df<span class="s">.</span>loc [df<span class="s">.</span>Age <span class="s">></span> <span class="n">70</span>]
</pre>
</div>
<br>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_age_oldest.png">
<br>
<p>
So we have one male passenger who was 80 years old and who survived the shipwreck. We also see some missing
values in the Cabin column; these will be dealt with in the following section.
</p>
<br>
<h5>FEATURE ENGINEERING</h5>
<br>
<p>
Feature engineering is one of the crucial aspects of the data science project cycle. It is the process of transforming raw
data into more representative features in order to build better predictive models. It is a wide area, covering many different activities,
such as transformation (as we did with the Fare feature in the previous section), feature creation and feature selection
(based on domain knowledge). In this section, we will create the Deck feature.
</p>
<p>
As we saw earlier, the majority of the Cabin values are NaN. By analysing the feature carefully, we will try to modify some of the
values, especially the NaNs, so that we can derive useful Deck information for future analysis.
</p>
<div class="highlight">
<pre>
<span class="c"> # explore Cabin values</span>
df<span class="s">.</span>Cabin
</pre>
</div>
<div class="highlight">
<pre> PassengerId
1 NaN
2 C85
3 NaN
4 C123
5 NaN
...
1305 NaN
1306 C105
1307 NaN
1308 NaN
1309 NaN
Name: Cabin, Length: 1309, dtype: object
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # display the unique values of Cabin feature</span>
df<span class="s">.</span>Cabin<span class="s">.</span>unique()
</pre>
</div>
<div class="highlight">
<pre> array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
'C62 C64', 'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30',
'E121', 'D11', 'E77', 'F38', 'B3', 'D6', 'B82 B84', 'D17', 'A36',
'B102', 'B69', 'E49', 'C47', 'D28', 'E17', 'A24', 'C50', 'B42',
'C148', 'B45', 'B36', 'A21', 'D34', 'A9', 'C31', 'B61', 'C53',
'D43', 'C130', 'C132', 'C55 C57', 'C116', 'F', 'A29', 'C6', 'C28',
'C51', 'C97', 'D22', 'B10', 'E45', 'E52', 'A11', 'B11', 'C80',
'C89', 'F E46', 'B26', 'F E57', 'A18', 'E60', 'E39 E41',
'B52 B54 B56', 'C39', 'B24', 'D40', 'D38', 'C105'], dtype=object)
</pre>
</div>
<p>
So, we have a lot of NaN values, but also 'T' and 'D', which look like mistakes, since these are the only values without a cabin number following the deck letter.
</p>
<div class="highlight">
<pre>
<span class="c"> # look at the Cabin = T</span>
df<span class="s">.</span>loc [df<span class="s">.</span>Cabin <span class="s">==</span> <span class="f">'T'</span>]
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_cabin_T.png">
<br>
<div class="highlight">
<pre>
<span class="c"> # look at the Cabin = D</span>
df<span class="s">.</span>loc [df<span class="s">.</span>Cabin <span class="s">==</span> <span class="f">'D'</span>]
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_cabin_D.png">
<br>
<p>
In the case of Cabin 'T', we do not want to create a separate deck for only one passenger, so we assume this is a mistake and
set it to NaN. Afterwards, all NaN values will be converted to deck 'Z'.
</p>
<div class="highlight">
<pre>
<span class="c"> # set the Cabin T to NaN</span>
df<span class="s">.</span>loc [df<span class="s">.</span>Cabin <span class="s">==</span> <span class="f">'T'</span>, <span class="f">'Cabin'</span>] <span class="s">=</span> np<span class="s">.</span>NaN
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # extract the first character of Cabin string to the Deck</span>
<span class="n">def</span> get_deck (cabin):
<span class="n">return</span> np<span class="s">.</span>where (pd<span class="s">.</span>notnull(cabin), <span class="n">str</span>(cabin)[<span class="n">0</span>]<span class="s">.</span>upper(), <span class="f">'Z'</span>)
df[<span class="f">'Deck'</span>] <span class="s">=</span> df[<span class="f">'Cabin'</span>]<span class="s">.</span>map (<span class="n">lambda</span> x <span class="s">:</span> get_deck(x))
</pre>
</div>
<p>
So, we have created the function 'get_deck' to extract the deck information from the Cabin feature. Using the pandas 'notnull' function as the
condition inside numpy's 'where', we take the first letter of the cabin (converted to a string and uppercased) when the value is present,
and 'Z' when it is NaN. Finally, we map each Cabin value through the 'get_deck' function to create the new Deck column.
Now, we can explore the passengers per deck.
</p>
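<p>
As an alternative sketch (producing the same Deck values under the same assumptions), pandas' .str accessor extracts the first character while propagating NaN automatically, so a single fillna handles the missing cabins:
</p>

```python
import numpy as np
import pandas as pd

# toy Cabin column (hypothetical values mirroring the dataset's format)
cabins = pd.Series(['C85', np.nan, 'B57 B59 B63 B66', np.nan, 'g6'])

# first character of each cabin, uppercased; NaN stays NaN until fillna
deck = cabins.str[0].str.upper().fillna('Z')

print(deck.tolist())  # ['C', 'Z', 'B', 'Z', 'G']
```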
<div class="highlight">
<pre>
<span class="c"> # check the passengers per deck</span>
df<span class="s">.</span>Deck<span class="s">.</span>value_counts()
</pre>
</div>
<div class="highlight">
<pre> Z 1015
C 94
B 65
D 46
E 41
A 22
F 21
G 5
Name: Deck, dtype: int64
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # check the passengers' survival rate per deck</span>
pd<span class="s">.</span>crosstab (df[df<span class="s">.</span>Survived <span class="s">!=</span> <span class="n">-888</span>]<span class="s">.</span>Survived, df[df<span class="s">.</span>Survived <span class="s">!=</span> <span class="n">-888</span>]<span class="s">.</span>Deck)
</pre>
</div>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_deck.png">
<br>
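<p>
The raw counts in the crosstab above can also be turned into per-deck survival rates directly, by normalizing each column. A sketch on toy data (hypothetical values, for illustration):
</p>

```python
import pandas as pd

# toy Survived/Deck columns (hypothetical values, for illustration)
toy = pd.DataFrame({
    'Survived': [1, 0, 1, 1, 0, 0, 1, 0],
    'Deck':     ['B', 'B', 'D', 'E', 'Z', 'Z', 'B', 'Z'],
})

# normalize='columns' converts counts to proportions within each deck
rates = pd.crosstab(toy.Survived, toy.Deck, normalize='columns')

print(rates.loc[1].tolist())  # survival rate per deck: B, D, E, Z
```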
<p>
Most passengers fall into deck Z, where the deck information is unknown. Using the crosstab technique, we check the survival
rate of passengers per deck. Here, we can see that decks B, D and E have the highest survival rates.
<br>
<br>
In <a href="https://github.com/biljana-zobenica/titanic/blob/master/notebooks/2.0-bz-exploring-processing-data.ipynb"
target="_blank">the second Jupyter notebook</a>, you can see more details of the new features we created, such as IsMother (for married women older than 18 with children),
IsMale (for male passengers), AgeState (whether a person is an adult or a child) and FamilySize (the number of family members). These are quite interesting for exploring
the difference in survival rate between certain groups of passengers. We also drop the columns we no longer need, or which we used only during feature
engineering, and reorder the columns so that the Survived column comes first, since it is the one to be predicted. In the notebook, we save the processed
dataset and create a summary script for getting, reading, processing and saving the data. Lastly, in the next section, we will try out some more advanced
visualization techniques using the matplotlib library.
</p>
<br>
<h5>VISUALIZATION</h5>
<br>
<p>
Using the subplots technique from the Python library matplotlib, which we imported under the alias plt, we create even more complex visualizations
for the features Fare and Age. We adjust some additional settings, drop the empty sixth plot and stop the graphs from overlapping. The final result,
with the multiple conclusions we have already drawn in the previous analysis, is the set of plots shown below:
</p>
<div class="highlight">
<pre>
<span class="c"> # adding subplots using ax_arr or axis array (instead of individual axises)</span>
f, ax_arr <span class="s">=</span> plt<span class="s">.</span>subplots (<span class="n">3</span>, <span class="n">2</span>, figsize <span class="s">=</span> (<span class="n">14</span>, <span class="n">7</span>))
<span class="c"> # plot 1</span>
ax_arr [<span class="n">0</span>,<span class="n">0</span>]<span class="s">.</span>hist (df<span class="s">.</span>Fare, bins<span class="s">=</span><span class="n">20</span>, color<span class="s">=</span><span class="f">'c'</span>)
ax_arr [<span class="n">0</span>,<span class="n">0</span>]<span class="s">.</span>set_title (<span class="f">'Histogram : Fare'</span>)
ax_arr [<span class="n">0</span>,<span class="n">0</span>]<span class="s">.</span>set_xlabel (<span class="f">'Bins'</span>)
ax_arr [<span class="n">0</span>,<span class="n">0</span>]<span class="s">.</span>set_ylabel (<span class="f">'Counts'</span>)
<span class="c"> # plot 2</span>
ax_arr [<span class="n">0</span>,<span class="n">1</span>]<span class="s">.</span>hist (df<span class="s">.</span>Age, bins<span class="s">=</span><span class="n">20</span>, color<span class="s">=</span><span class="f">'c'</span>)
ax_arr [<span class="n">0</span>,<span class="n">1</span>]<span class="s">.</span>set_title (<span class="f">'Histogram : Age'</span>)
ax_arr [<span class="n">0</span>,<span class="n">1</span>]<span class="s">.</span>set_xlabel (<span class="f">'Bins'</span>)
ax_arr [<span class="n">0</span>,<span class="n">1</span>]<span class="s">.</span>set_ylabel (<span class="f">'Counts'</span>)
<span class="c"> # plot 3</span>
ax_arr [<span class="n">1</span>,<span class="n">0</span>]<span class="s">.</span>boxplot (df<span class="s">.</span>Fare<span class="s">.</span>values)
ax_arr [<span class="n">1</span>,<span class="n">0</span>]<span class="s">.</span>set_title (<span class="f">'Boxplot : Fare'</span>)
ax_arr [<span class="n">1</span>,<span class="n">0</span>]<span class="s">.</span>set_xlabel (<span class="f">'Fare'</span>)
ax_arr [<span class="n">1</span>,<span class="n">0</span>]<span class="s">.</span>set_ylabel (<span class="f">'Fare'</span>)
<span class="c"> # plot 4</span>
ax_arr [<span class="n">1</span>,<span class="n">1</span>]<span class="s">.</span>boxplot (df<span class="s">.</span>Age<span class="s">.</span>values)
ax_arr [<span class="n">1</span>,<span class="n">1</span>]<span class="s">.</span>set_title (<span class="f">'Boxplot : Age'</span>)
ax_arr [<span class="n">1</span>,<span class="n">1</span>]<span class="s">.</span>set_xlabel (<span class="f">'Age'</span>)
ax_arr [<span class="n">1</span>,<span class="n">1</span>]<span class="s">.</span>set_ylabel (<span class="f">'Age'</span>)
<span class="c"> # plot 5</span>
ax_arr [<span class="n">2</span>,<span class="n">0</span>]<span class="s">.</span>scatter (df<span class="s">.</span>Age, df<span class="s">.</span>Fare, color<span class="s">=</span><span class="f">'c'</span>, alpha<span class="s">=</span><span class="n">0.15</span>)
ax_arr [<span class="n">2</span>,<span class="n">0</span>]<span class="s">.</span>set_title (<span class="f">'Scatter Plot : Age vs. Fare'</span>)
ax_arr [<span class="n">2</span>,<span class="n">0</span>]<span class="s">.</span>set_xlabel (<span class="f">'Age'</span>)
ax_arr [<span class="n">2</span>,<span class="n">0</span>]<span class="s">.</span>set_ylabel (<span class="f">'Fare'</span>)
<span class="c"> # cut off the 6th plot</span>
ax_arr [<span class="n">2</span>,<span class="n">1</span>]<span class="s">.</span>axis(<span class="f">'off'</span>)
<span class="c"> # fix the overlapping</span>
plt<span class="s">.</span>tight_layout()
plt<span class="s">.</span>show()
</pre>
</div>
<br>
<img class="img-responsive" style="width: auto;" src="/assets/img/blog/titanic_histogram_fare_age_advanced.png">
<br>
<hr class="featurette-divider">
<h3>Building and Evaluating Predictive Model</h3>
<br>
<p>
We previously prepared the data for model building: we dropped the unnecessary columns and put the 'Survived' column first.
Now we create the input variable 'X' and the output variable 'y'. For 'X' we extract all columns from 'Age' onwards, which excludes the
'Survived' column, convert the result with the .to_numpy() method and set the data type to 'float' with .astype(). For the output
variable 'y', we use .ravel() to create a flattened one-dimensional array.
</p>
<div class="highlight">
<pre>
<span class="c"> # creating input variable X and output variable y for model building</span>
X <span class="s">=</span> train_df<span class="s">.</span>loc [:, <span class="f">'Age'</span>:]<span class="s">.</span>to_numpy()<span class="s">.</span>astype(<span class="f">'float'</span>)
y <span class="s">=</span> train_df [<span class="f">'Survived'</span>]<span class="s">.</span>ravel()
<span class="c"> # print the shape of created variables</span>
<span class="n">print</span> (X<span class="s">.</span>shape, y<span class="s">.</span>shape)
</pre>
</div>
<div class="highlight">
<pre> (891, 32) (891,)
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # train-test split -> inside the function we define arrays X, y and test size of 20% of actual training data
# test data will be used for model evaluation, while the rest of 80% of training data will be used for model training</span>
<span class="n">from</span> sklearn<span class="s">.</span>model_selection <span class="n">import</span> train_test_split
X_train, X_test, y_train, y_test <span class="s">=</span> train_test_split (X, y, test_size<span class="s">=</span><span class="n">0.2</span>, random_state<span class="s">=</span><span class="n">0</span>)
<span class="c"> # print the shape of test and train data </span>
<span class="n">print</span> (X_train<span class="s">.</span>shape, y_train<span class="s">.</span>shape)
<span class="n">print</span> (X_test<span class="s">.</span>shape, y_test<span class="s">.</span>shape)
</pre>
</div>
<div class="highlight">
<pre> (712, 32) (712,)
(179, 32) (179,)
</pre>
</div>
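<p>
As a side note (not used in the notebook itself): since the classes are imbalanced, train_test_split also accepts a stratify argument that preserves the class proportions in both splits. A small sketch on hypothetical toy labels:
</p>

```python
import numpy as np
from sklearn.model_selection import train_test_split

# toy imbalanced labels (hypothetical): 80% class 0, 20% class 1
X = np.arange(100).reshape(-1, 1)
y = np.array([0] * 80 + [1] * 20)

# stratify=y keeps the 80/20 class ratio in both the train and test split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

print(y_tr.mean(), y_te.mean())  # 0.2 0.2
```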
<p>
Now, we are going to build a baseline model, without any machine learning, since it is common practice to do so. It simply predicts
the majority class and will be our guidance, in terms of model performance, as we build more advanced models using machine learning
techniques. In short, our predictive model should perform better than the baseline one.
</p>
<div class="highlight">
<pre>
<span class="c"> # import the function DummyClassifier in order to build a baseline classification model</span>
<span class="n">from</span> sklearn<span class="s">.</span>dummy <span class="n">import</span> DummyClassifier
<span class="c"> # create the model object as the most frequent, in our case it's 0 or not survived </span>
model_dummy <span class="s">=</span> DummyClassifier (strategy <span class="s">=</span><span class="f"> 'most_frequent'</span>, random_state<span class="s">=</span><span class="n">0</span>)
<span class="c"> # train the model by using .fit function on the model object </span>
model_dummy<span class="s">.</span>fit (X_train, y_train)
</pre>
</div>
<div class="highlight">
<pre> DummyClassifier(random_state=0, strategy='most_frequent')
</pre>
</div>
<div class="highlight">
<pre>
<span class="c"> # use .score method to evaluate the model performance on the test data</span>
<span class="n">print</span> (<span class="f">'score for baseline model: {0:.2f}'</span><span class="s">.</span>format(model_dummy<span class="s">.</span>score(X_test, y_test)))
</pre>
</div>
<div class="highlight">
<pre> score for baseline model: 0.61
</pre>
</div>
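<p>
Any useful model should clearly beat this majority-class baseline of 0.61. As a sketch of the next step (on synthetic stand-in data, since the processed Titanic features are not reproduced here), a plain logistic regression is fitted and scored in exactly the same way:
</p>

```python
import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# synthetic stand-in for the processed features (hypothetical data)
rng = np.random.RandomState(0)
X = rng.rand(891, 32)
y = (X[:, 0] + 0.1 * rng.rand(891) > 0.55).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# baseline: always predicts the majority class
baseline = DummyClassifier(strategy='most_frequent', random_state=0).fit(X_train, y_train)
# candidate model: plain logistic regression
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print('baseline:', round(baseline.score(X_test, y_test), 2))
print('logistic:', round(model.score(X_test, y_test), 2))
```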