<!doctype html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang=""> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang=""> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9" lang=""> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="">
<!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Speaker Recognition</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="stylesheet" href="css/flexslider.css">
<link rel="stylesheet" href="css/main.css">
<link rel="stylesheet" href="css/responsive.css">
<link rel="stylesheet" href="css/animate.min.css">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.4.0/css/font-awesome.min.css">
<!-- <link rel="stylesheet" href="css/googlefonts.css"> -->
</head>
<body>
<!-- Header Section -->
<section class="tophead" role="tophead">
<!-- Navigation Section -->
<header id="header">
<div class="header-content clearfix"> <a class="logo" href="https://mahimg.github.io/Speaker-recognition/">Speech Sequence Segmentation</a>
<nav class="navigation" role="navigation">
<ul class="primary-nav">
<li><a href="#header-slider">Home</a></li>
<!-- <li><a href="#services">Introduction</a></li> -->
<li><a href="#Introduction">Introduction</a></li>
<li><a href="#ProposedApproach">Proposed Approach</a></li>
<li><a href="#Experiments">Results</a></li>
<li><a href="#code">Code</a></li>
<li><a href="#Conclusions">Conclusion</a></li>
</ul>
</nav>
<a href="#" class="nav-toggle">Menu<span></span></a> </div>
</header>
<!-- Navigation Section -->
</section>
<!-- Header Section -->
<!-- Slider Section -->
<section id="header-slider" class="section">
<div id="myCarousel" class="carousel slide" data-ride="carousel">
<!-- <ol class="carousel-indicators">
<li data-target="#myCarousel" data-slide-to="0" class="active"></li>
<li data-target="#myCarousel" data-slide-to="1"></li>
</ol> -->
<div class="carousel-inner" role="listbox">
<div class="item active"> <img src="images/11.png" alt="Speech sequence segmentation banner" width="100%">
<!-- <div class="carousel-caption">
<h3>We'r Auro</h3>
<p>We Build Strong Brands which impact your customers</p>
</div> -->
</div>
<!-- <div class="item"> <img src="images/slider/slid2.jpg" alt="Chania">
<div class="carousel-caption">
<h3>Minimal Agency Template</h3>
<p>We're Australia based branding & design agency</p>
</div>
</div> -->
</div>
<!-- <a class="left carousel-control" href="#myCarousel" role="button" data-slide="prev"> <span class="glyphicon glyphicon-chevron-left" aria-hidden="true"></span> <span class="sr-only">Previous</span></a> <a class="right carousel-control" href="#myCarousel"
role="button" data-slide="next"> <span class="glyphicon glyphicon-chevron-right" aria-hidden="true"></span> <span class="sr-only">Next</span></a></div> -->
</section>
<!-- Slider Section -->
<!-- Service Section -->
<section id="services" class="section services">
<div class="container-fluid">
<div class="row" style="margin-top : 100px;">
<!-- <div class="authors">
, Roll No.: , Branch: ECE
, Roll No.: 150102076, Branch: ECE
, Roll No.: , Branch: EEE
, Roll No.: , Branch: EEE
, Roll No.: , Branch: EEE
</div> -->
<div class="col-md-7 col-sm-6">
<div class="services-content" style="margin-top : 40px;">
<h4>The project aims to segment speech sequences based on speaker transitions and, additionally, to identify the number of speakers.</h4>
<p></p>
</div>
</div>
<div class="col-md-2 col-sm-3">
<div class="services-content">
<h5>Team Members:</h5>
<ul>
<li>Abhishek Agrawal</li>
<li>Souradip Pal</li>
<li>Mohammed Adnan</li>
<li>Mahim Goyal</li>
<li>Sahil Thandra</li>
</ul>
</div>
</div>
<div class="col-md-1 col-sm-3">
<div class="services-content" style="margin-top : 45px;">
<!-- <h5> </h5> -->
<ul>
<li>150102002</li>
<li>150102076</li>
<li>150108021</li>
<li>150108044</li>
<li>150108046</li>
</ul>
</div>
</div>
<div class="col-md-1 col-sm-3">
<div class="services-content" style="margin-top : 45px;">
<!-- <h5> </h5> -->
<ul>
<li>ECE</li>
<li>ECE</li>
<li>EEE</li>
<li>EEE</li>
<li>EEE</li>
</ul>
</div>
</div>
</div>
</div>
</section>
<!-- Service Section -->
<section id="main-content" class="services">
<div class="container-fluid">
<div class="wrapper row3">
<main class="hoc container clear">
<!-- main body -->
<!-- ################################################################################################ -->
<div class="one_half first clear btmspace-80">
<h3 id="Abstract">Abstract</h3>
<p>Speaker sequence segmentation is the first step in many audio-processing applications and aims to solve the problem of “who spoke when”. It therefore relies on efficient use of temporal information from extracted audio features. In
this project, we have utilised the Linear Predictive Coefficients of the speech signal and its derived features to segment out the speech of individual speakers. We have employed both supervised and unsupervised learning methods
to approach the problem.</p>
<h3 id="Introduction">1. Introduction</h3>
<div class="subheading">
<h5>1.1 Introduction to Problem</h5>
<p>The objective of this project is to segment speech sequences based on speaker transitions, where the number of speakers is not known beforehand.
</p>
</div>
<div class="subheading">
<h5>1.2 Motivation</h5>
<p>The number of smart devices is increasing exponentially, and so is the amount of data to process. Audio indexing, which aims to organize multimedia content using semantic information from audio data, is the broader class of
audio-processing problem. Speech sequence segmentation aims to label the segments of audio/video data with the corresponding speaker identities. Apart from audio indexing, it has central applications in speech research, such as
automatic speech recognition and rich transcription.</p>
</div>
<div class="subheading">
<h5>1.3 Figure</h5>
<div class="image">
<!-- Start edit here -->
<img src="images/rep.png" alt="This text displays when the image is unavailable" width="500px" height="" />
<br>
<!-- <img src="Pictures/flow.jpg" alt="This text displays when the image is umavailable" width="500px" height="" /> -->
<!-- Stop edit here -->
</div>
</div>
<div class="subheading">
<h5>1.4 Literature Review</h5>
<p>The general unsupervised segmentation problem deals with the classification of a given utterance to a speaker participating in a multi-speaker conversation. The exact definition of the problem is as follows: given a speech signal
recorded from a multi-speaker conversation, determine the number of speakers, determine the transition times between speakers, and assign each speech segment to its speaker.</p>
<ol>
<li>
<p>Miro, Xavier Anguera, et al., <a href="http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6135543" style="color: rgb(0,0,255)">“Speaker diarization: A review of recent research,”</a> IEEE Transactions on Audio, Speech, and
Language Processing 20.2 (2012): 356-370, discusses two approaches: top-down and bottom-up. The top-down approach is initialized with very few clusters (usually one), whereas the bottom-up approach is initialized
with many clusters (usually more clusters than expected speakers). In both cases the aim is to iteratively converge towards an optimum number of clusters. If the final number is higher than the optimum, the
system is said to under-cluster.</p>
</li>
<li>
<p>Qin Jin, Kornel Laskowski, Tanja Schultz, and Alex Waibel, <a href="https://pdfs.semanticscholar.org/910b/3d2fe351188d69007d82457e13c210ba9574.pdf" style="color: rgb(0,0,255)">“Speaker Segmentation and Clustering in Meetings,”</a> uses the Bayesian Information Criterion (BIC) to compare the performance of different models. A negative BIC value means that the model provides a better fit to the data, i.e. there is a speaker change at that
point. Therefore, segments are merged until the BIC value for the two closest segments is negative.</p>
</li>
<li>
<p>Aadel Alatwi, Stephen So, and Kuldip K. Paliwal, <a href="http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7843309" style="color: rgb(0,0,255)">“Perceptually Motivated Linear Prediction Cepstral Features for Network Speech Recognition,”</a> proposed a new method of modifying the power spectrum of input speech to obtain a set of perceptually motivated Linear Prediction (LP) parameters that provide noise robustness to Automatic Speech Recognition (ASR)
features.</p>
</li>
<li>
<p>Vladimir Fabregas et al., <a href="https://link.springer.com/content/pdf/10.1007%2F11551188_57.pdf" style="color: rgb(0,0,255)">“Transformations of LPC and LSF Parameters to Speech Recognition Features,”</a> discusses
features that can be obtained from the LPC parameters, such as the LPCC (LPC cepstrum) and the MLPCC (mel-frequency LPCC).
</p>
</li>
</ol>
</div>
<div class="subheading">
<h5>1.5 Proposed Approach</h5>
<p>The problem requires that we split the input audio into multiple segments according to the speaker transitions. For this purpose, we need to characterise each individual's voice by features from which a speaker transition
can be detected; we use LPC and its derived features. Pre-processing of the audio clip involves detecting and discarding the parts that contain no voice, i.e. where all speakers are silent. Next, we extract features,
which are used for classification by two methods, supervised and unsupervised. Post-processing involves eliminating the sporadic values/samples detected within each of the larger time frames.</p>
</div>
<h3 id="ProposedApproach">2. Proposed Approach</h3>
<p>The process of speech sequence segmentation comprises the following steps:</p>
<ol style="padding-left: 20px;">
<li>
<p><b>Splitting audio into frames</b> - The complete audio is divided into smaller time frames of 20 ms each with a hop of 10 ms. This is done because over such small time scales audio signals are statistically stationary.</p>
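<p>A minimal framing sketch (assuming the audio is already loaded as a 1-D NumPy array sampled at 8 kHz; the function name <code>frame_signal</code> is ours):</p>
<pre><code>import numpy as np

def frame_signal(x, fs=8000, frame_ms=20, hop_ms=10):
    """Split a 1-D signal into overlapping frames (20 ms window, 10 ms hop)."""
    frame_len = int(fs * frame_ms / 1000)   # 160 samples at 8 kHz
    hop_len = int(fs * hop_ms / 1000)       # 80 samples at 8 kHz
    n_frames = 1 + max(0, (len(x) - frame_len) // hop_len)
    return np.stack([x[i * hop_len : i * hop_len + frame_len]
                     for i in range(n_frames)])
</code></pre>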
</li>
<li>
<p><b>Voice Activity Detection</b> - Voice activity detection (VAD), also known as speech activity detection or speech detection, is a technique used in speech processing to detect the presence or absence of human speech.
It was performed on each of the 20 ms time frames, and the frames in which no speech was detected were discarded.</p>
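<p>The exact VAD rule used is not spelled out here; a simple short-term-energy gate, with an assumed threshold, illustrates the idea:</p>
<pre><code>import numpy as np

def simple_vad(frames, threshold_ratio=0.1):
    """Keep frames whose short-term energy exceeds a fraction of the
    mean energy; frames judged silent are discarded."""
    energy = np.sum(frames.astype(float) ** 2, axis=1)
    keep = energy > threshold_ratio * energy.mean()
    return frames[keep], keep
</code></pre>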
</li>
<li>
<p><b>Feature Extraction</b> - Linear Predictive Coefficients (LPCs) are widely used features in automatic speech sequence segmentation. LP analysis exploits the redundancy in the speech signal: the
current sample is predicted as a linear combination of the past p samples, where p is the order of prediction. The predicted sample ŝ(n) can be represented as follows,
<br>
<img class="formula" src="images/formula1.jpg" alt="LP prediction equation"> Various LPC-derived features, namely <b>pitch, formants, Linear Predictive Cepstral Coefficients (LPCC), Line Spectral Frequencies (LSF), Pseudo-Cepstral Coefficients (PCC) and Pseudo-Cepstrum (PCEP)</b>, were included in the extracted feature vector. LP analysis separates a given short-term speech sequence into a slowly varying vocal-tract component, represented by the LP filter H(z), and a fast-varying excitation
component, given by the LP residual e(n). The LP filter H(z) induces the desired spectral shape on the flat spectrum E(z) of the noise-like excitation sequence. As the LP
spectrum captures the vocal-tract characteristics, the vocal-tract resonances (formants) can be estimated from it: formant locations are obtained by picking the peaks of the magnitude LP spectrum
|H(z)|.
<img class="formula" src="images/formula2.jpg" alt="LP filter equation"> The extraction of the LPCC features from the LPC coefficients is formulated in the z-transform domain, using the complex logarithm of the LPC system
transfer function; this is analogous to computing the cepstrum from the discrete Fourier transform of the speech signal. The i-th LPCC parameter is given by the following recursive equation,
<img class="formula" src="images/formula3.jpg" alt="LPCC recursion"> where a<sub>i</sub> is the i-th LPC parameter, p is the LPC system order and G is the gain factor of the system. The PCC is computed directly from the LSFs; its
derivation, however, is based on the LPCC, and mathematical manipulations and approximations allow it to be expressed in terms of the LSFs. The n-th PCC is given by
<img class="formula" src="images/formula4.jpg" alt="PCC equation"> where w<sub>i</sub> is the i-th LSF parameter. The n-th PCEP is given by
<img class="formula" src="images/formula5.jpg" alt="PCEP equation"> A good spectral performance can be expected from the PCEP features, because they provide a spectral envelope very similar to that of the cepstrum
computed from the original speech signal.
<table>
<tr>
<th>Feature Used</th>
<th align="center">Number of components of each feature used to compose the feature vector</th>
</tr>
<tr>
<td>Pitch</td>
<td align="center">1</td>
</tr>
<tr>
<td>Formants</td>
<td align="center">2</td>
</tr>
<tr>
<td>Linear Predictive Coefficients (LPCs)</td>
<td align="center">20</td>
</tr>
<tr>
<td>Linear Predictive Cepstral Coefficients (LPCC)</td>
<td align="center">10</td>
</tr>
<tr>
<td>Line Spectral Frequencies (LSF)</td>
<td align="center">10</td>
</tr>
<tr>
<td>Pseudo-Cepstral Coefficients (PCC)</td>
<td align="center">10</td>
</tr>
<tr>
<td>Pseudo-Cepstrum (PCEP)</td>
<td align="center">10</td>
</tr>
</table>
These features were normalised and concatenated to form a feature vector of length 63.
</p>
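<p>As an illustration of the first part of this pipeline, a sketch of the per-frame LPC and LPCC computation (librosa is assumed for the LP analysis; the sign convention of the recursion follows one common form and may differ from the formula images above):</p>
<pre><code>import numpy as np
import librosa

def lpc_and_lpcc(frame, p=20, n_ceps=10):
    """LPC per frame via librosa, then LPCC by the usual recursion
    (the gain term ln(G) is omitted; sign conventions vary)."""
    # librosa.lpc returns the filter denominator [1, a_1, ..., a_p];
    # negate to get predictor coefficients: s[n] ~ sum_k a_k * s[n-k].
    a = -librosa.lpc(frame.astype(float), order=p)[1:]
    c = np.zeros(n_ceps)
    c[0] = a[0]
    for n in range(2, n_ceps + 1):
        acc = sum(k * c[k - 1] * a[n - k - 1] for k in range(1, n))
        c[n - 1] = a[n - 1] + acc / n
    return a, c
</code></pre>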
</li>
<li>
<p><b>Speech Segmentation based on speakers</b> - The speech segmentation was performed separately using the following two methods:
<ol class="dual-list">
<li>
<p>
<b>Supervised Approach</b> - A k-nearest neighbours (kNN) model was trained on a large number of voice samples so as to capture the overall variability of the feature space, and its parameters were set
accordingly to classify individual speakers. The model was applied to test data to cluster the 20 ms time frames belonging to individual speakers. For every 1 s window (50 time frames of 20 ms each), the mode
of the predictions was computed and assigned to the whole window. To eliminate sporadic predictions, a sliding window passes over each time frame and computes the mode of the frames to its left
and right; if this mode does not match the class predicted for the frame, the frame is reassigned the computed mode. We were thereby able
to predict the transitions in the complete audio samples and segment out the speech of each individual speaker, as sketched below.
</p>
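<p>A sketch of this supervised step (X_train, y_train and X_test are assumed to hold the 63-dimensional frame features and integer speaker labels; k and the smoothing window size are tunable assumptions):</p>
<pre><code>import numpy as np
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier

def smooth_labels(pred, half_window=25):
    """Replace sporadic frame labels with the mode of neighbouring frames."""
    out = pred.copy()
    for i in range(len(pred)):
        lo, hi = max(0, i - half_window), min(len(pred), i + half_window + 1)
        neighbours = np.concatenate([pred[lo:i], pred[i + 1:hi]])
        mode = stats.mode(neighbours, keepdims=False).mode
        if pred[i] != mode:
            out[i] = mode
    return out

def knn_segment(X_train, y_train, X_test, k=5):
    """Per-frame speaker prediction followed by mode-based smoothing."""
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    return smooth_labels(knn.predict(X_test))
</code></pre>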
</li>
<li>
<p>
<b>Unsupervised Approach</b> - The feature vectors of the time frames were passed through a stack of four sparse autoencoders to carry out feature transfer learning. A Gaussian mixture model (GMM) was then applied
to cluster the output into 20 components. A graph of the predicted cluster at each time frame shows a change in the cluster distribution at the speaker transition points.
The time instances where such a change occurs across a significant number of clusters were recorded and marked as transition points. This effectively predicts the transitions in the complete audio samples
and segments out the speech of each individual speaker; a sketch follows.
</p>
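<p>A sketch of the unsupervised pipeline using Keras and scikit-learn (the layer sizes, epochs and sparsity penalty are assumptions, not the project's exact settings):</p>
<pre><code>from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.mixture import GaussianMixture

def sparse_autoencoder(input_dim, code_dim):
    """One sparse autoencoder: an L1 activity penalty encourages sparse codes."""
    inp = keras.Input(shape=(input_dim,))
    code = layers.Dense(code_dim, activation="relu",
                        activity_regularizer=regularizers.l1(1e-4))(inp)
    out = layers.Dense(input_dim, activation="linear")(code)
    ae = keras.Model(inp, out)
    ae.compile(optimizer="adam", loss="mse")
    return ae, keras.Model(inp, code)

def encode_stack(X, dims=(48, 36, 28, 20)):
    """Greedily train four autoencoders, feeding each one's code to the next."""
    H = X
    for d in dims:
        ae, enc = sparse_autoencoder(H.shape[1], d)
        ae.fit(H, H, epochs=20, batch_size=256, verbose=0)
        H = enc.predict(H, verbose=0)
    return H

def cluster_codes(features, n_components=20):
    """Cluster the transferred features into 20 Gaussian components."""
    codes = encode_stack(features)
    gmm = GaussianMixture(n_components=n_components, random_state=0).fit(codes)
    return gmm.predict(codes)
</code></pre>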
</li>
<img class="formula" src="images/6.jpg" alt="this" style="margin-left: 30%;">
</ol>
</p>
</li>
</ol>
<h3 id="Experiments">3. Experiments &amp; Results</h3>
<div class="subheading">
<h5>3.1 Dataset Description</h5>
<p>First, we acquired YouTube videos of various lecturers from the NPTEL channel and extracted the audio from the videos at a sampling rate of 8000 Hz. To generate audio signals comprising a variable number
of speakers with varied durations, we concatenated the extracted clips to create various experimental test cases.
<br>Link: <a href="https://drive.google.com/drive/folders/0B6nPr2PIWVuPeXp1NHdaY1FnQms" style="color: rgb(0,0,255)">Audio Dataset</a>
</p>
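<p>For example, a test case could be assembled as follows (hypothetical file names; clips are assumed to be mono and already at 8 kHz):</p>
<pre><code>import numpy as np
import soundfile as sf

# Hypothetical per-speaker clips, concatenated into one test signal.
clips = ["speaker1.wav", "speaker2.wav", "speaker3.wav"]
signal = np.concatenate([sf.read(path)[0] for path in clips])
sf.write("test_case.wav", signal, 8000)   # 8 kHz, as in the dataset
</code></pre>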
<h5>Supervised:</h5>
<p>The probability that the model assigns the correct class to a 20 ms time frame is shown in the plot below:</p>
<img src="images/7.jpg" alt="this" width="80%" style="margin-left: 10%;">
<div class="responsive1">
<div class="gallery">
<img src="images/dsp/20speaker.png" alt="this" width="80%">
</div>
</div>
<video width="90%" height="140" controls style=" border: 1px solid #777; margin: 10px;">
<source src="https://docs.google.com/uc?export=download&id=17Hl4LFPrmDtZUB-A8IiDxj-kxcJy8XyB" type="audio/wav">
<track src="images/dsp/supervised_uniform.vtt" kind="subtitles" srclang="en" label="English" default>
</video>
<br>
To view the speaker transition points, <a href="images/dsp/supervised_uniform.vtt" target="_blank">click here</a>.
<br>
<div class="responsive1">
<div class="gallery">
<img src="images/dsp/random20_variable.png" alt="this" width="100%">
</div>
</div>
<video width="100%" height="140" controls style=" border: 1px solid #777; margin: 10px;">
<source src="images/dsp/random_20.wav" type="audio/wav">
<track src="images/dsp/supervised_random.vtt" kind="subtitles" srclang="en" label="English" default>
</video>
<br>
To view the speaker transition points, <a href="images/dsp/supervised_random.vtt" target="_blank">click here</a>.
<h5>Unsupervised:</h5>
<p>The feature vectors of five speakers were passed through four stacked sparse autoencoders with ReLU as the activation function. The output of the autoencoder was then modelled using a Gaussian mixture model with the number of clusters set
to 20. The decoded values are the outputs of the sparse autoencoders.
</p>
<img src="images/8.jpg" alt="this" height="500px" style="position: inline;">
<div style="padding-left: 170px; padding-bottom: 20px;">
Training of Sparse Autoencoder
</div>
<img src="images/9.jpg" alt="this" width="530px" style="vertical-align: top;">
<div style="padding-left: 170px; padding-bottom: 10px;">
Visualisation of the Autoencoder layer
</div>
<!-- <img src="images/10.jpg" alt="this" width="80%" > -->
<br>
<p><br>Consecutive segments were grouped into windows of roughly 1000 frames, and the number of occurrences of each cluster within a window was counted; the graph above plots the number of occurrences of cluster X per 1000 segments. Each window
was then assigned a binary code based on the relative occurrence of the different Gaussian components. The number of differing bits between the codes of consecutive windows stays small while a single speaker is talking; in contrast, the number of bit changes is
high at a speaker transition. These binary codes can thus be used for detecting speaker transitions as well as for speaker identification, as sketched below.</p>
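<p>The exact coding scheme is not spelled out above; one plausible reading, sketched here, turns each window's cluster histogram into a bit vector and flags a transition where consecutive codes differ in many bits (the window size and bit threshold are assumptions):</p>
<pre><code>import numpy as np

def window_codes(labels, n_clusters=20, window=1000):
    """One binary code per window: bit i is 1 if cluster i occurs more
    often than the uniform share within that window."""
    codes = []
    for start in range(0, len(labels) - window + 1, window):
        counts = np.bincount(labels[start:start + window], minlength=n_clusters)
        codes.append((counts > window / n_clusters).astype(int))
    return np.array(codes)

def transitions(codes, min_bits=6):
    """Flag a speaker transition where consecutive codes differ in many bits."""
    flips = np.sum(codes[1:] != codes[:-1], axis=1)   # Hamming distance
    return np.where(flips >= min_bits)[0] + 1         # window indices
</code></pre>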
<p>
<b class="subtopic">1st Dataset:</b> 5 unique speakers, each with equal speaking time.
</p>
<div class="responsive1e">
<div class="gallery">
<img src="images/dsp/1.png" alt="this" width="80%">
<div class="desc">Visualization of Clusters generated by the GMM Model</div>
</div>
</div>
<br>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/1/output1.gif" alt="this" width="70%">
<div class="desc">Density of each Cluster</div>
</div>
</div>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/1_1.png" alt="this" width="70%">
<div class="desc">Speaker Identification</div>
</div>
</div>
<br>
<hr>
<br>
<video width="100%" height="140" controls style=" border: 1px solid #777; margin: 10px;">
<source src="uniform.wav" type="audio/wav">
<track src="1_subs.vtt" kind="subtitles" srclang="en" label="English" default>
</video>
To view the speaker transition points, <a href="1_subs.vtt" target="_blank">click here</a>.
<p><br>
<b class="subtopic">2nd Dataset:</b> 5 unique speakers, with each speaker's duration increasing progressively.
</p>
<div class="responsive1e">
<div class="gallery">
<img src="images/dsp/2.png" alt="this" width="80%">
<div class="desc">Visualization of Clusters generated by the GMM Model</div>
</div>
</div>
<br>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/2/output2.gif" alt="this" width="70%">
<div class="desc">Density of each Cluster</div>
</div>
</div>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/2_1_ml.png" alt="this" width="70%">
<div class="desc">Speaker Identification</div>
</div>
</div>
<br>
<p><br>
<br>
<br>
<b class="subtopic">3rd Dataset:</b> 5 unique speakers, each with equal speaking time; after the halfway point of the signal, the sequence of speakers repeats.
</p>
<!-- <div class="responsive1e">
<div class="gallery">
<img src="images/dsp/3.png" alt="this" width="80%">
<div class="desc">Visualization of Clusters generated by the GMM Model</div>
</div>
</div> -->
<br>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/3(Repeat)/output3.gif" alt="this" width="70%">
<div class="desc">Density of each Cluster</div>
</div>
</div>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/3_1_ml.png" alt="this" width="70%">
<div class="desc">Speaker Identification</div>
</div>
</div>
<br>
<hr>
<br>
<br>
<p><br>
<br>
<br>
<b class="subtopic">4th Dataset:</b> 5 unique speakers with each speaker having variable duration.
</p>
<!-- <div class="responsive1e">
<div class="gallery">
<img src="images/dsp/4.png" alt="this" width="80%">
<div class="desc">Visualization of Clusters generated by the GMM Model</div>
</div>
</div> -->
<br>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/4(Random)/output4.gif" alt="this" width="70%">
<div class="desc">Density of each Cluster</div>
</div>
</div>
<div class="responsive">
<div class="gallery">
<img src="images/dsp/4_1_ml.png" alt="this" width="70%">
<div class="desc">Speaker Identification</div>
</div>
</div>
<br>
<hr>
<br>
<!-- <video width="100%" height="140" controls style=" border: 1px solid #777; margin: 10px;">
<source src="uniform.wav" type="audio/wav">
<track src="1_subs.vtt" kind="subtitles" srclang="en" label="English" default>
</video> -->
<!-- <img src="images/dsp/1/output1.gif" alt="this" width="49%" style="position: inline;"> -->
</div>
<br>
<h3 id="code">Code</h3> The code can be found <a href="https://github.com/mahimg/Speaker-recognition/tree/master/Code" style="color: rgb(0,0,255)">here</a>. Model files for the supervised method can be found <a href="https://drive.google.com/open?id=1t5NSV-dGQi4UoQt1QljB_Co1KV5EBFuT" style="color: rgb(0,0,255)">here</a>.
<h3 id="Conclusions">4. Conclusion</h3>
<div class="subheading" style="padding-bottom: 200px;">
<h5>4.1 Summary</h5>
<p>
The overall aim of this project was to segment speech sequences based on speaker transitions, where the number of speakers is not known beforehand. We achieved this first with a supervised approach, in which data for the speakers
involved in the conversation was available beforehand, and second with an unsupervised approach, which rarely failed to detect the speaker transitions.
</p>
<h5>4.2 Future Extensions
</h5>
<p>
Further work could improve robustness to noise and to non-speech audio such as music. Moreover, an advanced speaker diarization system should be able to handle overlapped speech, which occurs regularly
in natural conversation.
</p>
<h5>4.3 Applications
</h5>
<p>
<ul>
<li>One of the most important applications is the transcription of conversations: the segmentation can be used to localise each speaker's utterances and pool them for model training, which in turn improves transcription accuracy.
</li>
<li>
It can be used to crop the speech of a specific person of interest from a long audio clip.</li>
</ul>
</p>
</div>
</div>
</main>
</div>
</div>
</section>
<!-- footer section -->
<!-- <footer id="contact" class="footer">
<div class="container-fluid">
<div class="col-md-2 left">
<h4>Office Location</h4>
<p> Collins Street West Victoria 8007 Australia.</p>
</div>
<div class="col-md-2 left">
<h4>Contact</h4>
<p> Call: 612.269.8419 <br> Email : <a href="mailto:[email protected]"> [email protected] </a></p>
</div>
<div class="col-md-2 left">
<h4>Social presense</h4>
<ul class="footer-share">
<li><a href="#"><i class="fa fa-facebook"></i></a></li>
<li><a href="#"><i class="fa fa-twitter"></i></a></li>
<li><a href="#"><i class="fa fa-linkedin"></i></a></li>
<li><a href="#"><i class="fa fa-google-plus"></i></a></li>
</ul>
</div>
<div class="col-md-6 right">
<p>© 2015 All rights reserved. All Rights Reserved<br> Made with <i class="fa fa-heart pulse"></i> by <a href="http://www.designstub.com/">Designstub</a></p>
</div>
</div>
</footer> -->
<!-- footer section -->
<!-- JS FILES -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
<script src="js/bootstrap.min.js"></script>
<script src="js/jquery.flexslider-min.js"></script>
<script src="js/retina.min.js"></script>
<script src="js/modernizr.js"></script>
<script src="js/main.js"></script>
</body>
</html>