index.html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
  <meta charset="utf-8" />
  <meta name="generator" content="pandoc" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
  <title>0ba9c091992f4edba56aa694018004a8</title>
  <style>
    html {
      line-height: 1.5;
      font-family: Georgia, serif;
      font-size: 20px;
      color: #1a1a1a;
      background-color: #fdfdfd;
    }
    body {
      margin: 0 auto;
      max-width: 36em;
      padding-left: 50px;
      padding-right: 50px;
      padding-top: 50px;
      padding-bottom: 50px;
      hyphens: auto;
      overflow-wrap: break-word;
      text-rendering: optimizeLegibility;
      font-kerning: normal;
    }
    @media (max-width: 600px) {
      body {
        font-size: 0.9em;
        padding: 1em;
      }
      h1 {
        font-size: 1.8em;
      }
    }
    @media print {
      body {
        background-color: transparent;
        color: black;
        font-size: 12pt;
      }
      p, h2, h3 {
        orphans: 3;
        widows: 3;
      }
      h2, h3, h4 {
        page-break-after: avoid;
      }
    }
    p {
      margin: 1em 0;
    }
    a {
      color: #1a1a1a;
    }
    a:visited {
      color: #1a1a1a;
    }
    img {
      max-width: 100%;
    }
    h1, h2, h3, h4, h5, h6 {
      margin-top: 1.4em;
    }
    h5, h6 {
      font-size: 1em;
      font-style: italic;
    }
    h6 {
      font-weight: normal;
    }
    ol, ul {
      padding-left: 1.7em;
      margin-top: 1em;
    }
    li > ol, li > ul {
      margin-top: 0;
    }
    blockquote {
      margin: 1em 0 1em 1.7em;
      padding-left: 1em;
      border-left: 2px solid #e6e6e6;
      color: #606060;
    }
    code {
      font-family: Menlo, Monaco, 'Lucida Console', Consolas, monospace;
      font-size: 85%;
      margin: 0;
    }
    pre {
      margin: 1em 0;
      overflow: auto;
    }
    pre code {
      padding: 0;
      overflow: visible;
      overflow-wrap: normal;
    }
    .sourceCode {
     background-color: transparent;
     overflow: visible;
    }
    hr {
      background-color: #1a1a1a;
      border: none;
      height: 1px;
      margin: 1em 0;
    }
    table {
      margin: 1em 0;
      border-collapse: collapse;
      width: 100%;
      overflow-x: auto;
      display: block;
      font-variant-numeric: lining-nums tabular-nums;
    }
    table caption {
      margin-bottom: 0.75em;
    }
    tbody {
      margin-top: 0.5em;
      border-top: 1px solid #1a1a1a;
      border-bottom: 1px solid #1a1a1a;
    }
    th {
      border-top: 1px solid #1a1a1a;
      padding: 0.25em 0.5em 0.25em 0.5em;
    }
    td {
      padding: 0.125em 0.5em 0.25em 0.5em;
    }
    header {
      margin-bottom: 4em;
      text-align: center;
    }
    #TOC li {
      list-style: none;
    }
    #TOC ul {
      padding-left: 1.3em;
    }
    #TOC > ul {
      padding-left: 0;
    }
    #TOC a:not(:hover) {
      text-decoration: none;
    }
    code{white-space: pre-wrap;}
    span.smallcaps{font-variant: small-caps;}
    div.columns{display: flex; gap: min(4vw, 1.5em);}
    div.column{flex: auto; overflow-x: auto;}
    div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
    ul.task-list{list-style: none;}
    ul.task-list li input[type="checkbox"] {
      width: 0.8em;
      margin: 0 0.8em 0.2em -1.6em;
      vertical-align: middle;
    }
    pre > code.sourceCode { white-space: pre; position: relative; }
    pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
    pre > code.sourceCode > span:empty { height: 1.2em; }
    .sourceCode { overflow: visible; }
    code.sourceCode > span { color: inherit; text-decoration: inherit; }
    div.sourceCode { margin: 1em 0; }
    pre.sourceCode { margin: 0; }
    @media screen {
    div.sourceCode { overflow: auto; }
    }
    @media print {
    pre > code.sourceCode { white-space: pre-wrap; }
    pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
    }
    pre.numberSource code
      { counter-reset: source-line 0; }
    pre.numberSource code > span
      { position: relative; left: -4em; counter-increment: source-line; }
    pre.numberSource code > span > a:first-child::before
      { content: counter(source-line);
        position: relative; left: -1em; text-align: right; vertical-align: baseline;
        border: none; display: inline-block;
        -webkit-touch-callout: none; -webkit-user-select: none;
        -khtml-user-select: none; -moz-user-select: none;
        -ms-user-select: none; user-select: none;
        padding: 0 4px; width: 4em;
        color: #aaaaaa;
      }
    pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
    div.sourceCode
      {   }
    @media screen {
    pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
    }
    code span.al { color: #ff0000; font-weight: bold; } /* Alert */
    code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
    code span.at { color: #7d9029; } /* Attribute */
    code span.bn { color: #40a070; } /* BaseN */
    code span.bu { color: #008000; } /* BuiltIn */
    code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
    code span.ch { color: #4070a0; } /* Char */
    code span.cn { color: #880000; } /* Constant */
    code span.co { color: #60a0b0; font-style: italic; } /* Comment */
    code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
    code span.do { color: #ba2121; font-style: italic; } /* Documentation */
    code span.dt { color: #902000; } /* DataType */
    code span.dv { color: #40a070; } /* DecVal */
    code span.er { color: #ff0000; font-weight: bold; } /* Error */
    code span.ex { } /* Extension */
    code span.fl { color: #40a070; } /* Float */
    code span.fu { color: #06287e; } /* Function */
    code span.im { color: #008000; font-weight: bold; } /* Import */
    code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
    code span.kw { color: #007020; font-weight: bold; } /* Keyword */
    code span.op { color: #666666; } /* Operator */
    code span.ot { color: #007020; } /* Other */
    code span.pp { color: #bc7a00; } /* Preprocessor */
    code span.sc { color: #4070a0; } /* SpecialChar */
    code span.ss { color: #bb6688; } /* SpecialString */
    code span.st { color: #4070a0; } /* String */
    code span.va { color: #19177c; } /* Variable */
    code span.vs { color: #4070a0; } /* VerbatimString */
    code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
    .display.math{display: block; text-align: center; margin: 0.5rem auto;}
  </style>
  <!--[if lt IE 9]>
    <script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
  <![endif]-->
</head>
<body>
<div class="cell code" data-execution_count="1" id="iXZsRRTpiMXc">
<div class="sourceCode" id="cb1"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 210107005_UoL_DSM140_NLP_Text_Classification_CW_Sub_v240107wk.ipynb</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Commentable @ https://colab.research.google.com/drive/1kUTphSV9lHhbu_HT_tvffIPEtFWpFPIg?usp=sharing</span></span></code></pre></div>
</div>
<section id="1st-uol-dsm140-cw---report" class="cell markdown"
id="wxi-a1JwISCC">
<h1>1st UoL DSM140 CW - Report</h1>
</section>
<section id="i-introduction" class="cell markdown" id="aefiBqEAI6qF">
<h2>I. Introduction</h2>
</section>
<section id="1-introduction-to-the-domain-specific-area"
class="cell markdown" id="Be4kcwTMKNPv">
<h3>1. Introduction to the domain-specific area</h3>
<p>The domain-specific area of interest is the application of AI and
machine learning techniques for Static Application Security Testing
(SAST) and vulnerability detection in critical infrastructure software.
This area is particularly relevant to the Artificial Intelligence Cyber
Challenge (AIxCC), a competition that encourages the development of AI
systems to secure critical code.</p>
<p>In our interconnected world, software underpins everything from
financial systems to public utilities. As this code enables modern life
and drives productivity, it also creates an expanding attack surface for
malicious actors. The AIxCC is a two-year competition that asks the best
and brightest in AI and cybersecurity to defend the software on which
almost everyone rely. The competition will award a cumulative 30 million
in prizes to teams with the best systems, including 7 million in prizes
to small businesses to empower entrepreneurial innovation.</p>
<p>The AIxCC is particularly focused on securing open-source software,
which comprises most of the code running on critical infrastructure
today, including the electricity and telecommunications sectors. The
competition is collaborating closely with the open-source community to
guide teams in creating AI systems capable of addressing vital
cybersecurity issues.</p>
<p>The challenge is to develop innovative systems guided by AI and
Machine Learning to semi-automatically find and fix software
vulnerabilities [2]. The AIxCC competition will foster innovative
research via a gamified environment that challenges the competitors to
design Cyber Reasoning Systems (CRSs) that integrate novel AI [4].</p>
<p>In the context of C6AI (Combined C++ Code Cybersecurity &amp;
CWE-based Classification AI), the focus is on using text classification
methods to analyse and classify C++ code for potential vulnerabilities.
This involves converting the raw text of the code into numerical feature
vectors that can be processed by machine learning algorithms. Techniques
such as text stemming and n-gram tokenization are used in this
preprocessing stage.</p>
<p>In summary, the domain-specific area is the intersection of AI,
cybersecurity, and software vulnerability detection, with a particular
focus on static analysis of C++ code. The goal is to develop AI systems
that can effectively identify and address software vulnerabilities,
thereby enhancing the security of critical infrastructure.</p>
<p>Ref: [1] <a href="https://aicyberchallenge.com/about"
class="uri">https://aicyberchallenge.com/about</a> [2] <a
href="https://www.sbir.gov/node/2464965"
class="uri">https://www.sbir.gov/node/2464965</a> [3] <a
href="https://www.darpa.mil/news-events/2023-12-14"
class="uri">https://www.darpa.mil/news-events/2023-12-14</a> [4] <a
href="https://openssf.org/blog/2023/12/19/deconstructing-the-ai-cyber-challenge-aixcc/"
class="uri">https://openssf.org/blog/2023/12/19/deconstructing-the-ai-cyber-challenge-aixcc/</a></p>
</section>
<section id="2-description-of-the-selected-dataset"
class="cell markdown" id="4MdAqy4aKHTu">
<h3>2. Description of the selected dataset</h3>
<p>The Juliet C/C++ 1.3.1 SARD dataset is a collection of test cases in
the C/C++ language, organized under 118 different Common Weakness
Enumerations (CWEs). This dataset is part of the Software Assurance
Reference Dataset (SARD) provided by the National Institute of Standards
and Technology (NIST) as ‘Juliet C/C++ 1.3 with extra support’ @ <a
href="https://samate.nist.gov/SARD/test-suites/116"
class="uri">https://samate.nist.gov/SARD/test-suites/116</a>.</p>
<p>The dataset is designed to test software for potential
vulnerabilities and weaknesses. Each test case in the dataset is
associated with a specific CWE, which represents a type of software
vulnerability. The dataset includes both 'good' and 'bad' examples, with
the 'bad' examples demonstrating the vulnerability and the 'good'
examples showing a correct or safe way to write the code.</p>
<p>The Juliet C/C++ 1.3.1 SARD dataset is publicly available and not
subject to copyright protection. It is made available under the CC0 1.0
Public Domain License.</p>
<p>The dataset is structured in a way that each CWE has its own
directory, and within each directory, there are multiple text files,
each representing a test case. The test cases are labelled with the
CWE-ID, which can be used to identify the type of vulnerability that the
test case is associated with.</p>
<p>The 671 MB combressed size of the Juliet C/C++ 1.3 SARD with extra
support, contains over 64,099 test cases. Given that the SARD has over
170,000 programs and the Juliet C/C++ 1.3 dataset is a part of this
collection, it can be inferred that the dataset is quite large. The data
types in the dataset are primarily text, as the test cases are
represented as C/C++ code in text files.</p>
<p>The dataset is typically used in machine learning experiments, where
it is divided into training, validation, and test sets. The SARD dataset
has already been divided into training and test sets, but it lacks a
validation set. Therefore, it is common practice to create a validation
set using an 80:20 split of the training data.</p>
</section>
<section id="3-objectives-of-the-project" class="cell markdown"
id="gN8m-55TKGNQ">
<h3>3. Objectives of the project</h3>
<p>The objectives of the project are to enhance the out-of-sample
generalization capabilities of the currently developed C6AI Cyber
Reasoning System (CRS) and to measure its 'Vulnerability Discovery
Accuracy'. This is in line with the AIxCC CRS Areas of Excellence, which
emphasize the importance of developing systems that can accurately
identify vulnerabilities in software, particularly in the context of
critical infrastructure.</p>
<p>The project aims to contribute to the AI Cyber Challenge (AIxCC) by
developing a CRS that can effectively and efficiently detect
vulnerabilities in C++ code. The focus on out-of-sample generalization
is crucial because it ensures that the system can perform well on new,
unseen data, which is a common scenario in real-world applications. The
ability to generalize well is indicative of a system's robustness and
its potential to adapt to evolving cybersecurity threats.</p>
<p>The impact of achieving these objectives is significant. By improving
the accuracy of vulnerability discovery, the project directly
contributes to the security of critical infrastructure software. This
has far-reaching implications for national security, economic stability,
and public safety, as critical infrastructure systems are essential to
the functioning of society.</p>
<p>Moreover, the project's contributions to the AIxCC challenge could
lead to advancements in the field of AI and cybersecurity. By
participating in the gamified environment of the competition, the
project fosters innovation and encourages the development of new
techniques and methodologies in AI-driven cybersecurity.</p>
<p>The potential contributions of the results to the AIxCC challenge
could include:</p>
<ol>
<li>Demonstrating the effectiveness of the C6AI CRS in accurately
identifying and classifying software vulnerabilities.</li>
<li>Providing insights into the strengths and weaknesses of the current
approaches to SAST and vulnerability detection.</li>
<li>Offering a benchmark for future research and development in the
domain of AI-powered cybersecurity solutions.</li>
<li>Encouraging the adoption of AI and machine learning techniques in
the cybersecurity industry, particularly for the protection of critical
infrastructure.</li>
</ol>
<p>In summary, the project's objectives are to develop a CRS that excels
in out-of-sample generalization and vulnerability discovery accuracy,
with the potential to make significant contributions to the AIxCC
challenge and the broader field of cybersecurity.</p>
</section>
<section id="4-evaluation-methodology" class="cell markdown"
id="My7GCBqFJ_4e">
<h3>4. Evaluation methodology</h3>
<p>The evaluation methodology for the project will involve several key
metrics to assess the performance of the C6AI Cyber Reasoning System
(CRS) in identifying vulnerabilities in C++ code. These metrics will
provide a comprehensive understanding of the system's performance,
including its ability to correctly identify vulnerabilities (accuracy),
its ability to correctly identify true vulnerabilities (precision), its
ability to identify all actual vulnerabilities (recall), and a balanced
measure of precision and recall (F-measure).</p>
<ol>
<li><p><strong>Accuracy</strong>: This is the most intuitive performance
measure, and it simply is a ratio of correctly predicted observation to
the total observations. It is the ability of the model to correctly
identify both vulnerabilities and non-vulnerabilities. It is calculated
as (True Positives + True Negatives) / (True Positives + False Positives
+ True Negatives + False Negatives).</p></li>
<li><p><strong>Precision</strong>: Precision is the ratio of correctly
predicted positive observations to the total predicted positives. It is
also called Positive Predictive Value. It is a measure of amongst all
the identified vulnerabilities, how many of them are vulnerabilities. It
is calculated as True Positives / (True Positives + False
Positives).</p></li>
<li><p><strong>Recall (Sensitivity)</strong>: Recall is the ratio of
correctly predicted positive observations to all observations in actual
class. It is also called Sensitivity, Hit Rate, or True Positive Rate.
It is a measure of the ability of the model to identify all possible
vulnerabilities. It is calculated as True Positives / (True Positives +
False Negatives).</p></li>
<li><p><strong>F-Measure (F1 Score)</strong>: F1 Score is the weighted
average of Precision and Recall. Therefore, this score takes both false
positives and false negatives into account. It is suitable for uneven
class distribution problems. It is calculated as 2<em>(Recall </em>
Precision) / (Recall + Precision).</p></li>
</ol>
<p>The evaluation will be conducted using a test set that the model has
not been trained on to ensure an unbiased assessment of the model's
performance. This is crucial to avoid overfitting, where the model
performs well on the training data but poorly on new, unseen data. The
test data will be representative of the real-world data the model will
encounter, ensuring the evaluation reflects the model's true predictive
performance.</p>
<p>In addition, the project will employ techniques such as
cross-validation to further ensure the robustness of the evaluation. In
n-Fold cross-validation, the data is divided into n non-overlapping
subsets. The model is trained on n-1 subsets and tested on the remaining
subset. This process is repeated n times, with each subset used once as
the test set. The error estimation is averaged over all n trials to get
the total accuracy of the model.</p>
<p>The evaluation methodology will provide a comprehensive understanding
of the model's performance, allowing for the identification of areas of
strength and potential improvement. This will ultimately contribute to
the development of a more accurate and robust Cyber Reasoning
System.</p>
</section>
<section id="ii-implementation" class="cell markdown" id="rMXfDgSEI9_M">
<h2>II. Implementation</h2>
</section>
<section id="5-pre-processing" class="cell markdown" id="OjHHMuqsJ6bG">
<h3>5. Pre-processing</h3>
<p>The pre-processing steps for the text classification task in the
provided Python file include several steps to convert the raw text data
into a format that can be used by machine learning algorithms.</p>
<ol>
<li><p><strong>Text Lowercasing</strong>: All the text is converted to
lower case. This is done to ensure that the algorithm does not treat the
same words in different cases as different words.</p></li>
<li><p><strong>Punctuation Removal</strong>: All punctuation marks are
removed from the text. Punctuation does not add any extra information
while training the machine learning model. Moreover, removing
punctuation reduces the size of the vocabulary and thus increases the
speed of training.</p></li>
<li><p><strong>Stop Words Removal</strong>: Stop words are the most
common words in a language like 'the', 'a', 'on', 'is', 'all'. These
words do not carry important meaning and are usually removed from texts.
The Python file uses a list of English stop words from the NLTK
library.</p></li>
<li><p><strong>Stemming</strong>: Stemming is the process of reducing
inflected (or sometimes derived) words to their word stem, base or root
form. The Python file uses the Snowball Stemmer from the NLTK
library.</p></li>
<li><p><strong>N-gram Tokenization</strong>: The text is tokenized into
n-grams. N-grams are contiguous sequences of n items from a given sample
of text or speech. This helps to capture the context and semantic
meanings of phrases.</p></li>
<li><p><strong>Vectorization</strong>: The tokenized text is then
converted into numerical vectors which can be used as input to the
machine learning algorithm. The Python file uses the bag-of-words model
to convert the text into vectors. The bag-of-words model represents each
text as a vector in a high-dimensional space, where each unique word in
the text is represented by one dimension, and the value in that
dimension represents the frequency of the word in the text.</p></li>
</ol>
<p>The Python file reads .cpp files as text into a pandas dataframe. The
vocabulary is built from the unique words in the text after applying the
pre-processing steps.</p>
</section>
<section id="6-baseline-performance" class="cell markdown"
id="gadWv8GLJ2nv">
<h3>6. Baseline performance</h3>
<p>The Naive Bayes classifier was chosen as the baseline for the C6AI
Cyber Reasoning System (CRS) project due to its simplicity, efficiency,
and proven effectiveness in text classification tasks. This classifier
was implemented using the SciKit Learn library, as shown in the attached
Python file.</p>
<p>The Naive Bayes classifier was selected as the baseline because it is
a well-established algorithm in the field of text classification and has
been used extensively in previous research, including by our lecturer
for Statistical Data Mining <a
href="https://github.com/nsadawi/Advanced-ML-Projects/tree/4e112da6c42670052eca1152bd0a786afc30c1c5">Dr.
Noureddin Sadawi</a>. It is a probabilistic classifier that makes use of
Bayes' theorem with strong independence assumptions between the
features. It is particularly suited for high-dimensional datasets, like
text data, and is known for its efficiency and scalability.</p>
<p>The 0.74 (+/- 0.03) MultinomialNB Accuracy performance of the Naive
Bayes classifier provides a meaningful benchmark for comparison with the
more complex Convolutional Neural Network (CNN) model. The CNN model,
implemented using the Keras library, is expected to outperform the Naive
Bayes classifier due to its ability to capture local dependencies in the
data and its capacity for hierarchical feature learning. However, the
Naive Bayes classifier provides a valuable point of reference to
evaluate the degree of improvement achieved with the CNN model.</p>
<p>In conclusion, the Naive Bayes classifier was chosen as the baseline
due to its simplicity, efficiency, and proven effectiveness in text
classification tasks. Its performance provides a meaningful benchmark
for comparison with the more complex CNN model.</p>
</section>
<div class="cell code" id="khkD1BDtk3Cp">
<div class="sourceCode" id="cb2"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co">#                                               precision    recall  f1-score   support</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co">#           CWE121_Stack_Based_Buffer_Overflow       0.91      0.71      0.80       324</span></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co">#            CWE122_Heap_Based_Buffer_Overflow       0.87      0.66      0.75       316</span></span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="co">#                     CWE124_Buffer_Underwrite       0.63      0.85      0.72       331</span></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="co">#                       CWE126_Buffer_Overread       0.86      0.92      0.89       335</span></span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="co">#                      CWE127_Buffer_Underread       0.92      0.77      0.83       333</span></span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a><span class="co">#            CWE134_Uncontrolled_Format_String       0.98      0.90      0.94       350</span></span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a><span class="co">#                      CWE190_Integer_Overflow       0.94      0.79      0.86       298</span></span>
<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a><span class="co">#                     CWE191_Integer_Underflow       0.95      0.77      0.85       294</span></span>
<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a><span class="co">#             CWE194_Unexpected_Sign_Extension       1.00      0.59      0.74       165</span></span>
<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a><span class="co">#   CWE195_Signed_to_Unsigned_Conversion_Error       0.99      0.55      0.71       158</span></span>
<span id="cb2-13"><a href="#cb2-13" aria-hidden="true" tabindex="-1"></a><span class="co">#              CWE197_Numeric_Truncation_Error       0.71      0.90      0.79       350</span></span>
<span id="cb2-14"><a href="#cb2-14" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE23_Relative_Path_Traversal       0.91      0.98      0.95       350</span></span>
<span id="cb2-15"><a href="#cb2-15" aria-hidden="true" tabindex="-1"></a><span class="co">#                        CWE369_Divide_by_Zero       0.83      0.91      0.87       350</span></span>
<span id="cb2-16"><a href="#cb2-16" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE36_Absolute_Path_Traversal       0.82      0.91      0.86       350</span></span>
<span id="cb2-17"><a href="#cb2-17" aria-hidden="true" tabindex="-1"></a><span class="co">#                   CWE400_Resource_Exhaustion       1.00      0.79      0.89       156</span></span>
<span id="cb2-18"><a href="#cb2-18" aria-hidden="true" tabindex="-1"></a><span class="co">#                           CWE401_Memory_Leak       0.81      0.82      0.81       333</span></span>
<span id="cb2-19"><a href="#cb2-19" aria-hidden="true" tabindex="-1"></a><span class="co">#                           CWE415_Double_Free       0.76      0.76      0.76       350</span></span>
<span id="cb2-20"><a href="#cb2-20" aria-hidden="true" tabindex="-1"></a><span class="co">#         CWE457_Use_of_Uninitialized_Variable       0.94      0.97      0.96       297</span></span>
<span id="cb2-21"><a href="#cb2-21" aria-hidden="true" tabindex="-1"></a><span class="co">#                       CWE563_Unused_Variable       0.81      1.00      0.89       350</span></span>
<span id="cb2-22"><a href="#cb2-22" aria-hidden="true" tabindex="-1"></a><span class="co">#               CWE590_Free_Memory_Not_on_Heap       0.88      0.88      0.88       348</span></span>
<span id="cb2-23"><a href="#cb2-23" aria-hidden="true" tabindex="-1"></a><span class="co">#   CWE680_Integer_Overflow_to_Buffer_Overflow       0.98      0.85      0.91       301</span></span>
<span id="cb2-24"><a href="#cb2-24" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE690_NULL_Deref_From_Return       1.00      0.31      0.47       167</span></span>
<span id="cb2-25"><a href="#cb2-25" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE762_Mismatched_Memory_Management_Routines       0.79      0.85      0.82       349</span></span>
<span id="cb2-26"><a href="#cb2-26" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE789_Uncontrolled_Mem_Alloc       0.57      0.97      0.72       323</span></span>
<span id="cb2-27"><a href="#cb2-27" aria-hidden="true" tabindex="-1"></a><span class="co">#                   CWE78_OS_Command_Injection       1.00      0.95      0.97       350</span></span>
<span id="cb2-28"><a href="#cb2-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-29"><a href="#cb2-29" aria-hidden="true" tabindex="-1"></a><span class="co">#                                     accuracy                           0.84      7628</span></span>
<span id="cb2-30"><a href="#cb2-30" aria-hidden="true" tabindex="-1"></a><span class="co">#                                    macro avg       0.87      0.81      0.83      7628</span></span>
<span id="cb2-31"><a href="#cb2-31" aria-hidden="true" tabindex="-1"></a><span class="co">#                                 weighted avg       0.86      0.84      0.84      7628</span></span></code></pre></div>
</div>
<section id="7-classification-approach" class="cell markdown"
id="LkAvcU-_J0Gt">
<h3>7. Classification approach</h3>
<p>The C6AI Cyber Reasoning System (CRS) project used a Naive Bayes
classifier for text classification, specifically for identifying
vulnerabilities in software code. The features used in the classifier
were derived from the 'Test-Case-Code' and the labels were the
'CWE-ID'.</p>
<p>The 'Test-Case-Code' features were chosen because they represent the
actual code snippets that could potentially contain vulnerabilities.
These features were transformed into a bag-of-words representation and
then weighted using TF-IDF (Term Frequency-Inverse Document Frequency).
This transformation was crucial in converting the raw text into a
numerical format that the classifier could process.</p>
<p>The 'CWE-ID' [a standard identifier for software vulnerabilities,
allowing the results to be easily interpreted] was used as the target
label because it represents the specific type of vulnerability present
in the code. The classifier was trained to predict this label based on
the features derived from the 'Test-Case-Code'.</p>
<p>The Naive Bayes classifier was chosen for its simplicity and
efficiency in text classification tasks. It was implemented using the
SciKit Learn library for the baseline model. For the final model, a
Convolutional Neural Network (CNN) was built using the Keras library.
CNNs are known for their effectiveness in text classification tasks, as
they can capture local dependencies in the text and can manage
variable-length inputs.</p>
<p>The Python script used for training and evaluating the classifier was
designed to be easily understood and modified, enhancing the project's
reproducibility. This means that the approach can be replicated by
others using different programming languages, development environments,
libraries, and algorithms.</p>
</section>
<section id="8-coding-style" class="cell markdown" id="WVyw8ZgoJw5H">
<h3>8. Coding style</h3>
<p>The Python code provided is to adhere to several key coding
conventions, which are crucial for maintaining high-quality, readable,
and maintainable code.</p>
<ol>
<li><p><strong>Indentation</strong>: The code uses consistent
indentation, which is a fundamental aspect of Python syntax and crucial
for code readability.</p></li>
<li><p><strong>Variable Naming</strong>: The code uses meaningful names
for variables, which makes the code more understandable and
maintainable. For example, <code>porter_stemmer</code>,
<code>stop_words</code>, and <code>global_start</code> are all
descriptive variable names that give a clear indication of their purpose
in the code.</p></li>
<li><p><strong>Use of Libraries</strong>: The code makes extensive use
of libraries, including <code>nltk</code>, <code>tensorflow</code>,
<code>keras</code>, <code>numpy</code>, <code>pandas</code>, and
<code>sklearn</code>, among others. This is a good practice as it
leverages existing, well-tested functionality and can make the code more
concise and efficient.</p></li>
<li><p><strong>Comments</strong>: The code includes numerous comments,
which are essential for explaining the purpose of code blocks, the
functionality of functions, and the meaning of variables. This is a good
practice as it makes the code more understandable for others (and for
the original coder at a later date).</p></li>
<li><p><strong>Avoiding Magic Numbers</strong>: The code defines several
constants at the beginning (like <code>epochs</code>,
<code>batch_size</code>, <code>seed</code>, etc.), which is a good
practice as it avoids the use of unnamed numerical constants ("magic
numbers") in the code. This makes the code more readable and easier to
modify.</p></li>
<li><p><strong>Code Organization</strong>: The code is well-organized,
with clear sections for importing libraries, setting up variables,
defining functions, and executing code. This organization makes the code
easier to follow and understand.</p></li>
</ol>
<p>In summary, the code in the provided Python file appears to follow
good coding practices, including consistent indentation, meaningful
variable names, extensive use of libraries, comprehensive comments,
avoidance of magic numbers, and clear organization. These practices
contribute to making the code high-quality, readable, and
maintainable.</p>
</section>
<section id="iii-outcome-conclusions" class="cell markdown"
id="FilYFIQzJDLi">
<h2>III. Outcome Conclusions</h2>
</section>
<section id="9-evaluation" class="cell markdown" id="52xkRUxaJrxf">
<h3>9. Evaluation</h3>
<p>The evaluation of the C6AI Cyber Reasoning System (CRS) classifier
was performed using the Python scripts provided in this notebook. Those
scripts finally use a baseline-beating CNN model after initially using
multiple common-sense models and statistical data mining algorithms
[starting with the Naive Bayes classifier algorithm] to train various
baseline models and then makes predictions on the entire dataset.</p>
<p>The script uses the following metrics for evaluation:</p>
<ol>
<li><p><strong>Accuracy</strong>: This metric measures the ratio of
correctly predicted observations to the total observations. It is the
ability of the model to correctly identify both vulnerabilities and
non-vulnerabilities.</p></li>
<li><p><strong>Precision</strong>: This metric measures the ratio of
correctly predicted positive observations to the total predicted
positives. It is a measure of amongst all the identified
vulnerabilities, how many of them are vulnerabilities.</p></li>
<li><p><strong>Recall (Sensitivity)</strong>: This metric measures the
ratio of correctly predicted positive observations to all observations
in the actual class. It is a measure of the ability of the model to
identify all possible vulnerabilities.</p></li>
<li><p><strong>F-Measure (F1 Score)</strong>: This metric is the
weighted average of Precision and Recall. Therefore, this score takes
both false positives and false negatives into account.</p></li>
</ol>
<p>The script uses the SciKit Learn's built-in classification report to
return these metrics.</p>
<p>The results of the evaluation provide a quantitative measure of the
performance of the CRS classifier. By comparing these results with a
suitable baseline, we can assess the improvement achieved by our
approach. The specific values of these metrics depended on the actual
data used for training and testing the classifier [as shown and
validated in the rest of the notebook].</p>
</section>
<div class="cell code" data-execution_count="2" id="M_5kw2qMQ3-x">
<div class="sourceCode" id="cb3"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Evaluated the model on the test set</span></span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="co">## 87/87 [==============================] - 1s 11ms/step - loss: 0.2339 - accuracy: 0.9259</span></span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="co">## Test loss: 0.23388110101222992</span></span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="co">## Test accuracy: 0.9259259104728699</span></span></code></pre></div>
</div>
<section id="10-summary-of-the-project-and-its-results"
class="cell markdown" id="zNGw85fqJLSh">
<h3>10. Summary of the project and its results</h3>
<p>Baseline-beating CNN Model Test Accuracy was 0.925 (in contrast to
the 0.74 (+/- 0.03) MultinomialNB Accuracy or the Accuracy: 0.87 (+/-
0.02) RandomForestClassifier Accuracy); nevertheless, the C6AI Cyber
Reasoning System (CRS) project once further developed [using creative
advances out of scope for this elementary NLP assignment] could make
significant contributions to the field of text classification,
particularly in the context of identifying vulnerabilities in software
code. While the project ultimately employed a CNN model [amongst
others], its initial choice was a Naive Bayes classifier, as a popular
choice for text classification tasks due to its simplicity and
efficiency. The classifier was trained and evaluated using Python-based
SciKit Learn's scripts, which were designed to be easily understood and
modified, enhancing the project's reproducibility.</p>
<p>The project's preprocessing steps, including the transformation of
text into a bag-of-words representation and the use of TF-IDF weighting,
were crucial in preparing the data for the classifier. These steps
converted the raw text into a format that the classifier could process,
and they could be readily adapted for other text classification tasks in
different domains.</p>
<p>The CRS classifier demonstrated robust performance across several
evaluation metrics, including accuracy, precision, recall, and F1 score.
These metrics provide a comprehensive assessment of the classifier's
performance, considering both its ability to correctly identify
vulnerabilities and its ability to avoid false positives and false
negatives.</p>
<p>The project's approach is highly transferable to other
domain-specific areas that involve text classification. The
preprocessing steps and the Naive Bayes classifier can be applied to any
text data, provided that the data is labelled for supervised learning.
Furthermore, the Python script can be easily modified to accommodate
different data sources, classification algorithms, or evaluation
metrics.</p>
<p>The project's approach can also be replicated using different
programming languages, development environments, libraries, and
algorithms. The key steps of the approach, including text preprocessing,
classifier training, and performance evaluation, are common tasks in
machine learning and natural language processing, and they can be
implemented in many programming languages that support these tasks, such
as R, Java, or C++. Similarly, different development environments or
libraries [such the new KerasNLP or Jax/PyTorch-backed Keras 3.0 API]
can be used to provide the necessary functionalities for these
tasks.</p>
<p>While the Naive Bayes classifier baseline was effective, the more
complex CNN model achieves higher performance on related tasks or
datasets. Alas, such model also has drawbacks, such as increased
computational cost and the risk of overfitting. Therefore, the choice of
classifier should be guided by the specific requirements and constraints
of each task.</p>
</section>
<section id="1st-uol-dsm140-cw---code" class="cell markdown"
id="V45GkVUFIKdt">
<h1>1st UoL DSM140 CW - Code</h1>
</section>
<div class="cell code" data-execution_count="3"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="kYsAWNY1LLdp" data-outputId="64c05abc-167d-4390-b9e0-a88c7423d58e">
<div class="sourceCode" id="cb4"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>CONTENT_PATH<span class="op">=</span><span class="st">&#39;/content/&#39;</span></span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !cd $CONTENT_PATH &amp;&amp; ls</span></span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls</span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>rm <span class="op">-</span>r sample_data</span></code></pre></div>
<div class="output stream stdout">
<pre><code>sample_data
</code></pre>
</div>
</div>
<section id="env-prepping" class="cell markdown" id="LYk3tygxLLdw">
<h1>Env Prepping</h1>
</section>
<section id="import-libraries" class="cell markdown" id="-5yr1LwPLLdx">
<h3>Import Libraries</h3>
</section>
<div class="cell code" data-execution_count="4" id="P3_Ty5JWDc1I">
<div class="sourceCode" id="cb6"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co"># import codecs,collections,csv,glob,io,itertools,json,logging,nltk,pathlib,\</span></span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co"># pickle,pprint,pytest,re,requests,shutil,string,sys,unicodedata,warnings,zipfile</span></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> codecs,collections,csv,glob,io,itertools,json,logging,nltk,os,pathlib,<span class="op">\</span></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a>pickle,pprint,pytest,re,requests,shutil,string,sys,time,unicodedata,warnings,zipfile</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="5" id="9NqDP5goLLdz">
<div class="sourceCode" id="cb7"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="co"># import time</span></span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="co">## from time import time</span></span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a>global_start <span class="op">=</span> time.time()</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="6" id="C35XKxHv0Bin">
<div class="sourceCode" id="cb8"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co">## TensorFlow backend only supports string inputs</span></span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="co"># import os</span></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>os.environ[<span class="st">&quot;KERAS_BACKEND&quot;</span>] <span class="op">=</span> <span class="st">&quot;tensorflow&quot;</span></span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> keras</span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras <span class="im">import</span> layers</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="7"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="14EQOfRF1cdA" data-outputId="89ef11a3-ee05-41dc-8919-25c3ec13dee9">
<div class="sourceCode" id="cb9"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>pip install <span class="op">-</span>q <span class="st">&quot;tensorflow-text&quot;</span> <span class="co"># ==2.13.*&quot;</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 9.8 MB/s eta 0:00:00
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="8" id="rAP1eUb50jQC">
<div class="sourceCode" id="cb11"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !pip install -q &quot;tensorflow-text&quot; # ==2.13.*&quot;</span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow_text <span class="im">as</span> tf_text</span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a><span class="co"># import keras</span></span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow <span class="im">as</span> tf</span>
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow.data <span class="im">as</span> tf_data</span>
<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow_datasets <span class="im">as</span> tfds</span>
<span id="cb11-8"><a href="#cb11-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-9"><a href="#cb11-9" aria-hidden="true" tabindex="-1"></a><span class="co"># from keras import layers</span></span>
<span id="cb11-10"><a href="#cb11-10" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras <span class="im">import</span> Model</span>
<span id="cb11-11"><a href="#cb11-11" aria-hidden="true" tabindex="-1"></a><span class="co"># from tensorflow.keras import layers</span></span>
<span id="cb11-12"><a href="#cb11-12" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras <span class="im">import</span> losses</span>
<span id="cb11-13"><a href="#cb11-13" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras <span class="im">import</span> utils</span>
<span id="cb11-14"><a href="#cb11-14" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Conv1D</span>
<span id="cb11-15"><a href="#cb11-15" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Dense</span>
<span id="cb11-16"><a href="#cb11-16" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Dropout</span>
<span id="cb11-17"><a href="#cb11-17" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Embedding</span>
<span id="cb11-18"><a href="#cb11-18" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Flatten</span>
<span id="cb11-19"><a href="#cb11-19" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Input</span>
<span id="cb11-20"><a href="#cb11-20" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> MaxPooling1D</span>
<span id="cb11-21"><a href="#cb11-21" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> TextVectorization</span>
<span id="cb11-22"><a href="#cb11-22" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> concatenate</span>
<span id="cb11-23"><a href="#cb11-23" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.utils <span class="im">import</span> plot_model</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="9" id="RobpRSWTvsz3">
<div class="sourceCode" id="cb12"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="op">%</span>matplotlib inline</span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns<span class="op">;</span> sns.<span class="bu">set</span>()</span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> sklearn.feature_extraction.text</span>
<span id="cb12-8"><a href="#cb12-8" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> sklearn.metrics</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="10" id="l3jUDW5oLLd4">
<div class="sourceCode" id="cb13"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> matplotlib <span class="im">import</span> pyplot <span class="im">as</span> plt</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> pandas.core.frame <span class="im">import</span> DataFrame</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> numpy.testing <span class="im">import</span> assert_array_equal</span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> google.colab <span class="im">import</span> files</span>
<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tempfile <span class="im">import</span> NamedTemporaryFile</span>
<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tqdm.notebook <span class="im">import</span> tqdm</span>
<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> typing <span class="im">import</span> KeysView</span>
<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> os <span class="im">import</span> listdir</span>
<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> os.path <span class="im">import</span> isfile, join</span>
<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> operator <span class="im">import</span> itemgetter</span>
<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> optparse <span class="im">import</span> OptionParser</span>
<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> Counter</span>
<span id="cb13-14"><a href="#cb13-14" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> defaultdict</span>
<span id="cb13-15"><a href="#cb13-15" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> namedtuple</span>
<span id="cb13-16"><a href="#cb13-16" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> OrderedDict</span>
<span id="cb13-17"><a href="#cb13-17" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.cluster.hierarchy <span class="im">import</span> dendrogram</span>
<span id="cb13-18"><a href="#cb13-18" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.sparse <span class="im">import</span> csr_matrix</span>
<span id="cb13-19"><a href="#cb13-19" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.special <span class="im">import</span> logit</span>
<span id="cb13-20"><a href="#cb13-20" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.stats.distributions <span class="im">import</span> uniform</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="11" id="9qV1WA-ovopQ">
<div class="sourceCode" id="cb14"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models <span class="im">import</span> word2vec</span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models <span class="im">import</span> Word2Vec</span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models.phrases <span class="im">import</span> Phraser</span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models.phrases <span class="im">import</span> Phrases</span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras <span class="im">import</span> initializers, regularizers, constraints, optimizers, layers</span>
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.layers <span class="im">import</span> Bidirectional, GlobalMaxPool1D</span>
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.layers <span class="im">import</span> Dense, Input, LSTM, Embedding, Dropout, Activation</span>
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.models <span class="im">import</span> Model</span>
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.preprocessing.sequence <span class="im">import</span> pad_sequences</span>
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.preprocessing.text <span class="im">import</span> Tokenizer</span>
<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk <span class="im">import</span> ngrams</span>
<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a><span class="co"># from nltk.util import ngrams</span></span>
<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.corpus <span class="im">import</span> stopwords</span>
<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem <span class="im">import</span> PorterStemmer</span>
<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem <span class="im">import</span> SnowballStemmer</span>
<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem.snowball <span class="im">import</span> SnowballStemmer</span>
<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem.wordnet <span class="im">import</span> WordNetLemmatizer</span>
<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.tokenize <span class="im">import</span> sent_tokenize</span>
<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.tokenize <span class="im">import</span> word_tokenize</span>
<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn <span class="im">import</span> datasets</span>
<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn <span class="im">import</span> metrics</span>
<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn <span class="im">import</span> preprocessing</span>
<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.base <span class="im">import</span> BaseEstimator</span>
<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.base <span class="im">import</span> RegressorMixin</span>
<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.cluster <span class="im">import</span> AgglomerativeClustering</span>
<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.datasets <span class="im">import</span> dump_svmlight_file</span>
<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.datasets <span class="im">import</span> fetch_20newsgroups</span>
<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.datasets <span class="im">import</span> load_files</span>
<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.decomposition <span class="im">import</span> PCA</span>
<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.ensemble <span class="im">import</span> RandomForestClassifier</span>
<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.exceptions <span class="im">import</span> NotFittedError</span>
<span id="cb14-35"><a href="#cb14-35" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction <span class="im">import</span> DictVectorizer</span>
<span id="cb14-36"><a href="#cb14-36" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction <span class="im">import</span> FeatureHasher</span>
<span id="cb14-37"><a href="#cb14-37" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> CountVectorizer</span>
<span id="cb14-38"><a href="#cb14-38" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> HashingVectorizer</span>
<span id="cb14-39"><a href="#cb14-39" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> TfidfTransformer</span>
<span id="cb14-40"><a href="#cb14-40" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> TfidfVectorizer</span>
<span id="cb14-41"><a href="#cb14-41" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_selection <span class="im">import</span> chi2</span>
<span id="cb14-42"><a href="#cb14-42" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_selection <span class="im">import</span> SelectFromModel</span>
<span id="cb14-43"><a href="#cb14-43" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_selection <span class="im">import</span> SelectKBest</span>
<span id="cb14-44"><a href="#cb14-44" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LogisticRegression</span>
<span id="cb14-45"><a href="#cb14-45" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> PassiveAggressiveClassifier</span>
<span id="cb14-46"><a href="#cb14-46" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> SGDClassifier</span>
<span id="cb14-47"><a href="#cb14-47" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> accuracy_score</span>
<span id="cb14-48"><a href="#cb14-48" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> balanced_accuracy_score</span>
<span id="cb14-49"><a href="#cb14-49" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> classification_report</span>
<span id="cb14-50"><a href="#cb14-50" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> confusion_matrix</span>
<span id="cb14-51"><a href="#cb14-51" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> f1_score</span>
<span id="cb14-52"><a href="#cb14-52" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> log_loss</span>
<span id="cb14-53"><a href="#cb14-53" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> precision_recall_fscore_support <span class="im">as</span> score</span>
<span id="cb14-54"><a href="#cb14-54" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> roc_auc_score</span>
<span id="cb14-55"><a href="#cb14-55" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> roc_curve</span>
<span id="cb14-56"><a href="#cb14-56" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics.pairwise <span class="im">import</span> cosine_similarity</span>
<span id="cb14-57"><a href="#cb14-57" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> cross_val_score</span>
<span id="cb14-58"><a href="#cb14-58" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> GridSearchCV</span>
<span id="cb14-59"><a href="#cb14-59" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> KFold</span>
<span id="cb14-60"><a href="#cb14-60" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> RandomizedSearchCV</span>
<span id="cb14-61"><a href="#cb14-61" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> train_test_split</span>
<span id="cb14-62"><a href="#cb14-62" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.multiclass <span class="im">import</span> OneVsRestClassifier</span>
<span id="cb14-63"><a href="#cb14-63" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> ComplementNB</span>
<span id="cb14-64"><a href="#cb14-64" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> GaussianNB</span>
<span id="cb14-65"><a href="#cb14-65" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> MultinomialNB</span>
<span id="cb14-66"><a href="#cb14-66" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> MultinomialNB</span>
<span id="cb14-67"><a href="#cb14-67" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.neighbors <span class="im">import</span> KNeighborsClassifier</span>
<span id="cb14-68"><a href="#cb14-68" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.neighbors <span class="im">import</span> NearestNeighbors</span>
<span id="cb14-69"><a href="#cb14-69" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.pipeline <span class="im">import</span> Pipeline</span>
<span id="cb14-70"><a href="#cb14-70" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> FunctionTransformer</span>
<span id="cb14-71"><a href="#cb14-71" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> LabelEncoder</span>
<span id="cb14-72"><a href="#cb14-72" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> MultiLabelBinarizer</span>
<span id="cb14-73"><a href="#cb14-73" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> OneHotEncoder</span>
<span id="cb14-74"><a href="#cb14-74" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.svm <span class="im">import</span> SVC</span>
<span id="cb14-75"><a href="#cb14-75" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.tree <span class="im">import</span> DecisionTreeClassifier</span>
<span id="cb14-76"><a href="#cb14-76" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils <span class="im">import</span> check_array</span>
<span id="cb14-77"><a href="#cb14-77" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils <span class="im">import</span> check_X_y</span>
<span id="cb14-78"><a href="#cb14-78" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils <span class="im">import</span> shuffle</span>
<span id="cb14-79"><a href="#cb14-79" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils.extmath <span class="im">import</span> density</span>
<span id="cb14-80"><a href="#cb14-80" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils.extmath <span class="im">import</span> log_logistic</span>
<span id="cb14-81"><a href="#cb14-81" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils.multiclass <span class="im">import</span> unique_labels</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="12"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="xHDoxn3NLLd6" data-outputId="34d3b355-763b-48bb-810e-b27f7afa2ce4">
<div class="sourceCode" id="cb15"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="co">## Importing basic python libraries</span></span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="op">%</span>matplotlib inline</span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> __future__ <span class="im">import</span> print_function</span>
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a>nltk.download(<span class="st">&#39;punkt&#39;</span>)</span>
<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a>nltk.download(<span class="st">&#39;stopwords&#39;</span>)</span>
<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a>nltk.download(<span class="st">&#39;wordnet&#39;</span>)</span></code></pre></div>
<div class="output stream stderr">
<pre><code>[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
</code></pre>
</div>
<div class="output execute_result" data-execution_count="12">
<pre><code>True</code></pre>
</div>
</div>
<section id="setting-static-and-global-variables" class="cell markdown"
id="WR_ES7ESLLd-">
<h3>Setting static and global variables</h3>
</section>
<div class="cell code" data-execution_count="14" id="qcy0E_ODLLd-">
<div class="sourceCode" id="cb18"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>epochs<span class="op">=</span><span class="dv">15</span> <span class="co">#30 #10 #2 #10</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>new_num_labels<span class="op">=</span><span class="dv">25</span> <span class="co">#4</span></span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a>batch_size <span class="op">=</span> <span class="dv">32</span></span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a>seed <span class="op">=</span> <span class="dv">0</span> <span class="co"># 42</span></span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a>VOCAB_SIZE <span class="op">=</span> <span class="dv">10000</span></span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a>MAX_SEQUENCE_LENGTH <span class="op">=</span> <span class="dv">250</span></span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Global values.</span></span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a>WORDS_SIZE<span class="op">=</span><span class="dv">10000</span></span>
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a>INPUT_SIZE<span class="op">=</span><span class="dv">500</span></span>
<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a>NUM_CLASSES<span class="op">=</span>new_num_labels <span class="co">#5 # 2 # NUM_CLASSES=2</span></span>
<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a>MODEL_NUM<span class="op">=</span><span class="dv">0</span></span>
<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a>EPOCHS<span class="op">=</span>epochs <span class="co">#15 #10</span></span>
<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Preprocessing params.</span></span>
<span id="cb18-16"><a href="#cb18-16" aria-hidden="true" tabindex="-1"></a>PRETRAINING_BATCH_SIZE <span class="op">=</span> <span class="dv">128</span></span>
<span id="cb18-17"><a href="#cb18-17" aria-hidden="true" tabindex="-1"></a>FINETUNING_BATCH_SIZE <span class="op">=</span> <span class="dv">32</span></span>
<span id="cb18-18"><a href="#cb18-18" aria-hidden="true" tabindex="-1"></a>SEQ_LENGTH <span class="op">=</span> <span class="dv">128</span></span>
<span id="cb18-19"><a href="#cb18-19" aria-hidden="true" tabindex="-1"></a>MASK_RATE <span class="op">=</span> <span class="fl">0.25</span></span>
<span id="cb18-20"><a href="#cb18-20" aria-hidden="true" tabindex="-1"></a>PREDICTIONS_PER_SEQ <span class="op">=</span> <span class="dv">32</span></span>
<span id="cb18-21"><a href="#cb18-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-22"><a href="#cb18-22" aria-hidden="true" tabindex="-1"></a><span class="co"># Model params.</span></span>
<span id="cb18-23"><a href="#cb18-23" aria-hidden="true" tabindex="-1"></a>NUM_LAYERS <span class="op">=</span> <span class="dv">3</span></span>
<span id="cb18-24"><a href="#cb18-24" aria-hidden="true" tabindex="-1"></a>MODEL_DIM <span class="op">=</span> <span class="dv">256</span></span>
<span id="cb18-25"><a href="#cb18-25" aria-hidden="true" tabindex="-1"></a>INTERMEDIATE_DIM <span class="op">=</span> <span class="dv">512</span></span>
<span id="cb18-26"><a href="#cb18-26" aria-hidden="true" tabindex="-1"></a>NUM_HEADS <span class="op">=</span> <span class="dv">4</span></span>
<span id="cb18-27"><a href="#cb18-27" aria-hidden="true" tabindex="-1"></a>DROPOUT <span class="op">=</span> <span class="fl">0.1</span></span>
<span id="cb18-28"><a href="#cb18-28" aria-hidden="true" tabindex="-1"></a>NORM_EPSILON <span class="op">=</span> <span class="fl">1e-5</span></span>
<span id="cb18-29"><a href="#cb18-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-30"><a href="#cb18-30" aria-hidden="true" tabindex="-1"></a><span class="co"># Training params.</span></span>
<span id="cb18-31"><a href="#cb18-31" aria-hidden="true" tabindex="-1"></a>PRETRAINING_LEARNING_RATE <span class="op">=</span> <span class="fl">5e-4</span></span>
<span id="cb18-32"><a href="#cb18-32" aria-hidden="true" tabindex="-1"></a>PRETRAINING_EPOCHS <span class="op">=</span> <span class="dv">8</span></span>
<span id="cb18-33"><a href="#cb18-33" aria-hidden="true" tabindex="-1"></a>FINETUNING_LEARNING_RATE <span class="op">=</span> <span class="fl">5e-5</span></span>
<span id="cb18-34"><a href="#cb18-34" aria-hidden="true" tabindex="-1"></a>FINETUNING_EPOCHS <span class="op">=</span> <span class="dv">3</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="15"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="QEoxM5mpLLd_" data-outputId="f06edf52-39b3-4f0a-e5c0-7704e33c7db2">
<div class="sourceCode" id="cb19"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Generate random seed</span></span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="co">#myrand=np.random.randint(1, 99999 + 1)</span></span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a>rand<span class="op">=</span>seed <span class="co"># 1234 # 71926</span></span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a>np.random.seed(rand)</span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a>tf.random.set_seed(rand)</span>
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Random seed is:&quot;</span>, rand)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Random seed is: 0
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="16"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="uaangCKQLLd_" data-outputId="1ac761ad-a71d-47dc-d6f1-07e78ae7c19b">
<div class="sourceCode" id="cb21"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Tensorlfow version: &quot;</span>, tf.__version__)</span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Eager mode: &quot;</span>, tf.executing_eagerly())</span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;GPU is&quot;</span>, <span class="st">&quot;available&quot;</span> <span class="cf">if</span> tf.test.is_gpu_available() <span class="cf">else</span> <span class="st">&quot;NOT AVAILABLE&quot;</span>)</span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="co">## Tensorlfow version:  2.13.1</span></span>
<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Eager mode:  True</span></span>
<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a><span class="co">## GPU is NOT AVAILABLE</span></span>
<span id="cb21-8"><a href="#cb21-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-9"><a href="#cb21-9" aria-hidden="true" tabindex="-1"></a><span class="co">## Tensorlfow version:  2.15.0</span></span>
<span id="cb21-10"><a href="#cb21-10" aria-hidden="true" tabindex="-1"></a><span class="co">## Eager mode:  True</span></span>
<span id="cb21-11"><a href="#cb21-11" aria-hidden="true" tabindex="-1"></a><span class="co">## T4 GPU is available</span></span>
<span id="cb21-12"><a href="#cb21-12" aria-hidden="true" tabindex="-1"></a><span class="co">## High RAM</span></span></code></pre></div>
<div class="output stream stderr">
<pre><code>WARNING:tensorflow:From &lt;ipython-input-16-b8ab0bb2f411&gt;:3: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.config.list_physical_devices(&#39;GPU&#39;)` instead.
</code></pre>
</div>
<div class="output stream stdout">
<pre><code>Tensorlfow version:  2.15.0
Eager mode:  True
GPU is NOT AVAILABLE
</code></pre>
</div>
</div>
<section id="common-sensebaseline-models" class="cell markdown"
id="O0qOArVRxs96">
<h1>Common-sense/Baseline MODELs:</h1>
</section>
<div class="cell code" data-execution_count="19" id="iiSviJdjxr_h">
<div class="sourceCode" id="cb24"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Baseline/Common-sense MODELs</span></span></code></pre></div>
</div>
<section id="1st-model-per-own-dsm030-statistical-data-mining-cw1"
class="cell markdown" id="MaBejLseLLdv">
<h1>1st model per own DSM030 (Statistical Data Mining) CW1</h1>
</section>
<section id="text-prepping" class="cell markdown" id="kbMhgs6jF9IA">
<h1>Text-Prepping</h1>
</section>
<div class="cell code" data-execution_count="21" id="X0abLbs9LLeD">
<div class="sourceCode" id="cb25"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a>porter_stemmer<span class="op">=</span>PorterStemmer()</span>
<span id="cb25-2"><a href="#cb25-2" aria-hidden="true" tabindex="-1"></a><span class="co"># stemmer = PorterStemmer()</span></span>
<span id="cb25-3"><a href="#cb25-3" aria-hidden="true" tabindex="-1"></a>stemmer<span class="op">=</span>SnowballStemmer(<span class="st">&quot;english&quot;</span>,ignore_stopwords<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb25-4"><a href="#cb25-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-5"><a href="#cb25-5" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> n_vect(text_list, n<span class="op">=</span><span class="dv">3</span>):</span>
<span id="cb25-6"><a href="#cb25-6" aria-hidden="true" tabindex="-1"></a>    n_vect <span class="op">=</span> []</span>
<span id="cb25-7"><a href="#cb25-7" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> item <span class="kw">in</span> ngrams(text_list,n):</span>
<span id="cb25-8"><a href="#cb25-8" aria-hidden="true" tabindex="-1"></a>        n_vect.append(<span class="st">&#39; &#39;</span>.join(item))</span>
<span id="cb25-9"><a href="#cb25-9" aria-hidden="true" tabindex="-1"></a>    <span class="cf">return</span> n_vect</span>
<span id="cb25-10"><a href="#cb25-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-11"><a href="#cb25-11" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> process_text(text,n<span class="op">=</span><span class="dv">1</span>):</span>
<span id="cb25-12"><a href="#cb25-12" aria-hidden="true" tabindex="-1"></a>    <span class="co">## Check if text is a string</span></span>
<span id="cb25-13"><a href="#cb25-13" aria-hidden="true" tabindex="-1"></a>    <span class="cf">if</span> <span class="kw">not</span> <span class="bu">isinstance</span>(text, <span class="bu">str</span>):</span>
<span id="cb25-14"><a href="#cb25-14" aria-hidden="true" tabindex="-1"></a>        text <span class="op">=</span> <span class="bu">str</span>(text)</span>
<span id="cb25-15"><a href="#cb25-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-16"><a href="#cb25-16" aria-hidden="true" tabindex="-1"></a>    <span class="co">## Initiating the tokenised text as an empty list</span></span>
<span id="cb25-17"><a href="#cb25-17" aria-hidden="true" tabindex="-1"></a>    tokenised<span class="op">=</span>[]</span>
<span id="cb25-18"><a href="#cb25-18" aria-hidden="true" tabindex="-1"></a>    word_lemmatize<span class="op">=</span>WordNetLemmatizer()</span>
<span id="cb25-19"><a href="#cb25-19" aria-hidden="true" tabindex="-1"></a>    stop_words<span class="op">=</span>stopwords.words(<span class="st">&#39;english&#39;</span>)</span>
<span id="cb25-20"><a href="#cb25-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-21"><a href="#cb25-21" aria-hidden="true" tabindex="-1"></a>    <span class="co">## Converting text to lower case ## removing all punctuation</span></span>
<span id="cb25-22"><a href="#cb25-22" aria-hidden="true" tabindex="-1"></a>    <span class="co">#1. Convert text to lower case and remove all</span></span>
<span id="cb25-23"><a href="#cb25-23" aria-hidden="true" tabindex="-1"></a>    text<span class="op">=</span>text.lower()</span>
<span id="cb25-24"><a href="#cb25-24" aria-hidden="true" tabindex="-1"></a>    <span class="co"># nopunc=[char for char in text if char not in string.punctuation]</span></span>
<span id="cb25-25"><a href="#cb25-25" aria-hidden="true" tabindex="-1"></a>    nopunc<span class="op">=</span>[char <span class="cf">for</span> char <span class="kw">in</span> text <span class="cf">if</span> char <span class="kw">not</span> <span class="kw">in</span> string.punctuation]</span>
<span id="cb25-26"><a href="#cb25-26" aria-hidden="true" tabindex="-1"></a>    nopunc<span class="op">=</span><span class="st">&#39;&#39;</span>.join(nopunc)</span>
<span id="cb25-27"><a href="#cb25-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-28"><a href="#cb25-28" aria-hidden="true" tabindex="-1"></a>    <span class="co">#2. Remove all stopwords</span></span>
<span id="cb25-29"><a href="#cb25-29" aria-hidden="true" tabindex="-1"></a>    removed_stop_words<span class="op">=</span>[word <span class="cf">for</span> word <span class="kw">in</span> nopunc.split() <span class="cf">if</span> word.lower() <span class="kw">not</span> <span class="kw">in</span> stop_words]</span>
<span id="cb25-30"><a href="#cb25-30" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-31"><a href="#cb25-31" aria-hidden="true" tabindex="-1"></a>    <span class="co">#3. Apply Stemming</span></span>
<span id="cb25-32"><a href="#cb25-32" aria-hidden="true" tabindex="-1"></a>    stemming<span class="op">=</span>[stemmer.stem(word) <span class="cf">for</span> word <span class="kw">in</span> removed_stop_words]</span>
<span id="cb25-33"><a href="#cb25-33" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-34"><a href="#cb25-34" aria-hidden="true" tabindex="-1"></a>    <span class="co">#4. Apply Ngram Tokenisation</span></span>
<span id="cb25-35"><a href="#cb25-35" aria-hidden="true" tabindex="-1"></a>    tokenised<span class="op">=</span>n_vect(stemming,n)</span>
<span id="cb25-36"><a href="#cb25-36" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-37"><a href="#cb25-37" aria-hidden="true" tabindex="-1"></a>    <span class="co">#5. Remove non-UTF-8 characters</span></span>
<span id="cb25-38"><a href="#cb25-38" aria-hidden="true" tabindex="-1"></a>    tokenised <span class="op">=</span> [word.encode(<span class="st">&quot;utf-8&quot;</span>, <span class="st">&quot;ignore&quot;</span>).decode(<span class="st">&quot;utf-8&quot;</span>) <span class="cf">for</span> word <span class="kw">in</span> tokenised]</span>
<span id="cb25-39"><a href="#cb25-39" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-40"><a href="#cb25-40" aria-hidden="true" tabindex="-1"></a>    <span class="co"># # [OPTIONAL] Applying stemming</span></span>
<span id="cb25-41"><a href="#cb25-41" aria-hidden="true" tabindex="-1"></a>    <span class="co"># tokenised=[porter_stemmer.stem(tokenised)] for worg in nltk.word_tokenize(tokenised)] #! text</span></span>
<span id="cb25-42"><a href="#cb25-42" aria-hidden="true" tabindex="-1"></a>    <span class="co"># tokenised=[word_lemmatize.lemmatize(tokenised) for word in tokenised if word not in stop_words]</span></span>
<span id="cb25-43"><a href="#cb25-43" aria-hidden="true" tabindex="-1"></a>    <span class="co"># tokenised.append([word for word in tokenised.split() if word not in stop_words])</span></span>
<span id="cb25-44"><a href="#cb25-44" aria-hidden="true" tabindex="-1"></a>    <span class="co"># vect=sklearn.feature_extraction.text.CountVectorizer(ngram_range=(n,n+1))</span></span>
<span id="cb25-45"><a href="#cb25-45" aria-hidden="true" tabindex="-1"></a>    <span class="co"># vect.fit(tokenised) # word #! text #! tokenised #! token</span></span>
<span id="cb25-46"><a href="#cb25-46" aria-hidden="true" tabindex="-1"></a>    <span class="co"># vect.get_feature_names()</span></span>
<span id="cb25-47"><a href="#cb25-47" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb25-48"><a href="#cb25-48" aria-hidden="true" tabindex="-1"></a>    <span class="co">## Returning the tokenised text as a list</span></span>
<span id="cb25-49"><a href="#cb25-49" aria-hidden="true" tabindex="-1"></a>    <span class="co">#6. Returns the tokenised text as a list</span></span>
<span id="cb25-50"><a href="#cb25-50" aria-hidden="true" tabindex="-1"></a>    <span class="cf">return</span> tokenised <span class="co"># .strip()</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="22"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="jdfKs1gtLLeE" data-outputId="527fb3cd-574f-4e31-f31b-77b8fb5fd9a2">
<div class="sourceCode" id="cb26"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="co">## Testing on a simple string</span></span>
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a>process_text(<span class="st">&quot;Here we&#39;re testing the process_text function, results are as follows:&quot;</span>, n<span class="op">=</span><span class="dv">3</span>) <span class="co"># , n=1</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="22">
<pre><code>[&#39;test processtext function&#39;,
 &#39;processtext function result&#39;,
 &#39;function result follow&#39;]</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="23"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:36}"
id="6etELrmzNG6c" data-outputId="ed656160-0d7a-492e-fd58-589be554c8ed">
<div class="sourceCode" id="cb28"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb28-1"><a href="#cb28-1" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb28-2"><a href="#cb28-2" aria-hidden="true" tabindex="-1"></a><span class="co">## Download the SARD cpp_8750_files data</span></span>
<span id="cb28-3"><a href="#cb28-3" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb28-4"><a href="#cb28-4" aria-hidden="true" tabindex="-1"></a><span class="co"># !gdown 1Q_P8bYpvdSEbp6NnCzfqU3lwQwxUlfE3</span></span>
<span id="cb28-5"><a href="#cb28-5" aria-hidden="true" tabindex="-1"></a><span class="co"># !wget https://raw.githubusercontent.com/c6ai/temp/main/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="23">
<div class="sourceCode" id="cb29"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="24"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="iLFMl5kpNG6e" data-outputId="d4e2df27-41c7-48ce-c45d-c52361499409">
<div class="sourceCode" id="cb30"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb30-1"><a href="#cb30-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb30-2"><a href="#cb30-2" aria-hidden="true" tabindex="-1"></a>data_url <span class="op">=</span> <span class="st">&#39;https://raw.githubusercontent.com/c6ai/temp/main/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz&#39;</span></span>
<span id="cb30-3"><a href="#cb30-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-4"><a href="#cb30-4" aria-hidden="true" tabindex="-1"></a>dataset_dir <span class="op">=</span> utils.get_file(</span>
<span id="cb30-5"><a href="#cb30-5" aria-hidden="true" tabindex="-1"></a>    origin<span class="op">=</span>data_url,</span>
<span id="cb30-6"><a href="#cb30-6" aria-hidden="true" tabindex="-1"></a>    untar<span class="op">=</span><span class="va">True</span>,</span>
<span id="cb30-7"><a href="#cb30-7" aria-hidden="true" tabindex="-1"></a>    cache_dir<span class="op">=</span><span class="st">&#39;cache_dir&#39;</span>,</span>
<span id="cb30-8"><a href="#cb30-8" aria-hidden="true" tabindex="-1"></a>    cache_subdir<span class="op">=</span><span class="st">&#39;&#39;</span>)</span>
<span id="cb30-9"><a href="#cb30-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb30-10"><a href="#cb30-10" aria-hidden="true" tabindex="-1"></a>dataset_dir <span class="op">=</span> pathlib.Path(dataset_dir).parent</span>
<span id="cb30-11"><a href="#cb30-11" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 366 µs, sys: 0 ns, total: 366 µs</span></span>
<span id="cb30-12"><a href="#cb30-12" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 348 µs-</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Downloading data from https://raw.githubusercontent.com/c6ai/temp/main/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
895869/895869 [==============================] - 0s 0us/step
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="25"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="vmXdbfa2NG6e" data-outputId="eda201eb-3183-458c-c91b-7c4132fcda57">
<div class="sourceCode" id="cb32"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb32-1"><a href="#cb32-1" aria-hidden="true" tabindex="-1"></a><span class="bu">list</span>(dataset_dir.iterdir())</span></code></pre></div>
<div class="output execute_result" data-execution_count="25">
<pre><code>[PosixPath(&#39;/tmp/.keras/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&#39;),
 PosixPath(&#39;/tmp/.keras/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz&#39;)]</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="26" id="wguW9M0YNG6f">
<div class="sourceCode" id="cb34"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb34-1"><a href="#cb34-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Load text files with categories as subfolder names.</span></span>
<span id="cb34-2"><a href="#cb34-2" aria-hidden="true" tabindex="-1"></a><span class="co"># data = datasets.load_files(&#39;20news-bydate-test&#39;)</span></span>
<span id="cb34-3"><a href="#cb34-3" aria-hidden="true" tabindex="-1"></a>data <span class="op">=</span> datasets.load_files(<span class="st">&#39;/tmp/.keras/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&#39;</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="27"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Bps6cEUQNG6f" data-outputId="a4464652-c303-47f6-ded6-2de360260b43">
<div class="sourceCode" id="cb35"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb35-1"><a href="#cb35-1" aria-hidden="true" tabindex="-1"></a><span class="co">## cd /tmp/.keras/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted</span></span>
<span id="cb35-2"><a href="#cb35-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !pwd ## /content</span></span>
<span id="cb35-3"><a href="#cb35-3" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh <span class="op">/</span>tmp<span class="op">/</span>.keras<span class="op">/</span>cpp_cleaner_8750_files_each_350_top_25_cwe_omitted</span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 464K
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE121_Stack_Based_Buffer_Overflow
drwxr-xr-x 2 root root 16K Jan  2 08:51 CWE122_Heap_Based_Buffer_Overflow
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE124_Buffer_Underwrite
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE126_Buffer_Overread
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE127_Buffer_Underread
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE134_Uncontrolled_Format_String
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE190_Integer_Overflow
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE191_Integer_Underflow
drwxr-xr-x 2 root root 12K Jan  2 04:10 CWE194_Unexpected_Sign_Extension
drwxr-xr-x 2 root root 12K Jan  2 04:10 CWE195_Signed_to_Unsigned_Conversion_Error
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE197_Numeric_Truncation_Error
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE23_Relative_Path_Traversal
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE369_Divide_by_Zero
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE36_Absolute_Path_Traversal
drwxr-xr-x 2 root root 12K Jan  2 04:10 CWE400_Resource_Exhaustion
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE401_Memory_Leak
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE415_Double_Free
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE457_Use_of_Uninitialized_Variable
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE563_Unused_Variable
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE590_Free_Memory_Not_on_Heap
drwxr-xr-x 2 root root 16K Jan  2 04:10 CWE680_Integer_Overflow_to_Buffer_Overflow
drwxr-xr-x 2 root root 12K Jan  2 04:10 CWE690_NULL_Deref_From_Return
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE762_Mismatched_Memory_Management_Routines
drwxr-xr-x 2 root root 20K Jan  2 04:10 CWE789_Uncontrolled_Mem_Alloc
drwxr-xr-x 2 root root 24K Jan  2 04:10 CWE78_OS_Command_Injection
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="28"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="xn51aFjdNG6g" data-outputId="16ff2691-331f-4cc3-a9b2-b3cc1125dd72">
<div class="sourceCode" id="cb37"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a>data.keys()</span></code></pre></div>
<div class="output execute_result" data-execution_count="28">
<pre><code>dict_keys([&#39;data&#39;, &#39;filenames&#39;, &#39;target_names&#39;, &#39;target&#39;, &#39;DESCR&#39;])</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="29"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="MmblBF4nNG6h" data-outputId="958a11fb-426d-4145-d86e-70e740eb37ce">
<div class="sourceCode" id="cb39"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb39-1"><a href="#cb39-1" aria-hidden="true" tabindex="-1"></a><span class="co"># data.target</span></span>
<span id="cb39-2"><a href="#cb39-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="bu">len</span>(data.target)) <span class="co">## 7628</span></span>
<span id="cb39-3"><a href="#cb39-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(data.target) <span class="co">## [11  3 18 ...  5  8  8]</span></span>
<span id="cb39-4"><a href="#cb39-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb39-5"><a href="#cb39-5" aria-hidden="true" tabindex="-1"></a>data_targets_list <span class="op">=</span> <span class="bu">list</span>(np.unique(data.target))</span>
<span id="cb39-6"><a href="#cb39-6" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="bu">len</span>(data_targets_list))  <span class="co">## 25 ## number of unique targets</span></span>
<span id="cb39-7"><a href="#cb39-7" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(data_targets_list)  <span class="co">## list of unique targets</span></span>
<span id="cb39-8"><a href="#cb39-8" aria-hidden="true" tabindex="-1"></a><span class="co">## [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>7628
[11  3 18 ...  5  8  8]
25
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="30"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:36}"
id="SgeiBSE3NG6h" data-outputId="890c7652-c12d-4595-9d00-4947e5de6cf5">
<div class="sourceCode" id="cb41"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb41-1"><a href="#cb41-1" aria-hidden="true" tabindex="-1"></a>data.target_names[<span class="dv">13</span>]</span></code></pre></div>
<div class="output execute_result" data-execution_count="30">
<div class="sourceCode" id="cb42"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="31"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="01OWp2hBNG6i" data-outputId="280d1e80-f042-4fbb-93a4-e44c9a485ed2">
<div class="sourceCode" id="cb43"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb43-1"><a href="#cb43-1" aria-hidden="true" tabindex="-1"></a>data.target_names</span></code></pre></div>
<div class="output execute_result" data-execution_count="31">
<pre><code>[&#39;CWE121_Stack_Based_Buffer_Overflow&#39;,
 &#39;CWE122_Heap_Based_Buffer_Overflow&#39;,
 &#39;CWE124_Buffer_Underwrite&#39;,
 &#39;CWE126_Buffer_Overread&#39;,
 &#39;CWE127_Buffer_Underread&#39;,
 &#39;CWE134_Uncontrolled_Format_String&#39;,
 &#39;CWE190_Integer_Overflow&#39;,
 &#39;CWE191_Integer_Underflow&#39;,
 &#39;CWE194_Unexpected_Sign_Extension&#39;,
 &#39;CWE195_Signed_to_Unsigned_Conversion_Error&#39;,
 &#39;CWE197_Numeric_Truncation_Error&#39;,
 &#39;CWE23_Relative_Path_Traversal&#39;,
 &#39;CWE369_Divide_by_Zero&#39;,
 &#39;CWE36_Absolute_Path_Traversal&#39;,
 &#39;CWE400_Resource_Exhaustion&#39;,
 &#39;CWE401_Memory_Leak&#39;,
 &#39;CWE415_Double_Free&#39;,
 &#39;CWE457_Use_of_Uninitialized_Variable&#39;,
 &#39;CWE563_Unused_Variable&#39;,
 &#39;CWE590_Free_Memory_Not_on_Heap&#39;,
 &#39;CWE680_Integer_Overflow_to_Buffer_Overflow&#39;,
 &#39;CWE690_NULL_Deref_From_Return&#39;,
 &#39;CWE762_Mismatched_Memory_Management_Routines&#39;,
 &#39;CWE789_Uncontrolled_Mem_Alloc&#39;,
 &#39;CWE78_OS_Command_Injection&#39;]</code></pre>
</div>
</div>
<section id="section" class="cell markdown" id="jnO01xOkRZQn">
<h3>...</h3>
</section>
<div class="cell code" data-execution_count="32" id="TxhKmJ73NG6j">
<div class="sourceCode" id="cb45"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb45-1"><a href="#cb45-1" aria-hidden="true" tabindex="-1"></a>data_target_tuples <span class="op">=</span> []</span>
<span id="cb45-2"><a href="#cb45-2" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> text,category,category_names <span class="kw">in</span> <span class="bu">zip</span>(data[<span class="st">&#39;data&#39;</span>], data[<span class="st">&#39;target&#39;</span>], data[<span class="st">&#39;target_names&#39;</span>]):</span>
<span id="cb45-3"><a href="#cb45-3" aria-hidden="true" tabindex="-1"></a>    decoded <span class="op">=</span> text.decode(<span class="st">&quot;cp1252&quot;</span>)</span>
<span id="cb45-4"><a href="#cb45-4" aria-hidden="true" tabindex="-1"></a>    one_line <span class="op">=</span> <span class="bu">str</span>.join(<span class="st">&quot; &quot;</span>, decoded.splitlines())</span>
<span id="cb45-5"><a href="#cb45-5" aria-hidden="true" tabindex="-1"></a>    data_target_tuples.append((one_line, category, category_names))</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="33"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="owuMDM4JNG6j" data-outputId="fddfbf4c-ef71-45b8-aaf9-67f4ef5430b0">
<div class="sourceCode" id="cb46"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb46-1"><a href="#cb46-1" aria-hidden="true" tabindex="-1"></a><span class="bu">len</span>(data_target_tuples) <span class="co">## 25</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="33">
<pre><code>25</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="34" id="NmaTu5jdNG6k">
<div class="sourceCode" id="cb48"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb48-1"><a href="#cb48-1" aria-hidden="true" tabindex="-1"></a>data_tuples <span class="op">=</span> []</span>
<span id="cb48-2"><a href="#cb48-2" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> text,category <span class="kw">in</span> <span class="bu">zip</span>(data[<span class="st">&#39;data&#39;</span>], data[<span class="st">&#39;target&#39;</span>]):</span>
<span id="cb48-3"><a href="#cb48-3" aria-hidden="true" tabindex="-1"></a>    decoded <span class="op">=</span> text.decode(<span class="st">&quot;cp1252&quot;</span>)</span>
<span id="cb48-4"><a href="#cb48-4" aria-hidden="true" tabindex="-1"></a>    one_line <span class="op">=</span> <span class="bu">str</span>.join(<span class="st">&quot; &quot;</span>, decoded.splitlines())</span>
<span id="cb48-5"><a href="#cb48-5" aria-hidden="true" tabindex="-1"></a>    data_tuples.append((one_line, category))</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="35"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="rzlkI0abNG6k" data-outputId="6d39428b-287b-4365-c59d-b8152dd9e2f0">
<div class="sourceCode" id="cb49"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb49-1"><a href="#cb49-1" aria-hidden="true" tabindex="-1"></a><span class="co"># len(tuple_list) ## 10639 ## 10650</span></span>
<span id="cb49-2"><a href="#cb49-2" aria-hidden="true" tabindex="-1"></a><span class="bu">len</span>(data_tuples) <span class="co">## 7628</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="35">
<pre><code>7628</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="36"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="VZL1LmlpNG6k" data-outputId="39c3521b-cfbd-4e7c-aa4d-02c612f266e9">
<div class="sourceCode" id="cb51"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb51-1"><a href="#cb51-1" aria-hidden="true" tabindex="-1"></a>df <span class="op">=</span> pd.DataFrame(data_tuples, columns<span class="op">=</span>[<span class="st">&#39;Text&#39;</span>,<span class="st">&#39;Category&#39;</span>])</span>
<span id="cb51-2"><a href="#cb51-2" aria-hidden="true" tabindex="-1"></a><span class="co"># data=df</span></span>
<span id="cb51-3"><a href="#cb51-3" aria-hidden="true" tabindex="-1"></a>df</span></code></pre></div>
<div class="output execute_result" data-execution_count="36">

  <div id="df-87fc3afc-56cf-4d9f-a93c-d25de437c644" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Text</th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>1</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>18</td>
    </tr>
    <tr>
      <th>3</th>
      <td>#include "std_testcase.h" #include "environ...</td>
      <td>24</td>
    </tr>
    <tr>
      <th>4</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>4</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>7623</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>16</td>
    </tr>
    <tr>
      <th>7624</th>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>7625</th>
      <td>#include "std_testcase.h"  #ifndef _WIN32 #...</td>
      <td>5</td>
    </tr>
    <tr>
      <th>7626</th>
      <td>#include "std_testcase.h" #include &lt;vector&gt;...</td>
      <td>8</td>
    </tr>
    <tr>
      <th>7627</th>
      <td>#include "std_testcase.h" #include &lt;map&gt;  u...</td>
      <td>8</td>
    </tr>
  </tbody>
</table>
<p>7628 rows × 2 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-87fc3afc-56cf-4d9f-a93c-d25de437c644')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-87fc3afc-56cf-4d9f-a93c-d25de437c644 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-87fc3afc-56cf-4d9f-a93c-d25de437c644');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-cc4e6939-fa1d-458e-a3ca-b2638036a807">
  <button class="colab-df-quickchart" onclick="quickchart('df-cc4e6939-fa1d-458e-a3ca-b2638036a807')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-cc4e6939-fa1d-458e-a3ca-b2638036a807 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="37"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="sA2MVi7VNG6l" data-outputId="4fb784af-5388-4ce8-a3b7-4042aed5eb12">
<div class="sourceCode" id="cb52"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb52-1"><a href="#cb52-1" aria-hidden="true" tabindex="-1"></a>df</span></code></pre></div>
<div class="output execute_result" data-execution_count="37">

  <div id="df-ab5ef6a1-0583-4472-bc0f-72d6b8cc4e1b" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Text</th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>1</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>18</td>
    </tr>
    <tr>
      <th>3</th>
      <td>#include "std_testcase.h" #include "environ...</td>
      <td>24</td>
    </tr>
    <tr>
      <th>4</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>4</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>7623</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>16</td>
    </tr>
    <tr>
      <th>7624</th>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>7625</th>
      <td>#include "std_testcase.h"  #ifndef _WIN32 #...</td>
      <td>5</td>
    </tr>
    <tr>
      <th>7626</th>
      <td>#include "std_testcase.h" #include &lt;vector&gt;...</td>
      <td>8</td>
    </tr>
    <tr>
      <th>7627</th>
      <td>#include "std_testcase.h" #include &lt;map&gt;  u...</td>
      <td>8</td>
    </tr>
  </tbody>
</table>
<p>7628 rows × 2 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-ab5ef6a1-0583-4472-bc0f-72d6b8cc4e1b')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-ab5ef6a1-0583-4472-bc0f-72d6b8cc4e1b button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-ab5ef6a1-0583-4472-bc0f-72d6b8cc4e1b');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-7a13f4a4-8a86-4375-b57a-ef22f632b11a">
  <button class="colab-df-quickchart" onclick="quickchart('df-7a13f4a4-8a86-4375-b57a-ef22f632b11a')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-7a13f4a4-8a86-4375-b57a-ef22f632b11a button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<section id="further-wrangling--eda" class="cell markdown"
id="ujXZx6NaPkDg">
<h1>Further Wrangling &amp; EDA</h1>
</section>
<div class="cell code" data-execution_count="38"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="db5kqpdiPkDs" data-outputId="b78f5397-2924-48de-d309-9fbfc5df10f5">
<div class="sourceCode" id="cb53"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb53-1"><a href="#cb53-1" aria-hidden="true" tabindex="-1"></a>df.shape <span class="co">## (10649, 4)</span></span>
<span id="cb53-2"><a href="#cb53-2" aria-hidden="true" tabindex="-1"></a>df <span class="co">## 10649 rows × 4 columns</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="38">

  <div id="df-af484478-661a-46a0-a666-d89ca820b70c" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Text</th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>1</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>18</td>
    </tr>
    <tr>
      <th>3</th>
      <td>#include "std_testcase.h" #include "environ...</td>
      <td>24</td>
    </tr>
    <tr>
      <th>4</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>4</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>7623</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>16</td>
    </tr>
    <tr>
      <th>7624</th>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>7625</th>
      <td>#include "std_testcase.h"  #ifndef _WIN32 #...</td>
      <td>5</td>
    </tr>
    <tr>
      <th>7626</th>
      <td>#include "std_testcase.h" #include &lt;vector&gt;...</td>
      <td>8</td>
    </tr>
    <tr>
      <th>7627</th>
      <td>#include "std_testcase.h" #include &lt;map&gt;  u...</td>
      <td>8</td>
    </tr>
  </tbody>
</table>
<p>7628 rows × 2 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-af484478-661a-46a0-a666-d89ca820b70c')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-af484478-661a-46a0-a666-d89ca820b70c button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-af484478-661a-46a0-a666-d89ca820b70c');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-56f8dfbd-2233-4b4c-8c00-18450ccd6fba">
  <button class="colab-df-quickchart" onclick="quickchart('df-56f8dfbd-2233-4b4c-8c00-18450ccd6fba')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-56f8dfbd-2233-4b4c-8c00-18450ccd6fba button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="39"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="XM2nKZp4PkDt" data-outputId="fc3fe3c0-9052-4f49-ddd7-7ca4c0cb0e8f">
<div class="sourceCode" id="cb54"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb54-1"><a href="#cb54-1" aria-hidden="true" tabindex="-1"></a>df.dtypes <span class="co">#  dtype: object</span></span>
<span id="cb54-2"><a href="#cb54-2" aria-hidden="true" tabindex="-1"></a><span class="co"># data.shape ## (10649, 4)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="39">
<pre><code>Text        object
Category     int64
dtype: object</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="40"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:300}"
id="6q-JYUTLPkDu" data-outputId="e0325b1d-48c7-412b-ccc9-12025f2eb035">
<div class="sourceCode" id="cb56"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb56-1"><a href="#cb56-1" aria-hidden="true" tabindex="-1"></a><span class="co"># df[[&#39;Text&#39;,&quot;Category&quot;]].describe()</span></span>
<span id="cb56-2"><a href="#cb56-2" aria-hidden="true" tabindex="-1"></a>df.describe()</span></code></pre></div>
<div class="output execute_result" data-execution_count="40">

  <div id="df-77ea06d3-fe83-4ed7-a412-850e997a5511" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>count</th>
      <td>7628.000000</td>
    </tr>
    <tr>
      <th>mean</th>
      <td>11.998820</td>
    </tr>
    <tr>
      <th>std</th>
      <td>7.343498</td>
    </tr>
    <tr>
      <th>min</th>
      <td>0.000000</td>
    </tr>
    <tr>
      <th>25%</th>
      <td>5.000000</td>
    </tr>
    <tr>
      <th>50%</th>
      <td>12.000000</td>
    </tr>
    <tr>
      <th>75%</th>
      <td>18.000000</td>
    </tr>
    <tr>
      <th>max</th>
      <td>24.000000</td>
    </tr>
  </tbody>
</table>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-77ea06d3-fe83-4ed7-a412-850e997a5511')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-77ea06d3-fe83-4ed7-a412-850e997a5511 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-77ea06d3-fe83-4ed7-a412-850e997a5511');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-77fe0350-7117-4ac2-ab8b-ad4f7c194366">
  <button class="colab-df-quickchart" onclick="quickchart('df-77fe0350-7117-4ac2-ab8b-ad4f7c194366')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-77fe0350-7117-4ac2-ab8b-ad4f7c194366 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="41"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="H1DBkW3vPkDv" data-outputId="6c04aad4-f3e5-4256-f0d6-6ffab26549f9">
<div class="sourceCode" id="cb57"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb57-1"><a href="#cb57-1" aria-hidden="true" tabindex="-1"></a><span class="co"># df[[&#39;Text&#39;,&quot;Category&quot;]].value_counts()</span></span>
<span id="cb57-2"><a href="#cb57-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(df.value_counts())</span>
<span id="cb57-3"><a href="#cb57-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb57-4"><a href="#cb57-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Text  Category</span></span>
<span id="cb57-5"><a href="#cb57-5" aria-hidden="true" tabindex="-1"></a><span class="co"># 0     10          1</span></span>
<span id="cb57-6"><a href="#cb57-6" aria-hidden="true" tabindex="-1"></a><span class="co"># 5036  10          1</span></span>
<span id="cb57-7"><a href="#cb57-7" aria-hidden="true" tabindex="-1"></a><span class="co"># 5048  8           1</span></span>
<span id="cb57-8"><a href="#cb57-8" aria-hidden="true" tabindex="-1"></a><span class="co"># 5047  9           1</span></span>
<span id="cb57-9"><a href="#cb57-9" aria-hidden="true" tabindex="-1"></a><span class="co"># 5046  8           1</span></span>
<span id="cb57-10"><a href="#cb57-10" aria-hidden="true" tabindex="-1"></a><span class="co">#                  ..</span></span>
<span id="cb57-11"><a href="#cb57-11" aria-hidden="true" tabindex="-1"></a><span class="co"># 2531  2           1</span></span>
<span id="cb57-12"><a href="#cb57-12" aria-hidden="true" tabindex="-1"></a><span class="co"># 2530  3           1</span></span>
<span id="cb57-13"><a href="#cb57-13" aria-hidden="true" tabindex="-1"></a><span class="co"># 2529  3           1</span></span>
<span id="cb57-14"><a href="#cb57-14" aria-hidden="true" tabindex="-1"></a><span class="co"># 2528  4           1</span></span>
<span id="cb57-15"><a href="#cb57-15" aria-hidden="true" tabindex="-1"></a><span class="co"># 7565  5           1</span></span>
<span id="cb57-16"><a href="#cb57-16" aria-hidden="true" tabindex="-1"></a><span class="co"># Length: 7628, dtype: int64</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Category
   #include &quot;std_testcase.h&quot;   #define CHAR_ARRAY_SIZE 8  namespace fgets_33 {  #ifndef OMITBAD  void bad() {     short data;     short &amp;dataRef = data;          data = -1;     {         char inputBuffer[CHAR_ARRAY_SIZE] = &quot;&quot;;                  if (fgets(inputBuffer, CHAR_ARRAY_SIZE, stdin) != NULL)         {                          data = (short)atoi(inputBuffer);         }         else         {             printLine(&quot;fgets() failed.&quot;);         }     }     {         short data = dataRef;         {                          char charData = (char)data;             printHexCharLine(charData);         }     } }  #endif   #ifndef OMITGOOD   static void goodG2B() {     short data;     short &amp;dataRef = data;          data = -1;          data = CHAR_MAX-5;     {         short data = dataRef;         {                          char charData = (char)data;             printHexCharLine(charData);         }     } }  void good() {     goodG2B(); }  #endif   }    #ifdef INCLUDEMAIN  using namespace fgets_33;   int main(int argc, char * argv[]) {          srand( (unsigned)time(NULL) ); #ifndef OMITGOOD     printLine(&quot;Calling good()...&quot;);     good();     printLine(&quot;Finished good()&quot;); #endif  #ifndef OMITBAD     printLine(&quot;Calling bad()...&quot;);     bad();     printLine(&quot;Finished bad()&quot;); #endif      return 0; }  #endif                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       10          1
   #include &quot;std_testcase.h&quot; #include &lt;map&gt;  using namespace std;  namespace rand_to_char_74 {  #ifndef OMITBAD   void badSink(map&lt;int, int&gt; dataMap);  void bad() {     int data;     map&lt;int, int&gt; dataMap;          data = -1;          data = RAND32();          dataMap[0] = data;     dataMap[1] = data;     dataMap[2] = data;     badSink(dataMap); }  #endif   #ifndef OMITGOOD     void goodG2BSink(map&lt;int, int&gt; dataMap);  static void goodG2B() {     int data;     map&lt;int, int&gt; dataMap;          data = -1;          data = CHAR_MAX-5;          dataMap[0] = data;     dataMap[1] = data;     dataMap[2] = data;     goodG2BSink(dataMap); }  void good() {     goodG2B(); }  #endif   }     #ifdef INCLUDEMAIN  using namespace rand_to_char_74;   int main(int argc, char * argv[]) {          srand( (unsigned)time(NULL) ); #ifndef OMITGOOD     printLine(&quot;Calling good()...&quot;);     good();     printLine(&quot;Finished good()&quot;); #endif  #ifndef OMITBAD     printLine(&quot;Calling bad()...&quot;);     bad();     printLine(&quot;Finished bad()&quot;); #endif      return 0; }  #endif                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      10          1
   #include &quot;std_testcase.h&quot; #include &lt;map&gt;  using namespace std;  namespace socket_strncpy_74 {  #ifndef OMITBAD  void badSink(map&lt;int, short&gt; dataMap) {          short data = dataMap[2];     {         char source[100];         char dest[100] = &quot;&quot;;         memset(source, &#39;A&#39;, 100-1);         source[100-1] = &#39;\0&#39;;         if (data &lt; 100)         {                          strncpy(dest, source, data);             dest[data] = &#39;\0&#39;;          }         printLine(dest);     } }  #endif   #ifndef OMITGOOD   void goodG2BSink(map&lt;int, short&gt; dataMap) {     short data = dataMap[2];     {         char source[100];         char dest[100] = &quot;&quot;;         memset(source, &#39;A&#39;, 100-1);         source[100-1] = &#39;\0&#39;;         if (data &lt; 100)         {                          strncpy(dest, source, data);             dest[data] = &#39;\0&#39;;          }         printLine(dest);     } }  #endif   }                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              8           1
   #include &quot;std_testcase.h&quot; #include &lt;map&gt;  using namespace std;  namespace socket_strncpy_74 {  #ifndef OMITBAD  void badSink(map&lt;int, int&gt; dataMap) {          int data = dataMap[2];     {         char source[100];         char dest[100] = &quot;&quot;;         memset(source, &#39;A&#39;, 100-1);         source[100-1] = &#39;\0&#39;;         if (data &lt; 100)         {                          strncpy(dest, source, data);             dest[data] = &#39;\0&#39;;          }         printLine(dest);     } }  #endif   #ifndef OMITGOOD   void goodG2BSink(map&lt;int, int&gt; dataMap) {     int data = dataMap[2];     {         char source[100];         char dest[100] = &quot;&quot;;         memset(source, &#39;A&#39;, 100-1);         source[100-1] = &#39;\0&#39;;         if (data &lt; 100)         {                          strncpy(dest, source, data);             dest[data] = &#39;\0&#39;;          }         printLine(dest);     } }  #endif   }                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      9           1
   #include &quot;std_testcase.h&quot; #include &lt;map&gt;  using namespace std;  namespace socket_memmove_74 {  #ifndef OMITBAD  void badSink(map&lt;int, short&gt; dataMap) {          short data = dataMap[2];     {         char source[100];         char dest[100] = &quot;&quot;;         memset(source, &#39;A&#39;, 100-1);         source[100-1] = &#39;\0&#39;;         if (data &lt; 100)         {                          memmove(dest, source, data);             dest[data] = &#39;\0&#39;;          }         printLine(dest);     } }  #endif   #ifndef OMITGOOD   void goodG2BSink(map&lt;int, short&gt; dataMap) {     short data = dataMap[2];     {         char source[100];         char dest[100] = &quot;&quot;;         memset(source, &#39;A&#39;, 100-1);         source[100-1] = &#39;\0&#39;;         if (data &lt; 100)         {                          memmove(dest, source, data);             dest[data] = &#39;\0&#39;;          }         printLine(dest);     } }  #endif   }                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              8           1
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           ..
   #include &quot;std_testcase.h&quot;  #include &lt;wchar.h&gt;  namespace wchar_t_memmove_53 {    #ifndef OMITBAD   void badSink_c(wchar_t * data);  void badSink_b(wchar_t * data) {     badSink_c(data); }  #endif   #ifndef OMITGOOD   void goodG2BSink_c(wchar_t * data);  void goodG2BSink_b(wchar_t * data) {     goodG2BSink_c(data); }  #endif   }                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    2           1
   #include &quot;std_testcase.h&quot;  #include &lt;wchar.h&gt;  namespace wchar_t_memmove_52 {  #ifndef OMITBAD   void badSink_b(wchar_t * data);  void bad() {     wchar_t * data;     data = NULL;          data = new wchar_t[50];     wmemset(data, L&#39;A&#39;, 50-1);      data[50-1] = L&#39;\0&#39;;      badSink_b(data); }  #endif   #ifndef OMITGOOD   void goodG2BSink_b(wchar_t * data);   static void goodG2B() {     wchar_t * data;     data = NULL;          data = new wchar_t[100];     wmemset(data, L&#39;A&#39;, 100-1);      data[100-1] = L&#39;\0&#39;;      goodG2BSink_b(data); }  void good() {     goodG2B(); }  #endif   }     #ifdef INCLUDEMAIN  using namespace wchar_t_memmove_52;   int main(int argc, char * argv[]) {          srand( (unsigned)time(NULL) ); #ifndef OMITGOOD     printLine(&quot;Calling good()...&quot;);     good();     printLine(&quot;Finished good()&quot;); #endif  #ifndef OMITBAD     printLine(&quot;Calling bad()...&quot;);     bad();     printLine(&quot;Finished bad()&quot;); #endif      return 0; }  #endif                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 3           1
   #include &quot;std_testcase.h&quot;  #include &lt;wchar.h&gt;  namespace wchar_t_memmove_52 {    #ifndef OMITBAD  void badSink_c(wchar_t * data) {     {         wchar_t dest[100];         wmemset(dest, L&#39;C&#39;, 100-1);         dest[100-1] = L&#39;\0&#39;;                   memmove(dest, data, wcslen(dest)*sizeof(wchar_t));         dest[100-1] = L&#39;\0&#39;;         printWLine(dest);         delete [] data;     } }  #endif   #ifndef OMITGOOD   void goodG2BSink_c(wchar_t * data) {     {         wchar_t dest[100];         wmemset(dest, L&#39;C&#39;, 100-1);         dest[100-1] = L&#39;\0&#39;;                   memmove(dest, data, wcslen(dest)*sizeof(wchar_t));         dest[100-1] = L&#39;\0&#39;;         printWLine(dest);         delete [] data;     } }  #endif   }                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 3           1
   #include &quot;std_testcase.h&quot;  #include &lt;wchar.h&gt;  namespace wchar_t_memmove_52 {    #ifndef OMITBAD  void badSink_c(wchar_t * data) {     {         wchar_t dest[100];         wmemset(dest, L&#39;C&#39;, 100-1);          dest[100-1] = L&#39;\0&#39;;                   memmove(dest, data, 100*sizeof(wchar_t));                  dest[100-1] = L&#39;\0&#39;;         printWLine(dest);              } }  #endif   #ifndef OMITGOOD   void goodG2BSink_c(wchar_t * data) {     {         wchar_t dest[100];         wmemset(dest, L&#39;C&#39;, 100-1);          dest[100-1] = L&#39;\0&#39;;                   memmove(dest, data, 100*sizeof(wchar_t));                  dest[100-1] = L&#39;\0&#39;;         printWLine(dest);              } }  #endif   }                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             4           1
  #include &lt;vector&gt; #include &quot;std_testcase.h&quot;  #ifndef _WIN32 #include &lt;wchar.h&gt; #endif  using namespace std;  namespace t_console_w32_vsnprintf_72 {  #ifndef OMITBAD   void badSink(vector&lt;wchar_t *&gt; dataVector);  void bad() {     wchar_t * data;     vector&lt;wchar_t *&gt; dataVector;     wchar_t dataBuffer[100] = L&quot;&quot;;     data = dataBuffer;     {                  size_t dataLen = wcslen(data);                  if (100-dataLen &gt; 1)         {                          if (fgetws(data+dataLen, (int)(100-dataLen), stdin) != NULL)             {                                  dataLen = wcslen(data);                 if (dataLen &gt; 0 &amp;&amp; data[dataLen-1] == L&#39;\n&#39;)                 {                     data[dataLen-1] = L&#39;\0&#39;;                 }             }             else             {                 printLine(&quot;fgetws() failed&quot;);                                  data[dataLen] = L&#39;\0&#39;;             }         }     }          dataVector.insert(dataVector.end(), 1, data);     dataVector.insert(dataVector.end(), 1, data);     dataVector.insert(dataVector.end(), 1, data);     badSink(dataVector); }  #endif   #ifndef OMITGOOD   void goodG2BSink(vector&lt;wchar_t *&gt; dataVector);  static void goodG2B() {     wchar_t * data;     vector&lt;wchar_t *&gt; dataVector;     wchar_t dataBuffer[100] = L&quot;&quot;;     data = dataBuffer;          wcscpy(data, L&quot;fixedstringtest&quot;);          dataVector.insert(dataVector.end(), 1, data);     dataVector.insert(dataVector.end(), 1, data);     dataVector.insert(dataVector.end(), 1, data);     goodG2BSink(dataVector); }   void goodB2GSink(vector&lt;wchar_t *&gt; dataVector);  static void goodB2G() {     wchar_t * data;     vector&lt;wchar_t *&gt; dataVector;     wchar_t dataBuffer[100] = L&quot;&quot;;     data = dataBuffer;     {                  size_t dataLen = wcslen(data);                  if (100-dataLen &gt; 1)         {                          if (fgetws(data+dataLen, (int)(100-dataLen), stdin) != NULL)             {                                  dataLen = wcslen(data);                 if (dataLen &gt; 0 &amp;&amp; data[dataLen-1] == L&#39;\n&#39;)                 {                     data[dataLen-1] = L&#39;\0&#39;;                 }             }             else             {                 printLine(&quot;fgetws() failed&quot;);                                  data[dataLen] = L&#39;\0&#39;;             }         }     }     dataVector.insert(dataVector.end(), 1, data);     dataVector.insert(dataVector.end(), 1, data);     dataVector.insert(dataVector.end(), 1, data);     goodB2GSink(dataVector); }  void good() {     goodG2B();     goodB2G(); }  #endif   }     #ifdef INCLUDEMAIN  using namespace t_console_w32_vsnprintf_72;   int main(int argc, char * argv[]) {          srand( (unsigned)time(NULL) ); #ifndef OMITGOOD     printLine(&quot;Calling good()...&quot;);     good();     printLine(&quot;Finished good()&quot;); #endif  #ifndef OMITBAD     printLine(&quot;Calling bad()...&quot;);     bad();     printLine(&quot;Finished bad()&quot;); #endif      return 0; }  #endif  5           1
Length: 7628, dtype: int64
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="42"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Rra6Ms03PkDx" data-outputId="3b314b92-b518-4c0e-f580-8a9938bc6757">
<div class="sourceCode" id="cb59"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb59-1"><a href="#cb59-1" aria-hidden="true" tabindex="-1"></a>df[<span class="st">&quot;Category&quot;</span>].value_counts(<span class="dv">5</span>)</span></code></pre></div>
<div class="output execute_result" data-execution_count="42">
<pre><code>11    0.045884
16    0.045884
12    0.045884
5     0.045884
10    0.045884
13    0.045884
24    0.045884
18    0.045884
22    0.045752
19    0.045621
3     0.043917
4     0.043655
15    0.043655
2     0.043393
0     0.042475
23    0.042344
1     0.041426
20    0.039460
6     0.039067
17    0.038936
7     0.038542
21    0.021893
8     0.021631
9     0.020713
14    0.020451
Name: Category, dtype: float64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="43"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:896}"
id="priVjhE9PkDy" data-outputId="4e0314dc-4df0-40ba-dde2-d7d7751fbd70">
<div class="sourceCode" id="cb61"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb61-1"><a href="#cb61-1" aria-hidden="true" tabindex="-1"></a>df.groupby(<span class="st">&#39;Category&#39;</span>).describe()</span></code></pre></div>
<div class="output execute_result" data-execution_count="43">

  <div id="df-ed18841b-26d5-4410-9c78-1aa3430b6eff" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead tr th {
        text-align: left;
    }

    .dataframe thead tr:last-of-type th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr>
      <th></th>
      <th colspan="4" halign="left">Text</th>
    </tr>
    <tr>
      <th></th>
      <th>count</th>
      <th>unique</th>
      <th>top</th>
      <th>freq</th>
    </tr>
    <tr>
      <th>Category</th>
      <th></th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>324</td>
      <td>324</td>
      <td>#include "std_testcase.h" #include &lt;map&gt;  u...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>1</th>
      <td>316</td>
      <td>316</td>
      <td>#include "std_testcase.h"  #ifndef _WIN32 #...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>2</th>
      <td>331</td>
      <td>331</td>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>3</th>
      <td>335</td>
      <td>335</td>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>1</td>
    </tr>
    <tr>
      <th>4</th>
      <td>333</td>
      <td>333</td>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>1</td>
    </tr>
    <tr>
      <th>5</th>
      <td>350</td>
      <td>350</td>
      <td>#ifndef OMITBAD  #include "std_testcase.h" #...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>6</th>
      <td>298</td>
      <td>298</td>
      <td>#include "std_testcase.h" #include "max_mul...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>7</th>
      <td>294</td>
      <td>294</td>
      <td>#include "std_testcase.h" #include &lt;map&gt;  u...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>8</th>
      <td>165</td>
      <td>165</td>
      <td>#include "std_testcase.h"  namespace socket...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>9</th>
      <td>158</td>
      <td>158</td>
      <td>#include "std_testcase.h" #include &lt;list&gt;  ...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>10</th>
      <td>350</td>
      <td>350</td>
      <td>#include "std_testcase.h" #include &lt;vector&gt;...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>11</th>
      <td>350</td>
      <td>350</td>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>12</th>
      <td>350</td>
      <td>350</td>
      <td>#include "std_testcase.h" #include &lt;vector&gt;...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>13</th>
      <td>350</td>
      <td>350</td>
      <td>#include "std_testcase.h"  #ifndef _WIN32 #...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>14</th>
      <td>156</td>
      <td>156</td>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>15</th>
      <td>333</td>
      <td>333</td>
      <td>#include "std_testcase.h"  #ifndef _WIN32 #...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>16</th>
      <td>350</td>
      <td>350</td>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>1</td>
    </tr>
    <tr>
      <th>17</th>
      <td>297</td>
      <td>297</td>
      <td>#include "std_testcase.h"  namespace int_ar...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>18</th>
      <td>350</td>
      <td>350</td>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>19</th>
      <td>348</td>
      <td>348</td>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>1</td>
    </tr>
    <tr>
      <th>20</th>
      <td>301</td>
      <td>301</td>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>21</th>
      <td>167</td>
      <td>167</td>
      <td>#ifndef OMITBAD  #include "std_testcase.h" #...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>22</th>
      <td>349</td>
      <td>349</td>
      <td>#include "std_testcase.h"  namespace int_ma...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>23</th>
      <td>323</td>
      <td>323</td>
      <td>#include "std_testcase.h"  #ifndef _WIN32 #...</td>
      <td>1</td>
    </tr>
    <tr>
      <th>24</th>
      <td>350</td>
      <td>350</td>
      <td>#include "std_testcase.h" #include "environ...</td>
      <td>1</td>
    </tr>
  </tbody>
</table>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-ed18841b-26d5-4410-9c78-1aa3430b6eff')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-ed18841b-26d5-4410-9c78-1aa3430b6eff button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-ed18841b-26d5-4410-9c78-1aa3430b6eff');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-8a3044b6-c16f-4ee1-96b1-8d3b4ee1478b">
  <button class="colab-df-quickchart" onclick="quickchart('df-8a3044b6-c16f-4ee1-96b1-8d3b4ee1478b')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-8a3044b6-c16f-4ee1-96b1-8d3b4ee1478b button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="44"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:206}"
id="kjNpLvLXPkDz" data-outputId="2d18f078-7d63-40e0-dbca-a379865df694">
<div class="sourceCode" id="cb62"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb62-1"><a href="#cb62-1" aria-hidden="true" tabindex="-1"></a>df.head()</span></code></pre></div>
<div class="output execute_result" data-execution_count="44">

  <div id="df-62eaa4f8-298f-4d69-a952-8f24fc93915e" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Text</th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>#include "std_testcase.h"  #ifdef _WIN32 #d...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>1</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>#ifndef OMITGOOD  #include "std_testcase.h" ...</td>
      <td>18</td>
    </tr>
    <tr>
      <th>3</th>
      <td>#include "std_testcase.h" #include "environ...</td>
      <td>24</td>
    </tr>
    <tr>
      <th>4</th>
      <td>#include "std_testcase.h"  #include &lt;wchar....</td>
      <td>4</td>
    </tr>
  </tbody>
</table>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-62eaa4f8-298f-4d69-a952-8f24fc93915e')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-62eaa4f8-298f-4d69-a952-8f24fc93915e button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-62eaa4f8-298f-4d69-a952-8f24fc93915e');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-f37b6a16-4939-4e95-ab76-94121838b12c">
  <button class="colab-df-quickchart" onclick="quickchart('df-f37b6a16-4939-4e95-ab76-94121838b12c')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-f37b6a16-4939-4e95-ab76-94121838b12c button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="45"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="We-FCH_NPkD0" data-outputId="4fd78682-d70a-4cf3-83cc-0ee4db5b24bf">
<div class="sourceCode" id="cb63"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb63-1"><a href="#cb63-1" aria-hidden="true" tabindex="-1"></a>df.shape</span></code></pre></div>
<div class="output execute_result" data-execution_count="45">
<pre><code>(7628, 2)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="46"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:300}"
id="gu2RkaqIPkD1" data-outputId="c4c210e0-d2cc-4d49-d7ea-f7dfe5decea5">
<div class="sourceCode" id="cb65"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb65-1"><a href="#cb65-1" aria-hidden="true" tabindex="-1"></a>df.describe()</span></code></pre></div>
<div class="output execute_result" data-execution_count="46">

  <div id="df-ee0cd224-e18f-4179-a1d4-0cf62a9f7522" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>count</th>
      <td>7628.000000</td>
    </tr>
    <tr>
      <th>mean</th>
      <td>11.998820</td>
    </tr>
    <tr>
      <th>std</th>
      <td>7.343498</td>
    </tr>
    <tr>
      <th>min</th>
      <td>0.000000</td>
    </tr>
    <tr>
      <th>25%</th>
      <td>5.000000</td>
    </tr>
    <tr>
      <th>50%</th>
      <td>12.000000</td>
    </tr>
    <tr>
      <th>75%</th>
      <td>18.000000</td>
    </tr>
    <tr>
      <th>max</th>
      <td>24.000000</td>
    </tr>
  </tbody>
</table>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-ee0cd224-e18f-4179-a1d4-0cf62a9f7522')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-ee0cd224-e18f-4179-a1d4-0cf62a9f7522 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-ee0cd224-e18f-4179-a1d4-0cf62a9f7522');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-915a786a-7dec-4fc0-ae06-2770eab14d47">
  <button class="colab-df-quickchart" onclick="quickchart('df-915a786a-7dec-4fc0-ae06-2770eab14d47')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-915a786a-7dec-4fc0-ae06-2770eab14d47 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="47"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="n7NeJXLGPkD3" data-outputId="ecaa0be7-ffb0-4962-d073-e686833bf5b8">
<div class="sourceCode" id="cb66"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb66-1"><a href="#cb66-1" aria-hidden="true" tabindex="-1"></a>cat_df<span class="op">=</span>df[<span class="st">&#39;Category&#39;</span>]</span>
<span id="cb66-2"><a href="#cb66-2" aria-hidden="true" tabindex="-1"></a>cat_df.head()</span></code></pre></div>
<div class="output execute_result" data-execution_count="47">
<pre><code>0    11
1     3
2    18
3    24
4     4
Name: Category, dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="48"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="4Q8CgIuEPkD4" data-outputId="00e2f381-9c24-4af9-daa8-ff9c273b9efb">
<div class="sourceCode" id="cb68"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb68-1"><a href="#cb68-1" aria-hidden="true" tabindex="-1"></a>data_len<span class="op">=</span><span class="bu">len</span>(df[[<span class="st">&#39;Text&#39;</span>,<span class="st">&quot;Category&quot;</span>]])</span>
<span id="cb68-2"><a href="#cb68-2" aria-hidden="true" tabindex="-1"></a>data_count<span class="op">=</span>df[[<span class="st">&#39;Text&#39;</span>,<span class="st">&quot;Category&quot;</span>]].count()</span>
<span id="cb68-3"><a href="#cb68-3" aria-hidden="true" tabindex="-1"></a>data_count</span></code></pre></div>
<div class="output execute_result" data-execution_count="48">
<pre><code>Text        7628
Category    7628
dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="49"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Zzoh29TDPkD4" data-outputId="e4020fe1-7ad7-45a7-b99b-18950071a253">
<div class="sourceCode" id="cb70"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb70-1"><a href="#cb70-1" aria-hidden="true" tabindex="-1"></a>data_missing<span class="op">=</span>data_len<span class="op">-</span>data_count</span>
<span id="cb70-2"><a href="#cb70-2" aria-hidden="true" tabindex="-1"></a>data_missing</span></code></pre></div>
<div class="output execute_result" data-execution_count="49">
<pre><code>Text        0
Category    0
dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="50"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="wfZ05kD0PkD5" data-outputId="16e5b41b-a092-4f98-ec25-a1800bbc3667">
<div class="sourceCode" id="cb72"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb72-1"><a href="#cb72-1" aria-hidden="true" tabindex="-1"></a>data_drop_na<span class="op">=</span>df.dropna()</span>
<span id="cb72-2"><a href="#cb72-2" aria-hidden="true" tabindex="-1"></a>data_drop_na.to_csv(<span class="st">&#39;data_drop_na.csv&#39;</span>)</span>
<span id="cb72-3"><a href="#cb72-3" aria-hidden="true" tabindex="-1"></a>data_drop_na.count()</span></code></pre></div>
<div class="output execute_result" data-execution_count="50">
<pre><code>Text        7628
Category    7628
dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="51"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="DbVDinraPkD6" data-outputId="e81bf36c-49e2-4ede-93ce-0f467cb2fb04">
<div class="sourceCode" id="cb74"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb74-1"><a href="#cb74-1" aria-hidden="true" tabindex="-1"></a>data_123f<span class="op">=</span>data_drop_na.groupby([<span class="st">&quot;Category&quot;</span>])[<span class="st">&#39;Text&#39;</span>].nunique()</span>
<span id="cb74-2"><a href="#cb74-2" aria-hidden="true" tabindex="-1"></a>data_123f.count()</span>
<span id="cb74-3"><a href="#cb74-3" aria-hidden="true" tabindex="-1"></a>data_123f</span></code></pre></div>
<div class="output execute_result" data-execution_count="51">
<pre><code>Category
0     324
1     316
2     331
3     335
4     333
5     350
6     298
7     294
8     165
9     158
10    350
11    350
12    350
13    350
14    156
15    333
16    350
17    297
18    350
19    348
20    301
21    167
22    349
23    323
24    350
Name: Text, dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="52"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="aNJKTKTQPkD8" data-outputId="feac873c-20d0-4df4-e00e-72e6b59d39af">
<div class="sourceCode" id="cb76"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb76-1"><a href="#cb76-1" aria-hidden="true" tabindex="-1"></a>g <span class="op">=</span> data_drop_na.groupby(<span class="st">&#39;Category&#39;</span>)</span>
<span id="cb76-2"><a href="#cb76-2" aria-hidden="true" tabindex="-1"></a>data_L1_filtered <span class="op">=</span> g.<span class="bu">filter</span>(<span class="kw">lambda</span> x: <span class="bu">len</span>(x) <span class="op">&gt;</span> <span class="dv">9</span>)  <span class="co"># pandas 0.13.1</span></span>
<span id="cb76-3"><a href="#cb76-3" aria-hidden="true" tabindex="-1"></a>data_L1_filtered.count()</span></code></pre></div>
<div class="output execute_result" data-execution_count="52">
<pre><code>Text        7628
Category    7628
dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="53" id="BC97lgtrPkD8">
<div class="sourceCode" id="cb78"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb78-1"><a href="#cb78-1" aria-hidden="true" tabindex="-1"></a>data_filtered <span class="op">=</span> data_L1_filtered</span></code></pre></div>
</div>
<section id="data-visualization" class="cell markdown"
id="qkWnzwkSPkD9">
<h3>Data Visualization</h3>
</section>
<div class="cell code" data-execution_count="54" id="ECqJbnZjPkD-">
<div class="sourceCode" id="cb79"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb79-1"><a href="#cb79-1" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> process_text_series(series, n<span class="op">=</span><span class="dv">1</span>):</span>
<span id="cb79-2"><a href="#cb79-2" aria-hidden="true" tabindex="-1"></a>    <span class="cf">return</span> series.<span class="bu">apply</span>(<span class="kw">lambda</span> x: process_text(x, n))</span>
<span id="cb79-3"><a href="#cb79-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb79-4"><a href="#cb79-4" aria-hidden="true" tabindex="-1"></a>data_filtered[<span class="st">&#39;Text&#39;</span>] <span class="op">=</span> process_text_series(data_filtered[<span class="st">&#39;Text&#39;</span>], n<span class="op">=</span><span class="dv">1</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="55"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="0cHtJ5zQPkD_" data-outputId="b7ac0b32-72b0-4d7c-fa2d-d2e40fbdf7d6">
<div class="sourceCode" id="cb80"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb80-1"><a href="#cb80-1" aria-hidden="true" tabindex="-1"></a>data_filtered[<span class="st">&#39;Text&#39;</span>]</span>
<span id="cb80-2"><a href="#cb80-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb80-3"><a href="#cb80-3" aria-hidden="true" tabindex="-1"></a><span class="co"># 0       [includ, stdtestcaseh, ifdef, win32, defin, ba...</span></span>
<span id="cb80-4"><a href="#cb80-4" aria-hidden="true" tabindex="-1"></a><span class="co"># 1       [includ, stdtestcaseh, includ, wcharh, namespa...</span></span>
<span id="cb80-5"><a href="#cb80-5" aria-hidden="true" tabindex="-1"></a><span class="co"># 2       [ifndef, omitgood, includ, stdtestcaseh, inclu...</span></span>
<span id="cb80-6"><a href="#cb80-6" aria-hidden="true" tabindex="-1"></a><span class="co"># 3       [includ, stdtestcaseh, includ, environmentw32s...</span></span>
<span id="cb80-7"><a href="#cb80-7" aria-hidden="true" tabindex="-1"></a><span class="co"># 4       [includ, stdtestcaseh, includ, wcharh, namespa...</span></span>
<span id="cb80-8"><a href="#cb80-8" aria-hidden="true" tabindex="-1"></a><span class="co">#                               ...</span></span>
<span id="cb80-9"><a href="#cb80-9" aria-hidden="true" tabindex="-1"></a><span class="co"># 7623    [includ, stdtestcaseh, includ, wcharh, namespa...</span></span>
<span id="cb80-10"><a href="#cb80-10" aria-hidden="true" tabindex="-1"></a><span class="co"># 7624    [includ, stdtestcaseh, ifdef, win32, defin, ba...</span></span>
<span id="cb80-11"><a href="#cb80-11" aria-hidden="true" tabindex="-1"></a><span class="co"># 7625    [includ, stdtestcaseh, ifndef, win32, includ, ...</span></span>
<span id="cb80-12"><a href="#cb80-12" aria-hidden="true" tabindex="-1"></a><span class="co"># 7626    [includ, stdtestcaseh, includ, vector, defin, ...</span></span>
<span id="cb80-13"><a href="#cb80-13" aria-hidden="true" tabindex="-1"></a><span class="co"># 7627    [includ, stdtestcaseh, includ, map, use, names...</span></span>
<span id="cb80-14"><a href="#cb80-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Name: Text, Length: 7628, dtype: object</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="55">
<pre><code>0       [includ, stdtestcaseh, ifdef, win32, defin, ba...
1       [includ, stdtestcaseh, includ, wcharh, namespa...
2       [ifndef, omitgood, includ, stdtestcaseh, inclu...
3       [includ, stdtestcaseh, includ, environmentw32s...
4       [includ, stdtestcaseh, includ, wcharh, namespa...
                              ...                        
7623    [includ, stdtestcaseh, includ, wcharh, namespa...
7624    [includ, stdtestcaseh, ifdef, win32, defin, ba...
7625    [includ, stdtestcaseh, ifndef, win32, includ, ...
7626    [includ, stdtestcaseh, includ, vector, defin, ...
7627    [includ, stdtestcaseh, includ, map, use, names...
Name: Text, Length: 7628, dtype: object</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="56"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="yua0A_UcPkEB" data-outputId="9907017c-a61b-4d89-c1d3-be9c93de4899">
<div class="sourceCode" id="cb82"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb82-1"><a href="#cb82-1" aria-hidden="true" tabindex="-1"></a>data_filtered <span class="co">## not df ## nor data</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="56">

  <div id="df-ba70f4f3-f8ea-4a1a-9f97-ff7a57b67de5" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Text</th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>[includ, stdtestcaseh, ifdef, win32, defin, ba...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>1</th>
      <td>[includ, stdtestcaseh, includ, wcharh, namespa...</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>[ifndef, omitgood, includ, stdtestcaseh, inclu...</td>
      <td>18</td>
    </tr>
    <tr>
      <th>3</th>
      <td>[includ, stdtestcaseh, includ, environmentw32s...</td>
      <td>24</td>
    </tr>
    <tr>
      <th>4</th>
      <td>[includ, stdtestcaseh, includ, wcharh, namespa...</td>
      <td>4</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>7623</th>
      <td>[includ, stdtestcaseh, includ, wcharh, namespa...</td>
      <td>16</td>
    </tr>
    <tr>
      <th>7624</th>
      <td>[includ, stdtestcaseh, ifdef, win32, defin, ba...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>7625</th>
      <td>[includ, stdtestcaseh, ifndef, win32, includ, ...</td>
      <td>5</td>
    </tr>
    <tr>
      <th>7626</th>
      <td>[includ, stdtestcaseh, includ, vector, defin, ...</td>
      <td>8</td>
    </tr>
    <tr>
      <th>7627</th>
      <td>[includ, stdtestcaseh, includ, map, use, names...</td>
      <td>8</td>
    </tr>
  </tbody>
</table>
<p>7628 rows × 2 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-ba70f4f3-f8ea-4a1a-9f97-ff7a57b67de5')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-ba70f4f3-f8ea-4a1a-9f97-ff7a57b67de5 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-ba70f4f3-f8ea-4a1a-9f97-ff7a57b67de5');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-13f42bd8-15a0-4d64-821c-5ce7d12ab3e2">
  <button class="colab-df-quickchart" onclick="quickchart('df-13f42bd8-15a0-4d64-821c-5ce7d12ab3e2')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-13f42bd8-15a0-4d64-821c-5ce7d12ab3e2 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="57" id="JRq4IwuXPkEC">
<div class="sourceCode" id="cb83"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb83-1"><a href="#cb83-1" aria-hidden="true" tabindex="-1"></a>df_target <span class="op">=</span> data_filtered[<span class="st">&#39;Category&#39;</span>]</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="58"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="oRcuM8HNPkEC" data-outputId="1317a4ea-a86e-450b-da4b-3c5814353041">
<div class="sourceCode" id="cb84"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb84-1"><a href="#cb84-1" aria-hidden="true" tabindex="-1"></a>data.target_names</span></code></pre></div>
<div class="output execute_result" data-execution_count="58">
<pre><code>[&#39;CWE121_Stack_Based_Buffer_Overflow&#39;,
 &#39;CWE122_Heap_Based_Buffer_Overflow&#39;,
 &#39;CWE124_Buffer_Underwrite&#39;,
 &#39;CWE126_Buffer_Overread&#39;,
 &#39;CWE127_Buffer_Underread&#39;,
 &#39;CWE134_Uncontrolled_Format_String&#39;,
 &#39;CWE190_Integer_Overflow&#39;,
 &#39;CWE191_Integer_Underflow&#39;,
 &#39;CWE194_Unexpected_Sign_Extension&#39;,
 &#39;CWE195_Signed_to_Unsigned_Conversion_Error&#39;,
 &#39;CWE197_Numeric_Truncation_Error&#39;,
 &#39;CWE23_Relative_Path_Traversal&#39;,
 &#39;CWE369_Divide_by_Zero&#39;,
 &#39;CWE36_Absolute_Path_Traversal&#39;,
 &#39;CWE400_Resource_Exhaustion&#39;,
 &#39;CWE401_Memory_Leak&#39;,
 &#39;CWE415_Double_Free&#39;,
 &#39;CWE457_Use_of_Uninitialized_Variable&#39;,
 &#39;CWE563_Unused_Variable&#39;,
 &#39;CWE590_Free_Memory_Not_on_Heap&#39;,
 &#39;CWE680_Integer_Overflow_to_Buffer_Overflow&#39;,
 &#39;CWE690_NULL_Deref_From_Return&#39;,
 &#39;CWE762_Mismatched_Memory_Management_Routines&#39;,
 &#39;CWE789_Uncontrolled_Mem_Alloc&#39;,
 &#39;CWE78_OS_Command_Injection&#39;]</code></pre>
</div>
</div>
<section id="text-pre-processing" class="cell markdown"
id="BAx7AwVEQ6Hh">
<h2>Text Pre-processing</h2>
</section>
<div class="cell code" data-execution_count="59"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="0ZnF_kPrQ6Hk" data-outputId="eaa6f17d-3fdc-47e7-98c6-798d664547f6">
<div class="sourceCode" id="cb86"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb86-1"><a href="#cb86-1" aria-hidden="true" tabindex="-1"></a><span class="co"># df[&#39;Text&#39;].head(5).apply(process_text)</span></span>
<span id="cb86-2"><a href="#cb86-2" aria-hidden="true" tabindex="-1"></a>data_filtered[<span class="st">&#39;Text&#39;</span>].head(<span class="dv">5</span>).<span class="bu">apply</span>(process_text)</span></code></pre></div>
<div class="output execute_result" data-execution_count="59">
<pre><code>0    [includ, stdtestcaseh, ifdef, win32, defin, ba...
1    [includ, stdtestcaseh, includ, wcharh, namespa...
2    [ifndef, omitgood, includ, stdtestcaseh, inclu...
3    [includ, stdtestcaseh, includ, environmentw32s...
4    [includ, stdtestcaseh, includ, wcharh, namespa...
Name: Text, dtype: object</code></pre>
</div>
</div>
<section id="vectorization" class="cell markdown" id="g1hhtl04Q6Hm">
<h2>Vectorization</h2>
</section>
<div class="cell code" data-execution_count="60"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="yhXJa4abQ6Ho" data-outputId="0ac14d07-cd5f-453f-facd-9f533a93ffcb">
<div class="sourceCode" id="cb88"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb88-1"><a href="#cb88-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb88-2"><a href="#cb88-2" aria-hidden="true" tabindex="-1"></a><span class="co"># bow_transformer = CountVectorizer(analyzer=process_text).fit(df[&#39;Text&#39;])</span></span>
<span id="cb88-3"><a href="#cb88-3" aria-hidden="true" tabindex="-1"></a>bow_transformer <span class="op">=</span> CountVectorizer(analyzer<span class="op">=</span>process_text).fit(data_filtered[<span class="st">&#39;Text&#39;</span>])</span>
<span id="cb88-4"><a href="#cb88-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb88-5"><a href="#cb88-5" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="bu">len</span>(bow_transformer.vocabulary_)) <span class="co">## 12766</span></span>
<span id="cb88-6"><a href="#cb88-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb88-7"><a href="#cb88-7" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 26.5 s, sys: 84.2 ms, total: 26.6 s</span></span>
<span id="cb88-8"><a href="#cb88-8" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 36 s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>12766
CPU times: user 14.7 s, sys: 249 ms, total: 14.9 s
Wall time: 15 s
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="61"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="E_ery-McQ6Hp" data-outputId="020ab7ad-dad9-444a-c72b-408585a37745">
<div class="sourceCode" id="cb90"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb90-1"><a href="#cb90-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(bow_transformer.get_feature_names_out()[<span class="dv">114</span>])</span>
<span id="cb90-2"><a href="#cb90-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(bow_transformer.get_feature_names_out()[<span class="dv">783</span>])</span>
<span id="cb90-3"><a href="#cb90-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb90-4"><a href="#cb90-4" aria-hidden="true" tabindex="-1"></a><span class="co">## alloca100sizeofint64t</span></span>
<span id="cb90-5"><a href="#cb90-5" aria-hidden="true" tabindex="-1"></a><span class="co">## arraystructtwointsstruct54</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>alloca100sizeofint64t
arraystructtwointsstruct54
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="62"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="9gHrbULgQ6Hp" data-outputId="f8a83e4b-f2a2-4788-9763-24203b81d383">
<div class="sourceCode" id="cb92"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb92-1"><a href="#cb92-1" aria-hidden="true" tabindex="-1"></a><span class="co">## ID of a term</span></span>
<span id="cb92-2"><a href="#cb92-2" aria-hidden="true" tabindex="-1"></a><span class="co"># bow_transformer.vocabulary_[&#39;chihuahua&#39;]</span></span>
<span id="cb92-3"><a href="#cb92-3" aria-hidden="true" tabindex="-1"></a>bow_transformer.vocabulary_[<span class="st">&#39;arraystructtwointsstruct54&#39;</span>] <span class="co">## 783</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="62">
<pre><code>783</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="63"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="2Fwoh8kdQ6Hq" data-outputId="bcb2592c-0a17-4dd2-931b-2c93bf74a09d">
<div class="sourceCode" id="cb94"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb94-1"><a href="#cb94-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb94-2"><a href="#cb94-2" aria-hidden="true" tabindex="-1"></a><span class="co"># text_bow = bow_transformer.transform(df[&#39;Text&#39;])</span></span>
<span id="cb94-3"><a href="#cb94-3" aria-hidden="true" tabindex="-1"></a>text_bow <span class="op">=</span> bow_transformer.transform(data_filtered[<span class="st">&#39;Text&#39;</span>])</span>
<span id="cb94-4"><a href="#cb94-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb94-5"><a href="#cb94-5" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 21.5 s, sys: 70.8 ms, total: 21.6 s</span></span>
<span id="cb94-6"><a href="#cb94-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 21.8 s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CPU times: user 14.6 s, sys: 238 ms, total: 14.8 s
Wall time: 14.9 s
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="64"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="lPa9tqQCQ6Hr" data-outputId="c6284815-2773-43f8-f984-e53e8c70f7c4">
<div class="sourceCode" id="cb96"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb96-1"><a href="#cb96-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&#39;Shape of Sparse Matrix: &#39;</span>, text_bow.shape)</span>
<span id="cb96-2"><a href="#cb96-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&#39;Amount of Non-Zero occurences: &#39;</span>, text_bow.nnz)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Shape of Sparse Matrix:  (7628, 12766)
Amount of Non-Zero occurences:  316364
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="65"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Yl4Vie14Q6Hr" data-outputId="6ec22187-603c-4b96-e494-48ff5b4e589c">
<div class="sourceCode" id="cb98"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb98-1"><a href="#cb98-1" aria-hidden="true" tabindex="-1"></a>text_bow.shape[<span class="dv">0</span>] <span class="op">*</span> text_bow.shape[<span class="dv">1</span>]</span></code></pre></div>
<div class="output execute_result" data-execution_count="65">
<pre><code>97379048</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="66"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="4_2E7A7AQ6Hs" data-outputId="42321def-e840-4a3a-c31f-b9f145e46aa9">
<div class="sourceCode" id="cb100"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb100-1"><a href="#cb100-1" aria-hidden="true" tabindex="-1"></a>sparsity <span class="op">=</span> (<span class="fl">100.0</span> <span class="op">*</span> text_bow.nnz <span class="op">/</span> (text_bow.shape[<span class="dv">0</span>] <span class="op">*</span> text_bow.shape[<span class="dv">1</span>]))</span>
<span id="cb100-2"><a href="#cb100-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&#39;sparsity: </span><span class="sc">{}</span><span class="st">&#39;</span>.<span class="bu">format</span>(sparsity)) <span class="co">## sparsity: 0.3248789205661571</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>sparsity: 0.3248789205661571
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="67" id="c9kASiL-Q6Ht">
<div class="sourceCode" id="cb102"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb102-1"><a href="#cb102-1" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">&#39;bow_transformer.pk&#39;</span>, <span class="st">&#39;wb&#39;</span>) <span class="im">as</span> bow:</span>
<span id="cb102-2"><a href="#cb102-2" aria-hidden="true" tabindex="-1"></a>    pickle.dump(bow_transformer, bow)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="68"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="EMT6yOILQ6Hv" data-outputId="c6755e3e-25b8-42af-cc00-79a5fa7a6fff">
<div class="sourceCode" id="cb103"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb103-1"><a href="#cb103-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb103-2"><a href="#cb103-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb103-3"><a href="#cb103-3" aria-hidden="true" tabindex="-1"></a>tfidf_transformer <span class="op">=</span> TfidfTransformer().fit(text_bow)</span>
<span id="cb103-4"><a href="#cb103-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb103-5"><a href="#cb103-5" aria-hidden="true" tabindex="-1"></a><span class="co"># ## CPU times: user 5.72 ms, sys: 941 µs, total: 6.66 ms</span></span>
<span id="cb103-6"><a href="#cb103-6" aria-hidden="true" tabindex="-1"></a><span class="co"># ## Wall time: 13.2 ms</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CPU times: user 3.43 ms, sys: 26 µs, total: 3.46 ms
Wall time: 3.47 ms
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="69"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="suQlWk45Q6Hw" data-outputId="08deec5d-b9eb-4358-ee17-f15fdbc744df">
<div class="sourceCode" id="cb105"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb105-1"><a href="#cb105-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb105-2"><a href="#cb105-2" aria-hidden="true" tabindex="-1"></a>text_tfidf <span class="op">=</span> tfidf_transformer.transform(text_bow)</span>
<span id="cb105-3"><a href="#cb105-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(text_tfidf.shape) <span class="co">## (7628, 12766)</span></span>
<span id="cb105-4"><a href="#cb105-4" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 25 ms, sys: 3.03 ms, total: 28.1 ms</span></span>
<span id="cb105-5"><a href="#cb105-5" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 27.7 ms</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>(7628, 12766)
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="70" id="xicmhgHSQ6Hx">
<div class="sourceCode" id="cb107"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb107-1"><a href="#cb107-1" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">&#39;tfidf_transformer.pk&#39;</span>, <span class="st">&#39;wb&#39;</span>) <span class="im">as</span> bow:</span>
<span id="cb107-2"><a href="#cb107-2" aria-hidden="true" tabindex="-1"></a>    pickle.dump(tfidf_transformer, bow)</span></code></pre></div>
</div>
<section id="text-classification-models" class="cell markdown"
id="VkIIBlCkQ6Hx">
<h1>Text Classification Models</h1>
</section>
<section id="naivebayes-model" class="cell markdown" id="TABW4pnWQ6Hy">
<h2>NaiveBayes model</h2>
</section>
<div class="cell code" data-execution_count="71" id="9GU6IB7cQ6Hz">
<div class="sourceCode" id="cb108"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb108-1"><a href="#cb108-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb108-2"><a href="#cb108-2" aria-hidden="true" tabindex="-1"></a><span class="co"># detect_model = MultinomialNB().fit(text_tfidf, df[&#39;Category&#39;])= MultinomialNB().fit</span></span>
<span id="cb108-3"><a href="#cb108-3" aria-hidden="true" tabindex="-1"></a>detect_model <span class="op">=</span> MultinomialNB().fit(text_tfidf, df_target)</span>
<span id="cb108-4"><a href="#cb108-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb108-5"><a href="#cb108-5" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 26.8 ms, sys: 7.02 ms, total: 33.9 ms</span></span>
<span id="cb108-6"><a href="#cb108-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 35.4 ms</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="72" id="_691s96nQ6H0">
<div class="sourceCode" id="cb109"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb109-1"><a href="#cb109-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb109-2"><a href="#cb109-2" aria-hidden="true" tabindex="-1"></a>all_predictions <span class="op">=</span> detect_model.predict(text_tfidf)</span>
<span id="cb109-3"><a href="#cb109-3" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 10.4 ms, sys: 0 ns, total: 10.4 ms</span></span>
<span id="cb109-4"><a href="#cb109-4" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 11.9 ms</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="73"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="RSENyuc_Q6H1" data-outputId="5eabf42b-9bb6-4768-a339-e6f08b4ebc7d">
<div class="sourceCode" id="cb110"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb110-1"><a href="#cb110-1" aria-hidden="true" tabindex="-1"></a>target_names <span class="op">=</span> data[<span class="st">&#39;target_names&#39;</span>]</span>
<span id="cb110-2"><a href="#cb110-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb110-3"><a href="#cb110-3" aria-hidden="true" tabindex="-1"></a><span class="co"># print(classification_report(df[&#39;Category&#39;], all_predictions, target_names=data[&#39;target_names&#39;]))</span></span>
<span id="cb110-4"><a href="#cb110-4" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(classification_report(df_target, all_predictions, target_names<span class="op">=</span>target_names))</span></code></pre></div>
<div class="output stream stdout">
<pre><code>                                              precision    recall  f1-score   support

          CWE121_Stack_Based_Buffer_Overflow       0.91      0.70      0.79       324
           CWE122_Heap_Based_Buffer_Overflow       0.87      0.66      0.75       316
                    CWE124_Buffer_Underwrite       0.63      0.85      0.72       331
                      CWE126_Buffer_Overread       0.86      0.92      0.89       335
                     CWE127_Buffer_Underread       0.92      0.77      0.83       333
           CWE134_Uncontrolled_Format_String       0.98      0.90      0.94       350
                     CWE190_Integer_Overflow       0.94      0.79      0.86       298
                    CWE191_Integer_Underflow       0.95      0.77      0.85       294
            CWE194_Unexpected_Sign_Extension       1.00      0.58      0.73       165
  CWE195_Signed_to_Unsigned_Conversion_Error       0.97      0.55      0.70       158
             CWE197_Numeric_Truncation_Error       0.70      0.90      0.79       350
               CWE23_Relative_Path_Traversal       0.91      0.98      0.95       350
                       CWE369_Divide_by_Zero       0.85      0.92      0.88       350
               CWE36_Absolute_Path_Traversal       0.83      0.91      0.87       350
                  CWE400_Resource_Exhaustion       1.00      0.79      0.89       156
                          CWE401_Memory_Leak       0.81      0.82      0.81       333
                          CWE415_Double_Free       0.75      0.76      0.76       350
        CWE457_Use_of_Uninitialized_Variable       0.94      0.97      0.96       297
                      CWE563_Unused_Variable       0.81      1.00      0.90       350
              CWE590_Free_Memory_Not_on_Heap       0.88      0.88      0.88       348
  CWE680_Integer_Overflow_to_Buffer_Overflow       0.99      0.85      0.91       301
               CWE690_NULL_Deref_From_Return       1.00      0.31      0.47       167
CWE762_Mismatched_Memory_Management_Routines       0.79      0.85      0.82       349
               CWE789_Uncontrolled_Mem_Alloc       0.57      0.97      0.72       323
                  CWE78_OS_Command_Injection       1.00      0.95      0.97       350

                                    accuracy                           0.84      7628
                                   macro avg       0.87      0.81      0.83      7628
                                weighted avg       0.86      0.84      0.84      7628

</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="74" id="SHW5AC-CQ6H1">
<div class="sourceCode" id="cb112"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb112-1"><a href="#cb112-1" aria-hidden="true" tabindex="-1"></a><span class="co">#                                               precision    recall  f1-score   support</span></span>
<span id="cb112-2"><a href="#cb112-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb112-3"><a href="#cb112-3" aria-hidden="true" tabindex="-1"></a><span class="co">#           CWE121_Stack_Based_Buffer_Overflow       0.91      0.71      0.80       324</span></span>
<span id="cb112-4"><a href="#cb112-4" aria-hidden="true" tabindex="-1"></a><span class="co">#            CWE122_Heap_Based_Buffer_Overflow       0.87      0.66      0.75       316</span></span>
<span id="cb112-5"><a href="#cb112-5" aria-hidden="true" tabindex="-1"></a><span class="co">#                     CWE124_Buffer_Underwrite       0.63      0.85      0.72       331</span></span>
<span id="cb112-6"><a href="#cb112-6" aria-hidden="true" tabindex="-1"></a><span class="co">#                       CWE126_Buffer_Overread       0.86      0.92      0.89       335</span></span>
<span id="cb112-7"><a href="#cb112-7" aria-hidden="true" tabindex="-1"></a><span class="co">#                      CWE127_Buffer_Underread       0.92      0.77      0.83       333</span></span>
<span id="cb112-8"><a href="#cb112-8" aria-hidden="true" tabindex="-1"></a><span class="co">#            CWE134_Uncontrolled_Format_String       0.98      0.90      0.94       350</span></span>
<span id="cb112-9"><a href="#cb112-9" aria-hidden="true" tabindex="-1"></a><span class="co">#                      CWE190_Integer_Overflow       0.94      0.79      0.86       298</span></span>
<span id="cb112-10"><a href="#cb112-10" aria-hidden="true" tabindex="-1"></a><span class="co">#                     CWE191_Integer_Underflow       0.95      0.77      0.85       294</span></span>
<span id="cb112-11"><a href="#cb112-11" aria-hidden="true" tabindex="-1"></a><span class="co">#             CWE194_Unexpected_Sign_Extension       1.00      0.59      0.74       165</span></span>
<span id="cb112-12"><a href="#cb112-12" aria-hidden="true" tabindex="-1"></a><span class="co">#   CWE195_Signed_to_Unsigned_Conversion_Error       0.99      0.55      0.71       158</span></span>
<span id="cb112-13"><a href="#cb112-13" aria-hidden="true" tabindex="-1"></a><span class="co">#              CWE197_Numeric_Truncation_Error       0.71      0.90      0.79       350</span></span>
<span id="cb112-14"><a href="#cb112-14" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE23_Relative_Path_Traversal       0.91      0.98      0.95       350</span></span>
<span id="cb112-15"><a href="#cb112-15" aria-hidden="true" tabindex="-1"></a><span class="co">#                        CWE369_Divide_by_Zero       0.83      0.91      0.87       350</span></span>
<span id="cb112-16"><a href="#cb112-16" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE36_Absolute_Path_Traversal       0.82      0.91      0.86       350</span></span>
<span id="cb112-17"><a href="#cb112-17" aria-hidden="true" tabindex="-1"></a><span class="co">#                   CWE400_Resource_Exhaustion       1.00      0.79      0.89       156</span></span>
<span id="cb112-18"><a href="#cb112-18" aria-hidden="true" tabindex="-1"></a><span class="co">#                           CWE401_Memory_Leak       0.81      0.82      0.81       333</span></span>
<span id="cb112-19"><a href="#cb112-19" aria-hidden="true" tabindex="-1"></a><span class="co">#                           CWE415_Double_Free       0.76      0.76      0.76       350</span></span>
<span id="cb112-20"><a href="#cb112-20" aria-hidden="true" tabindex="-1"></a><span class="co">#         CWE457_Use_of_Uninitialized_Variable       0.94      0.97      0.96       297</span></span>
<span id="cb112-21"><a href="#cb112-21" aria-hidden="true" tabindex="-1"></a><span class="co">#                       CWE563_Unused_Variable       0.81      1.00      0.89       350</span></span>
<span id="cb112-22"><a href="#cb112-22" aria-hidden="true" tabindex="-1"></a><span class="co">#               CWE590_Free_Memory_Not_on_Heap       0.88      0.88      0.88       348</span></span>
<span id="cb112-23"><a href="#cb112-23" aria-hidden="true" tabindex="-1"></a><span class="co">#   CWE680_Integer_Overflow_to_Buffer_Overflow       0.98      0.85      0.91       301</span></span>
<span id="cb112-24"><a href="#cb112-24" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE690_NULL_Deref_From_Return       1.00      0.31      0.47       167</span></span>
<span id="cb112-25"><a href="#cb112-25" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE762_Mismatched_Memory_Management_Routines       0.79      0.85      0.82       349</span></span>
<span id="cb112-26"><a href="#cb112-26" aria-hidden="true" tabindex="-1"></a><span class="co">#                CWE789_Uncontrolled_Mem_Alloc       0.57      0.97      0.72       323</span></span>
<span id="cb112-27"><a href="#cb112-27" aria-hidden="true" tabindex="-1"></a><span class="co">#                   CWE78_OS_Command_Injection       1.00      0.95      0.97       350</span></span>
<span id="cb112-28"><a href="#cb112-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb112-29"><a href="#cb112-29" aria-hidden="true" tabindex="-1"></a><span class="co">#                                     accuracy                           0.84      7628</span></span>
<span id="cb112-30"><a href="#cb112-30" aria-hidden="true" tabindex="-1"></a><span class="co">#                                    macro avg       0.87      0.81      0.83      7628</span></span>
<span id="cb112-31"><a href="#cb112-31" aria-hidden="true" tabindex="-1"></a><span class="co">#                                 weighted avg       0.86      0.84      0.84      7628</span></span></code></pre></div>
</div>
<section id="cross-validation" class="cell markdown" id="XsNWKekTQ6H3">
<h4>Cross Validation</h4>
</section>
<div class="cell code" data-execution_count="75"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="e4qYO_O-Q6H4" data-outputId="5e9bc925-0834-4de8-ed58-fc6f24b54f4c">
<div class="sourceCode" id="cb113"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb113-1"><a href="#cb113-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb113-2"><a href="#cb113-2" aria-hidden="true" tabindex="-1"></a>clf <span class="op">=</span> MultinomialNB()</span>
<span id="cb113-3"><a href="#cb113-3" aria-hidden="true" tabindex="-1"></a><span class="co"># scores = cross_val_score(clf, text_tfidf, df[&#39;Category&#39;],  cv=8)</span></span>
<span id="cb113-4"><a href="#cb113-4" aria-hidden="true" tabindex="-1"></a>scores <span class="op">=</span> cross_val_score(clf, text_tfidf, df_target,  cv<span class="op">=</span><span class="dv">8</span>)</span>
<span id="cb113-5"><a href="#cb113-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb113-6"><a href="#cb113-6" aria-hidden="true" tabindex="-1"></a><span class="co">#scores</span></span>
<span id="cb113-7"><a href="#cb113-7" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Accuracy: </span><span class="sc">%0.2f</span><span class="st"> (+/- </span><span class="sc">%0.2f</span><span class="st">)&quot;</span> <span class="op">%</span> (scores.mean(), scores.std() <span class="op">*</span> <span class="dv">2</span>))</span>
<span id="cb113-8"><a href="#cb113-8" aria-hidden="true" tabindex="-1"></a><span class="co">## Accuracy: 0.74 (+/- 0.03)</span></span>
<span id="cb113-9"><a href="#cb113-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb113-10"><a href="#cb113-10" aria-hidden="true" tabindex="-1"></a><span class="co"># CPU times: user 156 ms, sys: 8.02 ms, total: 164 ms</span></span>
<span id="cb113-11"><a href="#cb113-11" aria-hidden="true" tabindex="-1"></a><span class="co"># Wall time: 167 ms</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Accuracy: 0.74 (+/- 0.03)
</code></pre>
</div>
</div>
<section id="randomforest-model" class="cell markdown"
id="Ws9XLIkMQ6H4">
<h2>RandomForest model</h2>
</section>
<div class="cell code" data-execution_count="76" id="vago-DEbQ6H5">
<div class="sourceCode" id="cb115"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb115-1"><a href="#cb115-1" aria-hidden="true" tabindex="-1"></a><span class="co"># from sklearn.ensemble import RandomForestClassifier</span></span></code></pre></div>
</div>
<section id="cross-validation" class="cell markdown" id="T5DkOCxgQ6H5">
<h4>Cross Validation</h4>
</section>
<section id="this-may-take-few-minutes" class="cell markdown"
id="09RP0GXHQ6H5">
<h3>This may take few minutes</h3>
</section>
<div class="cell code" data-execution_count="77"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="VhjMhoCxQ6H6" data-outputId="1ed0a727-6011-4017-ca23-72b64147ff07">
<div class="sourceCode" id="cb116"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb116-1"><a href="#cb116-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb116-2"><a href="#cb116-2" aria-hidden="true" tabindex="-1"></a>clf <span class="op">=</span> RandomForestClassifier()</span>
<span id="cb116-3"><a href="#cb116-3" aria-hidden="true" tabindex="-1"></a>scores <span class="op">=</span> cross_val_score(clf, text_tfidf, df_target,  cv<span class="op">=</span><span class="dv">8</span>)</span>
<span id="cb116-4"><a href="#cb116-4" aria-hidden="true" tabindex="-1"></a><span class="co">#scores</span></span>
<span id="cb116-5"><a href="#cb116-5" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Accuracy: </span><span class="sc">%0.2f</span><span class="st"> (+/- </span><span class="sc">%0.2f</span><span class="st">)&quot;</span> <span class="op">%</span> (scores.mean(), scores.std() <span class="op">*</span> <span class="dv">2</span>))</span>
<span id="cb116-6"><a href="#cb116-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb116-7"><a href="#cb116-7" aria-hidden="true" tabindex="-1"></a><span class="co">## Accuracy: 0.87 (+/- 0.02) ## Accuracy: 0.87 (+/- 0.01)</span></span>
<span id="cb116-8"><a href="#cb116-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb116-9"><a href="#cb116-9" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 2min 7s, sys: 287 ms, total: 2min 7s</span></span>
<span id="cb116-10"><a href="#cb116-10" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 2min 10s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Accuracy: 0.87 (+/- 0.02)
CPU times: user 2min 2s, sys: 437 ms, total: 2min 2s
Wall time: 2min 4s
</code></pre>
</div>
</div>
<section id="split-clf-multinomialnb-plt-confusion-matrix"
class="cell markdown" id="NanvroWXQ6H6">
<h2>Split, Clf MultinomialNB, Plt Confusion Matrix</h2>
</section>
<section id="train_test_split-wo-cv" class="cell markdown"
id="7sA2NAVyQ6H7">
<h3>Train_Test_Split w/o CV</h3>
</section>
<div class="cell code" data-execution_count="78" id="bX06lgEDlj-s">
<div class="sourceCode" id="cb118"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb118-1"><a href="#cb118-1" aria-hidden="true" tabindex="-1"></a>start <span class="op">=</span> time.time()</span>
<span id="cb118-2"><a href="#cb118-2" aria-hidden="true" tabindex="-1"></a><span class="co">#classifier = Pipeline([(&#39;vectorizer&#39;, TfidfVectorizer(stop_words=stopwords.words(&#39;english&#39;) + list(string.punctuation))),(&#39;classifier&#39;, LinearSVC(C=10))])</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="79" id="FkFQfEEcQ6H8">
<div class="sourceCode" id="cb119"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb119-1"><a href="#cb119-1" aria-hidden="true" tabindex="-1"></a><span class="co">## train_test_split</span></span>
<span id="cb119-2"><a href="#cb119-2" aria-hidden="true" tabindex="-1"></a><span class="co"># from sklearn.model_selection import train_test_split</span></span>
<span id="cb119-3"><a href="#cb119-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb119-4"><a href="#cb119-4" aria-hidden="true" tabindex="-1"></a><span class="co"># X_train, X_test, y_train, y_test = train_test_split(text_tfidf, df[&#39;Category&#39;], test_size=0.2, random_state=0) ## 11</span></span>
<span id="cb119-5"><a href="#cb119-5" aria-hidden="true" tabindex="-1"></a>X_train, X_test, y_train, y_test <span class="op">=</span> train_test_split(text_tfidf, df_target, test_size<span class="op">=</span><span class="fl">0.2</span>, random_state<span class="op">=</span><span class="dv">0</span>) <span class="co">## 11</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="80"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Al0vDealQ6H8" data-outputId="a452923d-3f56-43f2-de7a-c79921120670">
<div class="sourceCode" id="cb120"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb120-1"><a href="#cb120-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb120-2"><a href="#cb120-2" aria-hidden="true" tabindex="-1"></a>clf <span class="op">=</span> MultinomialNB()</span>
<span id="cb120-3"><a href="#cb120-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb120-4"><a href="#cb120-4" aria-hidden="true" tabindex="-1"></a>clf.fit(X_train, y_train)</span>
<span id="cb120-5"><a href="#cb120-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb120-6"><a href="#cb120-6" aria-hidden="true" tabindex="-1"></a>end <span class="op">=</span> time.time()</span>
<span id="cb120-7"><a href="#cb120-7" aria-hidden="true" tabindex="-1"></a><span class="co"># print(&quot;Accuracy: &quot; + str(clf.score(X_test, y_test))) #+ &quot;, Time duration: &quot; + str(end - start))</span></span>
<span id="cb120-8"><a href="#cb120-8" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Accuracy: &quot;</span> <span class="op">+</span> <span class="bu">str</span>(clf.score(X_test, y_test)) <span class="op">+</span> <span class="st">&quot;, Time duration: &quot;</span> <span class="op">+</span> <span class="bu">str</span>(end <span class="op">-</span> start))</span>
<span id="cb120-9"><a href="#cb120-9" aria-hidden="true" tabindex="-1"></a><span class="co">## Accuracy: 0.7175622542595019, Time duration: 0.029235124588012695</span></span>
<span id="cb120-10"><a href="#cb120-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb120-11"><a href="#cb120-11" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 25.7 ms, sys: 0 ns, total: 25.7 ms</span></span>
<span id="cb120-12"><a href="#cb120-12" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 31.1 ms</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Accuracy: 0.7326343381389253, Time duration: 0.038088321685791016
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="81" id="Ula7azVnQ6H9">
<div class="sourceCode" id="cb122"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb122-1"><a href="#cb122-1" aria-hidden="true" tabindex="-1"></a><span class="co">## Confusion Matrix</span></span>
<span id="cb122-2"><a href="#cb122-2" aria-hidden="true" tabindex="-1"></a>y_pred <span class="op">=</span> clf.predict(X_test)</span>
<span id="cb122-3"><a href="#cb122-3" aria-hidden="true" tabindex="-1"></a>conf_mat <span class="op">=</span> confusion_matrix(y_test, y_pred)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="82"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:815}"
id="-yN1fSPDQ6H9" data-outputId="b755af63-7314-4038-9782-d9b48c500d16">
<div class="sourceCode" id="cb123"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb123-1"><a href="#cb123-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Plot confusion_matrix</span></span>
<span id="cb123-2"><a href="#cb123-2" aria-hidden="true" tabindex="-1"></a>fig, ax <span class="op">=</span> plt.subplots(figsize<span class="op">=</span>(<span class="dv">15</span>, <span class="dv">10</span>))</span>
<span id="cb123-3"><a href="#cb123-3" aria-hidden="true" tabindex="-1"></a>sns.heatmap(conf_mat, annot<span class="op">=</span><span class="va">True</span>, cmap <span class="op">=</span> <span class="st">&quot;Set3&quot;</span>, fmt <span class="op">=</span><span class="st">&quot;d&quot;</span>,</span>
<span id="cb123-4"><a href="#cb123-4" aria-hidden="true" tabindex="-1"></a>xticklabels<span class="op">=</span>data[<span class="st">&#39;target_names&#39;</span>], yticklabels<span class="op">=</span>data[<span class="st">&#39;target_names&#39;</span>])</span>
<span id="cb123-5"><a href="#cb123-5" aria-hidden="true" tabindex="-1"></a>plt.ylabel(<span class="st">&#39;Actual&#39;</span>)</span>
<span id="cb123-6"><a href="#cb123-6" aria-hidden="true" tabindex="-1"></a>plt.xlabel(<span class="st">&#39;Predicted&#39;</span>)</span>
<span id="cb123-7"><a href="#cb123-7" aria-hidden="true" tabindex="-1"></a>plt.show()</span></code></pre></div>
<div class="output display_data">
<p><img
src="vertopal_3d2211f5135e46f9b8501b72a53acc02/672aa7d96ecda8d6777f800e1b8358e14f6a2ee6.png" /></p>
</div>
</div>
<section id="model-dump" class="cell markdown" id="iXs1cDQCQ6H_">
<h2>Model Dump</h2>
</section>
<div class="cell code" data-execution_count="83" id="zt6dY97pQ6H_">
<div class="sourceCode" id="cb124"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb124-1"><a href="#cb124-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb124-2"><a href="#cb124-2" aria-hidden="true" tabindex="-1"></a><span class="co">## train a NB classifer on the entire data</span></span>
<span id="cb124-3"><a href="#cb124-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb124-4"><a href="#cb124-4" aria-hidden="true" tabindex="-1"></a><span class="co"># nb_model = MultinomialNB().fit(text_tfidf, df[&#39;Category&#39;])</span></span>
<span id="cb124-5"><a href="#cb124-5" aria-hidden="true" tabindex="-1"></a>nb_model <span class="op">=</span> MultinomialNB().fit(text_tfidf, df_target)</span>
<span id="cb124-6"><a href="#cb124-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb124-7"><a href="#cb124-7" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">&#39;nb_model.pk&#39;</span>, <span class="st">&#39;wb&#39;</span>) <span class="im">as</span> nb:</span>
<span id="cb124-8"><a href="#cb124-8" aria-hidden="true" tabindex="-1"></a>    pickle.dump(nb_model, nb)</span>
<span id="cb124-9"><a href="#cb124-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb124-10"><a href="#cb124-10" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 26.5 ms, sys: 4.99 ms, total: 31.5 ms</span></span>
<span id="cb124-11"><a href="#cb124-11" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 33.1 ms</span></span></code></pre></div>
</div>
<section id="1st-in-sample-test-case-foresights" class="cell markdown"
id="Nm1djlHsTHTw">
<h1>1st In-Sample Test-Case Foresights</h1>
</section>
<div class="cell code" data-execution_count="84"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="vkw9EKKLTHTw" data-outputId="12ac7bb2-5ddf-470b-dc02-a3f0e99b3a41">
<div class="sourceCode" id="cb125"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb125-1"><a href="#cb125-1" aria-hidden="true" tabindex="-1"></a>data_filtered.columns <span class="co">## Index([&#39;Category&#39;, &#39;Test-Case-Code&#39;], dtype=&#39;object&#39;)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="84">
<pre><code>Index([&#39;Text&#39;, &#39;Category&#39;], dtype=&#39;object&#39;)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="85"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="xh5aO_L-THTx" data-outputId="383df596-e054-4069-df0a-159c19816754">
<div class="sourceCode" id="cb127"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb127-1"><a href="#cb127-1" aria-hidden="true" tabindex="-1"></a><span class="co"># data_filtered[&#39;Category&#39;]</span></span>
<span id="cb127-2"><a href="#cb127-2" aria-hidden="true" tabindex="-1"></a>df_target</span></code></pre></div>
<div class="output execute_result" data-execution_count="85">
<pre><code>0       11
1        3
2       18
3       24
4        4
        ..
7623    16
7624    11
7625     5
7626     8
7627     8
Name: Category, Length: 7628, dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="86"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="czPPFM10THTx" data-outputId="e6c77559-9900-4b2e-d5e4-8ed44653538e">
<div class="sourceCode" id="cb129"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb129-1"><a href="#cb129-1" aria-hidden="true" tabindex="-1"></a>data_filtered</span></code></pre></div>
<div class="output execute_result" data-execution_count="86">

  <div id="df-d219d5b7-cee7-4d7c-8074-8ab47145a172" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Text</th>
      <th>Category</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>[includ, stdtestcaseh, ifdef, win32, defin, ba...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>1</th>
      <td>[includ, stdtestcaseh, includ, wcharh, namespa...</td>
      <td>3</td>
    </tr>
    <tr>
      <th>2</th>
      <td>[ifndef, omitgood, includ, stdtestcaseh, inclu...</td>
      <td>18</td>
    </tr>
    <tr>
      <th>3</th>
      <td>[includ, stdtestcaseh, includ, environmentw32s...</td>
      <td>24</td>
    </tr>
    <tr>
      <th>4</th>
      <td>[includ, stdtestcaseh, includ, wcharh, namespa...</td>
      <td>4</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>7623</th>
      <td>[includ, stdtestcaseh, includ, wcharh, namespa...</td>
      <td>16</td>
    </tr>
    <tr>
      <th>7624</th>
      <td>[includ, stdtestcaseh, ifdef, win32, defin, ba...</td>
      <td>11</td>
    </tr>
    <tr>
      <th>7625</th>
      <td>[includ, stdtestcaseh, ifndef, win32, includ, ...</td>
      <td>5</td>
    </tr>
    <tr>
      <th>7626</th>
      <td>[includ, stdtestcaseh, includ, vector, defin, ...</td>
      <td>8</td>
    </tr>
    <tr>
      <th>7627</th>
      <td>[includ, stdtestcaseh, includ, map, use, names...</td>
      <td>8</td>
    </tr>
  </tbody>
</table>
<p>7628 rows × 2 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-d219d5b7-cee7-4d7c-8074-8ab47145a172')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-d219d5b7-cee7-4d7c-8074-8ab47145a172 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-d219d5b7-cee7-4d7c-8074-8ab47145a172');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-82233a2c-4f4d-4646-acfb-8d075a9580cd">
  <button class="colab-df-quickchart" onclick="quickchart('df-82233a2c-4f4d-4646-acfb-8d075a9580cd')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-82233a2c-4f4d-4646-acfb-8d075a9580cd button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="87"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="abaP78RkTHTy" data-outputId="28d15182-093f-43c7-9178-c4f51a92d9a3">
<div class="sourceCode" id="cb130"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb130-1"><a href="#cb130-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb130-2"><a href="#cb130-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify the indices</span></span>
<span id="cb130-3"><a href="#cb130-3" aria-hidden="true" tabindex="-1"></a><span class="co"># indices = [0,1,2]</span></span>
<span id="cb130-4"><a href="#cb130-4" aria-hidden="true" tabindex="-1"></a>indices <span class="op">=</span> [<span class="dv">3</span>,<span class="dv">4</span>,<span class="dv">5</span>,<span class="dv">6</span>]</span>
<span id="cb130-5"><a href="#cb130-5" aria-hidden="true" tabindex="-1"></a><span class="co"># indices = [7,8,9]</span></span>
<span id="cb130-6"><a href="#cb130-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb130-7"><a href="#cb130-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Load the transformers and the model</span></span>
<span id="cb130-8"><a href="#cb130-8" aria-hidden="true" tabindex="-1"></a>bow_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;bow_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb130-9"><a href="#cb130-9" aria-hidden="true" tabindex="-1"></a>tfidf_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;tfidf_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb130-10"><a href="#cb130-10" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">&#39;nb_model.pk&#39;</span>, <span class="st">&#39;rb&#39;</span>) <span class="im">as</span> nb:</span>
<span id="cb130-11"><a href="#cb130-11" aria-hidden="true" tabindex="-1"></a>    model <span class="op">=</span> pickle.load(nb)</span>
<span id="cb130-12"><a href="#cb130-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb130-13"><a href="#cb130-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a mapping from class labels to CWE-IDs</span></span>
<span id="cb130-14"><a href="#cb130-14" aria-hidden="true" tabindex="-1"></a>label_to_cwe <span class="op">=</span> {i: cwe_id <span class="cf">for</span> i, cwe_id <span class="kw">in</span> <span class="bu">enumerate</span>(df_target.unique())}</span>
<span id="cb130-15"><a href="#cb130-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb130-16"><a href="#cb130-16" aria-hidden="true" tabindex="-1"></a><span class="co"># Loop through the specified indices</span></span>
<span id="cb130-17"><a href="#cb130-17" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> in_sample_top_10_cwe_index <span class="kw">in</span> indices:</span>
<span id="cb130-18"><a href="#cb130-18" aria-hidden="true" tabindex="-1"></a>    test_text <span class="op">=</span> data_filtered.iloc[in_sample_top_10_cwe_index,<span class="dv">1</span>]</span>
<span id="cb130-19"><a href="#cb130-19" aria-hidden="true" tabindex="-1"></a>    test_bow <span class="op">=</span> bow_transf.transform([test_text])</span>
<span id="cb130-20"><a href="#cb130-20" aria-hidden="true" tabindex="-1"></a>    test_tfidf <span class="op">=</span> tfidf_transf.transform(test_bow)</span>
<span id="cb130-21"><a href="#cb130-21" aria-hidden="true" tabindex="-1"></a>    foresights_id <span class="op">=</span> model.predict(test_tfidf)[<span class="dv">0</span>] <span class="co"># Access the first element of the array</span></span>
<span id="cb130-22"><a href="#cb130-22" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="st">&#39;-----------------------------------------&#39;</span>)</span>
<span id="cb130-23"><a href="#cb130-23" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb130-24"><a href="#cb130-24" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Use the mapping to get the CWE-ID</span></span>
<span id="cb130-25"><a href="#cb130-25" aria-hidden="true" tabindex="-1"></a>    <span class="cf">if</span> foresights_id <span class="kw">in</span> label_to_cwe:</span>
<span id="cb130-26"><a href="#cb130-26" aria-hidden="true" tabindex="-1"></a>        predicted_cwe_id <span class="op">=</span> label_to_cwe[foresights_id]</span>
<span id="cb130-27"><a href="#cb130-27" aria-hidden="true" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>predicted_cwe_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb130-28"><a href="#cb130-28" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Check if the model&#39;s prediction matches the actual CWE-ID</span></span>
<span id="cb130-29"><a href="#cb130-29" aria-hidden="true" tabindex="-1"></a>        actual_cwe_id <span class="op">=</span> data_filtered.iloc[in_sample_top_10_cwe_index][<span class="st">&#39;Category&#39;</span>]</span>
<span id="cb130-30"><a href="#cb130-30" aria-hidden="true" tabindex="-1"></a>        is_correct <span class="op">=</span> (actual_cwe_id <span class="op">==</span> predicted_cwe_id)</span>
<span id="cb130-31"><a href="#cb130-31" aria-hidden="true" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&quot;Is the prediction correct? </span><span class="sc">{</span>is_correct<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb130-32"><a href="#cb130-32" aria-hidden="true" tabindex="-1"></a>    <span class="cf">else</span>:</span>
<span id="cb130-33"><a href="#cb130-33" aria-hidden="true" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&quot;The predicted label </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss"> is not in the training data.&quot;</span>)</span>
<span id="cb130-34"><a href="#cb130-34" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb130-35"><a href="#cb130-35" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 52.5 ms, sys: 4.17 ms, total: 56.7 ms</span></span>
<span id="cb130-36"><a href="#cb130-36" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 103 ms</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>-----------------------------------------
The predicted CWE-ID is: 5
The predicted CWE-ID is: 17
Is the prediction correct? False
-----------------------------------------
The predicted CWE-ID is: 5
The predicted CWE-ID is: 17
Is the prediction correct? False
-----------------------------------------
The predicted CWE-ID is: 1
The predicted CWE-ID is: 3
Is the prediction correct? False
-----------------------------------------
The predicted CWE-ID is: 5
The predicted CWE-ID is: 17
Is the prediction correct? False
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="88" id="hlmc2zUPTHTz">
<div class="sourceCode" id="cb132"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb132-1"><a href="#cb132-1" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb132-2"><a href="#cb132-2" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 19</span></span>
<span id="cb132-3"><a href="#cb132-3" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 19 is not in the training data.</span></span>
<span id="cb132-4"><a href="#cb132-4" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb132-5"><a href="#cb132-5" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 4</span></span>
<span id="cb132-6"><a href="#cb132-6" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: CWE127_Buffer_Underread</span></span>
<span id="cb132-7"><a href="#cb132-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? True</span></span>
<span id="cb132-8"><a href="#cb132-8" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb132-9"><a href="#cb132-9" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 5</span></span>
<span id="cb132-10"><a href="#cb132-10" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: CWE134_Uncontrolled_Format_String</span></span>
<span id="cb132-11"><a href="#cb132-11" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? True</span></span>
<span id="cb132-12"><a href="#cb132-12" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb132-13"><a href="#cb132-13" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 0</span></span>
<span id="cb132-14"><a href="#cb132-14" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: CWE122_Heap_Based_Buffer_Overflow</span></span>
<span id="cb132-15"><a href="#cb132-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? False</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="89"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="aoB1Zy1QTHT0" data-outputId="1f049c7a-4158-4df6-9101-1c00bf748381">
<div class="sourceCode" id="cb133"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb133-1"><a href="#cb133-1" aria-hidden="true" tabindex="-1"></a>data_filtered.iloc[<span class="dv">6</span>,<span class="dv">1</span>]</span></code></pre></div>
<div class="output execute_result" data-execution_count="89">
<pre><code>4</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="90"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="ZyrB1amQTHT0" data-outputId="a5f17bff-deca-4d44-acf9-6a7429b48da4">
<div class="sourceCode" id="cb135"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb135-1"><a href="#cb135-1" aria-hidden="true" tabindex="-1"></a>data_filtered.iloc[<span class="dv">6</span>,<span class="dv">0</span>]</span>
<span id="cb135-2"><a href="#cb135-2" aria-hidden="true" tabindex="-1"></a><span class="co"># print(data_filtered.iloc[6,0])</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="90">
<pre><code>[&#39;ifndef&#39;,
 &#39;omitgood&#39;,
 &#39;includ&#39;,
 &#39;stdtestcaseh&#39;,
 &#39;includ&#39;,
 &#39;charloop83h&#39;,
 &#39;namespac&#39;,
 &#39;charloop83&#39;,
 &#39;charloop83goodg2bcharloop83goodg2bchar&#39;,
 &#39;datacopi&#39;,
 &#39;data&#39;,
 &#39;datacopi&#39;,
 &#39;char&#39;,
 &#39;databuff&#39;,
 &#39;char&#39;,
 &#39;malloc100sizeofchar&#39;,
 &#39;memsetdatabuff&#39;,
 &#39;1001&#39;,
 &#39;databuffer1001&#39;,
 &#39;0&#39;,
 &#39;data&#39;,
 &#39;databuff&#39;,
 &#39;charloop83goodg2bcharloop83goodg2b&#39;,
 &#39;sizet&#39;,
 &#39;char&#39;,
 &#39;dest100&#39;,
 &#39;memsetdest&#39;,
 &#39;c&#39;,
 &#39;1001&#39;,
 &#39;dest1001&#39;,
 &#39;0&#39;,
 &#39;0&#39;,
 &#39;100&#39;,
 &#39;desti&#39;,
 &#39;datai&#39;,
 &#39;dest1001&#39;,
 &#39;0&#39;,
 &#39;printlinedest&#39;,
 &#39;endif&#39;]</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="91"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="XzTZjf09THT1" data-outputId="93b4a5a3-1f25-4bf3-e7e2-133a4955936a">
<div class="sourceCode" id="cb137"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb137-1"><a href="#cb137-1" aria-hidden="true" tabindex="-1"></a><span class="co">## print out the actual predicted CWE-ID instaed of array([3])</span></span>
<span id="cb137-2"><a href="#cb137-2" aria-hidden="true" tabindex="-1"></a>test_text <span class="op">=</span> data_filtered.iloc[<span class="dv">6</span>,<span class="dv">1</span>]</span>
<span id="cb137-3"><a href="#cb137-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb137-4"><a href="#cb137-4" aria-hidden="true" tabindex="-1"></a><span class="co"># this is how you reload and use the BoW transformer</span></span>
<span id="cb137-5"><a href="#cb137-5" aria-hidden="true" tabindex="-1"></a>bow_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;bow_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb137-6"><a href="#cb137-6" aria-hidden="true" tabindex="-1"></a>test_bow <span class="op">=</span> bow_transf.transform([test_text])</span>
<span id="cb137-7"><a href="#cb137-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb137-8"><a href="#cb137-8" aria-hidden="true" tabindex="-1"></a><span class="co"># this is how you reload and use the TF-IDF transformer</span></span>
<span id="cb137-9"><a href="#cb137-9" aria-hidden="true" tabindex="-1"></a><span class="co"># remember it is applied to the result of bow_transformer</span></span>
<span id="cb137-10"><a href="#cb137-10" aria-hidden="true" tabindex="-1"></a>tfidf_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;tfidf_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb137-11"><a href="#cb137-11" aria-hidden="true" tabindex="-1"></a>test_tfidf <span class="op">=</span> tfidf_transf.transform(test_bow)</span>
<span id="cb137-12"><a href="#cb137-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb137-13"><a href="#cb137-13" aria-hidden="true" tabindex="-1"></a><span class="co"># here we reload the saved NaiveBayes model and use it to predict the class of our test text</span></span>
<span id="cb137-14"><a href="#cb137-14" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">&#39;nb_model.pk&#39;</span>, <span class="st">&#39;rb&#39;</span>) <span class="im">as</span> nb:</span>
<span id="cb137-15"><a href="#cb137-15" aria-hidden="true" tabindex="-1"></a>    model <span class="op">=</span> pickle.load(nb)</span>
<span id="cb137-16"><a href="#cb137-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb137-17"><a href="#cb137-17" aria-hidden="true" tabindex="-1"></a><span class="co"># model.predict(test_tfidf) ## array([3])  ## array([13])</span></span>
<span id="cb137-18"><a href="#cb137-18" aria-hidden="true" tabindex="-1"></a>foresights_id <span class="op">=</span> model.predict(test_tfidf)[<span class="dv">0</span>] <span class="co"># Access the first element of the array</span></span>
<span id="cb137-19"><a href="#cb137-19" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb137-20"><a href="#cb137-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb137-21"><a href="#cb137-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a mapping from class labels to CWE-IDs</span></span>
<span id="cb137-22"><a href="#cb137-22" aria-hidden="true" tabindex="-1"></a>label_to_cwe <span class="op">=</span> {i: cwe_id <span class="cf">for</span> i, cwe_id <span class="kw">in</span> <span class="bu">enumerate</span>(df_target.unique())}</span>
<span id="cb137-23"><a href="#cb137-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb137-24"><a href="#cb137-24" aria-hidden="true" tabindex="-1"></a><span class="co"># Use the mapping to get the CWE-ID</span></span>
<span id="cb137-25"><a href="#cb137-25" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> foresights_id <span class="kw">in</span> label_to_cwe:</span>
<span id="cb137-26"><a href="#cb137-26" aria-hidden="true" tabindex="-1"></a>    predicted_cwe_id <span class="op">=</span> label_to_cwe[foresights_id]</span>
<span id="cb137-27"><a href="#cb137-27" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>predicted_cwe_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb137-28"><a href="#cb137-28" aria-hidden="true" tabindex="-1"></a><span class="cf">else</span>:</span>
<span id="cb137-29"><a href="#cb137-29" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;The predicted label </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss"> is not in the training data.&quot;</span>)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>The predicted CWE-ID is: 5
The predicted CWE-ID is: 17
</code></pre>
</div>
</div>
<section id="in-sample--out-of-sample-foresights" class="cell markdown"
id="JDNf0XsLTHT3">
<h1>In-Sample &amp; Out-of-Sample Foresights</h1>
</section>
<div class="cell code" data-execution_count="92"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="jec8-SBaTHT3" data-outputId="05809738-4406-4a63-8ef2-c498c2a51fac">
<div class="sourceCode" id="cb139"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb139-1"><a href="#cb139-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>wget https:<span class="op">//</span>raw.githubusercontent.com<span class="op">/</span>c6ai<span class="op">/</span>temp<span class="op">/</span>main<span class="op">/</span>sard.<span class="bu">zip</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>--2024-01-07 10:35:35--  https://raw.githubusercontent.com/c6ai/temp/main/sard.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2947907 (2.8M) [application/zip]
Saving to: ‘sard.zip’

sard.zip            100%[===================&gt;]   2.81M  --.-KB/s    in 0.02s   

2024-01-07 10:35:36 (170 MB/s) - ‘sard.zip’ saved [2947907/2947907]

</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="93"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="h5iyzft6THT4" data-outputId="776608f5-e359-45df-d82c-db493892d20a">
<div class="sourceCode" id="cb141"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb141-1"><a href="#cb141-1" aria-hidden="true" tabindex="-1"></a>out_of_sample_df <span class="op">=</span> pd.read_csv(<span class="st">&#39;sard.zip&#39;</span>, compression<span class="op">=</span><span class="st">&#39;zip&#39;</span>)</span>
<span id="cb141-2"><a href="#cb141-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(out_of_sample_df.head())</span></code></pre></div>
<div class="output stream stdout">
<pre><code>                                                code CWE-Type DataType
0  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE114     SARD
1  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE114     SARD
2  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE114     SARD
3  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE114     SARD
4  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE114     SARD
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="94"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="bXjruNVeTHT4" data-outputId="8603f3a2-9b31-4712-f0f3-9210084c3647">
<div class="sourceCode" id="cb143"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb143-1"><a href="#cb143-1" aria-hidden="true" tabindex="-1"></a>out_of_sample_df <span class="co">## 52802 rows × 3 columns</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="94">

  <div id="df-075bab4a-0980-4462-a0ce-cfdd9f17fa17" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>code</th>
      <th>CWE-Type</th>
      <th>DataType</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #includ...</td>
      <td>CWE114</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>1</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #includ...</td>
      <td>CWE114</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>2</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #includ...</td>
      <td>CWE114</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>3</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #includ...</td>
      <td>CWE114</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>4</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #includ...</td>
      <td>CWE114</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>52797</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #define...</td>
      <td>CWE688</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>52798</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #define...</td>
      <td>CWE688</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>52799</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #define...</td>
      <td>CWE688</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>52800</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #define...</td>
      <td>CWE688</td>
      <td>SARD</td>
    </tr>
    <tr>
      <th>52801</th>
      <td>\n \n \n #include "IncludeMarker"\n \n #define...</td>
      <td>CWE688</td>
      <td>SARD</td>
    </tr>
  </tbody>
</table>
<p>52802 rows × 3 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-075bab4a-0980-4462-a0ce-cfdd9f17fa17')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-075bab4a-0980-4462-a0ce-cfdd9f17fa17 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-075bab4a-0980-4462-a0ce-cfdd9f17fa17');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-9a3f46b8-a6a5-48e4-8257-b2aa2cca2818">
  <button class="colab-df-quickchart" onclick="quickchart('df-9a3f46b8-a6a5-48e4-8257-b2aa2cca2818')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-9a3f46b8-a6a5-48e4-8257-b2aa2cca2818 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="95"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="t0WTfj7GTHT5" data-outputId="4019c2f5-4d20-4610-939c-6a103549fed7">
<div class="sourceCode" id="cb144"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb144-1"><a href="#cb144-1" aria-hidden="true" tabindex="-1"></a>out_of_sample_df.columns <span class="co">## Index([&#39;code&#39;, &#39;CWE-Type&#39;, &#39;DataType&#39;], dtype=&#39;object&#39;)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="95">
<pre><code>Index([&#39;code&#39;, &#39;CWE-Type&#39;, &#39;DataType&#39;], dtype=&#39;object&#39;)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="96"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Twb6lEQQTHT5" data-outputId="881b7f17-040a-4989-c7e8-684f96e7d5c3">
<div class="sourceCode" id="cb146"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb146-1"><a href="#cb146-1" aria-hidden="true" tabindex="-1"></a>cwe_list <span class="op">=</span> <span class="bu">list</span>(out_of_sample_df[<span class="st">&#39;CWE-Type&#39;</span>].unique())</span>
<span id="cb146-2"><a href="#cb146-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="bu">len</span>(cwe_list)) <span class="co">## 109</span></span>
<span id="cb146-3"><a href="#cb146-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(cwe_list)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>109
[&#39;CWE114&#39;, &#39;CWE121&#39;, &#39;CWE122&#39;, &#39;CWE123&#39;, &#39;CWE124&#39;, &#39;CWE126&#39;, &#39;CWE127&#39;, &#39;CWE134&#39;, &#39;CWE15&#39;, &#39;CWE176&#39;, &#39;CWE188&#39;, &#39;CWE190&#39;, &#39;CWE191&#39;, &#39;CWE194&#39;, &#39;CWE195&#39;, &#39;CWE196&#39;, &#39;CWE197&#39;, &#39;CWE222&#39;, &#39;CWE223&#39;, &#39;CWE226&#39;, &#39;CWE242&#39;, &#39;CWE244&#39;, &#39;CWE247&#39;, &#39;CWE252&#39;, &#39;CWE253&#39;, &#39;CWE256&#39;, &#39;CWE259&#39;, &#39;CWE272&#39;, &#39;CWE273&#39;, &#39;CWE284&#39;, &#39;CWE319&#39;, &#39;CWE321&#39;, &#39;CWE325&#39;, &#39;CWE327&#39;, &#39;CWE328&#39;, &#39;CWE338&#39;, &#39;CWE364&#39;, &#39;CWE366&#39;, &#39;CWE367&#39;, &#39;CWE369&#39;, &#39;CWE377&#39;, &#39;CWE390&#39;, &#39;CWE391&#39;, &#39;CWE398&#39;, &#39;CWE400&#39;, &#39;CWE401&#39;, &#39;CWE404&#39;, &#39;CWE415&#39;, &#39;CWE416&#39;, &#39;CWE426&#39;, &#39;CWE427&#39;, &#39;CWE457&#39;, &#39;CWE459&#39;, &#39;CWE464&#39;, &#39;CWE467&#39;, &#39;CWE468&#39;, &#39;CWE469&#39;, &#39;CWE475&#39;, &#39;CWE476&#39;, &#39;CWE478&#39;, &#39;CWE479&#39;, &#39;CWE480&#39;, &#39;CWE481&#39;, &#39;CWE482&#39;, &#39;CWE483&#39;, &#39;CWE484&#39;, &#39;CWE506&#39;, &#39;CWE510&#39;, &#39;CWE511&#39;, &#39;CWE526&#39;, &#39;CWE534&#39;, &#39;CWE535&#39;, &#39;CWE546&#39;, &#39;CWE561&#39;, &#39;CWE562&#39;, &#39;CWE563&#39;, &#39;CWE570&#39;, &#39;CWE571&#39;, &#39;CWE587&#39;, &#39;CWE588&#39;, &#39;CWE590&#39;, &#39;CWE591&#39;, &#39;CWE605&#39;, &#39;CWE606&#39;, &#39;CWE615&#39;, &#39;CWE617&#39;, &#39;CWE620&#39;, &#39;CWE665&#39;, &#39;CWE666&#39;, &#39;CWE667&#39;, &#39;CWE674&#39;, &#39;CWE675&#39;, &#39;CWE680&#39;, &#39;CWE681&#39;, &#39;CWE685&#39;, &#39;CWE690&#39;, &#39;CWE758&#39;, &#39;CWE761&#39;, &#39;CWE773&#39;, &#39;CWE775&#39;, &#39;CWE780&#39;, &#39;CWE785&#39;, &#39;CWE789&#39;, &#39;CWE78&#39;, &#39;CWE832&#39;, &#39;CWE835&#39;, &#39;CWE843&#39;, &#39;CWE90&#39;, &#39;CWE688&#39;]
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="97"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="oVC1iTjtTHT6" data-outputId="d14c3da0-f40d-4457-ae09-e7c9480a2198">
<div class="sourceCode" id="cb148"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb148-1"><a href="#cb148-1" aria-hidden="true" tabindex="-1"></a><span class="bu">len</span>(data.target_names)</span></code></pre></div>
<div class="output execute_result" data-execution_count="97">
<pre><code>25</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="98"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="YGC6qVyBTHT6" data-outputId="ec7f5b44-b976-4b91-e07d-c1a4ecf10c3c">
<div class="sourceCode" id="cb150"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb150-1"><a href="#cb150-1" aria-hidden="true" tabindex="-1"></a>data.target_names</span></code></pre></div>
<div class="output execute_result" data-execution_count="98">
<pre><code>[&#39;CWE121_Stack_Based_Buffer_Overflow&#39;,
 &#39;CWE122_Heap_Based_Buffer_Overflow&#39;,
 &#39;CWE124_Buffer_Underwrite&#39;,
 &#39;CWE126_Buffer_Overread&#39;,
 &#39;CWE127_Buffer_Underread&#39;,
 &#39;CWE134_Uncontrolled_Format_String&#39;,
 &#39;CWE190_Integer_Overflow&#39;,
 &#39;CWE191_Integer_Underflow&#39;,
 &#39;CWE194_Unexpected_Sign_Extension&#39;,
 &#39;CWE195_Signed_to_Unsigned_Conversion_Error&#39;,
 &#39;CWE197_Numeric_Truncation_Error&#39;,
 &#39;CWE23_Relative_Path_Traversal&#39;,
 &#39;CWE369_Divide_by_Zero&#39;,
 &#39;CWE36_Absolute_Path_Traversal&#39;,
 &#39;CWE400_Resource_Exhaustion&#39;,
 &#39;CWE401_Memory_Leak&#39;,
 &#39;CWE415_Double_Free&#39;,
 &#39;CWE457_Use_of_Uninitialized_Variable&#39;,
 &#39;CWE563_Unused_Variable&#39;,
 &#39;CWE590_Free_Memory_Not_on_Heap&#39;,
 &#39;CWE680_Integer_Overflow_to_Buffer_Overflow&#39;,
 &#39;CWE690_NULL_Deref_From_Return&#39;,
 &#39;CWE762_Mismatched_Memory_Management_Routines&#39;,
 &#39;CWE789_Uncontrolled_Mem_Alloc&#39;,
 &#39;CWE78_OS_Command_Injection&#39;]</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="99" id="VdRM8ds_THT6">
<div class="sourceCode" id="cb152"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb152-1"><a href="#cb152-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Assuming data.target_names is a list of target names</span></span>
<span id="cb152-2"><a href="#cb152-2" aria-hidden="true" tabindex="-1"></a>target_names <span class="op">=</span> data.target_names</span>
<span id="cb152-3"><a href="#cb152-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb152-4"><a href="#cb152-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Create an empty list to store the top 22/25 CWE-Type</span></span>
<span id="cb152-5"><a href="#cb152-5" aria-hidden="true" tabindex="-1"></a>top_22_cwe_list <span class="op">=</span> []</span>
<span id="cb152-6"><a href="#cb152-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb152-7"><a href="#cb152-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Iterate over the unique CWE-Type list</span></span>
<span id="cb152-8"><a href="#cb152-8" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> cwe_type <span class="kw">in</span> cwe_list:</span>
<span id="cb152-9"><a href="#cb152-9" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Check if the CWE-Type matches the beginning part of any target name</span></span>
<span id="cb152-10"><a href="#cb152-10" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> target <span class="kw">in</span> target_names:</span>
<span id="cb152-11"><a href="#cb152-11" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> target.startswith(cwe_type):</span>
<span id="cb152-12"><a href="#cb152-12" aria-hidden="true" tabindex="-1"></a>            top_22_cwe_list.append(cwe_type)</span>
<span id="cb152-13"><a href="#cb152-13" aria-hidden="true" tabindex="-1"></a>            <span class="cf">break</span></span>
<span id="cb152-14"><a href="#cb152-14" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Stop adding to the list after the top 22 ##!25</span></span>
<span id="cb152-15"><a href="#cb152-15" aria-hidden="true" tabindex="-1"></a>    <span class="cf">if</span> <span class="bu">len</span>(top_22_cwe_list) <span class="op">==</span> <span class="dv">22</span>:</span>
<span id="cb152-16"><a href="#cb152-16" aria-hidden="true" tabindex="-1"></a>        <span class="cf">break</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="100"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="JdVM4SLYTHT7" data-outputId="7b6ffddc-a8f5-42cf-be9f-b01491fe5d79">
<div class="sourceCode" id="cb153"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb153-1"><a href="#cb153-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="bu">len</span>(top_22_cwe_list)) <span class="co">## 22</span></span>
<span id="cb153-2"><a href="#cb153-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(top_22_cwe_list)</span>
<span id="cb153-3"><a href="#cb153-3" aria-hidden="true" tabindex="-1"></a><span class="co">## [&#39;CWE121&#39;, &#39;CWE122&#39;, &#39;CWE124&#39;, &#39;CWE126&#39;, &#39;CWE127&#39;, &#39;CWE134&#39;, &#39;CWE190&#39;, &#39;CWE191&#39;,&#39;CWE194&#39;, &#39;CWE195&#39;, &#39;CWE197&#39;,</span></span>
<span id="cb153-4"><a href="#cb153-4" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE369&#39;, &#39;CWE400&#39;, &#39;CWE401&#39;, &#39;CWE415&#39;, &#39;CWE457&#39;, &#39;CWE563&#39;, &#39;CWE590&#39;, &#39;CWE680&#39;, &#39;CWE690&#39;, &#39;CWE789&#39;, &#39;CWE78&#39;]</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>22
[&#39;CWE121&#39;, &#39;CWE122&#39;, &#39;CWE124&#39;, &#39;CWE126&#39;, &#39;CWE127&#39;, &#39;CWE134&#39;, &#39;CWE190&#39;, &#39;CWE191&#39;, &#39;CWE194&#39;, &#39;CWE195&#39;, &#39;CWE197&#39;, &#39;CWE369&#39;, &#39;CWE400&#39;, &#39;CWE401&#39;, &#39;CWE415&#39;, &#39;CWE457&#39;, &#39;CWE563&#39;, &#39;CWE590&#39;, &#39;CWE680&#39;, &#39;CWE690&#39;, &#39;CWE789&#39;, &#39;CWE78&#39;]
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="101" id="9uQA4UbHTHT7">
<div class="sourceCode" id="cb155"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb155-1"><a href="#cb155-1" aria-hidden="true" tabindex="-1"></a><span class="co">## make a top_22_cwe_testcase_samples_df with one sample each for the 1st &#39;code&#39; with &#39;CWE-Type&#39; that matches op_22_cwe_list</span></span>
<span id="cb155-2"><a href="#cb155-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb155-3"><a href="#cb155-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Assuming out_of_sample_df is the DataFrame and top_22_cwe_list is the list</span></span>
<span id="cb155-4"><a href="#cb155-4" aria-hidden="true" tabindex="-1"></a>top_22_cwe_testcase_samples_df <span class="op">=</span> pd.DataFrame()</span>
<span id="cb155-5"><a href="#cb155-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb155-6"><a href="#cb155-6" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> cwe <span class="kw">in</span> top_22_cwe_list:</span>
<span id="cb155-7"><a href="#cb155-7" aria-hidden="true" tabindex="-1"></a>    sample_df <span class="op">=</span> out_of_sample_df[out_of_sample_df[<span class="st">&#39;CWE-Type&#39;</span>] <span class="op">==</span> cwe].head(<span class="dv">1</span>)</span>
<span id="cb155-8"><a href="#cb155-8" aria-hidden="true" tabindex="-1"></a>    top_22_cwe_testcase_samples_df <span class="op">=</span> pd.concat([top_22_cwe_testcase_samples_df, sample_df])</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="102"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="1sjV-qXJTHT8" data-outputId="567ba20f-4152-402d-d2a5-294a8e58f5a2">
<div class="sourceCode" id="cb156"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb156-1"><a href="#cb156-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="bu">len</span>(top_22_cwe_testcase_samples_df)) <span class="co">## 22</span></span>
<span id="cb156-2"><a href="#cb156-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(top_22_cwe_testcase_samples_df)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>22
                                                    code CWE-Type DataType
648    \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifndef...   CWE121     SARD
6351   \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifndef...   CWE122     SARD
10044  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE124     SARD
11868  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE126     SARD
13200  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE127     SARD
15024  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifndef...   CWE134     SARD
18408  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifndef...   CWE190     SARD
23268  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifndef...   CWE191     SARD
26994  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifdef ...   CWE194     SARD
28301  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifdef ...   CWE195     SARD
29627  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifdef ...   CWE197     SARD
33468  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE369     SARD
34891  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifdef ...   CWE400     SARD
35701  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE401     SARD
37325  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE415     SARD
38555  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE457     SARD
40395  \n \n #include &quot;IncludeMarker&quot;\n \n #ifndef OM...   CWE563     SARD
40860  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE590     SARD
43314  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifdef ...   CWE680     SARD
43713  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...   CWE690     SARD
46126  \n \n \n #include &quot;IncludeMarker&quot;\n \n #ifndef...   CWE789     SARD
46666  \n \n \n #include &quot;IncludeMarker&quot;\n \n #includ...    CWE78     SARD
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="103"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:36}"
id="J-HX7FFMTHT9" data-outputId="85125c52-1ffc-47c1-cbe5-6b00be9a2680">
<div class="sourceCode" id="cb158"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb158-1"><a href="#cb158-1" aria-hidden="true" tabindex="-1"></a>top_22_cwe_testcase_samples_df.iloc[<span class="dv">0</span>, <span class="dv">1</span>] <span class="co">## CWE121</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="103">
<div class="sourceCode" id="cb159"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb159-1"><a href="#cb159-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="104" id="DMGrp1tyTHT-">
<div class="sourceCode" id="cb160"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb160-1"><a href="#cb160-1" aria-hidden="true" tabindex="-1"></a><span class="co"># print(out_of_sample_df.iloc[0, 0])</span></span>
<span id="cb160-2"><a href="#cb160-2" aria-hidden="true" tabindex="-1"></a>cwe_testcase_sample <span class="op">=</span> top_22_cwe_testcase_samples_df.iloc[<span class="dv">0</span>, <span class="dv">0</span>]</span>
<span id="cb160-3"><a href="#cb160-3" aria-hidden="true" tabindex="-1"></a><span class="co"># print(cwe_testcase_sample)</span></span></code></pre></div>
</div>
<section id="cpp_clean_files_top_10_cwe_omitted" class="cell markdown"
id="yTW4Wt5ZTHT-">
<h2>'cpp_clean_files_top_10_cwe_omitted'</h2>
</section>
<div class="cell code" data-execution_count="105"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="QqGFkBcNTHT_" data-outputId="a1ee529f-c584-42bd-e8bd-0d772cad1854">
<div class="sourceCode" id="cb161"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb161-1"><a href="#cb161-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>wget https:<span class="op">//</span>raw.githubusercontent.com<span class="op">/</span>c6ai<span class="op">/</span>temp<span class="op">/</span>main<span class="op">/</span>cpp_clean_files_top_10_cwe_omitted.tar.gz</span>
<span id="cb161-2"><a href="#cb161-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb161-3"><a href="#cb161-3" aria-hidden="true" tabindex="-1"></a><span class="co"># ## cpp_clean_files_top_10_cwe_omitted.tar.gz</span></span>
<span id="cb161-4"><a href="#cb161-4" aria-hidden="true" tabindex="-1"></a><span class="co"># !gdown 1YQHdd457W4NjuTvJYiucKUr8pRwbGulj</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>--2024-01-07 10:35:38--  https://raw.githubusercontent.com/c6ai/temp/main/cpp_clean_files_top_10_cwe_omitted.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14922 (15K) [application/octet-stream]
Saving to: ‘cpp_clean_files_top_10_cwe_omitted.tar.gz’

          cpp_clean   0%[                    ]       0  --.-KB/s               cpp_clean_files_top 100%[===================&gt;]  14.57K  --.-KB/s    in 0s      

2024-01-07 10:35:38 (93.2 MB/s) - ‘cpp_clean_files_top_10_cwe_omitted.tar.gz’ saved [14922/14922]

</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="106" id="fSBw2n_ZTHUA">
<div class="sourceCode" id="cb163"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb163-1"><a href="#cb163-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !cp -r cpp_clean_omitted_cwe_folders cpp_clean_files_top_10_cwe_omitted</span></span>
<span id="cb163-2"><a href="#cb163-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -czf cpp_clean_files_top_10_cwe_omitted.tar.gz cpp_clean_files_top_10_cwe_omitted</span></span>
<span id="cb163-3"><a href="#cb163-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb163-4"><a href="#cb163-4" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>xzf cpp_clean_files_top_10_cwe_omitted.tar.gz cpp_clean_files_top_10_cwe_omitted</span></code></pre></div>
</div>
<section id="top-10---after-prepping-cpp" class="cell markdown"
id="ySkAs70kTHUA">
<h1>Top 10 - after prepping .cpp</h1>
</section>
<div class="cell code" data-execution_count="107"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="SPViMWSQTHUB" data-outputId="8a74448b-eb42-4ffa-cca8-6182d42110b0">
<div class="sourceCode" id="cb164"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb164-1"><a href="#cb164-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize an empty list to store the data</span></span>
<span id="cb164-2"><a href="#cb164-2" aria-hidden="true" tabindex="-1"></a>data <span class="op">=</span> []</span>
<span id="cb164-3"><a href="#cb164-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb164-4"><a href="#cb164-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify the root folder</span></span>
<span id="cb164-5"><a href="#cb164-5" aria-hidden="true" tabindex="-1"></a>root_folder <span class="op">=</span> <span class="st">&#39;cpp_clean_files_top_10_cwe_omitted&#39;</span></span>
<span id="cb164-6"><a href="#cb164-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb164-7"><a href="#cb164-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory</span></span>
<span id="cb164-8"><a href="#cb164-8" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> subdir, dirs, files <span class="kw">in</span> os.walk(root_folder):</span>
<span id="cb164-9"><a href="#cb164-9" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> <span class="bu">file</span> <span class="kw">in</span> files:</span>
<span id="cb164-10"><a href="#cb164-10" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Check if the file is a .cpp file</span></span>
<span id="cb164-11"><a href="#cb164-11" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> <span class="bu">file</span>.endswith(<span class="st">&#39;.cpp&#39;</span>):</span>
<span id="cb164-12"><a href="#cb164-12" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Get the CWE-ID from the subfolder name</span></span>
<span id="cb164-13"><a href="#cb164-13" aria-hidden="true" tabindex="-1"></a>            cwe_id <span class="op">=</span> os.path.basename(subdir)</span>
<span id="cb164-14"><a href="#cb164-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb164-15"><a href="#cb164-15" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Open the file and read its content</span></span>
<span id="cb164-16"><a href="#cb164-16" aria-hidden="true" tabindex="-1"></a>            <span class="cf">with</span> <span class="bu">open</span>(os.path.join(subdir, <span class="bu">file</span>), <span class="st">&#39;r&#39;</span>) <span class="im">as</span> f:</span>
<span id="cb164-17"><a href="#cb164-17" aria-hidden="true" tabindex="-1"></a>                file_content <span class="op">=</span> f.read()</span>
<span id="cb164-18"><a href="#cb164-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb164-19"><a href="#cb164-19" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Append the data to the list</span></span>
<span id="cb164-20"><a href="#cb164-20" aria-hidden="true" tabindex="-1"></a>            data.append([cwe_id, file_content])</span>
<span id="cb164-21"><a href="#cb164-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb164-22"><a href="#cb164-22" aria-hidden="true" tabindex="-1"></a>            <span class="co"># We only need the first .cpp file from each subfolder</span></span>
<span id="cb164-23"><a href="#cb164-23" aria-hidden="true" tabindex="-1"></a>            <span class="cf">break</span></span>
<span id="cb164-24"><a href="#cb164-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb164-25"><a href="#cb164-25" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a DataFrame</span></span>
<span id="cb164-26"><a href="#cb164-26" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df <span class="op">=</span> pd.DataFrame(data, columns<span class="op">=</span>[<span class="st">&#39;CWE-ID&#39;</span>, <span class="st">&#39;Test-Case-Code&#39;</span>])</span>
<span id="cb164-27"><a href="#cb164-27" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb164-28"><a href="#cb164-28" aria-hidden="true" tabindex="-1"></a><span class="co"># Print the DataFrame</span></span>
<span id="cb164-29"><a href="#cb164-29" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(in_sample_top_10_cwe_df)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>                                         CWE-ID  \
0             CWE122_Heap_Based_Buffer_Overflow   
1                            CWE401_Memory_Leak   
2                 CWE36_Absolute_Path_Traversal   
3                       CWE127_Buffer_Underread   
4                 CWE23_Relative_Path_Traversal   
5            CWE121_Stack_Based_Buffer_Overflow   
6                            CWE415_Double_Free   
7  CWE762_Mismatched_Memory_Management_Routines   
8                CWE590_Free_Memory_Not_on_Heap   
9             CWE134_Uncontrolled_Format_String   

                                      Test-Case-Code  
0  \n\n\n#include &quot;std_testcase.h&quot;\n\n#include &lt;w...  
1  \n\n#ifndef OMITBAD\n\n#include &quot;std_testcase....  
2  \n\n\n#include &quot;std_testcase.h&quot;\n\n#ifndef _WI...  
3  \n\n\n#include &quot;std_testcase.h&quot;\n\n#include &lt;w...  
4  \n\n\n#include &quot;std_testcase.h&quot;\n\n#ifdef _WIN...  
5  \n\n\n#include &quot;std_testcase.h&quot;\n#include &lt;lis...  
6  \n\n#ifndef OMITBAD\n\n#include &quot;std_testcase....  
7  \n\n\n#include &quot;std_testcase.h&quot;\n#include &lt;lis...  
8  \n\n\n#include &quot;std_testcase.h&quot;\n\n#include &lt;w...  
9  \n\n#ifndef OMITGOOD\n\n#include &quot;std_testcase...  
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="108"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="4CERkts4THUB" data-outputId="50642b56-1e7a-48a1-fdff-54061e458c25">
<div class="sourceCode" id="cb166"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb166-1"><a href="#cb166-1" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df.columns <span class="co"># .iloc[0,0] ## Index([&#39;CWE-ID&#39;, &#39;Test-Case-Code&#39;], dtype=&#39;object&#39;)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="108">
<pre><code>Index([&#39;CWE-ID&#39;, &#39;Test-Case-Code&#39;], dtype=&#39;object&#39;)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="109"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="jD46sfIyTHUC" data-outputId="ac288de5-020c-4fe8-e444-50552e6c38f7">
<div class="sourceCode" id="cb168"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb168-1"><a href="#cb168-1" aria-hidden="true" tabindex="-1"></a><span class="co"># in_sample_top_10_cwe_df.iloc[0,1]</span></span>
<span id="cb168-2"><a href="#cb168-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(in_sample_top_10_cwe_df.iloc[<span class="dv">0</span>,<span class="dv">1</span>])</span></code></pre></div>
<div class="output stream stdout">
<pre><code>


#include &quot;std_testcase.h&quot;

#include &lt;wchar.h&gt;


static int staticReturnsTrue()
{
    return 1;
}

static int staticReturnsFalse()
{
    return 0;
}

namespace CWE805_char_memcpy_08
{

#ifndef OMITBAD

void bad()
{
    char * data;
    data = NULL;
    if(staticReturnsTrue())
    {
        
        data = new char[50];
        data[0] = &#39;\0&#39;; 
    }
    {
        char source[100];
        memset(source, &#39;C&#39;, 100-1); 
        source[100-1] = &#39;\0&#39;; 
        
        memcpy(data, source, 100*sizeof(char));
        data[100-1] = &#39;\0&#39;; 
        printLine(data);
        delete [] data;
    }
}

#endif 

#ifndef OMITGOOD


static void goodG2B1()
{
    char * data;
    data = NULL;
    if(staticReturnsFalse())
    {
        
        printLine(&quot;Benign, fixed string&quot;);
    }
    else
    {
        
        data = new char[100];
        data[0] = &#39;\0&#39;; 
    }
    {
        char source[100];
        memset(source, &#39;C&#39;, 100-1); 
        source[100-1] = &#39;\0&#39;; 
        
        memcpy(data, source, 100*sizeof(char));
        data[100-1] = &#39;\0&#39;; 
        printLine(data);
        delete [] data;
    }
}


static void goodG2B2()
{
    char * data;
    data = NULL;
    if(staticReturnsTrue())
    {
        
        data = new char[100];
        data[0] = &#39;\0&#39;; 
    }
    {
        char source[100];
        memset(source, &#39;C&#39;, 100-1); 
        source[100-1] = &#39;\0&#39;; 
        
        memcpy(data, source, 100*sizeof(char));
        data[100-1] = &#39;\0&#39;; 
        printLine(data);
        delete [] data;
    }
}

void good()
{
    goodG2B1();
    goodG2B2();
}

#endif 

} 


#ifdef INCLUDEMAIN

using namespace CWE805_char_memcpy_08; 

int main(int argc, char * argv[])
{
    
    srand( (unsigned)time(NULL) );
#ifndef OMITGOOD
    printLine(&quot;Calling good()...&quot;);
    good();
    printLine(&quot;Finished good()&quot;);
#endif 
#ifndef OMITBAD
    printLine(&quot;Calling bad()...&quot;);
    bad();
    printLine(&quot;Finished bad()&quot;);
#endif 
    return 0;
}

#endif

</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="110"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:36}"
id="9CsIG-A5THUC" data-outputId="ff0132a8-9b59-41a4-f456-f96650a8e672">
<div class="sourceCode" id="cb170"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb170-1"><a href="#cb170-1" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df.iloc[<span class="dv">0</span>,<span class="dv">0</span>] <span class="co">## CWE122_Heap_Based_Buffer_Overflow</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="110">
<div class="sourceCode" id="cb171"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb171-1"><a href="#cb171-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="111"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="-O8GIMpgTHUC" data-outputId="352ce07e-47cb-4bd0-b21c-c464a6c39767">
<div class="sourceCode" id="cb172"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb172-1"><a href="#cb172-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(in_sample_top_10_cwe_df.shape) <span class="co"># (10, 2)</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>(10, 2)
</code></pre>
</div>
</div>
<div class="cell markdown" id="W1H7gWFnTHUC">
<p>..........</p>
</div>
<div class="cell code" data-execution_count="112"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="K2gEPruQTHUD" data-outputId="71805502-5f2d-4e87-fb5f-f1e8d8415d7a">
<div class="sourceCode" id="cb174"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb174-1"><a href="#cb174-1" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df.columns <span class="co">## Index([&#39;CWE-ID&#39;, &#39;Test-Case-Code&#39;], dtype=&#39;object&#39;)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="112">
<pre><code>Index([&#39;CWE-ID&#39;, &#39;Test-Case-Code&#39;], dtype=&#39;object&#39;)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="113"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="tVrP6ZupTHUD" data-outputId="c3f897ec-ca9a-48c9-d35b-74f80734e921">
<div class="sourceCode" id="cb176"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb176-1"><a href="#cb176-1" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df[<span class="st">&#39;CWE-ID&#39;</span>]</span></code></pre></div>
<div class="output execute_result" data-execution_count="113">
<pre><code>0               CWE122_Heap_Based_Buffer_Overflow
1                              CWE401_Memory_Leak
2                   CWE36_Absolute_Path_Traversal
3                         CWE127_Buffer_Underread
4                   CWE23_Relative_Path_Traversal
5              CWE121_Stack_Based_Buffer_Overflow
6                              CWE415_Double_Free
7    CWE762_Mismatched_Memory_Management_Routines
8                  CWE590_Free_Memory_Not_on_Heap
9               CWE134_Uncontrolled_Format_String
Name: CWE-ID, dtype: object</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="114"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:362}"
id="ym4mPt7UTHUD" data-outputId="fd517900-dc50-4a52-aca1-afbbedfd8c7b">
<div class="sourceCode" id="cb178"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb178-1"><a href="#cb178-1" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df</span></code></pre></div>
<div class="output execute_result" data-execution_count="114">

  <div id="df-cae5c8d8-2703-4df0-be4a-31c1dd8a0da4" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>CWE-ID</th>
      <th>Test-Case-Code</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>CWE122_Heap_Based_Buffer_Overflow</td>
      <td>\n\n\n#include "std_testcase.h"\n\n#include &lt;w...</td>
    </tr>
    <tr>
      <th>1</th>
      <td>CWE401_Memory_Leak</td>
      <td>\n\n#ifndef OMITBAD\n\n#include "std_testcase....</td>
    </tr>
    <tr>
      <th>2</th>
      <td>CWE36_Absolute_Path_Traversal</td>
      <td>\n\n\n#include "std_testcase.h"\n\n#ifndef _WI...</td>
    </tr>
    <tr>
      <th>3</th>
      <td>CWE127_Buffer_Underread</td>
      <td>\n\n\n#include "std_testcase.h"\n\n#include &lt;w...</td>
    </tr>
    <tr>
      <th>4</th>
      <td>CWE23_Relative_Path_Traversal</td>
      <td>\n\n\n#include "std_testcase.h"\n\n#ifdef _WIN...</td>
    </tr>
    <tr>
      <th>5</th>
      <td>CWE121_Stack_Based_Buffer_Overflow</td>
      <td>\n\n\n#include "std_testcase.h"\n#include &lt;lis...</td>
    </tr>
    <tr>
      <th>6</th>
      <td>CWE415_Double_Free</td>
      <td>\n\n#ifndef OMITBAD\n\n#include "std_testcase....</td>
    </tr>
    <tr>
      <th>7</th>
      <td>CWE762_Mismatched_Memory_Management_Routines</td>
      <td>\n\n\n#include "std_testcase.h"\n#include &lt;lis...</td>
    </tr>
    <tr>
      <th>8</th>
      <td>CWE590_Free_Memory_Not_on_Heap</td>
      <td>\n\n\n#include "std_testcase.h"\n\n#include &lt;w...</td>
    </tr>
    <tr>
      <th>9</th>
      <td>CWE134_Uncontrolled_Format_String</td>
      <td>\n\n#ifndef OMITGOOD\n\n#include "std_testcase...</td>
    </tr>
  </tbody>
</table>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-cae5c8d8-2703-4df0-be4a-31c1dd8a0da4')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-cae5c8d8-2703-4df0-be4a-31c1dd8a0da4 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-cae5c8d8-2703-4df0-be4a-31c1dd8a0da4');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-275dc5b4-2de0-43d5-99e4-9054f024881c">
  <button class="colab-df-quickchart" onclick="quickchart('df-275dc5b4-2de0-43d5-99e4-9054f024881c')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-275dc5b4-2de0-43d5-99e4-9054f024881c button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="115"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="RG1eWfNWTHUE" data-outputId="531042d2-ed45-48d1-9154-1d4860b1930d">
<div class="sourceCode" id="cb179"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb179-1"><a href="#cb179-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb179-2"><a href="#cb179-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb179-3"><a href="#cb179-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Specify the indices</span></span>
<span id="cb179-4"><a href="#cb179-4" aria-hidden="true" tabindex="-1"></a><span class="co"># indices = [0,1,2]</span></span>
<span id="cb179-5"><a href="#cb179-5" aria-hidden="true" tabindex="-1"></a>indices <span class="op">=</span> [<span class="dv">3</span>,<span class="dv">4</span>,<span class="dv">5</span>,<span class="dv">6</span>]</span>
<span id="cb179-6"><a href="#cb179-6" aria-hidden="true" tabindex="-1"></a><span class="co"># indices = [7,8,9]</span></span>
<span id="cb179-7"><a href="#cb179-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb179-8"><a href="#cb179-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Load the transformers and the model</span></span>
<span id="cb179-9"><a href="#cb179-9" aria-hidden="true" tabindex="-1"></a>bow_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;bow_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb179-10"><a href="#cb179-10" aria-hidden="true" tabindex="-1"></a>tfidf_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;tfidf_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb179-11"><a href="#cb179-11" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">&#39;nb_model.pk&#39;</span>, <span class="st">&#39;rb&#39;</span>) <span class="im">as</span> nb:</span>
<span id="cb179-12"><a href="#cb179-12" aria-hidden="true" tabindex="-1"></a>    model <span class="op">=</span> pickle.load(nb)</span>
<span id="cb179-13"><a href="#cb179-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb179-14"><a href="#cb179-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a mapping from class labels to CWE-IDs</span></span>
<span id="cb179-15"><a href="#cb179-15" aria-hidden="true" tabindex="-1"></a>label_to_cwe <span class="op">=</span> {i: cwe_id <span class="cf">for</span> i, cwe_id <span class="kw">in</span> <span class="bu">enumerate</span>(in_sample_top_10_cwe_df[<span class="st">&#39;CWE-ID&#39;</span>].unique())}</span>
<span id="cb179-16"><a href="#cb179-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb179-17"><a href="#cb179-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Loop through the specified indices</span></span>
<span id="cb179-18"><a href="#cb179-18" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> in_sample_top_10_cwe_index <span class="kw">in</span> indices:</span>
<span id="cb179-19"><a href="#cb179-19" aria-hidden="true" tabindex="-1"></a>    test_text <span class="op">=</span> in_sample_top_10_cwe_df.iloc[in_sample_top_10_cwe_index,<span class="dv">1</span>]</span>
<span id="cb179-20"><a href="#cb179-20" aria-hidden="true" tabindex="-1"></a>    test_bow <span class="op">=</span> bow_transf.transform([test_text])</span>
<span id="cb179-21"><a href="#cb179-21" aria-hidden="true" tabindex="-1"></a>    test_tfidf <span class="op">=</span> tfidf_transf.transform(test_bow)</span>
<span id="cb179-22"><a href="#cb179-22" aria-hidden="true" tabindex="-1"></a>    foresights_id <span class="op">=</span> model.predict(test_tfidf)[<span class="dv">0</span>] <span class="co"># Access the first element of the array</span></span>
<span id="cb179-23"><a href="#cb179-23" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="st">&#39;-----------------------------------------&#39;</span>)</span>
<span id="cb179-24"><a href="#cb179-24" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb179-25"><a href="#cb179-25" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Use the mapping to get the CWE-ID</span></span>
<span id="cb179-26"><a href="#cb179-26" aria-hidden="true" tabindex="-1"></a>    <span class="cf">if</span> foresights_id <span class="kw">in</span> label_to_cwe:</span>
<span id="cb179-27"><a href="#cb179-27" aria-hidden="true" tabindex="-1"></a>        predicted_cwe_id <span class="op">=</span> label_to_cwe[foresights_id]</span>
<span id="cb179-28"><a href="#cb179-28" aria-hidden="true" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>predicted_cwe_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb179-29"><a href="#cb179-29" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Check if the model&#39;s prediction matches the actual CWE-ID</span></span>
<span id="cb179-30"><a href="#cb179-30" aria-hidden="true" tabindex="-1"></a>        actual_cwe_id <span class="op">=</span> in_sample_top_10_cwe_df.iloc[in_sample_top_10_cwe_index][<span class="st">&#39;CWE-ID&#39;</span>]</span>
<span id="cb179-31"><a href="#cb179-31" aria-hidden="true" tabindex="-1"></a>        is_correct <span class="op">=</span> (actual_cwe_id <span class="op">==</span> predicted_cwe_id)</span>
<span id="cb179-32"><a href="#cb179-32" aria-hidden="true" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&quot;Is the prediction correct? </span><span class="sc">{</span>is_correct<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb179-33"><a href="#cb179-33" aria-hidden="true" tabindex="-1"></a>    <span class="cf">else</span>:</span>
<span id="cb179-34"><a href="#cb179-34" aria-hidden="true" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&quot;The predicted label </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss"> is not in the training data.&quot;</span>)</span>
<span id="cb179-35"><a href="#cb179-35" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb179-36"><a href="#cb179-36" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 52.5 ms, sys: 4.17 ms, total: 56.7 ms</span></span>
<span id="cb179-37"><a href="#cb179-37" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 103 ms</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>-----------------------------------------
The predicted CWE-ID is: 4
The predicted CWE-ID is: CWE23_Relative_Path_Traversal
Is the prediction correct? False
-----------------------------------------
The predicted CWE-ID is: 11
The predicted label 11 is not in the training data.
-----------------------------------------
The predicted CWE-ID is: 0
The predicted CWE-ID is: CWE122_Heap_Based_Buffer_Overflow
Is the prediction correct? False
-----------------------------------------
The predicted CWE-ID is: 16
The predicted label 16 is not in the training data.
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="116" id="YGRaJwtKTHUE">
<div class="sourceCode" id="cb181"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb181-1"><a href="#cb181-1" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb181-2"><a href="#cb181-2" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 19</span></span>
<span id="cb181-3"><a href="#cb181-3" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 19 is not in the training data.</span></span>
<span id="cb181-4"><a href="#cb181-4" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb181-5"><a href="#cb181-5" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 4</span></span>
<span id="cb181-6"><a href="#cb181-6" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: CWE127_Buffer_Underread</span></span>
<span id="cb181-7"><a href="#cb181-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? True</span></span>
<span id="cb181-8"><a href="#cb181-8" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb181-9"><a href="#cb181-9" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 5</span></span>
<span id="cb181-10"><a href="#cb181-10" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: CWE134_Uncontrolled_Format_String</span></span>
<span id="cb181-11"><a href="#cb181-11" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? True</span></span>
<span id="cb181-12"><a href="#cb181-12" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb181-13"><a href="#cb181-13" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: 0</span></span>
<span id="cb181-14"><a href="#cb181-14" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID is: CWE122_Heap_Based_Buffer_Overflow</span></span>
<span id="cb181-15"><a href="#cb181-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? False</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="117" id="njc8dE8eTHUF">
<div class="sourceCode" id="cb182"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb182-1"><a href="#cb182-1" aria-hidden="true" tabindex="-1"></a><span class="co"># # Load the transformers and the model</span></span>
<span id="cb182-2"><a href="#cb182-2" aria-hidden="true" tabindex="-1"></a><span class="co"># bow_transf = pickle.load(open(&quot;bow_transformer.pk&quot;, &quot;rb&quot;))</span></span>
<span id="cb182-3"><a href="#cb182-3" aria-hidden="true" tabindex="-1"></a><span class="co"># tfidf_transf = pickle.load(open(&quot;tfidf_transformer.pk&quot;, &quot;rb&quot;))</span></span>
<span id="cb182-4"><a href="#cb182-4" aria-hidden="true" tabindex="-1"></a><span class="co"># with open(&#39;nb_model.pk&#39;, &#39;rb&#39;) as nb:</span></span>
<span id="cb182-5"><a href="#cb182-5" aria-hidden="true" tabindex="-1"></a><span class="co">#     model = pickle.load(nb)</span></span>
<span id="cb182-6"><a href="#cb182-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb182-7"><a href="#cb182-7" aria-hidden="true" tabindex="-1"></a><span class="co"># # Create a mapping from class labels to CWE-IDs</span></span>
<span id="cb182-8"><a href="#cb182-8" aria-hidden="true" tabindex="-1"></a><span class="co"># label_to_cwe = {i: cwe_id for i, cwe_id in enumerate(in_sample_top_10_cwe_df[&#39;CWE-ID&#39;].unique())}</span></span>
<span id="cb182-9"><a href="#cb182-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb182-10"><a href="#cb182-10" aria-hidden="true" tabindex="-1"></a><span class="co"># # Loop through all 10 samples</span></span>
<span id="cb182-11"><a href="#cb182-11" aria-hidden="true" tabindex="-1"></a><span class="co"># for in_sample_top_10_cwe_index in range(10):</span></span>
<span id="cb182-12"><a href="#cb182-12" aria-hidden="true" tabindex="-1"></a><span class="co">#     test_text = in_sample_top_10_cwe_df.iloc[in_sample_top_10_cwe_index,1]</span></span>
<span id="cb182-13"><a href="#cb182-13" aria-hidden="true" tabindex="-1"></a><span class="co">#     test_bow = bow_transf.transform([test_text])</span></span>
<span id="cb182-14"><a href="#cb182-14" aria-hidden="true" tabindex="-1"></a><span class="co">#     test_tfidf = tfidf_transf.transform(test_bow)</span></span>
<span id="cb182-15"><a href="#cb182-15" aria-hidden="true" tabindex="-1"></a><span class="co">#     foresights_id = model.predict(test_tfidf)[0] # Access the first element of the array</span></span>
<span id="cb182-16"><a href="#cb182-16" aria-hidden="true" tabindex="-1"></a><span class="co">#     print(&#39;-----------------------------------------&#39;)</span></span>
<span id="cb182-17"><a href="#cb182-17" aria-hidden="true" tabindex="-1"></a><span class="co">#     print(f&quot;The predicted CWE-ID index is: {foresights_id}&quot;)</span></span>
<span id="cb182-18"><a href="#cb182-18" aria-hidden="true" tabindex="-1"></a><span class="co">#     # Use the mapping to get the CWE-ID</span></span>
<span id="cb182-19"><a href="#cb182-19" aria-hidden="true" tabindex="-1"></a><span class="co">#     if foresights_id in label_to_cwe:</span></span>
<span id="cb182-20"><a href="#cb182-20" aria-hidden="true" tabindex="-1"></a><span class="co">#         predicted_cwe_id = label_to_cwe[foresights_id]</span></span>
<span id="cb182-21"><a href="#cb182-21" aria-hidden="true" tabindex="-1"></a><span class="co">#         print(f&quot;The predicted CWE-ID label is: {predicted_cwe_id}&quot;)</span></span>
<span id="cb182-22"><a href="#cb182-22" aria-hidden="true" tabindex="-1"></a><span class="co">#         # Check if the model&#39;s prediction matches the actual CWE-ID</span></span>
<span id="cb182-23"><a href="#cb182-23" aria-hidden="true" tabindex="-1"></a><span class="co">#         actual_cwe_id = in_sample_top_10_cwe_df.iloc[in_sample_top_10_cwe_index][&#39;CWE-ID&#39;]</span></span>
<span id="cb182-24"><a href="#cb182-24" aria-hidden="true" tabindex="-1"></a><span class="co">#         is_correct = (actual_cwe_id == predicted_cwe_id)</span></span>
<span id="cb182-25"><a href="#cb182-25" aria-hidden="true" tabindex="-1"></a><span class="co">#         print(f&quot;Is the prediction correct? {is_correct}&quot;)</span></span>
<span id="cb182-26"><a href="#cb182-26" aria-hidden="true" tabindex="-1"></a><span class="co">#     else:</span></span>
<span id="cb182-27"><a href="#cb182-27" aria-hidden="true" tabindex="-1"></a><span class="co">#         print(f&quot;The predicted label {foresights_id} is not in the training data.&quot;)</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="118" id="H7zuZqy8THUG">
<div class="sourceCode" id="cb183"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb183-1"><a href="#cb183-1" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-2"><a href="#cb183-2" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 3</span></span>
<span id="cb183-3"><a href="#cb183-3" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID label is: CWE415_Double_Free</span></span>
<span id="cb183-4"><a href="#cb183-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? False</span></span>
<span id="cb183-5"><a href="#cb183-5" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-6"><a href="#cb183-6" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 11</span></span>
<span id="cb183-7"><a href="#cb183-7" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 11 is not in the training data.</span></span>
<span id="cb183-8"><a href="#cb183-8" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-9"><a href="#cb183-9" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 19</span></span>
<span id="cb183-10"><a href="#cb183-10" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 19 is not in the training data.</span></span>
<span id="cb183-11"><a href="#cb183-11" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-12"><a href="#cb183-12" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 19</span></span>
<span id="cb183-13"><a href="#cb183-13" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 19 is not in the training data.</span></span>
<span id="cb183-14"><a href="#cb183-14" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-15"><a href="#cb183-15" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 4</span></span>
<span id="cb183-16"><a href="#cb183-16" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID label is: CWE127_Buffer_Underread</span></span>
<span id="cb183-17"><a href="#cb183-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? True</span></span>
<span id="cb183-18"><a href="#cb183-18" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-19"><a href="#cb183-19" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 5</span></span>
<span id="cb183-20"><a href="#cb183-20" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID label is: CWE134_Uncontrolled_Format_String</span></span>
<span id="cb183-21"><a href="#cb183-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? True</span></span>
<span id="cb183-22"><a href="#cb183-22" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-23"><a href="#cb183-23" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 0</span></span>
<span id="cb183-24"><a href="#cb183-24" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID label is: CWE122_Heap_Based_Buffer_Overflow</span></span>
<span id="cb183-25"><a href="#cb183-25" aria-hidden="true" tabindex="-1"></a><span class="co"># Is the prediction correct? False</span></span>
<span id="cb183-26"><a href="#cb183-26" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-27"><a href="#cb183-27" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 11</span></span>
<span id="cb183-28"><a href="#cb183-28" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 11 is not in the training data.</span></span>
<span id="cb183-29"><a href="#cb183-29" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-30"><a href="#cb183-30" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 22</span></span>
<span id="cb183-31"><a href="#cb183-31" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 22 is not in the training data.</span></span>
<span id="cb183-32"><a href="#cb183-32" aria-hidden="true" tabindex="-1"></a><span class="co"># -----------------------------------------</span></span>
<span id="cb183-33"><a href="#cb183-33" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted CWE-ID index is: 15</span></span>
<span id="cb183-34"><a href="#cb183-34" aria-hidden="true" tabindex="-1"></a><span class="co"># The predicted label 15 is not in the training data.</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="119"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:89}"
id="Lcfu7vawTHUH" data-outputId="81850c78-55c8-4f36-bc3b-feca57c9d6a8">
<div class="sourceCode" id="cb184"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb184-1"><a href="#cb184-1" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df.iloc[<span class="dv">6</span>,<span class="dv">1</span>]</span></code></pre></div>
<div class="output execute_result" data-execution_count="119">
<div class="sourceCode" id="cb185"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb185-1"><a href="#cb185-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="120"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:36}"
id="y-8_87NaTHUH" data-outputId="f8def3ba-72d2-4066-991b-0548cad95189">
<div class="sourceCode" id="cb186"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb186-1"><a href="#cb186-1" aria-hidden="true" tabindex="-1"></a>in_sample_top_10_cwe_df.iloc[<span class="dv">6</span>,<span class="dv">0</span>]</span></code></pre></div>
<div class="output execute_result" data-execution_count="120">
<div class="sourceCode" id="cb187"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb187-1"><a href="#cb187-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="121"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="muE8tq3DTHUI" data-outputId="d48e3573-9fe5-4c71-8d83-ecfcc9f56e95">
<div class="sourceCode" id="cb188"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb188-1"><a href="#cb188-1" aria-hidden="true" tabindex="-1"></a><span class="co">## print out the actual predicted CWE-ID instaed of array([3])</span></span>
<span id="cb188-2"><a href="#cb188-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb188-3"><a href="#cb188-3" aria-hidden="true" tabindex="-1"></a>test_text <span class="op">=</span> in_sample_top_10_cwe_df.iloc[<span class="dv">6</span>,<span class="dv">1</span>]</span>
<span id="cb188-4"><a href="#cb188-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb188-5"><a href="#cb188-5" aria-hidden="true" tabindex="-1"></a><span class="co"># this is how you reload and use the BoW transformer</span></span>
<span id="cb188-6"><a href="#cb188-6" aria-hidden="true" tabindex="-1"></a>bow_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;bow_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb188-7"><a href="#cb188-7" aria-hidden="true" tabindex="-1"></a>test_bow <span class="op">=</span> bow_transf.transform([test_text])</span>
<span id="cb188-8"><a href="#cb188-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb188-9"><a href="#cb188-9" aria-hidden="true" tabindex="-1"></a><span class="co"># this is how you reload and use the TF-IDF transformer</span></span>
<span id="cb188-10"><a href="#cb188-10" aria-hidden="true" tabindex="-1"></a><span class="co"># remember it is applied to the result of bow_transformer</span></span>
<span id="cb188-11"><a href="#cb188-11" aria-hidden="true" tabindex="-1"></a>tfidf_transf <span class="op">=</span> pickle.load(<span class="bu">open</span>(<span class="st">&quot;tfidf_transformer.pk&quot;</span>, <span class="st">&quot;rb&quot;</span>))</span>
<span id="cb188-12"><a href="#cb188-12" aria-hidden="true" tabindex="-1"></a>test_tfidf <span class="op">=</span> tfidf_transf.transform(test_bow)</span>
<span id="cb188-13"><a href="#cb188-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb188-14"><a href="#cb188-14" aria-hidden="true" tabindex="-1"></a><span class="co"># here we reload the saved NaiveBayes model and use it to predict the class of our test text</span></span>
<span id="cb188-15"><a href="#cb188-15" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">&#39;nb_model.pk&#39;</span>, <span class="st">&#39;rb&#39;</span>) <span class="im">as</span> nb:</span>
<span id="cb188-16"><a href="#cb188-16" aria-hidden="true" tabindex="-1"></a>    model <span class="op">=</span> pickle.load(nb)</span>
<span id="cb188-17"><a href="#cb188-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb188-18"><a href="#cb188-18" aria-hidden="true" tabindex="-1"></a><span class="co"># model.predict(test_tfidf) ## array([3])  ## array([13])</span></span>
<span id="cb188-19"><a href="#cb188-19" aria-hidden="true" tabindex="-1"></a>foresights_id <span class="op">=</span> model.predict(test_tfidf)[<span class="dv">0</span>] <span class="co"># Access the first element of the array</span></span>
<span id="cb188-20"><a href="#cb188-20" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb188-21"><a href="#cb188-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb188-22"><a href="#cb188-22" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a mapping from class labels to CWE-IDs</span></span>
<span id="cb188-23"><a href="#cb188-23" aria-hidden="true" tabindex="-1"></a>label_to_cwe <span class="op">=</span> {i: cwe_id <span class="cf">for</span> i, cwe_id <span class="kw">in</span> <span class="bu">enumerate</span>(in_sample_top_10_cwe_df[<span class="st">&#39;CWE-ID&#39;</span>].unique())}</span>
<span id="cb188-24"><a href="#cb188-24" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb188-25"><a href="#cb188-25" aria-hidden="true" tabindex="-1"></a><span class="co"># Use the mapping to get the CWE-ID</span></span>
<span id="cb188-26"><a href="#cb188-26" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> foresights_id <span class="kw">in</span> label_to_cwe:</span>
<span id="cb188-27"><a href="#cb188-27" aria-hidden="true" tabindex="-1"></a>    predicted_cwe_id <span class="op">=</span> label_to_cwe[foresights_id]</span>
<span id="cb188-28"><a href="#cb188-28" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;The predicted CWE-ID is: </span><span class="sc">{</span>predicted_cwe_id<span class="sc">}</span><span class="ss">&quot;</span>)</span>
<span id="cb188-29"><a href="#cb188-29" aria-hidden="true" tabindex="-1"></a><span class="cf">else</span>:</span>
<span id="cb188-30"><a href="#cb188-30" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;The predicted label </span><span class="sc">{</span>foresights_id<span class="sc">}</span><span class="ss"> is not in the training data.&quot;</span>)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>The predicted CWE-ID is: 16
The predicted label 16 is not in the training data.
</code></pre>
</div>
</div>
<div class="cell markdown" id="QS44H6JzTHUI">
<p>.......</p>
</div>
<section id="baselinecommon-sense-models-" class="cell markdown"
id="EHuiSsECTHUJ">
<h1>Baseline/Common-sense MODELs ...</h1>
</section>
<div class="cell code" data-execution_count="122" id="HMsgbWm7THUJ">
<div class="sourceCode" id="cb190"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb190-1"><a href="#cb190-1" aria-hidden="true" tabindex="-1"></a><span class="co"># text_tfidf = tfidf_transformer.transform(text_bow)</span></span>
<span id="cb190-2"><a href="#cb190-2" aria-hidden="true" tabindex="-1"></a><span class="co"># print(text_tfidf.shape) ## (7628, 12766)</span></span>
<span id="cb190-3"><a href="#cb190-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb190-4"><a href="#cb190-4" aria-hidden="true" tabindex="-1"></a><span class="co"># # Train non-leaking samples</span></span>
<span id="cb190-5"><a href="#cb190-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb190-6"><a href="#cb190-6" aria-hidden="true" tabindex="-1"></a><span class="co"># # text_tfidf.shape ## (7628, 12766)</span></span>
<span id="cb190-7"><a href="#cb190-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb190-8"><a href="#cb190-8" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> train_test_split</span>
<span id="cb190-9"><a href="#cb190-9" aria-hidden="true" tabindex="-1"></a><span class="co"># X_train, X_test, y_train, y_test = train_test_split(text_tfidf, df[&#39;Category&#39;], test_size=0.2, random_state=0) ## 11</span></span>
<span id="cb190-10"><a href="#cb190-10" aria-hidden="true" tabindex="-1"></a>X_train, X_test, y_train, y_test <span class="op">=</span> train_test_split(text_tfidf, df_target, test_size<span class="op">=</span><span class="fl">0.2</span>, random_state<span class="op">=</span><span class="dv">0</span>) <span class="co">## 11</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="123"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="BuQS7ZGoTHUK" data-outputId="80bd1887-b841-4783-8a08-c1425589d9b6">
<div class="sourceCode" id="cb191"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb191-1"><a href="#cb191-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb191-2"><a href="#cb191-2" aria-hidden="true" tabindex="-1"></a><span class="co"># text_tfidf[2:4,1000:2050].toarray()</span></span>
<span id="cb191-3"><a href="#cb191-3" aria-hidden="true" tabindex="-1"></a><span class="co"># text_tfidf[&#39;Category&#39;].toarray()</span></span>
<span id="cb191-4"><a href="#cb191-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb191-5"><a href="#cb191-5" aria-hidden="true" tabindex="-1"></a><span class="co"># test_bow = bow_transformer.transform([input_text])</span></span>
<span id="cb191-6"><a href="#cb191-6" aria-hidden="true" tabindex="-1"></a>test_bow <span class="op">=</span> bow_transformer.transform(df[<span class="st">&#39;Text&#39;</span>])</span>
<span id="cb191-7"><a href="#cb191-7" aria-hidden="true" tabindex="-1"></a>test_data <span class="op">=</span> tfidf_transformer.transform(test_bow)</span>
<span id="cb191-8"><a href="#cb191-8" aria-hidden="true" tabindex="-1"></a><span class="co"># model.predict(test_data)</span></span>
<span id="cb191-9"><a href="#cb191-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb191-10"><a href="#cb191-10" aria-hidden="true" tabindex="-1"></a>test_data.shape <span class="co">## (7628, 12766)</span></span>
<span id="cb191-11"><a href="#cb191-11" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 16.8 s, sys: 36.6 ms, total: 16.8 s</span></span>
<span id="cb191-12"><a href="#cb191-12" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 16.8 s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CPU times: user 15.1 s, sys: 257 ms, total: 15.4 s
Wall time: 15.5 s
</code></pre>
</div>
<div class="output execute_result" data-execution_count="123">
<pre><code>(7628, 12766)</code></pre>
</div>
</div>
<div class="cell markdown" id="qU1_qvLKTHUK">
<p>.............</p>
</div>
<div class="cell code" data-execution_count="124"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="ueLuaakFTHUK" data-outputId="7d4bd319-ff94-4401-8c35-dc30702b41a5">
<div class="sourceCode" id="cb194"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb194-1"><a href="#cb194-1" aria-hidden="true" tabindex="-1"></a><span class="co"># X_text=data_filtered.iloc[:,0] # subset of only [Description] raw text</span></span>
<span id="cb194-2"><a href="#cb194-2" aria-hidden="true" tabindex="-1"></a>X_text<span class="op">=</span>text_tfidf <span class="co">## OK!</span></span>
<span id="cb194-3"><a href="#cb194-3" aria-hidden="true" tabindex="-1"></a>X_text</span></code></pre></div>
<div class="output execute_result" data-execution_count="124">
<pre><code>&lt;7628x12766 sparse matrix of type &#39;&lt;class &#39;numpy.float64&#39;&gt;&#39;
	with 316364 stored elements in Compressed Sparse Row format&gt;</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="125"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="szchcKdTTHUL" data-outputId="2c3809f2-8f16-4309-d15a-b01e55de01b6">
<div class="sourceCode" id="cb196"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb196-1"><a href="#cb196-1" aria-hidden="true" tabindex="-1"></a><span class="co"># y_category=data_filtered.iloc[:,1:4] # subset of only [Category, Level_2, Level_3] Category cols</span></span>
<span id="cb196-2"><a href="#cb196-2" aria-hidden="true" tabindex="-1"></a>y_category<span class="op">=</span>df_target  <span class="co"># df_cat # data[&#39;Category&#39;] ## dataset.target</span></span>
<span id="cb196-3"><a href="#cb196-3" aria-hidden="true" tabindex="-1"></a>y_category</span></code></pre></div>
<div class="output execute_result" data-execution_count="125">
<pre><code>0       11
1        3
2       18
3       24
4        4
        ..
7623    16
7624    11
7625     5
7626     8
7627     8
Name: Category, Length: 7628, dtype: int64</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="126" id="Nhsa2wo_THUL">
<div class="sourceCode" id="cb198"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb198-1"><a href="#cb198-1" aria-hidden="true" tabindex="-1"></a>X <span class="op">=</span> text_tfidf <span class="co"># data[&#39;Text&#39;] ## dataset.data #! ## ValueError: could not convert string to float: &#39;peopl allar ...&#39;</span></span>
<span id="cb198-2"><a href="#cb198-2" aria-hidden="true" tabindex="-1"></a><span class="co"># y = data[&#39;Category&#39;]  # df_cat # data[&#39;Category&#39;] ## dataset.target</span></span>
<span id="cb198-3"><a href="#cb198-3" aria-hidden="true" tabindex="-1"></a>y <span class="op">=</span> df_target  <span class="co"># df_cat # data[&#39;Category&#39;] ## dataset.target</span></span>
<span id="cb198-4"><a href="#cb198-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb198-5"><a href="#cb198-5" aria-hidden="true" tabindex="-1"></a>X_train, X_test, y_train, y_test <span class="op">=</span> train_test_split(X, y, test_size <span class="op">=</span> <span class="fl">0.1</span>, random_state <span class="op">=</span><span class="dv">0</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="127" id="_DTzY8x-THUL">
<div class="sourceCode" id="cb199"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb199-1"><a href="#cb199-1" aria-hidden="true" tabindex="-1"></a><span class="co">## Train/Test split</span></span>
<span id="cb199-2"><a href="#cb199-2" aria-hidden="true" tabindex="-1"></a><span class="co"># train,test=train_test_split(data_drop_na,test_size=0.25,random_state=0)</span></span>
<span id="cb199-3"><a href="#cb199-3" aria-hidden="true" tabindex="-1"></a><span class="co"># X_tr,X_te,y_tr,y_te=train_test_split(X_text,y_category,test_size=0.2,random_state=0) # test_size=0.25</span></span>
<span id="cb199-4"><a href="#cb199-4" aria-hidden="true" tabindex="-1"></a>X_tr,X_te,y_tr,y_te<span class="op">=</span>train_test_split(X_text,y_category,test_size<span class="op">=</span><span class="fl">0.2</span>,random_state<span class="op">=</span><span class="dv">0</span>) <span class="co"># test_size=0.25</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="128"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="spR-V8sfTHUM" data-outputId="467d2ff8-0d4c-49c4-e9e9-aebc918efb7d">
<div class="sourceCode" id="cb200"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb200-1"><a href="#cb200-1" aria-hidden="true" tabindex="-1"></a><span class="co">## train.shape ## (7977, 4) ## (7986, 16506)</span></span>
<span id="cb200-2"><a href="#cb200-2" aria-hidden="true" tabindex="-1"></a><span class="co"># y_tr.shape ## (7970, 3) ## (7977, 3)</span></span>
<span id="cb200-3"><a href="#cb200-3" aria-hidden="true" tabindex="-1"></a>y_te.shape <span class="co">## (1526, 1)  ## (2657, 3) ## (2660, 3)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="128">
<pre><code>(1526,)</code></pre>
</div>
</div>
<section id="deep-learning" class="cell markdown" id="I6Obs6CnFdpJ">
<h1>Deep Learning</h1>
</section>
<div class="cell code" data-execution_count="129"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:36}"
id="0jq13GmBGqO9" data-outputId="d6102e94-5f0f-4453-9700-f1bb690d1e84">
<div class="sourceCode" id="cb202"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb202-1"><a href="#cb202-1" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb202-2"><a href="#cb202-2" aria-hidden="true" tabindex="-1"></a><span class="co">## Setup</span></span>
<span id="cb202-3"><a href="#cb202-3" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb202-4"><a href="#cb202-4" aria-hidden="true" tabindex="-1"></a><span class="co">## !pip install -q &quot;tensorflow-text&quot; # ==2.13.*&quot;</span></span>
<span id="cb202-5"><a href="#cb202-5" aria-hidden="true" tabindex="-1"></a><span class="co"># import tensorflow_text as tf_text</span></span>
<span id="cb202-6"><a href="#cb202-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb202-7"><a href="#cb202-7" aria-hidden="true" tabindex="-1"></a><span class="co"># ## TensorFlow backend only supports string inputs</span></span>
<span id="cb202-8"><a href="#cb202-8" aria-hidden="true" tabindex="-1"></a><span class="co"># os.environ[&quot;KERAS_BACKEND&quot;] = &quot;tensorflow&quot;</span></span>
<span id="cb202-9"><a href="#cb202-9" aria-hidden="true" tabindex="-1"></a><span class="co"># import keras</span></span>
<span id="cb202-10"><a href="#cb202-10" aria-hidden="true" tabindex="-1"></a><span class="co"># from keras import layers</span></span>
<span id="cb202-11"><a href="#cb202-11" aria-hidden="true" tabindex="-1"></a><span class="co"># import tensorflow as tf</span></span>
<span id="cb202-12"><a href="#cb202-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb202-13"><a href="#cb202-13" aria-hidden="true" tabindex="-1"></a><span class="co">## IMPORTS &amp; Params at the top</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="129">
<div class="sourceCode" id="cb203"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb203-1"><a href="#cb203-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="130"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="i5FM6FB3VR_F" data-outputId="a2ced115-d056-41df-9a35-34085423c819">
<div class="sourceCode" id="cb204"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb204-1"><a href="#cb204-1" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb204-2"><a href="#cb204-2" aria-hidden="true" tabindex="-1"></a><span class="co">## Download the SARD cpp_8750_files data</span></span>
<span id="cb204-3"><a href="#cb204-3" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb204-4"><a href="#cb204-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb204-5"><a href="#cb204-5" aria-hidden="true" tabindex="-1"></a><span class="co"># !gdown 1Q_P8bYpvdSEbp6NnCzfqU3lwQwxUlfE3</span></span>
<span id="cb204-6"><a href="#cb204-6" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>wget https:<span class="op">//</span>raw.githubusercontent.com<span class="op">/</span>c6ai<span class="op">/</span>temp<span class="op">/</span>main<span class="op">/</span>cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span>
<span id="cb204-7"><a href="#cb204-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb204-8"><a href="#cb204-8" aria-hidden="true" tabindex="-1"></a><span class="co"># data_path = keras.utils.get_file(</span></span>
<span id="cb204-9"><a href="#cb204-9" aria-hidden="true" tabindex="-1"></a><span class="co">#     &quot;sard.zip&quot;,</span></span>
<span id="cb204-10"><a href="#cb204-10" aria-hidden="true" tabindex="-1"></a><span class="co">#     &quot;https://raw.githubusercontent.com/c6ai/temp/main/sard.zip&quot;,</span></span>
<span id="cb204-11"><a href="#cb204-11" aria-hidden="true" tabindex="-1"></a><span class="co">#     untar=True,</span></span>
<span id="cb204-12"><a href="#cb204-12" aria-hidden="true" tabindex="-1"></a><span class="co"># )</span></span>
<span id="cb204-13"><a href="#cb204-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb204-14"><a href="#cb204-14" aria-hidden="true" tabindex="-1"></a><span class="co"># data_path = keras.utils.get_file(</span></span>
<span id="cb204-15"><a href="#cb204-15" aria-hidden="true" tabindex="-1"></a><span class="co">#     &quot;cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz&quot;,</span></span>
<span id="cb204-16"><a href="#cb204-16" aria-hidden="true" tabindex="-1"></a><span class="co">#     &quot;https://raw.githubusercontent.com/c6ai/temp/main/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz&quot;,</span></span>
<span id="cb204-17"><a href="#cb204-17" aria-hidden="true" tabindex="-1"></a><span class="co">#     untar=True,</span></span>
<span id="cb204-18"><a href="#cb204-18" aria-hidden="true" tabindex="-1"></a><span class="co"># )</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>--2024-01-07 10:35:54--  https://raw.githubusercontent.com/c6ai/temp/main/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 895869 (875K) [application/octet-stream]
Saving to: ‘cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz’

          cpp_clean   0%[                    ]       0  --.-KB/s               cpp_cleaner_8750_fi 100%[===================&gt;] 874.87K  --.-KB/s    in 0.006s  

2024-01-07 10:35:54 (140 MB/s) - ‘cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz’ saved [895869/895869]

</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="131" id="uMSlcwpRVR_M">
<div class="sourceCode" id="cb206"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb206-1"><a href="#cb206-1" aria-hidden="true" tabindex="-1"></a><span class="co">## extract or un-tar (unzip):</span></span>
<span id="cb206-2"><a href="#cb206-2" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>xzf cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="132"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="zoSSmKegVR_N" data-outputId="54ea012e-b9d7-4bb0-899f-30ff6d4a25c0">
<div class="sourceCode" id="cb207"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb207-1"><a href="#cb207-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh <span class="co"># /content/</span></span>
<span id="cb207-2"><a href="#cb207-2" aria-hidden="true" tabindex="-1"></a><span class="co">## total 880K</span></span>
<span id="cb207-3"><a href="#cb207-3" aria-hidden="true" tabindex="-1"></a><span class="co">## drwxr-xr-x 27 root root 4.0K Jan  2 04:10 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted</span></span>
<span id="cb207-4"><a href="#cb207-4" aria-hidden="true" tabindex="-1"></a><span class="co">## -rw-r--r--  1 root root 875K Jan  2 09:29 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 20M
-rw-r--r--  1 root root 339K Jan  7 10:33 bow_transformer.pk
drwxr-xr-x 27 root root 4.0K Jan  2 04:10 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted
-rw-r--r--  1 root root 875K Jan  7 10:35 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
drwxr-xr-x 12 root root 4.0K Jan  3 17:15 cpp_clean_files_top_10_cwe_omitted
-rw-r--r--  1 root root  15K Jan  7 10:35 cpp_clean_files_top_10_cwe_omitted.tar.gz
-rw-r--r--  1 root root  11M Jan  7 10:32 data_drop_na.csv
-rw-r--r--  1 root root 4.9M Jan  7 10:35 nb_model.pk
-rw-r--r--  1 root root 2.9M Jan  7 10:35 sard.zip
-rw-r--r--  1 root root 201K Jan  7 10:33 tfidf_transformer.pk
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="133" id="tyF35oCyVR_O">
<div class="sourceCode" id="cb209"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb209-1"><a href="#cb209-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !ls -lh /root/.keras/datasets/</span></span>
<span id="cb209-2"><a href="#cb209-2" aria-hidden="true" tabindex="-1"></a><span class="co"># ## -rw-r--r-- 1 root root 2.3K Jan  2 08:00 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="134" id="YsRnbWLmVR_P">
<div class="sourceCode" id="cb210"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb210-1"><a href="#cb210-1" aria-hidden="true" tabindex="-1"></a><span class="co"># data_path ## /root/.keras/datasets/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted</span></span>
<span id="cb210-2"><a href="#cb210-2" aria-hidden="true" tabindex="-1"></a>data_path <span class="op">=</span> <span class="st">&#39;cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&#39;</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="135" id="Ee9h8mDeVR_Q">
<div class="sourceCode" id="cb211"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb211-1"><a href="#cb211-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !ls -lh /root/.keras/datasets/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="136"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="YSjVgocYVR_R" data-outputId="e170d660-6c57-4223-80db-b2ef9f1a7b91">
<div class="sourceCode" id="cb212"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb212-1"><a href="#cb212-1" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb212-2"><a href="#cb212-2" aria-hidden="true" tabindex="-1"></a><span class="co">## Let&#39;s take a look at the data</span></span>
<span id="cb212-3"><a href="#cb212-3" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb212-4"><a href="#cb212-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb212-5"><a href="#cb212-5" aria-hidden="true" tabindex="-1"></a>data_dir <span class="op">=</span> pathlib.Path(data_path).parent <span class="op">/</span> <span class="st">&quot;cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&quot;</span></span>
<span id="cb212-6"><a href="#cb212-6" aria-hidden="true" tabindex="-1"></a>dirnames <span class="op">=</span> os.listdir(data_dir)</span>
<span id="cb212-7"><a href="#cb212-7" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Number of directories:&quot;</span>, <span class="bu">len</span>(dirnames))</span>
<span id="cb212-8"><a href="#cb212-8" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Directory names:&quot;</span>, dirnames)</span>
<span id="cb212-9"><a href="#cb212-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb212-10"><a href="#cb212-10" aria-hidden="true" tabindex="-1"></a>fnames <span class="op">=</span> os.listdir(data_dir <span class="op">/</span> <span class="st">&quot;CWE122_Heap_Based_Buffer_Overflow&quot;</span>)</span>
<span id="cb212-11"><a href="#cb212-11" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Number of files in CWE122_Heap_Based_Buffer_Overflow:&quot;</span>, <span class="bu">len</span>(fnames))</span>
<span id="cb212-12"><a href="#cb212-12" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Some example filenames:&quot;</span>, fnames[:<span class="dv">5</span>])</span>
<span id="cb212-13"><a href="#cb212-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb212-14"><a href="#cb212-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of directories: 25</span></span>
<span id="cb212-15"><a href="#cb212-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Directory names:</span></span>
<span id="cb212-16"><a href="#cb212-16" aria-hidden="true" tabindex="-1"></a><span class="co">## [&#39;CWE369_Divide_by_Zero&#39;, &#39;CWE762_Mismatched_Memory_Management_Routines&#39;,</span></span>
<span id="cb212-17"><a href="#cb212-17" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE195_Signed_to_Unsigned_Conversion_Error&#39;, &#39;CWE36_Absolute_Path_Traversal&#39;,</span></span>
<span id="cb212-18"><a href="#cb212-18" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE590_Free_Memory_Not_on_Heap&#39;, &#39;CWE690_NULL_Deref_From_Return&#39;, &#39;CWE789_Uncontrolled_Mem_Alloc&#39;,</span></span>
<span id="cb212-19"><a href="#cb212-19" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE197_Numeric_Truncation_Error&#39;, &#39;CWE401_Memory_Leak&#39;, &#39;CWE400_Resource_Exhaustion&#39;,</span></span>
<span id="cb212-20"><a href="#cb212-20" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE127_Buffer_Underread&#39;, &#39;CWE121_Stack_Based_Buffer_Overflow&#39;, &#39;CWE191_Integer_Underflow&#39;,</span></span>
<span id="cb212-21"><a href="#cb212-21" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE122_Heap_Based_Buffer_Overflow&#39;, &#39;CWE190_Integer_Overflow&#39;, &#39;CWE563_Unused_Variable&#39;,</span></span>
<span id="cb212-22"><a href="#cb212-22" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE194_Unexpected_Sign_Extension&#39;, &#39;CWE23_Relative_Path_Traversal&#39;, &#39;CWE457_Use_of_Uninitialized_Variable&#39;,</span></span>
<span id="cb212-23"><a href="#cb212-23" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE415_Double_Free&#39;, &#39;CWE126_Buffer_Overread&#39;, &#39;CWE134_Uncontrolled_Format_String&#39;,</span></span>
<span id="cb212-24"><a href="#cb212-24" aria-hidden="true" tabindex="-1"></a><span class="co">## &#39;CWE680_Integer_Overflow_to_Buffer_Overflow&#39;, &#39;CWE78_OS_Command_Injection&#39;, &#39;CWE124_Buffer_Underwrite&#39;]</span></span>
<span id="cb212-25"><a href="#cb212-25" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb212-26"><a href="#cb212-26" aria-hidden="true" tabindex="-1"></a><span class="co"># Number of files in CWE122_Heap_Based_Buffer_Overflow: 316</span></span>
<span id="cb212-27"><a href="#cb212-27" aria-hidden="true" tabindex="-1"></a><span class="co"># Some example filenames:</span></span>
<span id="cb212-28"><a href="#cb212-28" aria-hidden="true" tabindex="-1"></a><span class="co">## [&#39;t_memcpy_72a.cpp&#39;, &#39;loop_61b.cpp&#39;, &#39;dest_char_cat_34.cpp&#39;, &#39;ncpy_54b.cpp&#39;, &#39;ncpy_43.cpp&#39;]</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Number of directories: 25
Directory names: [&#39;CWE124_Buffer_Underwrite&#39;, &#39;CWE122_Heap_Based_Buffer_Overflow&#39;, &#39;CWE194_Unexpected_Sign_Extension&#39;, &#39;CWE401_Memory_Leak&#39;, &#39;CWE126_Buffer_Overread&#39;, &#39;CWE36_Absolute_Path_Traversal&#39;, &#39;CWE457_Use_of_Uninitialized_Variable&#39;, &#39;CWE195_Signed_to_Unsigned_Conversion_Error&#39;, &#39;CWE197_Numeric_Truncation_Error&#39;, &#39;CWE78_OS_Command_Injection&#39;, &#39;CWE563_Unused_Variable&#39;, &#39;CWE127_Buffer_Underread&#39;, &#39;CWE680_Integer_Overflow_to_Buffer_Overflow&#39;, &#39;CWE23_Relative_Path_Traversal&#39;, &#39;CWE121_Stack_Based_Buffer_Overflow&#39;, &#39;CWE369_Divide_by_Zero&#39;, &#39;CWE190_Integer_Overflow&#39;, &#39;CWE415_Double_Free&#39;, &#39;CWE762_Mismatched_Memory_Management_Routines&#39;, &#39;CWE690_NULL_Deref_From_Return&#39;, &#39;CWE590_Free_Memory_Not_on_Heap&#39;, &#39;CWE191_Integer_Underflow&#39;, &#39;CWE400_Resource_Exhaustion&#39;, &#39;CWE134_Uncontrolled_Format_String&#39;, &#39;CWE789_Uncontrolled_Mem_Alloc&#39;]
Number of files in CWE122_Heap_Based_Buffer_Overflow: 316
Some example filenames: [&#39;socket_82_goodG2B.cpp&#39;, &#39;cpy_82a.cpp&#39;, &#39;t_memcpy_43.cpp&#39;, &#39;t_cpy_63b.cpp&#39;, &#39;snprintf_84_goodG2B.cpp&#39;]
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="137"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:53}"
id="K0cuOICoVR_T" data-outputId="7590c66a-5383-4c7e-eab0-6b1f0569fbaa">
<div class="sourceCode" id="cb214"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb214-1"><a href="#cb214-1" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb214-2"><a href="#cb214-2" aria-hidden="true" tabindex="-1"></a><span class="co">Here&#39;s a example of what one file contains:</span></span>
<span id="cb214-3"><a href="#cb214-3" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb214-4"><a href="#cb214-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb214-5"><a href="#cb214-5" aria-hidden="true" tabindex="-1"></a><span class="co">## /content/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted/CWE122_Heap_Based_Buffer_Overflow/83_goodG2B.cpp</span></span>
<span id="cb214-6"><a href="#cb214-6" aria-hidden="true" tabindex="-1"></a><span class="co"># print(open(data_dir / &quot;CWE122_Heap_Based_Buffer_Overflow&quot; / &quot;83_goodG2B.cpp&quot;).read())</span></span>
<span id="cb214-7"><a href="#cb214-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb214-8"><a href="#cb214-8" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span>
<span id="cb214-9"><a href="#cb214-9" aria-hidden="true" tabindex="-1"></a><span class="co">As you can see, there are header lines that are leaking the file&#39;s category, either</span></span>
<span id="cb214-10"><a href="#cb214-10" aria-hidden="true" tabindex="-1"></a><span class="co">explicitly (the first line is literally the category name), or implicitly, e.g. via the</span></span>
<span id="cb214-11"><a href="#cb214-11" aria-hidden="true" tabindex="-1"></a><span class="co">`Organization` filed. Let&#39;s get rid of the headers:</span></span>
<span id="cb214-12"><a href="#cb214-12" aria-hidden="true" tabindex="-1"></a><span class="co">&quot;&quot;&quot;</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="137">
<div class="sourceCode" id="cb215"><pre
class="sourceCode json"><code class="sourceCode json"><span id="cb215-1"><a href="#cb215-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">&quot;type&quot;</span><span class="fu">:</span><span class="st">&quot;string&quot;</span><span class="fu">}</span></span></code></pre></div>
</div>
</div>
<div class="cell code" data-execution_count="138"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="klwkIt-vVR_W" data-outputId="9c0c2d98-a437-4fef-f4f8-7c8a3c9381a3">
<div class="sourceCode" id="cb216"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb216-1"><a href="#cb216-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize the count</span></span>
<span id="cb216-2"><a href="#cb216-2" aria-hidden="true" tabindex="-1"></a>count <span class="op">=</span> <span class="dv">0</span></span>
<span id="cb216-3"><a href="#cb216-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb216-4"><a href="#cb216-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb216-5"><a href="#cb216-5" aria-hidden="true" tabindex="-1"></a><span class="co"># for dirpath, dirs, files in os.walk(&#39;cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&#39;):</span></span>
<span id="cb216-6"><a href="#cb216-6" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&#39;</span>):</span>
<span id="cb216-7"><a href="#cb216-7" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb216-8"><a href="#cb216-8" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Count the occurrences of &#39;CWE&#39; in the file name</span></span>
<span id="cb216-9"><a href="#cb216-9" aria-hidden="true" tabindex="-1"></a>        count <span class="op">+=</span> <span class="bu">len</span>(re.findall(<span class="vs">r&#39;CWE&#39;</span>, filename))</span>
<span id="cb216-10"><a href="#cb216-10" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Get the file path</span></span>
<span id="cb216-11"><a href="#cb216-11" aria-hidden="true" tabindex="-1"></a>        file_path <span class="op">=</span> os.path.join(dirpath, filename)</span>
<span id="cb216-12"><a href="#cb216-12" aria-hidden="true" tabindex="-1"></a>        <span class="cf">with</span> <span class="bu">open</span>(file_path, <span class="st">&#39;r&#39;</span>) <span class="im">as</span> <span class="bu">file</span>:</span>
<span id="cb216-13"><a href="#cb216-13" aria-hidden="true" tabindex="-1"></a>            <span class="cf">try</span>:</span>
<span id="cb216-14"><a href="#cb216-14" aria-hidden="true" tabindex="-1"></a>                file_data <span class="op">=</span> <span class="bu">file</span>.read()</span>
<span id="cb216-15"><a href="#cb216-15" aria-hidden="true" tabindex="-1"></a>                <span class="co"># Count the occurrences of &#39;CWE&#39; in the file data</span></span>
<span id="cb216-16"><a href="#cb216-16" aria-hidden="true" tabindex="-1"></a>                count <span class="op">+=</span> <span class="bu">len</span>(re.findall(<span class="vs">r&#39;CWE&#39;</span>, file_data))</span>
<span id="cb216-17"><a href="#cb216-17" aria-hidden="true" tabindex="-1"></a>            <span class="cf">except</span> <span class="pp">UnicodeDecodeError</span>:</span>
<span id="cb216-18"><a href="#cb216-18" aria-hidden="true" tabindex="-1"></a>                <span class="co"># Skip files that can&#39;t be decoded</span></span>
<span id="cb216-19"><a href="#cb216-19" aria-hidden="true" tabindex="-1"></a>                <span class="cf">pass</span></span>
<span id="cb216-20"><a href="#cb216-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb216-21"><a href="#cb216-21" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(count)</span>
<span id="cb216-22"><a href="#cb216-22" aria-hidden="true" tabindex="-1"></a><span class="co">## 1052 ## clean</span></span>
<span id="cb216-23"><a href="#cb216-23" aria-hidden="true" tabindex="-1"></a><span class="co">## 0 ## cleaner</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>0
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="139" id="iwaAbeuiVR_Y">
<div class="sourceCode" id="cb218"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb218-1"><a href="#cb218-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Get a list of all file paths and their corresponding labels</span></span>
<span id="cb218-2"><a href="#cb218-2" aria-hidden="true" tabindex="-1"></a>root_folder_path <span class="op">=</span> <span class="st">&#39;cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&#39;</span></span>
<span id="cb218-3"><a href="#cb218-3" aria-hidden="true" tabindex="-1"></a>file_paths <span class="op">=</span> []</span>
<span id="cb218-4"><a href="#cb218-4" aria-hidden="true" tabindex="-1"></a>labels <span class="op">=</span> []</span>
<span id="cb218-5"><a href="#cb218-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirnames, filenames <span class="kw">in</span> os.walk(root_folder_path):</span>
<span id="cb218-6"><a href="#cb218-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> filenames:</span>
<span id="cb218-7"><a href="#cb218-7" aria-hidden="true" tabindex="-1"></a>        file_paths.append(os.path.join(dirpath, filename))</span>
<span id="cb218-8"><a href="#cb218-8" aria-hidden="true" tabindex="-1"></a>        labels.append(os.path.basename(dirpath))  <span class="co"># Use the name of the parent directory as the label</span></span>
<span id="cb218-9"><a href="#cb218-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb218-10"><a href="#cb218-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Split the file paths into training and testing sets</span></span>
<span id="cb218-11"><a href="#cb218-11" aria-hidden="true" tabindex="-1"></a>file_paths_train, file_paths_test, labels_train, labels_test <span class="op">=</span> train_test_split(file_paths, labels, test_size<span class="op">=</span><span class="fl">0.2</span>, stratify<span class="op">=</span>labels)</span>
<span id="cb218-12"><a href="#cb218-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb218-13"><a href="#cb218-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Function to copy files to a new directory structure</span></span>
<span id="cb218-14"><a href="#cb218-14" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> copy_files(file_paths, labels, dest_folder):</span>
<span id="cb218-15"><a href="#cb218-15" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> file_path, label <span class="kw">in</span> <span class="bu">zip</span>(file_paths, labels):</span>
<span id="cb218-16"><a href="#cb218-16" aria-hidden="true" tabindex="-1"></a>        dest_dir <span class="op">=</span> os.path.join(dest_folder, label)</span>
<span id="cb218-17"><a href="#cb218-17" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> <span class="kw">not</span> os.path.isdir(dest_dir):</span>
<span id="cb218-18"><a href="#cb218-18" aria-hidden="true" tabindex="-1"></a>            os.makedirs(dest_dir)</span>
<span id="cb218-19"><a href="#cb218-19" aria-hidden="true" tabindex="-1"></a>        shutil.copy(file_path, dest_dir)</span>
<span id="cb218-20"><a href="#cb218-20" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb218-21"><a href="#cb218-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Copy the files to the &#39;train&#39; and &#39;test&#39; directories</span></span>
<span id="cb218-22"><a href="#cb218-22" aria-hidden="true" tabindex="-1"></a>copy_files(file_paths_train, labels_train, <span class="st">&#39;train&#39;</span>)</span>
<span id="cb218-23"><a href="#cb218-23" aria-hidden="true" tabindex="-1"></a>copy_files(file_paths_test, labels_test, <span class="st">&#39;test&#39;</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="140"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="fr_5CwLeVR_a" data-outputId="064fb9be-9239-431c-aaf2-7f11bd4056d4">
<div class="sourceCode" id="cb219"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb219-1"><a href="#cb219-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>find train <span class="op">-</span><span class="bu">type</span> f <span class="op">|</span> cut <span class="op">-</span>d<span class="op">/</span> <span class="op">-</span>f2 <span class="op">|</span> sort <span class="op">|</span> uniq <span class="op">-</span>c</span></code></pre></div>
<div class="output stream stdout">
<pre><code>    259 CWE121_Stack_Based_Buffer_Overflow
    253 CWE122_Heap_Based_Buffer_Overflow
    265 CWE124_Buffer_Underwrite
    268 CWE126_Buffer_Overread
    266 CWE127_Buffer_Underread
    280 CWE134_Uncontrolled_Format_String
    238 CWE190_Integer_Overflow
    235 CWE191_Integer_Underflow
    132 CWE194_Unexpected_Sign_Extension
    127 CWE195_Signed_to_Unsigned_Conversion_Error
    280 CWE197_Numeric_Truncation_Error
    280 CWE23_Relative_Path_Traversal
    280 CWE369_Divide_by_Zero
    280 CWE36_Absolute_Path_Traversal
    125 CWE400_Resource_Exhaustion
    266 CWE401_Memory_Leak
    280 CWE415_Double_Free
    238 CWE457_Use_of_Uninitialized_Variable
    280 CWE563_Unused_Variable
    278 CWE590_Free_Memory_Not_on_Heap
    241 CWE680_Integer_Overflow_to_Buffer_Overflow
    134 CWE690_NULL_Deref_From_Return
    279 CWE762_Mismatched_Memory_Management_Routines
    258 CWE789_Uncontrolled_Mem_Alloc
    280 CWE78_OS_Command_Injection
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="141"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="xYqQb8NkVR_a" data-outputId="ac5b4ff0-be44-4a1d-8afb-db0cc2828080">
<div class="sourceCode" id="cb221"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb221-1"><a href="#cb221-1" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> count_files_in_subfolders(root_folder):</span>
<span id="cb221-2"><a href="#cb221-2" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> dirpath, dirnames, filenames <span class="kw">in</span> os.walk(root_folder):</span>
<span id="cb221-3"><a href="#cb221-3" aria-hidden="true" tabindex="-1"></a>        <span class="bu">print</span>(<span class="ss">f&quot;There are </span><span class="sc">{</span><span class="bu">len</span>(filenames)<span class="sc">}</span><span class="ss"> files in the &#39;</span><span class="sc">{</span>os<span class="sc">.</span>path<span class="sc">.</span>relpath(dirpath, root_folder)<span class="sc">}</span><span class="ss">&#39; subfolder.&quot;</span>)</span>
<span id="cb221-4"><a href="#cb221-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb221-5"><a href="#cb221-5" aria-hidden="true" tabindex="-1"></a><span class="co"># print(&quot;In the &#39;train&#39; directory:&quot;)</span></span>
<span id="cb221-6"><a href="#cb221-6" aria-hidden="true" tabindex="-1"></a><span class="co"># count_files_in_subfolders(&#39;train&#39;)</span></span>
<span id="cb221-7"><a href="#cb221-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb221-8"><a href="#cb221-8" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;</span><span class="ch">\n</span><span class="st">In the &#39;test&#39; directory:&quot;</span>)</span>
<span id="cb221-9"><a href="#cb221-9" aria-hidden="true" tabindex="-1"></a>count_files_in_subfolders(<span class="st">&#39;test&#39;</span>)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>
In the &#39;test&#39; directory:
There are 0 files in the &#39;.&#39; subfolder.
There are 66 files in the &#39;CWE124_Buffer_Underwrite&#39; subfolder.
There are 63 files in the &#39;CWE122_Heap_Based_Buffer_Overflow&#39; subfolder.
There are 33 files in the &#39;CWE194_Unexpected_Sign_Extension&#39; subfolder.
There are 67 files in the &#39;CWE401_Memory_Leak&#39; subfolder.
There are 67 files in the &#39;CWE126_Buffer_Overread&#39; subfolder.
There are 70 files in the &#39;CWE36_Absolute_Path_Traversal&#39; subfolder.
There are 59 files in the &#39;CWE457_Use_of_Uninitialized_Variable&#39; subfolder.
There are 31 files in the &#39;CWE195_Signed_to_Unsigned_Conversion_Error&#39; subfolder.
There are 70 files in the &#39;CWE197_Numeric_Truncation_Error&#39; subfolder.
There are 70 files in the &#39;CWE78_OS_Command_Injection&#39; subfolder.
There are 70 files in the &#39;CWE563_Unused_Variable&#39; subfolder.
There are 67 files in the &#39;CWE127_Buffer_Underread&#39; subfolder.
There are 60 files in the &#39;CWE680_Integer_Overflow_to_Buffer_Overflow&#39; subfolder.
There are 70 files in the &#39;CWE23_Relative_Path_Traversal&#39; subfolder.
There are 65 files in the &#39;CWE121_Stack_Based_Buffer_Overflow&#39; subfolder.
There are 70 files in the &#39;CWE369_Divide_by_Zero&#39; subfolder.
There are 60 files in the &#39;CWE190_Integer_Overflow&#39; subfolder.
There are 70 files in the &#39;CWE415_Double_Free&#39; subfolder.
There are 70 files in the &#39;CWE762_Mismatched_Memory_Management_Routines&#39; subfolder.
There are 33 files in the &#39;CWE690_NULL_Deref_From_Return&#39; subfolder.
There are 70 files in the &#39;CWE590_Free_Memory_Not_on_Heap&#39; subfolder.
There are 59 files in the &#39;CWE191_Integer_Underflow&#39; subfolder.
There are 31 files in the &#39;CWE400_Resource_Exhaustion&#39; subfolder.
There are 70 files in the &#39;CWE134_Uncontrolled_Format_String&#39; subfolder.
There are 65 files in the &#39;CWE789_Uncontrolled_Mem_Alloc&#39; subfolder.
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="142"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="WhkfbeAAVR_b" data-outputId="13f28681-e738-4950-c40d-fa56b77c4b9f">
<div class="sourceCode" id="cb223"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb223-1"><a href="#cb223-1" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> count_files_in_folder(root_folder):</span>
<span id="cb223-2"><a href="#cb223-2" aria-hidden="true" tabindex="-1"></a>    total_files <span class="op">=</span> <span class="dv">0</span></span>
<span id="cb223-3"><a href="#cb223-3" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> dirpath, dirnames, filenames <span class="kw">in</span> os.walk(root_folder):</span>
<span id="cb223-4"><a href="#cb223-4" aria-hidden="true" tabindex="-1"></a>        total_files <span class="op">+=</span> <span class="bu">len</span>(filenames)</span>
<span id="cb223-5"><a href="#cb223-5" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="ss">f&quot;There are </span><span class="sc">{</span>total_files<span class="sc">}</span><span class="ss"> files in the &#39;</span><span class="sc">{</span>root_folder<span class="sc">}</span><span class="ss">&#39; directory.&quot;</span>)</span>
<span id="cb223-6"><a href="#cb223-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb223-7"><a href="#cb223-7" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;In the &#39;train&#39; directory:&quot;</span>)</span>
<span id="cb223-8"><a href="#cb223-8" aria-hidden="true" tabindex="-1"></a>count_files_in_folder(<span class="st">&#39;train&#39;</span>)</span>
<span id="cb223-9"><a href="#cb223-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb223-10"><a href="#cb223-10" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;</span><span class="ch">\n</span><span class="st">In the &#39;test&#39; directory:&quot;</span>)</span>
<span id="cb223-11"><a href="#cb223-11" aria-hidden="true" tabindex="-1"></a>count_files_in_folder(<span class="st">&#39;test&#39;</span>)</span>
<span id="cb223-12"><a href="#cb223-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb223-13"><a href="#cb223-13" aria-hidden="true" tabindex="-1"></a><span class="co">## In the &#39;train&#39; directory:</span></span>
<span id="cb223-14"><a href="#cb223-14" aria-hidden="true" tabindex="-1"></a><span class="co">### There are 6102 files in the &#39;train&#39; directory.</span></span>
<span id="cb223-15"><a href="#cb223-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb223-16"><a href="#cb223-16" aria-hidden="true" tabindex="-1"></a><span class="co">## In the &#39;test&#39; directory:</span></span>
<span id="cb223-17"><a href="#cb223-17" aria-hidden="true" tabindex="-1"></a><span class="co">### There are 2781 files in the &#39;test&#39; directory.</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>In the &#39;train&#39; directory:
There are 6102 files in the &#39;train&#39; directory.

In the &#39;test&#39; directory:
There are 1526 files in the &#39;test&#39; directory.
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="143" id="yfOIl0SwVR_b">
<div class="sourceCode" id="cb225"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb225-1"><a href="#cb225-1" aria-hidden="true" tabindex="-1"></a><span class="co">## cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></span>
<span id="cb225-2"><a href="#cb225-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -czf cpp_ready_8750_files_each_350_top_25_cwe_omitted.tar.gz cpp_cleaner_8750_files_each_350_top_25_cwe_omitted</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="144" id="apSROBTbVR_c">
<div class="sourceCode" id="cb226"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb226-1"><a href="#cb226-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -czf cpp_ready_8750_files_each_350_top_25_cwe_omitted.tar.gz cpp_cleaner_8750_files_each_350_top_25_cwe_omitted</span></span>
<span id="cb226-2"><a href="#cb226-2" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>czf cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz train test</span></code></pre></div>
</div>
<div class="cell markdown" id="jzuwVGh7VR_d">
<p>......</p>
</div>
<section id="load-text" class="cell markdown" id="MDvj1gUrVR_k">
<h1>Load text</h1>
</section>
<div class="cell code" data-execution_count="151" id="GrhW2eNeVR_j">
<div class="sourceCode" id="cb227"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb227-1"><a href="#cb227-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !wget https://samate.nist.gov/SARD/downloads/test-suites/2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="152" id="7VWHKBtdVR_h">
<div class="sourceCode" id="cb228"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb228-1"><a href="#cb228-1" aria-hidden="true" tabindex="-1"></a><span class="co"># keras.utils.text_dataset_from_directory(</span></span>
<span id="cb228-2"><a href="#cb228-2" aria-hidden="true" tabindex="-1"></a><span class="co">#     directory,</span></span>
<span id="cb228-3"><a href="#cb228-3" aria-hidden="true" tabindex="-1"></a><span class="co">#     labels=&quot;inferred&quot;,</span></span>
<span id="cb228-4"><a href="#cb228-4" aria-hidden="true" tabindex="-1"></a><span class="co">#     label_mode=&quot;int&quot;,</span></span>
<span id="cb228-5"><a href="#cb228-5" aria-hidden="true" tabindex="-1"></a><span class="co">#     class_names=None,</span></span>
<span id="cb228-6"><a href="#cb228-6" aria-hidden="true" tabindex="-1"></a><span class="co">#     batch_size=32,</span></span>
<span id="cb228-7"><a href="#cb228-7" aria-hidden="true" tabindex="-1"></a><span class="co">#     max_length=None,</span></span>
<span id="cb228-8"><a href="#cb228-8" aria-hidden="true" tabindex="-1"></a><span class="co">#     shuffle=True,</span></span>
<span id="cb228-9"><a href="#cb228-9" aria-hidden="true" tabindex="-1"></a><span class="co">#     seed=None,</span></span>
<span id="cb228-10"><a href="#cb228-10" aria-hidden="true" tabindex="-1"></a><span class="co">#     validation_split=None,</span></span>
<span id="cb228-11"><a href="#cb228-11" aria-hidden="true" tabindex="-1"></a><span class="co">#     subset=None,</span></span>
<span id="cb228-12"><a href="#cb228-12" aria-hidden="true" tabindex="-1"></a><span class="co">#     follow_links=False,</span></span>
<span id="cb228-13"><a href="#cb228-13" aria-hidden="true" tabindex="-1"></a><span class="co"># )</span></span></code></pre></div>
</div>
<section id="predict-the-tag-for-a-sard-testcase" class="cell markdown"
id="GDBEgbpHVR_o">
<h2>Predict the tag for a SARD TestCase</h2>
<p>download a dataset of programming TestCases from SARD. Each TestCase
is labeled with exactly one tag (<code>CWE-ID#121-***</code>,
<code>CWE-ID#122-***</code>, etc).</p>
</section>
<div class="cell code" data-execution_count="153" id="7HJdygozVR_q">
<div class="sourceCode" id="cb229"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb229-1"><a href="#cb229-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>mv cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz _cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="154"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="ic-w48zbVR_q" data-outputId="118fec3d-a41e-4c77-80ce-10d44cdbb8e4">
<div class="sourceCode" id="cb230"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb230-1"><a href="#cb230-1" aria-hidden="true" tabindex="-1"></a><span class="co">## !wget https://raw.githubusercontent.com/c6ai/temp/main/cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz</span></span>
<span id="cb230-2"><a href="#cb230-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !gdown 1j1nY2qlLnA_Iap0_ug8ZAQuDgKX1QIJB</span></span>
<span id="cb230-3"><a href="#cb230-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb230-4"><a href="#cb230-4" aria-hidden="true" tabindex="-1"></a><span class="co">## !gdown 1Q_P8bYpvdSEbp6NnCzfqU3lwQwxUlfE3</span></span>
<span id="cb230-5"><a href="#cb230-5" aria-hidden="true" tabindex="-1"></a><span class="co"># !wget https://raw.githubusercontent.com/c6ai/temp/main/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></span>
<span id="cb230-6"><a href="#cb230-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb230-7"><a href="#cb230-7" aria-hidden="true" tabindex="-1"></a><span class="co"># data_url = &#39;https://storage.googleapis.com/download.tensorflow.org/data/*.tar.gz&#39;</span></span>
<span id="cb230-8"><a href="#cb230-8" aria-hidden="true" tabindex="-1"></a>data_url <span class="op">=</span> <span class="st">&#39;https://raw.githubusercontent.com/c6ai/temp/main/cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz&#39;</span></span>
<span id="cb230-9"><a href="#cb230-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb230-10"><a href="#cb230-10" aria-hidden="true" tabindex="-1"></a>dataset_dir <span class="op">=</span> utils.get_file(</span>
<span id="cb230-11"><a href="#cb230-11" aria-hidden="true" tabindex="-1"></a>    origin<span class="op">=</span>data_url,</span>
<span id="cb230-12"><a href="#cb230-12" aria-hidden="true" tabindex="-1"></a>    untar<span class="op">=</span><span class="va">True</span>,</span>
<span id="cb230-13"><a href="#cb230-13" aria-hidden="true" tabindex="-1"></a>    cache_dir<span class="op">=</span><span class="st">&#39;cache_dir&#39;</span>,</span>
<span id="cb230-14"><a href="#cb230-14" aria-hidden="true" tabindex="-1"></a>    cache_subdir<span class="op">=</span><span class="st">&#39;&#39;</span>)</span>
<span id="cb230-15"><a href="#cb230-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb230-16"><a href="#cb230-16" aria-hidden="true" tabindex="-1"></a>dataset_dir <span class="op">=</span> pathlib.Path(dataset_dir).parent</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Downloading data from https://raw.githubusercontent.com/c6ai/temp/main/cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz
899225/899225 [==============================] - 0s 0us/step
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="155"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="hhjlXCGlVR_r" data-outputId="a04f187b-2fe9-41bf-8fd1-32c7be2d53ea">
<div class="sourceCode" id="cb232"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb232-1"><a href="#cb232-1" aria-hidden="true" tabindex="-1"></a><span class="bu">list</span>(dataset_dir.iterdir())</span></code></pre></div>
<div class="output execute_result" data-execution_count="155">
<pre><code>[PosixPath(&#39;/tmp/.keras/cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz&#39;),
 PosixPath(&#39;/tmp/.keras/test&#39;),
 PosixPath(&#39;/tmp/.keras/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted&#39;),
 PosixPath(&#39;/tmp/.keras/cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz&#39;),
 PosixPath(&#39;/tmp/.keras/train&#39;)]</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="156"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="VcmAoHvtVR_r" data-outputId="b822974e-3dfc-4aa9-b769-de2e7d91ed63">
<div class="sourceCode" id="cb234"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb234-1"><a href="#cb234-1" aria-hidden="true" tabindex="-1"></a>train_dir <span class="op">=</span> dataset_dir<span class="op">/</span><span class="st">&#39;train&#39;</span></span>
<span id="cb234-2"><a href="#cb234-2" aria-hidden="true" tabindex="-1"></a><span class="bu">list</span>(train_dir.iterdir())</span></code></pre></div>
<div class="output execute_result" data-execution_count="156">
<pre><code>[PosixPath(&#39;/tmp/.keras/train/CWE124_Buffer_Underwrite&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE122_Heap_Based_Buffer_Overflow&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE194_Unexpected_Sign_Extension&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE401_Memory_Leak&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE126_Buffer_Overread&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE36_Absolute_Path_Traversal&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE457_Use_of_Uninitialized_Variable&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE195_Signed_to_Unsigned_Conversion_Error&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE197_Numeric_Truncation_Error&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE78_OS_Command_Injection&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE563_Unused_Variable&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE127_Buffer_Underread&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE680_Integer_Overflow_to_Buffer_Overflow&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE23_Relative_Path_Traversal&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE121_Stack_Based_Buffer_Overflow&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE369_Divide_by_Zero&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE190_Integer_Overflow&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE415_Double_Free&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE762_Mismatched_Memory_Management_Routines&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE690_NULL_Deref_From_Return&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE590_Free_Memory_Not_on_Heap&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE191_Integer_Underflow&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE400_Resource_Exhaustion&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE134_Uncontrolled_Format_String&#39;),
 PosixPath(&#39;/tmp/.keras/train/CWE789_Uncontrolled_Mem_Alloc&#39;)]</code></pre>
</div>
</div>
<div class="cell markdown" id="YDz3qHBuVR_s">
<p>The <code>train/CWE-ID#121-***</code>,
<code>train/CWE-ID#122-***</code>, ... directories contain many text
files, each of which is a SARD TestCase.</p>
</div>
<div class="cell code" data-execution_count="157" id="S2jKCvcsVR_s">
<div class="sourceCode" id="cb236"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb236-1"><a href="#cb236-1" aria-hidden="true" tabindex="-1"></a><span class="co">## ValueError: No text files found in directory /tmp/.keras/train. Allowed format: .txt</span></span>
<span id="cb236-2"><a href="#cb236-2" aria-hidden="true" tabindex="-1"></a><span class="co"># import glob</span></span>
<span id="cb236-3"><a href="#cb236-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb236-4"><a href="#cb236-4" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> change_file_extension(root_folder, old_ext, new_ext):</span>
<span id="cb236-5"><a href="#cb236-5" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> glob.iglob(root_folder <span class="op">+</span> <span class="st">&#39;**/*&#39;</span> <span class="op">+</span> old_ext, recursive<span class="op">=</span><span class="va">True</span>):</span>
<span id="cb236-6"><a href="#cb236-6" aria-hidden="true" tabindex="-1"></a>        base <span class="op">=</span> os.path.splitext(filename)[<span class="dv">0</span>]</span>
<span id="cb236-7"><a href="#cb236-7" aria-hidden="true" tabindex="-1"></a>        os.rename(filename, base <span class="op">+</span> new_ext)</span>
<span id="cb236-8"><a href="#cb236-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb236-9"><a href="#cb236-9" aria-hidden="true" tabindex="-1"></a>change_file_extension(<span class="st">&#39;/tmp/.keras/&#39;</span>, <span class="st">&#39;.cpp&#39;</span>, <span class="st">&#39;.txt&#39;</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="158" id="1jhJQGOIVR_t">
<div class="sourceCode" id="cb237"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb237-1"><a href="#cb237-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !ls /tmp/.keras/train/CWE122_Heap_Based_Buffer_Overflow</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="159"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="WFeA-2G-VR_t" data-outputId="7839323f-2e89-4371-e8a3-63ddea7b0ee4">
<div class="sourceCode" id="cb238"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb238-1"><a href="#cb238-1" aria-hidden="true" tabindex="-1"></a><span class="co">## print(open(data_dir / &quot;CWE122_Heap_Based_Buffer_Overflow&quot; / &quot;83_goodG2B.cpp&quot;).read())</span></span>
<span id="cb238-2"><a href="#cb238-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb238-3"><a href="#cb238-3" aria-hidden="true" tabindex="-1"></a><span class="co"># sample_file = train_dir/&#39;python/1755.txt&#39;</span></span>
<span id="cb238-4"><a href="#cb238-4" aria-hidden="true" tabindex="-1"></a><span class="co"># sample_file = train_dir/&#39;CWE122_Heap_Based_Buffer_Overflow/83_goodG2B.cpp&#39;</span></span>
<span id="cb238-5"><a href="#cb238-5" aria-hidden="true" tabindex="-1"></a>sample_file <span class="op">=</span> train_dir<span class="op">/</span><span class="st">&#39;CWE122_Heap_Based_Buffer_Overflow/83_goodG2B.txt&#39;</span></span>
<span id="cb238-6"><a href="#cb238-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb238-7"><a href="#cb238-7" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(sample_file) <span class="im">as</span> f:</span>
<span id="cb238-8"><a href="#cb238-8" aria-hidden="true" tabindex="-1"></a>  <span class="bu">print</span>(f.read())</span></code></pre></div>
<div class="output stream stdout">
<pre><code>

#ifndef OMITGOOD

#include &quot;std_testcase.h&quot;
#include &quot;83.h&quot;

namespace 83
{
83_goodG2B::83_goodG2B(int dataCopy)
{
    data = dataCopy;
    
    data = 7;
}

83_goodG2B::~83_goodG2B()
{
    {
        int i;
        int * buffer = new int[10];
        
        for (i = 0; i &lt; 10; i++)
        {
            buffer[i] = 0;
        }
        
        if (data &gt;= 0)
        {
            buffer[data] = 1;
            
            for(i = 0; i &lt; 10; i++)
            {
                printIntLine(buffer[i]);
            }
        }
        else
        {
            printLine(&quot;ERROR: Array index is negative.&quot;);
        }
        delete[] buffer;
    }
}
}
#endif 

</code></pre>
</div>
</div>
<section id="load-the-dataset" class="cell markdown" id="pcOOFvdeVR_u">
<h3>Load the dataset</h3>
<p>The tf.keras.utils.text_dataset_from_directory API expects a
directory structure as follows:</p>
<pre><code>train/
...CWE-ID#121-***/
......1.txt
......2.txt
...CWE-ID#122-***/
......1.txt
......2.txt
...CWE-ID#1xx-***/
......1.txt
......2.txt
...CWE-ID#1x-***n/
......1.txt
......2.txt</code></pre>
</section>
<div class="cell code" data-execution_count="160"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="xrG2NVObVR_v" data-outputId="782c7fec-1fa3-4535-8be0-73b7f1abed9a">
<div class="sourceCode" id="cb241"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb241-1"><a href="#cb241-1" aria-hidden="true" tabindex="-1"></a>batch_size <span class="op">=</span> batch_size</span>
<span id="cb241-2"><a href="#cb241-2" aria-hidden="true" tabindex="-1"></a>seed <span class="op">=</span> seed</span>
<span id="cb241-3"><a href="#cb241-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb241-4"><a href="#cb241-4" aria-hidden="true" tabindex="-1"></a>raw_train_ds <span class="op">=</span> utils.text_dataset_from_directory(</span>
<span id="cb241-5"><a href="#cb241-5" aria-hidden="true" tabindex="-1"></a>    train_dir,</span>
<span id="cb241-6"><a href="#cb241-6" aria-hidden="true" tabindex="-1"></a>    batch_size<span class="op">=</span>batch_size,</span>
<span id="cb241-7"><a href="#cb241-7" aria-hidden="true" tabindex="-1"></a>    validation_split<span class="op">=</span><span class="fl">0.2</span>,</span>
<span id="cb241-8"><a href="#cb241-8" aria-hidden="true" tabindex="-1"></a>    subset<span class="op">=</span><span class="st">&#39;training&#39;</span>,</span>
<span id="cb241-9"><a href="#cb241-9" aria-hidden="true" tabindex="-1"></a>    seed<span class="op">=</span>seed)</span>
<span id="cb241-10"><a href="#cb241-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb241-11"><a href="#cb241-11" aria-hidden="true" tabindex="-1"></a><span class="co">## Found 6102 files belonging to 25 classes.</span></span>
<span id="cb241-12"><a href="#cb241-12" aria-hidden="true" tabindex="-1"></a><span class="co">## Using 4882 files for training.</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Found 6102 files belonging to 25 classes.
Using 4882 files for training.
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="161"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="oKg-_2LsVR_w" data-outputId="971492c3-a68f-48a2-c59d-56d7e56c0130">
<div class="sourceCode" id="cb243"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb243-1"><a href="#cb243-1" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> text_batch, label_batch <span class="kw">in</span> raw_train_ds.take(<span class="dv">1</span>):</span>
<span id="cb243-2"><a href="#cb243-2" aria-hidden="true" tabindex="-1"></a>  <span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">10</span>):</span>
<span id="cb243-3"><a href="#cb243-3" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="st">&quot;TestCase: &quot;</span>, text_batch.numpy()[i])</span>
<span id="cb243-4"><a href="#cb243-4" aria-hidden="true" tabindex="-1"></a>    <span class="bu">print</span>(<span class="st">&quot;Label:&quot;</span>, label_batch.numpy()[i])</span></code></pre></div>
<div class="output stream stdout">
<pre><code>TestCase:  b&#39;\n\n#ifndef OMITGOOD\n\n#include &quot;std_testcase.h&quot;\n#include &quot;file_snprintf_81.h&quot;\n\n#ifdef _WIN32\n#define SNPRINTF _snprintf\n#else\n#define SNPRINTF snprintf\n#endif\n\nnamespace file_snprintf_81\n{\n\nvoid file_snprintf_81_goodB2G::action(char * data) const\n{\n    {\n        char dest[100] = &quot;&quot;;\n        \n        SNPRINTF(dest, 100-1, &quot;%s&quot;, data);\n        printLine(dest);\n    }\n}\n\n}\n#endif \n&#39;
Label: 5
TestCase:  b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\n#ifdef _WIN32\n#define BASEPATH &quot;c:\\\\temp\\\\&quot;\n#else\n#include &lt;wchar.h&gt;\n#define BASEPATH &quot;/tmp/&quot;\n#endif\n\n#ifdef _WIN32\n#include &lt;winsock2.h&gt;\n#include &lt;windows.h&gt;\n#include &lt;direct.h&gt;\n#pragma comment(lib, &quot;ws2_32&quot;) \n#define CLOSE_SOCKET closesocket\n#else \n#include &lt;sys/types.h&gt;\n#include &lt;sys/socket.h&gt;\n#include &lt;netinet/in.h&gt;\n#include &lt;arpa/inet.h&gt;\n#include &lt;unistd.h&gt;\n#define INVALID_SOCKET -1\n#define SOCKET_ERROR -1\n#define CLOSE_SOCKET close\n#define SOCKET int\n#endif\n\n#define TCP_PORT 27015\n#define IP_ADDRESS &quot;127.0.0.1&quot;\n\n\nnamespace connect_socket_w32CreateFile_53\n{\n\n#ifndef OMITBAD\n\n\nvoid badSink_b(char * data);\n\nvoid bad()\n{\n    char * data;\n    char dataBuffer[FILENAME_MAX] = BASEPATH;\n    data = dataBuffer;\n    {\n#ifdef _WIN32\n        WSADATA wsaData;\n        int wsaDataInit = 0;\n#endif\n        int recvResult;\n        struct sockaddr_in service;\n        char *replace;\n        SOCKET connectSocket = INVALID_SOCKET;\n        size_t dataLen = strlen(data);\n        do\n        {\n#ifdef _WIN32\n            if (WSAStartup(MAKEWORD(2,2), &amp;wsaData) != NO_ERROR)\n            {\n                break;\n            }\n            wsaDataInit = 1;\n#endif\n            \n            connectSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);\n            if (connectSocket == INVALID_SOCKET)\n            {\n                break;\n            }\n            memset(&amp;service, 0, sizeof(service));\n            service.sin_family = AF_INET;\n            service.sin_addr.s_addr = inet_addr(IP_ADDRESS);\n            service.sin_port = htons(TCP_PORT);\n            if (connect(connectSocket, (struct sockaddr*)&amp;service, sizeof(service)) == SOCKET_ERROR)\n            {\n                break;\n            }\n            \n            \n            recvResult = recv(connectSocket, (char *)(data + dataLen), sizeof(char) * (FILENAME_MAX - dataLen - 1), 0);\n            if (recvResult == SOCKET_ERROR || recvResult == 0)\n            {\n                break;\n            }\n            \n            data[dataLen + recvResult / sizeof(char)] = \&#39;\\0\&#39;;\n            \n            replace = strchr(data, \&#39;\\r\&#39;);\n            if (replace)\n            {\n                *replace = \&#39;\\0\&#39;;\n            }\n            replace = strchr(data, \&#39;\\n\&#39;);\n            if (replace)\n            {\n                *replace = \&#39;\\0\&#39;;\n            }\n        }\n        while (0);\n        if (connectSocket != INVALID_SOCKET)\n        {\n            CLOSE_SOCKET(connectSocket);\n        }\n#ifdef _WIN32\n        if (wsaDataInit)\n        {\n            WSACleanup();\n        }\n#endif\n    }\n    badSink_b(data);\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nvoid goodG2BSink_b(char * data);\n\n\nstatic void goodG2B()\n{\n    char * data;\n    char dataBuffer[FILENAME_MAX] = BASEPATH;\n    data = dataBuffer;\n    \n    strcat(data, &quot;file.txt&quot;);\n    goodG2BSink_b(data);\n}\n\nvoid good()\n{\n    goodG2B();\n}\n\n#endif \n\n} \n\n\n\n#ifdef INCLUDEMAIN\n\nusing namespace connect_socket_w32CreateFile_53; \n\nint main(int argc, char * argv[])\n{\n    \n    srand( (unsigned)time(NULL) );\n#ifndef OMITGOOD\n    printLine(&quot;Calling good()...&quot;);\n    good();\n    printLine(&quot;Finished good()&quot;);\n#endif \n#ifndef OMITBAD\n    printLine(&quot;Calling bad()...&quot;);\n    bad();\n    printLine(&quot;Finished bad()&quot;);\n#endif \n    return 0;\n}\n\n#endif\n&#39;
Label: 11
TestCase:  b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\n#include &lt;wchar.h&gt;\n\nnamespace char_alloca_65\n{\n\n#ifndef OMITBAD\n\n\nvoid badSink(char * data);\n\nvoid bad()\n{\n    char * data;\n    \n    void (*funcPtr) (char *) = badSink;\n    data = NULL; \n    {\n        \n        char * dataBuffer = (char *)ALLOCA(sizeof(char));\n        *dataBuffer = \&#39;A\&#39;;\n        data = dataBuffer;\n    }\n    \n    funcPtr(data);\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nvoid goodG2BSink(char * data);\n\nstatic void goodG2B()\n{\n    char * data;\n    void (*funcPtr) (char *) = goodG2BSink;\n    data = NULL; \n    {\n        \n        char * dataBuffer = new char;\n        *dataBuffer = \&#39;A\&#39;;\n        data = dataBuffer;\n    }\n    funcPtr(data);\n}\n\nvoid good()\n{\n    goodG2B();\n}\n\n#endif \n\n} \n\n\n\n#ifdef INCLUDEMAIN\n\nusing namespace char_alloca_65; \n\nint main(int argc, char * argv[])\n{\n    \n    srand( (unsigned)time(NULL) );\n#ifndef OMITGOOD\n    printLine(&quot;Calling good()...&quot;);\n    good();\n    printLine(&quot;Finished good()&quot;);\n#endif \n#ifndef OMITBAD\n    printLine(&quot;Calling bad()...&quot;);\n    bad();\n    printLine(&quot;Finished bad()&quot;);\n#endif \n    return 0;\n}\n\n#endif\n&#39;
Label: 19
TestCase:  b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\nnamespace fscanf_62\n{\n\n#ifndef OMITBAD\n\nvoid badSource(int &amp;data)\n{\n    \n    fscanf(stdin, &quot;%d&quot;, &amp;data);\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nvoid goodG2BSource(int &amp;data)\n{\n    \n    data = 20;\n}\n\n#endif \n\n} \n&#39;
Label: 20
TestCase:  b&#39;\n\n#ifndef OMITBAD\n\n#include &quot;std_testcase.h&quot;\n#include &quot;listen_socket_square_81.h&quot;\n\n#include &lt;math.h&gt;\n\nnamespace listen_socket_square_81\n{\n\nvoid listen_socket_square_81_bad::action(int data) const\n{\n    {\n        \n        int result = data * data;\n        printIntLine(result);\n    }\n}\n\n}\n#endif \n&#39;
Label: 6
TestCase:  b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\n#include &lt;wchar.h&gt;\n\n\nstatic const int STATIC_CONST_FIVE = 5;\n\nnamespace class_placement_new_06\n{\n\n#ifndef OMITBAD\n\nvoid bad()\n{\n    TwoIntsClass * data;\n    data = NULL; \n    if(STATIC_CONST_FIVE==5)\n    {\n        {\n            \n            char buffer[sizeof(TwoIntsClass)];\n            TwoIntsClass * dataBuffer = new(buffer) TwoIntsClass;\n            dataBuffer-&gt;intOne = 2;\n            dataBuffer-&gt;intTwo = 2;\n            data = dataBuffer;\n        }\n    }\n    printIntLine(data-&gt;intOne);\n    \n    delete data;\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nstatic void goodG2B1()\n{\n    TwoIntsClass * data;\n    data = NULL; \n    if(STATIC_CONST_FIVE!=5)\n    {\n        \n        printLine(&quot;Benign, fixed string&quot;);\n    }\n    else\n    {\n        {\n            \n            TwoIntsClass * dataBuffer = new TwoIntsClass;\n            dataBuffer-&gt;intOne = 2;\n            dataBuffer-&gt;intTwo = 2;\n            data = dataBuffer;\n        }\n    }\n    printIntLine(data-&gt;intOne);\n    \n    delete data;\n}\n\n\nstatic void goodG2B2()\n{\n    TwoIntsClass * data;\n    data = NULL; \n    if(STATIC_CONST_FIVE==5)\n    {\n        {\n            \n            TwoIntsClass * dataBuffer = new TwoIntsClass;\n            dataBuffer-&gt;intOne = 2;\n            dataBuffer-&gt;intTwo = 2;\n            data = dataBuffer;\n        }\n    }\n    printIntLine(data-&gt;intOne);\n    \n    delete data;\n}\n\nvoid good()\n{\n    goodG2B1();\n    goodG2B2();\n}\n\n#endif \n\n} \n\n\n\n#ifdef INCLUDEMAIN\n\nusing namespace class_placement_new_06; \n\nint main(int argc, char * argv[])\n{\n    \n    srand( (unsigned)time(NULL) );\n#ifndef OMITGOOD\n    printLine(&quot;Calling good()...&quot;);\n    good();\n    printLine(&quot;Finished good()&quot;);\n#endif \n#ifndef OMITBAD\n    printLine(&quot;Calling bad()...&quot;);\n    bad();\n    printLine(&quot;Finished bad()&quot;);\n#endif \n    return 0;\n}\n\n#endif\n&#39;
Label: 19
TestCase:  b&#39;\n\n#ifndef OMITBAD\n\n#include &quot;std_testcase.h&quot;\n#include &quot;array_free_struct_81.h&quot;\n\nnamespace array_free_struct_81\n{\n\nvoid array_free_struct_81_bad::action(twoIntsStruct * data) const\n{\n    \n    free(data);\n}\n\n}\n#endif \n&#39;
Label: 22
TestCase:  b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\n#ifdef _WIN32\n#define BASEPATH L&quot;c:\\\\temp\\\\&quot;\n#else\n#include &lt;wchar.h&gt;\n#define BASEPATH L&quot;/tmp/&quot;\n#endif\n\n#ifdef _WIN32\n#define FILENAME &quot;C:\\\\temp\\\\file.txt&quot;\n#else\n#define FILENAME &quot;/tmp/file.txt&quot;\n#endif\n\n#ifdef _WIN32\n#define FOPEN _wfopen\n#else\n#define FOPEN fopen\n#endif\n\nnamespace t_file_fopen_45\n{\n\nstatic wchar_t * badData;\nstatic wchar_t * goodG2BData;\n\n#ifndef OMITBAD\n\nstatic void badSink()\n{\n    wchar_t * data = badData;\n    {\n        FILE *pFile = NULL;\n        \n        pFile = FOPEN(data, L&quot;wb+&quot;);\n        if (pFile != NULL)\n        {\n            fclose(pFile);\n        }\n    }\n}\n\nvoid bad()\n{\n    wchar_t * data;\n    wchar_t dataBuffer[FILENAME_MAX] = BASEPATH;\n    data = dataBuffer;\n    {\n        \n        size_t dataLen = wcslen(data);\n        FILE * pFile;\n        \n        if (FILENAME_MAX-dataLen &gt; 1)\n        {\n            pFile = fopen(FILENAME, &quot;r&quot;);\n            if (pFile != NULL)\n            {\n                \n                if (fgetws(data+dataLen, (int)(FILENAME_MAX-dataLen), pFile) == NULL)\n                {\n                    printLine(&quot;fgetws() failed&quot;);\n                    \n                    data[dataLen] = L\&#39;\\0\&#39;;\n                }\n                fclose(pFile);\n            }\n        }\n    }\n    badData = data;\n    badSink();\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nstatic void goodG2BSink()\n{\n    wchar_t * data = goodG2BData;\n    {\n        FILE *pFile = NULL;\n        \n        pFile = FOPEN(data, L&quot;wb+&quot;);\n        if (pFile != NULL)\n        {\n            fclose(pFile);\n        }\n    }\n}\n\nstatic void goodG2B()\n{\n    wchar_t * data;\n    wchar_t dataBuffer[FILENAME_MAX] = BASEPATH;\n    data = dataBuffer;\n    \n    wcscat(data, L&quot;file.txt&quot;);\n    goodG2BData = data;\n    goodG2BSink();\n}\n\nvoid good()\n{\n    goodG2B();\n}\n\n#endif \n\n} \n\n\n\n#ifdef INCLUDEMAIN\n\nusing namespace t_file_fopen_45; \n\nint main(int argc, char * argv[])\n{\n    \n    srand( (unsigned)time(NULL) );\n#ifndef OMITGOOD\n    printLine(&quot;Calling good()...&quot;);\n    good();\n    printLine(&quot;Finished good()&quot;);\n#endif \n#ifndef OMITBAD\n    printLine(&quot;Calling bad()...&quot;);\n    bad();\n    printLine(&quot;Finished bad()&quot;);\n#endif \n    return 0;\n}\n\n#endif\n&#39;
Label: 11
TestCase:  b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\n#include &lt;wchar.h&gt;\n\nnamespace char_memcpy_22\n{\n\n#ifndef OMITBAD\n\n\nextern int badGlobal;\n\nchar * badSource(char * data)\n{\n    if(badGlobal)\n    {\n        {\n            char * dataBuffer = new char[100];\n            memset(dataBuffer, \&#39;A\&#39;, 100-1);\n            dataBuffer[100-1] = \&#39;\\0\&#39;;\n            \n            data = dataBuffer - 8;\n        }\n    }\n    return data;\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nextern int goodG2B1Global;\nextern int goodG2B2Global;\n\n\nchar * goodG2B1Source(char * data)\n{\n    if(goodG2B1Global)\n    {\n        \n        printLine(&quot;Benign, fixed string&quot;);\n    }\n    else\n    {\n        {\n            char * dataBuffer = new char[100];\n            memset(dataBuffer, \&#39;A\&#39;, 100-1);\n            dataBuffer[100-1] = \&#39;\\0\&#39;;\n            \n            data = dataBuffer;\n        }\n    }\n    return data;\n}\n\n\nchar * goodG2B2Source(char * data)\n{\n    if(goodG2B2Global)\n    {\n        {\n            char * dataBuffer = new char[100];\n            memset(dataBuffer, \&#39;A\&#39;, 100-1);\n            dataBuffer[100-1] = \&#39;\\0\&#39;;\n            \n            data = dataBuffer;\n        }\n    }\n    return data;\n}\n\n#endif \n\n} \n&#39;
Label: 4
TestCase:  b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\n#include &lt;wchar.h&gt;\n\n\nstatic int staticFive = 5;\n\nnamespace array_int64_t_static_07\n{\n\n#ifndef OMITBAD\n\nvoid bad()\n{\n    int64_t * data;\n    data = NULL; \n    if(staticFive==5)\n    {\n        {\n            \n            static int64_t dataBuffer[100];\n            {\n                size_t i;\n                for (i = 0; i &lt; 100; i++)\n                {\n                    dataBuffer[i] = 5LL;\n                }\n            }\n            data = dataBuffer;\n        }\n    }\n    printLongLongLine(data[0]);\n    \n    delete [] data;\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nstatic void goodG2B1()\n{\n    int64_t * data;\n    data = NULL; \n    if(staticFive!=5)\n    {\n        \n        printLine(&quot;Benign, fixed string&quot;);\n    }\n    else\n    {\n        {\n            \n            int64_t * dataBuffer = new int64_t[100];\n            {\n                size_t i;\n                for (i = 0; i &lt; 100; i++)\n                {\n                    dataBuffer[i] = 5LL;\n                }\n            }\n            data = dataBuffer;\n        }\n    }\n    printLongLongLine(data[0]);\n    \n    delete [] data;\n}\n\n\nstatic void goodG2B2()\n{\n    int64_t * data;\n    data = NULL; \n    if(staticFive==5)\n    {\n        {\n            \n            int64_t * dataBuffer = new int64_t[100];\n            {\n                size_t i;\n                for (i = 0; i &lt; 100; i++)\n                {\n                    dataBuffer[i] = 5LL;\n                }\n            }\n            data = dataBuffer;\n        }\n    }\n    printLongLongLine(data[0]);\n    \n    delete [] data;\n}\n\nvoid good()\n{\n    goodG2B1();\n    goodG2B2();\n}\n\n#endif \n\n} \n\n\n\n#ifdef INCLUDEMAIN\n\nusing namespace array_int64_t_static_07; \n\nint main(int argc, char * argv[])\n{\n    \n    srand( (unsigned)time(NULL) );\n#ifndef OMITGOOD\n    printLine(&quot;Calling good()...&quot;);\n    good();\n    printLine(&quot;Finished good()&quot;);\n#endif \n#ifndef OMITBAD\n    printLine(&quot;Calling bad()...&quot;);\n    bad();\n    printLine(&quot;Finished bad()&quot;);\n#endif \n    return 0;\n}\n\n#endif\n&#39;
Label: 19
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="162"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="dUsEOqTaVR_y" data-outputId="a597d261-a9bc-453a-b3b3-aaa579798147">
<div class="sourceCode" id="cb245"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb245-1"><a href="#cb245-1" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i, label <span class="kw">in</span> <span class="bu">enumerate</span>(raw_train_ds.class_names):</span>
<span id="cb245-2"><a href="#cb245-2" aria-hidden="true" tabindex="-1"></a>  <span class="bu">print</span>(<span class="st">&quot;Label&quot;</span>, i, <span class="st">&quot;corresponds to&quot;</span>, label)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Label 0 corresponds to CWE121_Stack_Based_Buffer_Overflow
Label 1 corresponds to CWE122_Heap_Based_Buffer_Overflow
Label 2 corresponds to CWE124_Buffer_Underwrite
Label 3 corresponds to CWE126_Buffer_Overread
Label 4 corresponds to CWE127_Buffer_Underread
Label 5 corresponds to CWE134_Uncontrolled_Format_String
Label 6 corresponds to CWE190_Integer_Overflow
Label 7 corresponds to CWE191_Integer_Underflow
Label 8 corresponds to CWE194_Unexpected_Sign_Extension
Label 9 corresponds to CWE195_Signed_to_Unsigned_Conversion_Error
Label 10 corresponds to CWE197_Numeric_Truncation_Error
Label 11 corresponds to CWE23_Relative_Path_Traversal
Label 12 corresponds to CWE369_Divide_by_Zero
Label 13 corresponds to CWE36_Absolute_Path_Traversal
Label 14 corresponds to CWE400_Resource_Exhaustion
Label 15 corresponds to CWE401_Memory_Leak
Label 16 corresponds to CWE415_Double_Free
Label 17 corresponds to CWE457_Use_of_Uninitialized_Variable
Label 18 corresponds to CWE563_Unused_Variable
Label 19 corresponds to CWE590_Free_Memory_Not_on_Heap
Label 20 corresponds to CWE680_Integer_Overflow_to_Buffer_Overflow
Label 21 corresponds to CWE690_NULL_Deref_From_Return
Label 22 corresponds to CWE762_Mismatched_Memory_Management_Routines
Label 23 corresponds to CWE789_Uncontrolled_Mem_Alloc
Label 24 corresponds to CWE78_OS_Command_Injection
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="163" id="8w-wFKoBVR_z">
<div class="sourceCode" id="cb247"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb247-1"><a href="#cb247-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 0 corresponds to CWE121_Stack_Based_Buffer_Overflow</span></span>
<span id="cb247-2"><a href="#cb247-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 1 corresponds to CWE122_Heap_Based_Buffer_Overflow</span></span>
<span id="cb247-3"><a href="#cb247-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 2 corresponds to CWE124_Buffer_Underwrite</span></span>
<span id="cb247-4"><a href="#cb247-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 3 corresponds to CWE126_Buffer_Overread</span></span>
<span id="cb247-5"><a href="#cb247-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 4 corresponds to CWE127_Buffer_Underread</span></span>
<span id="cb247-6"><a href="#cb247-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 5 corresponds to CWE134_Uncontrolled_Format_String</span></span>
<span id="cb247-7"><a href="#cb247-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 6 corresponds to CWE190_Integer_Overflow</span></span>
<span id="cb247-8"><a href="#cb247-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 7 corresponds to CWE191_Integer_Underflow</span></span>
<span id="cb247-9"><a href="#cb247-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 8 corresponds to CWE194_Unexpected_Sign_Extension</span></span>
<span id="cb247-10"><a href="#cb247-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 9 corresponds to CWE195_Signed_to_Unsigned_Conversion_Error</span></span>
<span id="cb247-11"><a href="#cb247-11" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 10 corresponds to CWE197_Numeric_Truncation_Error</span></span>
<span id="cb247-12"><a href="#cb247-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 11 corresponds to CWE23_Relative_Path_Traversal</span></span>
<span id="cb247-13"><a href="#cb247-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 12 corresponds to CWE369_Divide_by_Zero</span></span>
<span id="cb247-14"><a href="#cb247-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 13 corresponds to CWE36_Absolute_Path_Traversal</span></span>
<span id="cb247-15"><a href="#cb247-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 14 corresponds to CWE400_Resource_Exhaustion</span></span>
<span id="cb247-16"><a href="#cb247-16" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 15 corresponds to CWE401_Memory_Leak</span></span>
<span id="cb247-17"><a href="#cb247-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 16 corresponds to CWE415_Double_Free</span></span>
<span id="cb247-18"><a href="#cb247-18" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 17 corresponds to CWE457_Use_of_Uninitialized_Variable</span></span>
<span id="cb247-19"><a href="#cb247-19" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 18 corresponds to CWE563_Unused_Variable</span></span>
<span id="cb247-20"><a href="#cb247-20" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 19 corresponds to CWE590_Free_Memory_Not_on_Heap</span></span>
<span id="cb247-21"><a href="#cb247-21" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 20 corresponds to CWE680_Integer_Overflow_to_Buffer_Overflow</span></span>
<span id="cb247-22"><a href="#cb247-22" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 21 corresponds to CWE690_NULL_Deref_From_Return</span></span>
<span id="cb247-23"><a href="#cb247-23" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 22 corresponds to CWE762_Mismatched_Memory_Management_Routines</span></span>
<span id="cb247-24"><a href="#cb247-24" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 23 corresponds to CWE789_Uncontrolled_Mem_Alloc</span></span>
<span id="cb247-25"><a href="#cb247-25" aria-hidden="true" tabindex="-1"></a><span class="co"># Label 24 corresponds to CWE78_OS_Command_Injection</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="164"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="KLDhI4f9VR_2" data-outputId="43ef54ce-6a25-4cae-88b1-a4cb65af0b07">
<div class="sourceCode" id="cb248"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb248-1"><a href="#cb248-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a validation set.</span></span>
<span id="cb248-2"><a href="#cb248-2" aria-hidden="true" tabindex="-1"></a>raw_val_ds <span class="op">=</span> utils.text_dataset_from_directory(</span>
<span id="cb248-3"><a href="#cb248-3" aria-hidden="true" tabindex="-1"></a>    train_dir,</span>
<span id="cb248-4"><a href="#cb248-4" aria-hidden="true" tabindex="-1"></a>    batch_size<span class="op">=</span>batch_size,</span>
<span id="cb248-5"><a href="#cb248-5" aria-hidden="true" tabindex="-1"></a>    validation_split<span class="op">=</span><span class="fl">0.2</span>,</span>
<span id="cb248-6"><a href="#cb248-6" aria-hidden="true" tabindex="-1"></a>    subset<span class="op">=</span><span class="st">&#39;validation&#39;</span>,</span>
<span id="cb248-7"><a href="#cb248-7" aria-hidden="true" tabindex="-1"></a>    seed<span class="op">=</span>seed)</span>
<span id="cb248-8"><a href="#cb248-8" aria-hidden="true" tabindex="-1"></a><span class="co">## Found 6102 files belonging to 25 classes.</span></span>
<span id="cb248-9"><a href="#cb248-9" aria-hidden="true" tabindex="-1"></a><span class="co">## Using 1220 files for validation.</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Found 6102 files belonging to 25 classes.
Using 1220 files for validation.
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="165"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="q1b53ZpvVR_3" data-outputId="8d2f606d-7e44-47d1-be7d-7721d32351cd">
<div class="sourceCode" id="cb250"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb250-1"><a href="#cb250-1" aria-hidden="true" tabindex="-1"></a>test_dir <span class="op">=</span> dataset_dir<span class="op">/</span><span class="st">&#39;test&#39;</span></span>
<span id="cb250-2"><a href="#cb250-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb250-3"><a href="#cb250-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a test set.</span></span>
<span id="cb250-4"><a href="#cb250-4" aria-hidden="true" tabindex="-1"></a>raw_test_ds <span class="op">=</span> utils.text_dataset_from_directory(</span>
<span id="cb250-5"><a href="#cb250-5" aria-hidden="true" tabindex="-1"></a>    test_dir,</span>
<span id="cb250-6"><a href="#cb250-6" aria-hidden="true" tabindex="-1"></a>    batch_size<span class="op">=</span>batch_size)</span>
<span id="cb250-7"><a href="#cb250-7" aria-hidden="true" tabindex="-1"></a><span class="co">## Found 2781 files belonging to 25 classes.</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Found 2781 files belonging to 25 classes.
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="166" id="4b054FTbVR_4">
<div class="sourceCode" id="cb252"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb252-1"><a href="#cb252-1" aria-hidden="true" tabindex="-1"></a>raw_train_ds <span class="op">=</span> raw_train_ds.cache().prefetch(buffer_size<span class="op">=</span>tf.data.AUTOTUNE)</span>
<span id="cb252-2"><a href="#cb252-2" aria-hidden="true" tabindex="-1"></a>raw_val_ds <span class="op">=</span> raw_val_ds.cache().prefetch(buffer_size<span class="op">=</span>tf.data.AUTOTUNE)</span>
<span id="cb252-3"><a href="#cb252-3" aria-hidden="true" tabindex="-1"></a>raw_test_ds <span class="op">=</span> raw_test_ds.prefetch(buffer_size<span class="op">=</span>tf.data.AUTOTUNE)</span></code></pre></div>
</div>
<section id="prepare-the-dataset-for-training" class="cell markdown"
id="icojIg8vVR_5">
<h3>Prepare the dataset for training</h3>
</section>
<div class="cell code" data-execution_count="167" id="-7LMbVMBVR_6">
<div class="sourceCode" id="cb253"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb253-1"><a href="#cb253-1" aria-hidden="true" tabindex="-1"></a>VOCAB_SIZE <span class="op">=</span> VOCAB_SIZE</span>
<span id="cb253-2"><a href="#cb253-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb253-3"><a href="#cb253-3" aria-hidden="true" tabindex="-1"></a><span class="co"># binary_vectorize_layer = TextVectorization(</span></span>
<span id="cb253-4"><a href="#cb253-4" aria-hidden="true" tabindex="-1"></a>multi_class_vectorize_layer <span class="op">=</span> TextVectorization(</span>
<span id="cb253-5"><a href="#cb253-5" aria-hidden="true" tabindex="-1"></a>    max_tokens<span class="op">=</span>VOCAB_SIZE,</span>
<span id="cb253-6"><a href="#cb253-6" aria-hidden="true" tabindex="-1"></a>    output_mode<span class="op">=</span><span class="st">&#39;binary&#39;</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="168" id="kOgysNN1VR_7">
<div class="sourceCode" id="cb254"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb254-1"><a href="#cb254-1" aria-hidden="true" tabindex="-1"></a>MAX_SEQUENCE_LENGTH <span class="op">=</span> MAX_SEQUENCE_LENGTH</span>
<span id="cb254-2"><a href="#cb254-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb254-3"><a href="#cb254-3" aria-hidden="true" tabindex="-1"></a>int_vectorize_layer <span class="op">=</span> TextVectorization(</span>
<span id="cb254-4"><a href="#cb254-4" aria-hidden="true" tabindex="-1"></a>    max_tokens<span class="op">=</span>VOCAB_SIZE,</span>
<span id="cb254-5"><a href="#cb254-5" aria-hidden="true" tabindex="-1"></a>    output_mode<span class="op">=</span><span class="st">&#39;int&#39;</span>,</span>
<span id="cb254-6"><a href="#cb254-6" aria-hidden="true" tabindex="-1"></a>    output_sequence_length<span class="op">=</span>MAX_SEQUENCE_LENGTH)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="169" id="Nv7DRrViVR_9">
<div class="sourceCode" id="cb255"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb255-1"><a href="#cb255-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb255-2"><a href="#cb255-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Make a text-only dataset (without labels), then call `TextVectorization.adapt`.</span></span>
<span id="cb255-3"><a href="#cb255-3" aria-hidden="true" tabindex="-1"></a>train_text <span class="op">=</span> raw_train_ds.<span class="bu">map</span>(<span class="kw">lambda</span> text, labels: text)</span>
<span id="cb255-4"><a href="#cb255-4" aria-hidden="true" tabindex="-1"></a>multi_class_vectorize_layer.adapt(train_text)</span>
<span id="cb255-5"><a href="#cb255-5" aria-hidden="true" tabindex="-1"></a>int_vectorize_layer.adapt(train_text)</span>
<span id="cb255-6"><a href="#cb255-6" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 1.2 s, sys: 212 ms, total: 1.42 s</span></span>
<span id="cb255-7"><a href="#cb255-7" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 981 ms</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="170" id="k2zRypTrVR_-">
<div class="sourceCode" id="cb256"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb256-1"><a href="#cb256-1" aria-hidden="true" tabindex="-1"></a>text_batch, label_batch <span class="op">=</span> <span class="bu">next</span>(<span class="bu">iter</span>(raw_train_ds))</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="171"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="F4l9oj2ZVR_-" data-outputId="8d94f95f-2f71-4818-a2fb-3e55afc60d3d">
<div class="sourceCode" id="cb257"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb257-1"><a href="#cb257-1" aria-hidden="true" tabindex="-1"></a>first_TestCase, first_label <span class="op">=</span> text_batch[<span class="dv">0</span>], label_batch[<span class="dv">0</span>]</span>
<span id="cb257-2"><a href="#cb257-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;TestCase:&quot;</span>, first_TestCase)</span>
<span id="cb257-3"><a href="#cb257-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Label:&quot;</span>, first_label)</span>
<span id="cb257-4"><a href="#cb257-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb257-5"><a href="#cb257-5" aria-hidden="true" tabindex="-1"></a><span class="co">## TestCase: tf.Tensor(b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n#include &lt;vector&gt;\n\nusing namespace std;\n\nnamespace t_rand_multiply_72 ...</span></span>
<span id="cb257-6"><a href="#cb257-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Label: tf.Tensor(7, shape=(), dtype=int32)</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>TestCase: tf.Tensor(b&#39;\n\n\n#include &quot;std_testcase.h&quot;\n\n#ifndef _WIN32\n#include &lt;wchar.h&gt;\n#endif\n\n#define ENV_VARIABLE L&quot;ADD&quot;\n\n#ifdef _WIN32\n#define GETENV _wgetenv\n#else\n#define GETENV getenv\n#endif\n\n#ifdef _WIN32\n#define OPEN _wopen\n#define CLOSE _close\n#else\n#include &lt;unistd.h&gt;\n#define OPEN open\n#define CLOSE close\n#endif\n\nnamespace t_environment_open_54\n{\n\n\n\n#ifndef OMITBAD\n\nvoid badSink_e(wchar_t * data)\n{\n    {\n        int fileDesc;\n        \n        fileDesc = OPEN(data, O_RDWR|O_CREAT, S_IREAD|S_IWRITE);\n        if (fileDesc != -1)\n        {\n            CLOSE(fileDesc);\n        }\n    }\n}\n\n#endif \n\n#ifndef OMITGOOD\n\n\nvoid goodG2BSink_e(wchar_t * data)\n{\n    {\n        int fileDesc;\n        \n        fileDesc = OPEN(data, O_RDWR|O_CREAT, S_IREAD|S_IWRITE);\n        if (fileDesc != -1)\n        {\n            CLOSE(fileDesc);\n        }\n    }\n}\n\n#endif \n\n} \n&#39;, shape=(), dtype=string)
Label: tf.Tensor(13, shape=(), dtype=int32)
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="274"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:452}"
id="oaMkkkWnVSAA" data-outputId="2998a004-4e64-48cd-cd56-5bdfbe7cf754">
<div class="sourceCode" id="cb259"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb259-1"><a href="#cb259-1" aria-hidden="true" tabindex="-1"></a><span class="co"># print(&quot;&#39;binary&#39; vectorized TestCase:&quot;,</span></span>
<span id="cb259-2"><a href="#cb259-2" aria-hidden="true" tabindex="-1"></a><span class="co">#       list(multi_class_vectorize_layer(first_TestCase).numpy()))</span></span>
<span id="cb259-3"><a href="#cb259-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb259-4"><a href="#cb259-4" aria-hidden="true" tabindex="-1"></a>plt.plot(multi_class_vectorize_layer(first_TestCase).numpy())</span>
<span id="cb259-5"><a href="#cb259-5" aria-hidden="true" tabindex="-1"></a>plt.xlim(<span class="dv">0</span>,<span class="dv">1000</span>)</span>
<span id="cb259-6"><a href="#cb259-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb259-7"><a href="#cb259-7" aria-hidden="true" tabindex="-1"></a><span class="co">## (0.0, 1000.0)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="274">
<pre><code>(0.0, 1000.0)</code></pre>
</div>
<div class="output display_data">
<p><img
src="vertopal_3d2211f5135e46f9b8501b72a53acc02/8cdd9aa73d06ce50a1768b405436e309eacdff99.png" /></p>
</div>
</div>
<div class="cell code" data-execution_count="173"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="opWg5vGtVSAB" data-outputId="bee1b5dd-e3a6-4226-90aa-d08ccc9d418d">
<div class="sourceCode" id="cb261"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb261-1"><a href="#cb261-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;&#39;int&#39; vectorized TestCase:&quot;</span>,</span>
<span id="cb261-2"><a href="#cb261-2" aria-hidden="true" tabindex="-1"></a>      int_vectorize_layer(first_TestCase).numpy())</span></code></pre></div>
<div class="output stream stdout">
<pre><code>&#39;int&#39; vectorized TestCase: [   5   24    6   26    5   47    3   14  375  534   18   26   14  166
  531   30   14  166  166    3   18   26   14  209  562   14   77   77
   30    5   93   14  209  209   14   77   77    3   12 2758    6   16
    4  807    2   10  178  178  366  365  363   13  178   19  369    3
    6   15    4  797    2   10  178  178  366  365  363   13  178   19
  369    3    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="174" id="ceji_KttVSAB">
<div class="sourceCode" id="cb263"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb263-1"><a href="#cb263-1" aria-hidden="true" tabindex="-1"></a><span class="co"># &#39;int&#39; vectorized TestCase: [   5   24    5  226   29   12   71   12 1667    6   16    4  787   74</span></span>
<span id="cb263-2"><a href="#cb263-2" aria-hidden="true" tabindex="-1"></a><span class="co">#    69    2  218  299    7   69  120    2   83  465    3    6   15    4</span></span>
<span id="cb263-3"><a href="#cb263-3" aria-hidden="true" tabindex="-1"></a><span class="co">#   851   74   69    2  218  299    7   69  120    2   83  465    4  854</span></span>
<span id="cb263-4"><a href="#cb263-4" aria-hidden="true" tabindex="-1"></a><span class="co">#    74   69    2  218  299    7   13    2  931   69  120    2   83  465</span></span>
<span id="cb263-5"><a href="#cb263-5" aria-hidden="true" tabindex="-1"></a><span class="co">#    30  151  315   91  179  642  287  316  644    3    0    0    0    0</span></span>
<span id="cb263-6"><a href="#cb263-6" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-7"><a href="#cb263-7" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-8"><a href="#cb263-8" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-9"><a href="#cb263-9" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-10"><a href="#cb263-10" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-11"><a href="#cb263-11" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-12"><a href="#cb263-12" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-13"><a href="#cb263-13" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-14"><a href="#cb263-14" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-15"><a href="#cb263-15" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-16"><a href="#cb263-16" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-17"><a href="#cb263-17" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0    0    0</span></span>
<span id="cb263-18"><a href="#cb263-18" aria-hidden="true" tabindex="-1"></a><span class="co">#     0    0    0    0    0    0    0    0    0    0    0    0]</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="175"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="VG_iOBWNVSAD" data-outputId="a3f978e3-e9ce-4e10-9586-4b4775381c86">
<div class="sourceCode" id="cb264"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb264-1"><a href="#cb264-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;1289 ---&gt; &quot;</span>, int_vectorize_layer.get_vocabulary()[<span class="dv">1289</span>])</span>
<span id="cb264-2"><a href="#cb264-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;313 ---&gt; &quot;</span>, int_vectorize_layer.get_vocabulary()[<span class="dv">313</span>])</span>
<span id="cb264-3"><a href="#cb264-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Vocabulary size: </span><span class="sc">{}</span><span class="st">&quot;</span>.<span class="bu">format</span>(<span class="bu">len</span>(int_vectorize_layer.get_vocabulary())))</span>
<span id="cb264-4"><a href="#cb264-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb264-5"><a href="#cb264-5" aria-hidden="true" tabindex="-1"></a><span class="co">## 1289 ---&gt;  wchartmemcpy81base</span></span>
<span id="cb264-6"><a href="#cb264-6" aria-hidden="true" tabindex="-1"></a><span class="co">## 313 ---&gt;  6</span></span>
<span id="cb264-7"><a href="#cb264-7" aria-hidden="true" tabindex="-1"></a><span class="co">## Vocabulary size: 9203</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>1289 ---&gt;  wchartmemcpy81base
313 ---&gt;  6
Vocabulary size: 9203
</code></pre>
</div>
</div>
<section id="baseline-beating-cnn-model" class="cell markdown"
id="MD2uVvWaVSAE">
<h1>Baseline-beating CNN Model</h1>
</section>
<div class="cell code" data-execution_count="176" id="IWdmaCoDVSAF">
<div class="sourceCode" id="cb266"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb266-1"><a href="#cb266-1" aria-hidden="true" tabindex="-1"></a><span class="co"># epochs=2 #10</span></span>
<span id="cb266-2"><a href="#cb266-2" aria-hidden="true" tabindex="-1"></a><span class="co"># new_num_labels=25</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="177" id="u7OXYHkJVSAF">
<div class="sourceCode" id="cb267"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb267-1"><a href="#cb267-1" aria-hidden="true" tabindex="-1"></a><span class="co"># binary_model = tf.keras.Sequential([</span></span>
<span id="cb267-2"><a href="#cb267-2" aria-hidden="true" tabindex="-1"></a>clf_model <span class="op">=</span> tf.keras.Sequential([</span>
<span id="cb267-3"><a href="#cb267-3" aria-hidden="true" tabindex="-1"></a>    <span class="co"># binary_vectorize_layer,</span></span>
<span id="cb267-4"><a href="#cb267-4" aria-hidden="true" tabindex="-1"></a>    multi_class_vectorize_layer,</span>
<span id="cb267-5"><a href="#cb267-5" aria-hidden="true" tabindex="-1"></a>    <span class="co"># layers.Dense(4)]) #25</span></span>
<span id="cb267-6"><a href="#cb267-6" aria-hidden="true" tabindex="-1"></a>    layers.Dense(new_num_labels)])</span>
<span id="cb267-7"><a href="#cb267-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb267-8"><a href="#cb267-8" aria-hidden="true" tabindex="-1"></a>clf_model.<span class="bu">compile</span>(</span>
<span id="cb267-9"><a href="#cb267-9" aria-hidden="true" tabindex="-1"></a>    loss<span class="op">=</span>losses.SparseCategoricalCrossentropy(from_logits<span class="op">=</span><span class="va">True</span>),</span>
<span id="cb267-10"><a href="#cb267-10" aria-hidden="true" tabindex="-1"></a>    optimizer<span class="op">=</span><span class="st">&#39;adam&#39;</span>,</span>
<span id="cb267-11"><a href="#cb267-11" aria-hidden="true" tabindex="-1"></a>    metrics<span class="op">=</span>[<span class="st">&#39;accuracy&#39;</span>])</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="178"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:312}"
id="luPGJfAsVSAG" data-outputId="384b15fb-c67e-4d84-cb8c-1787fa06b796">
<div class="sourceCode" id="cb268"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb268-1"><a href="#cb268-1" aria-hidden="true" tabindex="-1"></a>tf.keras.utils.plot_model(clf_model, show_shapes<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb268-2"><a href="#cb268-2" aria-hidden="true" tabindex="-1"></a><span class="co"># ValueError: This model has not yet been built. Build the model first by calling `build()` or by calling the model on a batch of data.</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="178">
<p><img
src="vertopal_3d2211f5135e46f9b8501b72a53acc02/2c2ecc1421b107b7723de653bdc0dbfe2e62691e.png" /></p>
</div>
</div>
<div class="cell code" data-execution_count="179"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Z-s8FT5tVSAG" data-outputId="dd3011ad-28c2-4af3-93c4-f17eaf9754ba">
<div class="sourceCode" id="cb269"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb269-1"><a href="#cb269-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb269-2"><a href="#cb269-2" aria-hidden="true" tabindex="-1"></a>bin_history <span class="op">=</span> clf_model.fit(</span>
<span id="cb269-3"><a href="#cb269-3" aria-hidden="true" tabindex="-1"></a>    raw_train_ds, validation_data<span class="op">=</span>raw_val_ds, epochs<span class="op">=</span>epochs) <span class="co">#10) # epochs)</span></span>
<span id="cb269-4"><a href="#cb269-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb269-5"><a href="#cb269-5" aria-hidden="true" tabindex="-1"></a><span class="co"># print()</span></span>
<span id="cb269-6"><a href="#cb269-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb269-7"><a href="#cb269-7" aria-hidden="true" tabindex="-1"></a>    <span class="co"># raw_train_ds, validation_data=raw_val_ds, epochs=2) # epochs) #epochs=10)</span></span>
<span id="cb269-8"><a href="#cb269-8" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 3.55 s, sys: 467 ms, total: 4.01 s</span></span>
<span id="cb269-9"><a href="#cb269-9" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 2.21 s</span></span>
<span id="cb269-10"><a href="#cb269-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb269-11"><a href="#cb269-11" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 13.7 s, sys: 1.73 s, total: 15.4 s</span></span>
<span id="cb269-12"><a href="#cb269-12" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 16.3 s ## 7.31 s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Epoch 1/15
153/153 [==============================] - 2s 11ms/step - loss: 2.5163 - accuracy: 0.5401 - val_loss: 1.9911 - val_accuracy: 0.7434
Epoch 2/15
153/153 [==============================] - 1s 9ms/step - loss: 1.6466 - accuracy: 0.7911 - val_loss: 1.4668 - val_accuracy: 0.7893
Epoch 3/15
153/153 [==============================] - 1s 9ms/step - loss: 1.2317 - accuracy: 0.8429 - val_loss: 1.1847 - val_accuracy: 0.8139
Epoch 4/15
153/153 [==============================] - 1s 8ms/step - loss: 0.9859 - accuracy: 0.8707 - val_loss: 1.0081 - val_accuracy: 0.8279
Epoch 5/15
153/153 [==============================] - 1s 10ms/step - loss: 0.8208 - accuracy: 0.8959 - val_loss: 0.8869 - val_accuracy: 0.8426
Epoch 6/15
153/153 [==============================] - 2s 11ms/step - loss: 0.7014 - accuracy: 0.9121 - val_loss: 0.7984 - val_accuracy: 0.8492
Epoch 7/15
153/153 [==============================] - 1s 9ms/step - loss: 0.6106 - accuracy: 0.9252 - val_loss: 0.7308 - val_accuracy: 0.8566
Epoch 8/15
153/153 [==============================] - 1s 9ms/step - loss: 0.5390 - accuracy: 0.9353 - val_loss: 0.6774 - val_accuracy: 0.8623
Epoch 9/15
153/153 [==============================] - 1s 8ms/step - loss: 0.4811 - accuracy: 0.9437 - val_loss: 0.6342 - val_accuracy: 0.8680
Epoch 10/15
153/153 [==============================] - 1s 9ms/step - loss: 0.4332 - accuracy: 0.9525 - val_loss: 0.5985 - val_accuracy: 0.8697
Epoch 11/15
153/153 [==============================] - 1s 9ms/step - loss: 0.3929 - accuracy: 0.9588 - val_loss: 0.5685 - val_accuracy: 0.8721
Epoch 12/15
153/153 [==============================] - 2s 13ms/step - loss: 0.3585 - accuracy: 0.9629 - val_loss: 0.5429 - val_accuracy: 0.8713
Epoch 13/15
153/153 [==============================] - 1s 8ms/step - loss: 0.3289 - accuracy: 0.9662 - val_loss: 0.5209 - val_accuracy: 0.8779
Epoch 14/15
153/153 [==============================] - 1s 9ms/step - loss: 0.3030 - accuracy: 0.9685 - val_loss: 0.5017 - val_accuracy: 0.8779
Epoch 15/15
153/153 [==============================] - 1s 8ms/step - loss: 0.2803 - accuracy: 0.9699 - val_loss: 0.4848 - val_accuracy: 0.8811
CPU times: user 25.6 s, sys: 1.04 s, total: 26.7 s
Wall time: 29.7 s
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="180" id="OlnFJlyUVSAH">
<div class="sourceCode" id="cb271"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb271-1"><a href="#cb271-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 1/10</span></span>
<span id="cb271-2"><a href="#cb271-2" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 1.2311 - accuracy: 0.8449 - val_loss: 1.1815 - val_accuracy: 0.8270</span></span>
<span id="cb271-3"><a href="#cb271-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 2/10</span></span>
<span id="cb271-4"><a href="#cb271-4" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.9858 - accuracy: 0.8744 - val_loss: 1.0057 - val_accuracy: 0.8377</span></span>
<span id="cb271-5"><a href="#cb271-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 3/10</span></span>
<span id="cb271-6"><a href="#cb271-6" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.8211 - accuracy: 0.8996 - val_loss: 0.8850 - val_accuracy: 0.8434</span></span>
<span id="cb271-7"><a href="#cb271-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 4/10</span></span>
<span id="cb271-8"><a href="#cb271-8" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.7018 - accuracy: 0.9146 - val_loss: 0.7968 - val_accuracy: 0.8516</span></span>
<span id="cb271-9"><a href="#cb271-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 5/10</span></span>
<span id="cb271-10"><a href="#cb271-10" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.6111 - accuracy: 0.9273 - val_loss: 0.7295 - val_accuracy: 0.8566</span></span>
<span id="cb271-11"><a href="#cb271-11" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 6/10</span></span>
<span id="cb271-12"><a href="#cb271-12" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.5395 - accuracy: 0.9373 - val_loss: 0.6764 - val_accuracy: 0.8631</span></span>
<span id="cb271-13"><a href="#cb271-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 7/10</span></span>
<span id="cb271-14"><a href="#cb271-14" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.4816 - accuracy: 0.9461 - val_loss: 0.6335 - val_accuracy: 0.8656</span></span>
<span id="cb271-15"><a href="#cb271-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 8/10</span></span>
<span id="cb271-16"><a href="#cb271-16" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.4337 - accuracy: 0.9537 - val_loss: 0.5980 - val_accuracy: 0.8689</span></span>
<span id="cb271-17"><a href="#cb271-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 9/10</span></span>
<span id="cb271-18"><a href="#cb271-18" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.3934 - accuracy: 0.9588 - val_loss: 0.5681 - val_accuracy: 0.8738</span></span>
<span id="cb271-19"><a href="#cb271-19" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 10/10</span></span>
<span id="cb271-20"><a href="#cb271-20" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 1s 5ms/step - loss: 0.3590 - accuracy: 0.9635 - val_loss: 0.5427 - val_accuracy: 0.8762</span></span>
<span id="cb271-21"><a href="#cb271-21" aria-hidden="true" tabindex="-1"></a><span class="co"># CPU times: user 13.7 s, sys: 1.73 s, total: 15.4 s</span></span>
<span id="cb271-22"><a href="#cb271-22" aria-hidden="true" tabindex="-1"></a><span class="co"># Wall time: 7.31 s</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="181" id="rnVNLZX6VSAI">
<div class="sourceCode" id="cb272"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb272-1"><a href="#cb272-1" aria-hidden="true" tabindex="-1"></a><span class="kw">def</span> create_model(vocab_size, num_labels, vectorizer<span class="op">=</span><span class="va">None</span>):</span>
<span id="cb272-2"><a href="#cb272-2" aria-hidden="true" tabindex="-1"></a>  my_layers <span class="op">=</span>[]</span>
<span id="cb272-3"><a href="#cb272-3" aria-hidden="true" tabindex="-1"></a>  <span class="cf">if</span> vectorizer <span class="kw">is</span> <span class="kw">not</span> <span class="va">None</span>:</span>
<span id="cb272-4"><a href="#cb272-4" aria-hidden="true" tabindex="-1"></a>    my_layers <span class="op">=</span> [vectorizer]</span>
<span id="cb272-5"><a href="#cb272-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb272-6"><a href="#cb272-6" aria-hidden="true" tabindex="-1"></a>  my_layers.extend([</span>
<span id="cb272-7"><a href="#cb272-7" aria-hidden="true" tabindex="-1"></a>      layers.Embedding(vocab_size, <span class="dv">64</span>, mask_zero<span class="op">=</span><span class="va">True</span>),</span>
<span id="cb272-8"><a href="#cb272-8" aria-hidden="true" tabindex="-1"></a>      layers.Dropout(<span class="fl">0.5</span>),</span>
<span id="cb272-9"><a href="#cb272-9" aria-hidden="true" tabindex="-1"></a>      layers.Conv1D(<span class="dv">64</span>, <span class="dv">5</span>, padding<span class="op">=</span><span class="st">&quot;valid&quot;</span>, activation<span class="op">=</span><span class="st">&quot;relu&quot;</span>, strides<span class="op">=</span><span class="dv">2</span>),</span>
<span id="cb272-10"><a href="#cb272-10" aria-hidden="true" tabindex="-1"></a>      layers.GlobalMaxPooling1D(),</span>
<span id="cb272-11"><a href="#cb272-11" aria-hidden="true" tabindex="-1"></a>      layers.Dense(num_labels)</span>
<span id="cb272-12"><a href="#cb272-12" aria-hidden="true" tabindex="-1"></a>  ])</span>
<span id="cb272-13"><a href="#cb272-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb272-14"><a href="#cb272-14" aria-hidden="true" tabindex="-1"></a>  model <span class="op">=</span> tf.keras.Sequential(my_layers)</span>
<span id="cb272-15"><a href="#cb272-15" aria-hidden="true" tabindex="-1"></a>  <span class="cf">return</span> model</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="182"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:754}"
id="nJz6AzVJVSAJ" data-outputId="6721cc2a-cdce-4f7c-d1fa-a9691251b974">
<div class="sourceCode" id="cb273"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb273-1"><a href="#cb273-1" aria-hidden="true" tabindex="-1"></a><span class="co"># `vocab_size` is `VOCAB_SIZE + 1` ## `0 used for padding additionally.</span></span>
<span id="cb273-2"><a href="#cb273-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb273-3"><a href="#cb273-3" aria-hidden="true" tabindex="-1"></a><span class="co"># int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=4, vectorizer=int_vectorize_layer)</span></span>
<span id="cb273-4"><a href="#cb273-4" aria-hidden="true" tabindex="-1"></a>int_model <span class="op">=</span> create_model(vocab_size<span class="op">=</span>VOCAB_SIZE <span class="op">+</span> <span class="dv">1</span>, num_labels<span class="op">=</span>new_num_labels, vectorizer<span class="op">=</span>int_vectorize_layer) <span class="co">#num_labels=4</span></span>
<span id="cb273-5"><a href="#cb273-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb273-6"><a href="#cb273-6" aria-hidden="true" tabindex="-1"></a>tf.keras.utils.plot_model(int_model, show_shapes<span class="op">=</span><span class="va">True</span>)</span></code></pre></div>
<div class="output execute_result" data-execution_count="182">
<p><img
src="vertopal_3d2211f5135e46f9b8501b72a53acc02/e5656af29daafbe1e1a5d70fec71a09f65f250f5.png" /></p>
</div>
</div>
<div class="cell code" data-execution_count="183"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="UCxJG_rQVSAJ" data-outputId="bf5ee608-1cbc-438f-a963-604a36db1a68">
<div class="sourceCode" id="cb274"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb274-1"><a href="#cb274-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb274-2"><a href="#cb274-2" aria-hidden="true" tabindex="-1"></a>int_model.<span class="bu">compile</span>(</span>
<span id="cb274-3"><a href="#cb274-3" aria-hidden="true" tabindex="-1"></a>    loss<span class="op">=</span>losses.SparseCategoricalCrossentropy(from_logits<span class="op">=</span><span class="va">True</span>),</span>
<span id="cb274-4"><a href="#cb274-4" aria-hidden="true" tabindex="-1"></a>    optimizer<span class="op">=</span><span class="st">&#39;adam&#39;</span>,</span>
<span id="cb274-5"><a href="#cb274-5" aria-hidden="true" tabindex="-1"></a>    metrics<span class="op">=</span>[<span class="st">&#39;accuracy&#39;</span>])</span>
<span id="cb274-6"><a href="#cb274-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb274-7"><a href="#cb274-7" aria-hidden="true" tabindex="-1"></a>int_history <span class="op">=</span> int_model.fit(raw_train_ds, validation_data<span class="op">=</span>raw_val_ds, epochs<span class="op">=</span>epochs) <span class="co"># 10 # 15 #30</span></span>
<span id="cb274-8"><a href="#cb274-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb274-9"><a href="#cb274-9" aria-hidden="true" tabindex="-1"></a><span class="co"># int_history = int_model.fit(raw_train_ds, validation_data=raw_val_ds, epochs=10) # 10</span></span>
<span id="cb274-10"><a href="#cb274-10" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 1min 12s, sys: 6.5 s, total: 1min 18s</span></span>
<span id="cb274-11"><a href="#cb274-11" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 36.9 s ## 19 s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Epoch 1/15
153/153 [==============================] - 7s 38ms/step - loss: 3.0561 - accuracy: 0.2003 - val_loss: 2.6361 - val_accuracy: 0.4566
Epoch 2/15
153/153 [==============================] - 5s 29ms/step - loss: 2.0293 - accuracy: 0.5244 - val_loss: 1.3931 - val_accuracy: 0.6369
Epoch 3/15
153/153 [==============================] - 6s 38ms/step - loss: 1.1298 - accuracy: 0.6940 - val_loss: 0.8160 - val_accuracy: 0.7893
Epoch 4/15
153/153 [==============================] - 4s 29ms/step - loss: 0.7124 - accuracy: 0.8097 - val_loss: 0.5603 - val_accuracy: 0.8484
Epoch 5/15
153/153 [==============================] - 5s 31ms/step - loss: 0.4967 - accuracy: 0.8644 - val_loss: 0.4404 - val_accuracy: 0.8746
Epoch 6/15
153/153 [==============================] - 5s 34ms/step - loss: 0.3827 - accuracy: 0.8978 - val_loss: 0.3835 - val_accuracy: 0.8836
Epoch 7/15
153/153 [==============================] - 6s 37ms/step - loss: 0.3063 - accuracy: 0.9166 - val_loss: 0.3424 - val_accuracy: 0.8910
Epoch 8/15
153/153 [==============================] - 4s 29ms/step - loss: 0.2523 - accuracy: 0.9254 - val_loss: 0.3256 - val_accuracy: 0.8951
Epoch 9/15
153/153 [==============================] - 6s 37ms/step - loss: 0.2111 - accuracy: 0.9385 - val_loss: 0.3080 - val_accuracy: 0.9000
Epoch 10/15
153/153 [==============================] - 5s 34ms/step - loss: 0.1796 - accuracy: 0.9461 - val_loss: 0.2971 - val_accuracy: 0.8992
Epoch 11/15
153/153 [==============================] - 5s 32ms/step - loss: 0.1619 - accuracy: 0.9478 - val_loss: 0.2930 - val_accuracy: 0.8975
Epoch 12/15
153/153 [==============================] - 4s 29ms/step - loss: 0.1423 - accuracy: 0.9562 - val_loss: 0.2880 - val_accuracy: 0.9008
Epoch 13/15
153/153 [==============================] - 6s 37ms/step - loss: 0.1264 - accuracy: 0.9623 - val_loss: 0.2938 - val_accuracy: 0.8992
Epoch 14/15
153/153 [==============================] - 4s 29ms/step - loss: 0.1131 - accuracy: 0.9650 - val_loss: 0.2860 - val_accuracy: 0.9008
Epoch 15/15
153/153 [==============================] - 4s 29ms/step - loss: 0.1054 - accuracy: 0.9676 - val_loss: 0.2886 - val_accuracy: 0.9000
CPU times: user 1min 45s, sys: 3.28 s, total: 1min 48s
Wall time: 1min 37s
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="184" id="zW_AMlI4VSAK">
<div class="sourceCode" id="cb276"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb276-1"><a href="#cb276-1" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 1/10</span></span>
<span id="cb276-2"><a href="#cb276-2" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 3s 13ms/step - loss: 1.1379 - accuracy: 0.6780 - val_loss: 0.9188 - val_accuracy: 0.7361</span></span>
<span id="cb276-3"><a href="#cb276-3" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 2/10</span></span>
<span id="cb276-4"><a href="#cb276-4" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 12ms/step - loss: 0.8141 - accuracy: 0.7638 - val_loss: 0.6918 - val_accuracy: 0.8008</span></span>
<span id="cb276-5"><a href="#cb276-5" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 3/10</span></span>
<span id="cb276-6"><a href="#cb276-6" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 11ms/step - loss: 0.5948 - accuracy: 0.8265 - val_loss: 0.5431 - val_accuracy: 0.8467</span></span>
<span id="cb276-7"><a href="#cb276-7" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 4/10</span></span>
<span id="cb276-8"><a href="#cb276-8" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 11ms/step - loss: 0.4447 - accuracy: 0.8734 - val_loss: 0.4490 - val_accuracy: 0.8680</span></span>
<span id="cb276-9"><a href="#cb276-9" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 5/10</span></span>
<span id="cb276-10"><a href="#cb276-10" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 11ms/step - loss: 0.3474 - accuracy: 0.9007 - val_loss: 0.3991 - val_accuracy: 0.8754</span></span>
<span id="cb276-11"><a href="#cb276-11" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 6/10</span></span>
<span id="cb276-12"><a href="#cb276-12" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 13ms/step - loss: 0.2838 - accuracy: 0.9177 - val_loss: 0.3596 - val_accuracy: 0.8877</span></span>
<span id="cb276-13"><a href="#cb276-13" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 7/10</span></span>
<span id="cb276-14"><a href="#cb276-14" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 12ms/step - loss: 0.2363 - accuracy: 0.9283 - val_loss: 0.3366 - val_accuracy: 0.8893</span></span>
<span id="cb276-15"><a href="#cb276-15" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 8/10</span></span>
<span id="cb276-16"><a href="#cb276-16" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 12ms/step - loss: 0.2001 - accuracy: 0.9359 - val_loss: 0.3241 - val_accuracy: 0.8926</span></span>
<span id="cb276-17"><a href="#cb276-17" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 9/10</span></span>
<span id="cb276-18"><a href="#cb276-18" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 12ms/step - loss: 0.1706 - accuracy: 0.9469 - val_loss: 0.3140 - val_accuracy: 0.8934</span></span>
<span id="cb276-19"><a href="#cb276-19" aria-hidden="true" tabindex="-1"></a><span class="co"># # Epoch 10/10</span></span>
<span id="cb276-20"><a href="#cb276-20" aria-hidden="true" tabindex="-1"></a><span class="co"># # 153/153 [==============================] - 2s 11ms/step - loss: 0.1559 - accuracy: 0.9502 - val_loss: 0.3109 - val_accuracy: 0.8959</span></span>
<span id="cb276-21"><a href="#cb276-21" aria-hidden="true" tabindex="-1"></a><span class="co"># # CPU times: user 1min 12s, sys: 6.5 s, total: 1min 18s</span></span>
<span id="cb276-22"><a href="#cb276-22" aria-hidden="true" tabindex="-1"></a><span class="co"># # Wall time: 19 s</span></span>
<span id="cb276-23"><a href="#cb276-23" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb276-24"><a href="#cb276-24" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 1/30</span></span>
<span id="cb276-25"><a href="#cb276-25" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 11s 55ms/step - loss: 3.0357 - accuracy: 0.2030 - val_loss: 2.5830 - val_accuracy: 0.4385</span></span>
<span id="cb276-26"><a href="#cb276-26" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 2/30</span></span>
<span id="cb276-27"><a href="#cb276-27" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 39ms/step - loss: 1.9925 - accuracy: 0.5104 - val_loss: 1.3674 - val_accuracy: 0.6861</span></span>
<span id="cb276-28"><a href="#cb276-28" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 3/30</span></span>
<span id="cb276-29"><a href="#cb276-29" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 48ms/step - loss: 1.0896 - accuracy: 0.7141 - val_loss: 0.7781 - val_accuracy: 0.7967</span></span>
<span id="cb276-30"><a href="#cb276-30" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 4/30</span></span>
<span id="cb276-31"><a href="#cb276-31" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 38ms/step - loss: 0.6872 - accuracy: 0.8081 - val_loss: 0.5537 - val_accuracy: 0.8459</span></span>
<span id="cb276-32"><a href="#cb276-32" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 5/30</span></span>
<span id="cb276-33"><a href="#cb276-33" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 39ms/step - loss: 0.4933 - accuracy: 0.8587 - val_loss: 0.4498 - val_accuracy: 0.8713</span></span>
<span id="cb276-34"><a href="#cb276-34" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 6/30</span></span>
<span id="cb276-35"><a href="#cb276-35" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 41ms/step - loss: 0.3738 - accuracy: 0.8935 - val_loss: 0.3869 - val_accuracy: 0.8877</span></span>
<span id="cb276-36"><a href="#cb276-36" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 7/30</span></span>
<span id="cb276-37"><a href="#cb276-37" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 47ms/step - loss: 0.2988 - accuracy: 0.9144 - val_loss: 0.3549 - val_accuracy: 0.8902</span></span>
<span id="cb276-38"><a href="#cb276-38" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 8/30</span></span>
<span id="cb276-39"><a href="#cb276-39" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 39ms/step - loss: 0.2461 - accuracy: 0.9228 - val_loss: 0.3320 - val_accuracy: 0.8959</span></span>
<span id="cb276-40"><a href="#cb276-40" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 9/30</span></span>
<span id="cb276-41"><a href="#cb276-41" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 39ms/step - loss: 0.2037 - accuracy: 0.9412 - val_loss: 0.3136 - val_accuracy: 0.8975</span></span>
<span id="cb276-42"><a href="#cb276-42" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 10/30</span></span>
<span id="cb276-43"><a href="#cb276-43" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 43ms/step - loss: 0.1732 - accuracy: 0.9480 - val_loss: 0.3036 - val_accuracy: 0.9016</span></span>
<span id="cb276-44"><a href="#cb276-44" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 11/30</span></span>
<span id="cb276-45"><a href="#cb276-45" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 48ms/step - loss: 0.1539 - accuracy: 0.9504 - val_loss: 0.3005 - val_accuracy: 0.9000</span></span>
<span id="cb276-46"><a href="#cb276-46" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 12/30</span></span>
<span id="cb276-47"><a href="#cb276-47" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 38ms/step - loss: 0.1366 - accuracy: 0.9584 - val_loss: 0.2904 - val_accuracy: 0.9025</span></span>
<span id="cb276-48"><a href="#cb276-48" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 13/30</span></span>
<span id="cb276-49"><a href="#cb276-49" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 37ms/step - loss: 0.1221 - accuracy: 0.9613 - val_loss: 0.2998 - val_accuracy: 0.9025</span></span>
<span id="cb276-50"><a href="#cb276-50" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 14/30</span></span>
<span id="cb276-51"><a href="#cb276-51" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 8s 54ms/step - loss: 0.1096 - accuracy: 0.9625 - val_loss: 0.2899 - val_accuracy: 0.9008</span></span>
<span id="cb276-52"><a href="#cb276-52" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 15/30</span></span>
<span id="cb276-53"><a href="#cb276-53" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 45ms/step - loss: 0.0985 - accuracy: 0.9697 - val_loss: 0.2936 - val_accuracy: 0.9025</span></span>
<span id="cb276-54"><a href="#cb276-54" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 16/30</span></span>
<span id="cb276-55"><a href="#cb276-55" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 47ms/step - loss: 0.0949 - accuracy: 0.9670 - val_loss: 0.2931 - val_accuracy: 0.9008</span></span>
<span id="cb276-56"><a href="#cb276-56" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 17/30</span></span>
<span id="cb276-57"><a href="#cb276-57" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 37ms/step - loss: 0.0841 - accuracy: 0.9713 - val_loss: 0.2982 - val_accuracy: 0.9016</span></span>
<span id="cb276-58"><a href="#cb276-58" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 18/30</span></span>
<span id="cb276-59"><a href="#cb276-59" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 43ms/step - loss: 0.0769 - accuracy: 0.9744 - val_loss: 0.2974 - val_accuracy: 0.9049</span></span>
<span id="cb276-60"><a href="#cb276-60" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 19/30</span></span>
<span id="cb276-61"><a href="#cb276-61" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 48ms/step - loss: 0.0714 - accuracy: 0.9773 - val_loss: 0.3041 - val_accuracy: 0.8967</span></span>
<span id="cb276-62"><a href="#cb276-62" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 20/30</span></span>
<span id="cb276-63"><a href="#cb276-63" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 43ms/step - loss: 0.0670 - accuracy: 0.9783 - val_loss: 0.3038 - val_accuracy: 0.9016</span></span>
<span id="cb276-64"><a href="#cb276-64" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 21/30</span></span>
<span id="cb276-65"><a href="#cb276-65" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 37ms/step - loss: 0.0636 - accuracy: 0.9766 - val_loss: 0.3071 - val_accuracy: 0.9049</span></span>
<span id="cb276-66"><a href="#cb276-66" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 22/30</span></span>
<span id="cb276-67"><a href="#cb276-67" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 48ms/step - loss: 0.0563 - accuracy: 0.9818 - val_loss: 0.3019 - val_accuracy: 0.9066</span></span>
<span id="cb276-68"><a href="#cb276-68" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 23/30</span></span>
<span id="cb276-69"><a href="#cb276-69" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 40ms/step - loss: 0.0523 - accuracy: 0.9836 - val_loss: 0.3086 - val_accuracy: 0.9066</span></span>
<span id="cb276-70"><a href="#cb276-70" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 24/30</span></span>
<span id="cb276-71"><a href="#cb276-71" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 47ms/step - loss: 0.0530 - accuracy: 0.9814 - val_loss: 0.3137 - val_accuracy: 0.9057</span></span>
<span id="cb276-72"><a href="#cb276-72" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 25/30</span></span>
<span id="cb276-73"><a href="#cb276-73" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 7s 44ms/step - loss: 0.0463 - accuracy: 0.9834 - val_loss: 0.3090 - val_accuracy: 0.9041</span></span>
<span id="cb276-74"><a href="#cb276-74" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 26/30</span></span>
<span id="cb276-75"><a href="#cb276-75" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 40ms/step - loss: 0.0478 - accuracy: 0.9836 - val_loss: 0.3241 - val_accuracy: 0.9033</span></span>
<span id="cb276-76"><a href="#cb276-76" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 27/30</span></span>
<span id="cb276-77"><a href="#cb276-77" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 39ms/step - loss: 0.0436 - accuracy: 0.9836 - val_loss: 0.3196 - val_accuracy: 0.9041</span></span>
<span id="cb276-78"><a href="#cb276-78" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 28/30</span></span>
<span id="cb276-79"><a href="#cb276-79" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 42ms/step - loss: 0.0439 - accuracy: 0.9853 - val_loss: 0.3269 - val_accuracy: 0.9000</span></span>
<span id="cb276-80"><a href="#cb276-80" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 29/30</span></span>
<span id="cb276-81"><a href="#cb276-81" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 8s 49ms/step - loss: 0.0398 - accuracy: 0.9855 - val_loss: 0.3288 - val_accuracy: 0.9033</span></span>
<span id="cb276-82"><a href="#cb276-82" aria-hidden="true" tabindex="-1"></a><span class="co"># Epoch 30/30</span></span>
<span id="cb276-83"><a href="#cb276-83" aria-hidden="true" tabindex="-1"></a><span class="co"># 153/153 [==============================] - 6s 37ms/step - loss: 0.0373 - accuracy: 0.9861 - val_loss: 0.3301 - val_accuracy: 0.8984</span></span>
<span id="cb276-84"><a href="#cb276-84" aria-hidden="true" tabindex="-1"></a><span class="co"># CPU times: user 4min 28s, sys: 8.96 s, total: 4min 36s</span></span>
<span id="cb276-85"><a href="#cb276-85" aria-hidden="true" tabindex="-1"></a><span class="co"># Wall time: 4min 32s</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="185"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:476}"
id="HknEg-g4VSAM" data-outputId="54e7ca6d-bb20-43fb-ab27-ca21cccb2d5f">
<div class="sourceCode" id="cb277"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb277-1"><a href="#cb277-1" aria-hidden="true" tabindex="-1"></a>loss <span class="op">=</span> plt.plot(bin_history.epoch, bin_history.history[<span class="st">&#39;loss&#39;</span>], label<span class="op">=</span><span class="st">&#39;bin-loss&#39;</span>)</span>
<span id="cb277-2"><a href="#cb277-2" aria-hidden="true" tabindex="-1"></a>plt.plot(bin_history.epoch, bin_history.history[<span class="st">&#39;val_loss&#39;</span>], <span class="st">&#39;--&#39;</span>, color<span class="op">=</span>loss[<span class="dv">0</span>].get_color(), label<span class="op">=</span><span class="st">&#39;bin-val_loss&#39;</span>)</span>
<span id="cb277-3"><a href="#cb277-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb277-4"><a href="#cb277-4" aria-hidden="true" tabindex="-1"></a>loss <span class="op">=</span> plt.plot(int_history.epoch, int_history.history[<span class="st">&#39;loss&#39;</span>], label<span class="op">=</span><span class="st">&#39;int-loss&#39;</span>)</span>
<span id="cb277-5"><a href="#cb277-5" aria-hidden="true" tabindex="-1"></a>plt.plot(int_history.epoch, int_history.history[<span class="st">&#39;val_loss&#39;</span>], <span class="st">&#39;--&#39;</span>, color<span class="op">=</span>loss[<span class="dv">0</span>].get_color(), label<span class="op">=</span><span class="st">&#39;int-val_loss&#39;</span>)</span>
<span id="cb277-6"><a href="#cb277-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb277-7"><a href="#cb277-7" aria-hidden="true" tabindex="-1"></a>plt.legend()</span>
<span id="cb277-8"><a href="#cb277-8" aria-hidden="true" tabindex="-1"></a>plt.xlabel(<span class="st">&#39;Epoch&#39;</span>)</span>
<span id="cb277-9"><a href="#cb277-9" aria-hidden="true" tabindex="-1"></a>plt.ylabel(<span class="st">&#39;CE/token&#39;</span>)</span></code></pre></div>
<div class="output execute_result" data-execution_count="185">
<pre><code>Text(0, 0.5, &#39;CE/token&#39;)</code></pre>
</div>
<div class="output display_data">
<p><img
src="vertopal_3d2211f5135e46f9b8501b72a53acc02/d5bbd7bc2a4c9d05f2426e5ae57d60f1aa17f535.png" /></p>
</div>
</div>
<div class="cell code" data-execution_count="186"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="nmEc17V1VSAN" data-outputId="b604304c-a0cf-4499-b88b-62cf91b0cc22">
<div class="sourceCode" id="cb279"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb279-1"><a href="#cb279-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Evaluate the model on the test set</span></span>
<span id="cb279-2"><a href="#cb279-2" aria-hidden="true" tabindex="-1"></a>int_test_loss, int_test_acc <span class="op">=</span> int_model.evaluate(raw_test_ds)</span>
<span id="cb279-3"><a href="#cb279-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb279-4"><a href="#cb279-4" aria-hidden="true" tabindex="-1"></a><span class="co">## 87/87 [==============================] - 1s 11ms/step - loss: 0.2339 - accuracy: 0.9259</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>87/87 [==============================] - 1s 14ms/step - loss: 0.2118 - accuracy: 0.9259
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="187"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="FWsmYNllVSAN" data-outputId="6aef97e3-c981-4416-bf73-0098048d0b92">
<div class="sourceCode" id="cb281"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb281-1"><a href="#cb281-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Print the test loss and accuracy</span></span>
<span id="cb281-2"><a href="#cb281-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Test loss:&quot;</span>, int_test_loss)</span>
<span id="cb281-3"><a href="#cb281-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Test accuracy:&quot;</span>, int_test_acc)</span>
<span id="cb281-4"><a href="#cb281-4" aria-hidden="true" tabindex="-1"></a><span class="co">## Test loss: 0.23388110101222992</span></span>
<span id="cb281-5"><a href="#cb281-5" aria-hidden="true" tabindex="-1"></a><span class="co">## Test accuracy: 0.9259259104728699</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Test loss: 0.21181465685367584
Test accuracy: 0.9259259104728699
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="188" id="UlwoSUL4VSAO">
<div class="sourceCode" id="cb283"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb283-1"><a href="#cb283-1" aria-hidden="true" tabindex="-1"></a><span class="co"># binary_train_ds = raw_train_ds.map(lambda x,y: (binary_vectorize_layer(x), y))</span></span>
<span id="cb283-2"><a href="#cb283-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb283-3"><a href="#cb283-3" aria-hidden="true" tabindex="-1"></a>clf_train_ds <span class="op">=</span> raw_train_ds.<span class="bu">map</span>(<span class="kw">lambda</span> x,y: (multi_class_vectorize_layer(x), y))</span>
<span id="cb283-4"><a href="#cb283-4" aria-hidden="true" tabindex="-1"></a>clf_val_ds <span class="op">=</span> raw_val_ds.<span class="bu">map</span>(<span class="kw">lambda</span> x,y: (multi_class_vectorize_layer(x), y))</span>
<span id="cb283-5"><a href="#cb283-5" aria-hidden="true" tabindex="-1"></a>clf_test_ds <span class="op">=</span> raw_test_ds.<span class="bu">map</span>(<span class="kw">lambda</span> x,y: (multi_class_vectorize_layer(x), y))</span>
<span id="cb283-6"><a href="#cb283-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb283-7"><a href="#cb283-7" aria-hidden="true" tabindex="-1"></a>int_train_ds <span class="op">=</span> raw_train_ds.<span class="bu">map</span>(<span class="kw">lambda</span> x,y: (int_vectorize_layer(x), y))</span>
<span id="cb283-8"><a href="#cb283-8" aria-hidden="true" tabindex="-1"></a>int_val_ds <span class="op">=</span> raw_val_ds.<span class="bu">map</span>(<span class="kw">lambda</span> x,y: (int_vectorize_layer(x), y))</span>
<span id="cb283-9"><a href="#cb283-9" aria-hidden="true" tabindex="-1"></a>int_test_ds <span class="op">=</span> raw_test_ds.<span class="bu">map</span>(<span class="kw">lambda</span> x,y: (int_vectorize_layer(x), y))</span></code></pre></div>
</div>
<section id="export-the-model" class="cell markdown" id="OP8lxMajVSAP">
<h3>Export the model</h3>
</section>
<div class="cell code" data-execution_count="189"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="yRPVMbQFVSAP" data-outputId="dab95958-99a0-4a8d-ef64-39fc56a2f174">
<div class="sourceCode" id="cb284"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb284-1"><a href="#cb284-1" aria-hidden="true" tabindex="-1"></a>clf_model.export(<span class="st">&#39;bin.tf&#39;</span>)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Saved artifact at &#39;bin.tf&#39;. The following endpoints are available:

* Endpoint &#39;serve&#39;
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None,), dtype=tf.string, name=&#39;text_vectorization_input&#39;)
Output Type:
  TensorSpec(shape=(None, 25), dtype=tf.float32, name=None)
Captures:
  139832539029648: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139832326372544: TensorSpec(shape=(), dtype=tf.int64, name=None)
  139832326361280: TensorSpec(shape=(), dtype=tf.string, name=None)
  139832326361632: TensorSpec(shape=(), dtype=tf.int64, name=None)
  139832461989792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139832329937216: TensorSpec(shape=(), dtype=tf.resource, name=None)
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="190" id="XAcbt_osVSAP">
<div class="sourceCode" id="cb286"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb286-1"><a href="#cb286-1" aria-hidden="true" tabindex="-1"></a>loaded <span class="op">=</span> tf.saved_model.load(<span class="st">&#39;bin.tf&#39;</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="191"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="ZjCGk85XVSAQ" data-outputId="ba61f5fc-f7a7-4f94-c6a8-19366ed5e392">
<div class="sourceCode" id="cb287"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb287-1"><a href="#cb287-1" aria-hidden="true" tabindex="-1"></a>clf_model.predict([<span class="st">&#39;How do you sort a list?&#39;</span>])</span></code></pre></div>
<div class="output stream stdout">
<pre><code>1/1 [==============================] - 0s 145ms/step
</code></pre>
</div>
<div class="output execute_result" data-execution_count="191">
<pre><code>array([[ 0.09657298, -0.49373376,  0.07736832,  0.06607725,  0.05221149,
        -0.46978113, -0.48686236, -0.55163133,  0.15191314,  0.23564513,
        -0.4884494 , -0.6964581 ,  0.24588017, -0.74412024, -0.48752603,
         0.3062069 , -0.8510176 , -1.0887347 , -0.4476996 , -0.05396813,
        -0.66050076, -0.6878822 , -0.61148196, -0.7419138 , -0.70286953]],
      dtype=float32)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="192"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="i5RIzOKwVSAQ" data-outputId="78bc10f0-cefb-457c-9cdf-38f72663fd7f">
<div class="sourceCode" id="cb290"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb290-1"><a href="#cb290-1" aria-hidden="true" tabindex="-1"></a>loaded.serve(tf.constant([<span class="st">&#39;How do you sort a list?&#39;</span>])).numpy()</span></code></pre></div>
<div class="output execute_result" data-execution_count="192">
<pre><code>array([[ 0.09657298, -0.49373376,  0.07736832,  0.06607725,  0.05221149,
        -0.46978113, -0.48686236, -0.55163133,  0.15191314,  0.23564513,
        -0.4884494 , -0.6964581 ,  0.24588017, -0.74412024, -0.48752603,
         0.3062069 , -0.8510176 , -1.0887347 , -0.4476996 , -0.05396813,
        -0.66050076, -0.6878822 , -0.61148196, -0.7419138 , -0.70286953]],
      dtype=float32)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="193"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="mUtUmeM1VSAR" data-outputId="014a6a9c-5af0-4c94-92e6-08ceef2d9bd1">
<div class="sourceCode" id="cb292"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb292-1"><a href="#cb292-1" aria-hidden="true" tabindex="-1"></a>clf_model.export(<span class="st">&#39;cpp_top_25_cwe_cnn_model.tf&#39;</span>)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Saved artifact at &#39;cpp_top_25_cwe_cnn_model.tf&#39;. The following endpoints are available:

* Endpoint &#39;serve&#39;
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None,), dtype=tf.string, name=&#39;text_vectorization_input&#39;)
Output Type:
  TensorSpec(shape=(None, 25), dtype=tf.float32, name=None)
Captures:
  139832539029648: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139832326372544: TensorSpec(shape=(), dtype=tf.int64, name=None)
  139832326361280: TensorSpec(shape=(), dtype=tf.string, name=None)
  139832326361632: TensorSpec(shape=(), dtype=tf.int64, name=None)
  139832461989792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  139832329937216: TensorSpec(shape=(), dtype=tf.resource, name=None)
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="194" id="hDCXsItAVSAR">
<div class="sourceCode" id="cb294"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb294-1"><a href="#cb294-1" aria-hidden="true" tabindex="-1"></a><span class="co">## cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz</span></span>
<span id="cb294-2"><a href="#cb294-2" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>czf cpp_top_25_cwe_cnn_model.tf.tar.gz cpp_top_25_cwe_cnn_model.tf</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="195"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="jBxLMpHRVSAR" data-outputId="c965875f-16ca-4941-f8e6-d253bf8b2183">
<div class="sourceCode" id="cb295"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb295-1"><a href="#cb295-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh</span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 21M
drwxr-xr-x  4 root root 4.0K Jan  7 10:38 bin.tf
-rw-r--r--  1 root root 339K Jan  7 10:33 bow_transformer.pk
-rw-r--r--  1 root root 756K Jan  7 10:35 cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz
drwxr-xr-x 27 root root 4.0K Jan  2 04:10 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted
-rw-r--r--  1 root root 875K Jan  7 10:35 _cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
drwxr-xr-x 12 root root 4.0K Jan  3 17:15 cpp_clean_files_top_10_cwe_omitted
-rw-r--r--  1 root root  15K Jan  7 10:35 cpp_clean_files_top_10_cwe_omitted.tar.gz
drwxr-xr-x  4 root root 4.0K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf
-rw-r--r--  1 root root 878K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf.tar.gz
-rw-r--r--  1 root root  11M Jan  7 10:32 data_drop_na.csv
-rw-r--r--  1 root root  52K Jan  7 10:36 model.png
-rw-r--r--  1 root root 4.9M Jan  7 10:35 nb_model.pk
-rw-r--r--  1 root root 2.9M Jan  7 10:35 sard.zip
drwxr-xr-x 27 root root 4.0K Jan  7 10:35 test
-rw-r--r--  1 root root 201K Jan  7 10:33 tfidf_transformer.pk
drwxr-xr-x 27 root root 4.0K Jan  7 10:35 train
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="196"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:17}"
id="4dd_1o8YVSAS" data-outputId="f8d50ec2-7c8f-4f65-ed13-9d938f4a44b6">
<div class="sourceCode" id="cb297"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb297-1"><a href="#cb297-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !wget https://raw.githubusercontent.com/c6ai/temp/main/cpp_top_25_cwe_cnn_model.tf.tar.gz</span></span>
<span id="cb297-2"><a href="#cb297-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !gdown 1j1nY2qlLnA_Iap0_ug8ZAQuDgKX1QIJB</span></span>
<span id="cb297-3"><a href="#cb297-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb297-4"><a href="#cb297-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> google.colab <span class="im">import</span> files</span>
<span id="cb297-5"><a href="#cb297-5" aria-hidden="true" tabindex="-1"></a><span class="co"># files.download(&quot;cpp_ready_8750_files_each_350_top_25_cwe_omitted.tar.gz&quot;)</span></span>
<span id="cb297-6"><a href="#cb297-6" aria-hidden="true" tabindex="-1"></a><span class="co"># files.download(&quot;cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz&quot;)</span></span>
<span id="cb297-7"><a href="#cb297-7" aria-hidden="true" tabindex="-1"></a>files.download(<span class="st">&quot;cpp_top_25_cwe_cnn_model.tf.tar.gz&quot;</span>)</span></code></pre></div>
<div class="output display_data">
<pre><code>&lt;IPython.core.display.Javascript object&gt;</code></pre>
</div>
<div class="output display_data">
<pre><code>&lt;IPython.core.display.Javascript object&gt;</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="197" id="PooQpG5YVSAS">
<div class="sourceCode" id="cb300"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb300-1"><a href="#cb300-1" aria-hidden="true" tabindex="-1"></a><span class="co"># ## extract or un-tar (unzip):</span></span>
<span id="cb300-2"><a href="#cb300-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -xzf cpp_top_25_cwe_cnn_model.tf.tar.gz</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="198" id="aclU3wQJVSAT">
<div class="sourceCode" id="cb301"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb301-1"><a href="#cb301-1" aria-hidden="true" tabindex="-1"></a><span class="co"># ## 1st ...</span></span>
<span id="cb301-2"><a href="#cb301-2" aria-hidden="true" tabindex="-1"></a><span class="co"># from google.colab import drive</span></span>
<span id="cb301-3"><a href="#cb301-3" aria-hidden="true" tabindex="-1"></a><span class="co"># drive.mount(&#39;/content/drive&#39;)</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="199" id="7u272xBEVSAT">
<div class="sourceCode" id="cb302"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb302-1"><a href="#cb302-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !mkdir tar_gz_files</span></span>
<span id="cb302-2"><a href="#cb302-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !cp cpp_top_25_cwe_cnn_model.tf.tar.gz tar_gz_files</span></span>
<span id="cb302-3"><a href="#cb302-3" aria-hidden="true" tabindex="-1"></a><span class="co"># !cp tar_gz_files/* /content/drive/MyDrive/1st-SHARED-Data</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="200"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="YoyfwKFJVSAV" data-outputId="3cb14ac8-9fdc-41db-aa1b-e72d5429a2b9">
<div class="sourceCode" id="cb303"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb303-1"><a href="#cb303-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Tensorlfow version: &quot;</span>, tf.__version__)</span>
<span id="cb303-2"><a href="#cb303-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Eager mode: &quot;</span>, tf.executing_eagerly())</span>
<span id="cb303-3"><a href="#cb303-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;GPU is&quot;</span>, <span class="st">&quot;available&quot;</span> <span class="cf">if</span> tf.test.is_gpu_available() <span class="cf">else</span> <span class="st">&quot;NOT AVAILABLE&quot;</span>)</span>
<span id="cb303-4"><a href="#cb303-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb303-5"><a href="#cb303-5" aria-hidden="true" tabindex="-1"></a><span class="co">## Tensorlfow version:  2.13.1</span></span>
<span id="cb303-6"><a href="#cb303-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Eager mode:  True</span></span>
<span id="cb303-7"><a href="#cb303-7" aria-hidden="true" tabindex="-1"></a><span class="co">## GPU is NOT AVAILABLE</span></span>
<span id="cb303-8"><a href="#cb303-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb303-9"><a href="#cb303-9" aria-hidden="true" tabindex="-1"></a><span class="co">## Tensorlfow version:  2.15.0</span></span>
<span id="cb303-10"><a href="#cb303-10" aria-hidden="true" tabindex="-1"></a><span class="co">## Eager mode:  True</span></span>
<span id="cb303-11"><a href="#cb303-11" aria-hidden="true" tabindex="-1"></a><span class="co">## T4 GPU is available</span></span>
<span id="cb303-12"><a href="#cb303-12" aria-hidden="true" tabindex="-1"></a><span class="co">## High RAM</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Tensorlfow version:  2.15.0
Eager mode:  True
GPU is NOT AVAILABLE
</code></pre>
</div>
</div>
<section id="this-may-take-few-minutes" class="cell markdown"
id="unoGYhXdVSAW">
<h3>This may take few minutes</h3>
</section>
<div class="cell code" data-execution_count="201"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="sx0mwqTrVSAW" data-outputId="0620afb1-4f98-4a3b-bdd4-0c80b5afd179">
<div class="sourceCode" id="cb305"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb305-1"><a href="#cb305-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb305-2"><a href="#cb305-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb305-3"><a href="#cb305-3" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>czf <span class="dv">1</span><span class="er">st_nlp_text_clf_cpp_top_cwe_v240104a</span>.tar.gz <span class="op">/</span>content<span class="op">/*</span></span>
<span id="cb305-4"><a href="#cb305-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb305-5"><a href="#cb305-5" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 611 ms, sys: 77.2 ms, total: 688 ms</span></span>
<span id="cb305-6"><a href="#cb305-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 1min 25s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>tar: Removing leading `/&#39; from member names
tar: Removing leading `/&#39; from hard link targets
CPU times: user 20.4 ms, sys: 3.04 ms, total: 23.4 ms
Wall time: 1.51 s
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="202"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="sEMx0tziVSAW" data-outputId="8c42a1e4-5031-4bca-88ac-621252937878">
<div class="sourceCode" id="cb307"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb307-1"><a href="#cb307-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh</span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 31M
-rw-r--r--  1 root root 9.9M Jan  7 10:38 1st_nlp_text_clf_cpp_top_cwe_v240104a.tar.gz
drwxr-xr-x  4 root root 4.0K Jan  7 10:38 bin.tf
-rw-r--r--  1 root root 339K Jan  7 10:33 bow_transformer.pk
-rw-r--r--  1 root root 756K Jan  7 10:35 cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz
drwxr-xr-x 27 root root 4.0K Jan  2 04:10 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted
-rw-r--r--  1 root root 875K Jan  7 10:35 _cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
drwxr-xr-x 12 root root 4.0K Jan  3 17:15 cpp_clean_files_top_10_cwe_omitted
-rw-r--r--  1 root root  15K Jan  7 10:35 cpp_clean_files_top_10_cwe_omitted.tar.gz
drwxr-xr-x  4 root root 4.0K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf
-rw-r--r--  1 root root 878K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf.tar.gz
-rw-r--r--  1 root root  11M Jan  7 10:32 data_drop_na.csv
-rw-r--r--  1 root root  52K Jan  7 10:36 model.png
-rw-r--r--  1 root root 4.9M Jan  7 10:35 nb_model.pk
-rw-r--r--  1 root root 2.9M Jan  7 10:35 sard.zip
drwxr-xr-x 27 root root 4.0K Jan  7 10:35 test
-rw-r--r--  1 root root 201K Jan  7 10:33 tfidf_transformer.pk
drwxr-xr-x 27 root root 4.0K Jan  7 10:35 train
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="203"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="0AU4q7QGVSAX" data-outputId="6f926215-2655-4816-8c34-1b52a225b220">
<div class="sourceCode" id="cb309"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb309-1"><a href="#cb309-1" aria-hidden="true" tabindex="-1"></a><span class="co">## import time</span></span>
<span id="cb309-2"><a href="#cb309-2" aria-hidden="true" tabindex="-1"></a><span class="co">## global_start = time.time()</span></span>
<span id="cb309-3"><a href="#cb309-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb309-4"><a href="#cb309-4" aria-hidden="true" tabindex="-1"></a>global_end <span class="op">=</span> time.time()</span>
<span id="cb309-5"><a href="#cb309-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb309-6"><a href="#cb309-6" aria-hidden="true" tabindex="-1"></a><span class="co"># print(&quot;[T4 GPU &amp; High RAM?] Global Time Duration: &quot; + str(global_end - global_start))</span></span>
<span id="cb309-7"><a href="#cb309-7" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Global Time Duration: &quot;</span> <span class="op">+</span> <span class="bu">str</span>(global_end <span class="op">-</span> global_start))</span>
<span id="cb309-8"><a href="#cb309-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb309-9"><a href="#cb309-9" aria-hidden="true" tabindex="-1"></a><span class="co">## [T4 GPU &amp; High RAM] Global Time Duration: 643</span></span>
<span id="cb309-10"><a href="#cb309-10" aria-hidden="true" tabindex="-1"></a><span class="co">## 643s / 60 ​is approximately 11 minutes</span></span>
<span id="cb309-11"><a href="#cb309-11" aria-hidden="true" tabindex="-1"></a><span class="co">## 8 mins on CPU &amp; High RAM</span></span>
<span id="cb309-12"><a href="#cb309-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb309-13"><a href="#cb309-13" aria-hidden="true" tabindex="-1"></a><span class="co">## Global Time Duration: 401.64417576789856</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Global Time Duration: 401.64417576789856
</code></pre>
</div>
</div>
<section id="this-may-take-few-minutes" class="cell markdown"
id="XVc-3hwRVSAY">
<h3>This may take few minutes</h3>
</section>
<div class="cell code" data-execution_count="204"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="oPeWt2k9VSAY" data-outputId="2ba2b134-504f-4926-d59f-0e4d8647b1fe">
<div class="sourceCode" id="cb311"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb311-1"><a href="#cb311-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb311-2"><a href="#cb311-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb311-3"><a href="#cb311-3" aria-hidden="true" tabindex="-1"></a><span class="co">## &gt; 1GB !</span></span>
<span id="cb311-4"><a href="#cb311-4" aria-hidden="true" tabindex="-1"></a><span class="co"># from google.colab import files</span></span>
<span id="cb311-5"><a href="#cb311-5" aria-hidden="true" tabindex="-1"></a><span class="co"># files.download(&quot;1st_nlp_text_clf_cpp_top_cwe_v240104a.tar.gz&quot;)</span></span>
<span id="cb311-6"><a href="#cb311-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb311-7"><a href="#cb311-7" aria-hidden="true" tabindex="-1"></a><span class="co">##</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.58 µs
</code></pre>
</div>
</div>
<div class="cell markdown" id="4XdxfPhUsopI">
<blockquote>
<p>As inspiered by our own Prof. Sadawi's Lecture Notes, Course-Work
Template ** &amp; related Notebooks; while further extending official <a
href="https://www.tensorflow.org/tutorials/load_data/text#example_1_predict_the_tag_for_a_stack_overflow_question">TF2</a>
documentation.</p>
</blockquote>
<blockquote>
<p>** Sadawi, N. (2021). Nsadawi/Advanced-ML-Projects <a
href="https://github.com/nsadawi/Advanced-ML-Projects/blob/4e112da6c42670052eca1152bd0a786afc30c1c5/Text%20Classification/NLP-Text-Classification-Sentiment-Analysis.ipynb">Jupyter
Notebook</a> (Original work published 2020)
[dc88adb9c256ae34c381a0d3533a586c5906aac8].</p>
</blockquote>
</div>
<div class="cell markdown" id="mXKJ2W4zaygN">
<hr />
</div>
<section id="appendix--a-dataset-sourcing-from-scratch"
class="cell markdown" id="Oh9B2197jR4V">
<h1>APPENDIX- A: Dataset Sourcing from Scratch</h1>
</section>
<section id="in-sample-test-cases-for-each-cwe" class="cell markdown"
id="L5arrAn10SUp">
<h3>In-Sample Test-Cases for each CWE</h3>
</section>
<section id="api-warning-huge-size-download" class="cell markdown"
id="jtHg7RTZaR6r">
<h4>API WARNING: Huge Size Download</h4>
</section>
<div class="cell code" data-execution_count="205"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="2s9LAi5W7Wmj" data-outputId="4acc345c-fe7b-4d72-a85c-26ba4af1afd5">
<div class="sourceCode" id="cb313"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb313-1"><a href="#cb313-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>wget https:<span class="op">//</span>samate.nist.gov<span class="op">/</span>SARD<span class="op">/</span>downloads<span class="op">/</span>test<span class="op">-</span>suites<span class="op">/</span><span class="dv">2022</span><span class="op">-</span><span class="dv">0</span><span class="er">8</span><span class="op">-</span><span class="dv">11</span><span class="op">-</span>juliet<span class="op">-</span>c<span class="op">-</span>cplusplus<span class="op">-</span>v1<span class="op">-</span><span class="dv">3</span><span class="op">-</span><span class="dv">1</span><span class="op">-</span><span class="cf">with</span><span class="op">-</span>extra<span class="op">-</span>support.<span class="bu">zip</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>--2024-01-07 10:38:18--  https://samate.nist.gov/SARD/downloads/test-suites/2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip
Resolving samate.nist.gov (samate.nist.gov)... 129.6.13.19, 2610:20:6005:13::19
Connecting to samate.nist.gov (samate.nist.gov)|129.6.13.19|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 703487355 (671M) [application/zip]
Saving to: ‘2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip’

2022-08-11-juliet-c 100%[===================&gt;] 670.90M  97.6MB/s    in 5.5s    

2024-01-07 10:38:24 (123 MB/s) - ‘2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip’ saved [703487355/703487355]

</code></pre>
</div>
</div>
<section id="this-may-take-few-minutes" class="cell markdown"
id="PRH5uLM1FL4w">
<h3>This may take few minutes</h3>
</section>
<div class="cell code" id="uUqkmfVB7WDX">
<div class="sourceCode" id="cb315"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb315-1"><a href="#cb315-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb315-2"><a href="#cb315-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb315-3"><a href="#cb315-3" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>unzip <span class="dv">2022</span><span class="op">-</span><span class="dv">0</span><span class="er">8</span><span class="op">-</span><span class="dv">11</span><span class="op">-</span>juliet<span class="op">-</span>c<span class="op">-</span>cplusplus<span class="op">-</span>v1<span class="op">-</span><span class="dv">3</span><span class="op">-</span><span class="dv">1</span><span class="op">-</span><span class="cf">with</span><span class="op">-</span>extra<span class="op">-</span>support.<span class="bu">zip</span> <span class="op">-</span>d data <span class="co"># /content/data</span></span>
<span id="cb315-4"><a href="#cb315-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb315-5"><a href="#cb315-5" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 26.5 s, sys: 5.38 s, total: 31.8 s</span></span>
<span id="cb315-6"><a href="#cb315-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 4min 3s</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="275" id="GR5pC_Gt00vj">
<div class="sourceCode" id="cb316"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb316-1"><a href="#cb316-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Streaming output truncated to the last 5000 lines.</span></span>
<span id="cb316-2"><a href="#cb316-2" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/97419-v1.0.0/src/testcases/CWE36_Absolute_Path_Traversal/s04/CWE36_Absolute_Path_Traversal__wchar_t_environment_w32CreateFile_10.cpp</span></span>
<span id="cb316-3"><a href="#cb316-3" aria-hidden="true" tabindex="-1"></a><span class="co">#    creating: data/247234-v2.0.0/</span></span>
<span id="cb316-4"><a href="#cb316-4" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/247234-v2.0.0/manifest.sarif</span></span>
<span id="cb316-5"><a href="#cb316-5" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/247234-v2.0.0/Dockerfile</span></span>
<span id="cb316-6"><a href="#cb316-6" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/247234-v2.0.0/Makefile</span></span>
<span id="cb316-7"><a href="#cb316-7" aria-hidden="true" tabindex="-1"></a><span class="co">#  extracting: data/247234-v2.0.0/.dockerignore</span></span>
<span id="cb316-8"><a href="#cb316-8" aria-hidden="true" tabindex="-1"></a><span class="co">#  extracting: data/247234-v2.0.0/.gitignore</span></span>
<span id="cb316-9"><a href="#cb316-9" aria-hidden="true" tabindex="-1"></a><span class="co">#    creating: data/247234-v2.0.0/src/</span></span>
<span id="cb316-10"><a href="#cb316-10" aria-hidden="true" tabindex="-1"></a><span class="co">#    creating: data/247234-v2.0.0/src/testcasesupport/</span></span>
<span id="cb316-11"><a href="#cb316-11" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/247234-v2.0.0/src/testcasesupport/std_testcase_io.h</span></span>
<span id="cb316-12"><a href="#cb316-12" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/247234-v2.0.0/src/testcasesupport/io.c</span></span>
<span id="cb316-13"><a href="#cb316-13" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/247234-v2.0.0/src/testcasesupport/std_testcase.h</span></span>
<span id="cb316-14"><a href="#cb316-14" aria-hidden="true" tabindex="-1"></a><span class="co">#    creating: data/247234-v2.0.0/src/testcases/</span></span>
<span id="cb316-15"><a href="#cb316-15" aria-hidden="true" tabindex="-1"></a><span class="co">#    creating: data/247234-v2.0.0/src/testcases/CWE78_OS_Command_Injection/</span></span>
<span id="cb316-16"><a href="#cb316-16" aria-hidden="true" tabindex="-1"></a><span class="co">#    creating: data/247234-v2.0.0/src/testcases/CWE78_OS_Command_Injection/s06/</span></span>
<span id="cb316-17"><a href="#cb316-17" aria-hidden="true" tabindex="-1"></a><span class="co">#   inflating: data/247234-v2.0.0/src/testcases/CWE78_OS_Command_Injection/s06/CWE78_OS_Command_Injection__wchar_t_console_w32_spawnvp_34.c</span></span>
<span id="cb316-18"><a href="#cb316-18" aria-hidden="true" tabindex="-1"></a><span class="co">#    creating: data/111814-v1.0.0/</span></span>
<span id="cb316-19"><a href="#cb316-19" aria-hidden="true" tabindex="-1"></a><span class="co"># ...</span></span>
<span id="cb316-20"><a href="#cb316-20" aria-hidden="true" tabindex="-1"></a><span class="co"># ...</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="207"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="QeN_L8ES8fuN" data-outputId="55a8c570-93e9-4388-c641-11bdcc15a15c">
<div class="sourceCode" id="cb317"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb317-1"><a href="#cb317-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Open the zip file</span></span>
<span id="cb317-2"><a href="#cb317-2" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> zipfile.ZipFile(<span class="st">&#39;2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip&#39;</span>, <span class="st">&#39;r&#39;</span>) <span class="im">as</span> zip_ref:</span>
<span id="cb317-3"><a href="#cb317-3" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Get a list of all file names in the zip file</span></span>
<span id="cb317-4"><a href="#cb317-4" aria-hidden="true" tabindex="-1"></a>    all_files <span class="op">=</span> zip_ref.namelist()</span>
<span id="cb317-5"><a href="#cb317-5" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Calculate 1% of the total number of files</span></span>
<span id="cb317-6"><a href="#cb317-6" aria-hidden="true" tabindex="-1"></a>    one_percent_files <span class="op">=</span> <span class="bu">int</span>(<span class="bu">len</span>(all_files) <span class="op">*</span> <span class="fl">0.01</span>)</span>
<span id="cb317-7"><a href="#cb317-7" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Get the last 1% of the files</span></span>
<span id="cb317-8"><a href="#cb317-8" aria-hidden="true" tabindex="-1"></a>    last_one_percent_files <span class="op">=</span> all_files[<span class="op">-</span>one_percent_files:]</span>
<span id="cb317-9"><a href="#cb317-9" aria-hidden="true" tabindex="-1"></a>    <span class="co"># Extract only the last 1% of the files</span></span>
<span id="cb317-10"><a href="#cb317-10" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> <span class="bu">file</span> <span class="kw">in</span> last_one_percent_files:</span>
<span id="cb317-11"><a href="#cb317-11" aria-hidden="true" tabindex="-1"></a>        zip_ref.extract(<span class="bu">file</span>, <span class="st">&#39;/content/data&#39;</span>)</span>
<span id="cb317-12"><a href="#cb317-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb317-13"><a href="#cb317-13" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Extracted the last 1</span><span class="sc">% o</span><span class="st">f files to /content/data&quot;</span>)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Extracted the last 1% of files to /content/data
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="208" id="ecKvrwgz9KWe">
<div class="sourceCode" id="cb319"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb319-1"><a href="#cb319-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize an empty list to store the file details</span></span>
<span id="cb319-2"><a href="#cb319-2" aria-hidden="true" tabindex="-1"></a>df_files <span class="op">=</span> []</span>
<span id="cb319-3"><a href="#cb319-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb319-4"><a href="#cb319-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb319-5"><a href="#cb319-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;/content/data&#39;</span>):</span>
<span id="cb319-6"><a href="#cb319-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb319-7"><a href="#cb319-7" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Get the file address</span></span>
<span id="cb319-8"><a href="#cb319-8" aria-hidden="true" tabindex="-1"></a>        file_address <span class="op">=</span> os.path.join(dirpath, filename)</span>
<span id="cb319-9"><a href="#cb319-9" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Append the file name and address to the list</span></span>
<span id="cb319-10"><a href="#cb319-10" aria-hidden="true" tabindex="-1"></a>        df_files.append([filename, file_address])</span>
<span id="cb319-11"><a href="#cb319-11" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb319-12"><a href="#cb319-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Convert the list to a dataframe</span></span>
<span id="cb319-13"><a href="#cb319-13" aria-hidden="true" tabindex="-1"></a>df_f <span class="op">=</span> pd.DataFrame(df_files, columns<span class="op">=</span>[<span class="st">&#39;File_Name&#39;</span>, <span class="st">&#39;File_Address&#39;</span>])</span>
<span id="cb319-14"><a href="#cb319-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb319-15"><a href="#cb319-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Save the dataframe to a csv file</span></span>
<span id="cb319-16"><a href="#cb319-16" aria-hidden="true" tabindex="-1"></a>df_f.to_csv(<span class="st">&#39;files_tree.csv&#39;</span>, index<span class="op">=</span><span class="va">False</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="209" id="1bRIrkT-9KSK">
<div class="sourceCode" id="cb320"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb320-1"><a href="#cb320-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize an empty list to store the file details</span></span>
<span id="cb320-2"><a href="#cb320-2" aria-hidden="true" tabindex="-1"></a>cpp_list <span class="op">=</span> []</span>
<span id="cb320-3"><a href="#cb320-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb320-4"><a href="#cb320-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb320-5"><a href="#cb320-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;/content/data&#39;</span>):</span>
<span id="cb320-6"><a href="#cb320-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb320-7"><a href="#cb320-7" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Only process files with the &#39;.cpp&#39; extension</span></span>
<span id="cb320-8"><a href="#cb320-8" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> filename.endswith(<span class="st">&#39;.cpp&#39;</span>):</span>
<span id="cb320-9"><a href="#cb320-9" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Get the file address</span></span>
<span id="cb320-10"><a href="#cb320-10" aria-hidden="true" tabindex="-1"></a>            file_address <span class="op">=</span> os.path.join(dirpath, filename)</span>
<span id="cb320-11"><a href="#cb320-11" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Append the file name and address to the list</span></span>
<span id="cb320-12"><a href="#cb320-12" aria-hidden="true" tabindex="-1"></a>            cpp_list.append([filename, file_address])</span>
<span id="cb320-13"><a href="#cb320-13" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb320-14"><a href="#cb320-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Convert the list to a dataframe</span></span>
<span id="cb320-15"><a href="#cb320-15" aria-hidden="true" tabindex="-1"></a>cpp_df <span class="op">=</span> pd.DataFrame(cpp_list, columns<span class="op">=</span>[<span class="st">&#39;File_Name&#39;</span>, <span class="st">&#39;File_Address&#39;</span>])</span>
<span id="cb320-16"><a href="#cb320-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb320-17"><a href="#cb320-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Save the dataframe to a csv file</span></span>
<span id="cb320-18"><a href="#cb320-18" aria-hidden="true" tabindex="-1"></a>cpp_df.to_csv(<span class="st">&#39;cpp_files_tree.csv&#39;</span>, index<span class="op">=</span><span class="va">False</span>)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="210"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="3ufP_Vhm9KOT" data-outputId="81639483-049c-4cd6-fbdd-55f72b3521cd">
<div class="sourceCode" id="cb321"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb321-1"><a href="#cb321-1" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_list[0]</span></span>
<span id="cb321-2"><a href="#cb321-2" aria-hidden="true" tabindex="-1"></a>cpp_df.shape <span class="co">## (411, 2) ## (46401, 2) ## (618185, 2)</span></span>
<span id="cb321-3"><a href="#cb321-3" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_df.head()</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="210">
<pre><code>(46401, 2)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="211"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:206}"
id="OVKiFhn59VP9" data-outputId="e0ad1d01-cc4d-402c-b730-c637cca6406b">
<div class="sourceCode" id="cb323"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb323-1"><a href="#cb323-1" aria-hidden="true" tabindex="-1"></a><span class="co"># df_files.shape</span></span>
<span id="cb323-2"><a href="#cb323-2" aria-hidden="true" tabindex="-1"></a>cpp_df.head()</span></code></pre></div>
<div class="output execute_result" data-execution_count="211">

  <div id="df-6f6cce6e-c01c-4468-b2a6-5f212aad5846" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>File_Name</th>
      <th>File_Address</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>CWE36_Absolute_Path_Traversal__wchar_t_connect...</td>
      <td>/content/data/96809-v1.0.0/src/testcases/CWE36...</td>
    </tr>
    <tr>
      <th>1</th>
      <td>CWE590_Free_Memory_Not_on_Heap__delete_struct_...</td>
      <td>/content/data/107819-v1.0.0/src/testcases/CWE5...</td>
    </tr>
    <tr>
      <th>2</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>3</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>4</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
  </tbody>
</table>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-6f6cce6e-c01c-4468-b2a6-5f212aad5846')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-6f6cce6e-c01c-4468-b2a6-5f212aad5846 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-6f6cce6e-c01c-4468-b2a6-5f212aad5846');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-5c71e297-46b3-4ce4-95c6-7e5cb722b2a3">
  <button class="colab-df-quickchart" onclick="quickchart('df-5c71e297-46b3-4ce4-95c6-7e5cb722b2a3')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-5c71e297-46b3-4ce4-95c6-7e5cb722b2a3 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="212" id="vHPoOZ-e9eLr">
<div class="sourceCode" id="cb324"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb324-1"><a href="#cb324-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !mkdir /content/cpp_data</span></span>
<span id="cb324-2"><a href="#cb324-2" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>mkdir <span class="op">-</span>p <span class="op">/</span>content<span class="op">/</span>cpp_data</span></code></pre></div>
</div>
<section id="this-may-take-few-minutes" class="cell markdown"
id="esgnTuNqFBYX">
<h3>This may take few minutes</h3>
</section>
<div class="cell code" data-execution_count="213"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="ahjhJqr99eG7" data-outputId="2e51204d-8218-4ff3-ea2f-81cbbe90a960">
<div class="sourceCode" id="cb325"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb325-1"><a href="#cb325-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb325-2"><a href="#cb325-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb325-3"><a href="#cb325-3" aria-hidden="true" tabindex="-1"></a><span class="co"># !mkdir -p /content/cpp_data</span></span>
<span id="cb325-4"><a href="#cb325-4" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>awk <span class="op">-</span>F <span class="st">&#39;,&#39;</span> <span class="st">&#39;{if (NR!=1) print $2}&#39;</span> cpp_files_tree.csv <span class="op">|</span> xargs <span class="op">-</span>I {} cp {} <span class="op">/</span>content<span class="op">/</span>cpp_data<span class="op">/</span></span>
<span id="cb325-5"><a href="#cb325-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb325-6"><a href="#cb325-6" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 417 ms, sys: 54.4 ms, total: 472 ms</span></span>
<span id="cb325-7"><a href="#cb325-7" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 1min 10s</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CPU times: user 929 ms, sys: 94 ms, total: 1.02 s
Wall time: 2min
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="214" id="nzhgQkxi9eA8">
<div class="sourceCode" id="cb327"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb327-1"><a href="#cb327-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !mv /content/cpp_files_all_raw.zip /content/cpp_files_all_raw_partial.zip</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="215"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="IJ9xDe6k9d6R" data-outputId="24ea0c6e-2845-4b84-ae8f-7093e7026e24">
<div class="sourceCode" id="cb328"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb328-1"><a href="#cb328-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize an empty set to store the unique prefixes</span></span>
<span id="cb328-2"><a href="#cb328-2" aria-hidden="true" tabindex="-1"></a>subfolders_set <span class="op">=</span> <span class="bu">set</span>()</span>
<span id="cb328-3"><a href="#cb328-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb328-4"><a href="#cb328-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb328-5"><a href="#cb328-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;/content/cpp_data&#39;</span>):</span>
<span id="cb328-6"><a href="#cb328-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb328-7"><a href="#cb328-7" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Extract the prefix from the file name</span></span>
<span id="cb328-8"><a href="#cb328-8" aria-hidden="true" tabindex="-1"></a>        prefix <span class="op">=</span> re.match(<span class="vs">r&#39;(.*)__&#39;</span>, filename)</span>
<span id="cb328-9"><a href="#cb328-9" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> prefix:</span>
<span id="cb328-10"><a href="#cb328-10" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Add the prefix to the set</span></span>
<span id="cb328-11"><a href="#cb328-11" aria-hidden="true" tabindex="-1"></a>            subfolders_set.add(prefix.group(<span class="dv">1</span>))</span>
<span id="cb328-12"><a href="#cb328-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb328-13"><a href="#cb328-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Convert the set to a list</span></span>
<span id="cb328-14"><a href="#cb328-14" aria-hidden="true" tabindex="-1"></a>subfolders_list <span class="op">=</span> <span class="bu">list</span>(subfolders_set)</span>
<span id="cb328-15"><a href="#cb328-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb328-16"><a href="#cb328-16" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(subfolders_list)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>[&#39;CWE675_Duplicate_Operations_on_Resource&#39;, &#39;CWE758_Undefined_Behavior&#39;, &#39;CWE123_Write_What_Where_Condition&#39;, &#39;CWE590_Free_Memory_Not_on_Heap&#39;, &#39;CWE690_NULL_Deref_From_Return&#39;, &#39;CWE126_Buffer_Overread&#39;, &#39;CWE195_Signed_to_Unsigned_Conversion_Error&#39;, &#39;CWE773_Missing_Reference_to_Active_File_Descriptor_or_Handle&#39;, &#39;CWE563_Unused_Variable&#39;, &#39;CWE606_Unchecked_Loop_Condition&#39;, &#39;CWE127_Buffer_Underread&#39;, &#39;CWE400_Resource_Exhaustion&#39;, &#39;CWE114_Process_Control&#39;, &#39;CWE762_Mismatched_Memory_Management_Routines&#39;, &#39;CWE416_Use_After_Free&#39;, &#39;CWE121_Stack_Based_Buffer_Overflow&#39;, &#39;CWE390_Error_Without_Action&#39;, &#39;CWE591_Sensitive_Data_Storage_in_Improperly_Locked_Memory&#39;, &#39;CWE396_Catch_Generic_Exception&#39;, &#39;CWE789_Uncontrolled_Mem_Alloc&#39;, &#39;CWE500_Public_Static_Field_Not_Final&#39;, &#39;CWE23_Relative_Path_Traversal&#39;, &#39;CWE476_NULL_Pointer_Dereference&#39;, &#39;CWE134_Uncontrolled_Format_String&#39;, &#39;CWE672_Operation_on_Resource_After_Expiration_or_Release&#39;, &#39;CWE562_Return_of_Stack_Variable_Address&#39;, &#39;CWE256_Plaintext_Storage_of_Password&#39;, &#39;CWE319_Cleartext_Tx_Sensitive_Info&#39;, &#39;CWE197_Numeric_Truncation_Error&#39;, &#39;CWE194_Unexpected_Sign_Extension&#39;, &#39;CWE397_Throw_Generic_Exception&#39;, &#39;CWE415_Double_Free&#39;, &#39;CWE90_LDAP_Injection&#39;, &#39;CWE457_Use_of_Uninitialized_Variable&#39;, &#39;CWE843_Type_Confusion&#39;, &#39;CWE369_Divide_by_Zero&#39;, &#39;CWE680_Integer_Overflow_to_Buffer_Overflow&#39;, &#39;CWE176_Improper_Handling_of_Unicode_Encoding&#39;, &#39;CWE404_Improper_Resource_Shutdown&#39;, &#39;CWE427_Uncontrolled_Search_Path_Element&#39;, &#39;CWE190_Integer_Overflow&#39;, &#39;CWE321_Hard_Coded_Cryptographic_Key&#39;, &#39;CWE122_Heap_Based_Buffer_Overflow&#39;, &#39;CWE468_Incorrect_Pointer_Scaling&#39;, &#39;CWE588_Attempt_to_Access_Child_of_Non_Structure_Pointer&#39;, &#39;CWE426_Untrusted_Search_Path&#39;, &#39;CWE401_Memory_Leak&#39;, &#39;CWE78_OS_Command_Injection&#39;, &#39;CWE124_Buffer_Underwrite&#39;, &#39;CWE191_Integer_Underflow&#39;, &#39;CWE775_Missing_Release_of_File_Descriptor_or_Handle&#39;, &#39;CWE259_Hard_Coded_Password&#39;, &#39;CWE665_Improper_Initialization&#39;, &#39;CWE676_Use_of_Potentially_Dangerous_Function&#39;, &#39;CWE15_External_Control_of_System_or_Configuration_Setting&#39;, &#39;CWE440_Expected_Behavior_Violation&#39;, &#39;CWE464_Addition_of_Data_Structure_Sentinel&#39;, &#39;CWE617_Reachable_Assertion&#39;, &#39;CWE36_Absolute_Path_Traversal&#39;, &#39;CWE761_Free_Pointer_Not_at_Start_of_Buffer&#39;]
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="216"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="bU_pj6nK9s9N" data-outputId="1434ffaa-3945-4b7b-f354-402999855227">
<div class="sourceCode" id="cb330"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb330-1"><a href="#cb330-1" aria-hidden="true" tabindex="-1"></a>num_items <span class="op">=</span> <span class="bu">len</span>(subfolders_list)</span>
<span id="cb330-2"><a href="#cb330-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(num_items)</span>
<span id="cb330-3"><a href="#cb330-3" aria-hidden="true" tabindex="-1"></a><span class="co">## 42 ## 60</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>60
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="217"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="_W00iGJx9s46" data-outputId="061a84e1-3cac-42f6-9461-2ccdf588d717">
<div class="sourceCode" id="cb332"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb332-1"><a href="#cb332-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize an empty list to store the prefixes</span></span>
<span id="cb332-2"><a href="#cb332-2" aria-hidden="true" tabindex="-1"></a>prefixes <span class="op">=</span> []</span>
<span id="cb332-3"><a href="#cb332-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb332-4"><a href="#cb332-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb332-5"><a href="#cb332-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;/content/cpp_data&#39;</span>):</span>
<span id="cb332-6"><a href="#cb332-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb332-7"><a href="#cb332-7" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Extract the prefix from the file name</span></span>
<span id="cb332-8"><a href="#cb332-8" aria-hidden="true" tabindex="-1"></a>        prefix <span class="op">=</span> re.match(<span class="vs">r&#39;(.*)__&#39;</span>, filename)</span>
<span id="cb332-9"><a href="#cb332-9" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> prefix:</span>
<span id="cb332-10"><a href="#cb332-10" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Add the prefix to the list</span></span>
<span id="cb332-11"><a href="#cb332-11" aria-hidden="true" tabindex="-1"></a>            prefixes.append(prefix.group(<span class="dv">1</span>))</span>
<span id="cb332-12"><a href="#cb332-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb332-13"><a href="#cb332-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Count the number of files for each prefix</span></span>
<span id="cb332-14"><a href="#cb332-14" aria-hidden="true" tabindex="-1"></a>prefix_counts <span class="op">=</span> collections.Counter(prefixes)</span>
<span id="cb332-15"><a href="#cb332-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb332-16"><a href="#cb332-16" aria-hidden="true" tabindex="-1"></a><span class="co"># Convert the counter to a dataframe</span></span>
<span id="cb332-17"><a href="#cb332-17" aria-hidden="true" tabindex="-1"></a>cpp_folders_count_df <span class="op">=</span> pd.DataFrame.from_dict(prefix_counts, orient<span class="op">=</span><span class="st">&#39;index&#39;</span>).reset_index()</span>
<span id="cb332-18"><a href="#cb332-18" aria-hidden="true" tabindex="-1"></a>cpp_folders_count_df.columns <span class="op">=</span> [<span class="st">&#39;CWE-ID&#39;</span>, <span class="st">&#39;Files-Count&#39;</span>]</span>
<span id="cb332-19"><a href="#cb332-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb332-20"><a href="#cb332-20" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(cpp_folders_count_df)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>                                               CWE-ID  Files-Count
0                       CWE23_Relative_Path_Traversal         3900
1                  CWE121_Stack_Based_Buffer_Overflow         1965
2                        CWE426_Untrusted_Search_Path           88
3                                  CWE401_Memory_Leak         1476
4        CWE762_Mismatched_Memory_Management_Routines         6092
5                       CWE690_NULL_Deref_From_Return          440
6                                  CWE415_Double_Free         1308
7                   CWE122_Heap_Based_Buffer_Overflow         5974
8                              CWE126_Buffer_Overread          912
9                       CWE36_Absolute_Path_Traversal         3900
10                            CWE127_Buffer_Underread         1416
11                         CWE400_Resource_Exhaustion          390
12                  CWE134_Uncontrolled_Format_String         1560
13                           CWE124_Buffer_Underwrite         1416
14                     CWE590_Free_Memory_Not_on_Heap         3321
15                      CWE789_Uncontrolled_Mem_Alloc         1080
16                         CWE78_OS_Command_Injection         2200
17                            CWE190_Integer_Overflow         1404
18                    CWE476_NULL_Pointer_Dereference          158
19         CWE195_Signed_to_Unsigned_Conversion_Error          528
20                             CWE114_Process_Control          264
21                         CWE617_Reachable_Assertion          132
22         CWE680_Integer_Overflow_to_Buffer_Overflow          600
23                              CWE416_Use_After_Free          370
24                    CWE606_Unchecked_Loop_Condition          260
25                               CWE90_LDAP_Injection          220
26            CWE675_Duplicate_Operations_on_Resource          104
27               CWE457_Use_of_Uninitialized_Variable          463
28                           CWE191_Integer_Underflow          858
29                  CWE404_Improper_Resource_Shutdown          176
30                             CWE563_Unused_Variable          388
31       CWE676_Use_of_Potentially_Dangerous_Function           18
32  CWE591_Sensitive_Data_Storage_in_Improperly_Lo...           44
33                              CWE369_Divide_by_Zero          468
34            CWE427_Uncontrolled_Search_Path_Element          220
35         CWE761_Free_Pointer_Not_at_Start_of_Buffer          264
36                   CWE194_Unexpected_Sign_Extension          528
37                        CWE390_Error_Without_Action           18
38                    CWE197_Numeric_Truncation_Error          396
39  CWE588_Attempt_to_Access_Child_of_Non_Structur...           76
40  CWE773_Missing_Reference_to_Active_File_Descri...           66
41                     CWE396_Catch_Generic_Exception           54
42  CWE775_Missing_Release_of_File_Descriptor_or_H...           66
43                     CWE665_Improper_Initialization           90
44                CWE321_Hard_Coded_Cryptographic_Key           44
45  CWE15_External_Control_of_System_or_Configurat...           22
46                          CWE758_Undefined_Behavior          216
47                 CWE319_Cleartext_Tx_Sensitive_Info          104
48  CWE672_Operation_on_Resource_After_Expiration_...           81
49                              CWE843_Type_Confusion           26
50                  CWE123_Write_What_Where_Condition           66
51               CWE256_Plaintext_Storage_of_Password           52
52                         CWE259_Hard_Coded_Password           44
53         CWE464_Addition_of_Data_Structure_Sentinel           22
54       CWE176_Improper_Handling_of_Unicode_Encoding           26
55                     CWE397_Throw_Generic_Exception           20
56                   CWE468_Incorrect_Pointer_Scaling            1
57                 CWE440_Expected_Behavior_Violation            1
58               CWE500_Public_Static_Field_Not_Final            2
59            CWE562_Return_of_Stack_Variable_Address            1
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="218"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="egIpb7L29s0r" data-outputId="45e75cd4-17b3-4712-ff95-65712ed99c31">
<div class="sourceCode" id="cb334"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb334-1"><a href="#cb334-1" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_cwe_top_25_df = cpp_folders_count_df.nlargest(25, &#39;Files-Count&#39;)</span></span>
<span id="cb334-2"><a href="#cb334-2" aria-hidden="true" tabindex="-1"></a>cpp_cwe_top_10_df <span class="op">=</span> cpp_folders_count_df.nlargest(<span class="dv">10</span>, <span class="st">&#39;Files-Count&#39;</span>)</span>
<span id="cb334-3"><a href="#cb334-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(cpp_cwe_top_10_df)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>                                          CWE-ID  Files-Count
4   CWE762_Mismatched_Memory_Management_Routines         6092
7              CWE122_Heap_Based_Buffer_Overflow         5974
0                  CWE23_Relative_Path_Traversal         3900
9                  CWE36_Absolute_Path_Traversal         3900
14                CWE590_Free_Memory_Not_on_Heap         3321
16                    CWE78_OS_Command_Injection         2200
1             CWE121_Stack_Based_Buffer_Overflow         1965
12             CWE134_Uncontrolled_Format_String         1560
3                             CWE401_Memory_Leak         1476
10                       CWE127_Buffer_Underread         1416
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="219" id="HIan8fC89VKe">
<div class="sourceCode" id="cb336"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb336-1"><a href="#cb336-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb336-2"><a href="#cb336-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Create the root folder</span></span>
<span id="cb336-3"><a href="#cb336-3" aria-hidden="true" tabindex="-1"></a>os.makedirs(<span class="st">&#39;/content/cpp_folders&#39;</span>, exist_ok<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb336-4"><a href="#cb336-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb336-5"><a href="#cb336-5" aria-hidden="true" tabindex="-1"></a><span class="co"># For each CWE-ID in the top 10/25</span></span>
<span id="cb336-6"><a href="#cb336-6" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> index, row <span class="kw">in</span> cpp_cwe_top_10_df.iterrows():</span>
<span id="cb336-7"><a href="#cb336-7" aria-hidden="true" tabindex="-1"></a>    cwe_id <span class="op">=</span> row[<span class="st">&#39;CWE-ID&#39;</span>]</span>
<span id="cb336-8"><a href="#cb336-8" aria-hidden="true" tabindex="-1"></a>    files_count <span class="op">=</span> row[<span class="st">&#39;Files-Count&#39;</span>]</span>
<span id="cb336-9"><a href="#cb336-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb336-10"><a href="#cb336-10" aria-hidden="true" tabindex="-1"></a>    <span class="co"># If the files count is greater than 10/350</span></span>
<span id="cb336-11"><a href="#cb336-11" aria-hidden="true" tabindex="-1"></a>    <span class="co"># if files_count &gt; 350:</span></span>
<span id="cb336-12"><a href="#cb336-12" aria-hidden="true" tabindex="-1"></a>    <span class="cf">if</span> files_count <span class="op">&gt;</span> <span class="dv">10</span>:</span>
<span id="cb336-13"><a href="#cb336-13" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Create a subfolder for the CWE-ID</span></span>
<span id="cb336-14"><a href="#cb336-14" aria-hidden="true" tabindex="-1"></a>        os.makedirs(<span class="ss">f&#39;/content/cpp_folders/</span><span class="sc">{</span>cwe_id<span class="sc">}</span><span class="ss">&#39;</span>, exist_ok<span class="op">=</span><span class="va">True</span>)</span>
<span id="cb336-15"><a href="#cb336-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb336-16"><a href="#cb336-16" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Get the list of files for the CWE-ID</span></span>
<span id="cb336-17"><a href="#cb336-17" aria-hidden="true" tabindex="-1"></a>        files <span class="op">=</span> [f <span class="cf">for</span> f <span class="kw">in</span> os.listdir(<span class="st">&#39;/content/cpp_data&#39;</span>) <span class="cf">if</span> f.startswith(cwe_id <span class="op">+</span> <span class="st">&#39;__&#39;</span>)]</span>
<span id="cb336-18"><a href="#cb336-18" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb336-19"><a href="#cb336-19" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Copy the initial 10/350 files</span></span>
<span id="cb336-20"><a href="#cb336-20" aria-hidden="true" tabindex="-1"></a>        <span class="co"># for file in files[:350]:</span></span>
<span id="cb336-21"><a href="#cb336-21" aria-hidden="true" tabindex="-1"></a>        <span class="cf">for</span> <span class="bu">file</span> <span class="kw">in</span> files[:<span class="dv">10</span>]:</span>
<span id="cb336-22"><a href="#cb336-22" aria-hidden="true" tabindex="-1"></a>            shutil.copy(os.path.join(<span class="st">&#39;/content/cpp_data&#39;</span>, <span class="bu">file</span>), <span class="ss">f&#39;/content/cpp_folders/</span><span class="sc">{</span>cwe_id<span class="sc">}</span><span class="ss">&#39;</span>)</span>
<span id="cb336-23"><a href="#cb336-23" aria-hidden="true" tabindex="-1"></a><span class="co">## 2s</span></span>
<span id="cb336-24"><a href="#cb336-24" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Copy the initial 10/350 files</span></span>
<span id="cb336-25"><a href="#cb336-25" aria-hidden="true" tabindex="-1"></a>        <span class="co"># for file in files[:350]:</span></span>
<span id="cb336-26"><a href="#cb336-26" aria-hidden="true" tabindex="-1"></a>        <span class="cf">for</span> <span class="bu">file</span> <span class="kw">in</span> files[:<span class="dv">10</span>]:</span>
<span id="cb336-27"><a href="#cb336-27" aria-hidden="true" tabindex="-1"></a>            shutil.copy(os.path.join(<span class="st">&#39;/content/cpp_data&#39;</span>, <span class="bu">file</span>), <span class="ss">f&#39;/content/cpp_folders/</span><span class="sc">{</span>cwe_id<span class="sc">}</span><span class="ss">&#39;</span>)</span>
<span id="cb336-28"><a href="#cb336-28" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 745 ms, sys: 1.5 s, total: 2.25 s</span></span>
<span id="cb336-29"><a href="#cb336-29" aria-hidden="true" tabindex="-1"></a><span class="co"># Wall time: 2.27 s</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="220"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Asl47Ieo-Nai" data-outputId="52b7b79c-0676-47ee-b1dc-db3890e6de17">
<div class="sourceCode" id="cb337"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb337-1"><a href="#cb337-1" aria-hidden="true" tabindex="-1"></a><span class="co"># %%time</span></span>
<span id="cb337-2"><a href="#cb337-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize an empty list to store the file details</span></span>
<span id="cb337-3"><a href="#cb337-3" aria-hidden="true" tabindex="-1"></a>file_details <span class="op">=</span> []</span>
<span id="cb337-4"><a href="#cb337-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb337-5"><a href="#cb337-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb337-6"><a href="#cb337-6" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;/content/cpp_folders&#39;</span>):</span>
<span id="cb337-7"><a href="#cb337-7" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb337-8"><a href="#cb337-8" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Extract the CWE-ID from the directory path</span></span>
<span id="cb337-9"><a href="#cb337-9" aria-hidden="true" tabindex="-1"></a>        cwe_id <span class="op">=</span> os.path.basename(dirpath)</span>
<span id="cb337-10"><a href="#cb337-10" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Extract the file short name from the file name</span></span>
<span id="cb337-11"><a href="#cb337-11" aria-hidden="true" tabindex="-1"></a>        file_short_name <span class="op">=</span> re.match(<span class="vs">r&#39;.*__(.*)\.cpp&#39;</span>, filename)</span>
<span id="cb337-12"><a href="#cb337-12" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> file_short_name:</span>
<span id="cb337-13"><a href="#cb337-13" aria-hidden="true" tabindex="-1"></a>            file_short_name <span class="op">=</span> file_short_name.group(<span class="dv">1</span>)</span>
<span id="cb337-14"><a href="#cb337-14" aria-hidden="true" tabindex="-1"></a>        <span class="co"># Append the file details to the list</span></span>
<span id="cb337-15"><a href="#cb337-15" aria-hidden="true" tabindex="-1"></a>        file_details.append([cwe_id, file_short_name, filename])</span>
<span id="cb337-16"><a href="#cb337-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb337-17"><a href="#cb337-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Convert the list to a dataframe</span></span>
<span id="cb337-18"><a href="#cb337-18" aria-hidden="true" tabindex="-1"></a>cpp_cwe_top_10_list_df <span class="op">=</span> pd.DataFrame(file_details, columns<span class="op">=</span>[<span class="st">&#39;CWE-ID&#39;</span>, <span class="st">&#39;File-Short-Name&#39;</span>, <span class="st">&#39;File-Full-Name&#39;</span>])</span>
<span id="cb337-19"><a href="#cb337-19" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb337-20"><a href="#cb337-20" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(cpp_cwe_top_10_list_df.head())</span>
<span id="cb337-21"><a href="#cb337-21" aria-hidden="true" tabindex="-1"></a><span class="co">## CPU times: user 43.9 ms, sys: 3.43 ms, total: 47.3 ms</span></span>
<span id="cb337-22"><a href="#cb337-22" aria-hidden="true" tabindex="-1"></a><span class="co">## Wall time: 49.4 ms</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>                              CWE-ID                      File-Short-Name  \
0  CWE122_Heap_Based_Buffer_Overflow       cpp_CWE805_wchar_t_loop_84_bad   
1  CWE122_Heap_Based_Buffer_Overflow  cpp_CWE806_char_snprintf_81_goodG2B   
2  CWE122_Heap_Based_Buffer_Overflow                   c_src_char_cpy_72b   
3  CWE122_Heap_Based_Buffer_Overflow                c_CWE129_fgets_81_bad   
4  CWE122_Heap_Based_Buffer_Overflow       cpp_CWE805_wchar_t_memmove_68b   

                                      File-Full-Name  
0  CWE122_Heap_Based_Buffer_Overflow__cpp_CWE805_...  
1  CWE122_Heap_Based_Buffer_Overflow__cpp_CWE806_...  
2  CWE122_Heap_Based_Buffer_Overflow__c_src_char_...  
3  CWE122_Heap_Based_Buffer_Overflow__c_CWE129_fg...  
4  CWE122_Heap_Based_Buffer_Overflow__cpp_CWE805_...  
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="221" id="TxaFx14QO-1W">
<div class="sourceCode" id="cb339"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb339-1"><a href="#cb339-1" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_cwe_top_10_list_df = cpp_cwe_top_10_350_files_list_df</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="222"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="VmFVEd91JDKp" data-outputId="fed2f956-8595-4e27-cc6a-c1c607ced332">
<div class="sourceCode" id="cb340"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb340-1"><a href="#cb340-1" aria-hidden="true" tabindex="-1"></a>cpp_cwe_top_10_list_df.shape <span class="co">## (100, 3)</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="222">
<pre><code>(100, 3)</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="223" id="YO2LKd4VOJBo">
<div class="sourceCode" id="cb342"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb342-1"><a href="#cb342-1" aria-hidden="true" tabindex="-1"></a><span class="co"># print(cpp_cwe_top_10_350_files_list_df) ## [100 rows x 3 columns] ## [8750 rows x 3 columns]</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="224"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:443}"
id="SGImsLLf-NX0" data-outputId="d5a0705f-97bf-4986-9186-4977accc595f">
<div class="sourceCode" id="cb343"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb343-1"><a href="#cb343-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 8750/350 ## 25</span></span>
<span id="cb343-2"><a href="#cb343-2" aria-hidden="true" tabindex="-1"></a><span class="co"># print(cpp_cwe_top_25_350_files_list_df) ## [8750 rows x 3 columns]</span></span>
<span id="cb343-3"><a href="#cb343-3" aria-hidden="true" tabindex="-1"></a>cpp_cwe_top_10_list_df <span class="co">## 100 rows × 3 columns ## [8750 rows x 3 columns]</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="224">

  <div id="df-ea34dab0-aef9-4b3d-a0b6-40e57ab5f994" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>CWE-ID</th>
      <th>File-Short-Name</th>
      <th>File-Full-Name</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>CWE122_Heap_Based_Buffer_Overflow</td>
      <td>cpp_CWE805_wchar_t_loop_84_bad</td>
      <td>CWE122_Heap_Based_Buffer_Overflow__cpp_CWE805_...</td>
    </tr>
    <tr>
      <th>1</th>
      <td>CWE122_Heap_Based_Buffer_Overflow</td>
      <td>cpp_CWE806_char_snprintf_81_goodG2B</td>
      <td>CWE122_Heap_Based_Buffer_Overflow__cpp_CWE806_...</td>
    </tr>
    <tr>
      <th>2</th>
      <td>CWE122_Heap_Based_Buffer_Overflow</td>
      <td>c_src_char_cpy_72b</td>
      <td>CWE122_Heap_Based_Buffer_Overflow__c_src_char_...</td>
    </tr>
    <tr>
      <th>3</th>
      <td>CWE122_Heap_Based_Buffer_Overflow</td>
      <td>c_CWE129_fgets_81_bad</td>
      <td>CWE122_Heap_Based_Buffer_Overflow__c_CWE129_fg...</td>
    </tr>
    <tr>
      <th>4</th>
      <td>CWE122_Heap_Based_Buffer_Overflow</td>
      <td>cpp_CWE805_wchar_t_memmove_68b</td>
      <td>CWE122_Heap_Based_Buffer_Overflow__cpp_CWE805_...</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>95</th>
      <td>CWE134_Uncontrolled_Format_String</td>
      <td>char_file_w32_vsnprintf_82a</td>
      <td>CWE134_Uncontrolled_Format_String__char_file_w...</td>
    </tr>
    <tr>
      <th>96</th>
      <td>CWE134_Uncontrolled_Format_String</td>
      <td>char_environment_vfprintf_84_goodB2G</td>
      <td>CWE134_Uncontrolled_Format_String__char_enviro...</td>
    </tr>
    <tr>
      <th>97</th>
      <td>CWE134_Uncontrolled_Format_String</td>
      <td>wchar_t_file_printf_84_bad</td>
      <td>CWE134_Uncontrolled_Format_String__wchar_t_fil...</td>
    </tr>
    <tr>
      <th>98</th>
      <td>CWE134_Uncontrolled_Format_String</td>
      <td>wchar_t_environment_w32_vsnprintf_84a</td>
      <td>CWE134_Uncontrolled_Format_String__wchar_t_env...</td>
    </tr>
    <tr>
      <th>99</th>
      <td>CWE134_Uncontrolled_Format_String</td>
      <td>wchar_t_connect_socket_printf_74b</td>
      <td>CWE134_Uncontrolled_Format_String__wchar_t_con...</td>
    </tr>
  </tbody>
</table>
<p>100 rows × 3 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-ea34dab0-aef9-4b3d-a0b6-40e57ab5f994')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-ea34dab0-aef9-4b3d-a0b6-40e57ab5f994 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-ea34dab0-aef9-4b3d-a0b6-40e57ab5f994');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-809221d4-6d80-447b-83da-7146550f67f9">
  <button class="colab-df-quickchart" onclick="quickchart('df-809221d4-6d80-447b-83da-7146550f67f9')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-809221d4-6d80-447b-83da-7146550f67f9 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="225"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="T7d3LadN-NVF" data-outputId="e9cdd113-ce24-4bd4-bbc3-95761684fd13">
<div class="sourceCode" id="cb344"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb344-1"><a href="#cb344-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(cpp_cwe_top_10_list_df[<span class="st">&#39;File-Full-Name&#39;</span>][<span class="dv">2</span>]) <span class="co">## CWE122_Heap_Based_Buffer_Overflow__c_CWE129_fgets_62a.cpp</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CWE122_Heap_Based_Buffer_Overflow__c_src_char_cpy_72b.cpp
</code></pre>
</div>
</div>
<section id="section" class="cell markdown" id="l_3jEiKzRawF">
<h4>...</h4>
</section>
<div class="cell code" data-execution_count="226"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="_Ql6gMOYADar" data-outputId="337f83a2-7f3b-4ee8-b1c0-d67c518030d0">
<div class="sourceCode" id="cb346"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb346-1"><a href="#cb346-1" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_cwe_top_25_350_files_list_df[&#39;File-Full-Name&#39;] ## [8750 rows x 3 columns]</span></span>
<span id="cb346-2"><a href="#cb346-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(cpp_cwe_top_10_list_df[<span class="st">&#39;File-Full-Name&#39;</span>][<span class="dv">2</span>]) <span class="co">## [8750 rows x 3 columns]</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CWE122_Heap_Based_Buffer_Overflow__c_src_char_cpy_72b.cpp
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="227" id="g1gqx9ubECey">
<div class="sourceCode" id="cb348"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb348-1"><a href="#cb348-1" aria-hidden="true" tabindex="-1"></a>cpp_files_top_10_cwe_list_df <span class="op">=</span> cpp_cwe_top_10_list_df.copy()</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="228" id="DnSrhGnbE87B">
<div class="sourceCode" id="cb349"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb349-1"><a href="#cb349-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>cp <span class="op">-</span>r cpp_folders cpp_files_top_10_cwe</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="229"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="xeUjAm7kECbR" data-outputId="53dc769e-d994-4080-fccb-3c0a64eb7352">
<div class="sourceCode" id="cb350"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb350-1"><a href="#cb350-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span><span class="bu">zip</span> cpp_files_top_10_cwe.<span class="bu">zip</span> cpp_files_top_10_cwe</span></code></pre></div>
<div class="output stream stdout">
<pre><code>  adding: cpp_files_top_10_cwe/ (stored 0%)
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="230" id="GSzr3j8wF1-J">
<div class="sourceCode" id="cb352"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb352-1"><a href="#cb352-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>mv cpp_files_top_10_cwe.<span class="bu">zip</span> temp.<span class="bu">zip</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="231" id="4nQeiYNZFnhF">
<div class="sourceCode" id="cb353"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb353-1"><a href="#cb353-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>czf cpp_files_top_10_cwe.tar.gz cpp_files_top_10_cwe</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="232"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="zmwCka6lFuH2" data-outputId="4210e060-077c-4d1f-c49d-14e82fe1a049">
<div class="sourceCode" id="cb354"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb354-1"><a href="#cb354-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh</span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 767M
-rw-r--r--     1 root root 9.9M Jan  7 10:38 1st_nlp_text_clf_cpp_top_cwe_v240104a.tar.gz
-rw-r--r--     1 root root 671M Aug 11  2022 2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip
drwxr-xr-x     4 root root 4.0K Jan  7 10:38 bin.tf
-rw-r--r--     1 root root 339K Jan  7 10:33 bow_transformer.pk
-rw-r--r--     1 root root 756K Jan  7 10:35 cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz
drwxr-xr-x    27 root root 4.0K Jan  2 04:10 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted
-rw-r--r--     1 root root 875K Jan  7 10:35 _cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
drwxr-xr-x    12 root root 4.0K Jan  3 17:15 cpp_clean_files_top_10_cwe_omitted
-rw-r--r--     1 root root  15K Jan  7 10:35 cpp_clean_files_top_10_cwe_omitted.tar.gz
drwxr-xr-x     2 root root 4.6M Jan  7 10:46 cpp_data
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_files_top_10_cwe
-rw-r--r--     1 root root  29K Jan  7 10:46 cpp_files_top_10_cwe.tar.gz
-rw-r--r--     1 root root 9.1M Jan  7 10:44 cpp_files_tree.csv
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_folders
drwxr-xr-x     4 root root 4.0K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf
-rw-r--r--     1 root root 878K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf.tar.gz
drwxr-xr-x 64101 root root 2.1M Jan  7 10:43 data
-rw-r--r--     1 root root  11M Jan  7 10:32 data_drop_na.csv
-rw-r--r--     1 root root  49M Jan  7 10:44 files_tree.csv
-rw-r--r--     1 root root  52K Jan  7 10:36 model.png
-rw-r--r--     1 root root 4.9M Jan  7 10:35 nb_model.pk
-rw-r--r--     1 root root 2.9M Jan  7 10:35 sard.zip
-rw-r--r--     1 root root  192 Jan  7 10:46 temp.zip
drwxr-xr-x    27 root root 4.0K Jan  7 10:35 test
-rw-r--r--     1 root root 201K Jan  7 10:33 tfidf_transformer.pk
drwxr-xr-x    27 root root 4.0K Jan  7 10:35 train
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="233" id="6LxqHNE_Fnap">
<div class="sourceCode" id="cb356"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb356-1"><a href="#cb356-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -czf cpp_files_top_10_cwe.tar.gz cpp_files_top_10_cwe</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="234" id="JnIZl7m4Fndb">
<div class="sourceCode" id="cb357"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb357-1"><a href="#cb357-1" aria-hidden="true" tabindex="-1"></a><span class="co"># # %%time</span></span>
<span id="cb357-2"><a href="#cb357-2" aria-hidden="true" tabindex="-1"></a><span class="co"># # !tar -czf cpp_files_raw.tar.gz -C /content/cpp_data .</span></span>
<span id="cb357-3"><a href="#cb357-3" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -czf cpp_all_files_raw.tar.gz -C /content/cpp_data .</span></span>
<span id="cb357-4"><a href="#cb357-4" aria-hidden="true" tabindex="-1"></a><span class="co"># ## CPU times: user 36.3 ms, sys: 3.97 ms, total: 40.3 ms</span></span>
<span id="cb357-5"><a href="#cb357-5" aria-hidden="true" tabindex="-1"></a><span class="co"># ## Wall time: 4.42 s</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="235"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="JE6F9BQAIBSz" data-outputId="ef22bb0e-f38c-4415-993b-98a7c0785fff">
<div class="sourceCode" id="cb358"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb358-1"><a href="#cb358-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh</span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 767M
-rw-r--r--     1 root root 9.9M Jan  7 10:38 1st_nlp_text_clf_cpp_top_cwe_v240104a.tar.gz
-rw-r--r--     1 root root 671M Aug 11  2022 2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip
drwxr-xr-x     4 root root 4.0K Jan  7 10:38 bin.tf
-rw-r--r--     1 root root 339K Jan  7 10:33 bow_transformer.pk
-rw-r--r--     1 root root 756K Jan  7 10:35 cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz
drwxr-xr-x    27 root root 4.0K Jan  2 04:10 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted
-rw-r--r--     1 root root 875K Jan  7 10:35 _cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
drwxr-xr-x    12 root root 4.0K Jan  3 17:15 cpp_clean_files_top_10_cwe_omitted
-rw-r--r--     1 root root  15K Jan  7 10:35 cpp_clean_files_top_10_cwe_omitted.tar.gz
drwxr-xr-x     2 root root 4.6M Jan  7 10:46 cpp_data
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_files_top_10_cwe
-rw-r--r--     1 root root  29K Jan  7 10:46 cpp_files_top_10_cwe.tar.gz
-rw-r--r--     1 root root 9.1M Jan  7 10:44 cpp_files_tree.csv
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_folders
drwxr-xr-x     4 root root 4.0K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf
-rw-r--r--     1 root root 878K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf.tar.gz
drwxr-xr-x 64101 root root 2.1M Jan  7 10:43 data
-rw-r--r--     1 root root  11M Jan  7 10:32 data_drop_na.csv
-rw-r--r--     1 root root  49M Jan  7 10:44 files_tree.csv
-rw-r--r--     1 root root  52K Jan  7 10:36 model.png
-rw-r--r--     1 root root 4.9M Jan  7 10:35 nb_model.pk
-rw-r--r--     1 root root 2.9M Jan  7 10:35 sard.zip
-rw-r--r--     1 root root  192 Jan  7 10:46 temp.zip
drwxr-xr-x    27 root root 4.0K Jan  7 10:35 test
-rw-r--r--     1 root root 201K Jan  7 10:33 tfidf_transformer.pk
drwxr-xr-x    27 root root 4.0K Jan  7 10:35 train
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="236" id="xWmBglc5ECYY">
<div class="sourceCode" id="cb360"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb360-1"><a href="#cb360-1" aria-hidden="true" tabindex="-1"></a><span class="co">## extract or un-tar (unzip):</span></span>
<span id="cb360-2"><a href="#cb360-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -xzf cpp_files_raw.tar.gz</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="237" id="uFePBaLvHTuQ">
<div class="sourceCode" id="cb361"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb361-1"><a href="#cb361-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Copy the cpp_folders directory to cpp_clean_folders</span></span>
<span id="cb361-2"><a href="#cb361-2" aria-hidden="true" tabindex="-1"></a>shutil.copytree(<span class="st">&#39;cpp_folders&#39;</span>, <span class="st">&#39;cpp_clean_folders&#39;</span>)</span>
<span id="cb361-3"><a href="#cb361-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb361-4"><a href="#cb361-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb361-5"><a href="#cb361-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;cpp_clean_folders&#39;</span>):</span>
<span id="cb361-6"><a href="#cb361-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb361-7"><a href="#cb361-7" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> filename.endswith(<span class="st">&#39;.cpp&#39;</span>):</span>
<span id="cb361-8"><a href="#cb361-8" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Get the file path</span></span>
<span id="cb361-9"><a href="#cb361-9" aria-hidden="true" tabindex="-1"></a>            file_path <span class="op">=</span> os.path.join(dirpath, filename)</span>
<span id="cb361-10"><a href="#cb361-10" aria-hidden="true" tabindex="-1"></a>            <span class="cf">with</span> <span class="bu">open</span>(file_path, <span class="st">&#39;r&#39;</span>) <span class="im">as</span> <span class="bu">file</span>:</span>
<span id="cb361-11"><a href="#cb361-11" aria-hidden="true" tabindex="-1"></a>                file_data <span class="op">=</span> <span class="bu">file</span>.read()</span>
<span id="cb361-12"><a href="#cb361-12" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Remove the comments</span></span>
<span id="cb361-13"><a href="#cb361-13" aria-hidden="true" tabindex="-1"></a>            file_data <span class="op">=</span> re.sub(<span class="vs">r&#39;/\*.*?\*/&#39;</span>, <span class="st">&#39;&#39;</span>, file_data, flags<span class="op">=</span>re.DOTALL)</span>
<span id="cb361-14"><a href="#cb361-14" aria-hidden="true" tabindex="-1"></a>            <span class="cf">with</span> <span class="bu">open</span>(file_path, <span class="st">&#39;w&#39;</span>) <span class="im">as</span> <span class="bu">file</span>:</span>
<span id="cb361-15"><a href="#cb361-15" aria-hidden="true" tabindex="-1"></a>                <span class="bu">file</span>.write(file_data)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="238" id="bqCu4cqyKbbw">
<div class="sourceCode" id="cb362"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb362-1"><a href="#cb362-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>cp <span class="op">-</span>r cpp_clean_folders cpp_clean_files_top_10_cwe</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="239" id="QHlX_-jYHTqA">
<div class="sourceCode" id="cb363"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb363-1"><a href="#cb363-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>czf cpp_clean_files_top_10_cwe.tar.gz cpp_clean_files_top_10_cwe</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="240" id="YZafhERzLB4w">
<div class="sourceCode" id="cb364"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb364-1"><a href="#cb364-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Copy the cpp_folders directory to cpp_clean_folders</span></span>
<span id="cb364-2"><a href="#cb364-2" aria-hidden="true" tabindex="-1"></a>shutil.copytree(<span class="st">&#39;cpp_data&#39;</span>, <span class="st">&#39;cpp_clean_data&#39;</span>)</span>
<span id="cb364-3"><a href="#cb364-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb364-4"><a href="#cb364-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb364-5"><a href="#cb364-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;cpp_clean_data&#39;</span>):</span>
<span id="cb364-6"><a href="#cb364-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb364-7"><a href="#cb364-7" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> filename.endswith(<span class="st">&#39;.cpp&#39;</span>):</span>
<span id="cb364-8"><a href="#cb364-8" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Get the file path</span></span>
<span id="cb364-9"><a href="#cb364-9" aria-hidden="true" tabindex="-1"></a>            file_path <span class="op">=</span> os.path.join(dirpath, filename)</span>
<span id="cb364-10"><a href="#cb364-10" aria-hidden="true" tabindex="-1"></a>            <span class="cf">with</span> <span class="bu">open</span>(file_path, <span class="st">&#39;r&#39;</span>) <span class="im">as</span> <span class="bu">file</span>:</span>
<span id="cb364-11"><a href="#cb364-11" aria-hidden="true" tabindex="-1"></a>                file_data <span class="op">=</span> <span class="bu">file</span>.read()</span>
<span id="cb364-12"><a href="#cb364-12" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Remove the comments</span></span>
<span id="cb364-13"><a href="#cb364-13" aria-hidden="true" tabindex="-1"></a>            file_data <span class="op">=</span> re.sub(<span class="vs">r&#39;/\*.*?\*/&#39;</span>, <span class="st">&#39;&#39;</span>, file_data, flags<span class="op">=</span>re.DOTALL)</span>
<span id="cb364-14"><a href="#cb364-14" aria-hidden="true" tabindex="-1"></a>            <span class="cf">with</span> <span class="bu">open</span>(file_path, <span class="st">&#39;w&#39;</span>) <span class="im">as</span> <span class="bu">file</span>:</span>
<span id="cb364-15"><a href="#cb364-15" aria-hidden="true" tabindex="-1"></a>                <span class="bu">file</span>.write(file_data)</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="241" id="HnxfnCpAHTnx">
<div class="sourceCode" id="cb365"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb365-1"><a href="#cb365-1" aria-hidden="true" tabindex="-1"></a><span class="co"># # %%time</span></span>
<span id="cb365-2"><a href="#cb365-2" aria-hidden="true" tabindex="-1"></a><span class="co"># # !tar -czf cpp_files_raw.tar.gz -C /content/cpp_data .</span></span>
<span id="cb365-3"><a href="#cb365-3" aria-hidden="true" tabindex="-1"></a><span class="co"># !tar -czf cpp_all_files_clean.tar.gz -C /content/cpp_clean_data .</span></span>
<span id="cb365-4"><a href="#cb365-4" aria-hidden="true" tabindex="-1"></a><span class="co"># ## CPU times: user 36.3 ms, sys: 3.97 ms, total: 40.3 ms</span></span>
<span id="cb365-5"><a href="#cb365-5" aria-hidden="true" tabindex="-1"></a><span class="co"># ## Wall time: 4.42 s</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="242"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="LfB0qkSHMMXw" data-outputId="5216611a-57ff-4ee1-f2dc-af6f97e69021">
<div class="sourceCode" id="cb366"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb366-1"><a href="#cb366-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>cp cpp_all_files_clean.tar.gz cpp_all_clean_files_w_top_10_cwe_titles.tar.gz</span></code></pre></div>
<div class="output stream stdout">
<pre><code>cp: cannot stat &#39;cpp_all_files_clean.tar.gz&#39;: No such file or directory
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="243" id="plPSbmx-LoqT">
<div class="sourceCode" id="cb368"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb368-1"><a href="#cb368-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>mkdir tar_gz_files</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="244" id="WTgDesgLHThi">
<div class="sourceCode" id="cb369"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb369-1"><a href="#cb369-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>cp <span class="op">*</span>.gz tar_gz_files</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="245" id="ED_FnraOMxmI">
<div class="sourceCode" id="cb370"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb370-1"><a href="#cb370-1" aria-hidden="true" tabindex="-1"></a><span class="co"># from google.colab import drive</span></span>
<span id="cb370-2"><a href="#cb370-2" aria-hidden="true" tabindex="-1"></a><span class="co"># drive.mount(&#39;/content/drive&#39;)</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="246" id="tNtOPpoSMr3T">
<div class="sourceCode" id="cb371"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb371-1"><a href="#cb371-1" aria-hidden="true" tabindex="-1"></a> <span class="co">## /content/tar_gz_files/cpp_clean_files_top_10_cwe.tar.gz</span></span>
<span id="cb371-2"><a href="#cb371-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !cp tar_gz_files/* /content/drive/MyDrive/1st-SHARED-Data</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="247" id="nujrBlNIVgTD">
<div class="sourceCode" id="cb372"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb372-1"><a href="#cb372-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Copy the cpp_clean_folders directory to cpp_clean_omitted_cwe_folders</span></span>
<span id="cb372-2"><a href="#cb372-2" aria-hidden="true" tabindex="-1"></a>shutil.copytree(<span class="st">&#39;cpp_clean_folders&#39;</span>, <span class="st">&#39;cpp_clean_omitted_cwe_folders&#39;</span>)</span>
<span id="cb372-3"><a href="#cb372-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb372-4"><a href="#cb372-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Traverse the directory tree</span></span>
<span id="cb372-5"><a href="#cb372-5" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> dirpath, dirs, files <span class="kw">in</span> os.walk(<span class="st">&#39;cpp_clean_omitted_cwe_folders&#39;</span>):</span>
<span id="cb372-6"><a href="#cb372-6" aria-hidden="true" tabindex="-1"></a>    <span class="cf">for</span> filename <span class="kw">in</span> files:</span>
<span id="cb372-7"><a href="#cb372-7" aria-hidden="true" tabindex="-1"></a>        <span class="cf">if</span> filename.endswith(<span class="st">&#39;.cpp&#39;</span>):</span>
<span id="cb372-8"><a href="#cb372-8" aria-hidden="true" tabindex="-1"></a>            <span class="co"># Extract the CWE-ID from the file name</span></span>
<span id="cb372-9"><a href="#cb372-9" aria-hidden="true" tabindex="-1"></a>            cwe_id <span class="op">=</span> re.match(<span class="vs">r&#39;(.*?)__&#39;</span>, filename)</span>
<span id="cb372-10"><a href="#cb372-10" aria-hidden="true" tabindex="-1"></a>            <span class="cf">if</span> cwe_id:</span>
<span id="cb372-11"><a href="#cb372-11" aria-hidden="true" tabindex="-1"></a>                cwe_id <span class="op">=</span> cwe_id.group(<span class="dv">1</span>)</span>
<span id="cb372-12"><a href="#cb372-12" aria-hidden="true" tabindex="-1"></a>                <span class="co"># Get the file path</span></span>
<span id="cb372-13"><a href="#cb372-13" aria-hidden="true" tabindex="-1"></a>                file_path <span class="op">=</span> os.path.join(dirpath, filename)</span>
<span id="cb372-14"><a href="#cb372-14" aria-hidden="true" tabindex="-1"></a>                <span class="cf">with</span> <span class="bu">open</span>(file_path, <span class="st">&#39;r&#39;</span>) <span class="im">as</span> <span class="bu">file</span>:</span>
<span id="cb372-15"><a href="#cb372-15" aria-hidden="true" tabindex="-1"></a>                    file_data <span class="op">=</span> <span class="bu">file</span>.read()</span>
<span id="cb372-16"><a href="#cb372-16" aria-hidden="true" tabindex="-1"></a>                <span class="co"># Remove the CWE-ID and only one single word from the file data</span></span>
<span id="cb372-17"><a href="#cb372-17" aria-hidden="true" tabindex="-1"></a>                file_data <span class="op">=</span> re.sub(cwe_id <span class="op">+</span> <span class="st">&#39;__&#39;</span> <span class="op">+</span> <span class="vs">r&#39;\w+?_&#39;</span>, <span class="st">&#39;&#39;</span>, file_data)</span>
<span id="cb372-18"><a href="#cb372-18" aria-hidden="true" tabindex="-1"></a>                <span class="cf">with</span> <span class="bu">open</span>(file_path, <span class="st">&#39;w&#39;</span>) <span class="im">as</span> <span class="bu">file</span>:</span>
<span id="cb372-19"><a href="#cb372-19" aria-hidden="true" tabindex="-1"></a>                    <span class="bu">file</span>.write(file_data)</span>
<span id="cb372-20"><a href="#cb372-20" aria-hidden="true" tabindex="-1"></a>                <span class="co"># Remove the CWE-ID and only one single word from the file name</span></span>
<span id="cb372-21"><a href="#cb372-21" aria-hidden="true" tabindex="-1"></a>                new_filename <span class="op">=</span> re.sub(cwe_id <span class="op">+</span> <span class="st">&#39;__&#39;</span> <span class="op">+</span> <span class="vs">r&#39;\w+?_&#39;</span>, <span class="st">&#39;&#39;</span>, filename)</span>
<span id="cb372-22"><a href="#cb372-22" aria-hidden="true" tabindex="-1"></a>                os.rename(file_path, os.path.join(dirpath, new_filename))</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="248" id="zhgOeefjXPUd">
<div class="sourceCode" id="cb373"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb373-1"><a href="#cb373-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>cp <span class="op">-</span>r cpp_clean_omitted_cwe_folders cpp_clean_files_top_10_cwe_omitted</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="249" id="Z_uoWNnUVgQf">
<div class="sourceCode" id="cb374"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb374-1"><a href="#cb374-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>tar <span class="op">-</span>czf cpp_clean_files_top_10_cwe_omitted.tar.gz cpp_clean_files_top_10_cwe_omitted</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="250" id="LNzrZGdSVgOA">
<div class="sourceCode" id="cb375"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb375-1"><a href="#cb375-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>cp cpp_clean_files_top_10_cwe_omitted.tar.gz tar_gz_files</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="251" id="ShfgVsQ8VgLw">
<div class="sourceCode" id="cb376"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb376-1"><a href="#cb376-1" aria-hidden="true" tabindex="-1"></a><span class="co"># ## cpp_clean_files_top_10_cwe_omitted.tar.gz</span></span>
<span id="cb376-2"><a href="#cb376-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !cp tar_gz_files/* /content/drive/MyDrive/1st-SHARED-Data</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="252" id="k-nvk6XWVgIx">
<div class="sourceCode" id="cb377"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb377-1"><a href="#cb377-1" aria-hidden="true" tabindex="-1"></a><span class="co"># from google.colab import files</span></span>
<span id="cb377-2"><a href="#cb377-2" aria-hidden="true" tabindex="-1"></a><span class="co"># files.download(&quot;cpp_clean_files_top_10_cwe_omitted.tar.gz&quot;)</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="253" id="UXhUmWeKYZ6V">
<div class="sourceCode" id="cb378"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb378-1"><a href="#cb378-1" aria-hidden="true" tabindex="-1"></a><span class="co"># ## cpp_clean_files_top_10_cwe_omitted.tar.gz</span></span>
<span id="cb378-2"><a href="#cb378-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !gdown 1YQHdd457W4NjuTvJYiucKUr8pRwbGulj</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="254" id="fR4UogpAU6ho">
<div class="sourceCode" id="cb379"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb379-1"><a href="#cb379-1" aria-hidden="true" tabindex="-1"></a><span class="co"># /content/cpp_clean_folders/CWE121_Stack_Based_Buffer_Overflow/CWE121_Stack_Based_Buffer_Overflow__CWE129_connect_socket_62a.cpp</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="255" id="KzUX0_KUMrqO">
<div class="sourceCode" id="cb380"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb380-1"><a href="#cb380-1" aria-hidden="true" tabindex="-1"></a><span class="co">## cpp_files_all_raw.zip</span></span></code></pre></div>
</div>
<section id="section" class="cell markdown" id="r8Bj8fLEr_n9">
<h4>...</h4>
</section>
<div class="cell code" data-execution_count="256"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="xCif52HJtCC_" data-outputId="eb952091-81ca-4598-ec65-fed58fcc4bb4">
<div class="sourceCode" id="cb381"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb381-1"><a href="#cb381-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb381-2"><a href="#cb381-2" aria-hidden="true" tabindex="-1"></a><span class="co"># # # !cd /content/cpp_data &amp;&amp; zip /content/cpp_data.zip *.cpp</span></span>
<span id="cb381-3"><a href="#cb381-3" aria-hidden="true" tabindex="-1"></a><span class="co"># # # !cd /content/cpp_data &amp;&amp; zip /content/cpp_files_raw.zip *</span></span>
<span id="cb381-4"><a href="#cb381-4" aria-hidden="true" tabindex="-1"></a><span class="co"># # !zip cpp_files_raw.zip /content/cpp_data/*</span></span>
<span id="cb381-5"><a href="#cb381-5" aria-hidden="true" tabindex="-1"></a><span class="co"># # &gt;&gt;&gt;</span></span>
<span id="cb381-6"><a href="#cb381-6" aria-hidden="true" tabindex="-1"></a><span class="co"># # /bin/bash: line 1: /usr/bin/zip: Argument list too long</span></span>
<span id="cb381-7"><a href="#cb381-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb381-8"><a href="#cb381-8" aria-hidden="true" tabindex="-1"></a><span class="co"># # !cd /content/cpp_data &amp;&amp; find . -type f -name &quot;*.cpp&quot; -exec zip /content/cpp_files_raw.zip {} \;</span></span>
<span id="cb381-9"><a href="#cb381-9" aria-hidden="true" tabindex="-1"></a><span class="co"># !cd /content/cpp_data &amp;&amp; find . -type f -exec zip /content/cpp_files_all_raw.zip {} \;</span></span>
<span id="cb381-10"><a href="#cb381-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb381-11"><a href="#cb381-11" aria-hidden="true" tabindex="-1"></a><span class="co"># ##</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.25 µs
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="257" id="ihbr9k7BsjcV">
<div class="sourceCode" id="cb383"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb383-1"><a href="#cb383-1" aria-hidden="true" tabindex="-1"></a><span class="co"># # !ls /content/cpp_data/*.cpp | wc -l ## Argument list too long</span></span>
<span id="cb383-2"><a href="#cb383-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !ls /content/cpp_data/ | wc -l ## 411 ## 46399</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="258" id="tnhqeu5Jm5qN">
<div class="sourceCode" id="cb384"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb384-1"><a href="#cb384-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip  cpp_data</span></span>
<span id="cb384-2"><a href="#cb384-2" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_all_clean_files_w_top_10_cwe_titles.tar.gz		     cpp_files_all_raw.zip</span></span>
<span id="cb384-3"><a href="#cb384-3" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_all_files_clean.tar.gz				     cpp_files_top_10_cwe</span></span>
<span id="cb384-4"><a href="#cb384-4" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_all_files_raw.tar.gz				     cpp_files_top_10_cwe.tar.gz</span></span>
<span id="cb384-5"><a href="#cb384-5" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_clean_data						     cpp_files_tree.csv</span></span>
<span id="cb384-6"><a href="#cb384-6" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_clean_files_top_10_cwe				     cpp_folders</span></span>
<span id="cb384-7"><a href="#cb384-7" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_clean_files_top_10_cwe_omitted			     data</span></span>
<span id="cb384-8"><a href="#cb384-8" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_clean_files_top_10_cwe_omitted.tar.gz		     files_tree.csv</span></span>
<span id="cb384-9"><a href="#cb384-9" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_clean_files_top_10_cwe.tar.gz			     sample_data</span></span>
<span id="cb384-10"><a href="#cb384-10" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_clean_folders					     tar_gz_files</span></span>
<span id="cb384-11"><a href="#cb384-11" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_clean_omitted_cwe_folders				     temp.zip</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="259" id="5DlqF9v7qk_E">
<div class="sourceCode" id="cb385"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb385-1"><a href="#cb385-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !ls</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="260"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="H_6OHWMFqADt" data-outputId="ce383e22-c70c-44bf-db1a-68341f07aa29">
<div class="sourceCode" id="cb386"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb386-1"><a href="#cb386-1" aria-hidden="true" tabindex="-1"></a>cpp_list[<span class="dv">1</span>]</span>
<span id="cb386-2"><a href="#cb386-2" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_df[:1]</span></span></code></pre></div>
<div class="output execute_result" data-execution_count="260">
<pre><code>[&#39;CWE590_Free_Memory_Not_on_Heap__delete_struct_declare_04.cpp&#39;,
 &#39;/content/data/107819-v1.0.0/src/testcases/CWE590_Free_Memory_Not_on_Heap/s03/CWE590_Free_Memory_Not_on_Heap__delete_struct_declare_04.cpp&#39;]</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="261"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="hgmqWcXlpLg2" data-outputId="b7061eec-f15b-4a9c-ed05-aadbcedb368a">
<div class="sourceCode" id="cb388"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb388-1"><a href="#cb388-1" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_list[0]</span></span>
<span id="cb388-2"><a href="#cb388-2" aria-hidden="true" tabindex="-1"></a><span class="co"># cpp_df[:1]</span></span>
<span id="cb388-3"><a href="#cb388-3" aria-hidden="true" tabindex="-1"></a>cpp_df[<span class="dv">1</span>:]</span></code></pre></div>
<div class="output execute_result" data-execution_count="261">

  <div id="df-8e82065e-75cd-4ecf-aab3-6e5a589d8933" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>File_Name</th>
      <th>File_Address</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>1</th>
      <td>CWE590_Free_Memory_Not_on_Heap__delete_struct_...</td>
      <td>/content/data/107819-v1.0.0/src/testcases/CWE5...</td>
    </tr>
    <tr>
      <th>2</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>3</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>4</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>5</th>
      <td>CWE773_Missing_Reference_to_Active_File_Descri...</td>
      <td>/content/data/116798-v1.0.0/src/testcases/CWE7...</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>46396</th>
      <td>CWE134_Uncontrolled_Format_String__wchar_t_fil...</td>
      <td>/content/data/81532-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46397</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46398</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46399</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46400</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
  </tbody>
</table>
<p>46400 rows × 2 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-8e82065e-75cd-4ecf-aab3-6e5a589d8933')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-8e82065e-75cd-4ecf-aab3-6e5a589d8933 button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-8e82065e-75cd-4ecf-aab3-6e5a589d8933');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-30f2165a-ea3a-4373-bcfa-12f7fed452d4">
  <button class="colab-df-quickchart" onclick="quickchart('df-30f2165a-ea3a-4373-bcfa-12f7fed452d4')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-30f2165a-ea3a-4373-bcfa-12f7fed452d4 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="262" id="k9yvgiOi9kef">
<div class="sourceCode" id="cb389"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb389-1"><a href="#cb389-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !gdown --id 1lvT5f-jOADyy2gjjnirzwyMSgNsf9Bz9</span></span>
<span id="cb389-2"><a href="#cb389-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !wget https://samate.nist.gov/SARD/downloads/test-suites/2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip</span></span>
<span id="cb389-3"><a href="#cb389-3" aria-hidden="true" tabindex="-1"></a><span class="co"># ## 2022 Juliet C/C++ 1.3.1 with extra support</span></span>
<span id="cb389-4"><a href="#cb389-4" aria-hidden="true" tabindex="-1"></a><span class="co"># https://samate.nist.gov/SARD/test-suites/116</span></span>
<span id="cb389-5"><a href="#cb389-5" aria-hidden="true" tabindex="-1"></a><span class="co"># https://samate.nist.gov/SARD/test-suites/112</span></span>
<span id="cb389-6"><a href="#cb389-6" aria-hidden="true" tabindex="-1"></a><span class="co"># !!!!!!!!!!!!!!1st-C6AI-AIxCC-NLP-Clf-Vulns-Threat-CWE-IDs @ NIST SARD Juliet C++</span></span>
<span id="cb389-7"><a href="#cb389-7" aria-hidden="true" tabindex="-1"></a><span class="co"># https://samate.nist.gov/SARD/test-suites</span></span>
<span id="cb389-8"><a href="#cb389-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb389-9"><a href="#cb389-9" aria-hidden="true" tabindex="-1"></a><span class="co"># from google.colab import drive</span></span>
<span id="cb389-10"><a href="#cb389-10" aria-hidden="true" tabindex="-1"></a><span class="co"># drive.mount(&#39;/content/drive&#39;)</span></span>
<span id="cb389-11"><a href="#cb389-11" aria-hidden="true" tabindex="-1"></a><span class="co"># !cp 2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip /content/drive/MyDrive/1st-SHARED-Data</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="263" id="L0msBb_x-Y3f">
<div class="sourceCode" id="cb390"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb390-1"><a href="#cb390-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !rm -r sample_data</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="264" id="IGt3oMkJBDXQ">
<div class="sourceCode" id="cb391"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb391-1"><a href="#cb391-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !mkdir data</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="265" id="OVQnNi7gBOKQ">
<div class="sourceCode" id="cb392"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb392-1"><a href="#cb392-1" aria-hidden="true" tabindex="-1"></a><span class="co"># # !gdown --id 1lvT5f-jOADyy2gjjnirzwyMSgNsf9Bz9</span></span>
<span id="cb392-2"><a href="#cb392-2" aria-hidden="true" tabindex="-1"></a><span class="co"># # !wget https://samate.nist.gov/SARD/downloads/test-suites/2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip</span></span>
<span id="cb392-3"><a href="#cb392-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb392-4"><a href="#cb392-4" aria-hidden="true" tabindex="-1"></a><span class="co"># from google.colab import drive</span></span>
<span id="cb392-5"><a href="#cb392-5" aria-hidden="true" tabindex="-1"></a><span class="co"># drive.mount(&#39;/content/drive&#39;)</span></span>
<span id="cb392-6"><a href="#cb392-6" aria-hidden="true" tabindex="-1"></a><span class="co"># # !cp 2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip /content/drive/MyDrive/1st-SHARED-Data</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="266" id="cYXHO-e_Bhf4">
<div class="sourceCode" id="cb393"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb393-1"><a href="#cb393-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !cp /content/drive/MyDrive/1st-SHARED-Data/2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip .</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="267"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;,&quot;height&quot;:423}"
id="fEaaA5FBQXhI" data-outputId="261fbb93-2bdb-4293-dd73-47924594c8be">
<div class="sourceCode" id="cb394"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb394-1"><a href="#cb394-1" aria-hidden="true" tabindex="-1"></a>cpp_df</span></code></pre></div>
<div class="output execute_result" data-execution_count="267">

  <div id="df-9e31e7be-ca76-49b5-b05e-50db3882603f" class="colab-df-container">
    <div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>File_Name</th>
      <th>File_Address</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>CWE36_Absolute_Path_Traversal__wchar_t_connect...</td>
      <td>/content/data/96809-v1.0.0/src/testcases/CWE36...</td>
    </tr>
    <tr>
      <th>1</th>
      <td>CWE590_Free_Memory_Not_on_Heap__delete_struct_...</td>
      <td>/content/data/107819-v1.0.0/src/testcases/CWE5...</td>
    </tr>
    <tr>
      <th>2</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>3</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>4</th>
      <td>CWE23_Relative_Path_Traversal__char_connect_so...</td>
      <td>/content/data/89720-v1.0.0/src/testcases/CWE23...</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>46396</th>
      <td>CWE134_Uncontrolled_Format_String__wchar_t_fil...</td>
      <td>/content/data/81532-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46397</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46398</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46399</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
    <tr>
      <th>46400</th>
      <td>CWE134_Uncontrolled_Format_String__char_consol...</td>
      <td>/content/data/79670-v1.0.0/src/testcases/CWE13...</td>
    </tr>
  </tbody>
</table>
<p>46401 rows × 2 columns</p>
</div>
    <div class="colab-df-buttons">

  <div class="colab-df-container">
    <button class="colab-df-convert" onclick="convertToInteractive('df-9e31e7be-ca76-49b5-b05e-50db3882603f')"
            title="Convert this dataframe to an interactive table."
            style="display:none;">

  <svg xmlns="http://www.w3.org/2000/svg" height="24px" viewBox="0 -960 960 960">
    <path d="M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z"/>
  </svg>
    </button>

  <style>
    .colab-df-container {
      display:flex;
      gap: 12px;
    }

    .colab-df-convert {
      background-color: #E8F0FE;
      border: none;
      border-radius: 50%;
      cursor: pointer;
      display: none;
      fill: #1967D2;
      height: 32px;
      padding: 0 0 0 0;
      width: 32px;
    }

    .colab-df-convert:hover {
      background-color: #E2EBFA;
      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
      fill: #174EA6;
    }

    .colab-df-buttons div {
      margin-bottom: 4px;
    }

    [theme=dark] .colab-df-convert {
      background-color: #3B4455;
      fill: #D2E3FC;
    }

    [theme=dark] .colab-df-convert:hover {
      background-color: #434B5C;
      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
      fill: #FFFFFF;
    }
  </style>

    <script>
      const buttonEl =
        document.querySelector('#df-9e31e7be-ca76-49b5-b05e-50db3882603f button.colab-df-convert');
      buttonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';

      async function convertToInteractive(key) {
        const element = document.querySelector('#df-9e31e7be-ca76-49b5-b05e-50db3882603f');
        const dataTable =
          await google.colab.kernel.invokeFunction('convertToInteractive',
                                                    [key], {});
        if (!dataTable) return;

        const docLinkHtml = 'Like what you see? Visit the ' +
          '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
          + ' to learn more about interactive tables.';
        element.innerHTML = '';
        dataTable['output_type'] = 'display_data';
        await google.colab.output.renderOutput(dataTable, element);
        const docLink = document.createElement('div');
        docLink.innerHTML = docLinkHtml;
        element.appendChild(docLink);
      }
    </script>
  </div>


<div id="df-0f41cf3d-e09f-4075-a3a5-538717da2fb1">
  <button class="colab-df-quickchart" onclick="quickchart('df-0f41cf3d-e09f-4075-a3a5-538717da2fb1')"
            title="Suggest charts"
            style="display:none;">

<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
     width="24px">
    <g>
        <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z"/>
    </g>
</svg>
  </button>

<style>
  .colab-df-quickchart {
      --bg-color: #E8F0FE;
      --fill-color: #1967D2;
      --hover-bg-color: #E2EBFA;
      --hover-fill-color: #174EA6;
      --disabled-fill-color: #AAA;
      --disabled-bg-color: #DDD;
  }

  [theme=dark] .colab-df-quickchart {
      --bg-color: #3B4455;
      --fill-color: #D2E3FC;
      --hover-bg-color: #434B5C;
      --hover-fill-color: #FFFFFF;
      --disabled-bg-color: #3B4455;
      --disabled-fill-color: #666;
  }

  .colab-df-quickchart {
    background-color: var(--bg-color);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    display: none;
    fill: var(--fill-color);
    height: 32px;
    padding: 0;
    width: 32px;
  }

  .colab-df-quickchart:hover {
    background-color: var(--hover-bg-color);
    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);
    fill: var(--button-hover-fill-color);
  }

  .colab-df-quickchart-complete:disabled,
  .colab-df-quickchart-complete:disabled:hover {
    background-color: var(--disabled-bg-color);
    fill: var(--disabled-fill-color);
    box-shadow: none;
  }

  .colab-df-spinner {
    border: 2px solid var(--fill-color);
    border-color: transparent;
    border-bottom-color: var(--fill-color);
    animation:
      spin 1s steps(1) infinite;
  }

  @keyframes spin {
    0% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
      border-left-color: var(--fill-color);
    }
    20% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    30% {
      border-color: transparent;
      border-left-color: var(--fill-color);
      border-top-color: var(--fill-color);
      border-right-color: var(--fill-color);
    }
    40% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-top-color: var(--fill-color);
    }
    60% {
      border-color: transparent;
      border-right-color: var(--fill-color);
    }
    80% {
      border-color: transparent;
      border-right-color: var(--fill-color);
      border-bottom-color: var(--fill-color);
    }
    90% {
      border-color: transparent;
      border-bottom-color: var(--fill-color);
    }
  }
</style>

  <script>
    async function quickchart(key) {
      const quickchartButtonEl =
        document.querySelector('#' + key + ' button');
      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.
      quickchartButtonEl.classList.add('colab-df-spinner');
      try {
        const charts = await google.colab.kernel.invokeFunction(
            'suggestCharts', [key], {});
      } catch (error) {
        console.error('Error during call to suggestCharts:', error);
      }
      quickchartButtonEl.classList.remove('colab-df-spinner');
      quickchartButtonEl.classList.add('colab-df-quickchart-complete');
    }
    (() => {
      let quickchartButtonEl =
        document.querySelector('#df-0f41cf3d-e09f-4075-a3a5-538717da2fb1 button');
      quickchartButtonEl.style.display =
        google.colab.kernel.accessAllowed ? 'block' : 'none';
    })();
  </script>
</div>
    </div>
  </div>

</div>
</div>
<div class="cell code" data-execution_count="268"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="Rm9yGhc3RZEG" data-outputId="ebcda119-2958-480f-d35a-5c335d970ee9">
<div class="sourceCode" id="cb395"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb395-1"><a href="#cb395-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh cpp_clean_files_top_10_cwe_omitted <span class="co">## /content/cpp_clean_files_top_10_cwe_omitted</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 44K
drwxr-xr-x 12 root root 4.0K Jan  7 10:47 cpp_clean_omitted_cwe_folders
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE121_Stack_Based_Buffer_Overflow
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE122_Heap_Based_Buffer_Overflow
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE127_Buffer_Underread
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE134_Uncontrolled_Format_String
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE23_Relative_Path_Traversal
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE36_Absolute_Path_Traversal
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE401_Memory_Leak
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE415_Double_Free
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE590_Free_Memory_Not_on_Heap
drwxr-xr-x  2 root root 4.0K Jan  3 17:15 CWE762_Mismatched_Memory_Management_Routines
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="269"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="EtR57GMjOWgF" data-outputId="0655e674-e466-452b-db49-3f4628addbe1">
<div class="sourceCode" id="cb397"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb397-1"><a href="#cb397-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls <span class="op">-</span>lh</span></code></pre></div>
<div class="output stream stdout">
<pre><code>total 773M
-rw-r--r--     1 root root 9.9M Jan  7 10:38 1st_nlp_text_clf_cpp_top_cwe_v240104a.tar.gz
-rw-r--r--     1 root root 671M Aug 11  2022 2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support.zip
drwxr-xr-x     4 root root 4.0K Jan  7 10:38 bin.tf
-rw-r--r--     1 root root 339K Jan  7 10:33 bow_transformer.pk
-rw-r--r--     1 root root 756K Jan  7 10:35 cpp_6102_train_2781_test_top_25_cwe_ready.tar.gz
drwxr-xr-x     2 root root 6.5M Jan  7 10:46 cpp_clean_data
drwxr-xr-x    27 root root 4.0K Jan  2 04:10 cpp_cleaner_8750_files_each_350_top_25_cwe_omitted
-rw-r--r--     1 root root 875K Jan  7 10:35 _cpp_cleaner_8750_files_each_350_top_25_cwe_omitted.tar.gz
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_clean_files_top_10_cwe
drwxr-xr-x    13 root root 4.0K Jan  7 10:47 cpp_clean_files_top_10_cwe_omitted
-rw-r--r--     1 root root  31K Jan  7 10:47 cpp_clean_files_top_10_cwe_omitted.tar.gz
-rw-r--r--     1 root root  18K Jan  7 10:46 cpp_clean_files_top_10_cwe.tar.gz
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_clean_folders
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_clean_omitted_cwe_folders
drwxr-xr-x     2 root root 4.6M Jan  7 10:46 cpp_data
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_files_top_10_cwe
-rw-r--r--     1 root root  29K Jan  7 10:46 cpp_files_top_10_cwe.tar.gz
-rw-r--r--     1 root root 9.1M Jan  7 10:44 cpp_files_tree.csv
drwxr-xr-x    12 root root 4.0K Jan  7 10:46 cpp_folders
drwxr-xr-x     4 root root 4.0K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf
-rw-r--r--     1 root root 878K Jan  7 10:38 cpp_top_25_cwe_cnn_model.tf.tar.gz
drwxr-xr-x 64101 root root 2.1M Jan  7 10:43 data
-rw-r--r--     1 root root  11M Jan  7 10:32 data_drop_na.csv
-rw-r--r--     1 root root  49M Jan  7 10:44 files_tree.csv
-rw-r--r--     1 root root  52K Jan  7 10:36 model.png
-rw-r--r--     1 root root 4.9M Jan  7 10:35 nb_model.pk
-rw-r--r--     1 root root 2.9M Jan  7 10:35 sard.zip
drwxr-xr-x     2 root root 4.0K Jan  7 10:47 tar_gz_files
-rw-r--r--     1 root root  192 Jan  7 10:46 temp.zip
drwxr-xr-x    27 root root 4.0K Jan  7 10:35 test
-rw-r--r--     1 root root 201K Jan  7 10:33 tfidf_transformer.pk
drwxr-xr-x    27 root root 4.0K Jan  7 10:35 train
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="273"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="1kYSoyqENdB5" data-outputId="bbadc217-0082-4cb5-ef40-a25edf5ae775">
<div class="sourceCode" id="cb399"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb399-1"><a href="#cb399-1" aria-hidden="true" tabindex="-1"></a><span class="co">## import time</span></span>
<span id="cb399-2"><a href="#cb399-2" aria-hidden="true" tabindex="-1"></a><span class="co">## global_start = time.time()</span></span>
<span id="cb399-3"><a href="#cb399-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb399-4"><a href="#cb399-4" aria-hidden="true" tabindex="-1"></a>global_end <span class="op">=</span> time.time()</span>
<span id="cb399-5"><a href="#cb399-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb399-6"><a href="#cb399-6" aria-hidden="true" tabindex="-1"></a><span class="co"># print(&quot;[T4 GPU &amp; High RAM?] Global Time Duration: &quot; + str(global_end - global_start))</span></span>
<span id="cb399-7"><a href="#cb399-7" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">&quot;Global Time Duration: &quot;</span> <span class="op">+</span> <span class="bu">str</span>(global_end <span class="op">-</span> global_start))</span>
<span id="cb399-8"><a href="#cb399-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb399-9"><a href="#cb399-9" aria-hidden="true" tabindex="-1"></a><span class="co">## [T4 GPU &amp; High RAM] Global Time Duration: 643</span></span>
<span id="cb399-10"><a href="#cb399-10" aria-hidden="true" tabindex="-1"></a><span class="co">## 643s / 60 ​is approximately 11 minutes</span></span>
<span id="cb399-11"><a href="#cb399-11" aria-hidden="true" tabindex="-1"></a><span class="co">## 8 mins on CPU &amp; High RAM</span></span>
<span id="cb399-12"><a href="#cb399-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb399-13"><a href="#cb399-13" aria-hidden="true" tabindex="-1"></a><span class="co">## Global Time Duration: 940.3888688087463</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>Global Time Duration: 1051.4102718830109
</code></pre>
</div>
</div>
<section id="this-may-take-few-minutes" class="cell markdown"
id="kK89zi3NeJ36">
<h3>This may take few minutes</h3>
</section>
<div class="cell code" data-execution_count="271"
data-colab="{&quot;base_uri&quot;:&quot;https://localhost:8080/&quot;}"
id="JVEjIYLfPVfC" data-outputId="680ffc19-4c89-4be0-cbbb-471158853dc3">
<div class="sourceCode" id="cb401"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb401-1"><a href="#cb401-1" aria-hidden="true" tabindex="-1"></a><span class="op">%%</span>time</span>
<span id="cb401-2"><a href="#cb401-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb401-3"><a href="#cb401-3" aria-hidden="true" tabindex="-1"></a><span class="co">## &gt; 1GB !</span></span>
<span id="cb401-4"><a href="#cb401-4" aria-hidden="true" tabindex="-1"></a><span class="co"># from google.colab import files</span></span>
<span id="cb401-5"><a href="#cb401-5" aria-hidden="true" tabindex="-1"></a><span class="co"># files.download(&quot;1st_nlp_text_clf_cpp_top_cwe_v240104a.tar.gz&quot;)</span></span>
<span id="cb401-6"><a href="#cb401-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb401-7"><a href="#cb401-7" aria-hidden="true" tabindex="-1"></a><span class="co">##</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 7.87 µs
</code></pre>
</div>
</div>
<section id="1st-uol-dsm140-cw---end" class="cell markdown"
id="FmxLks3OVYsC">
<h1>1st UoL DSM140 CW - End</h1>
</section>
<div class="cell code" data-execution_count="272" id="RzhMleT7v3Qk">
<div class="sourceCode" id="cb403"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb403-1"><a href="#cb403-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 210107005_UoL_DSM140_NLP_Text_Classification_CW_Sub_v240107wk.ipynb</span></span>
<span id="cb403-2"><a href="#cb403-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Commentable @ https://colab.research.google.com/drive/1kUTphSV9lHhbu_HT_tvffIPEtFWpFPIg?usp=sharing</span></span></code></pre></div>
</div>
</body>
</html>