-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.html
9938 lines (9483 loc) · 651 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<title>1st UoL DSM140 CW - Report</title>
<style>
/* Pandoc default stylesheet: base typography for the whole document. */
html {
line-height: 1.5;
font-family: Georgia, serif;
font-size: 20px;
color: #1a1a1a;
background-color: #fdfdfd;
}
/* Centered single-column reading layout. */
body {
margin: 0 auto;
max-width: 36em;
padding-left: 50px;
padding-right: 50px;
padding-top: 50px;
padding-bottom: 50px;
hyphens: auto;
overflow-wrap: break-word;
text-rendering: optimizeLegibility;
font-kerning: normal;
}
/* Small screens: tighten padding and scale type down. */
@media (max-width: 600px) {
body {
font-size: 0.9em;
padding: 1em;
}
h1 {
font-size: 1.8em;
}
}
/* Print: plain background, point-sized text, keep headings with their content. */
@media print {
body {
background-color: transparent;
color: black;
font-size: 12pt;
}
p, h2, h3 {
orphans: 3;
widows: 3;
}
h2, h3, h4 {
page-break-after: avoid;
}
}
p {
margin: 1em 0;
}
a {
color: #1a1a1a;
}
a:visited {
color: #1a1a1a;
}
img {
max-width: 100%;
}
h1, h2, h3, h4, h5, h6 {
margin-top: 1.4em;
}
h5, h6 {
font-size: 1em;
font-style: italic;
}
h6 {
font-weight: normal;
}
ol, ul {
padding-left: 1.7em;
margin-top: 1em;
}
li > ol, li > ul {
margin-top: 0;
}
blockquote {
margin: 1em 0 1em 1.7em;
padding-left: 1em;
border-left: 2px solid #e6e6e6;
color: #606060;
}
/* Inline and block code. */
code {
font-family: Menlo, Monaco, 'Lucida Console', Consolas, monospace;
font-size: 85%;
margin: 0;
}
pre {
margin: 1em 0;
overflow: auto;
}
pre code {
padding: 0;
overflow: visible;
overflow-wrap: normal;
}
.sourceCode {
background-color: transparent;
overflow: visible;
}
hr {
background-color: #1a1a1a;
border: none;
height: 1px;
margin: 1em 0;
}
/* Tables scroll horizontally rather than overflowing the column. */
table {
margin: 1em 0;
border-collapse: collapse;
width: 100%;
overflow-x: auto;
display: block;
font-variant-numeric: lining-nums tabular-nums;
}
table caption {
margin-bottom: 0.75em;
}
tbody {
margin-top: 0.5em;
border-top: 1px solid #1a1a1a;
border-bottom: 1px solid #1a1a1a;
}
th {
border-top: 1px solid #1a1a1a;
padding: 0.25em 0.5em 0.25em 0.5em;
}
td {
padding: 0.125em 0.5em 0.25em 0.5em;
}
header {
margin-bottom: 4em;
text-align: center;
}
/* Table of contents (pandoc --toc output). */
#TOC li {
list-style: none;
}
#TOC ul {
padding-left: 1.3em;
}
#TOC > ul {
padding-left: 0;
}
#TOC a:not(:hover) {
text-decoration: none;
}
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
div.columns{display: flex; gap: min(4vw, 1.5em);}
div.column{flex: auto; overflow-x: auto;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
/* Task-list checkboxes hang into the left margin. */
ul.task-list{list-style: none;}
ul.task-list li input[type="checkbox"] {
width: 0.8em;
margin: 0 0.8em 0.2em -1.6em;
vertical-align: middle;
}
/* Syntax-highlighted code block scaffolding (pandoc skylighting output). */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
@media screen {
div.sourceCode { overflow: auto; }
}
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
/* Line-numbered code blocks: numbers generated from a CSS counter and
   rendered via ::before on each line's anchor, unselectable so copied
   code excludes them. */
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
/* Highlighting token colors (pandoc "pygments"-style theme). */
code span.al { color: #ff0000; font-weight: bold; } /* Alert */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code span.at { color: #7d9029; } /* Attribute */
code span.bn { color: #40a070; } /* BaseN */
code span.bu { color: #008000; } /* BuiltIn */
code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code span.ch { color: #4070a0; } /* Char */
code span.cn { color: #880000; } /* Constant */
code span.co { color: #60a0b0; font-style: italic; } /* Comment */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code span.do { color: #ba2121; font-style: italic; } /* Documentation */
code span.dt { color: #902000; } /* DataType */
code span.dv { color: #40a070; } /* DecVal */
code span.er { color: #ff0000; font-weight: bold; } /* Error */
code span.ex { } /* Extension */
code span.fl { color: #40a070; } /* Float */
code span.fu { color: #06287e; } /* Function */
code span.im { color: #008000; font-weight: bold; } /* Import */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
code span.kw { color: #007020; font-weight: bold; } /* Keyword */
code span.op { color: #666666; } /* Operator */
code span.ot { color: #007020; } /* Other */
code span.pp { color: #bc7a00; } /* Preprocessor */
code span.sc { color: #4070a0; } /* SpecialChar */
code span.ss { color: #bb6688; } /* SpecialString */
code span.st { color: #4070a0; } /* String */
code span.va { color: #19177c; } /* Variable */
code span.vs { color: #4070a0; } /* VerbatimString */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
.display.math{display: block; text-align: center; margin: 0.5rem auto;}
</style>
<!--[if lt IE 9]>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
<![endif]-->
</head>
<body>
<div class="cell code" data-execution_count="1" id="iXZsRRTpiMXc">
<div class="sourceCode" id="cb1"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="co"># 210107005_UoL_DSM140_NLP_Text_Classification_CW_Sub_v240107wk.ipynb</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Commentable @ https://colab.research.google.com/drive/1kUTphSV9lHhbu_HT_tvffIPEtFWpFPIg?usp=sharing</span></span></code></pre></div>
</div>
<section id="1st-uol-dsm140-cw---report" class="cell markdown"
data-cell-id="wxi-a1JwISCC">
<h1>1st UoL DSM140 CW - Report</h1>
</section>
<section id="i-introduction" class="cell markdown" data-cell-id="aefiBqEAI6qF">
<h2>I. Introduction</h2>
</section>
<section id="1-introduction-to-the-domain-specific-area"
class="cell markdown" data-cell-id="Be4kcwTMKNPv">
<h3>1. Introduction to the domain-specific area</h3>
<p>The domain-specific area of interest is the application of AI and
machine learning techniques for Static Application Security Testing
(SAST) and vulnerability detection in critical infrastructure software.
This area is particularly relevant to the Artificial Intelligence Cyber
Challenge (AIxCC), a competition that encourages the development of AI
systems to secure critical code.</p>
<p>In our interconnected world, software underpins everything from
financial systems to public utilities. As this code enables modern life
and drives productivity, it also creates an expanding attack surface for
malicious actors. The AIxCC is a two-year competition that asks the best
and brightest in AI and cybersecurity to defend the software on which
almost everyone relies. The competition will award a cumulative $30 million
in prizes to teams with the best systems, including $7 million in prizes
to small businesses to empower entrepreneurial innovation.</p>
<p>The AIxCC is particularly focused on securing open-source software,
which comprises most of the code running on critical infrastructure
today, including the electricity and telecommunications sectors. The
competition is collaborating closely with the open-source community to
guide teams in creating AI systems capable of addressing vital
cybersecurity issues.</p>
<p>The challenge is to develop innovative systems guided by AI and
Machine Learning to semi-automatically find and fix software
vulnerabilities [2]. The AIxCC competition will foster innovative
research via a gamified environment that challenges the competitors to
design Cyber Reasoning Systems (CRSs) that integrate novel AI [4].</p>
<p>In the context of C6AI (Combined C++ Code Cybersecurity &
CWE-based Classification AI), the focus is on using text classification
methods to analyse and classify C++ code for potential vulnerabilities.
This involves converting the raw text of the code into numerical feature
vectors that can be processed by machine learning algorithms. Techniques
such as text stemming and n-gram tokenization are used in this
preprocessing stage.</p>
<p>In summary, the domain-specific area is the intersection of AI,
cybersecurity, and software vulnerability detection, with a particular
focus on static analysis of C++ code. The goal is to develop AI systems
that can effectively identify and address software vulnerabilities,
thereby enhancing the security of critical infrastructure.</p>
<p>Ref: [1] <a href="https://aicyberchallenge.com/about"
class="uri">https://aicyberchallenge.com/about</a> [2] <a
href="https://www.sbir.gov/node/2464965"
class="uri">https://www.sbir.gov/node/2464965</a> [3] <a
href="https://www.darpa.mil/news-events/2023-12-14"
class="uri">https://www.darpa.mil/news-events/2023-12-14</a> [4] <a
href="https://openssf.org/blog/2023/12/19/deconstructing-the-ai-cyber-challenge-aixcc/"
class="uri">https://openssf.org/blog/2023/12/19/deconstructing-the-ai-cyber-challenge-aixcc/</a></p>
</section>
<section id="2-description-of-the-selected-dataset"
class="cell markdown" data-cell-id="4MdAqy4aKHTu">
<h3>2. Description of the selected dataset</h3>
<p>The Juliet C/C++ 1.3.1 SARD dataset is a collection of test cases in
the C/C++ language, organized under 118 different Common Weakness
Enumerations (CWEs). This dataset is part of the Software Assurance
Reference Dataset (SARD) provided by the National Institute of Standards
and Technology (NIST) as ‘Juliet C/C++ 1.3 with extra support’ @ <a
href="https://samate.nist.gov/SARD/test-suites/116"
class="uri">https://samate.nist.gov/SARD/test-suites/116</a>.</p>
<p>The dataset is designed to test software for potential
vulnerabilities and weaknesses. Each test case in the dataset is
associated with a specific CWE, which represents a type of software
vulnerability. The dataset includes both 'good' and 'bad' examples, with
the 'bad' examples demonstrating the vulnerability and the 'good'
examples showing a correct or safe way to write the code.</p>
<p>The Juliet C/C++ 1.3.1 SARD dataset is publicly available and not
subject to copyright protection. It is made available under the CC0 1.0
Public Domain License.</p>
<p>The dataset is structured in a way that each CWE has its own
directory, and within each directory, there are multiple text files,
each representing a test case. The test cases are labelled with the
CWE-ID, which can be used to identify the type of vulnerability that the
test case is associated with.</p>
<p>The Juliet C/C++ 1.3 SARD with extra support, at 671 MB compressed,
contains over 64,099 test cases. Given that the SARD has over
170,000 programs and the Juliet C/C++ 1.3 dataset is a part of this
collection, it can be inferred that the dataset is quite large. The data
types in the dataset are primarily text, as the test cases are
represented as C/C++ code in text files.</p>
<p>The dataset is typically used in machine learning experiments, where
it is divided into training, validation, and test sets. The SARD dataset
has already been divided into training and test sets, but it lacks a
validation set. Therefore, it is common practice to create a validation
set using an 80:20 split of the training data.</p>
</section>
<section id="3-objectives-of-the-project" class="cell markdown"
data-cell-id="gN8m-55TKGNQ">
<h3>3. Objectives of the project</h3>
<p>The objectives of the project are to enhance the out-of-sample
generalization capabilities of the currently developed C6AI Cyber
Reasoning System (CRS) and to measure its 'Vulnerability Discovery
Accuracy'. This is in line with the AIxCC CRS Areas of Excellence, which
emphasize the importance of developing systems that can accurately
identify vulnerabilities in software, particularly in the context of
critical infrastructure.</p>
<p>The project aims to contribute to the AI Cyber Challenge (AIxCC) by
developing a CRS that can effectively and efficiently detect
vulnerabilities in C++ code. The focus on out-of-sample generalization
is crucial because it ensures that the system can perform well on new,
unseen data, which is a common scenario in real-world applications. The
ability to generalize well is indicative of a system's robustness and
its potential to adapt to evolving cybersecurity threats.</p>
<p>The impact of achieving these objectives is significant. By improving
the accuracy of vulnerability discovery, the project directly
contributes to the security of critical infrastructure software. This
has far-reaching implications for national security, economic stability,
and public safety, as critical infrastructure systems are essential to
the functioning of society.</p>
<p>Moreover, the project's contributions to the AIxCC challenge could
lead to advancements in the field of AI and cybersecurity. By
participating in the gamified environment of the competition, the
project fosters innovation and encourages the development of new
techniques and methodologies in AI-driven cybersecurity.</p>
<p>The potential contributions of the results to the AIxCC challenge
could include:</p>
<ol>
<li>Demonstrating the effectiveness of the C6AI CRS in accurately
identifying and classifying software vulnerabilities.</li>
<li>Providing insights into the strengths and weaknesses of the current
approaches to SAST and vulnerability detection.</li>
<li>Offering a benchmark for future research and development in the
domain of AI-powered cybersecurity solutions.</li>
<li>Encouraging the adoption of AI and machine learning techniques in
the cybersecurity industry, particularly for the protection of critical
infrastructure.</li>
</ol>
<p>In summary, the project's objectives are to develop a CRS that excels
in out-of-sample generalization and vulnerability discovery accuracy,
with the potential to make significant contributions to the AIxCC
challenge and the broader field of cybersecurity.</p>
</section>
<section id="4-evaluation-methodology" class="cell markdown"
data-cell-id="My7GCBqFJ_4e">
<h3>4. Evaluation methodology</h3>
<p>The evaluation methodology for the project will involve several key
metrics to assess the performance of the C6AI Cyber Reasoning System
(CRS) in identifying vulnerabilities in C++ code. These metrics will
provide a comprehensive understanding of the system's performance,
including its ability to correctly identify vulnerabilities (accuracy),
its ability to correctly identify true vulnerabilities (precision), its
ability to identify all actual vulnerabilities (recall), and a balanced
measure of precision and recall (F-measure).</p>
<ol>
<li><p><strong>Accuracy</strong>: This is the most intuitive performance
measure, and it simply is a ratio of correctly predicted observation to
the total observations. It is the ability of the model to correctly
identify both vulnerabilities and non-vulnerabilities. It is calculated
as (True Positives + True Negatives) / (True Positives + False Positives
+ True Negatives + False Negatives).</p></li>
<li><p><strong>Precision</strong>: Precision is the ratio of correctly
predicted positive observations to the total predicted positives. It is
also called Positive Predictive Value. It measures how many of the
identified vulnerabilities are actually vulnerabilities. It
is calculated as True Positives / (True Positives + False
Positives).</p></li>
<li><p><strong>Recall (Sensitivity)</strong>: Recall is the ratio of
correctly predicted positive observations to all observations in actual
class. It is also called Sensitivity, Hit Rate, or True Positive Rate.
It is a measure of the ability of the model to identify all possible
vulnerabilities. It is calculated as True Positives / (True Positives +
False Negatives).</p></li>
<li><p><strong>F-Measure (F1 Score)</strong>: F1 Score is the harmonic
mean of Precision and Recall. Therefore, this score takes both false
positives and false negatives into account. It is suitable for uneven
class distribution problems. It is calculated as 2 × (Recall ×
Precision) / (Recall + Precision).</p></li>
</ol>
<p>The evaluation will be conducted using a test set that the model has
not been trained on to ensure an unbiased assessment of the model's
performance. This is crucial to avoid overfitting, where the model
performs well on the training data but poorly on new, unseen data. The
test data will be representative of the real-world data the model will
encounter, ensuring the evaluation reflects the model's true predictive
performance.</p>
<p>In addition, the project will employ techniques such as
cross-validation to further ensure the robustness of the evaluation. In
n-Fold cross-validation, the data is divided into n non-overlapping
subsets. The model is trained on n-1 subsets and tested on the remaining
subset. This process is repeated n times, with each subset used once as
the test set. The error estimation is averaged over all n trials to get
the total accuracy of the model.</p>
<p>The evaluation methodology will provide a comprehensive understanding
of the model's performance, allowing for the identification of areas of
strength and potential improvement. This will ultimately contribute to
the development of a more accurate and robust Cyber Reasoning
System.</p>
</section>
<section id="ii-implementation" class="cell markdown" data-colab-id="rMXfDgSEI9_M">
<h2>II. Implementation</h2>
</section>
<section id="5-pre-processing" class="cell markdown" data-colab-id="OjHHMuqsJ6bG">
<h3>5. Pre-processing</h3>
<p>The pre-processing steps for the text classification task in the
provided Python file include several steps to convert the raw text data
into a format that can be used by machine learning algorithms.</p>
<ol>
<li><p><strong>Text Lowercasing</strong>: All the text is converted to
lower case. This is done to ensure that the algorithm does not treat the
same words in different cases as different words.</p></li>
<li><p><strong>Punctuation Removal</strong>: All punctuation marks are
removed from the text. Punctuation does not add any extra information
while training the machine learning model. Moreover, removing
punctuation reduces the size of the vocabulary and thus increases the
speed of training.</p></li>
<li><p><strong>Stop Words Removal</strong>: Stop words are the most
common words in a language like 'the', 'a', 'on', 'is', 'all'. These
words do not carry important meaning and are usually removed from texts.
The Python file uses a list of English stop words from the NLTK
library.</p></li>
<li><p><strong>Stemming</strong>: Stemming is the process of reducing
inflected (or sometimes derived) words to their word stem, base or root
form. The Python file uses the Snowball Stemmer from the NLTK
library.</p></li>
<li><p><strong>N-gram Tokenization</strong>: The text is tokenized into
n-grams. N-grams are contiguous sequences of n items from a given sample
of text or speech. This helps to capture the context and semantic
meanings of phrases.</p></li>
<li><p><strong>Vectorization</strong>: The tokenized text is then
converted into numerical vectors which can be used as input to the
machine learning algorithm. The Python file uses the bag-of-words model
to convert the text into vectors. The bag-of-words model represents each
text as a vector in a high-dimensional space, where each unique word in
the text is represented by one dimension, and the value in that
dimension represents the frequency of the word in the text.</p></li>
</ol>
<p>The Python file reads .cpp files as text into a pandas dataframe. The
vocabulary is built from the unique words in the text after applying the
pre-processing steps.</p>
</section>
<section id="6-baseline-performance" class="cell markdown"
data-colab-id="gadWv8GLJ2nv">
<h3>6. Baseline performance</h3>
<p>The Naive Bayes classifier was chosen as the baseline for the C6AI
Cyber Reasoning System (CRS) project due to its simplicity, efficiency,
and proven effectiveness in text classification tasks. This classifier
was implemented using the SciKit Learn library, as shown in the attached
Python file.</p>
<p>The Naive Bayes classifier was selected as the baseline because it is
a well-established algorithm in the field of text classification and has
been used extensively in previous research, including by our lecturer
for Statistical Data Mining <a
href="https://github.com/nsadawi/Advanced-ML-Projects/tree/4e112da6c42670052eca1152bd0a786afc30c1c5">Dr.
Noureddin Sadawi</a>. It is a probabilistic classifier that makes use of
Bayes' theorem with strong independence assumptions between the
features. It is particularly suited for high-dimensional datasets, like
text data, and is known for its efficiency and scalability.</p>
<p>The 0.74 (+/- 0.03) MultinomialNB Accuracy performance of the Naive
Bayes classifier provides a meaningful benchmark for comparison with the
more complex Convolutional Neural Network (CNN) model. The CNN model,
implemented using the Keras library, is expected to outperform the Naive
Bayes classifier due to its ability to capture local dependencies in the
data and its capacity for hierarchical feature learning. However, the
Naive Bayes classifier provides a valuable point of reference to
evaluate the degree of improvement achieved with the CNN model.</p>
<p>In conclusion, the Naive Bayes classifier was chosen as the baseline
due to its simplicity, efficiency, and proven effectiveness in text
classification tasks. Its performance provides a meaningful benchmark
for comparison with the more complex CNN model.</p>
</section>
<div class="cell code" id="khkD1BDtk3Cp">
<div class="sourceCode" id="cb2"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="co"># precision recall f1-score support</span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE121_Stack_Based_Buffer_Overflow 0.91 0.71 0.80 324</span></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE122_Heap_Based_Buffer_Overflow 0.87 0.66 0.75 316</span></span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE124_Buffer_Underwrite 0.63 0.85 0.72 331</span></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE126_Buffer_Overread 0.86 0.92 0.89 335</span></span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE127_Buffer_Underread 0.92 0.77 0.83 333</span></span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE134_Uncontrolled_Format_String 0.98 0.90 0.94 350</span></span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE190_Integer_Overflow 0.94 0.79 0.86 298</span></span>
<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE191_Integer_Underflow 0.95 0.77 0.85 294</span></span>
<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE194_Unexpected_Sign_Extension 1.00 0.59 0.74 165</span></span>
<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE195_Signed_to_Unsigned_Conversion_Error 0.99 0.55 0.71 158</span></span>
<span id="cb2-13"><a href="#cb2-13" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE197_Numeric_Truncation_Error 0.71 0.90 0.79 350</span></span>
<span id="cb2-14"><a href="#cb2-14" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE23_Relative_Path_Traversal 0.91 0.98 0.95 350</span></span>
<span id="cb2-15"><a href="#cb2-15" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE369_Divide_by_Zero 0.83 0.91 0.87 350</span></span>
<span id="cb2-16"><a href="#cb2-16" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE36_Absolute_Path_Traversal 0.82 0.91 0.86 350</span></span>
<span id="cb2-17"><a href="#cb2-17" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE400_Resource_Exhaustion 1.00 0.79 0.89 156</span></span>
<span id="cb2-18"><a href="#cb2-18" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE401_Memory_Leak 0.81 0.82 0.81 333</span></span>
<span id="cb2-19"><a href="#cb2-19" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE415_Double_Free 0.76 0.76 0.76 350</span></span>
<span id="cb2-20"><a href="#cb2-20" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE457_Use_of_Uninitialized_Variable 0.94 0.97 0.96 297</span></span>
<span id="cb2-21"><a href="#cb2-21" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE563_Unused_Variable 0.81 1.00 0.89 350</span></span>
<span id="cb2-22"><a href="#cb2-22" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE590_Free_Memory_Not_on_Heap 0.88 0.88 0.88 348</span></span>
<span id="cb2-23"><a href="#cb2-23" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE680_Integer_Overflow_to_Buffer_Overflow 0.98 0.85 0.91 301</span></span>
<span id="cb2-24"><a href="#cb2-24" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE690_NULL_Deref_From_Return 1.00 0.31 0.47 167</span></span>
<span id="cb2-25"><a href="#cb2-25" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE762_Mismatched_Memory_Management_Routines 0.79 0.85 0.82 349</span></span>
<span id="cb2-26"><a href="#cb2-26" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE789_Uncontrolled_Mem_Alloc 0.57 0.97 0.72 323</span></span>
<span id="cb2-27"><a href="#cb2-27" aria-hidden="true" tabindex="-1"></a><span class="co"># CWE78_OS_Command_Injection 1.00 0.95 0.97 350</span></span>
<span id="cb2-28"><a href="#cb2-28" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb2-29"><a href="#cb2-29" aria-hidden="true" tabindex="-1"></a><span class="co"># accuracy 0.84 7628</span></span>
<span id="cb2-30"><a href="#cb2-30" aria-hidden="true" tabindex="-1"></a><span class="co"># macro avg 0.87 0.81 0.83 7628</span></span>
<span id="cb2-31"><a href="#cb2-31" aria-hidden="true" tabindex="-1"></a><span class="co"># weighted avg 0.86 0.84 0.84 7628</span></span></code></pre></div>
</div>
<section id="7-classification-approach" class="cell markdown"
data-colab-id="LkAvcU-_J0Gt">
<h3>7. Classification approach</h3>
<p>The C6AI Cyber Reasoning System (CRS) project used a Naive Bayes
classifier for text classification, specifically for identifying
vulnerabilities in software code. The features used in the classifier
were derived from the 'Test-Case-Code' and the labels were the
'CWE-ID'.</p>
<p>The 'Test-Case-Code' features were chosen because they represent the
actual code snippets that could potentially contain vulnerabilities.
These features were transformed into a bag-of-words representation and
then weighted using TF-IDF (Term Frequency-Inverse Document Frequency).
This transformation was crucial in converting the raw text into a
numerical format that the classifier could process.</p>
<p>The 'CWE-ID' [a standard identifier for software vulnerabilities,
allowing the results to be easily interpreted] was used as the target
label because it represents the specific type of vulnerability present
in the code. The classifier was trained to predict this label based on
the features derived from the 'Test-Case-Code'.</p>
<p>The Naive Bayes classifier was chosen for its simplicity and
efficiency in text classification tasks. It was implemented using the
SciKit Learn library for the baseline model. For the final model, a
Convolutional Neural Network (CNN) was built using the Keras library.
CNNs are known for their effectiveness in text classification tasks, as
they can capture local dependencies in the text and can manage
variable-length inputs.</p>
<p>The Python script used for training and evaluating the classifier was
designed to be easily understood and modified, enhancing the project's
reproducibility. This means that the approach can be replicated by
others using different programming languages, development environments,
libraries, and algorithms.</p>
</section>
<section id="8-coding-style" class="cell markdown" data-colab-id="WVyw8ZgoJw5H">
<h3>8. Coding style</h3>
<p>The Python code provided adheres to several key coding
conventions, which are crucial for maintaining high-quality, readable,
and maintainable code.</p>
<ol>
<li><p><strong>Indentation</strong>: The code uses consistent
indentation, which is a fundamental aspect of Python syntax and crucial
for code readability.</p></li>
<li><p><strong>Variable Naming</strong>: The code uses meaningful names
for variables, which makes the code more understandable and
maintainable. For example, <code>porter_stemmer</code>,
<code>stop_words</code>, and <code>global_start</code> are all
descriptive variable names that give a clear indication of their purpose
in the code.</p></li>
<li><p><strong>Use of Libraries</strong>: The code makes extensive use
of libraries, including <code>nltk</code>, <code>tensorflow</code>,
<code>keras</code>, <code>numpy</code>, <code>pandas</code>, and
<code>sklearn</code>, among others. This is a good practice as it
leverages existing, well-tested functionality and can make the code more
concise and efficient.</p></li>
<li><p><strong>Comments</strong>: The code includes numerous comments,
which are essential for explaining the purpose of code blocks, the
functionality of functions, and the meaning of variables. This is a good
practice as it makes the code more understandable for others (and for
the original coder at a later date).</p></li>
<li><p><strong>Avoiding Magic Numbers</strong>: The code defines several
constants at the beginning (like <code>epochs</code>,
<code>batch_size</code>, <code>seed</code>, etc.), which is a good
practice as it avoids the use of unnamed numerical constants ("magic
numbers") in the code. This makes the code more readable and easier to
modify.</p></li>
<li><p><strong>Code Organization</strong>: The code is well-organized,
with clear sections for importing libraries, setting up variables,
defining functions, and executing code. This organization makes the code
easier to follow and understand.</p></li>
</ol>
<p>In summary, the code in the provided Python file appears to follow
good coding practices, including consistent indentation, meaningful
variable names, extensive use of libraries, comprehensive comments,
avoidance of magic numbers, and clear organization. These practices
contribute to making the code high-quality, readable, and
maintainable.</p>
</section>
<section id="iii-outcome-conclusions" class="cell markdown"
data-colab-id="FilYFIQzJDLi">
<h2>III. Outcome Conclusions</h2>
</section>
<section id="9-evaluation" class="cell markdown" data-colab-id="52xkRUxaJrxf">
<h3>9. Evaluation</h3>
<p>The evaluation of the C6AI Cyber Reasoning System (CRS) classifier
was performed using the Python scripts provided in this notebook. Those
scripts finally use a baseline-beating CNN model after initially using
multiple common-sense models and statistical data mining algorithms
[starting with the Naive Bayes classifier algorithm] to train various
baseline models and then make predictions on the entire dataset.</p>
<p>The script uses the following metrics for evaluation:</p>
<ol>
<li><p><strong>Accuracy</strong>: This metric measures the ratio of
correctly predicted observations to the total observations. It is the
ability of the model to correctly identify both vulnerabilities and
non-vulnerabilities.</p></li>
<li><p><strong>Precision</strong>: This metric measures the ratio of
correctly predicted positive observations to the total predicted
positives. It is a measure of, amongst all the identified
vulnerabilities, how many are actually vulnerabilities.</p></li>
<li><p><strong>Recall (Sensitivity)</strong>: This metric measures the
ratio of correctly predicted positive observations to all observations
in the actual class. It is a measure of the ability of the model to
identify all possible vulnerabilities.</p></li>
<li><p><strong>F-Measure (F1 Score)</strong>: This metric is the
harmonic mean of Precision and Recall. Therefore, this score takes
both false positives and false negatives into account.</p></li>
</ol>
<p>The script uses the SciKit Learn's built-in classification report to
return these metrics.</p>
<p>The results of the evaluation provide a quantitative measure of the
performance of the CRS classifier. By comparing these results with a
suitable baseline, we can assess the improvement achieved by our
approach. The specific values of these metrics depended on the actual
data used for training and testing the classifier [as shown and
validated in the rest of the notebook].</p>
</section>
<div class="cell code" data-execution_count="2" id="M_5kw2qMQ3-x">
<div class="sourceCode" id="cb3"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Evaluated the model on the test set</span></span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="co">## 87/87 [==============================] - 1s 11ms/step - loss: 0.2339 - accuracy: 0.9259</span></span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="co">## Test loss: 0.23388110101222992</span></span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="co">## Test accuracy: 0.9259259104728699</span></span></code></pre></div>
</div>
<section id="10-summary-of-the-project-and-its-results"
class="cell markdown" data-colab-id="zNGw85fqJLSh">
<h3>10. Summary of the project and its results</h3>
<p>Baseline-beating CNN Model Test Accuracy was 0.925 (in contrast to
the 0.74 (+/- 0.03) MultinomialNB Accuracy or the Accuracy: 0.87 (+/-
0.02) RandomForestClassifier Accuracy); nevertheless, the C6AI Cyber
Reasoning System (CRS) project once further developed [using creative
advances out of scope for this elementary NLP assignment] could make
significant contributions to the field of text classification,
particularly in the context of identifying vulnerabilities in software
code. While the project ultimately employed a CNN model [amongst
others], its initial choice was a Naive Bayes classifier, as a popular
choice for text classification tasks due to its simplicity and
efficiency. The classifier was trained and evaluated using Python-based
SciKit Learn's scripts, which were designed to be easily understood and
modified, enhancing the project's reproducibility.</p>
<p>The project's preprocessing steps, including the transformation of
text into a bag-of-words representation and the use of TF-IDF weighting,
were crucial in preparing the data for the classifier. These steps
converted the raw text into a format that the classifier could process,
and they could be readily adapted for other text classification tasks in
different domains.</p>
<p>The CRS classifier demonstrated robust performance across several
evaluation metrics, including accuracy, precision, recall, and F1 score.
These metrics provide a comprehensive assessment of the classifier's
performance, considering both its ability to correctly identify
vulnerabilities and its ability to avoid false positives and false
negatives.</p>
<p>The project's approach is highly transferable to other
domain-specific areas that involve text classification. The
preprocessing steps and the Naive Bayes classifier can be applied to any
text data, provided that the data is labelled for supervised learning.
Furthermore, the Python script can be easily modified to accommodate
different data sources, classification algorithms, or evaluation
metrics.</p>
<p>The project's approach can also be replicated using different
programming languages, development environments, libraries, and
algorithms. The key steps of the approach, including text preprocessing,
classifier training, and performance evaluation, are common tasks in
machine learning and natural language processing, and they can be
implemented in many programming languages that support these tasks, such
as R, Java, or C++. Similarly, different development environments or
libraries [such as the new KerasNLP or Jax/PyTorch-backed Keras 3.0 API]
can be used to provide the necessary functionalities for these
tasks.</p>
<p>While the Naive Bayes classifier baseline was effective, the more
complex CNN model achieves higher performance on related tasks or
datasets. However, such a model also has drawbacks, such as increased
computational cost and the risk of overfitting. Therefore, the choice of
classifier should be guided by the specific requirements and constraints
of each task.</p>
</section>
<section id="1st-uol-dsm140-cw---code" class="cell markdown"
data-colab-id="V45GkVUFIKdt">
<h1>1st UoL DSM140 CW - Code</h1>
</section>
<div class="cell code" data-execution_count="3"
data-colab="{"base_uri":"https://localhost:8080/"}"
id="kYsAWNY1LLdp" data-outputId="64c05abc-167d-4390-b9e0-a88c7423d58e">
<div class="sourceCode" id="cb4"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>CONTENT_PATH<span class="op">=</span><span class="st">'/content/'</span></span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="co"># !cd $CONTENT_PATH && ls</span></span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>ls</span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>rm <span class="op">-</span>r sample_data</span></code></pre></div>
<div class="output stream stdout">
<pre><code>sample_data
</code></pre>
</div>
</div>
<section id="env-prepping" class="cell markdown" data-colab-id="LYk3tygxLLdw">
<h1>Env Prepping</h1>
</section>
<section id="import-libraries" class="cell markdown" data-colab-id="-5yr1LwPLLdx">
<h3>Import Libraries</h3>
</section>
<div class="cell code" data-execution_count="4" id="P3_Ty5JWDc1I">
<div class="sourceCode" id="cb6"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co"># import codecs,collections,csv,glob,io,itertools,json,logging,nltk,pathlib,\</span></span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="co"># pickle,pprint,pytest,re,requests,shutil,string,sys,unicodedata,warnings,zipfile</span></span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> codecs,collections,csv,glob,io,itertools,json,logging,nltk,os,pathlib,<span class="op">\</span></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a>pickle,pprint,pytest,re,requests,shutil,string,sys,time,unicodedata,warnings,zipfile</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="5" id="9NqDP5goLLdz">
<div class="sourceCode" id="cb7"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="co"># import time</span></span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="co">## from time import time</span></span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a>global_start <span class="op">=</span> time.time()</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="6" id="C35XKxHv0Bin">
<div class="sourceCode" id="cb8"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co">## TensorFlow backend only supports string inputs</span></span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="co"># import os</span></span>
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>os.environ[<span class="st">"KERAS_BACKEND"</span>] <span class="op">=</span> <span class="st">"tensorflow"</span></span>
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> keras</span>
<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras <span class="im">import</span> layers</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="7"
data-colab="{"base_uri":"https://localhost:8080/"}"
id="14EQOfRF1cdA" data-outputId="89ef11a3-ee05-41dc-8919-25c3ec13dee9">
<div class="sourceCode" id="cb9"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="op">!</span>pip install <span class="op">-</span>q <span class="st">"tensorflow-text"</span> <span class="co"># ==2.13.*"</span></span></code></pre></div>
<div class="output stream stdout">
<pre><code>━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.2/5.2 MB 9.8 MB/s eta 0:00:00
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="8" id="rAP1eUb50jQC">
<div class="sourceCode" id="cb11"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="co"># !pip install -q "tensorflow-text" # ==2.13.*"</span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow_text <span class="im">as</span> tf_text</span>
<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a><span class="co"># import keras</span></span>
<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow <span class="im">as</span> tf</span>
<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow.data <span class="im">as</span> tf_data</span>
<span id="cb11-7"><a href="#cb11-7" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> tensorflow_datasets <span class="im">as</span> tfds</span>
<span id="cb11-8"><a href="#cb11-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb11-9"><a href="#cb11-9" aria-hidden="true" tabindex="-1"></a><span class="co"># from keras import layers</span></span>
<span id="cb11-10"><a href="#cb11-10" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras <span class="im">import</span> Model</span>
<span id="cb11-11"><a href="#cb11-11" aria-hidden="true" tabindex="-1"></a><span class="co"># from tensorflow.keras import layers</span></span>
<span id="cb11-12"><a href="#cb11-12" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras <span class="im">import</span> losses</span>
<span id="cb11-13"><a href="#cb11-13" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras <span class="im">import</span> utils</span>
<span id="cb11-14"><a href="#cb11-14" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Conv1D</span>
<span id="cb11-15"><a href="#cb11-15" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Dense</span>
<span id="cb11-16"><a href="#cb11-16" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Dropout</span>
<span id="cb11-17"><a href="#cb11-17" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Embedding</span>
<span id="cb11-18"><a href="#cb11-18" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Flatten</span>
<span id="cb11-19"><a href="#cb11-19" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> Input</span>
<span id="cb11-20"><a href="#cb11-20" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> MaxPooling1D</span>
<span id="cb11-21"><a href="#cb11-21" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> TextVectorization</span>
<span id="cb11-22"><a href="#cb11-22" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.layers <span class="im">import</span> concatenate</span>
<span id="cb11-23"><a href="#cb11-23" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tensorflow.keras.utils <span class="im">import</span> plot_model</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="9" id="RobpRSWTvsz3">
<div class="sourceCode" id="cb12"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="op">%</span>matplotlib inline</span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns<span class="op">;</span> sns.<span class="bu">set</span>()</span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> sklearn.feature_extraction.text</span>
<span id="cb12-8"><a href="#cb12-8" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> sklearn.metrics</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="10" id="l3jUDW5oLLd4">
<!-- Rendered notebook code cell (execution count 10): imports matplotlib.pyplot, pandas
     DataFrame, numpy test helpers, google.colab file I/O, tempfile, tqdm, typing, and
     stdlib modules (os, os.path, operator, optparse, collections) plus scipy utilities.
     NOTE(review): this page is generated output; the following belong in the source
     notebook, not here: "from pandas.core.frame import DataFrame" uses a private path
     (prefer "from pandas import DataFrame"), and optparse has been deprecated since
     Python 3.2 (prefer argparse). -->
<div class="sourceCode" id="cb13"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> matplotlib <span class="im">import</span> pyplot <span class="im">as</span> plt</span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> pandas.core.frame <span class="im">import</span> DataFrame</span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> numpy.testing <span class="im">import</span> assert_array_equal</span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> google.colab <span class="im">import</span> files</span>
<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tempfile <span class="im">import</span> NamedTemporaryFile</span>
<span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> tqdm.notebook <span class="im">import</span> tqdm</span>
<span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> typing <span class="im">import</span> KeysView</span>
<span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> os <span class="im">import</span> listdir</span>
<span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> os.path <span class="im">import</span> isfile, join</span>
<span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> operator <span class="im">import</span> itemgetter</span>
<span id="cb13-12"><a href="#cb13-12" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> optparse <span class="im">import</span> OptionParser</span>
<span id="cb13-13"><a href="#cb13-13" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> Counter</span>
<span id="cb13-14"><a href="#cb13-14" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> defaultdict</span>
<span id="cb13-15"><a href="#cb13-15" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> namedtuple</span>
<span id="cb13-16"><a href="#cb13-16" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> collections <span class="im">import</span> OrderedDict</span>
<span id="cb13-17"><a href="#cb13-17" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.cluster.hierarchy <span class="im">import</span> dendrogram</span>
<span id="cb13-18"><a href="#cb13-18" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.sparse <span class="im">import</span> csr_matrix</span>
<span id="cb13-19"><a href="#cb13-19" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.special <span class="im">import</span> logit</span>
<span id="cb13-20"><a href="#cb13-20" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> scipy.stats.distributions <span class="im">import</span> uniform</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="11" id="9qV1WA-ovopQ">
<!-- Rendered notebook code cell (execution count 11): third-party ML imports, grouped as
     gensim (word2vec, phrase models), keras (layers, models, text preprocessing),
     nltk (tokenizers, stemmers, stopwords), and a long alphabetized scikit-learn list
     (vectorizers, classifiers, metrics, model selection, preprocessing, utils).
     NOTE(review): the listing contains duplicate imports, SnowballStemmer at
     cb14-17/cb14-18 (two different paths to the same class) and MultinomialNB at
     cb14-65/cb14-66; this page is generated output, so deduplicate in the source
     notebook rather than editing the rendered spans here. -->
<div class="sourceCode" id="cb14"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models <span class="im">import</span> word2vec</span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models <span class="im">import</span> Word2Vec</span>
<span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models.phrases <span class="im">import</span> Phraser</span>
<span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> gensim.models.phrases <span class="im">import</span> Phrases</span>
<span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras <span class="im">import</span> initializers, regularizers, constraints, optimizers, layers</span>
<span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.layers <span class="im">import</span> Bidirectional, GlobalMaxPool1D</span>
<span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.layers <span class="im">import</span> Dense, Input, LSTM, Embedding, Dropout, Activation</span>
<span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.models <span class="im">import</span> Model</span>
<span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.preprocessing.sequence <span class="im">import</span> pad_sequences</span>
<span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> keras.preprocessing.text <span class="im">import</span> Tokenizer</span>
<span id="cb14-12"><a href="#cb14-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-13"><a href="#cb14-13" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk <span class="im">import</span> ngrams</span>
<span id="cb14-14"><a href="#cb14-14" aria-hidden="true" tabindex="-1"></a><span class="co"># from nltk.util import ngrams</span></span>
<span id="cb14-15"><a href="#cb14-15" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.corpus <span class="im">import</span> stopwords</span>
<span id="cb14-16"><a href="#cb14-16" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem <span class="im">import</span> PorterStemmer</span>
<span id="cb14-17"><a href="#cb14-17" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem <span class="im">import</span> SnowballStemmer</span>
<span id="cb14-18"><a href="#cb14-18" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem.snowball <span class="im">import</span> SnowballStemmer</span>
<span id="cb14-19"><a href="#cb14-19" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.stem.wordnet <span class="im">import</span> WordNetLemmatizer</span>
<span id="cb14-20"><a href="#cb14-20" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.tokenize <span class="im">import</span> sent_tokenize</span>
<span id="cb14-21"><a href="#cb14-21" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> nltk.tokenize <span class="im">import</span> word_tokenize</span>
<span id="cb14-22"><a href="#cb14-22" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb14-23"><a href="#cb14-23" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn <span class="im">import</span> datasets</span>
<span id="cb14-24"><a href="#cb14-24" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn <span class="im">import</span> metrics</span>
<span id="cb14-25"><a href="#cb14-25" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn <span class="im">import</span> preprocessing</span>
<span id="cb14-26"><a href="#cb14-26" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.base <span class="im">import</span> BaseEstimator</span>
<span id="cb14-27"><a href="#cb14-27" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.base <span class="im">import</span> RegressorMixin</span>
<span id="cb14-28"><a href="#cb14-28" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.cluster <span class="im">import</span> AgglomerativeClustering</span>
<span id="cb14-29"><a href="#cb14-29" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.datasets <span class="im">import</span> dump_svmlight_file</span>
<span id="cb14-30"><a href="#cb14-30" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.datasets <span class="im">import</span> fetch_20newsgroups</span>
<span id="cb14-31"><a href="#cb14-31" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.datasets <span class="im">import</span> load_files</span>
<span id="cb14-32"><a href="#cb14-32" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.decomposition <span class="im">import</span> PCA</span>
<span id="cb14-33"><a href="#cb14-33" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.ensemble <span class="im">import</span> RandomForestClassifier</span>
<span id="cb14-34"><a href="#cb14-34" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.exceptions <span class="im">import</span> NotFittedError</span>
<span id="cb14-35"><a href="#cb14-35" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction <span class="im">import</span> DictVectorizer</span>
<span id="cb14-36"><a href="#cb14-36" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction <span class="im">import</span> FeatureHasher</span>
<span id="cb14-37"><a href="#cb14-37" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> CountVectorizer</span>
<span id="cb14-38"><a href="#cb14-38" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> HashingVectorizer</span>
<span id="cb14-39"><a href="#cb14-39" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> TfidfTransformer</span>
<span id="cb14-40"><a href="#cb14-40" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_extraction.text <span class="im">import</span> TfidfVectorizer</span>
<span id="cb14-41"><a href="#cb14-41" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_selection <span class="im">import</span> chi2</span>
<span id="cb14-42"><a href="#cb14-42" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_selection <span class="im">import</span> SelectFromModel</span>
<span id="cb14-43"><a href="#cb14-43" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.feature_selection <span class="im">import</span> SelectKBest</span>
<span id="cb14-44"><a href="#cb14-44" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LogisticRegression</span>
<span id="cb14-45"><a href="#cb14-45" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> PassiveAggressiveClassifier</span>
<span id="cb14-46"><a href="#cb14-46" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> SGDClassifier</span>
<span id="cb14-47"><a href="#cb14-47" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> accuracy_score</span>
<span id="cb14-48"><a href="#cb14-48" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> balanced_accuracy_score</span>
<span id="cb14-49"><a href="#cb14-49" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> classification_report</span>
<span id="cb14-50"><a href="#cb14-50" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> confusion_matrix</span>
<span id="cb14-51"><a href="#cb14-51" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> f1_score</span>
<span id="cb14-52"><a href="#cb14-52" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> log_loss</span>
<span id="cb14-53"><a href="#cb14-53" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> precision_recall_fscore_support <span class="im">as</span> score</span>
<span id="cb14-54"><a href="#cb14-54" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> roc_auc_score</span>
<span id="cb14-55"><a href="#cb14-55" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> roc_curve</span>
<span id="cb14-56"><a href="#cb14-56" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics.pairwise <span class="im">import</span> cosine_similarity</span>
<span id="cb14-57"><a href="#cb14-57" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> cross_val_score</span>
<span id="cb14-58"><a href="#cb14-58" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> GridSearchCV</span>
<span id="cb14-59"><a href="#cb14-59" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> KFold</span>
<span id="cb14-60"><a href="#cb14-60" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> RandomizedSearchCV</span>
<span id="cb14-61"><a href="#cb14-61" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.model_selection <span class="im">import</span> train_test_split</span>
<span id="cb14-62"><a href="#cb14-62" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.multiclass <span class="im">import</span> OneVsRestClassifier</span>
<span id="cb14-63"><a href="#cb14-63" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> ComplementNB</span>
<span id="cb14-64"><a href="#cb14-64" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> GaussianNB</span>
<span id="cb14-65"><a href="#cb14-65" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> MultinomialNB</span>
<span id="cb14-66"><a href="#cb14-66" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.naive_bayes <span class="im">import</span> MultinomialNB</span>
<span id="cb14-67"><a href="#cb14-67" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.neighbors <span class="im">import</span> KNeighborsClassifier</span>
<span id="cb14-68"><a href="#cb14-68" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.neighbors <span class="im">import</span> NearestNeighbors</span>
<span id="cb14-69"><a href="#cb14-69" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.pipeline <span class="im">import</span> Pipeline</span>
<span id="cb14-70"><a href="#cb14-70" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> FunctionTransformer</span>
<span id="cb14-71"><a href="#cb14-71" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> LabelEncoder</span>
<span id="cb14-72"><a href="#cb14-72" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> MultiLabelBinarizer</span>
<span id="cb14-73"><a href="#cb14-73" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> OneHotEncoder</span>
<span id="cb14-74"><a href="#cb14-74" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.svm <span class="im">import</span> SVC</span>
<span id="cb14-75"><a href="#cb14-75" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.tree <span class="im">import</span> DecisionTreeClassifier</span>
<span id="cb14-76"><a href="#cb14-76" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils <span class="im">import</span> check_array</span>
<span id="cb14-77"><a href="#cb14-77" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils <span class="im">import</span> check_X_y</span>
<span id="cb14-78"><a href="#cb14-78" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils <span class="im">import</span> shuffle</span>
<span id="cb14-79"><a href="#cb14-79" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils.extmath <span class="im">import</span> density</span>
<span id="cb14-80"><a href="#cb14-80" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils.extmath <span class="im">import</span> log_logistic</span>
<span id="cb14-81"><a href="#cb14-81" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.utils.multiclass <span class="im">import</span> unique_labels</span></code></pre></div>
</div>
<div class="cell code" data-execution_count="12"
data-colab="{"base_uri":"https://localhost:8080/"}"
id="xHDoxn3NLLd6" data-outputId="34d3b355-763b-48bb-810e-b27f7afa2ce4">
<!-- Rendered notebook code cell (execution count 12): enables inline matplotlib,
     imports print_function from __future__, and downloads the NLTK punkt, stopwords
     and wordnet data packages. The "output stream stderr" and "output execute_result"
     divs below are the captured notebook output (download log and the returned True).
     NOTE(review): the nltk module itself is not imported in this cell; presumably an
     earlier cell (outside this chunk) imports it. Confirm in the source notebook. -->
<div class="sourceCode" id="cb15"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="co">## Importing basic python libraries</span></span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="op">%</span>matplotlib inline</span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> __future__ <span class="im">import</span> print_function</span>
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a>nltk.download(<span class="st">'punkt'</span>)</span>
<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a>nltk.download(<span class="st">'stopwords'</span>)</span>
<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a>nltk.download(<span class="st">'wordnet'</span>)</span></code></pre></div>
<div class="output stream stderr">
<pre><code>[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
</code></pre>
</div>
<div class="output execute_result" data-execution_count="12">
<pre><code>True</code></pre>
</div>
</div>
<section id="setting-static-and-global-variables" class="cell markdown"
id="WR_ES7ESLLd-">
<!-- Rendered markdown cell: "Setting static and global variables" section heading.
     NOTE(review): this section element carries two id attributes (the slug and the
     Colab cell id), which is invalid HTML; browsers keep only the first. This is a
     generator artifact; fix in the export tooling, not by hand-editing here. -->
<h3>Setting static and global variables</h3>
</section>
<div class="cell code" data-execution_count="14" id="qcy0E_ODLLd-">
<!-- Rendered notebook code cell (execution count 14): defines module-level
     hyperparameters: epoch/batch/seed/vocab settings, global label and sequence
     sizes, masked-LM preprocessing params, transformer model dimensions, and
     pretraining/finetuning learning rates and epoch counts. NUM_CLASSES and EPOCHS
     alias new_num_labels and epochs defined a few lines above in the same cell. -->
<div class="sourceCode" id="cb18"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a>epochs<span class="op">=</span><span class="dv">15</span> <span class="co">#30 #10 #2 #10</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a>new_num_labels<span class="op">=</span><span class="dv">25</span> <span class="co">#4</span></span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a>batch_size <span class="op">=</span> <span class="dv">32</span></span>
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a>seed <span class="op">=</span> <span class="dv">0</span> <span class="co"># 42</span></span>
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a>VOCAB_SIZE <span class="op">=</span> <span class="dv">10000</span></span>
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a>MAX_SEQUENCE_LENGTH <span class="op">=</span> <span class="dv">250</span></span>
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="co"># Global values.</span></span>
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a>WORDS_SIZE<span class="op">=</span><span class="dv">10000</span></span>
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a>INPUT_SIZE<span class="op">=</span><span class="dv">500</span></span>
<span id="cb18-11"><a href="#cb18-11" aria-hidden="true" tabindex="-1"></a>NUM_CLASSES<span class="op">=</span>new_num_labels <span class="co">#5 # 2 # NUM_CLASSES=2</span></span>
<span id="cb18-12"><a href="#cb18-12" aria-hidden="true" tabindex="-1"></a>MODEL_NUM<span class="op">=</span><span class="dv">0</span></span>
<span id="cb18-13"><a href="#cb18-13" aria-hidden="true" tabindex="-1"></a>EPOCHS<span class="op">=</span>epochs <span class="co">#15 #10</span></span>
<span id="cb18-14"><a href="#cb18-14" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-15"><a href="#cb18-15" aria-hidden="true" tabindex="-1"></a><span class="co"># Preprocessing params.</span></span>
<span id="cb18-16"><a href="#cb18-16" aria-hidden="true" tabindex="-1"></a>PRETRAINING_BATCH_SIZE <span class="op">=</span> <span class="dv">128</span></span>
<span id="cb18-17"><a href="#cb18-17" aria-hidden="true" tabindex="-1"></a>FINETUNING_BATCH_SIZE <span class="op">=</span> <span class="dv">32</span></span>
<span id="cb18-18"><a href="#cb18-18" aria-hidden="true" tabindex="-1"></a>SEQ_LENGTH <span class="op">=</span> <span class="dv">128</span></span>
<span id="cb18-19"><a href="#cb18-19" aria-hidden="true" tabindex="-1"></a>MASK_RATE <span class="op">=</span> <span class="fl">0.25</span></span>
<span id="cb18-20"><a href="#cb18-20" aria-hidden="true" tabindex="-1"></a>PREDICTIONS_PER_SEQ <span class="op">=</span> <span class="dv">32</span></span>
<span id="cb18-21"><a href="#cb18-21" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-22"><a href="#cb18-22" aria-hidden="true" tabindex="-1"></a><span class="co"># Model params.</span></span>
<span id="cb18-23"><a href="#cb18-23" aria-hidden="true" tabindex="-1"></a>NUM_LAYERS <span class="op">=</span> <span class="dv">3</span></span>
<span id="cb18-24"><a href="#cb18-24" aria-hidden="true" tabindex="-1"></a>MODEL_DIM <span class="op">=</span> <span class="dv">256</span></span>
<span id="cb18-25"><a href="#cb18-25" aria-hidden="true" tabindex="-1"></a>INTERMEDIATE_DIM <span class="op">=</span> <span class="dv">512</span></span>
<span id="cb18-26"><a href="#cb18-26" aria-hidden="true" tabindex="-1"></a>NUM_HEADS <span class="op">=</span> <span class="dv">4</span></span>
<span id="cb18-27"><a href="#cb18-27" aria-hidden="true" tabindex="-1"></a>DROPOUT <span class="op">=</span> <span class="fl">0.1</span></span>
<span id="cb18-28"><a href="#cb18-28" aria-hidden="true" tabindex="-1"></a>NORM_EPSILON <span class="op">=</span> <span class="fl">1e-5</span></span>
<span id="cb18-29"><a href="#cb18-29" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb18-30"><a href="#cb18-30" aria-hidden="true" tabindex="-1"></a><span class="co"># Training params.</span></span>
<span id="cb18-31"><a href="#cb18-31" aria-hidden="true" tabindex="-1"></a>PRETRAINING_LEARNING_RATE <span class="op">=</span> <span class="fl">5e-4</span></span>
<span id="cb18-32"><a href="#cb18-32" aria-hidden="true" tabindex="-1"></a>PRETRAINING_EPOCHS <span class="op">=</span> <span class="dv">8</span></span>
<span id="cb18-33"><a href="#cb18-33" aria-hidden="true" tabindex="-1"></a>FINETUNING_LEARNING_RATE <span class="op">=</span> <span class="fl">5e-5</span></span>
<span id="cb18-34"><a href="#cb18-34" aria-hidden="true" tabindex="-1"></a>FINETUNING_EPOCHS <span class="op">=</span> <span class="dv">3</span></span></code></pre></div>
</div>
<div class="cell code" data-execution_count="15"
data-colab="{"base_uri":"https://localhost:8080/"}"
id="QEoxM5mpLLd_" data-outputId="f06edf52-39b3-4f0a-e5c0-7704e33c7db2">
<!-- Rendered notebook code cell (execution count 15): seeds the numpy and TensorFlow
     RNGs from the fixed seed defined in the previous cell (seed = 0) and prints it;
     the "output stream stdout" div below is the captured output. np and tf are
     presumably imported in an earlier cell outside this chunk; confirm in the
     source notebook. -->
<div class="sourceCode" id="cb19"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Generate random seed</span></span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="co">#myrand=np.random.randint(1, 99999 + 1)</span></span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a>rand<span class="op">=</span>seed <span class="co"># 1234 # 71926</span></span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a>np.random.seed(rand)</span>
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a>tf.random.set_seed(rand)</span>
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">"Random seed is:"</span>, rand)</span></code></pre></div>
<div class="output stream stdout">
<pre><code>Random seed is: 0
</code></pre>
</div>
</div>
<div class="cell code" data-execution_count="16"
data-colab="{"base_uri":"https://localhost:8080/"}"
id="uaangCKQLLd_" data-outputId="1ac761ad-a71d-47dc-d6f1-07e78ae7c19b">
<div class="sourceCode" id="cb21"><pre
class="sourceCode python"><code class="sourceCode python"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">"Tensorlfow version: "</span>, tf.__version__)</span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">"Eager mode: "</span>, tf.executing_eagerly())</span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">"GPU is"</span>, <span class="st">"available"</span> <span class="cf">if</span> tf.test.is_gpu_available() <span class="cf">else</span> <span class="st">"NOT AVAILABLE"</span>)</span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="co">## Tensorlfow version: 2.13.1</span></span>
<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a><span class="co">## Eager mode: True</span></span>
<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a><span class="co">## GPU is NOT AVAILABLE</span></span>
<span id="cb21-8"><a href="#cb21-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb21-9"><a href="#cb21-9" aria-hidden="true" tabindex="-1"></a><span class="co">## Tensorlfow version: 2.15.0</span></span>
<span id="cb21-10"><a href="#cb21-10" aria-hidden="true" tabindex="-1"></a><span class="co">## Eager mode: True</span></span>