#'---
#'title: "Codebook | Corpus of Decisions: International Court of Justice (CD-ICJ)"
#'author: Seán Fobbe
#'geometry: margin=3cm
#'papersize: a4
#'fontsize: 11pt
#'output:
#'  pdf_document:
#'    keep_tex: true
#'    toc: true
#'    toc_depth: 3
#'    number_sections: true
#'    pandoc_args: --listings
#'    includes:
#'      in_header: tex/CD-ICJ_Source_TEX_Preamble_EN.tex
#'      before_body: [tex/CD-ICJ_Source_TEX_Author.tex,temp/CD-ICJ_Source_TEX_Definitions.tex,tex/CD-ICJ_Source_TEX_CodebookTitle.tex]
#'bibliography: temp/packages.bib
#'nocite: '@*'
#'---
#'\newpage
#+ echo = FALSE
knitr::opts_chunk$set(fig.pos = "center",
                      echo = FALSE,
                      warning = FALSE,
                      message = FALSE)
############################
### Packages
############################
#+
library(knitr) # Scientific Reporting
library(kableExtra) # Enhanced Knitr Tables
library(magick) # Required for cropping when compiling PDF
library(parallel) # Base R Parallelization
library(data.table) # Advanced Data Handling
setDTthreads(threads = detectCores())
############################
### Preamble
############################
datashort <- "CD-ICJ"
files.zip <- list.files(pattern = "\\.zip")
datestamp <- unique(tstrsplit(files.zip,
                              split = "_")[[2]])
prefix.en <- paste0("ANALYSIS/",
                    datashort,
                    "_EN_01_FrequencyTable_var-")
prefix.fr <- paste0("ANALYSIS/",
                    datashort,
                    "_FR_01_FrequencyTable_var-")
############################
### Read Tables: Frequency
############################
table.doctype.en <- fread(paste0(prefix.en, "doctype.csv"))[,-3]
table.doctype.fr <- fread(paste0(prefix.fr, "doctype.csv"))[,-3]
table.opinion.en <- fread(paste0(prefix.en, "opinion.csv"))[,-3]
table.opinion.fr <- fread(paste0(prefix.fr, "opinion.csv"))[,-3]
table.year.en <- fread(paste0(prefix.en, "year.csv"))[,-3]
table.year.fr <- fread(paste0(prefix.fr, "year.csv"))[,-3]
table.applicant.en <- fread(paste0(prefix.en, "applicant.csv"))[,-3]
table.applicant.fr <- fread(paste0(prefix.fr, "applicant.csv"))[,-3]
table.respondent.en <- fread(paste0(prefix.en, "respondent.csv"))[,-3]
table.respondent.fr <- fread(paste0(prefix.fr, "respondent.csv"))[,-3]
##############################
### Read Tables: Entity Codes
##############################
table.countrycodes <- fread("data/CD-ICJ_Source_CountryCodes.csv")
table.advcodes <- fread("data/CD-ICJ_Source_AdvisoryRequestCoding.csv")
############################
### Read Tables: Linguistic
############################
stats.ling.en <- fread("ANALYSIS/CD-ICJ_EN_00_CorpusStatistics_Summaries_Linguistic.csv")
stats.ling.fr <- fread("ANALYSIS/CD-ICJ_FR_00_CorpusStatistics_Summaries_Linguistic.csv")
############################
### Read Metadata
############################
meta.zip.en <- paste(datashort,
                     datestamp,
                     "EN_CSV_BEST_META.zip",
                     sep = "_")
meta.best.en <- fread(cmd = paste("unzip -cq",
                                  meta.zip.en))
############################
### Read Hash File
############################
hashfile <- paste(datashort,
                  datestamp,
                  "CryptographicHashes.csv",
                  sep = "_")
############################
### Begin Text
############################
#'# Introduction
#'The \textbf{\icj\ (ICJ)} is the primary judicial organ of the United Nations and one of the most consequential courts in international law.
#'
#' Called the \enquote{World Court} by many, it is the only international court with general thematic jurisdiction. While critics occasionally note the lack of compulsory jurisdiction and sharply limited access to the Court,\footnote{Only States may be party to proceedings in contentious jurisdiction and only certain bodies of international organizations may request advisory opinions.} its opinions continue to have an outsize influence on the modern interpretation, codification and wider development of international law. Every international legal textbook covers the workings and decisions of the Court *in extenso*, and participation in international moot courts, such as the Philip C. Jessup Moot Court, is unthinkable without regular reference to and citation of the \icj 's decisions.
#'
#'The \textbf{\datatitle\ (\datashort)} collects and presents for the first time in human- and machine-readable form all published decisions of the \icj . Among these are judgments, advisory opinions and orders, as well as their respective appended minority opinions (declarations, separate opinions and dissenting opinions).
#'
#'
#' This data set is designed to be complementary to and fully compatible with the \emph{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ)}, which is also available open access.\footnote{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ). <\url{https://doi.org/10.5281/zenodo.3840480}>.}
#'
#'
#' The quantitative analysis of international legal data is still in its infancy, a situation which is exacerbated by the lack of high-quality empirical data. The most advanced data sets are held in commercial databases and are therefore not easily available to academic researchers, journalists and the general public. With this data set I hope to contribute to a more systematic and empirical view of the international legal system. In an international community founded on the rule of law the activities of the judiciary must be public, transparent and defensible. In the 21st century this requires quantitative scientific review of decisions and actions.
#'
#' Design, construction and compilation of this data set are based on the principles of general availability through freedom from copyright (public domain status), strict transparency and full scientific reproducibility. The *FAIR Guiding Principles for Scientific Data Management and Stewardship* (Findable, Accessible, Interoperable and Reusable) inspire both the design and the manner of publication.\footnote{Wilkinson, M., Dumontier, M., Aalbersberg, I. et al. The FAIR Guiding Principles for Scientific Data Management and Stewardship. Sci Data 3, 160018 (2016). <\url{https://doi.org/10.1038/sdata.2016.18}>.}
#+
#'# Reading Files
#' The data are published in open, interoperable and widely used formats (CSV, TXT, PDF). They can be used with all modern programming languages (e.g. Python or R) and graphical interfaces. The PDF collections are intended to facilitate traditional legal research.
#'
#' **Important:** Missing values are always coded as \enquote{NA}.
#+
#'## CSV Files
#'
#' Working with the CSV files is recommended. CSV\footnote{The CSV format is defined in RFC 4180: <\url{https://tools.ietf.org/html/rfc4180}>.} is an open and simple machine-readable tabular data format. In this data set values are separated by commas. Each column is a variable and each row is a document. Variables are explained in detail in section \ref{variables}.
#'
#' To read \textbf{CSV} files into R I strongly recommend using the fast file reader **fread()** from the **data.table** package (available on CRAN). The file can be read into \textbf{R} like so:
#+ eval = FALSE, echo = TRUE
library(data.table)
icj.en <- fread("filename.csv")
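#'
#' Once read, the metadata variables described in section \ref{variables} can be used to subset the corpus directly. A minimal sketch, building on the object created above:
#+ eval = FALSE, echo = TRUE
## Select all judgments issued in 2019 (variables "doctype" and "year")
icj.jud.2019 <- icj.en[doctype == "JUD" & year == 2019]
## Count documents per document type
icj.en[, .N, by = doctype]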
#'## TXT Files
#'The \textbf{TXT} files, including metadata, can be read into \textbf{R} with the package \textbf{readtext} (available on CRAN) thus:
#+ eval = FALSE, echo = TRUE
library(readtext)
icj.en <- readtext("EN_TXT_BEST_FULL/*.txt",
                   docvarsfrom = "filenames",
                   docvarnames = c("court",
                                   "caseno",
                                   "shortname",
                                   "applicant",
                                   "respondent",
                                   "date",
                                   "doctype",
                                   "collision",
                                   "stage",
                                   "opinion",
                                   "language"),
                   dvsep = "_",
                   encoding = "UTF-8")
#+
#'# Data Set Design
#'## Description of Data Set
#'The \textbf{\datatitle\ (\datashort)} collects and structures in human- and machine-readable form all published decisions of the \icj . Among these are judgments, advisory opinions and orders, as well as their respective appended minority opinions (declarations, separate opinions and dissenting opinions).
#'
#' It consists of a CSV file of the full data set, a CSV file with the metadata only, individual TXT files for each document and PDF files with an enhanced text layer generated by the LSTM neural network engine of the optical character recognition (OCR) software \emph{Tesseract}.
#'
#' Additionally, the raw PDF files and some intermediate stages of refinement are included to allow for easier replication of results and for production use in the event that even higher quality methods of optical character recognition (OCR) can be applied to the documents in the future.
#+
#'## Complementarity
#' This data set is intended to be complementary to and fully compatible with the \emph{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ)}, which is also available open access.\footnote{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ). <\url{https://doi.org/10.5281/zenodo.3840480}>.}
#'
#+
#'## Table of Sources
#'\begin{centering}
#'\begin{longtable}{P{5cm}p{9cm}}
#'\toprule
#' Data Source & Citation \\
#'\midrule
#' Primary Data Source & \url{https://www.icj-cij.org}\\
#' Source Code & \url{\softwareversionurldoi}\\
#' Country Codes & \url{\softwareversionurldoi}\\
#' Entity Codes & \url{\softwareversionurldoi}\\
#' Case Names and Parties & \url{\softwareversionurldoi}\\
#'\bottomrule
#'\end{longtable}
#'\end{centering}
#+
#'## Data Collection
#' Data were collected with the explicit consent of the Registry of the \icj . All documents were downloaded via TLS-encrypted connections and cryptographically signed after data processing was complete. The data set collects all decisions and appended opinions issued by the \icj\ that were published on the official website of the \icj\ on \version .
#+
#'## Source Code and Compilation Report
#'
#' The full Source Code for the creation of this data set, the resulting Compilation Report and this Codebook are published open access and permanently archived in the scientific repository of CERN.
#'
#' With every compilation of the full data set an extensive **Compilation Report** is created as a professionally typeset PDF (comparable to this Codebook). The Compilation Report includes the Source Code, comments and explanations of design decisions, relevant computational results, exact timestamps and a table of contents with clickable internal hyperlinks to each section. The Compilation Report is published under the same DOI as the Source Code.
#'
#' For details of the construction and validation of the data set please refer to the Compilation Report.
#+
#'## Limitations
#'Users should bear in mind certain limitations:
#'\begin{enumerate}
#' \item The data set contains only those documents which were published by the ICJ and have been made available by the ICJ on its official website (\emph{publication bias}).
#' \item While Tesseract yields high-quality OCR results, current OCR technology is not perfect and minor errors must be expected (\emph{OCR bias}).
#' \item Automatic language detection is not foolproof and some bilingual documents marked as monolingual may have gone undetected (\emph{language mismatch}).
#' \item Lengthy quotations in languages other than the language indicated in the metadata may further confound analyses (\emph{language blurring}).
#'\end{enumerate}
#+
#'## Public Domain Status
#'
#'According to written communication between the author and the Registry of the \icj\ the original documents are not subject to copyright.
#'
#' To ensure the widest possible distribution and to promote the international rule of law I waive any copyright to the data set under a \textbf{Creative Commons CC0 1.0 Universal (CC0 1.0) Public Domain Dedication}. For details of the license please refer to the CC0 copyright notice at the beginning of this Codebook or visit the Creative Commons website for the full terms of the license.\footnote{\url{https://creativecommons.org/publicdomain/zero/1.0/legalcode}}
#'\newpage
#+
#'## Quality Assurance
#' Dozens of automated tests were conducted to ensure the quality of the data and metadata, for example:
#'
#' \begin{enumerate}
#'\item Auto-detection of language via analysis of n-gram patterns with the \emph{textcat} package for R.
#'\item Strict validation of variable types via \emph{regular expressions}.
#'\item Construction of frequency tables for (almost) every variable followed by human review to detect anomalies.
#'\item Creation of visualizations for many common descriptive analyses.
#'\end{enumerate}
#'
#'For results of each test and more information on the construction of the data set please refer to the Compilation Report or the \enquote{ANALYSIS} archive included with the data set.
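#'
#' As an illustration of the regular-expression validation mentioned above, checks along the following lines can be run on the CSV metadata (a minimal sketch, not the exact tests used during compilation):
#+ eval = FALSE, echo = TRUE
## All dates must match the ISO format YYYY-MM-DD
all(grepl("^[0-9]{4}-[0-9]{2}-[0-9]{2}$", icj.en$date))
## The document type may only take the documented values
all(icj.en$doctype %in% c("JUD", "ADV", "ORD"))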
#' \begin{sidewaysfigure}
#'\includegraphics{ANALYSIS/CD-ICJ_Workflow_1.pdf}
#' \caption{Workflow Schematic Part 1: Download, Labelling, Conversion and Sorting of Documents}
#' \end{sidewaysfigure}
#' \begin{sidewaysfigure}
#'\includegraphics{ANALYSIS/CD-ICJ_Workflow_2.pdf}
#' \caption{Workflow Schematic Part 2: Ingestion, Pre-Processing, Analysis and Creation of CSV Files}
#' \end{sidewaysfigure}
#+
#'# Variants and Primary Target Audiences
#'The data set is provided in two language versions (English and French), as well as several differently processed variants geared towards specific target audiences.
#'
#' A reduced PDF variant of the data set containing only majority opinions is intended to assist practitioners.
#'
#' \medskip
#'\begin{centering}
#'\begin{longtable}{p{4cm}p{10cm}}
#'\toprule
#'Variant & Target Audience and Description \\
#'\midrule
#'\endhead
#'PDF\_BEST & \textbf{Traditional Legal Research (recommended).} A synthesis of all born-digital documents issued by the ICJ combined with older scanned documents (prior to 2005) which were given a new and enhanced text layer created with an advanced LSTM neural network machine learning engine. Its main advantages are vastly improved local searches in individual documents via Ctrl+F and copy/pasting without the need for extensive manual revisions. Researchers with slow internet connections should consider using the \enquote{TXT\_BEST} variant, as this still provides a reasonable visual approximation of the original documents, but offers the advantage of drastically reduced file size. A reduced PDF variant of the data set containing only majority opinions is available to assist practitioners.\\
#'CSV\_BEST & \textbf{Quantitative Research (recommended).} A structured representation of the full data set within a single comma-delimited file. Includes the full complement of metadata described in the Codebook. The \enquote{FULL} sub-variant includes the full text of the decisions, whereas the sub-variant \enquote{META} only contains the metadata.\\
#'TXT\_BEST & \textbf{Quantitative Research.} A synthesis of TXT files created by combining the extracted text of all born-digital documents issued by the ICJ (2005 and later) and the OCR texts from older scanned documents (prior to 2005) generated with an advanced LSTM neural network machine learning engine. R users should strongly consider using the package \emph{readtext} to read them into R with the filename metadata intact.\\
#'ANALYSIS & \textbf{Quantitative Research.} This archive contains almost all of the machine-readable analysis output generated during the data set creation process to facilitate further analysis (CSV for tables, PDF and PNG for plots). Minor analysis results are documented only in the Compilation Report.\\
#'TXT\_EXTRACTED & \textbf{Replication Research and Creation of New Data Sets.} TXT files containing the extracted text layer from all original documents as published by the ICJ. The quality of the original OCR text for older documents is poor and this variant should not be used for statistical analysis. Documents dated 2005 or later were born-digital and can be used for all purposes.\\
#'TXT\_TESSERACT & \textbf{Replication Research and Creation of New Data Sets.} TXT files containing the OCR text generated with an advanced LSTM neural network machine learning engine for documents predating 2005. Fully included in the BEST variant, but provided separately for reasons of transparency.\\
#'PDF\_ORIGINAL & \textbf{Replication Research and Creation of New Data Sets.} The original documents with the original text layer. Only recommended for researchers who wish to replicate the machine-readable files or who wish to create a new and improved data set. Not recommended for traditional research, as the quality of the original OCR text layer is quite poor.\\
#'PDF\_ENHANCED & \textbf{Replication Research and Creation of New Data Sets.} Scanned documents of opinions rendered before 2005 which were given a new and enhanced text layer generated with an advanced LSTM neural network machine learning engine. Fully included in the BEST variant, but provided separately for reasons of transparency.\\
#'\bottomrule
#'\end{longtable}
#'\end{centering}
#+
#'\newpage
#+
#'# Variables
#+
#'## General Remarks
#' \begin{itemize}
#'
#' \item Missing values are always coded as \enquote{NA}.
#'
#' \item All Strings are encoded in UTF-8.
#'
#' \item A significant part of the metadata was included with the files downloaded from the Court's website.
#'
#' \item The variables \enquote{shortname}, \enquote{applicant}, \enquote{respondent}, \enquote{stage}, \enquote{applicant\_region}, \enquote{applicant\_subregion}, \enquote{respondent\_region} and \enquote{respondent\_subregion} were coded manually by the author of the data set and added automatically at compilation time. Country codes conform to the ISO 3166 Alpha-3 standard and geographical classifications to the M49 standard used by the UN Statistics Division.
#'
#' \item The variable \enquote{fullname} is coded according to case headings as published on the ICJ website. Includes the full names of the parties in parentheses. Introductory phrases such as \enquote{Case concerning...} are omitted.
#'
#' \item The variables \enquote{nchars}, \enquote{ntokens}, \enquote{ntypes}, \enquote{nsentences} and \enquote{year} were calculated automatically based on the content and metadata of each document.
#'
#' \item The variables \enquote{version}, \enquote{doi\_concept}, \enquote{doi\_version} and \enquote{license} were added automatically during the data set creation process to document provenance and to comply with FAIR Data Principles F1, F3 and R1.1.
#'
#' \end{itemize}
#'\vspace{1cm}
#+
#'## Structure of TXT File Names
#'\begin{verbatim}
#'[court]_[caseno]_[shortname]_[applicant]_[respondent]_[date]_[doctype]_
#'[collision]_[stage]_[opinion]_[language]
#'\end{verbatim}
#'\vspace{1cm}
#'\subsection{Example TXT File Name}
#'\begin{verbatim}
#' ICJ_001_CorfuChannel_GBR_ALB_1949-04-09_JUD_01_ME_05_EN.txt
#'\end{verbatim}
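#'
#' The underscore-separated components of a file name can be recovered programmatically, for example with \textbf{data.table} (a minimal sketch using the example file name above):
#+ eval = FALSE, echo = TRUE
library(data.table)
filename <- "ICJ_001_CorfuChannel_GBR_ALB_1949-04-09_JUD_01_ME_05_EN.txt"
parts <- tstrsplit(gsub("\\.txt$", "", filename), split = "_")
names(parts) <- c("court", "caseno", "shortname", "applicant", "respondent",
                  "date", "doctype", "collision", "stage", "opinion", "language")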
#'\newpage
#+
#'## Structure of CSV Metadata
str(meta.best.en)
#'\newpage
#+
#'## Detailed Description of Variables
#'\begin{centering}
#'\begin{longtable}{p{3.5cm}p{2cm}p{9cm}}
#'\toprule
#'Variable & Type & Details\\
#'\midrule
#'\endhead
#' doc\_id & String & (CSV only) The name of the imported TXT file.\\
#' text & String & (CSV only) The full content of the imported TXT file.\\
#' court & String & The variable only takes the value \enquote{ICJ}, which stands for \enquote{\icj}. It is generally only useful if combined with the CD-PCIJ or other data sets.\\
#' caseno & Integer & The case number assigned by the ICJ. The same case may span multiple case numbers, e.g. the Interpretation or Revision stages have different case numbers than the original judgment. To analyze all stages of a case I recommend a pattern search on the variable \enquote{shortname} (see the example following this table). Note: case number 2 is unassigned and no documents for it are available on the ICJ website.\\
#' shortname & String & Short name of the case. This was custom-created by the author based on the original title. Short names include well-known components (e.g. \enquote{Nicaragua}) to facilitate quick local searches and try to be as faithful to the full title as possible. For requests concerning interpretation or revision of a judgment the shortname is followed by \enquote{Interpretation} or \enquote{Revision}.\\
#' fullname & String & (CSV only) Full name of the case as published on the ICJ website. Includes the full names of the Parties. Introductory phrases such as \enquote{Case concerning...} are omitted.\\
#' applicant & String & The unique identifier of the applicant. In contentious proceedings this is the three-letter (Alpha-3) country code as per the ISO 3166-1 standard. Table \ref{tab:countrycodes} contains an explanation of all country codes used in the data set. Please note that reserved country codes are in use for historical entities (e.g. the Soviet Union). For advisory proceedings this variable refers to the entity which requested an advisory opinion. Table \ref{tab:entities} explains the detailed advisory coding decisions.\\
#' respondent & String & The unique identifier of the respondent. In contentious proceedings this is the three-letter (Alpha-3) country code as per the ISO 3166-1 standard. Table \ref{tab:countrycodes} contains an explanation of all country codes used in the data set. Please note that reserved country codes are in use for historical entities (e.g. the Soviet Union). Advisory proceedings do not have a respondent and therefore always take the value \enquote{NA}.\\
#' applicant\_region & String & (CSV only) The geographical region of the applicant according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\
#' respondent\_region & String & (CSV only) The geographical region of the respondent according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\
#' applicant\_subregion & String & (CSV only) The geographical subregion of the applicant according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\
#' respondent\_subregion & String & (CSV only) The geographical subregion of the respondent according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\
#' date & ISO Date & The date of the document in the format YYYY-MM-DD (ISO-8601).\\
#' doctype & String & A three-letter code indicating the type of document. Possible values are \enquote{JUD} (judgments in contentious jurisdiction), \enquote{ADV} (advisory opinions) and \enquote{ORD} (orders in all types of jurisdiction).\\
#' collision & Integer & In rare instances the \icj\ issued several decisions of the same type in the same proceedings on the same day. Most documents take the value \enquote{01}. If several documents with otherwise identical metadata were issued, the value is incremented for each additional document.\\
#' stage & String & The stage of proceedings in contentious jurisdiction, coded based on the title page (primary), or a close reading of the findings (secondary). Possible values are \enquote{PO} (preliminary objections), \enquote{ME} (merits), \enquote{IN} (intervention) and \enquote{CO} (compensation). Please note that the ICJ is very inconsistent in how it classifies admissibility; it can be decided in the same document either together with jurisdiction or together with the merits. I have chosen to code pure admissibility decisions as \enquote{ME} (e.g. Second Phase of Nottebohm). In general all of the above types of decisions can occur in the same document. I therefore do not recommend this variable for computational analysis unless great care is taken to understand its limitations. Currently only judgments are coded; orders will be added in the future.\\
#' opinion & Integer & A sequential number assigned to each opinion. Majority opinions are always coded \enquote{00}. Minority opinions begin with \enquote{01} and ascend to the maximum number of minority opinions.\\
#' language & String & The language of the document as a two-letter ISO 639-1 code. This data set contains documents in the languages English (\enquote{EN}) and French (\enquote{FR}).\\
#' year & Integer & (CSV only) The year the document was issued. The format is YYYY.\\
#' minority & Integer & (CSV only) This variable indicates whether the document is a majority (0) or minority (1) opinion.\\
#' nchars & Integer & (CSV only) The number of characters in a given document.\\
#' ntokens & Integer & (CSV only) The number of tokens (an arbitrary character sequence bounded by whitespace) in a given document. This metric can vary significantly depending on tokenizer and parameters used. This count was generated based on plain tokenization with no further pre-processing (e.g. stopword removal, removal of numbers, lowercasing) applied. Analysts should use this number not as an exact figure, but as an estimate of the order of magnitude of a given document's length. If in doubt, perform an independent calculation with the software of your choice.\\
#' ntypes & Integer & (CSV only) The number of \emph{unique} tokens. This metric can vary significantly depending on tokenizer and parameters used. This count was generated based on plain tokenization with no further pre-processing (e.g. stopword removal, removal of numbers, lowercasing) applied. Analysts should use this number not as an exact figure, but as an estimate of the order of magnitude of a given document's length. If in doubt, perform an independent calculation with the software of your choice.\\
#' nsentences & Integer & (CSV only) The number of sentences in a given document. The rules for detecting sentence boundaries are very complex and are described in \enquote{Unicode Standard Annex No 29}. This metric can vary significantly depending on tokenizer and parameters used. This count was generated based on plain tokenization with no further pre-processing (e.g. stopword removal, removal of numbers, lowercasing) applied. Analysts should use this number not as an exact figure, but as an estimate of the order of magnitude of a given document's length. If in doubt, perform an independent calculation with the software of your choice.\\
#' version & ISO Date & (CSV only) The version of the data set as a date in long form as per ISO-8601. The version represents the date on which the data set creation process was begun and the data was acquired from the website of the Court.\\
#' doi\_concept & String & (CSV only) The Digital Object Identifier (DOI) for the \emph{concept} of the data set. Resolving this DOI via www.doi.org allows researchers to always acquire the \emph{latest version} of the data set. The DOI is a persistent identifier suitable for stable long-term citation. Principle F1 of the FAIR Data Principles (\enquote{data are assigned globally unique and persistent identifiers}) recommends the documentation of each data set with a persistent identifier and Principle F3 its inclusion with the metadata. Even if the CSV data set is transmitted without the accompanying Codebook this allows researchers to establish provenance of the data.\\
#' doi\_version & String & (CSV only) The Digital Object Identifier (DOI) for the \emph{specific version} of the data set. Resolving this DOI via www.doi.org allows researchers to always acquire this \emph{specific version} of the data set. The DOI is a persistent identifier suitable for stable long-term citation. Principle F1 of the FAIR Data Principles (\enquote{data are assigned globally unique and persistent identifiers}) recommends the documentation of each data set with a persistent identifier and Principle F3 its inclusion with the metadata. Even if the CSV data set is transmitted without the accompanying Codebook this allows researchers to establish provenance of the data.\\
#' license & String & (CSV only) The license of the data set. In this data set the value is always \enquote{Creative Commons Zero 1.0 Universal}. Ensures compliance with FAIR data principle R1.1 (\enquote{clear and accessible data usage license}).\\
#'\bottomrule
#'\end{longtable}
#'\end{centering}
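#'
#' As recommended for the variable \enquote{caseno}, all stages and phases of a case, including Interpretation and Revision proceedings, can be gathered with a pattern search on the variable \enquote{shortname}, for example (a minimal sketch):
#+ eval = FALSE, echo = TRUE
## Collect all documents whose short name contains "Nicaragua"
nicaragua <- icj.en[grepl("Nicaragua", shortname)]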
#'\newpage
#'# Applicant and Respondent Codes
#+
#'## Contentious Jurisdiction: States
#'
#'\label{tab:countrycodes}
#'
#'Applicants and Respondents in contentious jurisdiction are coded according to the uppercase three-letter (Alpha-3) country codes described in the ISO 3166-1 standard. The codes are taken from the version of the standard which was valid on 4 November 2020. The table below only includes those codes which are used in the data set. The regions and subregions assigned to States generally follow the UN Standard Country or Area Codes for Statistical Use, 1999 (Revision 4), also known as the M49 standard.
#'
#'Please note that where States have ceased to exist (Soviet Union, Yugoslavia, Serbia and Montenegro, Czechoslovakia) their historical three-letter country codes from ISO 3166-1 are used. These are not part of the current ISO 3166-1 standard, but have been transitionally reserved by the ISO 3166 Maintenance Agency to ensure backwards compatibility. The four-letter ISO 3166-3 standard (\enquote{Code for formerly used names of countries}) is not used in this data set. The regions and subregions for Yugoslavia and Czechoslovakia are taken from M49 revision 2 (1982). The Soviet Union is coded as \enquote{Europe/Eastern Europe} (the M49 standard treats the Soviet Union as a region of its own). Serbia and Montenegro was never included in the M49 standard and has been assigned the same region and subregion as Yugoslavia.
#'\bigskip
#'\ra{1.2}
kable(table.countrycodes,
      format = "latex",
      align = 'p{1.5cm}p{4cm}p{2cm}p{6cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("ISO-3",
                    "Name",
                    "Region",
                    "Sub-Region")) %>% kable_styling(latex_options = "repeat_header")
#'\newpage
#+
#'## Advisory Jurisdiction: Entities
#'
#'\label{tab:entities}
#'
#'Entities that requested an advisory opinion from the \icj\ are not Applicants in the strict sense, but have been coded under this variable to reduce clutter. I have tried to choose widely used codes for each entity.
#'
#'Note that the \emph{International Maritime Organization (IMO)} was known as the \enquote{Inter-Governmental Maritime Consultative Organization} at the time it requested the advisory opinion. I have coded it with the modern \enquote{IMO}, as the organization only underwent a change of name and its legal continuity is not in doubt.
#'
#'I was unable to discover a well-known acronym for the \emph{Committee on Applications for Review of Administrative Tribunal Judgements} and custom-coded it as \enquote{CARAT}.
#'\bigskip
kable(table.advcodes,
      format = "latex",
      align = c("p{3cm}",
                "p{11cm}"),
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Code",
                    "Entity"))
#'\newpage
#+
#'# Linguistic Metrics
#+
#'## Explanation of Metrics
#' To better communicate the scope of the corpus and its constituent documents I provide a number of classic linguistic metrics and visualize their distributions:
#'
#'
#' \medskip
#'
#'\begin{centering}
#'\begin{longtable}{P{3.5cm}p{10.5cm}}
#'\toprule
#'Metric & Definition\\
#'\midrule
#' Characters & Characters roughly correspond to graphemes, the smallest functional unit in a writing system. The word \enquote{judge} is composed of 5 characters, for example.\\
#' Tokens & An arbitrary character sequence delimited by whitespace on both sides, i.e. it roughly corresponds to the notion of a \enquote{word}. However, due to its strictly syntactical definition it might also include arbitrary sequences of numbers or special characters.\\
#' Types & Unique tokens. If, for example, the token \enquote{human} appeared one hundred times in a given document, it would be counted as only one type. \\
#' Sentences & Corresponds approximately to the colloquial definition of a sentence. The exact rules for determining sentence boundaries are very complex and may be reviewed in \enquote{Unicode Standard Annex No 29}.\\
#'\bottomrule
#'\end{longtable}
#'\end{centering}
#'
#'\bigskip
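#'
#' These metrics can be recomputed independently, for example with base R and the \textbf{stringi} package (a minimal sketch following the definitions above; the published figures were calculated with a different toolchain and may differ slightly depending on tokenizer settings):
#+ eval = FALSE, echo = TRUE
library(stringi)
txt <- icj.en$text                                          # full texts from the CSV variant
chars <- nchar(txt)                                         # characters
tokens <- lengths(strsplit(txt, "\\s+"))                    # whitespace-delimited tokens
types <- lengths(lapply(strsplit(txt, "\\s+"), unique))     # unique tokens
sentences <- stri_count_boundaries(txt, type = "sentence")  # UAX No 29 sentence boundaries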
#+
#'## Summary Statistics
newnames <- c("Metric",
              "Total",
              "Min",
              "Quart1",
              "Median",
              "Mean",
              "Quart3",
              "Max")
setnames(stats.ling.en, newnames)
setnames(stats.ling.fr, newnames)
#'### English
kable(stats.ling.en,
      digits = 2,
      format.args = list(big.mark = ","),
      format = "latex",
      booktabs = TRUE,
      longtable = TRUE)
#'### French
kable(stats.ling.fr,
      digits = 2,
      format.args = list(big.mark = ","),
      format = "latex",
      booktabs = TRUE,
      longtable = TRUE)
#'\newpage
#'## Explanation of Diagrams
#+
#'### Distributions of Document Length
#'The diagrams in Section \ref{doclength} are combined violin and box plots. They are especially useful in visualizing distributions of quantitative variables. Their interpretation is fairly straightforward: the greater the area under the curve for a given range, the more frequent the values are in this range. The thick center line of the box indicates the median, the outer lines of the box the first and third quartiles. Whiskers extend outwards to 1.5 times the inter-quartile range (IQR). Outliers beyond 1.5 times IQR are shown as individual points.
#'
#' Please note that the x-axis is logarithmically scaled, i.e. in powers of 10. It therefore increases in a non-linear fashion. Additional sub-markings are included to assist with interpretation.
#+
#'### Most Frequent Tokens
#' A token is defined as any character sequence delimited by whitespace on both sides, i.e. it roughly corresponds to the notion of a \enquote{word}. However, due to the strictly syntactical definition tokens might also include arbitrary sequences of numbers or special characters.
#'
#' The charts in Sections \ref{toptokens-en} and \ref{toptokens-fr} show the 50 most frequent tokens for each language, weighted by both term frequency (TF) and term frequency/inverse document frequency (TF-IDF). Sequences of numbers, special symbols and a general list of frequent words for English and French (\enquote{stopwords}) were removed prior to constructing the list. For details of the calculations, please refer to the Compilation Report and/or the Source Code.
#'
#' The term frequency $\text{tf}_{td}$ is calculated as the raw count of the number of times a term $t$ appears in a document $d$.
#'
#' The term frequency/inverse document frequency $\text{tf-idf}_{td}$ for a term $t$ in a document $d$ is calculated as follows, with $N$ the total number of documents in a corpus and $\text{df}_{t}$ being the number of documents in the corpus in which the term $t$ appears:
#'
#'$$\text{tf-idf}_{td} = \text{tf}_{td} \times \text{log}_{10}\left(\frac{N}{\text{df}_{t}}\right)$$
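#'
#' A toy numerical illustration of the formula (this is only a sketch of the weighting scheme, not the exact pipeline used to produce the charts; for that, please refer to the Compilation Report and the Source Code):
#+ eval = FALSE, echo = TRUE
## Document-term matrix with raw term frequencies for two hypothetical documents
dtm <- matrix(c(3, 0, 1,
                0, 2, 1),
              nrow = 2,
              byrow = TRUE,
              dimnames = list(c("doc1", "doc2"),
                              c("treaty", "boundary", "court")))
N <- nrow(dtm)          # total number of documents
df <- colSums(dtm > 0)  # number of documents containing each term
tfidf <- sweep(dtm, 2, log10(N / df), "*")  # tf-idf with base-10 logarithm
## "court" appears in every document, so its tf-idf weight is zero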
#+
#'### Tokens over Time
#' The charts in Section \ref{tokenperyear} show the total output of the \icj\ for each year as the sum total of the tokens of all published decisions (judgments, advisory opinions, orders, appended opinions). These charts may give a rough estimate of the activity of the \icj , although they should be interpreted with caution, as duplicate and highly similar opinions were not removed for this simple analysis. Please refer to Section \ref{docsim} for the scope of identical and near-identical documents in the corpus.
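#'
#' The yearly totals underlying these charts can be reproduced from the CSV variant, for example (a minimal sketch using the documented variables \enquote{year} and \enquote{ntokens}):
#+ eval = FALSE, echo = TRUE
## Sum the token counts of all documents for each year
tokens.per.year <- icj.en[, .(tokens = sum(ntokens)), keyby = year]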
#+
#'\newpage
#'## Distributions of Document Length
#' \label{doclength}
#+
#'### English
#' ![](ANALYSIS/CD-ICJ_EN_10_Distributions_LinguisticMetrics-1.pdf)
#+
#'### French
#' ![](ANALYSIS/CD-ICJ_FR_10_Distributions_LinguisticMetrics-1.pdf)
#'\newpage
#+
#'## Most Frequent Tokens (English)
#'\label{toptokens-en}
#+
#'### Term Frequency Weighting (TF)
#' ![](ANALYSIS/CD-ICJ_EN_13_Top50Tokens_TF-Weighting_Scatter-1.pdf)
#+
#'### Term Frequency/Inverse Document Frequency Weighting (TF-IDF)
#' ![](ANALYSIS/CD-ICJ_EN_14_Top50Tokens_TFIDF-Weighting_Scatter-1.pdf)
#'\newpage
#+
#'## Most Frequent Tokens (French)
#'\label{toptokens-fr}
#+
#'### Term Frequency Weighting (TF)
#' ![](ANALYSIS/CD-ICJ_FR_13_Top50Tokens_TF-Weighting_Scatter-1.pdf)
#+
#'### Term Frequency/Inverse Document Frequency Weighting (TF-IDF)
#' ![](ANALYSIS/CD-ICJ_FR_14_Top50Tokens_TFIDF-Weighting_Scatter-1.pdf)
#+
#'\newpage
#'## Tokens over Time
#'\label{tokenperyear}
#+
#'### English
#' ![](ANALYSIS/CD-ICJ_EN_05_TokensPerYear-1.pdf)
#+
#'### French
#' ![](ANALYSIS/CD-ICJ_FR_05_TokensPerYear-1.pdf)
#+
#'# Document Similarity
#'
#' \label{docsim}
#+
#'## English
#' ![](ANALYSIS/CD-ICJ_EN_19_DocumentSimilarity_Correlation-1.pdf)
#'
#'## French
#' ![](ANALYSIS/CD-ICJ_FR_19_DocumentSimilarity_Correlation-1.pdf)
#+
#'## Comment
#' Analysts are advised that the CD-ICJ contains a non-negligible number of highly similar to near-identical documents. This is due to the Court's long-standing practice of issuing formally different decisions for each Applicant-Respondent pair in the course of the same proceedings. A prime example are the *Use of Force* cases, in which more than half a dozen judgments are essentially identical in content and differ only in the names of the Parties.
#'
#' The above figures plot the number of files to be excluded as a function of correlation similarity based on a document-unigram matrix (with the removal of numbers, special symbols and stopwords, as well as lowercasing). Analysts who wish to qualitatively review this computational approach will find the IDs of presumed duplicates, together with the relevant value of correlation similarity, stored as CSV files in the \enquote{ANALYSIS} archive published with the data set (item 17). These document IDs can also easily be read into statistical software and excluded directly from analyses without having to perform one's own similarity analysis. I do, however, recommend double-checking the IDs for false positives. The document pairings and similarity scores are included in a different CSV file (also item 17).
#'
#' The choice of similarity algorithm, the threshold for marking a document as duplicate and the question of whether duplicate documents should be removed at all should be decided with respect to each individual analysis. My goal is to document the Court's output as faithfully as possible and to give analysts fair warning, as well as the opportunity to make their own choices. Please note that the manner of de-duplication will substantially affect analytical results; it should be chosen only after careful consideration of both the methodology and the data.
#'
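#' For analysts who choose to exclude the presumed duplicates identified during compilation, a minimal sketch (\enquote{duplicates.csv} is a placeholder for the relevant file from the \enquote{ANALYSIS} archive and the column name is assumed):
#+ eval = FALSE, echo = TRUE
library(data.table)
duplicates <- fread("duplicates.csv")                   # placeholder file name
icj.dedup <- icj.en[!(doc_id %in% duplicates$doc_id)]   # column name assumed; check for false positives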
#+
#'\newpage
#+
#'# Metadata Frequency Tables
#'
#' \ra{1.3}
#'
#+
#'## By Year
#+
#'### English
#'\vspace{0.3cm}
#' ![](ANALYSIS/CD-ICJ_EN_04_Barplot_Year-1.pdf)
#'\vspace{0.3cm}
kable(table.year.en,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Year",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#+
#'\newpage
#+
#'### French
#'\vspace{0.3cm}
#' ![](ANALYSIS/CD-ICJ_FR_04_Barplot_Year-1.pdf)
#'\vspace{0.3cm}
kable(table.year.fr,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Year",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#+
#'\newpage
#+
#'## By Document Type
#+
#'### English
#'\vspace{0.3cm}
#' ![](ANALYSIS/CD-ICJ_EN_02_Barplot_Doctype-1.pdf)
#'\vspace{1cm}
kable(table.doctype.en,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("DocType",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#+
#'### French
#'\vspace{0.3cm}
#' ![](ANALYSIS/CD-ICJ_FR_02_Barplot_Doctype-1.pdf)
#'\vspace{1cm}
kable(table.doctype.fr,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("DocType",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#'\ra{1.1}
#+
#'\newpage
#+
#'## By Opinion Number
#+
#'### English
#' ![](ANALYSIS/CD-ICJ_EN_03_Barplot_Opinion-1.pdf)
kable(table.opinion.en,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Opinion Number",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#+
#'\newpage
#+
#'### French
#'\vspace{0.1cm}
#' ![](ANALYSIS/CD-ICJ_FR_03_Barplot_Opinion-1.pdf)
#'\vspace{0.1cm}
kable(table.opinion.fr,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Opinion Number",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#'\ra{1.3}
#+
#'\newpage
#+
#'## By Applicant
#+
#'### English
kable(table.applicant.en,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Applicant",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#+
#'\newpage
#+
#'### French
kable(table.applicant.fr,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Applicant",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#+
#'\newpage
#+
#'## By Respondent
#+
#'### English
kable(table.respondent.en,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Respondent",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#+
#'\newpage
#+
#'### French
kable(table.respondent.fr,
      format = "latex",
      align = 'P{3cm}',
      booktabs = TRUE,
      longtable = TRUE,
      col.names = c("Respondent",
                    "Documents",
                    "% Total",
                    "% Cumulative")) %>% kable_styling(latex_options = "repeat_header")
#'\newpage
#+
#'# Verification of Cryptographic Signatures
#' This Codebook automatically verifies the SHA3-512 cryptographic signatures (\enquote{hashes}) of all ZIP archives during its compilation. SHA3-512 hashes are calculated via system call to the OpenSSL library on Linux systems.
#'
#' A successful check is indicated by \enquote{Signature verified!}. A failed check will print the line \enquote{ERROR!}
#+ echo = TRUE
# Function: Test SHA3-Hashes
sha3test <- function(filename, sig){
    sig.new <- system2("openssl",
                       paste("sha3-512", filename),
                       stdout = TRUE)
    sig.new <- gsub("^.*\\= ", "", sig.new)
    if (sig == sig.new){
        return("Signature verified!")
    }else{
        return("ERROR!")
    }
}
# Import Original Signatures
input <- fread(hashfile)
filename <- input$filename
sha3.512 <- input$sha3.512
# Verify Signatures
sha3.512.result <- mcmapply(sha3test, filename, sha3.512, USE.NAMES = FALSE)
# Print Results
testresult <- data.table(filename, sha3.512.result)
kable(testresult,
      format = "latex",
      align = c("l", "r"),
      booktabs = TRUE,
      col.names = c("File",
                    "Result"))
#' \newpage
#+ results = "asis"
cat(readLines("CHANGELOG.md"),
    sep = "\n")
#+
#'# Strict Replication Parameters
system2("openssl", "version", stdout = TRUE)
sessionInfo()
#'\newpage
#+
#'# References