forked from w3c/rdf-canon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.html
3937 lines (3635 loc) · 186 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type" />
<title>RDF Dataset Canonicalization</title>
<script class="remove" src="https://www.w3.org/Tools/respec/respec-w3c"></script>
<script src="common/common.js" class="remove" defer></script>
<script class="remove" src="common/biblio.js"></script>
<script class="remove">
//<![CDATA[
// ReSpec configuration for this specification.
// See https://respec.org/docs/ for the meaning of each option.
var respecConfig = {
// Local bibliography entries, loaded from common/biblio.js.
localBiblio: rch.localBiblio,
specStatus: "ED",
copyrightStart: "2010",
// the specification's short name, as in http://www.w3.org/TR/short-name/
shortName: "rdf-canon",
subtitle: "A Standard RDF Dataset Canonicalization Algorithm",
// if you wish the publication date to be other than today, set this
// publishDate: "2009-08-06",
github: "https://github.com/w3c/rdf-canon/",
// NOTE(review): a duplicate edDraftURI key (without the trailing slash)
// was removed; in a JS object literal the later key silently overrides
// the earlier one, so this value is the one that was taking effect.
edDraftURI: "https://w3c.github.io/rdf-canon/spec/",
// editors, add as many as you like
// only "name" is required
editors: [
  { name: "Dave Longley",
    url: "https://github.com/dlongley",
    w3cid: "48025",
    company: "Digital Bazaar",
    companyURL: "https://digitalbazaar.com/" },
  { name: "Gregg Kellogg",
    url: "https://greggkellogg.net/",
    w3cid: "44770" },
  { name: "Dan Yamamoto",
    url: "https://github.com/yamdan",
    w3cid: "139477" }
],
// formerEditors, add as many as you like
// only "name" is required
formerEditors: [
  { name: "Manu Sporny",
    url: "https://www.linkedin.com/in/manusporny/",
    w3cid: "41758",
    company: "Digital Bazaar",
    companyURL: "https://digitalbazaar.com/",
    note: "CG Report" }
],
// authors, add as many as you like.
// This is optional, uncomment if you have authors as well as editors.
// only "name" is required. Same format as editors.
authors: [
  { name: "Dave Longley",
    url: "https://github.com/dlongley",
    w3cid: "48025",
    company: "Digital Bazaar",
    companyURL: "https://digitalbazaar.com/" }
],
// name of the W3C group responsible for this specification
group: "rch",
crEnd: "2023-11-28",
testSuiteURI: "https://w3c.github.io/rdf-canon/tests/",
implementationReportURI: "https://w3c.github.io/rdf-canon/reports/",
// Specifications whose terms may be cross-referenced via data-cite/xref.
xref: ["infra"],
doJsonLd: true,
wgPublicList: "public-rch-wg",
lint: { "informative-dfn": false }
};
//]]>
</script>
<style>
/* Inline highlighting helpers used within examples and algorithm text. */
.hl-bold { font-weight: bold; color: #0a3; }
.comment { color: #999; }
/* Collapse borders and pad cells so spec tables render as a simple grid. */
table, thead, tr, td {
padding: 5px;
border-width: 1px;
border-spacing: 0px;
border-style: solid;
border-collapse: collapse;
}
/* Tables comparing hash-related data: full width, caption below. */
table.hash-related {
caption-side: bottom;
width: 100%;
}
table.hash-related > thead > tr> th {
text-align: center;
}
/* Number algorithm steps as "1)", "1.1)", etc. via CSS counters
   instead of the default ordered-list markers. */
.algorithm ol {
counter-reset: numsection;
list-style-type: none;
}
.algorithm ol>li {
margin: 0.5em 0;
}
.algorithm ol>li:before {
font-weight: bold;
counter-increment: numsection;
content: counters(numsection, ".") ") ";
}
/* Links to definitions in other specifications. */
a.externalDFN {border-bottom: 1px solid #99c; font-style: italic;}
code { color: #c63501; }
/* Collapsible detail panels (native <details> or class-based fallback). */
details, .details {
background-color: rgb(245,245,245);
border-left: 0.3em solid rgb(200,200,200);
padding: 0.3em;
}
summary, .details > .summary {font-size: small; }
/* Preformatted output of the algorithm's logging examples. */
pre.logging {
border: thin solid #88AA88;
background-color: #E8F0E8;
margin: 1em 4em 1em 0em;
padding: 0.3em;
}
/* Layout tweaks for generated log tables; .c0/.c1 target specific columns. */
table.log ul { padding: 0; margin: 0; }
table.log td { text-align: left; vertical-align: top;}
table.log td.ca.c0 { display: none; }
table.log td.bn_to_quads.c0 { display: none; }
table.log td.identifier.c1 { font-style: italic; }
table.log li { list-style: none; }
</style>
</head>
<body>
<section id="abstract">
<p>RDF [[RDF11-CONCEPTS]] describes a graph-based data model for making claims
about the world and provides the foundation for reasoning upon that graph
of information. At times, it becomes necessary to compare the differences
between sets of graphs, digitally sign them, or generate short identifiers
for graphs via hashing algorithms. This document outlines an algorithm for
normalizing <a>RDF datasets</a> such that these operations can be
performed.</p>
</section>
<section id="sotd">
<p>This document describes the <a>RDFC-1.0</a> algorithm for canonicalizing
RDF datasets, which was the input from the
<a href="https://www.w3.org/community/credentials/">W3C Credentials Community Group</a>
published as [[CCG-RDC-FINAL]].</p>
<p>At the time of publication, [[RDF11-CONCEPTS]] is the most recent recommendation
defining <a>RDF datasets</a> and [[N-QUADS]],
however work on an updated specification
is ongoing within the <a href="https://www.w3.org/groups/wg/rdf-star">W3C RDF-star Working Group</a>.
Some dependencies from relevant updated specifications are provided
normatively in this specification with the expectation
that a future update to this specification will replace those with normative
references to updated RDF specifications.</p>
<p>To successfully advance to Proposed Recommendation,
each test in the <a href="https://w3c.github.io/rdf-canon/tests/">test suite</a>
MUST be passed by at least three independent implementations.
Two independent implementations MUST pass all tests.</p>
</section>
<section id="introduction" class="informative">
<h2>Introduction</h2>
<p>When data scientists discuss canonicalization,
they do so in the context of achieving a particular set of goals.
Since the same information may sometimes be expressed in a variety of different ways,
it often becomes necessary to transform each of these
different ways into a single, standard representation.
With a standard representation, the differences between
two different sets of data can be easily determined,
a cryptographically-strong hash identifier can be generated for a particular
set of data,
and a particular set of data may be digitally-signed for later
verification.</p>
<p>In particular, this specification is about normalizing
<a>RDF datasets</a>, which are collections of graphs. Since
a directed graph can express the same information in more than one
way, it requires canonicalization to achieve the aforementioned goals
and any others that may arise via serendipity.</p>
<p>Most <a>RDF datasets</a> can be canonicalized fairly quickly, in terms
of algorithmic time complexity. However, those that contain nodes that do
not have globally unique identifiers pose a greater challenge. Normalizing
these datasets presents the <dfn>graph isomorphism</dfn> problem, a
problem that is believed to be difficult to solve quickly in the worst
case. Fortunately, existing real world data is rarely, if ever, modeled in
a way that manifests as the worst case and new data can be modeled to avoid
it. In fact, software systems that detect a problematic dataset
(see <a href="#dataset-poisoning" class="sectionRef"></a>) can choose
to assume it's an attempted denial of service attack, rather than a
real input, and abort.</p>
<p>This document outlines an algorithm for generating a canonical
serialization of an <a>RDF dataset</a> given an <a>RDF dataset</a> as input.
The algorithm is called the
<strong>RDF Canonicalization algorithm version 1.0</strong> or
<a>RDFC-1.0</a>.</p>
<div class="note">
<p>[[[RDF11-CONCEPTS]]] [[RDF11-CONCEPTS]] lacks clarity on the representation of
<a data-cite="RDF11-CONCEPTS#dfn-language-tagged-string">language-tagged strings</a>,
where <a data-cite="RDF11-CONCEPTS#dfn-language-tag">language tags</a> of the form `xx-YY`
are treated as being case insensitive. Implementations might represent language tags
using all lower case in the form `xx-yy`,
retain the original representation `xx-YY`,
or use [[BCP47]] <a data-cite="BCP47#section-2.1.1">formatting conventions</a>,
leading to different canonical forms, and therefore, different hashed values.</p>
<ul>
<li>The Canonicalization algorithm is based on the RDF 1.1 definition,
in the sense that the <a data-cite="RDF11-CONCEPTS#dfn-language-tag">language tag</a> `xx-YY`
is case insensitive, which might lead to different canonicalizations if the user is not aware of this problem.</li>
<li>User communities ought to agree to use lower case
<a data-cite="RDF11-CONCEPTS#dfn-language-tag">language tags</a>,
while being aware that some implementations might normalize language tags,
affecting hash values.</li>
<li>Future evolution of RDF might regulate this issue, which RDF environments might have to adapt to,
and this might lead to an update of <a>RDFC-1.0</a>.</li>
</ul>
</div>
<p class="note">See <a href="#urdna2015" class="sectionRef"></a>
for a comparison with the version of the algorithm published
in [[[CCG-RDC-FINAL]]] [[CCG-RDC-FINAL]].</p>
<section id="intro-uses">
<h2>Uses of Dataset Canonicalization</h2>
<p>There are different use cases where graph or dataset canonicalization are important:</p>
<ul>
<li>Determining if one serialization is isomorphic to another.</li>
<li>Digital signing of graphs (datasets) independent of serialization or format.</li>
<li>Comparing two graphs (datasets) to find differences.</li>
<li>Communicating change sets when remotely updating an <a data-cite="RDF11-CONCEPTS#dfn-rdf-source">RDF source</a>.</li>
</ul>
<p>A canonicalization algorithm is necessary, but not necessarily sufficient, to handle many of these use cases. The use of <a>blank nodes</a> in RDF graphs and datasets has a long history and creates inevitable complexities. Blank nodes are used for different purposes:</p>
<ul>
<li>when a well known identifier for a node is not known, or the author of a document chooses not to unambiguously name that node,</li>
<li>when a node is used to stitch together parts of a graph and the nodes themselves are not interesting (e.g., <a data-cite="RDF11-MT#rdf-collections">RDF Collections</a> in [[RDF11-MT]]),</li>
<li>when someone is trying to create an intentionally difficult graph topology.</li>
</ul>
<p>Furthermore,
RDF semantics dictate that deserializing an RDF document
results in the creation of unique <a>blank nodes</a>,
unless it can be determined that on each occasion,
the <a>blank node</a> identifies the same resource.
This is due to the fact that <a>blank node identifiers</a>
are an aspect of a concrete RDF syntax
and are not intended to be persistent or portable.
Within the abstract RDF model,
blank nodes do not have identifiers
(although some
RDF store
implementations may use stable identifiers and may choose to make them portable).
See <a data-cite="RDF11-CONCEPTS#section-blank-nodes">Blank Nodes</a>
in [[RDF11-CONCEPTS]] for more information.</p>
<p>RDF does have a provision for allowing blank nodes
to be published in an externally identifiable way through the use of
<a data-cite="RDF11-CONCEPTS#dfn-skolem-iri">Skolem IRIs</a>,
which allow a given RDF store to replace the use of blank nodes
in a concrete syntax with <a>IRIs</a>,
which then serve to repeatably identify that blank node within that particular RDF store;
however, this is not generally useful for talking about the
same graph in different RDF stores,
or other concrete representations.
In any case, a stable <a>blank node identifier</a> defined for one
RDF store or serialization is arbitrary,
and typically not relatable to the context within which it is used.</p>
<p>This specification defines an algorithm for creating stable
<a>blank node identifiers</a> repeatably for different serializations
possibly using individualized <a>blank node identifiers</a>
of the same RDF graph (dataset) by grounding each <a>blank node</a>
through the nodes to which it is connected.
As a result, a graph signature can be obtained by hashing a canonical serialization
of the resulting <a>canonicalized dataset</a>,
allowing for the isomorphism and digital signing use cases.
This specification does not define such a graph signature.</p>
<p>As blank node identifiers can be stable even with other changes to a graph (dataset),
in some cases it is possible to compute the difference between two graphs (datasets),
for example if changes are made only to ground triples,
or if new blank nodes are introduced which do not create an automorphic confusion
with other existing blank nodes.
If any information changes that would alter the generated blank node identifiers,
a resulting diff might indicate a greater set of changes than actually exists.
Additionally, if the starting dataset is an N-Quads document,
it may be possible to correlate the original blank node identifiers
used within that N-Quads document with those issued in the
<a>canonicalized dataset</a>.</p>
<p class="note">Although alternative <a>hash algorithms</a> might be used
with this specification,
applications ought to carefully weigh the advantages
and disadvantages of using an alternative hash function.
This is the case, in particular, for any representation of the <a>canonical n-quads form</a>
or <a>issued identifiers map</a>
that does not identify the associated hash algorithm. Any use case
that requires reproduction of the same output is expected to
unequivocally express or communicate the internal
hash algorithm that was used when generating
the <a>canonical n-quads form</a>.
</p>
</section>
<section id="how-to-read">
<h2>How to Read this Document</h2>
<p>This document is a detailed specification for an <a>RDF dataset</a>
canonicalization algorithm. The document is primarily intended for the
following audiences:</p>
<ul>
<li>Software developers that want to implement an <a>RDF dataset</a>
canonicalization algorithm.</li>
<li>Masochists.</li>
</ul>
<p>To understand the basics in this specification you must be familiar with
basic RDF concepts [[RDF11-CONCEPTS]]. A working knowledge of
<a href="https://en.wikipedia.org/wiki/Graph_theory">graph theory</a> and
<a href="https://en.wikipedia.org/wiki/Graph_isomorphism">graph isomorphism</a>
is also recommended.</p>
</section>
<section id="typo-conventions" class="informative">
<h2>Typographical conventions</h2>
<div data-include="common/typographical-conventions.html"></div>
</section>
</section>
<section id="conformance">
<p>A conforming processor is a system which can generate
the <a>canonical n-quads form</a> of an <a>input dataset</a>
consistent with the algorithms defined in this specification.</p>
<p>The algorithms in this specification are normative,
because to consistently reproduce the same canonical identifiers,
implementations MUST strictly conform to the steps outlined in these algorithms.</p>
<p class="note">Implementers can partially check their level of conformance with
this specification by successfully passing the test cases of the
<a href="https://w3c.github.io/rdf-canon/tests/">RDF Dataset Canonicalization test suite</a>.
Note, however, that passing all the tests in the test
suite does not imply complete conformance to this specification. It only implies
that the implementation conforms to the aspects tested by the test suite.</p>
</section>
<section id="terminology" class="normative">
<h2>Terminology</h2>
<section id="canon-terms">
<h3>Terms defined by this specification</h3>
<dl data-sort="ascending">
<dt><dfn data-lt="input dataset|input datasets">input dataset</dfn></dt>
<dd>The abstract <a>RDF dataset</a> that is provided as input to
the algorithm.</dd>
<dt><dfn>input blank node identifier map</dfn></dt>
<dd>Records any blank node identifiers already assigned to the
<a>input dataset</a>.
If the <a>input dataset</a> is provided as an N-Quads document,
the <a>map</a> relates blank nodes in the abstract <a>input dataset</a>
to the blank node identifiers used within the N-Quads document,
otherwise, identifiers are assigned arbitrarily for
each blank node in the input dataset not previously identified.
<div class="note">Implementations or environments might deal with blank
node identifiers more directly; for example, some implementations might
retain blank node identifiers in the parsed or abstract dataset. Implementations
are expected to reuse these to enable usable mappings between input blank node
identifiers and output blank node identifiers outside of the algorithm.</div>
</dd>
<dt><dfn>canonicalized dataset</dfn></dt>
<dd>A <a>canonicalized dataset</a> is the combination of the following:
<ul>
<li>an <a>RDF dataset</a> —
the <a>input dataset</a>,</li>
<li>the <a>input blank node identifier map</a> —
mapping <a>blank nodes</a> in the input dataset to <a>blank node identifiers</a>, and</li>
<li>the <a>issued identifiers map</a> from the <a>canonical issuer</a> —
mapping identifiers in the input dataset to canonical identifiers</li>
</ul>
A concrete serialization of a <a>canonicalized dataset</a> MUST label
all <a>blank nodes</a> using the canonical <a>blank node identifiers</a>.
</dd>
<dt><dfn>identifier issuer</dfn></dt>
<dd>An identifier issuer is used to issue new <a>blank node identifiers</a>. It
maintains a
<a href="#bn-issuer-state">blank node identifier issuer state</a>.</dd>
<dt><dfn>hash</dfn></dt>
<dd>The lowercase, hexadecimal representation of a message digest.</dd>
<dt><dfn>hash algorithm</dfn></dt>
<dd>The default hash algorithm used by <a>RDFC-1.0</a>, namely, SHA-256 [[FIPS-180-4]].
<p>Implementations MUST support a parameter to define the <a>hash algorithm</a>,
MUST support SHA-256 and SHA-384 [[FIPS-180-4]],
and SHOULD support the ability to specify other hash algorithms.
Using a different hash algorithm will generally result in different output than
using the default.</p>
<p class="note">There is no expectation that the default hash algorithm
will also be used by any application creating a hash digest of the
canonical N-Quads result.</p>
</dd>
<dt><dfn>mention</dfn></dt>
<dd>
A node is mentioned in a <a>quad</a>
if it is a component of that quad,
as a <a>subject</a>, <a>predicate</a>, <a>object</a>, or <a>graph name</a>.</dd>
<dt><dfn>mention set</dfn></dt>
<dd>The set of all <a>quads</a> in a <a>dataset</a>
that <a>mention</a> a node <var>n</var> is called the <a>mention set</a> of <var>n</var>,
denoted <var>Q<sub>n</sub></var>.</dd>
<dt><dfn>gossip path</dfn></dt>
<dd>A particular enumeration of every incident <a>mention</a> emanating
from a blank node. This recursively includes transitively related
<a>mentions</a> until any named node or blank node already labeled by
a particular <a>identifier issuer</a> is reached. Gossip paths are
encoded and operated on in the <a>RDFC-1.0</a> algorithm as strings. (See
<a href="#hash-nd-quads" class="sectionRef"></a> for more information
on the construction of gossip paths.)
</dd>
<dt><dfn data-lt="canonical n-quads" class="no-export">canonical n-quads form</dfn></dt>
<dd>
The canonicalized representation of a quad is defined in <a href="#canonical-quads" class="sectionRef"></a>.
A quad in <a>canonical n-quads form</a> represents a <a>graph name</a>, if present, in the same manner as
a <a>subject</a>, and each quad is terminated with a single <code>LF</code> (line feed, code point <code class="codepoint">U+000A</code>).
</dd>
<dt><dfn data-lt="quad|quads" class="no-export">quad</dfn></dt>
<dd>A tuple composed of <a>subject</a>, <a>predicate</a>, <a>object</a>, and <a>graph name</a>.
This is a generalization of an <a>RDF triple</a> along with a <a>graph name</a>.
</dd>
<dt><dfn class="export">canonicalization function</dfn></dt>
<dd>A <a>canonicalization function</a> maps <a>RDF datasets</a>
into <a data-cite="RDF11-CONCEPTS#dfn-dataset-isomorphism">isomorphic datasets</a> [[!RDF11-CONCEPTS]].
Two datasets produce the same canonical result if and only if they are <a data-cite="RDF11-CONCEPTS#dfn-dataset-isomorphism">isomorphic</a>.
The <a>RDFC-1.0</a> algorithm implements a <a>canonicalization function</a>.
Some datasets may be constructed to prevent this algorithm from
terminating in a reasonable amount of time (see <a href="#dataset-poisoning" class="sectionRef"></a>),
in which case the algorithm can be considered to be
a <dfn class="export">partial canonicalization function</dfn>.
</dd>
</dl>
</section>
<section id="terms-reference">
<h3>Terms defined by cited specifications</h3>
<dl data-sort="ascending">
<dt><dfn data-cite="INFRA#string">string</dfn></dt><dd>
A string is a sequence of zero or more Unicode characters.</dd>
<dt><code>true</code> and <code>false</code></dt><dd>
Values that are used to express one of two possible <a data-cite="INFRA#boolean">boolean</a> states.</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-iri" data-lt="internationalized resource identifier|iri|iris"><abbr title="Internationalized Resource Identifier">IRI</abbr></dfn></dt>
<dd>An <a>IRI</a> (Internationalized Resource Identifier) is a string that conforms to the syntax
defined in [[RFC3987]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-subject" data-lt="subject|subjects">subject</dfn></dt>
<dd>A <a>subject</a>
as specified by [[!RDF11-CONCEPTS]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-predicate" data-lt="predicate|predicates">predicate</dfn></dt>
<dd>A <a>predicate</a>
as specified by [[!RDF11-CONCEPTS]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-object" data-lt="object|objects">object</dfn></dt>
<dd>An <a>object</a>
as specified by [[!RDF11-CONCEPTS]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-rdf-triple" data-lt="rdf triple|rdf triples|triple|triples">RDF triple</dfn></dt>
<dd>A <a>triple</a>
as specified by [[!RDF11-CONCEPTS]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-rdf-graph" data-lt="rdf graph|rdf graphs|graph|graphs">RDF graph</dfn></dt>
<dd>An <a>RDF graph</a>
as specified by [[!RDF11-CONCEPTS]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-graph-name" data-lt="graph name|graph names">graph name</dfn></dt>
<dd>A <a>graph name</a>
as specified by [[!RDF11-CONCEPTS]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-default-graph">default graph</dfn></dt>
<dd>The <a>default graph</a>
as specified by [[!RDF11-CONCEPTS]].</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-rdf-dataset" data-lt="rdf dataset|rdf datasets|dataset|datasets">RDF dataset</dfn></dt>
<dd>A <a>dataset</a>
as specified by [[!RDF11-CONCEPTS]].
For the purposes of this specification, an <a>RDF dataset</a>
is considered to be a set of <a>quads</a>.</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-blank-node">blank node</dfn></dt>
<dd>A <a>blank node</a>
as specified by [[!RDF11-CONCEPTS]]. In short, it is a node in a graph that is
neither an <a>IRI</a>, nor a
<a data-cite="RDF11-CONCEPTS#dfn-literal">literal</a>.</dd>
<dt><dfn data-cite="RDF11-CONCEPTS#dfn-blank-node-identifier">blank node identifier</dfn></dt>
<dd>A <a>blank node identifier</a>
as specified by [[!RDF11-CONCEPTS]]. In short, it is a <a>string</a> that begins
with <code>_:</code> that is used as an identifier for a
<a>blank node</a>. <a>Blank node identifiers</a>
are typically implementation-specific local identifiers; this document
specifies an algorithm for deterministically specifying them.</dd>
<dd>
Concrete syntaxes, like [[Turtle]] or [[N-Quads]], prepend blank node identifiers with the <code>_:</code> string
to differentiate them from other nodes in the graph. This affects the
canonicalization algorithm, which is based on calculating a hash over the representations of <a>quads</a> in this format.
</dd>
<dt><dfn data-cite="XPATH-FUNCTIONS#codepoint-collation" data-lt="code point order|code point ordered|unicode codepoint collation">Unicode code point order</dfn></dt>
<dd>This refers to determining the order of two Unicode strings (`A` and `B`),
using <a>Unicode Codepoint Collation</a>,
as defined in [[XPATH-FUNCTIONS]],
which defines a
<a href="https://en.wikipedia.org/wiki/Total_order">total ordering</a>
of <a>strings</a> comparing code points.
Note that for UTF-8 encoded strings, comparing the byte sequences gives the same result as <a>code point order</a>.
</dd>
</dl>
</section>
</section>
<section id="canonicalization">
<h2>Canonicalization</h2>
<p>Canonicalization is the process of transforming an
<a>input dataset</a> to its <a>serialized canonical form</a>.
That is, any two <a>input datasets</a> that contain the same information,
regardless of their arrangement,
will be transformed into the same <a>serialized canonical form</a>.
The problem requires directed
graphs to be deterministically ordered into sets of nodes and edges. This
is easy to do when all of the nodes have globally-unique identifiers, but
can be difficult to do when some of the nodes do not. Any nodes without
globally-unique identifiers must be issued deterministic identifiers.</p>
<p class="note">
This specification defines a <a>canonicalized dataset</a> to include stable identifiers for blank nodes,
practical uses of which will always generate a canonical serialization of such a dataset.</p>
<p>In time, there may be more than one canonicalization algorithm and,
therefore, for identification purposes, this algorithm is named the
"RDF Canonicalization algorithm version 1.0"
(<abbr title="RDF Canonicalization algorithm version 1.0"><dfn class="export">RDFC-1.0</dfn></abbr>).</p>
<p><a href="#fig-ca-overview"></a> provides an overview of <a>RDFC-1.0</a>,
with steps 1 through 7 corresponding to the various steps described in
<a href="#canon-algo-algo"></a>.</p>
<figure id="fig-ca-overview" style="text-align:center">
<!-- Source for this file is at https://docs.google.com/presentation/d/1iA2m9SZ1XtD0zorfWhi1_rZjvqrxYCnSs3kQR_zyUFk -->
<object data="ca-overview.svg" style="width: 100%" type="image/svg+xml" aria-describedby="fig-ca-overview-alt">
<p id="fig-ca-overview-alt">
The image represents an overview of the <a>RDFC-1.0</a> algorithm.
The Input Document is deserialized into the Input Dataset
and Input Blank Node Identifier Map.
Canonicalization steps 1-6 are executed resulting in
the Canonicalized Dataset including the
Input Blank Node Identifier Map and Issued Identifiers Map.
Step 7 of the Canonicalization algorithm creates
the canonical n-quads form of the Canonicalized Dataset.</p>
</object>
<figcaption>An illustrated overview of the <a>RDFC-1.0</a> algorithm.<br/>
Image available in
<a href="ca-overview.svg" title="SVG image of overview">
<abbr title="Scalable Vector Graphics">SVG</abbr>
</a>.</figcaption>
</figure>
<section id="canon-overview" class="informative">
<h3>Overview</h3>
<p>To determine a canonical labeling, <a>RDFC-1.0</a> considers the
information connected to each blank node.
Nodes with unique first degree information can immediately be issued a canonical identifier
via the <a href="#issue-identifier">Issue Identifier algorithm</a>.
When a node has non-unique first degree information,
it is necessary to determine all information that is transitively connected
to it throughout the entire dataset.
<a href="#hash-1d-quads" class="sectionRef"></a> defines a
node’s first degree information via its first degree hash. </p>
<p>Hashes are computed from the information of each blank node.
These hashes encode the <a>mentions</a> incident to each blank node.
The <a>hash</a> of a string <var>s</var>, is the lower-case,
hexadecimal representation of the result of passing <var>s</var>
through a cryptographic hash function.
By default, <a>RDFC-1.0</a> uses the SHA-256 hash algorithm [[FIPS-180-4]].</p>
<p class="note">The "degree" terminology is used within this specification
as a colloquial way of describing
the <a href="https://en.wikipedia.org/wiki/Distance_(graph_theory)#Related_concepts">eccentricity</a> or
<a href="https://en.wikipedia.org/wiki/Distance_(graph_theory)#Related_concepts">radius</a>
of any two nodes within a dataset.
This concept is also related to "degrees of separation",
as in, for example, "<a href="https://en.wikipedia.org/wiki/Six_degrees_of_separation">six degrees of separation</a>".
Nodes with unique first degree information can be considered nodes with a radius of one.</p>
</section>
<section id="canon-state">
<h2>Canonicalization State</h2>
<p>When performing the steps required by the canonicalization algorithm,
it is helpful to track state in a data structure called the
<dfn>canonicalization state</dfn>. The information contained in the
<a>canonicalization state</a> is described below.</p>
<dl>
<dt><dfn>blank node to quads map</dfn></dt>
<dd>A <a>map</a> that relates a <a>blank node identifier</a> to
the <a>quads</a> in which they appear in the
<a>input dataset</a>.</dd>
<dt><dfn>hash to blank nodes map</dfn></dt>
<dd>A <a>map</a> that relates a <a>hash</a> to a
<a>list</a> of
<a>blank node identifiers</a>.</dd>
<dt><dfn>canonical issuer</dfn></dt>
<dd>An <a>identifier issuer</a>, initialized with the
prefix <code>c14n</code> (short for canonicalization), for issuing canonical
<a>blank node identifiers</a>.
<div class="note">
Mapping all <a>blank nodes</a> to use this
identifier scheme means that an <a>RDF dataset</a> composed of two
different <a>RDF graphs</a> will issue different
identifiers than that for the graphs taken independently. This may
happen anyway, due to <a
href="https://en.wikipedia.org/wiki/Automorphism">automorphisms</a>,
or overlapping statements, but an identifier based on the resulting
<a>hash</a> along with an issue sequence number specific to that <a>hash</a> would
stand a better chance of surviving such minor changes, and allow the
resulting information to be useful for <a href="https://www.w3.org/2001/sw/wiki/How_to_diff_RDF">RDF Diff</a>.
</div>
</dd>
</dl>
</section>
<section id="bn-issuer-state">
<h2>Blank Node Identifier Issuer State</h2>
<p>The canonicalization algorithm issues identifiers to <a>blank nodes</a>.
The <a href="#issue-identifier">Issue Identifier algorithm</a> uses an
<a>identifier issuer</a> to accomplish this task.
The information an <a>identifier issuer</a> needs to keep track of is described
below.</p>
<dl>
<dt><dfn>identifier prefix</dfn></dt>
<dd>The identifier prefix is a string that is used at the beginning of a
<a>blank node identifier</a>. It should be initialized to a
string that is specified by the canonicalization algorithm. When
generating a new <a>blank node identifier</a>, the prefix
is concatenated with a <a>identifier counter</a>. For example,
<code>c14n</code> is a proper initial value for the
<a>identifier prefix</a> that would produce
<a>blank node identifiers</a> like <code>c14n1</code>.</dd>
<dt><dfn>identifier counter</dfn></dt>
<dd>A counter that is appended to the <a>identifier prefix</a> to
create a <a>blank node identifier</a>. It is initialized to
<code>0</code>.</dd>
<dt><dfn>issued identifiers map</dfn></dt>
<dd>An <a>ordered map</a> that relates <a>blank node identifiers</a> to issued identifiers,
to prevent issuance of more than one new identifier per existing identifier,
and to allow <a>blank nodes</a> to
be assigned identifiers some time after issuance.</dd>
</dl>
</section>
<section id="canon-algorithm" class="algorithm">
<h2>Canonicalization Algorithm</h2>
<p>The canonicalization algorithm converts an <a>input dataset</a>
into a <a>canonicalized dataset</a> or raises an error if
the input dataset is determined to be overly complex.
This algorithm will assign
deterministic identifiers to any <a>blank nodes</a> in the
<a>input dataset</a>.</p>
<section id="canon-algo-overview" class="informative">
<h3>Overview</h3>
<p><a>RDFC-1.0</a> canonically labels an <a>RDF dataset</a>
by assigning each <a>blank node</a> a canonical identifier.
In RDFC-1.0, an RDF dataset <var>D</var>
is represented as a set of <a>quads</a> of the form `< s, p, o, g >`
where the graph component `g` is empty if and only if the
<a data-lt="RDF triple">triple</a> `< s, p, o >` is in the <a>default graph</a>.
It is expected that, for two RDF datasets,
<a>RDFC-1.0</a> returns the same canonically labeled list of quads
if and only if the two datasets are isomorphic (i.e., the same modulo blank node identifiers).
</p>
<p><a>RDFC-1.0</a> consists of several sub-algorithms.
These sub-algorithms are introduced in the following sub-sections.
First, we give a high level summary of RDFC-1.0.</p>
<ol>
<li id="ca-hl.1"><strong>Initialization</strong>.
Initialize the state needed for the rest of the algorithm
using <a href="#canon-state" class="sectionRef"></a>.
Also initialize the <a>canonicalized dataset</a> using the <a>input dataset</a>
            (which remains immutable),
the <a>input blank node identifier map</a>
(retaining blank node identifiers from the input if possible, otherwise assigning them arbitrarily);
the <a>issued identifiers map</a> from the <a>canonical issuer</a> is added upon completion of the algorithm.</li>
<li id="ca-hl.2"><strong>Compute first degree hashes</strong>.
Compute the first degree hash for each blank node in the dataset using <a href="#hash-1d-quads" class="sectionRef"></a>.</li>
<li id="ca-hl.3"><strong>Canonically label unique nodes</strong>.
Assign canonical identifiers via <a href="#issue-identifier" class="sectionRef"></a>,
in <a>Unicode code point order</a>, to each blank node whose first degree hash is unique.</li>
<li id="ca-hl.4"><strong>Compute N-degree hashes for non-unique nodes</strong>.
For each repeated first degree hash (proceeding in <a>Unicode code point order</a>),
compute the N-degree hash via <a href="#hash-nd-quads" class="sectionRef"></a>
of every unlabeled blank node that corresponds to the given repeated hash.</li>
<li id="ca-hl.5"><strong>Canonically label remaining nodes</strong>.
In <a>Unicode code point order</a> of the N-degree hashes,
issue canonical identifiers to each corresponding blank node using
<a href="#issue-identifier" class="sectionRef"></a>.
If more than one node produces the same N-degree hash,
the order in which these nodes receive a canonical identifier does not matter.</li>
<li id="ca-hl.6"><strong>Finish</strong>.
Return the <a>serialized canonical form</a> of the <a>canonicalized dataset</a>.
Alternatively, return the <a>canonicalized dataset</a> containing
the <a>input blank node identifier map</a> and <a>issued identifiers map</a>.</li>
</ol>
</section>
<section id="canon-algo-examples" class="informative">
<h3>Examples</h3>
<aside id="ex-ca-unique-hashes"
class="example"
title="Unique hashes">
<p>This example illustrates the <a href="#canon-algo-algo">Canonicalization Algorithm</a>
where all blank nodes have unique first-degree hashes.</p>
<figure id="fig-ca-unique-hashes" style="text-align:center">
<!-- Source for this file is at https://docs.google.com/drawings/d/1LQ6fp4a35lrEOKRMD20gcncUkoitX0eW8QI_U2mQnAs -->
<object data="unique-hashes.svg" style="width: 75%" type="image/svg+xml" aria-describedby="fig-ca-unique-hashes-alt">
<p id="fig-ca-unique-hashes-alt">
The image represents the graph described in
<a href="#ex-ca-unique-hashes-input">
the following code block
</a>.</p>
</object>
<figcaption>An illustration of a graph resulting in unique hashes.<br/>
Image available in
<a href="unique-hashes.svg" title="SVG image of unique hashes">
<abbr title="Scalable Vector Graphics">SVG</abbr>
</a>.</figcaption>
</figure>
<pre id="ex-ca-unique-hashes-input"
data-transform="updateExample"
title="Graph with blank nodes resulting in unique hashes">
<!--
:p :q _:e0 .
:p :r _:e1 .
_:e0 :s :u .
_:e1 :t :u .
-->
</pre>
<p><a href="#ca.2">Step 2</a> is called twice, with each blank node (<code><em>e0</em></code> and <code><em>e1</em></code>)
in the <a>input dataset</a>
to populate <a>blank node to quads map</a>:</p>
<table id="ex-ca-unique-bn-quads-map"
class="hash-related numbered">
<caption>Blank node to quads map for unique hashes</caption>
<thead><tr>
<th><var>blank node</var></th>
<th><var>Q</var></th>
</tr></thead>
<tbody>
<tr>
<td><code><em>e0</em></code></td>
<td><code>:p :q <em>e0</em> .</code><br/><code><em>e0</em> :s :u .</code></td>
</tr>
<tr>
<td><code><em>e1</em></code></td>
<td><code>:p :r <em>e1</em> .</code><br/><code><em>e1</em> :t :u .</code></td>
</tr>
</tbody>
</table>
<p><a href="#ca.3">Step 3</a> generates the first-degree
hash for each blank node, which is explored further
in <a href="#ex-1d-unique-hashes"></a>:</p>
<table id="ex-ca-unique-1d-hashes"
class="hash-related numbered">
<caption>Hash to blank nodes map for unique hashes</caption>
<thead><tr>
<th><var>hash</var></th>
<th><var>blank node(s)</var></th>
</tr></thead>
<tbody>
<tr>
<td>`21d1dd5ba21f3dee9d76c0c00c260fa6f5d5d65315099e553026f4828d0dc77a`</td>
<td><code><em>e0</em></code></td>
</tr>
<tr>
<td>`6fa0b9bdb376852b5743ff39ca4cbf7ea14d34966b2828478fbf222e7c764473`</td>
<td><code><em>e1</em></code></td>
</tr>
</tbody>
</table>
<p><a href="#ca.4">Step 4</a> creates canonical identifiers for each
blank node which has a unique hash:</p>
<table id="ex-ca-unique-1d-identifiers"
class="hash-related numbered">
<caption>Canonical identifiers for blank nodes with unique hashes</caption>
<thead><tr>
<th><var>blank node</var></th>
<th><var>canonical identifier</var></th>
</tr></thead>
<tbody>
<tr>
<td><code><em>e0</em></code></td>
<td><code><em>c14n0</em></code></td>
</tr>
<tr>
<td><code><em>e1</em></code></td>
<td><code><em>c14n1</em></code></td>
</tr>
</tbody>
</table>
<p><a href="#ca.5">Step 5</a> has no effect,
as there are no remaining blank nodes without canonical identifiers.</p>
<p><a href="#ca.6">Step 6</a> generates
              the canonicalized dataset by replacing blank node identifiers in the input dataset
              with canonical identifiers:</p>
<pre id="ex-ca-unique-canonicalized-dataset" data-transform="updateExample">
<!--
:p :q _:c14n0 .
:p :r _:c14n1 .
_:c14n0 :s :u .
_:c14n1 :t :u .
-->
</pre>
</aside>
<aside id="ex-ca-shared-hashes"
class="example"
title="Shared hashes">
<p>This example illustrates the <a href="#canon-algo-algo">Canonicalization Algorithm</a>
where hashing the statements <a data-lt="mention">mentioning</a>
            those blank nodes has overlapping results.</p>
<figure id="fig-ca-shared-hashes" style="text-align:center">
<!-- Source for this file is at https://docs.google.com/drawings/d/1HWpz8S-PNaWi02-utF62HAMd_0b-AF_tHNSRRUQILbU -->
<object data="shared-hashes.svg" style="width: 75%" type="image/svg+xml" aria-describedby="fig-ca-shared-hashes-alt">
<p id="fig-ca-shared-hashes-alt">
The image represents the graph described in
<a href="#ex-ca-shared-hashes-input">
the following code block
</a>.</p>
</object>
<figcaption>An illustration of a graph resulting in shared hashes.<br/>
Image available in
<a href="shared-hashes.svg" title="SVG image of shared hashes">
<abbr title="Scalable Vector Graphics">SVG</abbr>
</a>.</figcaption>
</figure>
<pre id="ex-ca-shared-hashes-input"
data-transform="updateExample"
title="Graph with blank nodes resulting in shared hashes">
<!--
:p :q _:e0 .
:p :q _:e1 .
_:e0 :p _:e2 .
_:e1 :p _:e3 .
_:e2 :r _:e3 .
-->
</pre>
<p><a href="#ca.2">Step 2</a> is called four times, with each blank node
(<code><em>e0</em></code>, <code><em>e1</em></code>, <code><em>e2</em></code>, and <code><em>e3</em></code>)
to populate <a>blank node to quads map</a>:</p>
<table id="ex-ca-shared-bn-quads-map"
class="hash-related numbered">
<caption>Blank node to quads map for shared hashes</caption>
<thead><tr>
<th><var>blank node</var></th>
<th><var>Q</var></th>
</tr></thead>
<tbody>
<tr>
<td><code><em>e0</em></code></td>
<td><code>:p :q <em>e0</em> .</code><br/><code><em>e0</em> :p <em>e2</em> .</code></td>
</tr>
<tr>
<td><code><em>e1</em></code></td>
<td><code>:p :q <em>e1</em> .</code><br/><code><em>e1</em> :p <em>e3</em> .</code></td>
</tr>
<tr>
<td><code><em>e2</em></code></td>
<td><code><em>e0</em> :p <em>e2</em> .</code><br/><code><em>e2</em> :r <em>e3</em> .</code></td>
</tr>
<tr>
<td><code><em>e3</em></code></td>
<td><code><em>e1</em> :p <em>e3</em> .</code><br/><code><em>e2</em> :r <em>e3</em> .</code></td>
</tr>
</tbody>
</table>
<p><a href="#ca.3">Step 3</a> generates the first-degree
hash for each blank node, which is explored further
in <a href="#ex-1d-shared-hashes"></a>
(note that the hashes for <code><em>e0</em></code> and <code><em>e1</em></code> are shared):</p>
<table id="ex-ca-shared-1d-hash-to-bn"
class="hash-related numbered">
              <caption>Hash to blank nodes map for shared hashes</caption>
<thead><tr>
<th><var>hash</var></th>
<th><var>blank node(s)</var></th>
</tr></thead>
<tbody>
<tr>
<td>`3b26142829b8887d011d779079a243bd61ab53c3990d550320a17b59ade6ba36`</td>
<td><code><em>e0</em></code>, <code><em>e1</em></code></td>
</tr>
<tr>
<td>`15973d39de079913dac841ac4fa8c4781c0febfba5e83e5c6e250869587f8659`</td>
<td><code><em>e2</em></code></td>
</tr>
<tr>
<td>`7e790a99273eed1dc57e43205d37ce232252c85b26ca4a6ff74ff3b5aea7bccd`</td>
<td><code><em>e3</em></code></td>
</tr>
</tbody>
</table>
<p><a href="#ca.4">Step 4</a> creates canonical identifiers for each
blank node which has a unique hash:</p>
<table id="ex-ca-shared-canon-ids"
class="hash-related numbered">
<caption>Canonical identifiers for blank nodes with shared hashes</caption>
<thead><tr>
<th><var>blank node</var></th>
<th><var>canonical identifier</var></th>
</tr></thead>
<tbody>
<tr>
<td><code><em>e2</em></code></td>
<td><code><em>c14n0</em></code></td>
</tr>
<tr>
<td><code><em>e3</em></code></td>
<td><code><em>c14n1</em></code></td>
</tr>
</tbody>
</table>
<p><a href="#ca.5">Step 5</a> is run on <code><em>e0</em></code> and <code><em>e1</em></code>, separately,
which share the same hash,
and use separate instances of a <var>temporary issuer</var>,
(as explored in <a href="#ex-nd-shared-hashes"></a>)
to create the <var>hash path list</var>
composed of the hash result
and temporary identifier mappings from
<a href="#hash-nd-quads">Hash N-Degree Quads algorithm</a>
for each of these blank nodes:</p>
<table id="ex-ca-shared-result-e0"
class="hash-related numbered">
<caption>Result for <code><em>e0</em></code></caption>
<tbody>
<tr>
<td><var>temporary issuer</var> mappings</td>
<td>
<table class="hash-related">
<thead><tr>
<th><var>original identifier</var></th>
<th><var>temporary identifier</var></th>
</tr></thead>
<tbody>
<tr><td><code><em>e0</em></code></td><td><code><em>b0</em></code></td></tr>
</tbody>
</table>
</td>
</tr>
<tr>
<td><var>hash</var></td>
<td>`fbc300de5afafd97a4b9ee1e72b57754dcdcb7ebb724789ac6a94a5b82a48d30`</td>
</tr>
</tbody>
</table>
<table id="ex-ca-shared-result-e1"
class="hash-related numbered">
<caption>Result for <code><em>e1</em></code></caption>
<tbody>
<tr>
<td><var>temporary issuer</var> mappings</td>
<td>
<table class="hash-related">
<thead><tr>
<th><var>original identifier</var></th>
<th><var>temporary identifier</var></th>
</tr></thead>
<tbody>