-
Notifications
You must be signed in to change notification settings - Fork 18
/
index.html
5653 lines (4356 loc) · 315 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<meta charset="utf-8"/>
<title>Internationalization Best Practices for Spec Developers</title>
<!--
Per the W3C Manual of Style, this document avoids gender-specific pronouns.
In this document, the terms "he", "his", and "her" appear in an example of gender-specific
cultural norms (related to personal name handling in applications), which represents a
well-considered exception to this rule. The word-token "he"
appears elsewhere in this document as a language tag for the Hebrew language.
-->
<script src="make_checklist.js"></script>
<script async src="https://www.w3.org/Tools/respec/respec-w3c" class="remove"></script>
<script class="remove">
// ReSpec configuration for this document. ReSpec reads the global
// `respecConfig` object when it processes the page.
var respecConfig = {
  // specification status (e.g. WD, LCWD, NOTE, etc.). If in doubt use ED.
  specStatus: "ED",
  //publishDate: "2015-10-20",
  //previousPublishDate: "2020-05-29",
  //previousMaturity: "WD",
  noRecTrack: true,
  shortName: "international-specs",
  copyrightStart: "2014",
  edDraftURI: "https://w3c.github.io/bp-i18n-specdev/",
  // editors, add as many as you like
  // only "name" is required
  editors: [
    { name: "Richard Ishida", company: "W3C", w3cid: 3439 },
    { name: "Addison Phillips", w3cid: 33573 }
  ],
  group: "i18n",
  github: "w3c/bp-i18n-specdev",
  maxTocLevel: 3,
  xref: ["i18n-glossary", "webidl"],
  postProcess: [
    /**
     * Inline every stylesheet marked with `data-import`: fetch the CSS text
     * and replace the link element with a style element holding that text.
     *
     * Each link is handled independently. If a fetch fails, or the server
     * answers with a non-2xx status, the original link element is left in
     * place so the page keeps its styling, and a warning is logged. Without
     * that check an HTTP error page would be injected as CSS and the working
     * link would be lost.
     *
     * @returns {Promise<void>} resolves once every marked link was processed
     */
    async function importStyleSheet() {
      const elems = document.querySelectorAll(`link[rel='stylesheet'][data-import]`)
      await Promise.all(
        [...elems].map(async link => {
          try {
            const response = await fetch(link.href)
            if (!response.ok) {
              throw new Error(`HTTP ${response.status}`)
            }
            const text = await response.text()
            const style = document.createElement("style")
            style.textContent = text
            link.replaceWith(style)
          } catch (err) {
            // Best effort: keep the external link as the fallback.
            console.warn(`importStyleSheet: could not inline ${link.href}; keeping the link element`, err)
          }
        })
      )
    }
  ]
};
</script>
<!-- I18N's shared stylesheet -->
<link rel="stylesheet" data-import href="https://w3c.github.io/i18n-drafts/style/respec_2022.css">
<!-- local styles for this document -->
<link rel="stylesheet" href="local.css">
<script>// check for changed fragment ids and route to the new id
var fragid = location.hash
if (fragid !== '') {
  // Retired fragment ids mapped to the ids that replaced them.
  // Declared inside this block so the table does not become a page global.
  const renamedFragments = new Map([
    ['#sec_lang_decl', '#lang_misc'],
    ['#sec_lang_values', '#lang_values'],
    ['#sec_lang_declaration', '#lang_declaration'],
    ['#sec_dir_basic', '#dir_misc'],
    ['#sec_bidi_values', '#bidi_values'],
    ['#sec_bidi_markup', '#bidi_markup'],
    ['#sec_bidi_strings', '#bidi_strings'],
    ['#char_ref_Unicode_char', '#char_ref'],
    ['#sec_resid_basic', '#resid_misc'],
    ['#text_decoration', '#typ_text_decoration'],
    ['#cursive', '#typ_cursive'],
    ['#box_posn', '#typ_box_posn']
  ])
  const currentId = renamedFragments.get(fragid)
  // Unknown fragments are left alone, exactly as an unmatched switch would.
  if (currentId !== undefined) {
    location.hash = currentId
  }
}
</script>
<script>
/**
 * Sort the rows of a table by the text content of one column.
 *
 * The first row of the table is treated as the header and is never moved.
 * All remaining rows are ordered with an `en-US` Intl.Collator and then
 * re-attached, in sorted order, to the parent of the first data row. The sort
 * is stable, so rows with equal keys keep their relative order.
 *
 * A row that has no `td` at `colNum` sorts as if its text were empty, rather
 * than raising a TypeError.
 *
 * @param {string} tableName - id of the table element; nothing happens if no
 *   element has this id
 * @param {number} colNum - zero-based index into each row's `td` descendants
 * @param {boolean} reversed - truthy sorts descending, falsy sorts ascending
 */
function sortTable(tableName, colNum, reversed) {
  var table = document.getElementById(tableName);
  if (!table) return;

  // Skip the header row; fewer than two data rows means nothing to reorder.
  var dataRows = Array.from(table.rows).slice(1);
  if (dataRows.length < 2) return;

  var collator = new Intl.Collator('en-US');

  // Read each sort key once instead of re-querying the DOM on every comparison.
  var keyed = dataRows.map(function (row) {
    var cell = row.getElementsByTagName("TD")[colNum];
    return { row: row, key: cell ? cell.textContent : "" };
  });

  keyed.sort(function (a, b) {
    return reversed
      ? collator.compare(b.key, a.key)
      : collator.compare(a.key, b.key);
  });

  // appendChild moves an attached node, so appending in sorted order
  // rearranges the rows and leaves the header row first.
  var parent = dataRows[0].parentNode;
  keyed.forEach(function (entry) {
    parent.appendChild(entry.row);
  });
} // sortTable
</script>
</head>
<body onload="sortTable('exampleNamesTable', 0, false);">
<div id="abstract">
<p>This document provides a checklist of internationalization-related considerations when developing a specification. Most checklist items point to detailed supporting information in other documents. Where such information does not yet exist, it can be given a temporary home in this document. <strong>The information in this document will change regularly as new content is added and existing content is modified in the light of experience and discussion.</strong></p>
</div>
<div id="sotd">
<p>This document provides advice to specification developers about how to incorporate requirements for international use. What is currently available here is expected to be useful immediately, but is still an early draft and the document is in flux, and will grow over time as knowledge applied in reviews and discussions can be crystallized into guidelines.</p>
</div>
<section id="intro">
<h2>Introduction</h2>
<p>Developers of specifications need advice to ensure that what they produce will work for communities around the globe.</p>
<p>The Internationalization (i18n) WG tries to assist working groups by reviewing specifications and engaging in discussion. Often, however, such interventions come later in the process than would be ideal, or mean that the i18n WG has to repeat the same information for each working group it interacts with.</p>
<p>It would be better if specification developers could access a checklist of best practices, which points to explanations, examples and rationales where developers need it. Developers would then be able to build this knowledge into their work from the earliest stages, and could thereby reduce rework needed when the i18n WG reviews their specification.</p>
<p>This document contains the beginnings of a checklist, and points to locations where you can find explanations, examples and rationales for recommendations made. If there is no such other place, that extra information will be added to this document. It may also be used to develop ideas and organize them.</p>
<p>The guidelines in this document are not intended to be hard and fast requirements. This document will achieve a significant part of its purpose if, where you don't understand the guidelines or disagree with them, you contact the Internationalization WG to discuss what should be done.</p>
<p class="note">In this document, the term <a>natural language</a> is usually used to refer to the portions of a document or protocol intended for human consumption. The term <a>localizable text</a> is used to refer to the natural language content of formal languages, protocol syntaxes and the like, as distinct from <a>syntactic content</a> or <a>user-supplied values</a>. See the [[I18N-GLOSSARY]] for definitions of these and other terms used by the Internationalization Working Group.</p>
<section class="appendix" id="ghChecklist">
<h3>Create a GitHub checklist</h3>
<p>A checklist feature is provided with this page to help you review your spec for internationalization. The results of the review should be posted to a GitHub issue.</p>
<p>Follow these steps for each section that is relevant to your spec:</p>
<ol>
<li>Open the checklist by clicking on "Show the self-review checklist".</li>
<li>For each requirement that is relevant to your spec, click on the first checkbox.</li>
<li>For each requirement that your spec fulfills, click on the second checkbox. (Tip: To save time, clicking on the second checkbox will automatically turn on the first checkbox, too.)</li>
<li>When finished, click on the button "Create markdown for GitHub". This will produce markdown for just the requirements that you indicated were relevant to your spec.</li>
<li>Copy the markdown code to a comment in a GitHub issue where you are capturing the results of your self-review work. If you have already done a review using the short review checklist you should copy the results produced here to other comment fields in that issue. This keeps all the review information together. Note that you'll need to repeat this copy-paste for each of the sections that contain requirements relevant to your spec.</li>
<li>Add clarification notes for the results by editing the markdown in the GitHub issue.</li>
<li>Ensure that your GitHub issue has the i18n-tracker label set, so that the Internationalization WG is aware of your review results.</li>
</ol>
</section>
<section id="I18N_Considerations">
<h3>When and how to write an <em>Internationalization Considerations</em> section in your spec</h3>
<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Ai18n_considerations" target="_blank">See related review comments.</a></p>
<div class="req" id="i18n_considerations_review">
<p class="advisement">All additions of or changes to an <cite>Internationalization Considerations</cite> section MUST be reviewed by the Internationalization (i18n) WG.</p>
</div>
<div class="req" id="i18n_considerations_title">
<p class="advisement">If you create an internationalization considerations section, it MUST have the title <cite>Internationalization Considerations</cite> or <cite>Internationalization (i18n) Considerations</cite>.</p>
</div>
<p>Specifications are not required to include a special section or appendix describing internationalization considerations of their specification. In general, the Internationalization WG instead prefers that information about language, regional, or cultural variation, support, or adaptation appear in the body of the specification, closely associated with the relevant features.</p>
<p>However, there are a few cases in which you might consider providing a section like this. Consider including an internationalization considerations section when:</p>
<ul>
<li>International features require additional explanation that would otherwise interfere with or clutter-up the body of the specification.</li>
<li>You wish to provide examples of features, such as localization, without interfering with the body of the specification. For example, summaries of the general approach taken, or factors that affect the approach taken throughout the document.</li>
<li>There are specific limitations or problems that your specification is unable to address, such as (but not limited to) technology that is evolving but not yet ready for inclusion; limitations discovered during the horizontal review process that you intend to address in future versions; or deliberate design decisions that limit or impact certain languages, groups, or cultures.</li>
<li>You have other information you wish to provide to adopters or implementers that doesn't fit with the remainder of your spec.</li>
</ul>
<p>If you decide to create an Internationalization Considerations section, it will usually be as an appendix. However, the order and placement relative to other parts of your spec or to other appendices is up to you.</p>
<p>If you decide to create an Internationalization Considerations section, you need to mention it in your horizontal review request to the Internationalization WG. The review request template includes a checkbox which allows you to do this easily.</p>
</section>
</section>
<section id="language" class="topic">
<h2>Language</h2>
<div id="language_checklist" class="summaryC"></div>
<section id="lang_misc" class="subtopic">
<h3>Language basics</h3>
<aside class="links" id="links_lang_decl">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-lang-why">Why use the language attribute?</a>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-text-processing-vs-metadata">Types of language declaration</a>. How 'metadata' and 'text-processing' types of language information differ.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/articles/lang-bidi-use-cases/">Use cases for bidi and language metadata on the Web</a>.</p></li>
</ul>
</aside>
<div class="req" id="lang_basics_1">
<p class="advisement">It should be possible to associate a language with any piece of <a>localizable text</a> or <a>natural language</a> content.</p>
<details class="links"><summary>more</summary>
<p><a href="https://www.w3.org/International/questions/qa-lang-why">Why use the language attribute?</a></p>
<p><a href="https://www.w3.org/International/articles/lang-bidi-use-cases/">Use cases for bidi and language metadata on the Web</a></p>
</details>
</div>
<div class="req" id="lang_basics_inline">
<p class="advisement">Where possible, there should be a way to label <a>natural language</a> changes in inline text.</p>
</div>
<p>Text is rendered or processed differently according to the language it is in. For example, screen readers need to be prompted when a language changes, and spell checkers should be language-sensitive. When rendering text, a knowledge of language is needed in order to apply correct fonts, hyphenation, line-breaking, upper/lower case changes, and other features.</p>
<p>For example, ideographic characters such as 雪, 刃, 直, 令, 垔 have slight but important differences when used with Japanese vs Chinese fonts, and it's important not to apply a Chinese font to the Japanese text, and vice versa when it is presented to a user.</p>
<div class="req" id="lang_basics_meta">
<p class="advisement">Consider whether it is useful to express the [=intended linguistic audience=] of a resource, in addition to specifying the language used for <a href="#sec_text_processing_lang">text processing</a>.</p>
<details class="links"><summary>more</summary>
<p><a href="https://www.w3.org/International/questions/qa-text-processing-vs-metadata">Types of language declaration</a></p>
</details>
</div>
<p>Language information for a given resource can be used with two main objectives in mind: for text-processing, or as a statement of the intended use of the resource. We will explain the difference below.</p>
<section id="sec_text_processing_lang">
<h4>Text-processing language information</h4>
<div class="req" id="tp_lang_values">
<p class="advisement">A language declaration that indicates the [=text-processing language=] for a range of text must associate a single language value with a specific range of text.</p>
</div>
<p>When specifying the <a>text-processing language</a> you are declaring the language in which <strong>a specific range of text is actually written</strong>, so that user agents or applications that manipulate the text, such as voice browsers, spell checkers, style processors, hyphenators, etc., can apply the appropriate rules to the text in question. So we are, by necessity, talking about associating a <em>single</em> language with a <em>specific</em> range of text.</p>
<p>It is normal to express a text-processing language as the default language, for processing the resource as a whole, but it may also be necessary to indicate where the language changes within the resource.</p>
<div class="req" id="lang_attribute_xml">
<p class="advisement">Use the HTML <code class="kw" translate="no">lang</code> and XML <code class="kw" translate="no">xml:lang</code> language attributes where appropriate to identify the <a href="#sec_text_processing_lang">text processing language</a>, rather than creating a new attribute or mechanism.</p>
</div>
<p>To identify the text-processing language for a range of text, HTML provides the <code class="kw" translate="no">lang</code> attribute, while XML provides <code class="kw" translate="no">xml:lang</code> which can be used in all XML formats. It's useful to continue using those attributes for relevant markup formats, since authors recognize them, as do HTML and XML processors.</p>
</section>
<section id="sec_lang_meta">
<h4>Language metadata about the resource as a whole</h4>
<p>It may also be useful to describe the language of a resource <strong>as a whole</strong>. This type of language declaration is called the <dfn>intended linguistic audience</dfn> <strong>of a resource</strong>. For example, such metadata may be used for searching, serving the right language version, classification, etc. </p>
<p>This type of language declaration differs from that of the text-processing declaration in that (a) the value for such declarations may be more than one language subtag, and (b) the language value declared doesn't indicate which bits of a multilingual resource are in which language.</p>
<div class="req" id="metadata_lang_values">
<p class="advisement">It should be possible to associate a metadata-type language declaration (which indicates the intended use of the resource rather than the language of a specific range of text) with multiple language values.</p>
</div>
<p>The language(s) describing the intended use of a resource do not necessarily include every language used in a document. For example, many documents on the Web contain embedded fragments of content in different languages, whereas the page is clearly aimed at speakers of one particular language. For example, a German city-guide for Beijing may contain useful phrases in Chinese, but it is aimed at a German-speaking audience, not a Chinese one.</p>
<p>On the other hand, it is also possible to imagine a situation where a document contains the same or parallel content in more than one language. For example, a web page may welcome Canadian readers with French content in the left column, and the same content in English in the right-hand column. Here the document is equally targeted at speakers of both languages, so there are two audience languages. Another use case is a blog or a news page aimed at a multilingual community, where some articles on a page are in one language and some in another. In this case, it may make sense to list more than one language tag as the value of the language declaration.</p>
<div class="req" id="metadata_not_lang">
<p class="advisement">Attributes that express the language of external resources should not use the HTML <code class="kw" translate="no">lang</code> and XML <code class="kw" translate="no">xml:lang</code> language attributes, but should use a different attribute when they represent metadata (which indicates the intended use of the resource rather than the language of a specific range of text).</p>
<details class="links"><summary>more</summary>
<p><a href="https://www.w3.org/International/questions/qa-when-xmllang"><code class="kw" translate="no">xml:lang</code> in XML document schemas</a> – When should I use xml:lang and when should I define my own element or attribute for passing language values in an XML document schema (DTD)?</p>
</details>
</div>
<p>Using a different attribute to indicate the language of an external resource allows the attribute to specify more than one language. It also works better if the resource pointed to is not in a single language. </p>
<p>This distinction can be seen in HTML in the separation of the <code class="kw" translate="no">lang</code> and <code class="kw" translate="no">hreflang</code> attributes. The former indicates the language of the text within the HTML page; the latter is metadata indicating the expected language of a page that is linked to.</p>
<p>For a longer discussion of this see <a href="https://www.w3.org/International/questions/qa-when-xmllang">xml:lang in XML document schemas</a>. This article talks specifically about <code class="kw" translate="no">xml:lang</code>, but the concepts are applicable to other situations.</p>
</section>
</section>
<section id="lang_values" class="subtopic">
<h3>Defining language values</h3>
<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_values" target="_blank">See related review comments.</a></p>
<aside class="links" id="lang_values_links">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3">
<p class="link"><a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a></p>
<p class="desc">An overview of how to create language tags using BCP 47.</p>
</li>
<li class="nonw3">
<p class="link"><a href="https://www.rfc-editor.org/rfc/bcp/bcp47.txt">BCP 47</a></p>
<p class="desc">The IETF specification that defines how language tags are formed, subtags are registered, as well as how to match language tags.</p>
</li>
</ul>
</aside>
<div class="req" id="lang_use_bcp47">
<p class="advisement">Values for language declarations must use BCP 47.</p>
<details class="links"><summary>more</summary>
<p><a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a></p>
<p><a href="https://www.rfc-editor.org/rfc/bcp/bcp47.txt">BCP 47</a></p>
</details>
</div>
<p>BCP 47 is the <a>language tag</a> system used by Internet and Web standards (and many other places). It defines a method of using <em>subtags</em> from an IANA registry to form a string which describes the language of content. The subtags in the registry are primarily based on (and maintain strict compatibility with) ISO and UN standards for identifying languages, scripts, regions, and countries. BCP 47 also forms the basis for <a>Unicode locales</a>.</p>
<p>For an overview of the key features of BCP 47, see <a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a>.</p>
<div class="req" id="lang_bcp_not_rfc">
<p class="advisement">Refer to BCP 47, not to its constituent parts, such as RFC 5646 or RFC 4647.</p>
</div>
<p>The link to and name of BCP 47 was created specifically so that there is an unchanging reference to the definition of <cite>Tags for the Identification of Languages</cite>. RFCs 1766, 3066, 4646 were previous (superseded) versions. The current version of BCP 47 is made up of two RFCs: 5646 and 4647.</p>
<div class="req" id="lang_values_specificity">
<p class="advisement">Be specific about what level of conformance you expect for language tags: BCP 47 defines two levels of conformance, "valid" and "well-formed".</p>
</div>
<p>A <strong>well-formed</strong> BCP 47 language tag follows the syntax defined for a language tag: implementations check that each language tag consists of hyphen-separated subtags; each subtag has a specific length and specific content (letters, digits or specific combinations) depending on the placement in the tag. A <strong>valid</strong> BCP 47 language tag is well-formed but additionally ensures that only subtags that are listed in the IANA Subtag Registry are used. Note that the IANA Subtag Registry is frequently updated with new subtags.</p>
<div class="req" id="lang_checking_well_formed">
<p class="advisement">Specifications may require implementations to check if language tags are "valid", but in most circumstances should only require that the language tags be "well-formed".</p>
</div>
<p>Most specifications are second-order consumers of language metadata – they are using data already provided in the document format (HTML <span class="kw" translate="no">lang</span>, XML <span class="kw" translate="no">xml:lang</span>, or the document format's language fields/attributes).</p>
<p>Generally most specifications are concerned with selecting resources (such as spell checkers, tokenizers, fonts, etc.) or with matching (selecting which string to show, for example) and don't directly care about the content of the language tag. Invalid-but-well-formed tags just don't match anything and usually fallback schemes provide some behavior that is appropriate.</p>
<p>There might be cases where a specification really wants implementation-level checking for validity. In those cases, the result of a tag failing to be valid has to be specified (should the application die, warn the user, etc.). It's also a problem that the registry is sizeable and changes over time, so each implementation is registry-version dependent. The changes over time are often minor, but real users will encounter interoperability issues if random (out of date) implementations of the specification reject language tags that have become valid at a later date.</p>
<p>In addition, BCP 47 has an extension mechanism which defines add-on subtag sequences. For example, one extension [[RFC6067]] (Unicode Locales, which uses the singleton <span class="kw" translate="no">-u</span>), is commonly used for controlling the internationalization features of JavaScript (and has other uses). Validating these additional subtags is likely out of scope for most specifications.</p>
<div class="req" id="lang_values_valid">
<p class="advisement">Specifications should require content and content authors to use "valid" language tags.</p>
</div>
<p>Normative language regarding language tags might be different between content and implementation requirements. Specification authors need to carefully consider what conformance requirements and tests are needed for their specification and what implementations are required to do. One solution is to normatively require that "valid" language tags be used by content authors but only require implementations to check for "well-formed" language tags.</p>
<div class="req" id="use_lstr">
<p class="advisement">Specifications SHOULD refer to the IANA Language Subtag Registry instead of providing lists of codes extracted from ISO 639, ISO 3166, or other standards.</p>
<details class="links"><summary>more</summary>
<p><a href="https://www.w3.org/International/questions/qa-choosing-language-tags">Choosing a language tag</a></p>
<p><a href="https://r12a.github.io/app-subtags/">Language subtag lookup tool</a></p>
<p><a href="https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry">IANA Language Subtag Registry</a></p>
</details>
</div>
<p>In the past, some of the standards used to provide subtags found in language tags were not freely or publicly available, so some specifications provided lists in order to help ensure interoperability. This is no longer necessary. As part of BCP 47, IANA maintains the language subtag registry, which is a publicly available, machine-readable list of valid subtags for use in constructing language tags. This registry is based on underlying standards, including the various parts of ISO 639 (639-1, 639-2, 639-3, etc.), ISO 15924 script codes, and ISO 3166 and UN M.49 region codes. The registry is actively maintained, stabilized, and comprehensive in ways that other lists found on the Internet might not be. Each of the subtag types is kept in sync with parent standards with the help and participation of those standards maintainers, so extracting or making your own list of codes or referring to ones found elsewhere can lead to maintenance problems or confusion.</p>
<div class="req" id="avoid_lang_lists">
<p class="advisement">Avoid creating a list of valid or supported language tags, language subtags, or [=locales=].</p>
</div>
<p>Making your own list of fully formed language tags will unnecessarily restrict the list of languages that can be used. In addition, locale data is always being expanded, so a list that describes support today will become outdated in the future. Restricting which tags or subtags are available to users conflicts with our goal of providing universal access.</p>
</section>
<section id="lang_declaration" class="subtopic">
<h3>Declaring language</h3>
<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_declaration" target="_blank">See related review comments.</a></p>
<section id="sec_lang_mixed">
<h4>Declaring language at the resource level</h4>
<p>Here we are talking about an independent unit of data that contains structured text. Examples may include a whole HTML page, an XML document, a JSON file, a WebVTT script, an annotation, etc.</p>
<aside class="links" id="links_lang_mixed_links">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-lang-why">Why use the language attribute?</a>.</p></li>
<!--li class="w3"><p class="link"><a href="https://w3c.github.io/i18n-discuss/notes/annotation-language-use-cases">Use cases for language information in web annotations</a>. Illustrates the difference between text-processing and metadata types of language declaration.</p></li-->
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-http-and-lang">Types of language declaration</a>. How 'metadata' and 'text-processing' types of language information differ.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/articles/lang-bidi-use-cases/">Use cases for bidi and language metadata on the Web</a>.</p></li>
</ul>
</aside>
<div class="xref"><span class="seealso">See also</span>
<p>[[[#lang_values]]].</p>
</div>
<div class="req" id="lang_whole_res">
<p class="advisement">The specification should indicate how to define the default text-processing language for the resource as a whole.</p>
</div>
<p>It often saves trouble to identify the language, or at least the default language, of the resource as a whole in one place. For example, in an HTML file, this is done by setting the <code class="kw" translate="no">lang</code> attribute on the <code class="kw" translate="no">html</code> element.</p>
<div class="req" id="lang_inherit">
<p class="advisement">Content within the resource should inherit the text-processing language declared at the resource level, unless it is specifically overridden.</p>
</div>
<div class="req" id="lang_tp_meta">
<p class="advisement">Consider whether it is necessary to have separate declarations to indicate the text-processing language versus metadata about the expected use of the resource.</p>
</div>
<p>In many cases a resource contains text in only one language, and in many more cases the language declared as the default language for text-processing is the same as the language that describes the metadata about the resource as a whole. In such cases it makes sense to have a single declaration.</p>
<p>It becomes problematic, however, to use a single declaration when it refers to more than one language unless there is a way to determine which one language should be used as the text-processing default.</p>
<div class="req" id="lang_mixing">
<p class="advisement">If there is only one language declaration for a resource, and it has more than one language tag as a value, it must be possible to identify the default text-processing language for the resource.</p>
</div>
</section>
<section id="lang_block">
<h4>Establishing the language of a content block</h4>
<div class="xref"><span class="seealso">See also</span>
<p>[[[#lang_values]]].</p>
</div>
<p>The words <dfn class="lint-ignore">block</dfn> and/or <dfn class="lint-ignore">chunk</dfn> are used here to refer to a structural component within the resource as a whole that groups content together and separates it from adjacent content. Boundaries between one block and another are equivalent to paragraph or section boundaries in text, or discrete data items inside a file. </p>
<p>For example, this could refer to a block or paragraph in XML or HTML, an object declaration in JSON, a cue in WebVTT, a line in a CSV file, etc. Contrast this with <dfn class="lint-ignore">inline</dfn> content, which describes a range within a paragraph, sentence, etc.</p>
<p>The interpretation of which structures defined in a spec are relevant to these requirements may require a little consideration, and will depend on the format of the data involved.</p>
<div class="req" id="lang_block_inherit">
<p class="advisement">By default, blocks of content should inherit any text-processing language set for the resource as a whole.</p>
</div>
<p>See [[[#lang_misc]]] for guidance related to the default text-processing language information.</p>
<div class="req" id="lang_block_change">
<p class="advisement">It should be possible to indicate a change in language for blocks of content where the language changes.</p>
</div>
</section>
<section id="lang_inline">
<h4>Establishing the language of inline runs</h4>
<p>In this section we refer to information that needs to be provided for a range of characters in the middle of a paragraph or string.</p>
<div class="xref"><span class="seealso">See also</span>
<p>[[[#lang_values]]]</p>
</div>
<div class="req" id="lang_inline_spans">
<p class="advisement">It should be possible to indicate language for spans of inline text where the language changes.</p>
</div>
<p>Where a switch in language can affect operations on the content, such as spell-checking, rendering, styling, voice production, translation, information retrieval, and so forth, it is necessary to indicate the range of text affected and identify the language of that content.</p>
</section>
</section>
<section id="lang_strings" class="subtopic">
<h3>Identifying the language of strings</h3>
<p class="note">The information in this section is being developed in <a href="https://www.w3.org/TR/string-meta/">Strings on the Web: Language and Direction Metadata</a> [[STRING-META]]. That document is still being written, so these guidelines are likely to change at any time.</p>
<p>The exchange of data on the Web, to the degree possible, should use <a>locale-neutral</a> standardized formats. However, some data on the Web necessarily consists of <a>natural language</a> information intended for display to humans. This <a>natural language</a> information depends on and benefits from the presence of language and direction metadata for proper display. Along with support for Unicode, mechanisms for including and specifying the [=base direction=] and the natural language of spans of text are one of the key internationalization considerations when developing new formats and technologies for the Web.</p>
<p>The most basic best practice, which the Internationalization Working Group looks for in every specification, is:</p>
<div class="req" id="bp-determine">
<p class="advisement">For any string field containing natural language text, it MUST be possible to determine the language and <a>string direction</a> of that specific string. Such determination SHOULD use metadata at the string or document level and SHOULD NOT depend on heuristics.</p>
</div>
<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_strings_x" target="_blank">See related review comments.</a></p>
<aside class="links" id="lang_strings_links">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/">Strings on the Web: Language and Direction Metadata</a>.</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#use_cases">Requirements and Use Cases</a>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#bidi-approaches">Approaches Considered for Identifying the Base Direction</a>.</p></li>
</ul></li>
</ul>
</aside>
<div class="xref"><span class="seealso">See also</span>
<p>[[[#bidi_strings]]].</p>
</div>
<div class="note">
<p>Work on language and direction metadata for string formats is a work in progress. Specifications might need to include a note indicating the need for future adoption of metadata. Here is a prototype:</p>
<p class="example_note" style="background-color:white;border:1px solid green;padding:10px">The field <code>{fieldname}</code> should follow the best practices found in <cite>Strings on the Web: Language and Direction Metadata</cite> [[STRING-META]]. This includes making use of any future standards which emerge regarding the reporting of string language and direction metadata.</p>
</div>
<div class="req" id="bp_lang_field_based_metadata">
<p class="advisement">Use field-based metadata or string datatypes to indicate the language and the [=string direction=] for individual <a>localizable text</a> values.</p>
</div>
<p>Individual data values can differ in language or direction from other values found in the same data file or document. Providing metadata values directly associated with each <a>localizable text</a> field allows for the metadata to be overridden appropriately and helps applications automate processing when assembling, extracting, forwarding, or otherwise processing each data field for use.</p>
<div class="req" id="bp_default_setting">
<p class="advisement">Specifications MAY define a mechanism to provide the default language and the default [=string direction=] for all strings in a given resource. However, specifications MUST NOT assume that a resource-wide default is sufficient. Even if a resource-wide setting is available, it must be possible to use string-specific metadata to override that default.</p>
</div>
<p>Many documents contain data in a single language. Providing a means of indicating the intended language audience, perhaps in a header, can reduce overall document size and complexity. However, the ability to override the default for specific string values remains important, as it is always possible that some strings might not be available in the document language, or that the base direction of a string is not consistent with the default direction of other <a>localizable text</a> values in the document as a whole.</p>
<div class="req" id="bp_default_fallback">
<p class="advisement">Specify that, in the absence of other information, the default direction and default language are unknown.</p>
</div>
<div class="req" id="bp_separate_localizable">
<p class="advisement">Specifications SHOULD be careful to distinguish <a>syntactic content</a>, including <a>user-supplied values</a>, from <a>localizable text</a>.</p>
</div>
<div class="req" id="bp_non_displayable_syntactic">
<p class="advisement">Specifications MUST NOT treat <a>syntactic content</a> values as "displayable".</p>
</div>
<div class="req" id="bp_do_not_use_language_non_data">
<p class="advisement">Specifications SHOULD NOT specify or require the use of language metadata for fields that cannot contain natural language text.</p>
</div>
<p>Document formats on the Web consist of text. In most cases, data values in a given document format are meant to be representative and meaningful, not just arbitrary strings. The fact that a data value consists of, for example, an English keyword does not make the data value a <a>natural language</a> string meant for display as text (that is, the value is not <a>localizable text</a>). Such data values are part of the <a>syntactic content</a> of the document: not only do they not require language and direction metadata, but they should not be associated with such metadata.</p>
<div class="req" id="bp_legacy_fmt_nonlang">
<p class="advisement">For string values and string fields that are <em>not</em> <a>localizable text</a>, specifications SHOULD specify that the field is non-linguistic in nature and recommend the language tag <code class="kw" translate="no">zxx</code> ("No linguistic content") be associated with each string value.</p>
</div>
<div class="req" id="bp_legacy_fmt_lang_unknown">
<p class="advisement">For string values and string fields that are known to contain <a>localizable text</a> but for which there is no possibility of language metadata from the underlying format, specifications SHOULD specify that the language of the content is unknown and recommend the language tag <code class="kw" translate="no">und</code> ("Undetermined") be associated with each string. Specifications MAY also allow the use of heuristics or the inference of the language from other field values where appropriate.</p>
</div>
<p>Some string values depend on or are defined by existing protocols or formats. Often these strings are not associated with or do not provide language or direction metadata. For example, many HTTP headers define their contents as if they were not <a>localizable text</a>, even when, in some cases, they contain natural language text. Consuming specifications sometimes need to take a dependency on strings of this nature or define a format that describes one of these strings. In these cases there will be no language or direction metadata for <a>consumers</a> to associate with the string in the specification's data structure or document format, and any metadata that the specification's data structure or document format provides (when functioning as a <a>producer</a>) will not be serialized through the underlying format.</p>
<div class="req" id="bp_unicode_tag_chars_nonuse">
<p class="advisement">Specifications SHOULD NOT use the Unicode "language tag" characters (code points <code>U+E0000</code> to <code>U+E007F</code>) for language identification.</p>
</div>
<p>The Unicode "language tag" characters are deprecated for use as language tags and there are many reasons why they are a poor solution to the language metadata problem in document formats and wire protocols. Specification authors are cautioned not to repurpose these characters or try to build new mechanisms for transmitting language information based on them.</p>
<div class="req" id="bp_language_indexing">
<p class="advisement">Specifications SHOULD recommend the use of <a>language indexing</a> when localizable strings can be supplied in multiple languages for the same value.</p>
</div>
<p><a>Producers</a> sometimes need to supply localized values for a given content item or data record. Sometimes this is done by <a>language negotiation</a> between the <a>producer</a> and <a>consumer</a>. Localization then takes place in the <a>producer</a> using the negotiated language to select the content returned.</p>
<p>Other times localization of a content item is done by having the <a>producer</a> return multiple language representations for the item and letting the <a>consumer</a> choose the value to display. This latter process is called <dfn>language indexing</dfn>. For more information about language indexing, see <a href="https://www.w3.org/TR/string-meta#localization-considerations"><cite>Localization Considerations</cite></a> in [[STRING-META]].</p>
<section id="lang_strings_jsonld">
<h4>Language information in JSON-LD</h4>
<aside class="links" id="lang_strings_links_jsonld">
<p class="links_title">Additional material on this sub-section's contents can be found in:</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/">Strings on the Web: Language and Direction Metadata</a>.</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#technology_specific_solutions">Technology-specific solutions</a>.</p></li>
</ul></li>
</ul>
</aside>
<p>[[JSON-LD]] provides several mechanisms for satisfying some of the best practices found in this section:</p>
<div class="req" id="bp_use_jsonld_language_context">
<p class="advisement">For documents that use [[JSON-LD]], use of [[JSON-LD]] <code class="kw" translate="no">@context</code> and the built-in <code class="kw" translate="no">@language</code> attribute is RECOMMENDED as a document level default.</p>
</div>
<div class="req" id="bp_use_jsonld_i18n_namespace">
<p class="advisement">Specifications SHOULD use the <code class="kw" translate="no">i18n</code> Namespace feature for RDF literals, as defined in [[JSON-LD]] 1.1.</p>
</div>
<div class="req" id="bp_use_jsonld_atsign">
<p class="advisement">Where the <code class="kw" translate="no">i18n</code> Namespace is not available or is inappropriate to use, specifications SHOULD require [[JSON-LD]] plain string literals for natural language values to provide string-specific language information.</p>
</div>
</section>
<!-- The following is in String-Meta but probably not appropriate for us to include here yet.
<div class="req" id="bp_localizable">
<p class="advisement">For [[WebIDL]]-defined data structures, define each <a>localizable text</a> (natural language text) field as a <q><a>Localizable</a></q>.</p>
</div>
-->
</section>
<section id="lang_detection" class="subtopic">
<h3>Detecting &amp; matching language</h3>
<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_detection" target="_blank">See related review comments.</a></p>
<aside class="issue"><p>This section is under development.</p></aside>
<div class="req" id="lang_matching_bcp">
<p class="advisement">Reference BCP 47 for language tag matching.</p>
<details class="links"><summary>more</summary>
<p><a href="https://www.rfc-editor.org/rfc/bcp/bcp47.txt">BCP 47</a></p>
</details>
</div>
<p>In addition to defining language tags (in RFC 5646), BCP 47 also contains an RFC on the topic of matching language tags to a [=language range=] (RFC 4647). Just as it is most appropriate to refer to the stable identifier BCP 47 for the definition of language tags, it is best to refer to BCP 47 when referencing matching schemes found therein.</p>
<p>Unicode's [[CLDR]] project defines additional algorithms, rules and processes for matching language tags when used as [=locale=] identifiers.</p>
</section>
</section>
<section id="text_direction" class="topic">
<h2>Text direction</h2>
<div id="text_direction_checklist" class="summaryC"></div>
<!--p>In this section:</p>
<ul class="summary">
<li>[[[#sec_dir_basic]]]</li>
<li>[[[#sec_dir_background]]]</li>
<li>[[[#sec_bidi_markup]]]</li>
<li>[[[#sec_bidi_strings]]]</li>
<li>[[[#bidi_inline]]]</li>
</ul-->
<p>It is important to establish direction for text written or mixed with right-to-left scripts. Characters in these scripts are stored in memory in the order they are typed and pronounced – called the logical order. The Unicode Bidirectional Algorithm (UBA) provides a lot of support for automatically rendering a sequence of characters stored in logical order so that they are visually ordered as expected. Unfortunately, the UBA alone is not sufficient to correctly render bidirectional text, and additional information has to be provided about the default directional context to apply for a given sequence of characters.</p>
<!--details class="checklist" style="cursor: pointer;">
<summary onClick="showChecklist(this.parentNode.parentNode, 'dir_checklist')">Show recommendations as a checklist</summary>
<div id="dir_checklist"></div>
</details-->
<section id="dir_misc" class="subtopic">
<h3>Basic requirements</h3>
<p>The basic requirements are as follows.</p>
<div class="req" id="dir_paragraphs">
<p class="advisement">It must be possible to indicate [=base direction=] for each individual paragraph-level item of <a>natural language</a> text that will be read by someone.</p>
</div>
<p>A special case of the above applies to [=natural language=] string values in data structures and document formats:</p>
<div class="req" id="dir_strings">
<p class="advisement">For any string field containing [=natural language=] text, it MUST be possible to determine the language and [=string direction=] of that specific string. Such determination SHOULD use metadata at the string or document level and SHOULD NOT depend on heuristics.</p>
</div>
<div class="req" id="dir_inline">
<p class="advisement">It must be possible to indicate base direction changes for embedded runs of inline bidirectional text for all <a>localizable text</a>.</p>
</div>
<div class="req" id="dir_reasonable">
<p class="advisement">Annotating right-to-left text must require the minimum amount of effort for people who work natively with right-to-left scripts.</p>
</div>
<p>Requiring a speaker of Arabic, Divehi, Hebrew, Persian, Urdu, etc. to add markup or control characters to every paragraph or small data item they write is far too much to be manageable. Typically, the format should establish a default direction and require the user to intervene only when exceptions have to be dealt with.</p>
</section>
<section id="sec_dir_background" class="subtopic">
<h3>Background information</h3>
<p>In this section we try to set out some key concepts associated with text direction, so that it will be easier to understand the recommendations that follow.</p>
<aside class="links" id="links_text_direction">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode Bidirectional Algorithm basics</a>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta">Strings on the Web: Language and Direction Metadata</a> [[STRING-META]]</p></li>
</ul>
</aside>
<section id="sec_dir_defs">
<h4>Important definitions</h4>
<p>In order to correctly display text written in a 'right-to-left' script or left-to-right text containing bidirectional elements, it is important to establish the <a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics#context" class="termref">base direction</a> that will be used to dictate the order in which elements of the text will be displayed.</p>
<p>If you are not familiar with what the Unicode Bidirectional Algorithm (UBA) does and doesn't do, and why the base direction is so important, read <a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode Bidirectional Algorithm basics</a>.</p>
<aside class="example" id="sec-dir-example">
<p>For example, the following annotation will not display correctly unless the application doing the display knows that the base direction needs to be right-to-left.</p>
<pre>{
"@context": "http://www.w3.org/ns/anno.jsonld",
"id": "http://example.org/anno5",
"type":"Annotation",
"body": {
"type" : "TextualBody",
"text" : "פעילות הבינאום, W3C",
"format" : "text/html",
"language" : "he"
},
"target": "http://example.org/photo1"
}
</pre>
<p>If there is no indication that the [=base direction=] is right-to-left, the display of the item <code>text</code> will be incorrect if the text is placed into a left-to-right context (such as the table below):</p>
<table dir="ltr" class="bidi-example-table">
<thead>
<tr><th>Description</th><th>HTML</th><th style="width:25%">Appearance</th></tr>
</thead>
<tbody>
<tr>
<td>Incorrect:<br>(without <code>dir</code>)</td>
<td><pre class="html">&lt;span lang="he"&gt;פעילות הבינאום, W3C&lt;/span&gt;</pre></td>
<td class="spilloverExample"><span lang="he">פעילות הבינאום, W3C</span></td>
</tr>
<tr>
<td>Correct:<br>(with <code>dir</code>)</td>
<td><pre class="html">&lt;span lang="he" dir="rtl"&gt;פעילות הבינאום, W3C&lt;/span&gt;</pre></td>
<td class="spilloverExample"><span dir="rtl" lang="he">פעילות הבינאום, W3C</span></td>
</tr>
</tbody>
</table>
</aside>
<p>In this section, the word <dfn class="lint-ignore">paragraph</dfn> indicates a run of text followed by a hard line-break in plain text, but may signify different things in other situations. In CSV it equates to 'cell', so a single line of comma-separated items is actually a set of comma-separated paragraphs. In HTML it equates to the lowest level of block element, which is often a <code class="kw" translate="no">p</code> element, but may be things such as <code class="kw" translate="no">div</code>, <code class="kw" translate="no">li</code>, etc., if they only contain text and/or inline elements. In JSON, it often equates to a quoted string value, but if a string value uses markup then paragraphs are associated with block elements, and if the string value is multiple lines of plain text then each line is a paragraph.</p>
<aside class="note">
<p>While the Unicode Bidirectional Algorithm [[UAX9]] formally refers to <em>paragraphs</em> and <em>paragraph direction</em> (or the <em>base direction</em> of a paragraph), this can sometimes be confusing when the text in question is not in a long-form document. Instead, this document and others will sometimes use the terms "block direction" or especially [=string direction=] to refer to the paragraph direction of a specific string of natural language text.</p>
</aside>
<p>The term <a>metadata</a> is used here to mean information which could be an annotation or property associated with the data, or could be markup in scenarios that allow that, or could be a higher-level protocol, etc.</p>
</section>
<section id="setting_bd">
<h4>Ways base direction can be set for paragraphs</h4>
<p>There are a number of possible ways of setting the base direction.</p>
<ol>
<li>The base direction of a paragraph may be set by an application or a user applying metadata to the paragraph. Typical values for base direction may include <code class="kw" translate="no">ltr</code>, <code class="kw" translate="no">rtl</code> or <code class="kw" translate="no">auto</code>.
<ul>
<li>The metadata may specifically indicate that heuristics should be used. Then you would expect to consider the actual characters used in order to determine the base direction. (This is what happens if you set <code>dir=auto</code> on an HTML element.)</li>
<li>The application may expect metadata, but there may be no such information provided. In this case you would usually expect there to be a default direction specified, and the base direction for the paragraph would be set to that default. The default is usually LTR. (This is what happens if you have no <code class="kw" translate="no">dir</code> attributes in your HTML file.)</li>
<li>Where a format contains many paragraphs or chunks of information, and the language of text in all those chunks is the same, it is sometimes useful to allow a default base direction to be set for and inherited by all. This is what happens when you set the <code class="kw" translate="no">dir</code> attribute on the <code class="kw" translate="no">html</code> tag in HTML. Another example would be a subtitling file containing many cues, all written in Arabic; it would be best to allow the author to say at the start of the file that the default is RTL for all cue text. There should always be a way to override the direction information for a specific paragraph where needed.</li>
</ul></li>
<li>If the application expects no metadata to be available it should use heuristics to determine the base direction for each paragraph/cell. A typical solution, and one described by UAX 9 <cite>Unicode Bidirectional Algorithm</cite>, is to look for the first-strong character in the paragraph/cell. (This is likely to apply if you are looking at plain text that is not expected to be associated with metadata. It only happens with HTML if the direction is set to <code class="kw" translate="no">auto</code>, since HTML specifies a default direction.)
<ul style="margin-left:0; margin-right:1em;">
<li>Not all paragraphs using the first-strong method will have the correct base direction applied. In some cases, an Arabic or Hebrew, etc, paragraph may start with strong LTR characters. There must be a way to deal with this.</li>
<li>Where a syntactic unit contains multiple lines of plain text (for example, a multiline cue text in a subtitling file), the first-strong heuristic needs to be applied to each line separately.</li>
<li>There may be special rules that involve ignoring some sequence of characters or type of markup at the start of the paragraph before identifying the first strong character.</li>
<li>In some cases there are no strong characters in a paragraph, and the base direction can be critically important for the data to be understood correctly, e.g. telephone numbers or MAC addresses. There needs to be a way to resort to an appropriate default for these cases.</li>
</ul></li>
<li>Whether or not any metadata is specified, if a paragraph contains a string that starts with one of the Unicode bidi control characters RLI, LRI, FSI, LRE, RLE, LRO, or RLO and ends with PDF/PDI, these characters will determine the base direction for the contained string. These characters, when placed in the content, explicitly override any previously set direction by creating an inline range and assigning a base direction to it.
<ul style="margin-left:0; margin-right:1em;"><li>The effect of such characters does not extend past paragraph boundaries, but the range ought to be explicitly ended using the PDF/PDI control character, especially if a paragraph end is not easily detectable by the application.</li>
<li>Because isolation is needed for bidirectional text to work properly, the Unicode Standard says that the isolating control codes RLI, LRI and FSI should be used rather than LRE or RLE. Unfortunately, those characters are still not widely supported.</li>
<li>For structural components in markup, above the paragraph level, it is not possible to use the Unicode bidi control characters to define direction for paragraphs, since these are inline controls only, and the effect is terminated by a paragraph end.</li>
</ul>
</li>
</ol>
<p>When capturing text input by a user it is usually necessary to understand the context in which the user was inputting the data to determine the base direction of the input. In HTML, for example, this may be set by the direction inherited from the <code class="kw" translate="no">html</code> tag, or by the user pressing keys to set the base direction for a form field. It is then necessary to find some way of storing the information about base direction or associating it with the string when rendered. Typically, in this situation, any direction changes internal to the string being input are handled by the user and will be captured as part of the string.</p>
</section>
<section id="inline_changes">
<h4>Inline changes to base direction</h4>
<p>Embedded ranges of text <em>within</em> a single paragraph may need to have a different base direction. For example, </p>
<p>"The title was '!NOITASILANOITANRETNI'."</p>
<p>where the span within the single quotes is in Hebrew/Arabic/Divehi, etc., and needs to have a [=RTL=] base direction, instead of the [=LTR=] base direction of the surrounding paragraph, in order to place the exclamation mark correctly.</p>
<p>If markup is available to the content author, it is likely to be easier and safer to use markup to indicate such inline ranges (see below). In HTML you would usually use an inline element with a <code class="kw" translate="no">dir</code> attribute to establish the base direction for such runs of text. If you can't mark up the text, such as in HTML's <code class="kw" translate="no">title</code> element, or any environment that handles only plain text content, you have to resort to Unicode's paired control characters to establish the base direction for such an internal range.</p>
<p>Furthermore, inline ranges where the base direction is changed should be [=bidi isolated=] from surrounding text, so that the [=Unicode Bidirectional Algorithm=] doesn't produce incorrect results ("[=spillover=]") due to interference across boundaries.</p>
<aside class="example" title="Example of bidirectional interference across boundaries">
<p>Suppose an implementation assembled a string by concatenating various values together. For example, the description of a monitor attached to a system. This label might contain the brand name and model number (<kbd>Brand A123B</kbd>); resolution (<kbd>(1920 x 1080)</kbd>); size and type (<kbd>36" monitor</kbd>); as well as various features like a refresh rate of <kbd>75 Hz</kbd> or response time of <kbd>4 ms</kbd>. The resulting string in English might look like this (color has been added to make the effects more visible):</p>
<p class="spilloverExample" dir="ltr"><code>
<span style="color:red">Brand A123B</span> <span style="color:blue">(1920 x 1080)</span> <span style="color:green">36"</span> <span style="color:purple">monitor</span>, <span style="color:orange">75 Hz</span>, <span style="color:brown">4ms</span>
</code></p>
<p>If the same implementation assembling this string were on a system running in a locale that uses a right-to-left language (such as the Arabic examples shown below), the results of the same concatenation might look something like this:</p>
<p class="spilloverExample" dir="rtl"><code>
<span style="color:red">ماركة A123B</span> <span style="color:blue">(1920 x 1080)</span> <span style="color:green">36"</span> <span style="color:purple">شاشة الكمبيوتر</span>، <span style="color:orange">75 Hz</span>، <span style="color:brown">4 مللي ثانية</span></code></p>
<p>The logical sequence of sub-strings remains the same, but the visual presentation is no longer intelligible. Notice how different parts of the description have become broken up and mixed together. The addition of isolating bidirectional controls (either markup or, when not available, Unicode control characters) to the above string produces better results:</p>
<p class="spilloverExample" dir="rtl"><code>
<span style="color:red" dir="rtl">ماركة A123B</span> <span style="color:blue" dir="ltr">(1920 x 1080)</span> <span style="color:green" dir="ltr">36"</span> <span style="color:purple" dir="rtl">شاشة الكمبيوتر</span>، <span style="color:orange" dir="rtl">75 Hz</span>، <span style="color:brown" dir="rtl">4 مللي ثانية</span>
</code></p>
</aside>
<p>This means that if a content author is using Unicode control codes they should use the isolating controls <code class="kw" translate="no">RLI/LRI/FSI…PDI</code> rather than the embedding controls <code class="kw" translate="no">RLE/LRE…PDF</code>.</p>
<div class="xref"><span class="seealso">See also</span>
<p><a href="https://www.w3.org/International/articles/inline-bidi-markup/bidi_examples#uc5">An example of incorrect ordering of things such as text followed by numbers in HTML</a></p>
<p><a href="https://www.w3.org/International/articles/inline-bidi-markup/bidi_examples#usecase3">An example of incorrect ordering of lists</a></p>
</div>
</section>
<section id="control_problems">
<h4>Problems with control characters</h4>
<p>Reasons to avoid relying on control characters to set direction include the following:</p>
<ol>
<li>They are invisible in most editors and are therefore difficult to work with, and can easily lead to orphans and overlapping ranges. They can be particularly difficult to manage when editing bidirectional inline text because it's hard to position the cursor in the correct place. If you ask someone who writes in a right-to-left script, you are likely to find that they dislike using control codes.</li>
<li>Users often don't have the necessary characters available on their keyboard, or have difficulty inputting them.</li>
<li>It is sometimes necessary to choose which to use based on context or the type of the data, and this means that a content author typically needs to select the control codes – specifying control codes in this way for all paragraphs is time-consuming and error-prone.</li>
<li>Processors that extract parts of the data, add to it, or reuse in combination with other text may incorrectly handle the control codes.</li>
<li>Search and comparison algorithms should ignore these characters, but typically don't.</li>
</ol>
<p>The last two items above may also hold for markup, but implementers often support included markup better than included control codes.</p>
<p>Don't expect users to add control codes at the start and end of every paragraph. That's far too much work.</p>
</section>
<section id="rlmlrm">
<h4>Strong directional formatting characters: RLM, LRM, and ALM</h4>
<p>A word about the Unicode characters <span class="codepoint" translate="no"><img alt="RLM" src="images/200F.png"><code class="uname">U+200F RIGHT-TO-LEFT MARK</code></span> (RLM), <span class="codepoint" translate="no"><img alt="LRM" src="images/200E.png"><code class="uname">U+200E LEFT-TO-RIGHT MARK</code></span> (LRM), and <span class="codepoint" translate="no"><img alt="ALM" src="images/061C.png"><code class="uname">U+061C ARABIC LETTER MARK</code></span> (ALM) is warranted at this point.</p>
<p>The first point to be clear about is that these three characters do not establish the base direction for a range of text. They are simply invisible characters with strong directional properties.</p>
<p>Recalling an <a href="#sec-dir-example">earlier example</a>, this means that you cannot use RLM, for example, to make the text <kbd>W3C</kbd> appear to the left of the Hebrew text. Only using metadata or paired control characters results in the correct display.</p>
<aside class="example" id="rlm-not-working" title="Use metadata instead of strongly directional formatting characters">
<table dir="ltr" class="bidi-example-table">
<thead>
<tr><th>Description</th><th>HTML</th><th style="width:25%">Result</th></tr>
</thead>
<tbody>
<tr>
<td>With RLM<br>(incorrect)</td>
<td><pre class="html">&lt;span lang="he"&gt;ותות הבינאום, W3C&amp;rlm;&lt;/span&gt;</pre></td>
<td class="spilloverExample"><span lang="he">ותות הבינאום, W3C‏</span></td>
</tr>
<tr>
<td>With metadata<br>(correct)</td>
<td><pre class="html">&lt;span lang="he" dir="rtl"&gt;ותות הבינאום, W3C&lt;/span&gt;</pre></td>
<td class="spilloverExample"><span lang="he" dir="rtl">ותות הבינאום, W3C</span></td>
</tr>
</tbody>
</table>
</aside>
<p>Of course, if you are detecting base direction using first-strong heuristics (such as <code>dir="auto"</code> in HTML), then inserting an RLM, ALM, or LRM can be useful for influencing the base direction detected where the text in question begins with something that would otherwise give the wrong result.</p>
<aside class="example" title="Using a strong directional formatting character to assist first-strong heuristics">
<p>This HTML has strongly right-to-left Arabic characters near the start, where they will be picked up by a first-strong heuristic. Notice that there is a neutral character right at the start:</p>
<pre class="html">&lt;p dir="auto"&gt;"نشاط التدويل" is how you say "i18n Activity" in Arabic.&lt;/p&gt;</pre>
<p>This produces the wrong result:</p>
<p dir="auto" class="spilloverExample">"نشاط التدويل" is how you say "i18n Activity" in Arabic.</p>
<p>Here an LRM could be placed at the start of the text to prevent the algorithm from assuming that the text should be right-to-left.</p>
<pre class="html">&lt;p dir="auto"&gt;&amp;lrm;"نشاط التدويل" is how you say "i18n Activity" in Arabic.&lt;/p&gt;</pre>
<p dir="auto" class="spilloverExample">‎"نشاط التدويل" is how you say "i18n Activity" in Arabic.</p>
</aside>
<p>Remember that if metadata is used to set the base direction, the strong directional formatting character is ignored, unless the metadata specifically says that first-strong heuristics should be used.</p>
<p>Finally, a note about the use of <span class="codepoint" translate="no"><img alt="ALM" src="images/061C.png"><code class="uname">U+061C ARABIC LETTER MARK</code></span> (ALM). This character is used to influence the display of sequences of numbers in Arabic script text in cases where no Arabic letters occur before the number.</p>
<aside class="example" title="Example of ALM usage">
<p>In some Arabic-script languages the range <code dir="rtl">100-200</code> should appear as <code dir="rtl">&#x061C;100-200</code>. If no Arabic letters appear before the numbers, the [=Unicode Bidirectional Algorithm=] will not perform this reordering. Note that the character sequence in both cases is "100-200" and that both have a <code>code</code> element with a <code>dir="rtl"</code> around them. In the third example, an ALM is used to provide the necessary hint, like so:</p>
<table class="bidi-example-table">
<thead>
<tr><th>Description</th><th>HTML</th><th>Appearance</th></tr>
</thead>
<tbody>
<tr>
<td>Preceded by Arabic letters</td>
<td><pre class="html">&lt;code dir="rtl" lang="ar"&gt;نشاط التدويل 100-200&lt;/code&gt;</pre></td>
<td dir="rtl" class="spilloverExample"><code dir="rtl" lang="ar">نشاط التدويل 100-200</code></td>
</tr>
<tr>
<td>Without ALM</td>
<td><pre class="html">&lt;code dir="rtl" lang="ar"&gt;100-200&lt;/code&gt;</pre></td>
<td dir="rtl" class="spilloverExample"><code dir="rtl" lang="ar">100-200</code></td>
</tr>
<tr>
<td>With ALM</td>
<td><pre class="html">&lt;code dir="rtl" lang="ar"&gt;&amp;#x061C;100-200&lt;/code&gt;</pre></td>
<td dir="rtl" class="spilloverExample"><code dir="rtl" lang="ar" >؜100-200</code></td>
</tr>
</tbody>
</table>
</aside>
</section>
<section id="bd_language">
<h4>Base direction and language</h4>
<div class="req" id="bidi_lang">
<p class="advisement">Do not assume that direction can be determined from language information.</p>
<details class="links"><summary>more</summary>
<p><a href="https://www.w3.org/International/questions/qa-direction-from-language">Can we derive base direction from language?</a>, W3C article.</p>
</details>
</div>
<p>The following are all reasons you cannot use language tags to provide information about base direction:</p>
<ol>
<li>you can't produce the <code class="kw" translate="no">auto</code> value with language tags.</li>
<li>some languages are written with both RTL and LTR scripts.</li>
<li>the only reliable part of the language tag that would indicate the base direction is the script tag, but BCP47 recommends that you suppress the use of the script tag for languages that don't usually need it, such as Hebrew (<code translate="no">Suppress-Script: Hebr</code>). Languages, such as Persian, that are usually written in an RTL script may be written in transcribed form, and it's not possible to guarantee that the necessary script tag would be present to carry the directional information. In summary, you won't be able to rely on people supplying script tags as part of the language information in order to influence direction.</li>
<li>the places where language tags are used and the places where base direction markers are used often don't coincide.</li>
<li>they are not semantically equivalent.</li>
</ol>
</section>
</section>
<section id="bidi_values" class="subtopic">
<h3>Base direction values</h3>
<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Abidi_values" target="_blank">See related review comments.</a></p>
<div class="req" id="bidi_values_req">
<p class="advisement">Values for the default base direction should include left-to-right, right-to-left, and auto.</p>
</div>
<p>The <code class="kw" translate="no">auto</code> value allows automatic detection of the base direction for a piece of text. For example, the <code class="kw" translate="no">auto</code> value of <code class="kw" translate="no">dir</code> in HTML looks for the first strong directional character in the text (while ignoring certain items of markup) to guess the base direction of the text. Note that automatic detection algorithms are far from perfect. First-strong detection is unable to correctly identify text that is really right-to-left, but that begins with a strong LTR character. Algorithms that attempt to judge the base direction based on contents of the text are also problematic. The best scenario is one where the base direction is known and declared.</p>
</section>
<section id="bidi_markup" class="subtopic">