fixed handling of escaped unicode sequences with more than 4 digits

silnrsi · Dec 9, 2024 · f452bb9 · f452bb9
1 parent cad7aa4
commit f452bb9
Show file tree

Hide file tree

Showing 7 changed files with 14 additions and 10 deletions.
diff --git a/sldr/h/hmz_Plrd.xml b/sldr/h/hmz_Plrd.xml
@@ -23,7 +23,7 @@
 		</orientation>
 	</layout>
 	<characters>
-		<exemplarCharacters>[𖼀 𖼃 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼖 𖼘 𖼚 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽝 𖽞 𖽟 𖽠 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽱 𖽵 𖽷 𖽺 𖽻 𖽾 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
+		<exemplarCharacters>[𖼀 𖼃 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼖 𖼘 𖼚 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽝 𖽞 𖽟 𖽠 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽱 𖽵 𖽷 𖽺 𖽻 𖽾 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
 		<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ”  、 。]</exemplarCharacters>
 		<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
 	</characters>

diff --git a/sldr/l/lpo.xml b/sldr/l/lpo.xml
@@ -22,7 +22,7 @@
 		</orientation>
 	</layout>
 	<characters>
-		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼰 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽑 𖽔 𖽕 𖽗 𖽘 𖽙 𖽚 𖽜 𖽝 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽲 𖽳 𖽴 𖽶 𖽸 𖽹 𖽺 𖽻 𖽾 \u16F90 \u16F91]</exemplarCharacters>
+		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼰 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽑 𖽔 𖽕 𖽗 𖽘 𖽙 𖽚 𖽜 𖽝 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽲 𖽳 𖽴 𖽶 𖽸 𖽹 𖽺 𖽻 𖽾 \U00016F90 \U00016F91]</exemplarCharacters>
 		<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ”  、 。]</exemplarCharacters>
 		<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
 	</characters>

diff --git a/sldr/s/sfm.xml b/sldr/s/sfm.xml
@@ -22,7 +22,7 @@
 		</orientation>
 	</layout>
 	<characters>
-		<exemplarCharacters>[𖼀 𖼁 𖼃 𖼄 𖼇 𖼈 𖼊 𖼋 𖼎 𖼏 𖼐 𖼑 𖼖 𖼗 𖼘 𖼙 𖼞 𖼟 𖼡 𖼢 𖼣 𖼦 𖼨 𖼩 𖼮 𖼯 𖼲 𖼳 𖼵 𖼷 𖼸 𖼺 𖼽 𖽂 𖽃 𖽅 𖽆 𖽇 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽞 𖽟 𖽡 𖽢 𖽣 𖽤 𖽦 𖽨 𖽩 𖽪 𖽫 𖽬 𖽭 𖽰 𖽱 𖽵 𖽷 𖽸 𖽹 𖽺 𖽻 𖽾 𖾂 𖾃 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
+		<exemplarCharacters>[𖼀 𖼁 𖼃 𖼄 𖼇 𖼈 𖼊 𖼋 𖼎 𖼏 𖼐 𖼑 𖼖 𖼗 𖼘 𖼙 𖼞 𖼟 𖼡 𖼢 𖼣 𖼦 𖼨 𖼩 𖼮 𖼯 𖼲 𖼳 𖼵 𖼷 𖼸 𖼺 𖼽 𖽂 𖽃 𖽅 𖽆 𖽇 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽞 𖽟 𖽡 𖽢 𖽣 𖽤 𖽦 𖽨 𖽩 𖽪 𖽫 𖽬 𖽭 𖽰 𖽱 𖽵 𖽷 𖽸 𖽹 𖽺 𖽻 𖽾 𖾂 𖾃 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
 		<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ”  、 。]</exemplarCharacters>
 		<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
 	</characters>

diff --git a/sldr/y/ygp.xml b/sldr/y/ygp.xml
@@ -22,7 +22,7 @@
 		</orientation>
 	</layout>
 	<characters>
-		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼣 𖼦 𖼨 𖼪 𖼮 𖼲 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽈 𖽉 𖽊 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽠 𖽡 𖽦 𖽨 𖽪 𖽱 𖽳 𖽵 𖽶 𖽷 𖽹 𖽺 𖽻 𖽾 𖾁 𖾃 𖾄 𖾅 𖾆 𖾇 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
+		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼣 𖼦 𖼨 𖼪 𖼮 𖼲 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽈 𖽉 𖽊 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽠 𖽡 𖽦 𖽨 𖽪 𖽱 𖽳 𖽵 𖽶 𖽷 𖽹 𖽺 𖽻 𖽾 𖾁 𖾃 𖾄 𖾅 𖾆 𖾇 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
 		<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ”  、 。]</exemplarCharacters>
 		<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
 	</characters>

diff --git a/sldr/y/yna.xml b/sldr/y/yna.xml
@@ -22,7 +22,7 @@
 		</orientation>
 	</layout>
 	<characters>
-		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼎 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃  𖽏 𖽐 𖽑𖽔 𖽘 𖽙 𖽜 𖽝 𖽞 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽴 𖽵 𖽶 𖽷 𖽹 𖽻 𖽾 𖾁 𖾂 \u16F8F \u16F90 \u16F91 \u16F92]</exemplarCharacters>
+		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼎 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃  𖽏 𖽐 𖽑𖽔 𖽘 𖽙 𖽜 𖽝 𖽞 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽴 𖽵 𖽶 𖽷 𖽹 𖽻 𖽾 𖾁 𖾂 \U00016F8F \U00016F90 \U00016F91 \U00016F92]</exemplarCharacters>
 		<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ”  、 。]</exemplarCharacters>
 		<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
 	</characters>

diff --git a/sldr/y/ywq.xml b/sldr/y/ywq.xml
@@ -22,7 +22,7 @@
 		</orientation>
 	</layout>
 	<characters>
-		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼌 𖼍 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼯 𖼰 𖼱 𖼳 𖼴 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽘 𖽙 𖽛 𖽜 𖽝 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽶 𖽸 𖽹 𖽺 𖽻 𖽿 𖾀 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
+		<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼌 𖼍 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼯 𖼰 𖼱 𖼳 𖼴 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽘 𖽙 𖽛 𖽜 𖽝 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽶 𖽸 𖽹 𖽺 𖽻 𖽿 𖾀 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
 		<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ”  、 。]</exemplarCharacters>
 		<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
 	</characters>

diff --git a/tests/test_validate.py b/tests/test_validate.py
@@ -83,7 +83,8 @@ def test_syntax(ldml):
     for e in ldml.ldml.root.findall('.//characters/exemplarCharacters'): 
         t = e.get('type', None)
         n = t or "main"
-        exemplars_rawnocurly[t] = e.text[1:-1].strip().replace("\\", " \\").replace("{", " ").replace("}", " ").replace("  ", " ").split(' ') # adapted from the "get index exemplar" section of test_collation.py
+        exemplars_rawnocurly[t] = e.text[1:-1].strip().replace("-\\", " \\").replace("\\", " \\").replace("{", " ").replace("}", " ").replace("  ", " ").split(' ') # adapted from the "get index exemplar" section of test_collation.py
+            #THIS IS USED FOR FORMATTING AND SYNTAX TESTING ONLY, NOT FOR ACTUALLY GETTING INFO FROM THE EXEMPLAR.
         exemplars_raw[t] = e.text[1:-1].strip().split(' ') # adapted from the "get index exemplar" section of test_collation.py
         rawstring = e.text[1:-1].strip().replace(" ", "") # adapted from the "get index exemplar" section of test_collation.py
         s = usets.parse(e.text or "", 'NFD')
@@ -94,11 +95,14 @@ def test_syntax(ldml):
         for i in exemplars_rawnocurly[t]:
             if "\\" in i:
                 if r"\u" in i:
-                    assert len(i)>=6, filename + " " + n + " exemplar has unicode codepoint(s) missing hex digits: " + i
+                    if len(i)>6:
+                        assert len(i)==6, filename + " " + n + " exemplar has a 4-digit unicode codepoint(s) that should be in the 8-digit \\Uxxxxxxxx format: " + i
+                    elif len(i)<6:
+                        assert len(i)==6, filename + " " + n + " exemplar has a 4-digit unicode codepoint(s) missing hex digits: " + i
                 if r"\U" in i:
-                    assert len(i)==10, filename + " " + n + " exemplar has unicode codepoint(s) missing hex digits: " + i
+                    assert len(i)==10, filename + " " + n + " exemplar has an 8-digit unicode codepoint(s) missing hex digits: " + i
                 #this next assert does assume that spaces were added between units in an exemplar, since exemplars_rawnocurly can only insert a space BEFORE a backslash. So far nothing fails incorrectly because of that
-                #assert len(i)<3 or len(i)==6 or len(i)==10, filename + " " + n + " exemplar has unicode codepoint(s) missing 'u' or 'U': " + i
+                assert len(i)<3 or len(i)==6 or len(i)==10, filename + " " + n + " exemplar has unicode codepoint(s) missing 'u' or 'U': " + i
         # The following lines are a test if characters are incorrectly unescaped.
         # The problem with these coming tests is that if there are ranges that use special characters intentionally, they'll ping as errors. 
         # However we can't solely test for "is it a valid regex" bc they might make a valid regex on accident.