Skip to content

Commit

Permalink
fixed handling of escaped unicode sequences with more than 4 digits
Browse files Browse the repository at this point in the history
  • Loading branch information
emily-roth committed Dec 9, 2024
1 parent cad7aa4 commit f452bb9
Show file tree
Hide file tree
Showing 7 changed files with 14 additions and 10 deletions.
2 changes: 1 addition & 1 deletion sldr/h/hmz_Plrd.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
</orientation>
</layout>
<characters>
<exemplarCharacters>[𖼀 𖼃 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼖 𖼘 𖼚 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽝 𖽞 𖽟 𖽠 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽱 𖽵 𖽷 𖽺 𖽻 𖽾 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
<exemplarCharacters>[𖼀 𖼃 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼖 𖼘 𖼚 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽝 𖽞 𖽟 𖽠 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽱 𖽵 𖽷 𖽺 𖽻 𖽾 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ” 、 。]</exemplarCharacters>
<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
</characters>
Expand Down
2 changes: 1 addition & 1 deletion sldr/l/lpo.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
</orientation>
</layout>
<characters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼰 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽑 𖽔 𖽕 𖽗 𖽘 𖽙 𖽚 𖽜 𖽝 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽲 𖽳 𖽴 𖽶 𖽸 𖽹 𖽺 𖽻 𖽾 \u16F90 \u16F91]</exemplarCharacters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼰 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽑 𖽔 𖽕 𖽗 𖽘 𖽙 𖽚 𖽜 𖽝 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽲 𖽳 𖽴 𖽶 𖽸 𖽹 𖽺 𖽻 𖽾 \U00016F90 \U00016F91]</exemplarCharacters>
<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ” 、 。]</exemplarCharacters>
<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
</characters>
Expand Down
2 changes: 1 addition & 1 deletion sldr/s/sfm.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
</orientation>
</layout>
<characters>
<exemplarCharacters>[𖼀 𖼁 𖼃 𖼄 𖼇 𖼈 𖼊 𖼋 𖼎 𖼏 𖼐 𖼑 𖼖 𖼗 𖼘 𖼙 𖼞 𖼟 𖼡 𖼢 𖼣 𖼦 𖼨 𖼩 𖼮 𖼯 𖼲 𖼳 𖼵 𖼷 𖼸 𖼺 𖼽 𖽂 𖽃 𖽅 𖽆 𖽇 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽞 𖽟 𖽡 𖽢 𖽣 𖽤 𖽦 𖽨 𖽩 𖽪 𖽫 𖽬 𖽭 𖽰 𖽱 𖽵 𖽷 𖽸 𖽹 𖽺 𖽻 𖽾 𖾂 𖾃 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
<exemplarCharacters>[𖼀 𖼁 𖼃 𖼄 𖼇 𖼈 𖼊 𖼋 𖼎 𖼏 𖼐 𖼑 𖼖 𖼗 𖼘 𖼙 𖼞 𖼟 𖼡 𖼢 𖼣 𖼦 𖼨 𖼩 𖼮 𖼯 𖼲 𖼳 𖼵 𖼷 𖼸 𖼺 𖼽 𖽂 𖽃 𖽅 𖽆 𖽇 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽞 𖽟 𖽡 𖽢 𖽣 𖽤 𖽦 𖽨 𖽩 𖽪 𖽫 𖽬 𖽭 𖽰 𖽱 𖽵 𖽷 𖽸 𖽹 𖽺 𖽻 𖽾 𖾂 𖾃 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ” 、 。]</exemplarCharacters>
<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
</characters>
Expand Down
2 changes: 1 addition & 1 deletion sldr/y/ygp.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
</orientation>
</layout>
<characters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼣 𖼦 𖼨 𖼪 𖼮 𖼲 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽈 𖽉 𖽊 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽠 𖽡 𖽦 𖽨 𖽪 𖽱 𖽳 𖽵 𖽶 𖽷 𖽹 𖽺 𖽻 𖽾 𖾁 𖾃 𖾄 𖾅 𖾆 𖾇 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼣 𖼦 𖼨 𖼪 𖼮 𖼲 𖼳 𖼵 𖼷 𖼺 𖼻 𖼽 𖽂 𖽃 𖽈 𖽉 𖽊 𖽐 𖽑 𖽔 𖽗 𖽘 𖽙 𖽜 𖽝 𖽠 𖽡 𖽦 𖽨 𖽪 𖽱 𖽳 𖽵 𖽶 𖽷 𖽹 𖽺 𖽻 𖽾 𖾁 𖾃 𖾄 𖾅 𖾆 𖾇 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ” 、 。]</exemplarCharacters>
<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
</characters>
Expand Down
2 changes: 1 addition & 1 deletion sldr/y/yna.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
</orientation>
</layout>
<characters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼎 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽏 𖽐 𖽑𖽔 𖽘 𖽙 𖽜 𖽝 𖽞 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽴 𖽵 𖽶 𖽷 𖽹 𖽻 𖽾 𖾁 𖾂 \u16F8F \u16F90 \u16F91 \u16F92]</exemplarCharacters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼍 𖼎 𖼐 𖼖 𖼘 𖼞 𖼠 𖼣 𖼦 𖼨 𖼮 𖼳 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽏 𖽐 𖽑𖽔 𖽘 𖽙 𖽜 𖽝 𖽞 𖽡 𖽢 𖽦 𖽧 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽴 𖽵 𖽶 𖽷 𖽹 𖽻 𖽾 𖾁 𖾂 \U00016F8F \U00016F90 \U00016F91 \U00016F92]</exemplarCharacters>
<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ” 、 。]</exemplarCharacters>
<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
</characters>
Expand Down
2 changes: 1 addition & 1 deletion sldr/y/ywq.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
</orientation>
</layout>
<characters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼌 𖼍 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼯 𖼰 𖼱 𖼳 𖼴 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽘 𖽙 𖽛 𖽜 𖽝 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽶 𖽸 𖽹 𖽺 𖽻 𖽿 𖾀 \u16F8F \u16F90 \u16F91]</exemplarCharacters>
<exemplarCharacters>[𖼀 𖼂 𖼄 𖼇 𖼈 𖼊 𖼌 𖼍 𖼎 𖼐 𖼒 𖼔 𖼖 𖼘 𖼞 𖼡 𖼣 𖼦 𖼨 𖼮 𖼯 𖼰 𖼱 𖼳 𖼴 𖼵 𖼷 𖼹 𖼺 𖼻 𖼽 𖽂 𖽃 𖽐 𖽑 𖽔 𖽘 𖽙 𖽛 𖽜 𖽝 𖽡 𖽢 𖽦 𖽨 𖽪 𖽫 𖽮 𖽱 𖽳 𖽶 𖽸 𖽹 𖽺 𖽻 𖽿 𖾀 \U00016F8F \U00016F90 \U00016F91]</exemplarCharacters>
<exemplarCharacters type="punctuation">[. , \: ; ? ! ' " \- / = + ( ) \[ \] * / _ __ “ ” 、 。]</exemplarCharacters>
<exemplarCharacters type="numbers">[0 1 2 3 4 5 6 7 8 9]</exemplarCharacters>
</characters>
Expand Down
12 changes: 8 additions & 4 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def test_syntax(ldml):
for e in ldml.ldml.root.findall('.//characters/exemplarCharacters'):
t = e.get('type', None)
n = t or "main"
exemplars_rawnocurly[t] = e.text[1:-1].strip().replace("\\", " \\").replace("{", " ").replace("}", " ").replace(" ", " ").split(' ') # adapted from the "get index exemplar" section of test_collation.py
exemplars_rawnocurly[t] = e.text[1:-1].strip().replace("-\\", " \\").replace("\\", " \\").replace("{", " ").replace("}", " ").replace(" ", " ").split(' ') # adapted from the "get index exemplar" section of test_collation.py
# NOTE: This is used for formatting and syntax testing only, not for actually getting info from the exemplar.
exemplars_raw[t] = e.text[1:-1].strip().split(' ') # adapted from the "get index exemplar" section of test_collation.py
rawstring = e.text[1:-1].strip().replace(" ", "") # adapted from the "get index exemplar" section of test_collation.py
s = usets.parse(e.text or "", 'NFD')
Expand All @@ -94,11 +95,14 @@ def test_syntax(ldml):
for i in exemplars_rawnocurly[t]:
if "\\" in i:
if r"\u" in i:
assert len(i)>=6, filename + " " + n + " exemplar has unicode codepoint(s) missing hex digits: " + i
if len(i)>6:
assert len(i)==6, filename + " " + n + " exemplar has a 4-digit unicode codepoint(s) that should be in the 8-digit \\Uxxxxxxxx format: " + i
elif len(i)<6:
assert len(i)==6, filename + " " + n + " exemplar has a 4-digit unicode codepoint(s) missing hex digits: " + i
if r"\U" in i:
assert len(i)==10, filename + " " + n + " exemplar has unicode codepoint(s) missing hex digits: " + i
assert len(i)==10, filename + " " + n + " exemplar has an 8-digit unicode codepoint(s) missing hex digits: " + i
# This next assert assumes that spaces were added between units in an exemplar, since exemplars_rawnocurly can only insert a space BEFORE a backslash. So far nothing fails incorrectly because of that.
#assert len(i)<3 or len(i)==6 or len(i)==10, filename + " " + n + " exemplar has unicode codepoint(s) missing 'u' or 'U': " + i
assert len(i)<3 or len(i)==6 or len(i)==10, filename + " " + n + " exemplar has unicode codepoint(s) missing 'u' or 'U': " + i
# The following lines are a test if characters are incorrectly unescaped.
# The problem with the following tests is that if a range intentionally uses special characters, it will be flagged as an error.
# However, we can't solely test for "is it a valid regex" because the unescaped characters might form a valid regex by accident.
Expand Down

0 comments on commit f452bb9

Please sign in to comment.