From 237b087211125daf4878a576ef1df6fda31604d1 Mon Sep 17 00:00:00 2001 From: ReFreezed Date: Mon, 17 May 2021 02:42:49 +0200 Subject: [PATCH] Updated Unicode data. --- Changelog.txt | 2 +- misc/generateStringEscapeSequenceInfo.lua | 272 +++++++++++++++++++++- preprocess.lua | 30 +-- tests/quickTest.lua2p | 2 +- tests/quickTest.output.lua | 2 +- 5 files changed, 284 insertions(+), 24 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 3745f79..92a01d2 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -3,7 +3,7 @@ LuaPreprocess v1.13.1 (2021-05-16) Library: -- Dual code now supports multiple identifiers: !!x, y = ... +- Dual code now supports multiple assignment targets: !!x, y = ... - Some non-ASCII characters in serialized strings look nicer. - Added params.fastStrings . - Fixed backtick strings not working in macros. diff --git a/misc/generateStringEscapeSequenceInfo.lua b/misc/generateStringEscapeSequenceInfo.lua index 822f10a..e350ce1 100644 --- a/misc/generateStringEscapeSequenceInfo.lua +++ b/misc/generateStringEscapeSequenceInfo.lua @@ -1,7 +1,16 @@ +-- -- Unicode characters not to encode with escape sequences in strings. --- https://en.wikipedia.org/wiki/List_of_Unicode_characters +-- Updated: 2021-05-17 +-- +-- U+1234 = include +-- !U+1234 = exclude +-- U+1x3x = 'x' means range between 0 and F local codepointsStr = [[ + +Source: https://en.wikipedia.org/wiki/List_of_Unicode_characters +---------------------------------------------------------------- + Basic Latin U+0020 (space) U+0021 ! @@ -974,17 +983,268 @@ U+203C ‼ U+203E ‾ U+2044 ⁄ U+204A ⁊ + +Source: https://en.wikipedia.org/wiki/Unicode_block +---------------------------------------------------------------- + +General Punctuation +U+201x !U+2011 +U+2020 U+2021 U+2022 U+2023 U+2024 U+2025 U+2026 U+2027 +U+203x +U+204x +U+205x !U+205F + +Superscripts and Subscripts +U+207x !U+2072 !U+2073 +U+208x !U+208F +U+209x !U+209D !U+209E !U+209F + +Currency Symbols +U+20Ax +U+20Bx + +Letterlike Symbols +U+210x +U+211x +U+212x +U+213x +U+214x + +Number Forms +U+215x +U+216x +U+217x +U+218x !U+218C !U+218D !U+218E !U+218F + +Arrows +U+219x +U+21Ax +U+21Bx +U+21Cx +U+21Dx +U+21Ex +U+21Fx + +Mathematical Operators +U+220x +U+221x +U+222x +U+223x +U+224x +U+225x +U+226x +U+227x +U+228x +U+229x +U+22Ax +U+22Bx +U+22Cx +U+22Dx +U+22Ex +U+22Fx + +Miscellaneous Technical +U+230x +U+231x +U+232x +U+233x +U+234x +U+235x +U+236x +U+237x +U+238x +U+239x +U+23Ax +U+23Bx +U+23Cx +U+23Dx +U+23Ex +U+23Fx + +Control Pictures +U+240x +U+241x +U+2420 U+2421 U+2422 U+2423 U+2424 U+2425 U+2426 + +Enclosed Alphanumerics +U+246x +U+247x +U+248x +U+249x +U+24Ax +U+24Bx +U+24Cx +U+24Dx +U+24Ex +U+24Fx + +Box Drawing +U+250x +U+251x +U+252x +U+253x +U+254x +U+255x +U+256x +U+257x + +Block Elements +U+258x +U+259x + +Geometric Shapes +U+25Ax +U+25Bx +U+25Cx +U+25Dx +U+25Ex +U+25Fx + +Miscellaneous Symbols +U+260x +U+261x +U+262x +U+263x +U+264x +U+265x +U+266x +U+267x +U+268x +U+269x +U+26Ax +U+26Bx +U+26Cx +U+26Dx +U+26Ex +U+26Fx + +Dingbats +U+270x +U+271x +U+272x +U+273x +U+274x +U+275x +U+276x +U+277x +U+278x +U+279x +U+27Ax +U+27Bx + +Miscellaneous Mathematical Symbols-A +U+27Cx +U+27Dx +U+27Ex + +Supplemental Arrows-A +U+27Fx + +Supplemental Arrows-B +U+290x +U+291x +U+292x +U+293x +U+294x +U+295x +U+296x +U+297x + +Miscellaneous Mathematical Symbols-B +U+298x +U+299x +U+29Ax +U+29Bx +U+29Cx +U+29Dx +U+29Ex +U+29Fx + +Supplemental Mathematical Operators +U+2A0x +U+2A1x +U+2A2x +U+2A3x +U+2A4x +U+2A5x +U+2A6x +U+2A7x +U+2A8x +U+2A9x +U+2AAx +U+2ABx +U+2ACx +U+2ADx +U+2AEx +U+2AFx + +Alphabetic Presentation Forms +U+FB00 U+FB01 U+FB02 U+FB03 U+FB04 U+FB05 U+FB06 + +Mathematical Alphanumeric Symbols +(some of these seem problematic) ]] local lowest = 1/0 local highest = 0 local cpSet = {} -for cpHex in codepointsStr:gmatch"U%+0*(%x+)" do - local cp = tonumber(cpHex, 16) - lowest = math.min(lowest, cp) - highest = math.max(highest, cp) - cpSet[cp] = true +local function eachCodepoint(cpHexPattern) + if not cpHexPattern:find"[Xx]" then + local cpHex = cpHexPattern + local done = false + + return function() + if not done then + done = true + return tonumber(cpHex, 16) + end + end + end + + -- Every 'x' in the hex number pattern is a variable. + local variables = {} + + for _ in cpHexPattern:gmatch"[Xx]" do + table.insert(variables, 0) + end + + variables[#variables] = -1 + + return function() + -- Increase the number represented by the variables. + for i = #variables, 1, -1 do + variables[i] = variables[i] + 1 + if variables[i] < 16 then break end + variables[i] = 0 + if i == 1 then return end -- Done! + end + + local i = 0 + + local cpHex = cpHexPattern:gsub("[Xx]", function() + i = i + 1 + return ("%X"):format(variables[i]) + end) + + return tonumber(cpHex, 16) + end +end + +for ignore, cpHexPattern in codepointsStr:gmatch"(!?)U%+0*([%xXx]+)" do + ignore = (ignore == "!") + + for cp in eachCodepoint(cpHexPattern) do + if ignore then + print(("Ignoring U+%04X"):format(cp)) + elseif cpSet[cp] then + print(("Duplicate U+%04X"):format(cp)) + end + + lowest = math.min(lowest, cp) -- (It's fine if lowest and highest becomes incorrect if ignore is ever true.) + highest = math.max(highest, cp) + cpSet[cp] = not ignore + end end local ranges = {} diff --git a/preprocess.lua b/preprocess.lua index 123794f..1ee382e 100644 --- a/preprocess.lua +++ b/preprocess.lua @@ -844,6 +844,7 @@ end +-- (Table generated by misc/generateStringEscapeSequenceInfo.lua) local UNICODE_RANGES_NOT_TO_ESCAPE = { {from=32, to=126}, {from=161, to=591}, @@ -864,17 +865,18 @@ local UNICODE_RANGES_NOT_TO_ESCAPE = { {from=7808, to=7813}, {from=7835, to=7835}, {from=7922, to=7923}, - {from=8211, to=8213}, - {from=8215, to=8222}, - {from=8224, to=8226}, - {from=8230, to=8230}, - {from=8240, to=8240}, - {from=8242, to=8243}, - {from=8249, to=8250}, - {from=8252, to=8252}, - {from=8254, to=8254}, - {from=8260, to=8260}, - {from=8266, to=8266}, + {from=8208, to=8208}, + {from=8210, to=8231}, + {from=8240, to=8286}, + {from=8304, to=8305}, + {from=8308, to=8334}, + {from=8336, to=8348}, + {from=8352, to=8383}, + {from=8448, to=8587}, + {from=8592, to=9254}, + {from=9312, to=10239}, + {from=10496, to=11007}, + {from=64256, to=64262}, } local function shouldCodepointBeEscaped(cp) @@ -979,10 +981,8 @@ function serialize(buffer, v) elseif c == quote then tableInsert(buffer, [[\]]) ; tableInsert(buffer, quote) ; pos = pos+1 -- UTF-8 character. - elseif len == 1 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes! - elseif len == 2 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+1)) ; pos = pos+2 - elseif len == 3 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+2)) ; pos = pos+3 - elseif len == 4 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+3)) ; pos = pos+4 + elseif len == 1 and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos )) ; pos = pos+1 -- @Speed: We can insert multiple single-byte characters sometimes! + elseif len and not shouldCodepointBeEscaped(cp) then tableInsert(buffer, v:sub(pos, pos+len-1)) ; pos = pos+len -- Anything else. else diff --git a/tests/quickTest.lua2p b/tests/quickTest.lua2p index 297188f..38212c2 100644 --- a/tests/quickTest.lua2p +++ b/tests/quickTest.lua2p @@ -28,7 +28,7 @@ comment here...]] true !wrapped("dogs") !wrapped("clouds") -local data = !("a\n1Ü2\"\10\0003") +local data = !("a\n1Ü2\"\10\255\255\0003") diff --git a/tests/quickTest.output.lua b/tests/quickTest.output.lua index 58b8a89..d3b0206 100644 --- a/tests/quickTest.output.lua +++ b/tests/quickTest.output.lua @@ -23,7 +23,7 @@ print"Get wrapped! Also, dogs..." print"Get wrapped! Also, clouds..." -local data = 'a\n1Ü2"\n\0003' +local data = 'a\n1Ü2"\n\255\255\0003'