From 7a2c6f42d49e7a4003384cf54b187f16e64e47a1 Mon Sep 17 00:00:00 2001 From: Charlie Gordon Date: Sun, 5 May 2024 17:47:40 +0200 Subject: [PATCH] Improve libunicode and libregexp headers (#288) - move all `lre_xxx` functions to libunicode - use flags table `lre_ctype_bits` instead of bitmaps - simplify `lre_is_space`, `lre_js_is_ident_first` and `lre_js_is_ident_next` - simplify `simple_next_token`, handle UTF-8 correctly - simplify `is_let`, remove dead code --- libregexp.c | 29 +------------- libregexp.h | 43 ++------------------- libunicode.c | 94 +++++++++++++++++++++++++++++++++++++++++++++ libunicode.h | 103 ++++++++++++++++++++++++++++++++++++++------------ quickjs.c | 96 ++++++++++++++++++++++++++++------------------ unicode_gen.c | 12 +++--- 6 files changed, 243 insertions(+), 134 deletions(-) diff --git a/libregexp.c b/libregexp.c index d73a19f39..1091506fe 100644 --- a/libregexp.c +++ b/libregexp.c @@ -30,6 +30,7 @@ #include "cutils.h" #include "libregexp.h" +#include "libunicode.h" /* TODO: @@ -141,32 +142,6 @@ static const uint16_t char_range_s[] = { 0xFEFF, 0xFEFF + 1, }; -BOOL lre_is_space(int c) -{ - int i, n, low, high; - n = (countof(char_range_s) - 1) / 2; - for(i = 0; i < n; i++) { - low = char_range_s[2 * i + 1]; - if (c < low) - return FALSE; - high = char_range_s[2 * i + 2]; - if (c < high) - return TRUE; - } - return FALSE; -} - -uint32_t const lre_id_start_table_ascii[4] = { - /* $ A-Z _ a-z */ - 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE -}; - -uint32_t const lre_id_continue_table_ascii[4] = { - /* $ 0-9 A-Z _ a-z */ - 0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE -}; - - static const uint16_t char_range_w[] = { 4, 0x0030, 0x0039 + 1, @@ -186,7 +161,7 @@ typedef enum { CHAR_RANGE_W, } CharRangeEnum; -static const uint16_t *char_range_table[] = { +static const uint16_t * const char_range_table[] = { char_range_d, char_range_s, char_range_w, diff --git a/libregexp.h b/libregexp.h index 757b27730..7af7ece0f 100644 --- a/libregexp.h +++ 
b/libregexp.h @@ -25,10 +25,7 @@ #define LIBREGEXP_H #include <stddef.h> - -#include "libunicode.h" - -#define LRE_BOOL int /* for documentation purposes */ +#include <stdint.h> #define LRE_FLAG_GLOBAL (1 << 0) #define LRE_FLAG_IGNORECASE (1 << 1) @@ -50,43 +47,9 @@ int lre_exec(uint8_t **capture, int cbuf_type, void *opaque); int lre_parse_escape(const uint8_t **pp, int allow_utf16); -LRE_BOOL lre_is_space(int c); -/* must be provided by the user */ -LRE_BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size); +/* must be provided by the user, return non zero if overflow */ +int lre_check_stack_overflow(void *opaque, size_t alloca_size); void *lre_realloc(void *opaque, void *ptr, size_t size); -/* JS identifier test */ -extern uint32_t const lre_id_start_table_ascii[4]; -extern uint32_t const lre_id_continue_table_ascii[4]; - -static inline int lre_js_is_ident_first(int c) -{ - if ((uint32_t)c < 128) { - return (lre_id_start_table_ascii[c >> 5] >> (c & 31)) & 1; - } else { -#ifdef CONFIG_ALL_UNICODE - return lre_is_id_start(c); -#else - return !lre_is_space(c); -#endif - } -} - -static inline int lre_js_is_ident_next(int c) -{ - if ((uint32_t)c < 128) { - return (lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1; - } else { - /* ZWNJ and ZWJ are accepted in identifiers */ -#ifdef CONFIG_ALL_UNICODE - return lre_is_id_continue(c) || c == 0x200C || c == 0x200D; -#else - return !lre_is_space(c) || c == 0x200C || c == 0x200D; -#endif - } -} - -#undef LRE_BOOL - #endif /* LIBREGEXP_H */ diff --git a/libunicode.c b/libunicode.c index a631bbd2c..c80d2f3d1 100644 --- a/libunicode.c +++ b/libunicode.c @@ -1814,3 +1814,97 @@ int unicode_prop(CharRange *cr, const char *prop_name) } #endif /* CONFIG_ALL_UNICODE */ + +/*---- lre codepoint categorizing functions ----*/ + +#define S UNICODE_C_SPACE +#define D UNICODE_C_DIGIT +#define X UNICODE_C_XDIGIT +#define U UNICODE_C_UPPER +#define L UNICODE_C_LOWER +#define _ UNICODE_C_UNDER +#define d UNICODE_C_DOLLAR + +uint8_t const 
lre_ctype_bits[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, S, S, S, S, S, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + S, 0, 0, 0, d, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + X|D, X|D, X|D, X|D, X|D, X|D, X|D, X|D, + X|D, X|D, 0, 0, 0, 0, 0, 0, + + 0, X|U, X|U, X|U, X|U, X|U, X|U, U, + U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, + U, U, U, 0, 0, 0, 0, _, + + 0, X|L, X|L, X|L, X|L, X|L, X|L, L, + L, L, L, L, L, L, L, L, + L, L, L, L, L, L, L, L, + L, L, L, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + S, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +#undef S +#undef D +#undef X +#undef U +#undef L +#undef _ +#undef d + +/* code point ranges for Zs,Zl or Zp property */ +static const uint16_t char_range_s[] = { + 10, + 0x0009, 0x000D + 1, + 0x0020, 0x0020 + 1, + 0x00A0, 0x00A0 + 1, + 0x1680, 0x1680 + 1, + 0x2000, 0x200A + 1, + /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ + /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ + 0x2028, 0x2029 + 1, + 0x202F, 0x202F + 1, + 0x205F, 0x205F + 1, + 0x3000, 0x3000 + 1, + /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ + 0xFEFF, 0xFEFF + 1, +}; + +BOOL lre_is_space_non_ascii(uint32_t c) +{ + size_t i, n; + + n = countof(char_range_s); + for(i = 5; i < n; i += 2) { + uint32_t low = char_range_s[i]; + uint32_t high = char_range_s[i + 1]; + if (c < low) + return FALSE; + if (c < high) + return TRUE; + } + return FALSE; +} diff --git a/libunicode.h b/libunicode.h index f416157b9..cc2f244c7 100644 --- a/libunicode.h +++ b/libunicode.h @@ -24,27 +24,13 @@ #ifndef LIBUNICODE_H #define LIBUNICODE_H -#include <inttypes.h> - -#define LRE_BOOL int /* for documentation 
purposes */ +#include <stdint.h> /* define it to include all the unicode tables (40KB larger) */ #define CONFIG_ALL_UNICODE #define LRE_CC_RES_LEN_MAX 3 -typedef enum { - UNICODE_NFC, - UNICODE_NFD, - UNICODE_NFKC, - UNICODE_NFKD, -} UnicodeNormalizationEnum; - -int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); -int lre_canonicalize(uint32_t c, LRE_BOOL is_unicode); -LRE_BOOL lre_is_cased(uint32_t c); -LRE_BOOL lre_is_case_ignorable(uint32_t c); - /* char ranges */ typedef struct { @@ -102,12 +88,14 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, int cr_invert(CharRange *cr); -int cr_regexp_canonicalize(CharRange *cr, LRE_BOOL is_unicode); - -#ifdef CONFIG_ALL_UNICODE +int cr_regexp_canonicalize(CharRange *cr, int is_unicode); -LRE_BOOL lre_is_id_start(uint32_t c); -LRE_BOOL lre_is_id_continue(uint32_t c); +typedef enum { + UNICODE_NFC, + UNICODE_NFD, + UNICODE_NFKC, + UNICODE_NFKD, +} UnicodeNormalizationEnum; int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, UnicodeNormalizationEnum n_type, @@ -115,13 +103,80 @@ int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len, /* Unicode character range functions */ -int unicode_script(CharRange *cr, - const char *script_name, LRE_BOOL is_ext); +int unicode_script(CharRange *cr, const char *script_name, int is_ext); int unicode_general_category(CharRange *cr, const char *gc_name); int unicode_prop(CharRange *cr, const char *prop_name); -#endif /* CONFIG_ALL_UNICODE */ +int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); +int lre_canonicalize(uint32_t c, int is_unicode); + +/* Code point type categories */ +enum { + UNICODE_C_SPACE = (1 << 0), + UNICODE_C_DIGIT = (1 << 1), + UNICODE_C_UPPER = (1 << 2), + UNICODE_C_LOWER = (1 << 3), + UNICODE_C_UNDER = (1 << 4), + UNICODE_C_DOLLAR = (1 << 5), + UNICODE_C_XDIGIT = (1 << 6), +}; +extern uint8_t const lre_ctype_bits[256]; + +/* zero or non-zero return value */ +int lre_is_cased(uint32_t c); +int 
lre_is_case_ignorable(uint32_t c); +int lre_is_id_start(uint32_t c); +int lre_is_id_continue(uint32_t c); + +static inline int lre_is_space_byte(uint8_t c) { + return lre_ctype_bits[c] & UNICODE_C_SPACE; +} + +static inline int lre_is_id_start_byte(uint8_t c) { + return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | + UNICODE_C_UNDER | UNICODE_C_DOLLAR); +} -#undef LRE_BOOL +static inline int lre_is_id_continue_byte(uint8_t c) { + return lre_ctype_bits[c] & (UNICODE_C_UPPER | UNICODE_C_LOWER | + UNICODE_C_UNDER | UNICODE_C_DOLLAR | + UNICODE_C_DIGIT); +} + +int lre_is_space_non_ascii(uint32_t c); + +static inline int lre_is_space(uint32_t c) { + if (c < 256) + return lre_is_space_byte(c); + else + return lre_is_space_non_ascii(c); +} + +static inline int lre_js_is_ident_first(uint32_t c) { + if (c < 128) { + return lre_is_id_start_byte(c); + } else { +#ifdef CONFIG_ALL_UNICODE + return lre_is_id_start(c); +#else + return !lre_is_space_non_ascii(c); +#endif + } +} + +static inline int lre_js_is_ident_next(uint32_t c) { + if (c < 128) { + return lre_is_id_continue_byte(c); + } else { + /* ZWNJ and ZWJ are accepted in identifiers */ + if (c >= 0x200C && c <= 0x200D) + return TRUE; +#ifdef CONFIG_ALL_UNICODE + return lre_is_id_continue(c); +#else + return !lre_is_space_non_ascii(c); +#endif + } +} #endif /* LIBUNICODE_H */ diff --git a/quickjs.c b/quickjs.c index e8fdd8aa7..283419539 100644 --- a/quickjs.c +++ b/quickjs.c @@ -44,6 +44,7 @@ #include "list.h" #include "quickjs.h" #include "libregexp.h" +#include "libunicode.h" #include "libbf.h" #define OPTIMIZE 1 @@ -21188,8 +21189,7 @@ static JSAtom json_parse_ident(JSParseState *s, const uint8_t **pp, int c) for(;;) { buf[ident_pos++] = c; c = *p; - if (c >= 128 || - !((lre_id_continue_table_ascii[c >> 5] >> (c & 31)) & 1)) + if (c >= 128 || !lre_is_id_continue_byte(c)) break; p++; if (unlikely(ident_pos >= ident_size - UTF8_CHAR_LEN_MAX)) { @@ -21401,9 +21401,29 @@ static __exception int 
json_next_token(JSParseState *s) return -1; } -/* only used for ':' and '=>', 'let' or 'function' look-ahead. *pp is - only set if TOK_IMPORT is returned */ -/* XXX: handle all unicode cases */ +static int match_identifier(const uint8_t *p, const char *s) { + uint32_t c; + while (*s) { + if ((uint8_t)*s++ != *p++) + return 0; + } + c = *p; + if (c >= 128) + c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + return !lre_js_is_ident_next(c); +} + +/* simple_next_token() is used to check for the next token in simple cases. + It is only used for ':' and '=>', 'let' or 'function' look-ahead. + (*pp) is only set if TOK_IMPORT is returned for JS_DetectModule() + Whitespace and comments are skipped correctly. + Then the next token is analyzed, only for specific words. + Return values: + - '\n' if !no_line_terminator + - TOK_ARROW, TOK_IN, TOK_IMPORT, TOK_OF, TOK_EXPORT, TOK_FUNCTION + - TOK_IDENT is returned for other identifiers and keywords + - otherwise the next character or unicode codepoint is returned. 
+ */ static int simple_next_token(const uint8_t **pp, BOOL no_line_terminator) { const uint8_t *p; @@ -21447,33 +21467,42 @@ static int simple_next_token(const uint8_t **pp, BOOL no_line_terminator) if (*p == '>') return TOK_ARROW; break; + case 'i': + if (match_identifier(p, "n")) + return TOK_IN; + if (match_identifier(p, "mport")) { + *pp = p + 5; + return TOK_IMPORT; + } + return TOK_IDENT; + case 'o': + if (match_identifier(p, "f")) + return TOK_OF; + return TOK_IDENT; + case 'e': + if (match_identifier(p, "xport")) + return TOK_EXPORT; + return TOK_IDENT; + case 'f': + if (match_identifier(p, "unction")) + return TOK_FUNCTION; + return TOK_IDENT; + case '\\': + if (*p == 'u') { + if (lre_js_is_ident_first(lre_parse_escape(&p, TRUE))) + return TOK_IDENT; + } + break; default: - if (lre_js_is_ident_first(c)) { - if (c == 'i') { - if (p[0] == 'n' && !lre_js_is_ident_next(p[1])) { - return TOK_IN; - } - if (p[0] == 'm' && p[1] == 'p' && p[2] == 'o' && - p[3] == 'r' && p[4] == 't' && - !lre_js_is_ident_next(p[5])) { - *pp = p + 5; - return TOK_IMPORT; - } - } else if (c == 'o' && *p == 'f' && !lre_js_is_ident_next(p[1])) { - return TOK_OF; - } else if (c == 'e' && - p[0] == 'x' && p[1] == 'p' && p[2] == 'o' && - p[3] == 'r' && p[4] == 't' && - !lre_js_is_ident_next(p[5])) { - *pp = p + 5; - return TOK_EXPORT; - } else if (c == 'f' && p[0] == 'u' && p[1] == 'n' && - p[2] == 'c' && p[3] == 't' && p[4] == 'i' && - p[5] == 'o' && p[6] == 'n' && !lre_js_is_ident_next(p[7])) { - return TOK_FUNCTION; - } - return TOK_IDENT; + if (c >= 128) { + c = unicode_from_utf8(p - 1, UTF8_CHAR_LEN_MAX, &p); + if (no_line_terminator && (c == CP_PS || c == CP_LS)) + return '\n'; } + if (lre_is_space(c)) + continue; + if (lre_js_is_ident_first(c)) + return TOK_IDENT; break; } return c; @@ -26211,7 +26240,6 @@ static int is_let(JSParseState *s, int decl_mask) int res = FALSE; if (token_is_pseudo_keyword(s, JS_ATOM_let)) { -#if 1 JSParsePos pos; js_parse_get_pos(s, &pos); for (;;) { @@ 
-26244,12 +26272,6 @@ static int is_let(JSParseState *s, int decl_mask) if (js_parse_seek_token(s, &pos)) { res = -1; } -#else - int tok = peek_token(s, TRUE); - if (tok == '{' || tok == TOK_IDENT || peek_token(s, FALSE) == '[') { - res = TRUE; - } -#endif } return res; } diff --git a/unicode_gen.c b/unicode_gen.c index 14811ef49..4f38052be 100644 --- a/unicode_gen.c +++ b/unicode_gen.c @@ -273,7 +273,7 @@ int find_name(const char **tab, int tab_len, const char *name) return -1; } -static int get_prop(uint32_t c, int prop_idx) +static BOOL get_prop(uint32_t c, int prop_idx) { return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1; } @@ -1981,7 +1981,7 @@ void check_flags(void) BOOL flag_ref, flag; for(c = 0; c <= CHARCODE_MAX; c++) { flag_ref = get_prop(c, PROP_Cased); - flag = lre_is_cased(c); + flag = !!lre_is_cased(c); if (flag != flag_ref) { printf("ERROR: c=%05x cased=%d ref=%d\n", c, flag, flag_ref); @@ -1989,7 +1989,7 @@ void check_flags(void) } flag_ref = get_prop(c, PROP_Case_Ignorable); - flag = lre_is_case_ignorable(c); + flag = !!lre_is_case_ignorable(c); if (flag != flag_ref) { printf("ERROR: c=%05x case_ignorable=%d ref=%d\n", c, flag, flag_ref); @@ -1997,7 +1997,7 @@ void check_flags(void) } flag_ref = get_prop(c, PROP_ID_Start); - flag = lre_is_id_start(c); + flag = !!lre_is_id_start(c); if (flag != flag_ref) { printf("ERROR: c=%05x id_start=%d ref=%d\n", c, flag, flag_ref); @@ -2005,7 +2005,7 @@ void check_flags(void) } flag_ref = get_prop(c, PROP_ID_Continue); - flag = lre_is_id_continue(c); + flag = !!lre_is_id_continue(c); if (flag != flag_ref) { printf("ERROR: c=%05x id_cont=%d ref=%d\n", c, flag, flag_ref); @@ -2019,7 +2019,7 @@ void check_flags(void) count = 0; for(c = 0x20; c <= 0xffff; c++) { flag_ref = get_prop(c, PROP_ID_Start); - flag = lre_is_id_start(c); + flag = !!lre_is_id_start(c); assert(flag == flag_ref); count++; }