diff --git a/icu4c/source/i18n/messageformat2_parser.cpp b/icu4c/source/i18n/messageformat2_parser.cpp index d0e79fcaf823..7260b89c6866 100644 --- a/icu4c/source/i18n/messageformat2_parser.cpp +++ b/icu4c/source/i18n/messageformat2_parser.cpp @@ -24,21 +24,22 @@ using namespace data_model; The `ERROR()` macro sets a syntax error in the context and sets the offset in `parseError` to `index`. It does not alter control flow. */ -#define ERROR(parseError, errorCode, index) \ +#define ERROR(errorCode) \ if (!errors.hasSyntaxError()) { \ setParseError(parseError, index); \ errors.addSyntaxError(errorCode); \ } -// Returns true iff `index` is a valid index for the string `source` -static bool inBounds(const UnicodeString &source, uint32_t index) { - return (((int32_t)index) < source.length()); -} +#define ERROR_AT(errorCode, i) \ + if (!errors.hasSyntaxError()) { \ + setParseError(parseError, i); \ + errors.addSyntaxError(errorCode); \ + } // Increments the line number and updates the "characters seen before -// current line" count in `parseError`, iff `source[index]` is a newline +// current line" count in `parseError`, iff `peek()` is a newline void Parser::maybeAdvanceLine() { - if (source[index] == LF) { + if (peek() == LF) { parseError.line++; // add 1 to index to get the number of characters seen so far // (including the newline) @@ -50,10 +51,15 @@ void Parser::maybeAdvanceLine() { Signals an error and returns either if `parseError` already denotes an error, or `index` is out of bounds for the string `source` */ -#define CHECK_BOUNDS(source, index, parseError, errorCode) \ - if (!inBounds(source, index)) { \ - ERROR(parseError, errorCode, index); \ - return; \ +#define CHECK_BOUNDS(errorCode) \ + if (!inBounds()) { \ + ERROR(errorCode); \ + return; \ + } +#define CHECK_BOUNDS_1(errorCode) \ + if (!inBounds(1)) { \ + ERROR_AT(errorCode, index + 1); \ + return; \ } // ------------------------------------- @@ -254,15 +260,13 @@ static bool isKeyStart(UChar32 c) { return (c == ASTERISK || isLiteralStart(c)); } -inline bool isDeclarationStart(const UnicodeString& source, int32_t index) { - int32_t len = source.length(); - int32_t next = index + 1; - return (source[index] == ID_LOCAL[0] - && next < len - && source[next] == ID_LOCAL[1]) - || (source[index] == ID_INPUT[0] - && next < len - && source[next] == ID_INPUT[1]); +bool Parser::isDeclarationStart() { + return (peek() == ID_LOCAL[0] + && inBounds(1) + && peek(1) == ID_LOCAL[1]) + || (peek() == ID_INPUT[0] + && inBounds(1) + && peek(1) == ID_INPUT[1]); } // ------------------------------------- @@ -382,14 +386,14 @@ option, or the optional space before an attribute. Unless otherwise noted in a comment, all helper functions that take a `source` string, an `index` unsigned int, and an `errorCode` `UErrorCode` have the precondition: - `index` < `source.length()` + `index` < `len()` and the postcondition: - `U_FAILURE(errorCode)` || `index < `source.length()` + `U_FAILURE(errorCode)` || `index < `len()` */ /* No pre, no post. - A message may end with whitespace, so `index` may equal `source.length()` on exit. + A message may end with whitespace, so `index` may equal `len()` on exit. */ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) { bool sawWhitespace = false; @@ -398,7 +402,7 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) // or when we see a non-whitespace character. while (true) { // Check if all input has been consumed - if (!inBounds(source, index)) { + if (!inBounds()) { // If whitespace isn't required -- or if we saw it already -- // then the caller is responsible for checking this case and // setting an error if necessary. @@ -408,24 +412,24 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) } // Otherwise, whitespace is required; the end of the input has // been reached without whitespace. This is an error. - ERROR(parseError, errorCode, index); + ERROR(errorCode); return; } // Input remains; process the next character if it's whitespace, // exit the loop otherwise - if (isWhitespace(source[index])) { + if (isWhitespace(peek())) { sawWhitespace = true; // Increment line number in parse error if we consume a newline maybeAdvanceLine(); - index++; + next(); } else { break; } } if (!sawWhitespace && required) { - ERROR(parseError, errorCode, index); + ERROR(errorCode); } } @@ -444,37 +448,37 @@ void Parser::parseOptionalWhitespace(UErrorCode& errorCode) { parseWhitespaceMaybeRequired(false, errorCode); } -// Consumes a single character, signaling an error if `source[index]` != `c` +// Consumes a single character, signaling an error if `peek()` != `c` // No postcondition -- a message can end with a '}' token void Parser::parseToken(UChar32 c, UErrorCode& errorCode) { - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); - if (source[index] == c) { - index++; + if (peek() == c) { + next(); normalizedInput += c; return; } // Next character didn't match -- error out - ERROR(parseError, errorCode, index); + ERROR(errorCode); } /* Consumes a fixed-length token, signaling an error if the token isn't a prefix of - the string beginning at `source[index]` + the string beginning at `peek()` No postcondition -- a message can end with a '}' token */ template void Parser::parseToken(const UChar32 (&token)[N], UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); int32_t tokenPos = 0; while (tokenPos < N - 1) { - if (source[index] != token[tokenPos]) { - ERROR(parseError, errorCode, index); + if (peek() != token[tokenPos]) { + ERROR(errorCode); return; } normalizedInput += token[tokenPos]; - index++; + next(); tokenPos++; } } @@ -490,11 +494,11 @@ void Parser::parseTokenWithWhitespace(const UChar32 (&token)[N], UErrorCode& err // No need for error check or bounds check before parseOptionalWhitespace parseOptionalWhitespace(errorCode); // Establish precondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); parseToken(token); parseOptionalWhitespace(errorCode); // Guarantee postcondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); } /* @@ -507,11 +511,11 @@ void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) { // No need for error check or bounds check before parseOptionalWhitespace(errorCode) parseOptionalWhitespace(errorCode); // Establish precondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); parseToken(c, errorCode); parseOptionalWhitespace(errorCode); // Guarantee postcondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); } /* @@ -526,19 +530,20 @@ void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) { UnicodeString Parser::parseName(UErrorCode& errorCode) { UnicodeString name; - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); - if (!isNameStart(source[index])) { - ERROR(parseError, errorCode, index); + if (!isNameStart(peek())) { + ERROR(errorCode); return name; } - while (isNameChar(source[index])) { - name += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + while (isNameChar(peek())) { + UChar32 c = peek(); + name += c; + normalizedInput += c; + next(); + if (!inBounds()) { + ERROR(errorCode); break; } } @@ -554,13 +559,13 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) { VariableName Parser::parseVariableName(UErrorCode& errorCode) { VariableName result; - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // If the '$' is missing, we don't want a binding // for this variable to be created. - bool valid = source[index] == DOLLAR; + bool valid = peek() == DOLLAR; parseToken(DOLLAR, errorCode); - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return result; } UnicodeString varName = parseName(errorCode); @@ -576,7 +581,7 @@ VariableName Parser::parseVariableName(UErrorCode& errorCode) { Corresponds to the `identifier` nonterminal in the grammar */ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); UnicodeString result; // The following is a hack to get around ambiguity in the grammar: @@ -591,7 +596,7 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { // Parse namespace result += parseName(errorCode); int32_t firstColon = -1; - while (inBounds(source, index) && source[index] == COLON) { + while (inBounds() && peek() == COLON) { // Parse ':' separator if (firstColon == -1) { firstColon = index; @@ -599,8 +604,8 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { parseToken(COLON, errorCode); result += COLON; // Check for message ending with something like "foo:" - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); } else { // Parse name part result += parseName(errorCode); @@ -612,7 +617,7 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { if (firstColon != -1) { for (int32_t i = firstColon + 1; i < result.length(); i++) { if (result[i] == COLON) { - ERROR(parseError, errorCode, i); + ERROR_AT(errorCode, i); return {}; } } @@ -628,16 +633,16 @@ UnicodeString Parser::parseIdentifier(UErrorCode& errorCode) { Returns the function name. */ FunctionName Parser::parseFunction(UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); - if (!isFunctionStart(source[index])) { - ERROR(parseError, errorCode, index); + U_ASSERT(inBounds()); + if (!isFunctionStart(peek())) { + ERROR(errorCode); return FunctionName(); } - normalizedInput += source[index]; - index++; // Consume the function start character - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + normalizedInput += peek(); + next(); // Consume the function start character + if (!inBounds()) { + ERROR(errorCode); return FunctionName(); } return parseIdentifier(errorCode); @@ -645,7 +650,7 @@ FunctionName Parser::parseFunction(UErrorCode& errorCode) { /* - Precondition: source[index] == BACKSLASH + Precondition: peek() == BACKSLASH Consume an escaped character. Corresponds to `escaped-char` in the grammar. @@ -653,24 +658,24 @@ FunctionName Parser::parseFunction(UErrorCode& errorCode) { No postcondition (a message can end with an escaped char) */ UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); - U_ASSERT(source[index] == BACKSLASH); + U_ASSERT(inBounds()); + U_ASSERT(peek() == BACKSLASH); normalizedInput += BACKSLASH; - index++; // Skip the initial backslash + next(); // Skip the initial backslash UnicodeString str; - if (inBounds(source, index)) { + if (inBounds()) { // Expect a '{', '|' or '}' - switch (source[index]) { + switch (peek()) { case LEFT_CURLY_BRACE: case RIGHT_CURLY_BRACE: case PIPE: case BACKSLASH: { /* Append to the output string */ - str += source[index]; + str += peek(); /* Update normalizedInput */ - normalizedInput += source[index]; + normalizedInput += peek(); /* Consume the character */ - index++; + next(); return str; } default: { @@ -680,7 +685,7 @@ UnicodeString Parser::parseEscapeSequence(UErrorCode& errorCode) { } } // If control reaches here, there was an error - ERROR(parseError, errorCode, index); + ERROR(errorCode); return str; } @@ -695,34 +700,34 @@ Literal Parser::parseQuotedLiteral(UErrorCode& errorCode) { if (U_SUCCESS(errorCode)) { // Parse the opening '|' parseToken(PIPE, errorCode); - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); error = true; } else { // Parse the contents bool done = false; while (!done) { - if (source[index] == BACKSLASH) { + if (peek() == BACKSLASH) { contents += parseEscapeSequence(errorCode); - } else if (isQuotedChar(source[index])) { - contents += source[index]; + } else if (isQuotedChar(peek())) { + contents += peek(); // Handle cases like: // |}{| -- we want to escape everywhere that // can be escaped, to make round-trip checking // easier -- so this case normalizes to // |\}\{| - if (isEscapableChar(source[index])) { + if (isEscapableChar(peek())) { normalizedInput += BACKSLASH; } - normalizedInput += source[index]; - index++; // Consume this character + normalizedInput += peek(); + next(); // Consume this character maybeAdvanceLine(); } else { // Assume the sequence of literal characters ends here done = true; } - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); error = true; break; } @@ -746,18 +751,18 @@ UnicodeString Parser::parseDigits(UErrorCode& errorCode) { return {}; } - U_ASSERT(isDigit(source[index])); + U_ASSERT(isDigit(peek())); UnicodeString contents; do { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } - } while (isDigit(source[index])); + } while (isDigit(peek())); return contents; } @@ -770,7 +775,7 @@ Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { } // unquoted -> name - if (isNameStart(source[index])) { + if (isNameStart(peek())) { return Literal(false, parseName(errorCode)); } @@ -779,75 +784,75 @@ Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { UnicodeString contents; // Parse the sign - if (source[index] == HYPHEN) { - contents += source[index]; - normalizedInput += source[index]; - index++; + if (peek() == HYPHEN) { + contents += peek(); + normalizedInput += peek(); + next(); } - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse the integer part - if (source[index] == ((UChar32)0x0030) /* 0 */) { - contents += source[index]; - normalizedInput += source[index]; - index++; - } else if (isDigit(source[index])) { + if (peek() == ((UChar32)0x0030) /* 0 */) { + contents += peek(); + normalizedInput += peek(); + next(); + } else if (isDigit(peek())) { contents += parseDigits(errorCode); } else { // Error -- nothing else can start a number literal - ERROR(parseError, errorCode, index); + ERROR(errorCode); return {}; } // Parse the decimal point if present - if (source[index] == PERIOD) { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (peek() == PERIOD) { + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse the fraction part - if (isDigit(source[index])) { + if (isDigit(peek())) { contents += parseDigits(errorCode); } else { // '.' not followed by digit is a parse error - ERROR(parseError, errorCode, index); + ERROR(errorCode); return {}; } } - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse the exponent part if present - if (source[index] == UPPERCASE_E || source[index] == LOWERCASE_E) { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (peek() == UPPERCASE_E || peek() == LOWERCASE_E) { + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } // Parse sign if present - if (source[index] == PLUS || source[index] == HYPHEN) { - contents += source[index]; - normalizedInput += source[index]; - index++; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (peek() == PLUS || peek() == HYPHEN) { + contents += peek(); + normalizedInput += peek(); + next(); + if (!inBounds()) { + ERROR(errorCode); return {}; } } // Parse exponent digits - if (!isDigit(source[index])) { - ERROR(parseError, errorCode, index); + if (!isDigit(peek())) { + ERROR(errorCode); return {}; } contents += parseDigits(errorCode); @@ -861,17 +866,17 @@ Literal Parser::parseUnquotedLiteral(UErrorCode& errorCode) { */ Literal Parser::parseLiteral(UErrorCode& errorCode) { Literal result; - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); } else { - if (source[index] == PIPE) { + if (peek() == PIPE) { result = parseQuotedLiteral(errorCode); } else { result = parseUnquotedLiteral(errorCode); } // Guarantee postcondition - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); } } @@ -885,9 +890,9 @@ Literal Parser::parseLiteral(UErrorCode& errorCode) { */ template void Parser::parseAttribute(AttributeAdder& attrAdder, UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); - U_ASSERT(source[index] == AT); + U_ASSERT(peek() == AT); // Consume the '@' parseToken(AT, errorCode); @@ -901,13 +906,13 @@ void Parser::parseAttribute(AttributeAdder& attrAdder, UErrorCode& errorCode) parseOptionalWhitespace(errorCode); Operand rand; - if (source[index] == EQUALS) { + if (peek() == EQUALS) { // Parse '=' parseTokenWithWhitespace(EQUALS, errorCode); UnicodeString rhsStr; // Parse RHS, which is either a literal or variable - switch (source[index]) { + switch (peek()) { case DOLLAR: { rand = Operand(parseVariableName(errorCode)); break; @@ -936,7 +941,7 @@ void Parser::parseAttribute(AttributeAdder& attrAdder, UErrorCode& errorCode) */ template void Parser::parseOption(OptionAdder& addOption, UErrorCode& errorCode) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // Parse LHS UnicodeString lhs = parseIdentifier(errorCode); @@ -947,7 +952,7 @@ void Parser::parseOption(OptionAdder& addOption, UErrorCode& errorCode) { UnicodeString rhsStr; Operand rand; // Parse RHS, which is either a literal or variable - switch (source[index]) { + switch (peek()) { case DOLLAR: { rand = Operand(parseVariableName(errorCode)); break; @@ -984,7 +989,7 @@ void Parser::parseOption(OptionAdder& addOption, UErrorCode& errorCode) { template void Parser::parseOptions(OptionAdder& addOption, UErrorCode& errorCode) { // Early exit if out of bounds -- no more work is possible - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); /* Arbitrary lookahead is required to parse option lists. To see why, consider @@ -1032,7 +1037,7 @@ an option or an attribute. // If the next character is not whitespace, that means we've already // parsed the entire options list (which may have been empty) and there's // no trailing whitespace. In that case, exit. - if (!isWhitespace(source[index])) { + if (!isWhitespace(peek())) { break; } int32_t firstWhitespace = index; @@ -1041,7 +1046,7 @@ an option or an attribute. // one whitespace character. parseRequiredWhitespace(errorCode); // Restore precondition - CHECK_BOUNDS(source, index, parseError, errorCode); + CHECK_BOUNDS(errorCode); // If a name character follows, then at least one more option remains // in the list. @@ -1049,7 +1054,7 @@ an option or an attribute. // and can exit. // Note that exiting is sort of like backtracking: "(s option)" doesn't apply, // so we back out to [s]. - if (!isNameStart(source[index])) { + if (!isNameStart(peek())) { // We've consumed all the options (meaning that either we consumed non-empty // whitespace, or consumed at least one option.) // Done. @@ -1072,8 +1077,8 @@ template void Parser::parseAttributes(AttributeAdder& attrAdder, UErrorCode& errorCode) { // Early exit if out of bounds -- no more work is possible - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); return; } @@ -1086,7 +1091,7 @@ Arbitrary lookahead is required to parse attribute lists, similarly to option li // If the next character is not whitespace, that means we've already // parsed the entire attributes list (which may have been empty) and there's // no trailing whitespace. In that case, exit. - if (!isWhitespace(source[index])) { + if (!isWhitespace(peek())) { break; } @@ -1094,8 +1099,8 @@ Arbitrary lookahead is required to parse attribute lists, similarly to option li // one whitespace character. parseRequiredWhitespace(errorCode); // Restore precondition - if (!inBounds(source, index)) { - ERROR(parseError, errorCode, index); + if (!inBounds()) { + ERROR(errorCode); break; } @@ -1105,7 +1110,7 @@ Arbitrary lookahead is required to parse attribute lists, similarly to option li // and can exit. // Note that exiting is sort of like backtracking: "(s attributes)" doesn't apply, // so we back out to [s]. - if (source[index] != AT) { + if (peek() != AT) { // We've consumed all the attributes (meaning that either we consumed non-empty // whitespace, or consumed at least one attribute.) // Done. @@ -1128,16 +1133,16 @@ void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) { bool empty = true; UnicodeString chunk; - while(reservedChunkFollows(source[index])) { + while(reservedChunkFollows(peek())) { empty = false; // reserved-char - if (isReservedChar(source[index])) { - chunk += source[index]; - normalizedInput += source[index]; + if (isReservedChar(peek())) { + chunk += peek(); + normalizedInput += peek(); // consume the char - index++; + next(); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); continue; } @@ -1146,11 +1151,11 @@ void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) { chunk.setTo(u"", 0); } - if (source[index] == BACKSLASH) { + if (peek() == BACKSLASH) { // reserved-escape result.add(Literal(false, parseEscapeSequence(status)), status); chunk.setTo(u"", 0); - } else if (source[index] == PIPE || isUnquotedStart(source[index])) { + } else if (peek() == PIPE || isUnquotedStart(peek())) { result.add(parseLiteral(status), status); } else { // The reserved chunk ends here @@ -1166,7 +1171,7 @@ void Parser::parseReservedChunk(Reserved::Builder& result, UErrorCode& status) { } if (empty) { - ERROR(parseError, status, index); + ERROR(status); } } @@ -1183,23 +1188,23 @@ Reserved Parser::parseReserved(UErrorCode& status) { return {}; } - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // Require a `reservedStart` character - if (!isReservedStart(source[index])) { - ERROR(parseError, status, index); + if (!isReservedStart(peek())) { + ERROR(status); return Reserved(); } // Add the start char as a separate text chunk - UnicodeString firstCharString(source[index]); + UnicodeString firstCharString(peek()); builder.add(Literal(false, firstCharString), status); if (U_FAILURE(status)) { return {}; } // Consume reservedStart - normalizedInput += source[index]; - index++; + normalizedInput += peek(); + next(); return parseReservedBody(builder, status); } @@ -1243,42 +1248,42 @@ Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& statu (`parseExpression()` is responsible for checking that the next character is in fact a '}'.) */ - if (!inBounds(source, index)) { + if (!inBounds()) { break; } int32_t numWhitespaceChars = 0; int32_t savedIndex = index; - if (isWhitespace(source[index])) { + if (isWhitespace(peek())) { parseOptionalWhitespace(status); numWhitespaceChars = index - savedIndex; // Restore precondition - if (!inBounds(source, index)) { + if (!inBounds()) { break; } } - if (reservedChunkFollows(source[index])) { + if (reservedChunkFollows(peek())) { parseReservedChunk(builder, status); // Avoid looping infinitely - if (U_FAILURE(status) || !inBounds(source, index)) { + if (U_FAILURE(status) || !inBounds()) { break; } } else { if (numWhitespaceChars > 0) { - if (source[index] == LEFT_CURLY_BRACE) { + if (peek() == LEFT_CURLY_BRACE) { // Resolve even more ambiguity (space preceding another piece of // a `reserved-body`, vs. space preceding an expression in `reserved-statement` // "Backtrack" index -= numWhitespaceChars; break; } - if (source[index] == RIGHT_CURLY_BRACE) { + if (peek() == RIGHT_CURLY_BRACE) { // Not an error: just means there's no trailing whitespace // after this `reserved` break; } - if (source[index] == AT) { + if (peek() == AT) { // Not an error, but we have to "backtrack" due to the ambiguity // between an `s` preceding another reserved chunk // and an `s` preceding an attribute list @@ -1287,7 +1292,7 @@ Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& statu } // Error: if there's whitespace, it must either be followed // by a non-empty sequence or by '}' - ERROR(parseError, status, index); + ERROR(status); break; } // If there was no whitespace, it's not an error, @@ -1306,12 +1311,12 @@ Reserved Parser::parseReservedBody(Reserved::Builder& builder, UErrorCode& statu Returns an `Operator` representing this (a reserved is a parse error) */ Operator Parser::parseAnnotation(UErrorCode& status) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); Operator::Builder ratorBuilder(status); if (U_FAILURE(status)) { return {}; } - if (isFunctionStart(source[index])) { + if (isFunctionStart(peek())) { // Consume the function name FunctionName func = parseFunction(status); ratorBuilder.setFunctionName(std::move(func)); @@ -1343,7 +1348,7 @@ void Parser::parseLiteralOrVariableWithAnnotation(bool isVariable, UErrorCode& status) { CHECK_ERROR(status); - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); Operand rand; if (isVariable) { @@ -1377,7 +1382,7 @@ refactoring for the `expression` nonterminal that `parseOptions()` relies on. Se the comment in `parseOptions()` for details. */ - if (isWhitespace(source[index])) { + if (isWhitespace(peek())) { int32_t firstWhitespace = index; // If the next character is whitespace, either [s annotation] or [s] applies @@ -1386,10 +1391,10 @@ the comment in `parseOptions()` for details. // one does apply. parseOptionalWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); // This next check resolves the ambiguity between [s annotation] and [s] - bool isSAnnotation = isAnnotationStart(source[index]); + bool isSAnnotation = isAnnotationStart(peek()); if (isSAnnotation) { normalizedInput += SPACE; @@ -1450,7 +1455,7 @@ Expression Parser::parseExpression(UErrorCode& status) { } // Early return if out of input -- no more work is possible - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); // Parse opening brace parseToken(LEFT_CURLY_BRACE, status); @@ -1459,11 +1464,11 @@ Expression Parser::parseExpression(UErrorCode& status) { Expression::Builder exprBuilder(status); // Restore precondition - if (!inBounds(source, index)) { + if (!inBounds()) { exprFallback(exprBuilder); } else { // literal '|', variable '$' or annotation - switch (source[index]) { + switch (peek()) { case PIPE: { // Quoted literal parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); @@ -1475,15 +1480,15 @@ Expression Parser::parseExpression(UErrorCode& status) { break; } default: { - if (isAnnotationStart(source[index])) { + if (isAnnotationStart(peek())) { Operator rator = parseAnnotation(status); exprBuilder.setOperator(std::move(rator)); - } else if (isUnquotedStart(source[index])) { + } else if (isUnquotedStart(peek())) { // Unquoted literal parseLiteralOrVariableWithAnnotation(false, exprBuilder, status); } else { // Not a literal, variable or annotation -- error out - ERROR(parseError, status, index); + ERROR(status); exprFallback(exprBuilder); break; } @@ -1507,8 +1512,8 @@ Expression Parser::parseExpression(UErrorCode& status) { U_ASSERT(U_SUCCESS(localStatus)); // Check for end-of-input and missing '}' - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); } else { // Otherwise, it's safe to check for the '}' parseToken(RIGHT_CURLY_BRACE, status); @@ -1523,17 +1528,17 @@ Expression Parser::parseExpression(UErrorCode& status) { void Parser::parseLocalDeclaration(UErrorCode& status) { // End-of-input here would be an error; even empty // declarations must be followed by a body - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); parseToken(ID_LOCAL, status); parseRequiredWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); VariableName lhs = parseVariableName(status); parseTokenWithWhitespace(EQUALS, status); // Restore precondition before calling parseExpression() - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); Expression rhs = parseExpression(status); @@ -1571,13 +1576,13 @@ void Parser::parseLocalDeclaration(UErrorCode& status) { void Parser::parseInputDeclaration(UErrorCode& status) { // End-of-input here would be an error; even empty // declarations must be followed by a body - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); parseToken(ID_INPUT, status); parseOptionalWhitespace(status); // Restore precondition before calling parseExpression() - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); // Save the index for error diagnostics int32_t exprIndex = index; @@ -1587,7 +1592,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) { if (!rhs.getOperand().isVariable()) { // This case is a syntax error; report it at the beginning // of the expression - ERROR(parseError, status, exprIndex); + ERROR_AT(status, exprIndex); return; } @@ -1614,7 +1619,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) { Parses a `reserved-statement` per the grammar */ void Parser::parseUnsupportedStatement(UErrorCode& status) { - U_ASSERT(inBounds(source, index) && source[index] == PERIOD); + U_ASSERT(inBounds() && peek() == PERIOD); UnsupportedStatement::Builder builder(status); CHECK_ERROR(status); @@ -1622,7 +1627,7 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // Parse the keyword UnicodeString keyword(PERIOD); normalizedInput += UnicodeString(PERIOD); - index++; + next(); keyword += parseName(status); builder.setKeyword(keyword); @@ -1634,18 +1639,18 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // a '{') // * a '{' - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); - if (source[index] != LEFT_CURLY_BRACE) { - if (!isWhitespace(source[index])) { - ERROR(parseError, status, index); + if (peek() != LEFT_CURLY_BRACE) { + if (!isWhitespace(peek())) { + ERROR(status); return; } // Expect a reserved-body start int32_t savedIndex = index; parseRequiredWhitespace(status); - CHECK_BOUNDS(source, index, parseError, status); - if (isReservedBodyStart(source[index])) { + CHECK_BOUNDS(status); + if (isReservedBodyStart(peek())) { // There is a reserved body Reserved::Builder r(status); builder.setBody(parseReservedBody(r, status)); @@ -1656,8 +1661,8 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { } // Otherwise, the next character must be a '{' // to open the required expression (or optional whitespace) - if (source[index] != LEFT_CURLY_BRACE && !isWhitespace(source[index])) { - ERROR(parseError, status, index); + if (peek() != LEFT_CURLY_BRACE && !isWhitespace(peek())) { + ERROR(status); return; } } @@ -1667,12 +1672,12 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // Need to look ahead to disambiguate a '{' beginning // an expression from one beginning with a quoted pattern int32_t expressionCount = 0; - while (source[index] == LEFT_CURLY_BRACE || isWhitespace(source[index])) { + while (peek() == LEFT_CURLY_BRACE || isWhitespace(peek())) { parseOptionalWhitespace(status); - bool nextIsLbrace = source[index] == LEFT_CURLY_BRACE; - bool nextIsQuotedPattern = nextIsLbrace && inBounds(source, index + 1) - && source[index + 1] == LEFT_CURLY_BRACE; + bool nextIsLbrace = peek() == LEFT_CURLY_BRACE; + bool nextIsQuotedPattern = nextIsLbrace && inBounds(1) + && peek(1) == LEFT_CURLY_BRACE; if (nextIsQuotedPattern) { break; } @@ -1682,7 +1687,7 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { } if (expressionCount <= 0) { // At least one expression is required - ERROR(parseError, status, index); + ERROR(status); return; } dataModel.addUnsupportedStatement(builder.build(status), status); @@ -1693,7 +1698,7 @@ void Parser::parseUnsupportedStatement(UErrorCode& status) { // (It's convenient that all supported keywords have the same length) bool Parser::nextIs(const UChar32 (&keyword)[7]) const { for(int32_t i = 0; i < 6; i++) { - if (!inBounds(source, index + i) || source[index + i] != keyword[i]) { + if (!inBounds(i) || peek(i) != keyword[i]) { return false; } } @@ -1709,17 +1714,17 @@ bool Parser::nextIs(const UChar32 (&keyword)[7]) const { void Parser::parseDeclarations(UErrorCode& status) { // End-of-input here would be an error; even empty // declarations must be followed by a body - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); - while (source[index] == PERIOD) { - CHECK_BOUNDS(source, index + 1, parseError, status); + while (peek() == PERIOD) { + CHECK_BOUNDS_1(status); // Check for unsupported statements first // Lookahead is needed to disambiguate keyword from known keywords if (!nextIs(ID_MATCH) && !nextIs(ID_LOCAL) && !nextIs(ID_INPUT)) { parseUnsupportedStatement(status); - } else if (source[index + 1] == ID_LOCAL[1]) { + } else if (peek(1) == ID_LOCAL[1]) { parseLocalDeclaration(status); - } else if (source[index + 1] == ID_INPUT[1]) { + } else if (peek(1) == ID_INPUT[1]) { parseInputDeclaration(status); } else { // Done parsing declarations @@ -1731,7 +1736,7 @@ void Parser::parseDeclarations(UErrorCode& status) { parseOptionalWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); } } @@ -1743,17 +1748,17 @@ void Parser::parseDeclarations(UErrorCode& status) { */ UnicodeString Parser::parseTextChar(UErrorCode& status) { UnicodeString str; - if (!inBounds(source, index) || !(isTextChar(source[index]))) { + if (!inBounds() || !(isTextChar(peek()))) { // Error -- text-char is expected here - ERROR(parseError, status, index); + ERROR(status); } else { // See comment in parseQuotedLiteral() - if (isEscapableChar(source[index])) { + if (isEscapableChar(peek())) { normalizedInput += BACKSLASH; } - normalizedInput += source[index]; - str += source[index]; - index++; + normalizedInput += peek(); + str += peek(); + next(); maybeAdvanceLine(); } return str; @@ -1764,17 +1769,17 @@ UnicodeString Parser::parseTextChar(UErrorCode& status) { the `key` nonterminal in the grammar */ Key Parser::parseKey(UErrorCode& status) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); Key k; // wildcard by default // Literal | '*' - switch (source[index]) { + switch (peek()) { case ASTERISK: { - index++; + next(); normalizedInput += ASTERISK; // Guarantee postcondition - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return k; } break; @@ -1800,7 +1805,7 @@ SelectorKeys Parser::parseNonEmptyKeys(UErrorCode& status) { return result; } - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); /* Arbitrary lookahead is required to parse key lists. To see why, consider @@ -1832,31 +1837,31 @@ This is addressed using "backtracking" (similarly to `parseOptions()`). keysBuilder.add(parseKey(status), status); // Restore precondition - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return result; } // We've seen at least one whitespace-key pair, so now we can parse // *(s key) [s] - while (source[index] != LEFT_CURLY_BRACE || isWhitespace(source[index])) { // Try to recover from errors - bool wasWhitespace = isWhitespace(source[index]); + while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors + bool wasWhitespace = isWhitespace(peek()); parseRequiredWhitespace(status); if (!wasWhitespace) { // Avoid infinite loop when parsing something like: // when * @{!... - index++; + next(); } // Restore precondition - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return result; } // At this point, it's ambiguous whether we are inside (s key) or [s]. // This check resolves that ambiguity. - if (source[index] == LEFT_CURLY_BRACE) { + if (peek() == LEFT_CURLY_BRACE) { // A pattern follows, so what we just parsed was the optional // trailing whitespace. All the keys have been parsed. @@ -1871,7 +1876,7 @@ This is addressed using "backtracking" (similarly to `parseOptions()`). } Pattern Parser::parseQuotedPattern(UErrorCode& status) { - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); parseToken(LEFT_CURLY_BRACE, status); parseToken(LEFT_CURLY_BRACE, status); @@ -1886,9 +1891,9 @@ Pattern Parser::parseQuotedPattern(UErrorCode& status) { No postcondition (a markup can end a message) */ Markup Parser::parseMarkup(UErrorCode& status) { - U_ASSERT(inBounds(source, index + 1)); + U_ASSERT(inBounds(1)); - U_ASSERT(source[index] == LEFT_CURLY_BRACE); + U_ASSERT(peek() == LEFT_CURLY_BRACE); Markup::Builder builder(status); if (U_FAILURE(status)) { @@ -1896,26 +1901,26 @@ Markup Parser::parseMarkup(UErrorCode& status) { } // Consume the '{' - index++; + next(); normalizedInput += LEFT_CURLY_BRACE; parseOptionalWhitespace(status); bool closing = false; - switch (source[index]) { + switch (peek()) { case NUMBER_SIGN: { // Open or standalone; consume the '#' - normalizedInput += source[index]; - index++; + normalizedInput += peek(); + next(); break; } case SLASH: { // Closing - normalizedInput += source[index]; + normalizedInput += peek(); closing = true; - index++; + next(); break; } default: { - ERROR(parseError, status, index); + ERROR(status); return {}; } } @@ -1925,14 +1930,14 @@ Markup Parser::parseMarkup(UErrorCode& status) { // Parse the options, which must begin with a ' ' // if present - if (inBounds(source, index) && isWhitespace(source[index])) { + if (inBounds() && isWhitespace(peek())) { OptionAdder optionAdder(builder); parseOptions(optionAdder, status); } // Parse the attributes, which also must begin // with a ' ' - if (inBounds(source, index) && isWhitespace(source[index])) { + if (inBounds() && isWhitespace(peek())) { AttributeAdder attrAdder(builder); parseAttributes(attrAdder, status); } @@ -1942,10 +1947,10 @@ Markup Parser::parseMarkup(UErrorCode& status) { bool standalone = false; // Check if this is a standalone or not if (!closing) { - if (inBounds(source, index) && source[index] == SLASH) { + if (inBounds() && peek() == SLASH) { standalone = true; normalizedInput += SLASH; - index++; + next(); } } @@ -1967,23 +1972,25 @@ Markup Parser::parseMarkup(UErrorCode& status) { No postcondition (a placeholder can end a message) */ std::variant Parser::parsePlaceholder(UErrorCode& status) { - U_ASSERT(source[index] == LEFT_CURLY_BRACE); + U_ASSERT(peek() == LEFT_CURLY_BRACE); - if (!inBounds(source, index)) { - ERROR(parseError, status, index); + if (!inBounds()) { + ERROR(status); return exprFallback(status); } - // Need to look ahead arbitrarily since can appear before the '{' and '#' + // Need to look ahead arbitrarily since whitespace + // can appear before the '{' and '#' // in markup - int32_t tempIndex = index + 1; + int32_t tempIndex = 1; bool isMarkup = false; - while (inBounds(source, tempIndex)) { - if (source[tempIndex] == NUMBER_SIGN || source[tempIndex] == SLASH) { + while (inBounds(1)) { + UChar32 c = peek(tempIndex); + if (c == NUMBER_SIGN || c == SLASH) { isMarkup = true; break; } - if (!isWhitespace(source[tempIndex])){ + if (!isWhitespace(c)){ break; } tempIndex++; @@ -1997,7 +2004,7 @@ std::variant Parser::parsePlaceholder(UErrorCode& status) { /* Consume a `simple-message`, matching the nonterminal in the grammar - Postcondition: `index == source.length()` or U_FAILURE(status); + Postcondition: `index == len()` or U_FAILURE(status); for a syntactically correct message, this will consume the entire input */ Pattern Parser::parseSimpleMessage(UErrorCode& status) { @@ -2005,8 +2012,8 @@ Pattern Parser::parseSimpleMessage(UErrorCode& status) { if (U_SUCCESS(status)) { Expression expression; - while (inBounds(source, index)) { - switch (source[index]) { + while (inBounds()) { + switch (peek()) { case LEFT_CURLY_BRACE: { // Must be placeholder std::variant piece = parsePlaceholder(status); @@ -2034,7 +2041,7 @@ Pattern Parser::parseSimpleMessage(UErrorCode& status) { break; } } - if (source[index] == RIGHT_CURLY_BRACE) { + if (peek() == RIGHT_CURLY_BRACE) { // End of quoted pattern break; } @@ -2052,13 +2059,13 @@ Pattern Parser::parseSimpleMessage(UErrorCode& status) { Consume a `selectors` (matching the nonterminal in the grammar), followed by a non-empty sequence of `variant`s (matching the nonterminal in the grammar) preceded by whitespace - No postcondition (on return, `index` might equal `source.length()` with no syntax error + No postcondition (on return, `index` might equal `len()` with no syntax error because a message can end with a variant) */ void Parser::parseSelectors(UErrorCode& status) { CHECK_ERROR(status); - U_ASSERT(inBounds(source, index)); + U_ASSERT(inBounds()); parseToken(ID_MATCH, status); @@ -2066,11 +2073,11 @@ void Parser::parseSelectors(UErrorCode& status) { // Parse selectors // "Backtracking" is required here. It's not clear if whitespace is // (`[s]` selector) or (`[s]` variant) - while (isWhitespace(source[index]) || source[index] == LEFT_CURLY_BRACE) { + while (isWhitespace(peek()) || peek() == LEFT_CURLY_BRACE) { parseOptionalWhitespace(status); // Restore precondition - CHECK_BOUNDS(source, index, parseError, status); - if (source[index] != LEFT_CURLY_BRACE) { + CHECK_BOUNDS(status); + if (peek() != LEFT_CURLY_BRACE) { // This is not necessarily an error, but rather, // means the whitespace we parsed was the optional // whitespace preceding the first variant, not the @@ -2087,20 +2094,20 @@ void Parser::parseSelectors(UErrorCode& status) { // At least one selector is required if (empty) { - ERROR(parseError, status, index); + ERROR(status); return; } #define CHECK_END_OF_INPUT \ - if (((int32_t)index) >= source.length()) { \ + if (!inBounds()) { \ break; \ } \ // Parse variants - while (isWhitespace(source[index]) || isKeyStart(source[index])) { + while (isWhitespace(peek()) || isKeyStart(peek())) { // Trailing whitespace is allowed parseOptionalWhitespace(status); - if (!inBounds(source, index)) { + if (!inBounds()) { return; } @@ -2114,7 +2121,7 @@ void Parser::parseSelectors(UErrorCode& status) { // Restore precondition before calling parsePattern() // (which must return a non-null value) - CHECK_BOUNDS(source, index, parseError, status); + CHECK_BOUNDS(status); Pattern rhs = parseQuotedPattern(status); dataModel.addVariant(std::move(keyList), std::move(rhs), status); @@ -2131,7 +2138,7 @@ void Parser::parseSelectors(UErrorCode& status) { /* Consume a `body` (matching the nonterminal in the grammar), - No postcondition (on return, `index` might equal `source.length()` with no syntax error, + No postcondition (on return, `index` might equal `len()` with no syntax error, because a message can end with a body (trailing whitespace is optional) */ @@ -2150,8 +2157,9 @@ void Parser::errorPattern(UErrorCode& status) { whether this is the intent behind the spec */ UnicodeString partStr(LEFT_CURLY_BRACE); - while (inBounds(source, index)) { - partStr += source[index++]; + while (inBounds()) { + partStr += peek(); + next(); } // Add curly braces around the entire output (same comment as above) partStr += RIGHT_CURLY_BRACE; @@ -2163,13 +2171,13 @@ void Parser::parseBody(UErrorCode& status) { CHECK_ERROR(status); // Out-of-input is a syntax warning - if (!inBounds(source, index)) { + if (!inBounds()) { errorPattern(status); return; } // Body must be either a pattern or selectors - switch (source[index]) { + switch (peek()) { case LEFT_CURLY_BRACE: { // Pattern dataModel.setPattern(parseQuotedPattern(status)); @@ -2181,7 +2189,7 @@ void Parser::parseBody(UErrorCode& status) { return; } default: { - ERROR(parseError, status, index); + ERROR(status); errorPattern(status); return; } @@ -2197,11 +2205,11 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { bool simple = true; // Message can be empty, so we need to only look ahead // if we know it's non-empty - if (inBounds(source, index)) { - if (source[index] == PERIOD - || (index < ((uint32_t) source.length() + 1) - && source[index] == LEFT_CURLY_BRACE - && source[index + 1] == LEFT_CURLY_BRACE)) { + if (inBounds()) { + if (peek() == PERIOD + || (inBounds(1) + && peek() == LEFT_CURLY_BRACE + && peek(1) == LEFT_CURLY_BRACE)) { // A complex message begins with a '.' or '{' parseDeclarations(status); parseBody(status); @@ -2222,8 +2230,8 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { CHECK_ERROR(status); // There are no errors; finally, check that the entire input was consumed - if (((int32_t)index) != source.length()) { - ERROR(parseError, status, index); + if (!allConsumed()) { + ERROR(status); } // Finally, copy the relevant fields of the internal `MessageParseError` diff --git a/icu4c/source/i18n/messageformat2_parser.h b/icu4c/source/i18n/messageformat2_parser.h index 49c9158dafd3..76e9bbf6768e 100644 --- a/icu4c/source/i18n/messageformat2_parser.h +++ b/icu4c/source/i18n/messageformat2_parser.h @@ -142,10 +142,22 @@ namespace message2 { SelectorKeys parseNonEmptyKeys(UErrorCode&); void errorPattern(UErrorCode& status); Pattern parseQuotedPattern(UErrorCode&); + bool isDeclarationStart(); + + UChar32 peek() const { return source.char32At(index) ; } + UChar32 peek(uint32_t i) const { + return source.char32At(source.moveIndex32(index, i)); + } + uint32_t len() const { return source.countChar32(0, source.length()); } + void next() { index = source.moveIndex32(index, 1); } + + bool inBounds() const { return (int32_t) index < source.length(); } + bool inBounds(uint32_t i) const { return source.moveIndex32(index, i) < ((int32_t) len()); } + bool allConsumed() const { return (int32_t) index == source.length(); } // The input string const UnicodeString &source; - // The current position within the input string + // The current position within the input string -- counting in UChar32 uint32_t index; // Represents the current line (and when an error is indicated), // character offset within the line of the parse error diff --git a/testdata/message2/syntax-errors-diagnostics.json b/testdata/message2/syntax-errors-diagnostics.json index b00279429381..79a29a027f9b 100644 --- a/testdata/message2/syntax-errors-diagnostics.json +++ b/testdata/message2/syntax-errors-diagnostics.json @@ -338,6 +338,10 @@ "src": ".match {1} {{_}}", "char": 12, "comment": "Disambiguating a wrong .match from an unsupported statement" + }, + { + "src": "{{{/p o4.􍅲 = 1}}}", + "char": 9 } ] } diff --git a/testdata/message2/valid-tests.json b/testdata/message2/valid-tests.json index 148ef47c715e..4b1ce8d67fb6 100644 --- a/testdata/message2/valid-tests.json +++ b/testdata/message2/valid-tests.json @@ -143,6 +143,14 @@ "comment": "Trailing whitespace after match is valid", "src": ".match {1 :string} * {{}} ", "exp": "" + }, + { + "src" : "𴢸", + "exp" : "𴢸" + }, + { + "src" : "{{􍅲}}", + "exp" : "􍅲" } ] }