From f1db9275b6480f5e71512ee6d73c3ad5013d2eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Hohwiller?= Date: Wed, 25 Dec 2024 15:44:53 +0100 Subject: [PATCH] m-m-m/base#8: full unicode support --- .../scanner/AbstractCharStreamScanner.java | 554 ++-- .../github/mmm/scanner/CharEscapeHelper.java | 42 +- .../github/mmm/scanner/CharReaderScanner.java | 122 +- .../github/mmm/scanner/CharScannerSyntax.java | 504 ++-- .../mmm/scanner/CharScannerSyntaxBean.java | 489 ++-- .../mmm/scanner/CharSequenceScanner.java | 937 +++---- .../github/mmm/scanner/CharStreamScanner.java | 2356 +++++++++-------- .../number/CharScannerNumberParser.java | 9 +- .../number/CharScannerNumberParserBase.java | 26 +- .../number/CharScannerNumberParserLang.java | 14 +- .../AbstractCharStreamScannerTest.java | 4 +- 11 files changed, 2467 insertions(+), 2590 deletions(-) diff --git a/core/src/main/java/io/github/mmm/scanner/AbstractCharStreamScanner.java b/core/src/main/java/io/github/mmm/scanner/AbstractCharStreamScanner.java index 18cb815..75fe784 100644 --- a/core/src/main/java/io/github/mmm/scanner/AbstractCharStreamScanner.java +++ b/core/src/main/java/io/github/mmm/scanner/AbstractCharStreamScanner.java @@ -21,8 +21,8 @@ /** * Abstract implementation of {@link CharStreamScanner}.
- * ATTENTION:
- * This implementation and its sub-classes are NOT thread-safe and have no intention to be thread-safe. + * + * @since 1.0.0 */ public abstract class AbstractCharStreamScanner implements CharStreamScanner { @@ -33,7 +33,7 @@ public abstract class AbstractCharStreamScanner implements CharStreamScanner { private final TextFormatMessageHandler messageHandler; /** The internal buffer with character data. */ - protected char[] buffer; + protected String buffer; /** The start position in the {@link #buffer} from where reading operations consumer data from. */ protected int offset; @@ -56,36 +56,12 @@ public abstract class AbstractCharStreamScanner implements CharStreamScanner { /** * The constructor. * - * @param capacity the capacity of the internal buffer in {@code char}s. + * @param charBuffer the internal {@code char[]} buffer. * @param messageHandler the {@link TextFormatMessageHandler}. */ - public AbstractCharStreamScanner(int capacity, TextFormatMessageHandler messageHandler) { + public AbstractCharStreamScanner(String charBuffer, TextFormatMessageHandler messageHandler) { - this(new char[capacity], messageHandler); - } - - /** - * The constructor. - * - * @param capacity the capacity of the internal buffer in {@code char}s. - * @param messageHandler the {@link TextFormatMessageHandler}. - * @param line the initial {@link #getLine() line}. - * @param column the initial {@link #getColumn() column}. - */ - public AbstractCharStreamScanner(int capacity, TextFormatMessageHandler messageHandler, int line, int column) { - - this(new char[capacity], messageHandler, line, column); - } - - /** - * The constructor. - * - * @param buffer the internal {@code char[]} buffer. - * @param messageHandler the {@link TextFormatMessageHandler}. 
- */ - public AbstractCharStreamScanner(char[] buffer, TextFormatMessageHandler messageHandler) { - - this(buffer, messageHandler, 1, 1); + this(charBuffer, messageHandler, 1, 1); } /** @@ -96,7 +72,7 @@ public AbstractCharStreamScanner(char[] buffer, TextFormatMessageHandler message * @param line the initial {@link #getLine() line}. * @param column the initial {@link #getColumn() column}. */ - public AbstractCharStreamScanner(char[] buffer, TextFormatMessageHandler messageHandler, int line, int column) { + public AbstractCharStreamScanner(String buffer, TextFormatMessageHandler messageHandler, int line, int column) { super(); if (messageHandler == null) { @@ -181,7 +157,7 @@ protected StringBuilder append(StringBuilder builder, int start, int end) { return builder; } StringBuilder b = builder(builder); - b.append(this.buffer, start, len); + b.append(this.buffer, start, end); return b; } @@ -196,14 +172,13 @@ protected StringBuilder append(StringBuilder builder, int start, int end) { */ protected String getAppended(StringBuilder builder, int start, int end) { - int len = end - start; - if (len <= 0) { + if (end <= start) { return eot(builder, true); } if (builder == null) { - return new String(this.buffer, start, len); + return this.buffer.substring(start, end); } else { - builder.append(this.buffer, start, len); + builder.append(this.buffer, start, end); return builder.toString(); } } @@ -264,29 +239,29 @@ protected boolean fill() { } @Override - public char next() { + public int next() { if (hasNext()) { - return handleChar(this.buffer[this.offset++]); + return handleCodePoint(this.buffer.codePointAt(this.offset++)); } - return 0; + return EOS; } /** * Updates {@link #getColumn() column} and {@link #getLine() line} if the given character is consumed. * - * @param c the character to handle. + * @param codePoint the character to handle. * @return the given character. 
*/ - protected char handleChar(char c) { + protected int handleCodePoint(int codePoint) { - if (c == '\n') { + if (codePoint == '\n') { this.line++; this.column = 1; } else { this.column++; } - return c; + return codePoint; } /** @@ -297,15 +272,15 @@ protected void setOffset(int newOffset) { assert (newOffset >= this.offset); assert (newOffset <= this.limit); while (this.offset < newOffset) { - handleChar(this.buffer[this.offset++]); + handleCodePoint(this.buffer.codePointAt(this.offset++)); } } @Override - public char peek() { + public int peek() { if (hasNext()) { - return this.buffer[this.offset]; + return this.buffer.codePointAt(this.offset); } return 0; } @@ -328,7 +303,7 @@ protected String eot(StringBuilder builder, boolean acceptEot) { } @Override - public String readUntil(char stop, boolean acceptEot) { + public String readUntil(int stop, boolean acceptEot) { if (!hasNext()) { return eot(null, acceptEot); @@ -337,9 +312,9 @@ public String readUntil(char stop, boolean acceptEot) { while (true) { int start = this.offset; while (this.offset < this.limit) { - char c = this.buffer[this.offset++]; - handleChar(c); - if (c == stop) { + int codePoint = this.buffer.codePointAt(this.offset++); + handleCodePoint(codePoint); + if (codePoint == stop) { return getAppended(builder, start, this.offset - 1); } } @@ -351,7 +326,7 @@ public String readUntil(char stop, boolean acceptEot) { } @Override - public String readUntil(final char stop, boolean acceptEot, CharScannerSyntax syntax) { + public String readUntil(int stop, boolean acceptEot, CharScannerSyntax syntax) { String result = readUntil(c -> (c == stop), acceptEot, syntax); expectOne(stop); @@ -368,12 +343,12 @@ public String readUntil(CharFilter filter, boolean acceptEot, CharScannerSyntax while (true) { // int end = this.limit; while (this.offset < this.limit) { - char c = this.buffer[this.offset]; - state.parse(c); + int codePoint = this.buffer.codePointAt(this.offset); + state.parse(codePoint); if (state.done) 
{ return state.builder.toString(); } - handleChar(c); + handleCodePoint(codePoint); this.offset++; } boolean eot = isEot(); @@ -405,7 +380,7 @@ public String readUntil(CharFilter filter, boolean acceptEot, CharScannerSyntax } @Override - public String readUntil(char stop, boolean acceptEot, char escape) { + public String readUntil(int stop, boolean acceptEot, int escape) { if (!hasNext()) { eot(null, acceptEot); @@ -414,9 +389,9 @@ public String readUntil(char stop, boolean acceptEot, char escape) { while (true) { int start = this.offset; while (this.offset < this.limit) { - char c = this.buffer[this.offset++]; - handleChar(c); - if (c == escape) { + int codePoint = this.buffer.codePointAt(this.offset++); + handleCodePoint(codePoint); + if (codePoint == escape) { builder = append(builder, start, this.offset - 1); // lookahead if (this.offset >= this.limit) { @@ -424,18 +399,18 @@ public String readUntil(char stop, boolean acceptEot, char escape) { return eot(builder, acceptEot); } } - c = this.buffer[this.offset]; - if ((escape == stop) && (c != stop)) { + codePoint = this.buffer.codePointAt(this.offset); + if ((escape == stop) && (codePoint != stop)) { return eot(builder, true); } else { // escape character builder = builder(builder); - builder.append(c); - handleChar(c); + builder.appendCodePoint(codePoint); + handleCodePoint(codePoint); this.offset++; start = this.offset; } - } else if (c == stop) { + } else if (codePoint == stop) { return getAppended(builder, start, this.offset - 1); } } @@ -456,11 +431,11 @@ public String readUntil(CharFilter filter, boolean acceptEot) { while (true) { int start = this.offset; while (this.offset < this.limit) { - char c = this.buffer[this.offset]; - if (filter.accept(c)) { + int codePoint = this.buffer.codePointAt(this.offset); + if (filter.accept(codePoint)) { return getAppended(builder, start, this.offset - 1); } - handleChar(c); + handleCodePoint(codePoint); this.offset++; } builder = append(builder, start, this.limit); @@ 
-485,13 +460,13 @@ public String readUntil(CharFilter stopFilter, boolean acceptEot, String stop, b if (trim) { skipWhile(' '); } - char[] stopChars; + String stopChars; if (ignoreCase) { - stopChars = CaseHelper.toLowerCase(stop).toCharArray(); + stopChars = CaseHelper.toLowerCase(stop); } else { - stopChars = stop.toCharArray(); + stopChars = stop; } - char first = stopChars[0]; + int first = stopChars.codePointAt(0); Appender appender = newAppender(trim); while (true) { appender.start = this.offset; @@ -503,18 +478,18 @@ public String readUntil(CharFilter stopFilter, boolean acceptEot, String stop, b max -= stopLength; } while (this.offset < max) { - char c = this.buffer[this.offset]; - if (stopFilter.accept(c)) { + int codePoint = this.buffer.codePointAt(this.offset); + if (stopFilter.accept(codePoint)) { return appender.getAppended(); } - if (c == first || (ignoreCase && (Character.toLowerCase(c) == first))) { + if (codePoint == first || (ignoreCase && (Character.toLowerCase(codePoint) == first))) { // found first character boolean found = expectRestWithLookahead(stopChars, ignoreCase, appender, false); if (found) { return appender.getAppended(this.offset); } } - if (trim && (c != ' ')) { + if (trim && (codePoint != ' ')) { appender.foundNonSpace(); } this.offset++; @@ -538,8 +513,7 @@ protected void verifyLookahead(int length) { } /** - * @param stopChars the stop {@link String} as {@link String#toCharArray() char[]}. If {@code ignoreCase} is - * {@code true} in lower case. + * @param stopChars the stop {@link String}. If {@code ignoreCase} is {@code true} in lower case. * @param ignoreCase - {@code true} to (also) compare chars in {@link Character#toLowerCase(char) lower case}, * {@code false} otherwise. * @param appender an optional lambda to {@link Runnable#run() run} before shifting buffers to append data. 
@@ -550,7 +524,7 @@ protected void verifyLookahead(int length) { * @see #readUntil(CharFilter, boolean, String, boolean) * @see #skipOver(String, boolean, CharFilter) */ - protected abstract boolean expectRestWithLookahead(char[] stopChars, boolean ignoreCase, Runnable appender, + protected abstract boolean expectRestWithLookahead(String stopChars, boolean ignoreCase, Runnable appender, boolean skip); @Override @@ -558,7 +532,7 @@ public void require(String expected, boolean ignoreCase) { int off = this.offset; int lim = this.limit; - char[] buf = this.buffer; + String buf = this.buffer; if (!expectUnsafe(expected, ignoreCase)) { int length = expected.length(); StringBuilder error = new StringBuilder(24 + 2 * length); @@ -569,24 +543,24 @@ public void require(String expected, boolean ignoreCase) { if (len > length) { len = length; } - error.append(new String(buf, off, len)); + error.append(buf.substring(off, lim)); // rest after shifting buffers? len = length - len; if ((len > 0) && (buf != this.buffer)) { if (len > this.offset) { len = this.offset; } - error.append(new String(this.buffer, 0, len)); + error.append(this.buffer.substring(0, len)); } throw new IllegalStateException(error.toString()); } } @Override - public boolean expectOne(char expected, boolean warning) { + public boolean expectOne(int expected, boolean warning) { - if (hasNext() && (this.buffer[this.offset] == expected)) { - handleChar(expected); + if (hasNext() && (this.buffer.codePointAt(this.offset) == expected)) { + handleCodePoint(expected); this.offset++; return true; } @@ -602,7 +576,7 @@ public boolean expectOne(CharFilter expected) { if (!hasNext()) { return false; } - if (expected.accept(this.buffer[this.offset])) { + if (expected.accept(this.buffer.codePointAt(this.offset))) { this.offset++; return true; } @@ -617,17 +591,17 @@ public boolean expectUnsafe(String expected, boolean ignoreCase) { if (!hasNext()) { return false; } - char c = this.buffer[this.offset]; - char exp = 
expected.charAt(i); - if (c != exp) { + int codePoint = this.buffer.codePointAt(this.offset); + int exp = expected.codePointAt(i); + if (codePoint != exp) { if (!ignoreCase) { return false; } - if (Character.toLowerCase(c) != Character.toLowerCase(exp)) { + if (Character.toLowerCase(codePoint) != Character.toLowerCase(exp)) { return false; } } - handleChar(c); + handleCodePoint(codePoint); this.offset++; } return true; @@ -647,34 +621,34 @@ public String readLine(boolean trim) { appender.start = this.offset; appender.trimEnd = this.offset; while (this.offset < this.limit) { - char c = this.buffer[this.offset]; - handleChar(c); - if (c == '\r') { + int codePoint = this.buffer.codePointAt(this.offset); + handleCodePoint(codePoint); + if (codePoint == '\r') { int end = this.offset; this.offset++; if (this.offset < this.limit) { - c = this.buffer[this.offset]; - if (c == '\n') { - handleChar(c); + codePoint = this.buffer.codePointAt(this.offset); + if (codePoint == '\n') { + handleCodePoint(codePoint); this.offset++; } return appender.getAppended(end); } else { // EOL insanity... 
appender.append(end); if (fill()) { - c = this.buffer[this.offset]; - if (c == '\n') { - handleChar(c); + codePoint = this.buffer.codePointAt(this.offset); + if (codePoint == '\n') { + handleCodePoint(codePoint); this.offset++; } } return appender.toString(); } - } else if (c == '\n') { + } else if (codePoint == '\n') { String result = appender.getAppended(); this.offset++; return result; - } else if (c != ' ') { + } else if (codePoint != ' ') { appender.foundNonSpace(); } this.offset++; @@ -692,21 +666,21 @@ public String readJavaStringLiteral(TextFormatMessageType severity) { if (!hasNext()) { return null; } - char c = this.buffer[this.offset]; - if (c != '"') { + int codePoint = this.buffer.codePointAt(this.offset); + if (codePoint != '"') { return null; } - handleChar(c); + handleCodePoint(codePoint); this.offset++; StringBuilder builder = null; while (hasNext()) { int start = this.offset; while (this.offset < this.limit) { - c = this.buffer[this.offset++]; - handleChar(c); - if (c == '"') { + codePoint = this.buffer.codePointAt(this.offset++); + handleCodePoint(codePoint); + if (codePoint == '"') { return getAppended(builder, start, this.offset - 1); - } else if (c == '\\') { + } else if (codePoint == '\\') { builder = append(builder, start, this.offset - 1); builder = builder(builder); parseEscapeSequence(builder, severity); @@ -729,26 +703,26 @@ public Character readJavaCharLiteral(TextFormatMessageType severity) { if (expectOne('\'')) { StringBuilder error = null; - char c = next(); - char next = 0; - if (c == '\\') { - c = next(); - if (c == 'u') { - c = parseUnicodeEscapeSequence(severity); + int cp = next(); + int next = 0; + if (cp == '\\') { + cp = next(); + if (cp == 'u') { + cp = parseUnicodeEscapeSequence(severity); if (expectOne('\'')) { - return Character.valueOf(c); + return Character.valueOf((char) cp); } - error = createUnicodeLiteralError(c); + error = createUnicodeLiteralError(cp); } else { next = next(); if (next == '\'') { - Character 
character = CharEscapeHelper.resolveEscape(c); + Character character = CharEscapeHelper.resolveEscape(cp); if (character != null) { return character; } - } else if (CharFilter.OCTAL_DIGIT.accept(c) && CharFilter.OCTAL_DIGIT.accept(next)) { - int value = ((c - '0') * 8) + (next - '0'); - char last = next(); + } else if (CharFilter.OCTAL_DIGIT.accept(cp) && CharFilter.OCTAL_DIGIT.accept(next)) { + int value = ((cp - '0') * 8) + (next - '0'); + int last = next(); if (CharFilter.OCTAL_DIGIT.accept(last) && (value <= 37)) { value = (value * 8) + (last - '0'); last = next(); @@ -758,20 +732,20 @@ public Character readJavaCharLiteral(TextFormatMessageType severity) { } error = new StringBuilder("'\\"); error.append(Integer.toString(value, 8)); - error.append(last); + error.appendCodePoint(last); } if (error == null) { error = new StringBuilder("'\\"); - error.append(c); - error.append(next); + error.appendCodePoint(cp); + error.appendCodePoint(next); } } } else if (expectOne('\'')) { - return Character.valueOf(c); + return Character.valueOf((char) cp); } else { error = new StringBuilder("'"); - if (c != 0) { - error.append(c); + if (cp != 0) { + error.appendCodePoint(cp); } } if (next != '\'') { @@ -800,12 +774,12 @@ public Number readJavaNumberLiteral() { return null; } Number number = null; - char c = peek(); - if ((c == 'l') || (c == 'L')) { + int codePoint = peek(); + if ((codePoint == 'l') || (codePoint == 'L')) { number = parseLong(decimal); - } else if ((c == 'f') || (c == 'F')) { + } else if ((codePoint == 'f') || (codePoint == 'F')) { number = Float.valueOf(decimal); - } else if ((c == 'd') || (c == 'D')) { + } else if ((codePoint == 'd') || (codePoint == 'D')) { number = Double.valueOf(decimal); } if (number == null) { @@ -825,23 +799,22 @@ private Long parseLong(String number) { int radix = 10; int len = number.length(); int i = 0; - char sign = 0; - char c = number.charAt(i++); - if (isNumberSign(c)) { - sign = c; - c = number.charAt(i++); + int cp = 
number.codePointAt(i++); + char sign = numberSign(cp); + if (sign != 0) { + cp = number.codePointAt(i++); } - if (c == '0') { + if (cp == '0') { if (i < len) { - c = number.charAt(i); - if (isRadix16(c)) { + cp = number.codePointAt(i); + if (isRadix16(cp)) { radix = 16; i++; - } else if (isRadix2(c)) { + } else if (isRadix2(cp)) { radix = 2; i++; } else { - assert (c >= '0') && (c <= '7'); + assert (cp >= '0') && (cp <= '7'); radix = 8; } number = number.substring(i); @@ -858,23 +831,22 @@ private Integer parseInteger(String number) { int radix = 10; int len = number.length(); int i = 0; - char sign = 0; - char c = number.charAt(i++); - if (isNumberSign(c)) { - sign = c; - c = number.charAt(i++); + int cp = number.codePointAt(i++); + char sign = numberSign(cp); + if (sign != 0) { + cp = number.codePointAt(i++); } - if (c == '0') { + if (cp == '0') { if (i < len) { - c = number.charAt(i); - if (isRadix16(c)) { + cp = number.codePointAt(i); + if (isRadix16(cp)) { radix = 16; i++; - } else if (isRadix2(c)) { + } else if (isRadix2(cp)) { radix = 2; i++; } else { - assert (c >= '0') && (c <= '7'); + assert (cp >= '0') && (cp <= '7'); radix = 8; } number = number.substring(i); @@ -886,12 +858,12 @@ private Integer parseInteger(String number) { return Integer.valueOf(Integer.parseInt(number, radix)); } - private StringBuilder createUnicodeLiteralError(char c) { + private StringBuilder createUnicodeLiteralError(int codePoint) { StringBuilder error; error = new StringBuilder("'\\"); error.append('u'); - String hex = Integer.toString(c, 16); + String hex = Integer.toString(codePoint, 16); int length = hex.length(); if (length == 1) { hex = "000" + hex; @@ -906,37 +878,39 @@ private StringBuilder createUnicodeLiteralError(char c) { private void parseEscapeSequence(StringBuilder builder, TextFormatMessageType severity) { - char c = next(); - if (c == 'u') { // unicode - char value = parseUnicodeEscapeSequence(severity); - builder.append(value); - } else if 
(CharFilter.OCTAL_DIGIT.accept(c)) { // octal C legacy stuff - int value = c - '0'; - c = peek(); - if (CharFilter.OCTAL_DIGIT.accept(c)) { + int cp = next(); + if (cp == 'u') { // unicode + int value = parseUnicodeEscapeSequence(severity); + builder.appendCodePoint(value); + } else if (CharFilter.OCTAL_DIGIT.accept(cp)) { // octal C legacy stuff + int value = cp - '0'; + cp = peek(); + if (CharFilter.OCTAL_DIGIT.accept(cp)) { next(); - value = (8 * value) + (c - '0'); + value = (8 * value) + (cp - '0'); if (value <= 31) { - c = peek(); - if (CharFilter.OCTAL_DIGIT.accept(c)) { + cp = peek(); + if (CharFilter.OCTAL_DIGIT.accept(cp)) { next(); - value = (8 * value) + (c - '0'); + value = (8 * value) + (cp - '0'); } } } - builder.append((char) value); + builder.appendCodePoint(value); } else { - Character resolved = CharEscapeHelper.resolveEscape(c); + Character resolved = CharEscapeHelper.resolveEscape(cp); if (resolved == null) { - addMessage(severity, "Illegal escape sequence \\" + c); - builder.append(c); + StringBuilder message = new StringBuilder("Illegal escape sequence \\"); + message.appendCodePoint(cp); + addMessage(severity, message.toString()); + builder.appendCodePoint(cp); } else { builder.append(resolved.charValue()); } } } - private char parseUnicodeEscapeSequence(TextFormatMessageType severity) { + private int parseUnicodeEscapeSequence(TextFormatMessageType severity) { skipWhile('u'); int i = 0; @@ -961,7 +935,7 @@ private char parseUnicodeEscapeSequence(TextFormatMessageType severity) { value = (value * radix) + digit; i++; } - return (char) value; + return value; } @SuppressWarnings("null") @@ -977,14 +951,14 @@ public String read(int count) { int len = this.limit - this.offset; if (len >= remain) { if (builder == null) { - String string = new String(this.buffer, this.offset, remain); + String string = this.buffer.substring(this.offset, this.offset + remain); setOffset(this.offset + remain); return string; } len = remain; } builder = 
builder(builder); - builder.append(this.buffer, this.offset, len); + builder.append(this.buffer, this.offset, this.offset + len); setOffset(this.offset + len); remain -= len; if ((remain > 0) && !fill()) { @@ -1019,43 +993,52 @@ public int readDigit(int radix) { int result = -1; if (hasNext()) { - char c = this.buffer[this.offset]; - int value = Character.digit(c, radix); + int codePoint = this.buffer.codePointAt(this.offset); + int value = Character.digit(codePoint, radix); if ((value >= 0) && (value < radix)) { result = value; - handleChar(c); + handleCodePoint(codePoint); this.offset++; } } return result; } - private boolean isNumberExponent(char c, int radix) { + private char numberExponent(int cp, int radix) { if (radix == 16) { - return (c == 'p') || (c == 'P'); + if (cp == 'p') { + return 'p'; + } else if (cp == 'P') { + return 'P'; + } + } else if (cp == 'e') { + return 'e'; + } else if (cp == 'E') { + return 'E'; } - return (c == 'e') || (c == 'E'); + return 0; } @Override public void readNumber(CharScannerNumberParser numberParser) { int skipCount = 1; - char c = peek(); - if (isNumberSign(c) && numberParser.sign(c)) { - c = peek(skipCount); + int cp = peek(); + char sign = numberSign(cp); + if ((sign != 0) && numberParser.sign(sign)) { + cp = peek(skipCount); skipCount++; } int radix = 10; - if (c == '0') { // radix? + if (cp == '0') { // radix? 
if (skipCount == 2) { next(); // consume sign as we have found a reasonable number skipCount--; } assert (skipCount == 1); - char radixChar = peek(skipCount); // peek character after '0' - char rc = radixChar; + int radixChar = peek(skipCount); // peek character after '0' + int rc = radixChar; int r = 10; if (isRadix16(radixChar)) { r = 16; @@ -1067,14 +1050,14 @@ public void readNumber(CharScannerNumberParser numberParser) { } else { r = 0; } - radix = numberParser.radix(r, rc); + radix = numberParser.radix(r, (char) rc); if (radix > 0) { if (r == 8) { next(); - c = radixChar; + cp = radixChar; } else { skip(2); - c = peek(); + cp = peek(); } } if (radix < 10) { @@ -1088,39 +1071,39 @@ public void readNumber(CharScannerNumberParser numberParser) { while (todo) { boolean next = false; boolean peek = true; - int digit = Character.digit(c, radix); + int digit = Character.digit(cp, radix); if (digit >= 0) { - next = numberParser.digit(digit, c); - } else if (c == '.') { + next = numberParser.digit(digit, (char) cp); + } else if (cp == '.') { next = numberParser.dot(); if (!next) { todo = false; } - } else if (isNumberExponent(c, radix)) { - char e = c; - c = peek(skipCount); - char eSign = c; - if (isNumberSign(eSign)) { - skipCount++; - } else { - eSign = 0; - } - next = numberParser.exponent(e, eSign); - if (next && (eSign == 0)) { - peek = false; - } } else { - String special = numberParser.special(c); - if (special != null) { - if (expect(special, false, false, skipCount - 1)) { - skipCount = 0; - numberParser.special(special); - next = false; // we accept but have already consumed, no next + char e = numberExponent(cp, radix); + if (e != 0) { + cp = peek(skipCount); + char eSign = numberSign(cp); + if (eSign != 0) { + skipCount++; + } + next = numberParser.exponent(e, eSign); + if (next && (eSign == 0)) { + peek = false; + } + } else { + String special = numberParser.special(cp); + if (special != null) { + if (expect(special, false, false, skipCount - 1)) { + 
skipCount = 0; + numberParser.special(special); + next = false; // we accept but have already consumed, no next + } else { + todo = false; + } } else { todo = false; } - } else { - todo = false; } } if (next) { @@ -1132,31 +1115,36 @@ public void readNumber(CharScannerNumberParser numberParser) { } } if (peek && todo) { - c = peek(); - todo = (c != 0); + cp = peek(); + todo = (cp != 0); } } } - private boolean isRadix2(char c) { + private boolean isRadix2(int cp) { - return (c == 'b') || (c == 'B'); + return (cp == 'b') || (cp == 'B'); } - private boolean isRadix16(char c) { + private boolean isRadix16(int cp) { - return (c == 'x') || (c == 'X'); + return (cp == 'x') || (cp == 'X'); } - private boolean isDigit(char c) { + private boolean isDigit(int cp) { - return (c >= '0') && (c <= '9'); + return (cp >= '0') && (cp <= '9'); } - private boolean isNumberSign(char c) { + private char numberSign(int cp) { - return (c == '+') || (c == '-'); + if (cp == '+') { + return '+'; + } else if (cp == '-') { + return '-'; + } + return 0; } @Override @@ -1208,10 +1196,10 @@ public long readUnsignedLong(int maxDigits) throws NumberFormatException { if (end > this.limit) { end = this.limit; } - char c = 0; + int codePoint = 0; while (this.offset < end) { - c = this.buffer[this.offset]; - if (!isDigit(c)) { + codePoint = this.buffer.codePointAt(this.offset); + if (!isDigit(codePoint)) { break; } this.offset++; @@ -1220,7 +1208,7 @@ public long readUnsignedLong(int maxDigits) throws NumberFormatException { remain -= len; if ((this.offset < end) || (remain == 0) || (start == this.limit)) { if ((len == 0) && (builder == null)) { - throw new IllegalStateException("Invalid character for long number: " + c); + throw new IllegalStateException("Invalid character for long number: " + codePoint); } String number = getAppended(builder, start, this.offset); return Long.parseLong(number); @@ -1261,10 +1249,11 @@ public int skipNewLine() { int skip = 0; if (hasNext()) { - if 
(this.buffer[this.offset] == '\n') { + int codePointAt = this.buffer.codePointAt(this.offset); + if (codePointAt == '\n') { skip = 1; - } else if (this.buffer[this.offset] == '\r') { - if (((this.offset + 1) < this.limit) && (this.buffer[this.offset + 1] == '\n')) { + } else if (codePointAt == '\r') { + if (((this.offset + 1) < this.limit) && (this.buffer.codePointAt(this.offset + 1) == '\n')) { skip = 2; } else if (peek(1) == '\n') { skip(2); @@ -1281,11 +1270,11 @@ public int skipNewLine() { } @Override - public boolean skipUntil(char stop) { + public boolean skipUntil(int stop) { while (hasNext()) { while (this.offset < this.limit) { - if (this.buffer[this.offset++] == stop) { + if (this.buffer.codePointAt(this.offset++) == stop) { return true; } } @@ -1294,16 +1283,16 @@ public boolean skipUntil(char stop) { } @Override - public boolean skipUntil(char stop, char escape) { + public boolean skipUntil(int stop, int escape) { boolean escapeActive = false; while (hasNext()) { while (this.offset < this.limit) { - char c = this.buffer[this.offset++]; - if (c == escape) { + int codePoint = this.buffer.codePointAt(this.offset++); + if (codePoint == escape) { escapeActive = !escapeActive; } else { - if ((c == stop) && (!escapeActive)) { + if ((codePoint == stop) && (!escapeActive)) { return true; } escapeActive = false; @@ -1314,16 +1303,16 @@ public boolean skipUntil(char stop, char escape) { } @Override - public int skipWhile(char c) { + public int skipWhile(int c) { int count = 0; while (hasNext()) { int start = this.offset; while (this.offset < this.limit) { - if (this.buffer[this.offset] != c) { + if (this.buffer.codePointAt(this.offset) != c) { return count + (this.offset - start); } - handleChar(c); + handleCodePoint(c); this.offset++; } count += (this.offset - start); @@ -1349,12 +1338,12 @@ public int skipWhile(CharFilter filter, int max) { } boolean notAccepted = false; while (this.offset < end) { - char c = this.buffer[this.offset]; - if (!filter.accept(c)) { 
+ int cp = this.buffer.codePointAt(this.offset); + if (!filter.accept(cp)) { notAccepted = true; break; } - handleChar(c); + handleCodePoint(cp); this.offset++; } int len = this.offset - start; @@ -1377,13 +1366,13 @@ public boolean skipOver(String substring, boolean ignoreCase, CharFilter stopFil if (!hasNext()) { return false; } - char[] subChars; + String subChars; if (ignoreCase) { - subChars = CaseHelper.toLowerCase(substring).toCharArray(); + subChars = CaseHelper.toLowerCase(substring); } else { - subChars = substring.toCharArray(); + subChars = substring; } - char first = subChars[0]; + int first = subChars.codePointAt(0); while (true) { int max = this.limit; if (isEos()) { @@ -1392,18 +1381,18 @@ public boolean skipOver(String substring, boolean ignoreCase, CharFilter stopFil max -= subLength; } while (this.offset <= max) { - char c = this.buffer[this.offset]; - if ((stopFilter != null) && stopFilter.accept(c)) { + int cp = this.buffer.codePointAt(this.offset); + if ((stopFilter != null) && stopFilter.accept(cp)) { return false; } - if (c == first || (ignoreCase && (Character.toLowerCase(c) == first))) { + if (cp == first || (ignoreCase && (Character.toLowerCase(cp) == first))) { // found first character boolean found = expectRestWithLookahead(subChars, ignoreCase, null, true); if (found) { return true; } } else { - handleChar(c); + handleCodePoint(cp); } this.offset++; } @@ -1440,11 +1429,11 @@ public String readWhile(CharFilter filter, int min, int max) { end = this.limit; } while (this.offset < end) { - char c = this.buffer[this.offset]; - if (!filter.accept(c)) { + int cp = this.buffer.codePointAt(this.offset); + if (!filter.accept(cp)) { return requireMin(getAppended(builder, start, this.offset), min, filter); } - handleChar(c); + handleCodePoint(cp); this.offset++; } int len = this.offset - start; @@ -1482,14 +1471,14 @@ private String requireMin(String result, int min, CharFilter filter) { @Override public String getBufferParsed() { - return new 
String(this.buffer, 0, this.offset); + return this.buffer.substring(0, this.offset); } @Override public String getBufferToParse() { if (this.offset < this.limit) { - return new String(this.buffer, this.offset, this.limit - this.offset); + return this.buffer.substring(this.offset, this.limit); } else { return ""; } @@ -1507,41 +1496,41 @@ private class CharScannerSyntaxState { private final CharFilter filter; - private final char quoteStart; + private final int quoteStart; - private final char quoteEnd; + private final int quoteEnd; - private final char escape; + private final int escape; - private final char quoteEscape; + private final int quoteEscape; private final boolean quoteEscapeLazy; - private final char altQuoteStart; + private final int altQuoteStart; - private final char altQuoteEnd; + private final int altQuoteEnd; - private final char altQuoteEscape; + private final int altQuoteEscape; private final boolean altQuoteEscapeLazy; - private final char entityStart; + private final int entityStart; - private final char entityEnd; + private final int entityEnd; private int start; private boolean escapeActive; - private char activeQuoteEnd; + private int activeQuoteEnd; - private char activeQuoteEscape; + private int activeQuoteEscape; - private char activeQuoteLazyEnd; + private int activeQuoteLazyEnd; private boolean activeQuoteLazy; - private char activeEntityEnd; + private int activeEntityEnd; private StringBuilder builder; @@ -1589,7 +1578,7 @@ public StringBuilder getEntityBuilder() { return this.entityBuilder; } - private void parse(char c) { + private void parse(int codePoint) { boolean append = false; if (this.escapeActive) { @@ -1598,27 +1587,27 @@ private void parse(char c) { this.escapeActive = false; } else if (this.activeQuoteEnd != 0) { // parse quote - if ((this.activeQuoteLazyEnd != 0) && (c == this.activeQuoteLazyEnd)) { + if ((this.activeQuoteLazyEnd != 0) && (codePoint == this.activeQuoteLazyEnd)) { this.activeQuoteEnd = 0; - 
this.builder.append(c); // quote (was escaped lazily) + this.builder.appendCodePoint(codePoint); // quote (was escaped lazily) this.start = AbstractCharStreamScanner.this.offset + 1; } else if (this.quoteEscapeActive) { this.quoteEscapeActive = false; - if (c == this.activeQuoteEnd) { - this.builder.append(c); // quote (was escaped) + if (codePoint == this.activeQuoteEnd) { + this.builder.appendCodePoint(codePoint); // quote (was escaped) this.start = AbstractCharStreamScanner.this.offset + 1; } else if (this.activeQuoteEscape == this.activeQuoteEnd) { // quotation done this.activeQuoteEnd = 0; this.start = AbstractCharStreamScanner.this.offset; } - } else if ((c == this.activeQuoteEscape) + } else if ((codePoint == this.activeQuoteEscape) // && (!this.activeQuoteLazy || (this.activeQuoteEscape != this.activeQuoteEnd)) ) { // escape in quote append = true; this.quoteEscapeActive = true; - } else if (c == this.activeQuoteEnd) { + } else if (codePoint == this.activeQuoteEnd) { // quotation done this.activeQuoteEnd = 0; append = true; @@ -1626,13 +1615,13 @@ private void parse(char c) { this.activeQuoteLazyEnd = 0; } else if (this.activeEntityEnd != 0) { // parse entity - if (c == this.activeEntityEnd) { + if (codePoint == this.activeEntityEnd) { // entity end detected... 
this.activeEntityEnd = 0; int len = AbstractCharStreamScanner.this.offset - this.start; String entity; if (this.entityBuilder == null) { - entity = new String(AbstractCharStreamScanner.this.buffer, this.start, len); + entity = AbstractCharStreamScanner.this.buffer.substring(this.start, AbstractCharStreamScanner.this.offset); } else { this.entityBuilder.append(AbstractCharStreamScanner.this.buffer, this.start, len); entity = this.entityBuilder.toString(); @@ -1641,21 +1630,21 @@ private void parse(char c) { this.builder.append(this.syntax.resolveEntity(entity)); this.start = AbstractCharStreamScanner.this.offset + 1; } - } else if (this.filter.accept(c)) { + } else if (this.filter.accept(codePoint)) { append = true; this.done = true; - } else if (c == this.escape) { + } else if (codePoint == this.escape) { append = true; this.escapeActive = true; - } else if (c == this.entityStart) { + } else if (codePoint == this.entityStart) { this.activeEntityEnd = this.entityEnd; append = true; } else { - if (c == this.quoteStart) { + if (codePoint == this.quoteStart) { this.activeQuoteEnd = this.quoteEnd; this.activeQuoteEscape = this.quoteEscape; this.activeQuoteLazy = this.quoteEscapeLazy; - } else if (c == this.altQuoteStart) { + } else if (codePoint == this.altQuoteStart) { this.activeQuoteEnd = this.altQuoteEnd; this.activeQuoteEscape = this.altQuoteEscape; this.activeQuoteLazy = this.altQuoteEscapeLazy; @@ -1664,15 +1653,14 @@ private void parse(char c) { this.quoteEscapeActive = false; append = true; if (this.activeQuoteLazy && (this.activeQuoteEnd == this.activeQuoteEscape) - && (c == this.activeQuoteEscape)) { + && (codePoint == this.activeQuoteEscape)) { this.activeQuoteLazyEnd = this.activeQuoteEnd; } } } if (append) { - int len = AbstractCharStreamScanner.this.offset - this.start; - if (len > 0) { - this.builder.append(AbstractCharStreamScanner.this.buffer, this.start, len); + if (AbstractCharStreamScanner.this.offset > this.start) { + 
this.builder.append(AbstractCharStreamScanner.this.buffer, this.start, AbstractCharStreamScanner.this.offset); } this.start = AbstractCharStreamScanner.this.offset + 1; } diff --git a/core/src/main/java/io/github/mmm/scanner/CharEscapeHelper.java b/core/src/main/java/io/github/mmm/scanner/CharEscapeHelper.java index aaa4c22..fb4d1e5 100644 --- a/core/src/main/java/io/github/mmm/scanner/CharEscapeHelper.java +++ b/core/src/main/java/io/github/mmm/scanner/CharEscapeHelper.java @@ -43,9 +43,8 @@ public class CharEscapeHelper { * @param c the character that was escaped (e.g. 't' for tab, 'n' for line feed, 'r' for carriage return, '0' for NUL, * etc.) * @return the resolved (unescaped) character according to JLS 3.10.6 or {@code null} for invalid escape character. - * @see #resolveEscape(String) */ - public static Character resolveEscape(char c) { + public static Character resolveEscape(int c) { switch (c) { case '0': @@ -84,43 +83,4 @@ public static Character resolveEscape(char c) { return null; } - /** - * @param sequence the sequence of characters that has been escaped (e.g. "u000A" for line feed, or "u00df" for - * szlig/ß, etc.) - * @return the resolved (unescaped) character according to JLS 3.10.6 or {@code null} for invalid escape sequence. 
- * @see #resolveEscape(char) - */ - public static Character resolveEscape(String sequence) { - - if (sequence == null) { - return null; - } - int length = sequence.length(); - if (length == 0) { - return null; - } else if (length == 1) { - return resolveEscape(sequence.charAt(0)); - } else if (length < 4) { // octal C compatibility legacy stuff - for (int i = 0; i < length; i++) { - char c = sequence.charAt(i); - if ((c < '0') || (c > '7')) { - return null; - } - } - int value = Integer.parseInt(sequence, 8); - if (value <= 0377) { - return Character.valueOf((char) value); - } - } else if (length >= 5) { - int start = length - 4; - for (int i = start - 1; i >= 0; i--) { - if (sequence.charAt(i) != 'u') { - return null; - } - } - int value = Integer.parseInt(sequence.substring(start), 16); - return Character.valueOf((char) value); - } - return null; - } } diff --git a/core/src/main/java/io/github/mmm/scanner/CharReaderScanner.java b/core/src/main/java/io/github/mmm/scanner/CharReaderScanner.java index b1faf5a..1363e7e 100644 --- a/core/src/main/java/io/github/mmm/scanner/CharReaderScanner.java +++ b/core/src/main/java/io/github/mmm/scanner/CharReaderScanner.java @@ -9,13 +9,19 @@ import io.github.mmm.base.text.TextFormatMessageHandler; /** - * Implementation of {@link CharStreamScanner} that adapts a {@link Reader} to read and parse textual data. + * Implementation of {@link CharStreamScanner} that adapts a {@link Reader} to read and parse textual data. Unlike + * {@link CharStreamScanner} it allows to parse very long textual data without reading it entirely into the heap memory + * as a {@link String}. 
+ * + * @since 1.0.0 */ public class CharReaderScanner extends AbstractCharStreamScanner { private Reader reader; - private char[] lookaheadBuffer; + private final char[] charBuffer; + + private String lookaheadBuffer; private int lookaheadLimit; @@ -102,7 +108,8 @@ public CharReaderScanner(int capacity, Reader reader) { */ public CharReaderScanner(int capacity, TextFormatMessageHandler messageHandler, Reader reader) { - super(capacity, messageHandler); + super("", messageHandler); + this.charBuffer = new char[capacity]; this.reader = reader; } @@ -113,17 +120,17 @@ public int getPosition() { } @Override - public char peek(int lookaheadOffset) { + public int peek(int lookaheadOffset) { if (hasNext()) { int i = this.offset + lookaheadOffset; if (i < this.limit) { - return this.buffer[i]; + return this.buffer.codePointAt(i); } if (fillLookahead()) { i = i - this.limit; if (i < this.lookaheadLimit) { - return this.lookaheadBuffer[i]; + return this.lookaheadBuffer.codePointAt(i); } else { throwLookaheadError(lookaheadOffset); } @@ -140,7 +147,7 @@ public String peekString(int count) { } int rest = this.limit - this.offset; if (rest >= count) { - return new String(this.buffer, this.offset, count); + return this.buffer.substring(this.offset, this.offset + count); } else if (fillLookahead()) { int fullRest = rest + this.lookaheadLimit; if ((count > fullRest) && !isEos()) { @@ -151,7 +158,7 @@ public String peekString(int count) { sb.append(this.lookaheadBuffer, 0, count - rest); return sb.toString(); } else { - return new String(this.buffer, this.offset, rest); + return this.buffer.substring(this.offset, this.limit); } } @@ -161,38 +168,39 @@ public String peekWhile(CharFilter filter, int maxLen) { if (!hasNext()) { return ""; } - int rest = this.limit - this.offset; - if (rest > maxLen) { - rest = maxLen; + int end = this.offset + maxLen; + if (end > this.limit) { + end = this.limit; } - int len = 0; - while (len < rest) { - char c = this.buffer[this.offset + len]; - if 
(!filter.accept(c)) { - return new String(this.buffer, this.offset, len); + int i = this.offset; + while (i < end) { + int cp = this.buffer.codePointAt(i); + if (!filter.accept(cp)) { + return this.buffer.substring(this.offset, i); } - len++; + i++; } if (fillLookahead()) { + int rest = i - this.offset; int fullRest = rest + this.lookaheadLimit; if ((maxLen > fullRest) && !isEos()) { throwLookaheadError(maxLen); } - len = 0; - int end = maxLen - rest; - while (len < end) { - char c = this.lookaheadBuffer[len]; - if (!filter.accept(c)) { + i = 0; + end = maxLen - rest; + while (i < end) { + int cp = this.lookaheadBuffer.codePointAt(i); + if (!filter.accept(cp)) { break; } - len++; + i++; } - StringBuilder sb = new StringBuilder(rest + len); - sb.append(this.buffer, this.offset, rest); - sb.append(this.lookaheadBuffer, 0, len); + StringBuilder sb = new StringBuilder(rest + i); + sb.append(this.buffer, this.offset, this.limit); + sb.append(this.lookaheadBuffer, 0, i); return sb.toString(); } else { - return new String(this.buffer, this.offset, rest); + return this.buffer.substring(this.offset, end); } } @@ -200,14 +208,14 @@ public String peekWhile(CharFilter filter, int maxLen) { public String getBufferToParse() { if (this.offset < this.limit) { - int count = this.limit - this.offset; if (this.lookaheadLimit > 0) { + int count = this.limit - this.offset; StringBuilder sb = new StringBuilder(this.lookaheadLimit + count); sb.append(this.buffer, this.offset, count); sb.append(this.lookaheadBuffer, 0, this.lookaheadLimit); return sb.toString(); } else { - return new String(this.buffer, this.offset, count); + return this.buffer.substring(this.offset, this.limit); } } else { return ""; @@ -244,13 +252,16 @@ protected boolean fill() { try { this.limit = 0; while (this.limit == 0) { - this.limit = this.reader.read(this.buffer); + this.limit = this.reader.read(this.charBuffer); } if (this.limit == -1) { close(); + this.buffer = ""; this.limit = 0; return false; } + 
this.buffer = new String(this.charBuffer, 0, this.limit); + this.limit = this.buffer.length(); return true; } catch (IOException e) { throw new IllegalStateException("Read error.", e); @@ -265,19 +276,19 @@ private boolean fillLookahead() { if (this.reader == null) { return false; } - if (this.lookaheadBuffer == null) { - this.lookaheadBuffer = new char[this.buffer.length]; - } try { this.lookaheadLimit = 0; while (this.lookaheadLimit == 0) { - this.lookaheadLimit = this.reader.read(this.lookaheadBuffer); + this.lookaheadLimit = this.reader.read(this.charBuffer); } if (this.lookaheadLimit == -1) { close(); + this.lookaheadBuffer = ""; this.lookaheadLimit = 0; return false; } + this.lookaheadBuffer = new String(this.charBuffer, 0, this.lookaheadLimit); + this.lookaheadLimit = this.lookaheadBuffer.length(); return true; } catch (IOException e) { throw new IllegalStateException("Read error.", e); @@ -288,7 +299,7 @@ private void shiftLookahead() { this.position += this.limit; setOffset(this.limit); - char[] tmp = this.lookaheadBuffer; + String tmp = this.lookaheadBuffer; this.lookaheadBuffer = this.buffer; this.buffer = tmp; this.offset = 0; @@ -363,16 +374,16 @@ public boolean expect(String expected, boolean ignoreCase, boolean lookahead, in verifyLookahead(expectedLength); } int myLimit = this.limit; - char[] myBuffer = this.buffer; + String myBuffer = this.buffer; int expectedIndex = 0; while (expectedIndex < expectedLength) { - char c = myBuffer[myOffset++]; - char exp = expected.charAt(expectedIndex++); - if (c != exp) { + int cp = myBuffer.codePointAt(myOffset++); + int expCp = expected.codePointAt(expectedIndex++); + if (cp != expCp) { if (!ignoreCase) { return false; } - if (Character.toLowerCase(c) != Character.toLowerCase(exp)) { + if (Character.toLowerCase(cp) != Character.toLowerCase(expCp)) { return false; } } @@ -400,7 +411,7 @@ public boolean expect(String expected, boolean ignoreCase, boolean lookahead, in @Override protected void verifyLookahead(int 
length) { - if (length > this.buffer.length) { + if (length > this.charBuffer.length) { throwLookaheadError(length); } } @@ -408,16 +419,17 @@ protected void verifyLookahead(int length) { private void throwLookaheadError(int length) { throw new IllegalArgumentException( - "Lookahead size of " + length + " characters exceeds the configured buffer size of " + this.buffer.length); + "Lookahead size of " + length + " characters exceeds the configured buffer size of " + this.charBuffer.length); } @Override - protected boolean expectRestWithLookahead(char[] stopChars, boolean ignoreCase, Runnable appender, boolean skip) { + protected boolean expectRestWithLookahead(String stopChars, boolean ignoreCase, Runnable appender, boolean skip) { - int myCharsIndex = this.offset + 1; - int subCharsIndex = 1; - while (subCharsIndex < stopChars.length) { - if (myCharsIndex == this.limit) { // lookahead required? + int bufferIndex = this.offset + 1; + int stopIndex = 1; + int stopLength = stopChars.length(); + while (stopIndex < stopLength) { + if (bufferIndex == this.limit) { // lookahead required? 
if (!fillLookahead()) { if (skip) { setOffset(this.limit); @@ -425,10 +437,10 @@ protected boolean expectRestWithLookahead(char[] stopChars, boolean ignoreCase, return false; } int lookaheadIndex = 0; - while (subCharsIndex < stopChars.length) { - char c = this.lookaheadBuffer[lookaheadIndex++]; - char stopChar = stopChars[subCharsIndex++]; - if (c != stopChar && (!ignoreCase || (Character.toLowerCase(c) != stopChar))) { + while (stopIndex < stopLength) { + int cp = this.lookaheadBuffer.codePointAt(lookaheadIndex++); + int stopCp = stopChars.codePointAt(stopIndex++); + if (cp != stopCp && (!ignoreCase || (Character.toLowerCase(cp) != stopCp))) { return false; } } @@ -441,15 +453,15 @@ protected boolean expectRestWithLookahead(char[] stopChars, boolean ignoreCase, } return true; } else { - char c = this.buffer[myCharsIndex++]; - char stopChar = stopChars[subCharsIndex++]; - if (c != stopChar && (!ignoreCase || (Character.toLowerCase(c) != stopChar))) { + int cp = this.buffer.codePointAt(bufferIndex++); + int stopCp = stopChars.codePointAt(stopIndex++); + if (cp != stopCp && (!ignoreCase || (Character.toLowerCase(cp) != stopCp))) { return false; } } } if (skip) { - setOffset(myCharsIndex); + setOffset(bufferIndex); } return true; } diff --git a/core/src/main/java/io/github/mmm/scanner/CharScannerSyntax.java b/core/src/main/java/io/github/mmm/scanner/CharScannerSyntax.java index 75fc118..6a505a2 100644 --- a/core/src/main/java/io/github/mmm/scanner/CharScannerSyntax.java +++ b/core/src/main/java/io/github/mmm/scanner/CharScannerSyntax.java @@ -1,252 +1,252 @@ -/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0 - * http://www.apache.org/licenses/LICENSE-2.0 */ -package io.github.mmm.scanner; - -/** - * This is the interface used to define the syntax to scan characters. 
- * - * @see CharStreamScanner#readUntil(char, boolean, CharScannerSyntax) - */ -public interface CharScannerSyntax { - - /** - * This method gets the character used to start a quotation that should be terminated by a {@link #getQuoteEnd() - * quote-end} character. The text inside the quote is taken as is (without the quote characters).
- * Common examples for quote characters are the single quotes ({@code '}) and double quotes ({@code "}). - * - * @return the character used to start a quotation or {@code '\0'} to disable. - */ - char getQuoteStart(); - - /** - * This method gets the character used to end a quotation. - * - * @see #getQuoteStart() - * - * @return the character used to end a quotation or {@code '\0'} to disable. - */ - char getQuoteEnd(); - - /** - * This method gets the character used as escape. It is used to mark special characters like {@link #getQuoteStart()} - * to allow these characters also in the payload. The escape itself is removed on decoding while the next character is - * taken as is without any special interpretation.
- * The most common escape character is the backslash ({@code \}).
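- * Here are some examples for decoding:
- * <table border="1">
- * <tr>
- * <th>{@link #getEscape() escape}</th>
- * <th>input</th>
- * <th>output</th>
- * </tr>
- * <tr>
- * <td>\</td>
- * <td>a\b\\c</td>
- * <td>ab\c</td>
- * </tr>
- * <tr>
- * <td>~</td>
- * <td>a~b~~~c</td>
- * <td>ab~c</td>
- * </tr>
- * </table>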
- * - * This allows to encode special characters like a - * {@link CharStreamScanner#readUntil(char, boolean, CharScannerSyntax) stop-character}, {@link #getQuoteStart() - * quote-start}, {@link #getAltQuoteStart() alt-quote-start}, as well as the {@link #getEscape() escape} itself.
- * ATTENTION:
- * The {@link #getEscape() escape} is disabled within {@link #getQuoteStart() quotations}. - * - * @see #getEntityStart() - * - * @return the escape character or {@code '\0'} for no escaping. - */ - char getEscape(); - - /** - * This method gets the character used to escape the {@link #getQuoteEnd() quote-end} character within a quotation. - * This may be the {@link #getQuoteEnd() quote-end} itself so a duplicate {@link #getQuoteEnd() quote-end} represents - * a single occurrence of that character within a quotation. Otherwise the escape may be any other character.
- * Please note that this escaping is only active within a quotation opened by {@link #getQuoteStart() quote-start} and - * only escapes the {@link #getQuoteEnd() quote-end} character and nothing else so in any other case the - * {@link #getQuoteEscape() quote-escape} is treated as a regular character.
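- * <table border="1">
- * <tr>
- * <th>{@link #getQuoteStart() quote-start}</th>
- * <th>{@link #getQuoteEnd() quote-end}</th>
- * <th>{@link #getQuoteEscape() quote-escape}</th>
- * <th>input</th>
- * <th>output</th>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>a'bc'd</td>
- * <td>abcd</td>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>a'b''c'd</td>
- * <td>ab'cd</td>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>\</td>
- * <td>a'b\c\'d\\'e'f</td>
- * <td>ab\c'd\'ef</td>
- * </tr>
- * </table>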
- * - * @return the character used to escape the {@link #getQuoteEnd() quote-end} character or {@code '\0'} to disable. - */ - char getQuoteEscape(); - - /** - * If {@link #getQuoteStart() quote-start}, {@link #getQuoteEnd() quote-end} and {@link #getQuoteEscape() - * quote-escape} all point to the same character (which is NOT {@code '\0'}), then this method determines if - * {@link #getQuoteEscape() quotation escaping} is lazy. This means that outside a quotation a double - * occurrence of the quote character is NOT treated as quotation but as escaped quote character. Otherwise if NOT - * lazy, the double quote character is treated as quotation representing the empty sequence.
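- * Here are some examples:
- * <table border="1">
- * <tr>
- * <th>{@link #getQuoteStart() quote-start}</th>
- * <th>{@link #getQuoteEnd() quote-end}</th>
- * <th>{@link #getQuoteEscape() quote-escape}</th>
- * <th>{@link #isQuoteEscapeLazy() quote-escape-lazy}</th>
- * <th>input</th>
- * <th>output</th>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>true</td>
- * <td>''</td>
- * <td>'</td>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>false</td>
- * <td>''</td>
- * <td>&nbsp;</td>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>true</td>
- * <td>''''</td>
- * <td>''</td>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>false</td>
- * <td>''''</td>
- * <td>'</td>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>true</td>
- * <td>'''a'</td>
- * <td>'a</td>
- * </tr>
- * <tr>
- * <td>'</td>
- * <td>'</td>
- * <td>'</td>
- * <td>false</td>
- * <td>'''a'</td>
- * <td>'a</td>
- * </tr>
- * </table>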
- *
- * Please note that for {@code '''a'} the complete sequence is treated as quote if {@link #isQuoteEscapeLazy() - * quote-escape-lazy} is {@code false} and otherwise just the trailing {@code 'a'}. - * - * @return {@code true} if quote-escaping is lazy, {@code false} otherwise. - */ - boolean isQuoteEscapeLazy(); - - /** - * This method gets the alternative character used to start a quotation that should be terminated by a - * {@link #getAltQuoteEnd() alt-quote-end} character. The text inside the quote is taken as is (without the quote - * characters). - * - * @see #getQuoteStart() - * - * @return the alternative character used to start a quotation or {@code '\0'} to disable. - */ - char getAltQuoteStart(); - - /** - * This method gets the alternative character used to end a quotation. - * - * @see #getAltQuoteStart() - * - * @return the alternative character used to end a quotation. - */ - char getAltQuoteEnd(); - - /** - * This method gets the character used to escape the {@link #getAltQuoteEnd() alt-quote-end} character within an - * quotation opened by {@link #getAltQuoteStart() alt-quote-start}. - * - * @see #getQuoteEscape() - * - * @return the character used to escape the {@link #getQuoteEnd() quote-end} character or {@code '\0'} to disable. - */ - char getAltQuoteEscape(); - - /** - * If {@link #getAltQuoteStart() alt-quote-start}, {@link #getAltQuoteEnd() alt-quote-end} and - * {@link #getAltQuoteEscape() alt-quote-escape} all point to the same character (which is NOT {@code '\0'}), then - * this method determines if {@link #getAltQuoteEscape() alt-quotation escaping} is lazy. - * - * @see #isQuoteEscapeLazy() - * - * @return {@code true} if alt-quote-escaping is lazy, {@code false} otherwise. - */ - boolean isAltQuoteEscapeLazy(); - - /** - * This method gets the character used to start an entity. An entity is a specific encoded string surrounded with - * {@link #getEntityStart() entity-start} and {@link #getEntityEnd() entity-end}. 
It will be decoded by - * {@link #resolveEntity(String)}. - * - * @return the character used to start an entity or {@code '\0'} to disable. - */ - char getEntityStart(); - - /** - * This method gets the character used to end an entity. - * - * @see #getEntityStart() - * - * @return the character used to end an entity. - */ - char getEntityEnd(); - - /** - * This method resolves the given {@code entity}.
- * E.g. if {@link #getEntityStart() entity-start} is {@code '&'} and {@link #getEntityEnd()} is {@code ';'} then if - * the string {@code "&lt;"} is scanned, this method is called with {@code "lt"} as {@code entity} argument and may - * return {@code "<"}. - * - * @param entity is the entity string that was found surrounded by {@link #getEntityStart() entity-start} and - * {@link #getEntityEnd() entity-end} excluding these characters. - * @return the decoded entity. - */ - String resolveEntity(String entity); - -} +/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0 + * http://www.apache.org/licenses/LICENSE-2.0 */ +package io.github.mmm.scanner; + +/** + * This is the interface used to define the syntax to scan characters. + * + * @see CharStreamScanner#readUntil(int, boolean, CharScannerSyntax) + */ +public interface CharScannerSyntax { + + /** + * This method gets the character used to start a quotation that should be terminated by a {@link #getQuoteEnd() + * quote-end} character. The text inside the quote is taken as is (without the quote characters).
+ * Common examples for quote characters are the single quotes ({@code '}) and double quotes ({@code "}). + * + * @return the character used to start a quotation or {@code '\0'} to disable. + */ + int getQuoteStart(); + + /** + * This method gets the character used to end a quotation. + * + * @see #getQuoteStart() + * + * @return the character used to end a quotation or {@code '\0'} to disable. + */ + int getQuoteEnd(); + + /** + * This method gets the character used as escape. It is used to mark special characters like {@link #getQuoteStart()} + * to allow these characters also in the payload. The escape itself is removed on decoding while the next character is + * taken as is without any special interpretation.
+ * The most common escape character is the backslash ({@code \}).
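+ * Here are some examples for decoding:
+ * <table border="1">
+ * <tr>
+ * <th>{@link #getEscape() escape}</th>
+ * <th>input</th>
+ * <th>output</th>
+ * </tr>
+ * <tr>
+ * <td>\</td>
+ * <td>a\b\\c</td>
+ * <td>ab\c</td>
+ * </tr>
+ * <tr>
+ * <td>~</td>
+ * <td>a~b~~~c</td>
+ * <td>ab~c</td>
+ * </tr>
+ * </table>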
+ * + * This allows to encode special characters like a {@link CharStreamScanner#readUntil(int, boolean, CharScannerSyntax) + * stop-character}, {@link #getQuoteStart() quote-start}, {@link #getAltQuoteStart() alt-quote-start}, as well as the + * {@link #getEscape() escape} itself.
+ * ATTENTION:
+ * The {@link #getEscape() escape} is disabled within {@link #getQuoteStart() quotations}. + * + * @see #getEntityStart() + * + * @return the escape character or {@code '\0'} for no escaping. + */ + int getEscape(); + + /** + * This method gets the character used to escape the {@link #getQuoteEnd() quote-end} character within a quotation. + * This may be the {@link #getQuoteEnd() quote-end} itself so a duplicate {@link #getQuoteEnd() quote-end} represents + * a single occurrence of that character within a quotation. Otherwise the escape may be any other character.
+ * Please note that this escaping is only active within a quotation opened by {@link #getQuoteStart() quote-start} and + * only escapes the {@link #getQuoteEnd() quote-end} character and nothing else so in any other case the + * {@link #getQuoteEscape() quote-escape} is treated as a regular character.
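+ * <table border="1">
+ * <tr>
+ * <th>{@link #getQuoteStart() quote-start}</th>
+ * <th>{@link #getQuoteEnd() quote-end}</th>
+ * <th>{@link #getQuoteEscape() quote-escape}</th>
+ * <th>input</th>
+ * <th>output</th>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>a'bc'd</td>
+ * <td>abcd</td>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>a'b''c'd</td>
+ * <td>ab'cd</td>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>\</td>
+ * <td>a'b\c\'d\\'e'f</td>
+ * <td>ab\c'd\'ef</td>
+ * </tr>
+ * </table>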
+ * + * @return the character used to escape the {@link #getQuoteEnd() quote-end} character or {@code '\0'} to disable. + */ + int getQuoteEscape(); + + /** + * If {@link #getQuoteStart() quote-start}, {@link #getQuoteEnd() quote-end} and {@link #getQuoteEscape() + * quote-escape} all point to the same character (which is NOT {@code '\0'}), then this method determines if + * {@link #getQuoteEscape() quotation escaping} is lazy. This means that outside a quotation a double + * occurrence of the quote character is NOT treated as quotation but as escaped quote character. Otherwise if NOT + * lazy, the double quote character is treated as quotation representing the empty sequence.
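+ * Here are some examples:
+ * <table border="1">
+ * <tr>
+ * <th>{@link #getQuoteStart() quote-start}</th>
+ * <th>{@link #getQuoteEnd() quote-end}</th>
+ * <th>{@link #getQuoteEscape() quote-escape}</th>
+ * <th>{@link #isQuoteEscapeLazy() quote-escape-lazy}</th>
+ * <th>input</th>
+ * <th>output</th>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>true</td>
+ * <td>''</td>
+ * <td>'</td>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>false</td>
+ * <td>''</td>
+ * <td>&nbsp;</td>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>true</td>
+ * <td>''''</td>
+ * <td>''</td>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>false</td>
+ * <td>''''</td>
+ * <td>'</td>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>true</td>
+ * <td>'''a'</td>
+ * <td>'a</td>
+ * </tr>
+ * <tr>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>'</td>
+ * <td>false</td>
+ * <td>'''a'</td>
+ * <td>'a</td>
+ * </tr>
+ * </table>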
+ *
+ * Please note that for {@code '''a'} the complete sequence is treated as quote if {@link #isQuoteEscapeLazy() + * quote-escape-lazy} is {@code false} and otherwise just the trailing {@code 'a'}. + * + * @return {@code true} if quote-escaping is lazy, {@code false} otherwise. + */ + boolean isQuoteEscapeLazy(); + + /** + * This method gets the alternative character used to start a quotation that should be terminated by a + * {@link #getAltQuoteEnd() alt-quote-end} character. The text inside the quote is taken as is (without the quote + * characters). + * + * @see #getQuoteStart() + * + * @return the alternative character used to start a quotation or {@code '\0'} to disable. + */ + int getAltQuoteStart(); + + /** + * This method gets the alternative character used to end a quotation. + * + * @see #getAltQuoteStart() + * + * @return the alternative character used to end a quotation. + */ + int getAltQuoteEnd(); + + /** + * This method gets the character used to escape the {@link #getAltQuoteEnd() alt-quote-end} character within an + * quotation opened by {@link #getAltQuoteStart() alt-quote-start}. + * + * @see #getQuoteEscape() + * + * @return the character used to escape the {@link #getQuoteEnd() quote-end} character or {@code '\0'} to disable. + */ + int getAltQuoteEscape(); + + /** + * If {@link #getAltQuoteStart() alt-quote-start}, {@link #getAltQuoteEnd() alt-quote-end} and + * {@link #getAltQuoteEscape() alt-quote-escape} all point to the same character (which is NOT {@code '\0'}), then + * this method determines if {@link #getAltQuoteEscape() alt-quotation escaping} is lazy. + * + * @see #isQuoteEscapeLazy() + * + * @return {@code true} if alt-quote-escaping is lazy, {@code false} otherwise. + */ + boolean isAltQuoteEscapeLazy(); + + /** + * This method gets the character used to start an entity. An entity is a specific encoded string surrounded with + * {@link #getEntityStart() entity-start} and {@link #getEntityEnd() entity-end}. 
It will be decoded by + * {@link #resolveEntity(String)}. + * + * @return the character used to start an entity or {@code '\0'} to disable. + */ + int getEntityStart(); + + /** + * This method gets the character used to end an entity. + * + * @see #getEntityStart() + * + * @return the character used to end an entity. + */ + int getEntityEnd(); + + /** + * This method resolves the given {@code entity}.
+ * E.g. if {@link #getEntityStart() entity-start} is {@code '&'} and {@link #getEntityEnd()} is {@code ';'} then if + * the string {@code "&lt;"} is scanned, this method is called with {@code "lt"} as {@code entity} argument and may + * return {@code "<"}. + * + * @param entity is the entity string that was found surrounded by {@link #getEntityStart() entity-start} and + * {@link #getEntityEnd() entity-end} excluding these characters. + * @return the decoded entity. + */ + String resolveEntity(String entity); + +} diff --git a/core/src/main/java/io/github/mmm/scanner/CharScannerSyntaxBean.java b/core/src/main/java/io/github/mmm/scanner/CharScannerSyntaxBean.java index b38d725..5956c1b 100644 --- a/core/src/main/java/io/github/mmm/scanner/CharScannerSyntaxBean.java +++ b/core/src/main/java/io/github/mmm/scanner/CharScannerSyntaxBean.java @@ -1,253 +1,236 @@ -/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0 - * http://www.apache.org/licenses/LICENSE-2.0 */ -package io.github.mmm.scanner; - -/** - * This is the implementation of {@link CharScannerSyntax} as Java bean.
- * The actual {@code char}s like {@link #getEscape() escape} are realized as simple bean-properties and initialized with - * '\0' so they are disabled by default. - * - * @see CharStreamScanner#readUntil(char, boolean, CharScannerSyntax) - */ -public class CharScannerSyntaxBean implements CharScannerSyntax { - - private char escape; - - private char quoteStart; - - private char quoteEnd; - - private char quoteEscape; - - private boolean quoteEscapeLazy; - - private char altQuoteStart; - - private char altQuoteEnd; - - private char altQuoteEscape; - - private boolean altQuoteEscapeLazy; - - private char entityStart; - - private char entityEnd; - - /** - * The constructor. - */ - public CharScannerSyntaxBean() { - - super(); - this.escape = '\0'; - this.quoteStart = '\0'; - this.quoteEnd = '\0'; - } - - @Override - public char getEscape() { - - return this.escape; - } - - /** - * @param escape is the {@link #getEscape() escape} to set. - */ - public void setEscape(char escape) { - - this.escape = escape; - } - - @Override - public char getQuoteStart() { - - return this.quoteStart; - } - - /** - * @param quoteStart is the {@link #getQuoteStart() quoteStart} to set. - */ - public void setQuoteStart(char quoteStart) { - - this.quoteStart = quoteStart; - } - - @Override - public char getQuoteEnd() { - - return this.quoteEnd; - } - - /** - * @param quoteEnd is the {@link #getQuoteEnd() quoteEnd} to set. - */ - public void setQuoteEnd(char quoteEnd) { - - this.quoteEnd = quoteEnd; - } - - /** - * This method sets both the {@link #getQuoteStart() quote-start} and {@link #getQuoteEnd() quote-end} character. - * - * @param quote the quote character to set. - */ - public void setQuote(char quote) { - - this.quoteStart = quote; - this.quoteEnd = quote; - } - - @Override - public char getQuoteEscape() { - - return this.quoteEscape; - } - - /** - * @param quoteEscape is the {@link #getQuoteEnd() quote-escape} to set. 
- */ - public void setQuoteEscape(char quoteEscape) { - - this.quoteEscape = quoteEscape; - } - - @Override - public boolean isQuoteEscapeLazy() { - - return this.quoteEscapeLazy; - } - - /** - * @param quoteEscapeLazy the {@link #isQuoteEscapeLazy() quote-escape-lazy} flag to set - */ - public void setQuoteEscapeLazy(boolean quoteEscapeLazy) { - - this.quoteEscapeLazy = quoteEscapeLazy; - } - - /** - * This method gets the alternative character used to start a quotation that should be terminated by a - * {@link #getAltQuoteEnd() alt-quote-end} character. The text inside the quote is taken as is (without the quote - * characters). - * - * @see #getQuoteStart() - * - * @return the alternative character used to start a quotation or {@code '\0'} for no quotation. - */ - @Override - public char getAltQuoteStart() { - - return this.altQuoteStart; - } - - /** - * @param alternativeQuoteStart is the {@link #getAltQuoteStart() alt-quote-start} character to set. - */ - public void setAltQuoteStart(char alternativeQuoteStart) { - - this.altQuoteStart = alternativeQuoteStart; - } - - /** - * This method gets the alternative character used to end a quotation. - * - * @see #getAltQuoteStart() - * - * @return the alternative character used to end a quotation. - */ - @Override - public char getAltQuoteEnd() { - - return this.altQuoteEnd; - } - - /** - * This method sets the {@link #getAltQuoteEnd() alt-quote-end} character. - * - * @param alternativeQuoteEnd is the {@link #getAltQuoteEnd() alt-quote-end} character. - */ - public void setAltQuoteEnd(char alternativeQuoteEnd) { - - this.altQuoteEnd = alternativeQuoteEnd; - } - - /** - * This method sets both the {@link #getAltQuoteStart() alt-quote-start} and {@link #getAltQuoteEnd() alt-quote-end} - * character. - * - * @param altQuote the alt-quote character to set. 
- */ - public void setAltQuote(char altQuote) { - - this.altQuoteStart = altQuote; - this.altQuoteEnd = altQuote; - } - - @Override - public char getAltQuoteEscape() { - - return this.altQuoteEscape; - } - - /** - * @param altQuoteEscape is the {@link #getAltQuoteEscape() alt-quote-escape} to set. - */ - public void setAltQuoteEscape(char altQuoteEscape) { - - this.altQuoteEscape = altQuoteEscape; - } - - @Override - public boolean isAltQuoteEscapeLazy() { - - return this.altQuoteEscapeLazy; - } - - /** - * @param altQuoteEscapeLazy the {@link #isAltQuoteEscapeLazy() alt-quote-lazy} flag to set - */ - public void setAltQuoteEscapeLazy(boolean altQuoteEscapeLazy) { - - this.altQuoteEscapeLazy = altQuoteEscapeLazy; - } - - @Override - public char getEntityStart() { - - return this.entityStart; - } - - /** - * @param entityStart the {@link #getEntityStart() entity-start} to set. - */ - public void setEntityStart(char entityStart) { - - this.entityStart = entityStart; - } - - @Override - public char getEntityEnd() { - - return this.entityEnd; - } - - /** - * @param entityEnd the {@link #getEntityEnd() entity-end} to set. - */ - public void setEntityEnd(char entityEnd) { - - this.entityEnd = entityEnd; - } - - /** - * {@inheritDoc} - * - * ATTENTION:
- * You need to override this method if you want to {@link #setEntityStart(char) use} entities. - */ - @Override - public String resolveEntity(String entity) { - - throw new IllegalArgumentException(entity); - } -} +/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0 + * http://www.apache.org/licenses/LICENSE-2.0 */ +package io.github.mmm.scanner; + +/** + * This is the implementation of {@link CharScannerSyntax} as Java bean.
+ * The actual {@code char}s like {@link #getEscape() escape} are realized as simple bean-properties and initialized with + * '\0' so they are disabled by default. + * + * @see CharStreamScanner#readUntil(int, boolean, CharScannerSyntax) + */ +public class CharScannerSyntaxBean implements CharScannerSyntax { + + private int escape; + + private int quoteStart; + + private int quoteEnd; + + private int quoteEscape; + + private boolean quoteEscapeLazy; + + private int altQuoteStart; + + private int altQuoteEnd; + + private int altQuoteEscape; + + private boolean altQuoteEscapeLazy; + + private int entityStart; + + private int entityEnd; + + /** + * The constructor. + */ + public CharScannerSyntaxBean() { + + super(); + this.escape = '\0'; + this.quoteStart = '\0'; + this.quoteEnd = '\0'; + } + + @Override + public int getEscape() { + + return this.escape; + } + + /** + * @param escape is the {@link #getEscape() escape} to set. + */ + public void setEscape(int escape) { + + this.escape = escape; + } + + @Override + public int getQuoteStart() { + + return this.quoteStart; + } + + /** + * @param quoteStart is the {@link #getQuoteStart() quote-start} to set. + */ + public void setQuoteStart(int quoteStart) { + + this.quoteStart = quoteStart; + } + + @Override + public int getQuoteEnd() { + + return this.quoteEnd; + } + + /** + * @param quoteEnd is the {@link #getQuoteEnd() quote-end} to set. + */ + public void setQuoteEnd(int quoteEnd) { + + this.quoteEnd = quoteEnd; + } + + /** + * This method sets both the {@link #getQuoteStart() quote-start} and {@link #getQuoteEnd() quote-end} character. + * + * @param quote the quote character to set. + */ + public void setQuote(int quote) { + + this.quoteStart = quote; + this.quoteEnd = quote; + } + + @Override + public int getQuoteEscape() { + + return this.quoteEscape; + } + + /** + * @param quoteEscape is the {@link #getQuoteEscape() quote-escape} to set. 
+ */ + public void setQuoteEscape(int quoteEscape) { + + this.quoteEscape = quoteEscape; + } + + @Override + public boolean isQuoteEscapeLazy() { + + return this.quoteEscapeLazy; + } + + /** + * @param quoteEscapeLazy the {@link #isQuoteEscapeLazy() quote-escape-lazy} flag to set. + */ + public void setQuoteEscapeLazy(boolean quoteEscapeLazy) { + + this.quoteEscapeLazy = quoteEscapeLazy; + } + + @Override + public int getAltQuoteStart() { + + return this.altQuoteStart; + } + + /** + * @param alternativeQuoteStart is the {@link #getAltQuoteStart() alt-quote-start} character to set. + */ + public void setAltQuoteStart(int alternativeQuoteStart) { + + this.altQuoteStart = alternativeQuoteStart; + } + + @Override + public int getAltQuoteEnd() { + + return this.altQuoteEnd; + } + + /** + * This method sets the {@link #getAltQuoteEnd() alt-quote-end} character. + * + * @param alternativeQuoteEnd is the {@link #getAltQuoteEnd() alt-quote-end} character. + */ + public void setAltQuoteEnd(int alternativeQuoteEnd) { + + this.altQuoteEnd = alternativeQuoteEnd; + } + + /** + * This method sets both the {@link #getAltQuoteStart() alt-quote-start} and {@link #getAltQuoteEnd() alt-quote-end} + * character. + * + * @param altQuote the alt-quote character to set. 
+ */ + public void setAltQuote(int altQuote) { + + this.altQuoteStart = altQuote; + this.altQuoteEnd = altQuote; + } + + @Override + public int getAltQuoteEscape() { + + return this.altQuoteEscape; + } + + /** + * @param altQuoteEscape is the {@link #getAltQuoteEscape() alt-quote-escape} to set. + */ + public void setAltQuoteEscape(int altQuoteEscape) { + + this.altQuoteEscape = altQuoteEscape; + } + + @Override + public boolean isAltQuoteEscapeLazy() { + + return this.altQuoteEscapeLazy; + } + + /** + * @param altQuoteEscapeLazy the {@link #isAltQuoteEscapeLazy() alt-quote-escape-lazy} flag to set. + */ + public void setAltQuoteEscapeLazy(boolean altQuoteEscapeLazy) { + + this.altQuoteEscapeLazy = altQuoteEscapeLazy; + } + + @Override + public int getEntityStart() { + + return this.entityStart; + } + + /** + * @param entityStart the {@link #getEntityStart() entity-start} to set. + */ + public void setEntityStart(int entityStart) { + + this.entityStart = entityStart; + } + + @Override + public int getEntityEnd() { + + return this.entityEnd; + } + + /** + * @param entityEnd the {@link #getEntityEnd() entity-end} to set. + */ + public void setEntityEnd(int entityEnd) { + + this.entityEnd = entityEnd; + } + + /** + * {@inheritDoc}
+ * ATTENTION:
+ * You need to override this method if you want to {@link #setEntityStart(int) use} entities. + */ + @Override + public String resolveEntity(String entity) { + + throw new IllegalArgumentException(entity); + } +} diff --git a/core/src/main/java/io/github/mmm/scanner/CharSequenceScanner.java b/core/src/main/java/io/github/mmm/scanner/CharSequenceScanner.java index 2f30251..e0f283f 100644 --- a/core/src/main/java/io/github/mmm/scanner/CharSequenceScanner.java +++ b/core/src/main/java/io/github/mmm/scanner/CharSequenceScanner.java @@ -1,505 +1,432 @@ -/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0 - * http://www.apache.org/licenses/LICENSE-2.0 */ -package io.github.mmm.scanner; - -import io.github.mmm.base.filter.CharFilter; -import io.github.mmm.base.text.TextFormatMessageHandler; - -/** - * This class represents a {@link String} or better a sequence of characters ({@code char[]}) together with a - * {@link #getCurrentIndex() position} in that sequence.
- * It has various useful methods for scanning the sequence. This scanner is designed to be fast on long sequences and - * therefore internally {@link String#toCharArray() converts} {@link String}s to a char array instead of frequently - * calling {@link String#charAt(int)}.
- * ATTENTION:
- * This implementation is NOT thread-safe (intended by design). - * - * @since 1.0.0 - */ -public class CharSequenceScanner extends AbstractCharStreamScanner { - - private String string; - - /** The initial {@link #offset} in the {@link #buffer}. */ - private final int initialOffset; - - /** - * The constructor. - * - * @param charSequence is the {@link #getOriginalString() string} to scan. - */ - public CharSequenceScanner(CharSequence charSequence) { - - this(charSequence, null); - } - - /** - * The constructor. - * - * @param charSequence is the {@link #getOriginalString() string} to scan. - * @param messageHandler the {@link TextFormatMessageHandler}. - */ - public CharSequenceScanner(CharSequence charSequence, TextFormatMessageHandler messageHandler) { - - this(charSequence.toString(), messageHandler); - } - - /** - * The constructor. - * - * @param string is the {@link #getOriginalString() string} to parse. - */ - public CharSequenceScanner(String string) { - - this(string, null); - } - - /** - * The constructor. - * - * @param string is the {@link #getOriginalString() string} to parse. - * @param messageHandler the {@link TextFormatMessageHandler}. - */ - public CharSequenceScanner(String string, TextFormatMessageHandler messageHandler) { - - this(string, messageHandler, 1, 1); - } - - /** - * The constructor. - * - * @param string is the {@link #getOriginalString() string} to parse. - * @param messageHandler the {@link TextFormatMessageHandler}. - * @param line the initial {@link #getLine() line}. - * @param column the initial {@link #getColumn() column}. - */ - public CharSequenceScanner(String string, TextFormatMessageHandler messageHandler, int line, int column) { - - this(string.toCharArray(), messageHandler, line, column); - this.string = string; - } - - /** - * The constructor. - * - * @param characters is an array containing the characters to scan. 
- */ - public CharSequenceScanner(char[] characters) { - - this(characters, null); - } - - /** - * The constructor. - * - * @param characters is an array containing the characters to scan. - * @param messageHandler the {@link TextFormatMessageHandler}. - */ - public CharSequenceScanner(char[] characters, TextFormatMessageHandler messageHandler) { - - this(characters, 0, characters.length, messageHandler); - } - - /** - * The constructor. - * - * @param characters is an array containing the characters to scan. - * @param messageHandler the {@link TextFormatMessageHandler}. - * @param line the initial {@link #getLine() line}. - * @param column the initial {@link #getColumn() column}. - */ - public CharSequenceScanner(char[] characters, TextFormatMessageHandler messageHandler, int line, int column) { - - this(characters, 0, characters.length, messageHandler, line, column); - } - - /** - * The constructor. - * - * @param characters is an array containing the characters to scan. - * @param offset is the index of the first char to scan in {@code characters} (typically {@code 0} to start at the - * beginning of the array). - * @param length is the {@link #getLength() number of characters} to scan from {@code characters} starting at - * {@code offset} (typically characters.length - offset). - */ - public CharSequenceScanner(char[] characters, int offset, int length) { - - this(characters, offset, length, null); - } - - /** - * The constructor. - * - * @param characters is an array containing the characters to scan. - * @param offset is the index of the first char to scan in {@code characters} (typically {@code 0} to start at the - * beginning of the array). - * @param length is the {@link #getLength() number of characters} to scan from {@code characters} starting at - * {@code offset} (typically characters.length - offset). - * @param messageHandler the {@link TextFormatMessageHandler}. 
- */ - public CharSequenceScanner(char[] characters, int offset, int length, TextFormatMessageHandler messageHandler) { - - this(characters, offset, length, messageHandler, 1, 1); - } - - /** - * The constructor. - * - * @param characters is an array containing the characters to scan. - * @param offset is the index of the first char to scan in {@code characters} (typically {@code 0} to start at the - * beginning of the array). - * @param length is the {@link #getLength() number of characters} to scan from {@code characters} starting at - * {@code offset} (typically characters.length - offset). - * @param messageHandler the {@link TextFormatMessageHandler}. - * @param line the initial {@link #getLine() line}. - * @param column the initial {@link #getColumn() column}. - */ - public CharSequenceScanner(char[] characters, int offset, int length, TextFormatMessageHandler messageHandler, - int line, int column) { - - super(characters, messageHandler, line, column); - if (offset < 0) { - throw new IndexOutOfBoundsException(Integer.toString(offset)); - } - if (length < 0) { - throw new IndexOutOfBoundsException(Integer.toString(length)); - } - if (offset > characters.length - length) { - throw new IndexOutOfBoundsException(Integer.toString(offset + length)); - } - this.offset = offset; - this.initialOffset = offset; - this.limit = offset + length; - this.offset = this.initialOffset; - } - - /** - * @see java.lang.CharSequence#charAt(int) - * - * @param index is the index of the requested character. - * @return the character at the given {@code index}. - */ - public char charAt(int index) { - - return this.buffer[this.initialOffset + index]; - } - - @Override - public int getPosition() { - - return this.offset - this.initialOffset; - } - - /** - * @see java.lang.CharSequence#length() - * - * @return the total length of the {@link #getOriginalString() string to parse}. 
- */ - public int getLength() { - - return this.limit - this.initialOffset; - } - - /** - * @see String#substring(int, int) - * - * @param start the start index, inclusive. - * @param end the end index, exclusive. - * @return the specified substring. - */ - public String substring(int start, int end) { - - return new String(this.buffer, this.initialOffset + start, end - start); - } - - /** - * This method gets the {@link #getOriginalString() original string} where the {@link #substring(int, int) substring} - * specified by {@code start} and {@code end} is replaced by {@code substitute}. - * - * @param substitute is the string used as replacement. - * @param start is the inclusive start index of the substring to replace. - * @param end is the exclusive end index of the substring to replace. - * @return the {@link #getOriginalString() original string} with the specified substring replaced by - * {@code substitute}. - */ - public String getReplaced(String substitute, int start, int end) { - - int restLength = this.limit - end; - StringBuilder builder = builder(null); - builder.append(this.buffer, this.initialOffset, start); - builder.append(substitute); - builder.append(this.buffer, this.initialOffset + end, restLength); - return builder.toString(); - } - - /** - * This method appends the {@link #substring(int, int) substring} specified by {@code start} and {@code end} to the - * given {@code buffer}.
- * This avoids the overhead of creating a new string and copying the char array. - * - * @param appendable is the buffer where to append the substring to. - * @param start the start index, inclusive. - * @param end the end index, exclusive. - */ - public void appendSubstring(StringBuilder appendable, int start, int end) { - - appendable.append(this.buffer, this.initialOffset + start, end - start); - } - - /** - * This method gets the current position in the stream to scan. It will initially be {@code 0}. In other words this - * method returns the number of characters that have already been {@link #next() consumed}. - * - * @return the current index position. - */ - public int getCurrentIndex() { - - return this.offset - this.initialOffset; - } - - /** - * This method sets the {@link #getCurrentIndex() current index}. - * - * @param index is the next index position to set. The value has to be greater or equal to {@code 0} and less or equal - * to {@link #getLength()} . - */ - public void setCurrentIndex(int index) { - - // yes, index == getLength() is allowed - that is the state when the end is reached and - // setCurrentIndex(getCurrentPosition()) should NOT cause an exception... 
- if ((index < 0) || (index > getLength())) { - throw new IndexOutOfBoundsException(Integer.toString(index)); - } - this.offset = this.initialOffset + index; - } - - @Override - public boolean hasNext() { - - return (this.offset < this.limit); - } - - @Override - public char next() { - - if (this.offset < this.limit) { - return handleChar(this.buffer[this.offset++]); - } else { - return 0; - } - } - - @Override - public char peek() { - - if (this.offset < this.limit) { - return this.buffer[this.offset]; - } else { - return 0; - } - } - - @Override - public char peek(int lookaheadOffset) { - - int i = this.offset + lookaheadOffset; - if ((i < this.limit) && (i >= this.initialOffset)) { - if (i < this.limit) { - return this.buffer[i]; - } - } - return EOS; - } - - /** - * This method peeks the number of {@link #peek() next characters} given by {@code count} and returns them as string. - * If there are less characters {@link #hasNext() available} the returned string will be shorter than {@code count} - * and only contain the available characters. Unlike {@link #read(int)} this method does NOT consume the characters - * and will therefore NOT change the state of this scanner. - * - * @param count is the number of characters to peek. You may use {@link Integer#MAX_VALUE} to peek until the end of - * text (EOT) if the data-size is suitable. - * @return a string with the given number of characters or all available characters if less than {@code count}. Will - * be the empty string if no character is {@link #hasNext() available} at all. 
- */ - @Override - public String peekString(int count) { - - int len = this.limit - this.offset; - if (len > count) { - len = count; - } - String result = new String(this.buffer, this.offset, len); - return result; - } - - @Override - public String peekWhile(CharFilter filter, int maxLen) { - - if (maxLen < 0) { - throw new IllegalArgumentException("Max must NOT be negative: " + maxLen); - } - int len = 0; - int end = this.limit - this.offset; - if (end > maxLen) { - end = maxLen; - } - while (len < end) { - char c = this.buffer[this.offset + len]; - if (!filter.accept(c)) { - break; - } - len++; - } - if (len == 0) { - return ""; - } else { - return new String(this.buffer, this.offset, len); - } - } - - @Override - public String readUntil(CharFilter filter, boolean acceptEot) { - - int start = this.offset; - while (this.offset < this.limit) { - char c = this.buffer[this.offset]; - if (filter.accept(c)) { - return new String(this.buffer, start, this.offset - start); - } - handleChar(c); - this.offset++; - } - if (acceptEot) { - int len = this.offset - start; - if (len > 0) { - return new String(this.buffer, start, len); - } else { - return ""; - } - } else { - return null; - } - } - - @Override - protected boolean expectRestWithLookahead(char[] stopChars, boolean ignoreCase, Runnable appender, boolean skip) { - - int myCharsIndex = this.offset + 1; - int stopCharsIndex = 1; - while (stopCharsIndex < stopChars.length) { - char c = this.buffer[myCharsIndex++]; - char stopChar = stopChars[stopCharsIndex++]; - if ((c != stopChar) && (!ignoreCase || (Character.toLowerCase(c) != stopChar))) { - return false; - } - } - if (skip) { - setOffset(myCharsIndex); - } - return true; - } - - @Override - public boolean expect(String expected, boolean ignoreCase, boolean lookahead, int off) { - - int len = expected.length(); - int newPos = this.offset + off; - if (newPos + len > this.limit) { - return false; - } - for (int i = 0; i < len; i++) { - char c = this.buffer[newPos]; - 
char exp = expected.charAt(i); - if (c != exp) { - if (!ignoreCase) { - return false; - } - if (Character.toLowerCase(c) != Character.toLowerCase(exp)) { - return false; - } - } - newPos++; - } - if (!lookahead) { - setOffset(newPos); - } - return true; - } - - /** - * This method gets the tail of this scanner without changing the state. - * - * @return the tail of this scanner. - */ - protected String getTail() { - - String tail = ""; - if (this.offset < this.limit) { - tail = new String(this.buffer, this.offset, this.limit - this.offset + 1); - } - return tail; - } - - /** - * This method gets the tail of this scanner limited (truncated) to the given {@code maximum} number of characters - * without changing the state. - * - * @param maximum is the maximum number of characters to return from the {@link #getTail() tail}. - * @return the tail of this scanner. - */ - protected String getTail(int maximum) { - - String tail = ""; - if (this.offset < this.limit) { - int count = this.limit - this.offset + 1; - if (count > maximum) { - count = maximum; - } - tail = new String(this.buffer, this.offset, count); - } - return tail; - } - - @Override - public void require(String expected, boolean ignoreCase) { - - if (!expect(expected, ignoreCase)) { - throw new IllegalStateException("Expecting '" + expected + "' but found: " + getTail(expected.length())); - } - } - - @Override - public String readWhile(CharFilter filter, int min, int max) { - - int currentPos = this.offset; - int len = skipWhile(filter, max); - if (len == 0) { - return ""; - } else { - return new String(this.buffer, currentPos, len); - } - } - - /** - * This method gets the original string to parse. - * - * @see CharSequenceScanner#CharSequenceScanner(String) - * - * @return the original string. 
- */ - public String getOriginalString() { - - if (this.string != null) { - this.string = new String(this.buffer, this.initialOffset, getLength()); - } - return this.string; - } - - @Override - public void close() { - - this.buffer = null; - } - -} +/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0 + * http://www.apache.org/licenses/LICENSE-2.0 */ +package io.github.mmm.scanner; + +import io.github.mmm.base.filter.CharFilter; +import io.github.mmm.base.text.TextFormatMessageHandler; + +/** + * Implementation of {@link CharStreamScanner} based on {@link String}. + * + * @since 1.0.0 + */ +public class CharSequenceScanner extends AbstractCharStreamScanner { + + private String string; + + /** The initial {@link #offset} in the {@link #buffer}. */ + private final int initialOffset; + + /** + * The constructor. + * + * @param charSequence is the {@link #getOriginalString() string} to scan. + */ + public CharSequenceScanner(CharSequence charSequence) { + + this(charSequence, null); + } + + /** + * The constructor. + * + * @param charSequence is the {@link #getOriginalString() string} to scan. + * @param messageHandler the {@link TextFormatMessageHandler}. + */ + public CharSequenceScanner(CharSequence charSequence, TextFormatMessageHandler messageHandler) { + + this(charSequence.toString(), messageHandler); + } + + /** + * The constructor. + * + * @param string is the {@link #getOriginalString() string} to parse. + */ + public CharSequenceScanner(String string) { + + this(string, null); + } + + /** + * The constructor. + * + * @param string is the {@link #getOriginalString() string} to parse. + * @param messageHandler the {@link TextFormatMessageHandler}. + */ + public CharSequenceScanner(String string, TextFormatMessageHandler messageHandler) { + + this(string, messageHandler, 1, 1); + } + + /** + * The constructor. + * + * @param string is the {@link #getOriginalString() string} to parse. 
+   * @param messageHandler the {@link TextFormatMessageHandler}.
+   * @param line the initial {@link #getLine() line}.
+   * @param column the initial {@link #getColumn() column}.
+   */
+  public CharSequenceScanner(String string, TextFormatMessageHandler messageHandler, int line, int column) {
+
+    this(string, 0, string.length(), messageHandler, line, column);
+    this.string = string;
+  }
+
+  /**
+   * The constructor.
+   *
+   * @param characters is the {@link String} containing the characters to scan.
+   * @param offset is the index of the first char to scan in {@code characters} (typically {@code 0} to start at the
+   *        beginning of the string).
+   * @param length is the {@link #getLength() number of characters} to scan from {@code characters} starting at
+   *        {@code offset} (typically {@code characters.length() - offset}).
+   * @param messageHandler the {@link TextFormatMessageHandler}.
+   * @param line the initial {@link #getLine() line}.
+   * @param column the initial {@link #getColumn() column}.
+   * @throws IndexOutOfBoundsException if {@code offset} or {@code length} is negative or the range exceeds
+   *         {@code characters}.
+   */
+  public CharSequenceScanner(String characters, int offset, int length, TextFormatMessageHandler messageHandler,
+      int line, int column) {
+
+    super(characters, messageHandler, line, column);
+    if (offset < 0) {
+      throw new IndexOutOfBoundsException(Integer.toString(offset));
+    } else if (length < 0) {
+      throw new IndexOutOfBoundsException(Integer.toString(length));
+    } else if (offset > characters.length() - length) {
+      throw new IndexOutOfBoundsException(Integer.toString(offset + length));
+    }
+    this.offset = offset;
+    this.initialOffset = offset;
+    this.limit = offset + length;
+    // NOTE(review): redundant - offset already holds this value from the assignment above.
+    this.offset = this.initialOffset;
+  }
+
+  /**
+   * @see java.lang.CharSequence#charAt(int)
+   *
+   * @param index is the index of the requested character.
+   * @return the character at the given {@code index}. 
+   */
+  public int charAt(int index) {
+
+    return this.buffer.codePointAt(this.initialOffset + index);
+  }
+
+  @Override
+  public int getPosition() {
+
+    return this.offset - this.initialOffset;
+  }
+
+  /**
+   * @see java.lang.CharSequence#length()
+   *
+   * @return the total length of the {@link #getOriginalString() string to parse}.
+   */
+  public int getLength() {
+
+    return this.limit - this.initialOffset;
+  }
+
+  /**
+   * @see String#substring(int, int)
+   *
+   * @param start the start index, inclusive.
+   * @param end the end index, exclusive.
+   * @return the specified substring.
+   */
+  public String substring(int start, int end) {
+
+    return this.buffer.substring(this.initialOffset + start, this.initialOffset + end);
+  }
+
+  /**
+   * This method gets the {@link #getOriginalString() original string} where the {@link #substring(int, int) substring}
+   * specified by {@code start} and {@code end} is replaced by {@code substitute}.
+   *
+   * @param substitute is the string used as replacement.
+   * @param start is the inclusive start index of the substring to replace.
+   * @param end is the exclusive end index of the substring to replace.
+   * @return the {@link #getOriginalString() original string} with the specified substring replaced by
+   *         {@code substitute}.
+   */
+  public String getReplaced(String substitute, int start, int end) {
+
+    StringBuilder builder = builder(null);
+    // buffer is a String, so append(CharSequence, start, end) applies: both arguments are absolute indices.
+    builder.append(this.buffer, this.initialOffset, this.initialOffset + start);
+    builder.append(substitute);
+    builder.append(this.buffer, this.initialOffset + end, this.limit);
+    return builder.toString();
+  }
+
+  /**
+   * This method appends the {@link #substring(int, int) substring} specified by {@code start} and {@code end} to the
+   * given {@code buffer}.
+   * This avoids the overhead of creating a new string and copying the char array.
+   *
+   * @param appendable is the buffer where to append the substring to.
+   * @param start the start index, inclusive.
+   * @param end the end index, exclusive.
+   */
+  public void appendSubstring(StringBuilder appendable, int start, int end) {
+
+    // append(CharSequence, start, end) takes an exclusive end index, not a length.
+    appendable.append(this.buffer, this.initialOffset + start, this.initialOffset + end);
+  }
+
+  /**
+   * This method gets the current position in the stream to scan. It will initially be {@code 0}. In other words this
+   * method returns the number of characters that have already been {@link #next() consumed}.
+   *
+   * @return the current index position.
+   */
+  public int getCurrentIndex() {
+
+    return this.offset - this.initialOffset;
+  }
+
+  /**
+   * This method sets the {@link #getCurrentIndex() current index}.
+   *
+   * @param index is the next index position to set. The value has to be greater or equal to {@code 0} and less or equal
+   *        to {@link #getLength()} .
+   */
+  public void setCurrentIndex(int index) {
+
+    // yes, index == getLength() is allowed - that is the state when the end is reached and
+    // setCurrentIndex(getCurrentPosition()) should NOT cause an exception...
+    if ((index < 0) || (index > getLength())) {
+      throw new IndexOutOfBoundsException(Integer.toString(index));
+    }
+    this.offset = this.initialOffset + index;
+  }
+
+  @Override
+  public boolean hasNext() {
+
+    return (this.offset < this.limit);
+  }
+
+  @Override
+  public int next() {
+
+    if (this.offset < this.limit) {
+      int cp = this.buffer.codePointAt(this.offset);
+      // A supplementary code point occupies two chars: skip both so the low surrogate is not returned on its own by
+      // the following call. The clamp keeps offset <= limit if a pair straddles the end of the scanned range.
+      this.offset = Math.min(this.offset + Character.charCount(cp), this.limit);
+      return handleCodePoint(cp);
+    } else {
+      return EOS;
+    }
+  }
+
+  @Override
+  public int peek() {
+
+    if (this.offset < this.limit) {
+      return this.buffer.codePointAt(this.offset);
+    } else {
+      return EOS;
+    }
+  }
+
+  @Override
+  public int peek(int lookaheadOffset) {
+
+    int i = this.offset + lookaheadOffset;
+    if ((i < this.limit) && (i >= this.initialOffset)) {
+      return this.buffer.codePointAt(i);
+    }
+    return EOS;
+  }
+
+  /**
+   * This method peeks the number of {@link #peek() next characters} given by {@code count} and returns them as string.
+   * If there are less characters {@link #hasNext() available} the returned string will be shorter than {@code count}
+   * and only contain the available characters. Unlike {@link #read(int)} this method does NOT consume the characters
+   * and will therefore NOT change the state of this scanner.
+   *
+   * @param count is the number of characters to peek. You may use {@link Integer#MAX_VALUE} to peek until the end of
+   *        text (EOT) if the data-size is suitable.
+   * @return a string with the given number of characters or all available characters if less than {@code count}. Will
+   *         be the empty string if no character is {@link #hasNext() available} at all. 
+   */
+  @Override
+  public String peekString(int count) {
+
+    // Compare against the remaining length instead of computing offset + count, which overflows for the documented
+    // Integer.MAX_VALUE argument once offset > 0.
+    int end = this.limit;
+    if (count < end - this.offset) {
+      end = this.offset + count;
+    }
+    return this.buffer.substring(this.offset, end);
+  }
+
+  @Override
+  public String peekWhile(CharFilter filter, int maxLen) {
+
+    if (maxLen < 0) {
+      throw new IllegalArgumentException("Max must NOT be negative: " + maxLen);
+    }
+    // overflow-safe clamp, see peekString
+    int end = this.limit;
+    if (maxLen < end - this.offset) {
+      end = this.offset + maxLen;
+    }
+    int i = this.offset;
+    while (i < end) {
+      int cp = this.buffer.codePointAt(i);
+      if (!filter.accept(cp)) {
+        break;
+      }
+      // step over both chars of a surrogate pair so the filter never sees a lone low surrogate
+      i = Math.min(i + Character.charCount(cp), end);
+    }
+    if (i == this.offset) {
+      return "";
+    } else {
+      return this.buffer.substring(this.offset, i);
+    }
+  }
+
+  @Override
+  public String readUntil(CharFilter filter, boolean acceptEot) {
+
+    int start = this.offset;
+    while (this.offset < this.limit) {
+      int cp = this.buffer.codePointAt(this.offset);
+      if (filter.accept(cp)) {
+        // the stop character is neither consumed nor part of the result
+        return this.buffer.substring(start, this.offset);
+      }
+      handleCodePoint(cp);
+      // advance by the full code point (two chars for supplementary characters) without passing limit
+      this.offset = Math.min(this.offset + Character.charCount(cp), this.limit);
+    }
+    if (acceptEot) {
+      if (this.offset > start) {
+        return this.buffer.substring(start, this.offset);
+      } else {
+        return "";
+      }
+    } else {
+      return null;
+    }
+  }
+
+  @Override
+  protected boolean expectRestWithLookahead(String stopChars, boolean ignoreCase, Runnable appender, boolean skip) {
+
+    int bufferIndex = this.offset + 1;
+    int stopIndex = 1;
+    int stopLength = stopChars.length();
+    while (stopIndex < stopLength) {
+      if (bufferIndex >= this.limit) {
+        // not enough text left to hold the rest of the stop string
+        return false;
+      }
+      int cp = this.buffer.codePointAt(bufferIndex++);
+      int stopCp = stopChars.codePointAt(stopIndex++);
+      if ((cp != stopCp) && (!ignoreCase || (Character.toLowerCase(cp) != stopCp))) {
+        return false;
+      }
+    }
+    if (skip) {
+      setOffset(bufferIndex);
+    }
+    return true;
+  }
+
+  @Override
+  public boolean expect(String expected, boolean ignoreCase, boolean lookahead, int off) {
+
+    int len = expected.length();
+    int newPos = this.offset + off;
+    if (newPos + len > this.limit) {
+      return false;
+    }
+    for (int i = 0; i < len; i++) {
+      int cp = 
this.buffer.codePointAt(newPos);
+      int expCp = expected.codePointAt(i);
+      if (cp != expCp) {
+        if (!ignoreCase) {
+          return false;
+        }
+        if (Character.toLowerCase(cp) != Character.toLowerCase(expCp)) {
+          return false;
+        }
+      }
+      newPos++;
+    }
+    if (!lookahead) {
+      setOffset(newPos);
+    }
+    return true;
+  }
+
+  /**
+   * This method gets the tail of this scanner without changing the state.
+   *
+   * @return the tail of this scanner.
+   */
+  protected String getTail() {
+
+    String tail = "";
+    if (this.offset < this.limit) {
+      tail = this.buffer.substring(this.offset, this.limit);
+    }
+    return tail;
+  }
+
+  /**
+   * This method gets the tail of this scanner limited (truncated) to the given {@code maximum} number of characters
+   * without changing the state.
+   *
+   * @param maximum is the maximum number of characters to return from the {@link #getTail() tail}.
+   * @return the tail of this scanner.
+   */
+  protected String getTail(int maximum) {
+
+    String tail = "";
+    if (this.offset < this.limit) {
+      // Compare against the remaining length instead of computing offset + maximum, which overflows for large
+      // values such as Integer.MAX_VALUE.
+      int end = this.limit;
+      if (maximum < end - this.offset) {
+        end = this.offset + maximum;
+      }
+      tail = this.buffer.substring(this.offset, end);
+    }
+    return tail;
+  }
+
+  @Override
+  public void require(String expected, boolean ignoreCase) {
+
+    if (!expect(expected, ignoreCase)) {
+      throw new IllegalStateException("Expecting '" + expected + "' but found: " + getTail(expected.length()));
+    }
+  }
+
+  @Override
+  public String readWhile(CharFilter filter, int min, int max) {
+
+    int start = this.offset;
+    int len = skipWhile(filter, max);
+    if (len == 0) {
+      return "";
+    } else {
+      return this.buffer.substring(start, start + len);
+    }
+  }
+
+  /**
+   * This method gets the original string to parse.
+   *
+   * @see CharSequenceScanner#CharSequenceScanner(String)
+   *
+   * @return the original string. 
+   */
+  public String getOriginalString() {
+
+    // Lazily derive the string only when it was not supplied at construction; honour limit so that text beyond
+    // the scanned range is not included.
+    if (this.string == null) {
+      this.string = this.buffer.substring(this.initialOffset, this.limit);
+    }
+    return this.string;
+  }
+
+  @Override
+  public void close() {
+
+    this.buffer = null;
+  }
+
+}
diff --git a/core/src/main/java/io/github/mmm/scanner/CharStreamScanner.java b/core/src/main/java/io/github/mmm/scanner/CharStreamScanner.java
index 95cd636..0e26541 100644
--- a/core/src/main/java/io/github/mmm/scanner/CharStreamScanner.java
+++ b/core/src/main/java/io/github/mmm/scanner/CharStreamScanner.java
@@ -1,1175 +1,1181 @@
-/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0
- * http://www.apache.org/licenses/LICENSE-2.0 */
-package io.github.mmm.scanner;
-
-import io.github.mmm.base.filter.CharFilter;
-import io.github.mmm.base.text.TextFormatMessageType;
-import io.github.mmm.base.text.TextFormatProcessor;
-import io.github.mmm.scanner.number.CharScannerNumberParser;
-import io.github.mmm.scanner.number.CharScannerRadixHandler;
-import io.github.mmm.scanner.number.CharScannerRadixMode;
-
-/**
- * This is the interface for a scanner that can be used to parse a stream or sequence of characters.
- */
-public interface CharStreamScanner extends TextFormatProcessor, AutoCloseable {
-
-  /**
-   * The NULL character {@code '\0'} used to indicate the end of stream (EOS).
- * ATTENTION: Do not confuse and mix {@code '\0'} with {@code '0'}. - * - * @see #next() - * @see #peek() - */ - char EOS = '\0'; - - /** - * This method determines if there is at least one more character available. - * - * @return {@code true} if there is at least one character available, {@code false} if the end of data has been - * reached. - */ - boolean hasNext(); - - /** - * This method reads the current character from the stream and increments the index stepping to the next character. - * You should {@link #hasNext() check} if a character is available before calling this method. Otherwise if your - * stream may contain the NUL character ('\0') you can not distinguish if the end of the stream was reached or you - * actually read the NUL character. - * - * @return the {@link #next()} character or {@link #EOS} if none is {@link #hasNext() available}. - */ - char next(); - - /** - * This method reads the current character without {@link #next() consuming} characters and will therefore not change - * the state of this scanner. - * - * @return the current character or {@link #EOS} if none is {@link #hasNext() available}. - */ - char peek(); - - /** - * Like {@link #peek()} but with further lookahead.
- * Attention:
- * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the given - * {@code lookaheadOffset} shall not exceed the available lookahead size (buffer capacity given at construction time). - * Otherwise the method may fail. - * - * @param lookaheadOffset the lookahead offset. If {@code 0} this method will behave like {@link #peek()}. In case of - * {@code 1} it will return the character after the next one and so forth. - * @return the {@link #peek() peeked} character at the given {@code lookaheadOffset} or {@link #EOS} if no such - * character exists. - */ - char peek(int lookaheadOffset); - - /** - * This method peeks the number of {@link #peek() next characters} given by {@code count} and returns them as - * {@link String}. If there are less characters {@link #hasNext() available} the returned {@link String} will be - * shorter than {@code count} and only contain the available characters. Unlike {@link #read(int)} this method does - * not {@link #next() consume} the characters and will therefore not change the state of this scanner.
- * Attention:
- * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the given - * {@code count} shall not exceed the available lookahead size (buffer capacity given at construction time). Otherwise - * the method may fail. - * - * @param count is the number of characters to peek. You may use {@link Integer#MAX_VALUE} to peek until the end of - * text (EOT) if the data-size is suitable. - * @return a string with the given number of characters or all available characters if less than {@code count}. Will - * be the empty string if no character is {@link #hasNext() available} at all. - */ - String peekString(int count); - - /** - * @param filter the {@link CharFilter} {@link CharFilter#accept(char) accepting} only the characters to peek. - * @param maxLen the maximum number of characters to peek (to get as lookahead without modifying this stream). - * @return a {@link String} with the {@link #peek() peeked} characters of the given {@code maxLen} or less if a - * character was hit that is not {@link CharFilter#accept(char) accepted} by the given {@code filter} - * or the end-of-text has been reached before. The state of this stream remains unchanged. - * @see #readWhile(CharFilter) - * @see #skip(int) - */ - String peekWhile(CharFilter filter, int maxLen); - - /** - * @param stopFilter the {@link CharFilter} that decides which characters to {@link CharFilter#accept(char) accept} as - * stop characters. - * @param maxLen the maximum number of characters to peek (get as lookahead without modifying this stream). - * @return a {@link String} with the {@link #peek() peeked} characters of the given {@code maxLen} or less if a stop - * character was hit or the end-of-text has been reached before. The state of this stream remains unchanged. 
- * @see #readWhile(CharFilter) - * @see #skip(int) - */ - default String peekUntil(CharFilter stopFilter, int maxLen) { - - return peekWhile(stopFilter.negate(), maxLen); - } - - /** - * This method reads the number of {@link #next() next characters} given by {@code count} and returns them as string. - * If there are less characters {@link #hasNext() available} the returned string will be shorter than {@code count} - * and only contain the available characters. - * - * @param count is the number of characters to read. You may use {@link Integer#MAX_VALUE} to read until the end of - * data if the data-size is suitable. - * @return a string with the given number of characters or all available characters if less than {@code count}. Will - * be the empty string if no character is {@link #hasNext() available} at all. - */ - String read(int count); - - /** - * This method reads the number of {@link #next() next characters} given by {@code count} and - * {@link StringBuilder#append(char) appends} them to the given {@link StringBuilder}. If there are less characters - * {@link #hasNext() available} then only the remaining characters will be appended resulting in less characters than - * {@code count}. - * - * @param count is the number of characters to read. You may use {@link Integer#MAX_VALUE} to read until the end of - * data if the data-size is suitable. - * @param builder the {@link StringBuilder} where to {@link StringBuilder#append(char) append} the characters to read. - */ - void read(int count, StringBuilder builder); - - /** - * @return the position in the sequence to scan or in other words the number of characters that have been read. Will - * initially be {@code 0}. Please note that this API is designed for scanning textual content (for parsers). - * Therefore we consider 2.1 terabyte as a suitable {@link Integer#MAX_VALUE limit}. 
- */ - int getPosition(); - - /** - * This method reads all {@link #next() next characters} until the given {@code stop} character or the end is reached. - *
- * After the call of this method, the current index will point to the next character after the (first) {@code stop} - * character or to the end if NO such character exists. - * - * @param stop is the character to read until. - * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. - * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no - * {@code stop} character and {@code acceptEnd} is {@code false}. - */ - String readUntil(char stop, boolean acceptEnd); - - /** - * This method reads all {@link #next() next characters} until the given (un-escaped) {@code stop} character or the - * end is reached.
- * In advance to {@link #readUntil(char, boolean)}, this method allows that the {@code stop} character may be used in - * the input-string by adding the given {@code escape} character. After the call of this method, the current index - * will point to the next character after the (first) {@code stop} character or to the end if NO such character - * exists.
- * This method is especially useful when quoted strings should be parsed. E.g.: - * - *
-   * {@link CharStreamScanner} scanner = getScanner();
-   * doSomething();
-   * char c = scanner.{@link #next()};
-   * if ((c == '"') || (c == '\'')) {
-   *   char escape = c; // may also be something like '\\'
-   *   String quote = scanner.{@link #readUntil(char, boolean, char) readUntil}(c, false, escape)
-   * } else {
-   *   doOtherThings();
-   * }
-   * 
- * - * @param stop is the character to read until. - * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. - * @param escape is the character used to escape the {@code stop} character. To add an occurrence of the - * {@code escape} character it has to be duplicated (occur twice). The {@code escape} character may also be - * equal to the {@code stop} character. If other regular characters are escaped the {@code escape} character is - * simply ignored. - * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no - * {@code stop} character and {@code acceptEnd} is {@code false}. - */ - String readUntil(char stop, boolean acceptEnd, char escape); - - /** - * This method reads all {@link #next() next characters} until the given {@code stop} character or the end of the - * string to parse is reached. In advance to {@link #readUntil(char, boolean)}, this method will scan the input using - * the given {@code syntax} which e.g. allows to {@link CharScannerSyntax#getEscape() escape} the stop character.
- * After the call of this method, the current index will point to the next character after the (first) {@code stop} - * character or to the end of the string if NO such character exists. - * - * @param stop is the character to read until. - * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. - * @param syntax contains the characters specific for the syntax to read. - * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no - * {@code stop} character. - * @see #readUntil(CharFilter, boolean, CharScannerSyntax) - */ - String readUntil(char stop, boolean acceptEnd, CharScannerSyntax syntax); - - /** - * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(char) - * accepted} by the given {@code filter} or the end is reached.
- * After the call of this method, the current index will point to the first {@link CharFilter#accept(char) accepted} - * stop character or to the end if NO such character exists. - * - * @param filter is used to {@link CharFilter#accept(char) decide} where to stop. - * @param acceptEnd if {@code true} if end of data should be treated like the {@code stop} character and the rest of - * the text will be returned, {@code false} otherwise (to return {@code null} if the end of data was reached - * and the scanner has been consumed). - * @return the string with all read characters not {@link CharFilter#accept(char) accepted} by the given - * {@link CharFilter} or {@code null} if there was no {@link CharFilter#accept(char) accepted} character and - * {@code acceptEnd} is {@code false}. - */ - String readUntil(CharFilter filter, boolean acceptEnd); - - /** - * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(char) - * accepted} by the given {@code filter}, the given {@code stop} {@link String} or the end is reached.
- * After the call of this method, the current index will point to the first {@link CharFilter#accept(char) accepted} - * stop character, or to the first character of the given {@code stop} {@link String} or to the end if NO such - * character exists. - * - * @param filter is used to {@link CharFilter#accept(char) decide} where to stop. - * @param acceptEnd if {@code true} if the end of data should be treated like the {@code stop} character and the rest - * of the text will be returned, {@code false} otherwise (to return {@code null} if end of data was reached and - * the scanner has been consumed). - * @param stop the {@link String} where to stop consuming data. Should be at least two characters long (otherwise - * accept by {@link CharFilter} instead). - * @return the string with all read characters not {@link CharFilter#accept(char) accepted} by the given - * {@link CharFilter} or until the given {@code stop} {@link String} was detected. If end of data was reached - * without a stop signal the entire rest of the data is returned or {@code null} if {@code acceptEnd} is - * {@code false}. - */ - default String readUntil(CharFilter filter, boolean acceptEnd, String stop) { - - return readUntil(filter, acceptEnd, stop, false); - } - - /** - * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(char) - * accepted} by the given {@code filter}, the given {@code stop} {@link String} or the end is reached.
- * After the call of this method, the current index will point to the first {@link CharFilter#accept(char) accepted} - * stop character, or to the first character of the given {@code stop} {@link String} or to the end if NO such - * character exists. - * - * @param filter is used to {@link CharFilter#accept(char) decide} where to stop. - * @param acceptEnd if {@code true} if the end of data should be treated like the {@code stop} character and the rest - * of the text will be returned, {@code false} otherwise (to return {@code null} if the end of data was reached - * and the scanner has been consumed). - * @param stop the {@link String} where to stop consuming data. Should be at least two characters long (otherwise - * accept by {@link CharFilter} instead). - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from - * {@code stop} {@link String}. - * @return the string with all read characters not {@link CharFilter#accept(char) accepted} by the given - * {@link CharFilter} or until the given {@code stop} {@link String} was detected. If the end of data was - * reached without a stop signal the entire rest of the data is returned or {@code null} if {@code acceptEnd} - * is {@code false}. - */ - default String readUntil(CharFilter filter, boolean acceptEnd, String stop, boolean ignoreCase) { - - return readUntil(filter, acceptEnd, stop, ignoreCase, false); - } - - /** - * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(char) - * accepted} by the given {@code filter}, the given {@code stop} {@link String} or the end is reached.
- * After the call of this method, the current index will point to the first {@link CharFilter#accept(char) accepted} - * stop character, or to the first character of the given {@code stop} {@link String} or to the end if NO such - * character exists. - * - * @param filter is used to {@link CharFilter#accept(char) decide} where to stop. - * @param acceptEnd if {@code true} if the end of data should be treated like the {@code stop} character and the rest - * of the text will be returned, {@code false} otherwise (to return {@code null} if the end of data was reached - * and the scanner has been consumed). - * @param stop the {@link String} where to stop consuming data. Should be at least two characters long (otherwise - * accept by {@link CharFilter} instead). - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from - * {@code stop} {@link String}. - * @param trim - {@code true} if the result should be {@link String#trim() trimmed}, {@code false} otherwise. - * @return the string with all read characters not {@link CharFilter#accept(char) accepted} by the given - * {@link CharFilter} or until the given {@code stop} {@link String} was detected. If the end of data was - * reached without hitting {@code stop} the entire rest of the data is returned or {@code null} if - * {@code acceptEnd} is {@code false}. Thre result will be {@link String#trim() trimmed} if {@code trim} is - * {@code true}. - */ - String readUntil(CharFilter filter, boolean acceptEnd, String stop, boolean ignoreCase, boolean trim); - - /** - * This method reads all {@link #next() next characters} until the given {@link CharFilter} - * {@link CharFilter#accept(char) accepts} the current character as stop character or the end of data is reached. In - * advance to {@link #readUntil(char, boolean)}, this method will scan the input using the given {@code syntax} which - * e.g. allows to {@link CharScannerSyntax#getEscape() escape} the stop character.
- * After the call of this method, the current index will point to the first {@link CharFilter#accept(char) accepted} - * stop character or to the end of the string if NO such character exists. - * - * @param filter is used to {@link CharFilter#accept(char) decide} where to stop. - * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. - * @param syntax contains the characters specific for the syntax to read. - * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no - * {@code stop} character. - * @see #readUntil(char, boolean, CharScannerSyntax) - */ - String readUntil(CharFilter filter, boolean acceptEnd, CharScannerSyntax syntax); - - /** - * @param stopFilter the {@link CharFilter} that decides which characters to {@link CharFilter#accept(char) accept} as - * stop characters. - * @param min the minimum number of characters expected. - * @param max the (maximum) length of the characters to consume. - * @return the {@link String} with all consumed characters excluding the stop character. If no {@code stop} character - * was found until {@code maxLength} characters have been consumed, this method behaves like {@link #read(int) - * read(maxLength)}. - * @throws IllegalStateException if less than the minimum number of characters have been - * {@link CharFilter#accept(char) rejected}. - * @see #read(int) - * @see #readWhile(CharFilter, int, int) - * @see #peekUntil(CharFilter, int) - */ - default String readUntil(CharFilter stopFilter, int min, int max) { - - return readWhile(stopFilter.negate(), min, max); - } - - /** - * This method reads all {@link #next() next characters} that are {@link CharFilter#accept(char) accepted} by the - * given {@code filter}.
- * After the call of this method, the current index will point to the next character that was NOT - * {@link CharFilter#accept(char) accepted} by the given {@code filter} or to the end if NO such character exists. - * - * @see #skipWhile(CharFilter) - * - * @param filter used to {@link CharFilter#accept(char) decide} which characters should be accepted. - * @return a string with all characters {@link CharFilter#accept(char) accepted} by the given {@code filter}. Will be - * the empty string if no character was accepted. - */ - default String readWhile(CharFilter filter) { - - return readWhile(filter, 0, Integer.MAX_VALUE); - } - - /** - * This method reads all {@link #next() next characters} that are {@link CharFilter#accept(char) accepted} by the - * given {@code filter}.
- * After the call of this method, the current index will point to the next character that was NOT - * {@link CharFilter#accept(char) accepted} by the given {@code filter}. If the next {@code max} characters or the - * characters left until the {@link #hasNext() end} of this scanner are {@link CharFilter#accept(char) accepted}, only - * that amount of characters are skipped. - * - * @see #skipWhile(char) - * - * @param filter used to {@link CharFilter#accept(char) decide} which characters should be accepted. - * @param min the minimum number of characters expected. - * @param max the maximum number of characters that should be read. - * @return a string with all characters {@link CharFilter#accept(char) accepted} by the given {@code filter} limited - * to the length of {@code max} and the {@link #hasNext() end} of this scanner. Will be the empty string if no - * character was accepted. - * @throws IllegalStateException if less than the minimum number of characters have been - * {@link CharFilter#accept(char) accepted}. - */ - String readWhile(CharFilter filter, int min, int max); - - /** - * @return a {@link String} with the data until the end of the current line or the end of the data. Will be - * {@code null} if the end has already been reached and {@link #hasNext()} returns {@code false}. - */ - default String readLine() { - - return readLine(false); - } - - /** - * @param trim - {@code true} if the result should be {@link String#trim() trimmed}, {@code false} otherwise. - * @return a {@link String} with the data until the end of the current line ({@link String#trim() trimmed} if - * {@code trim} is {@code true}) or the end of the data. Will be {@code null} if the end has already been - * reached and {@link #hasNext()} returns {@code false}. - */ - String readLine(boolean trim); - - /** - * Reads a {@link Boolean} value from this scanner if available. 
- * - * @return the consumed {@link Boolean} value or {@code null} if no such value was available and the - * {@link #getPosition() position} remains unchanged. - */ - default Boolean readBoolean() { - - return readBoolean(false, false); - } - - /** - * Reads a {@link Boolean} value from this scanner if available. - * - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise - * (only lower case is accepted). - * @return the consumed {@link Boolean} value or {@code null} if no such value was available and the - * {@link #getPosition() position} remains unchanged. - */ - default Boolean readBoolean(boolean ignoreCase) { - - return readBoolean(ignoreCase, false); - } - - /** - * Reads a {@link Boolean} value from this scanner if available. - * - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise - * (only lower case is accepted). - * @param acceptYesNo - if {@code true} also "yes" is accepted for {@code true} and "no" for {@code false}, - * {@code false} otherwise. - * @return the consumed {@link Boolean} value or {@code null} if no such value was available and the - * {@link #getPosition() position} remains unchanged. - */ - default Boolean readBoolean(boolean ignoreCase, boolean acceptYesNo) { - - if (expect("true", ignoreCase)) { - return Boolean.TRUE; - } else if (expect("false", ignoreCase)) { - return Boolean.FALSE; - } else if (!acceptYesNo) { - return null; - } else if (expect("yes", ignoreCase)) { - return Boolean.TRUE; - } else if (expect("no", ignoreCase)) { - return Boolean.FALSE; - } - return null; - } - - /** - * Generic way to read and parse any kind of {@link Number}. - * - * @param numberParser the {@link CharScannerNumberParser}. 
Can decide if sign, digits, radix, exponent, or even - * specials are - */ - void readNumber(CharScannerNumberParser numberParser); - - /** - * This method reads the double value (decimal number) starting at the current position by reading as many matching - * characters as available and returns its {@link Double#parseDouble(String) parsed} value.
- * - * @return the parsed {@code double} number or {@code null} if the current current position does not point to a - * number. - * @throws NumberFormatException if the number at the current position could not be parsed. - */ - default Double readDouble() throws NumberFormatException { - - return readDouble(CharScannerRadixMode.ONLY_10); - } - - /** - * This method reads the double value (decimal number) starting at the current position by reading as many matching - * characters as available and returns its {@link Double#parseDouble(String) parsed} value.
- * - * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. - * @return the parsed {@code double} number or {@code null} if the current current position does not point to a - * number. - * @throws NumberFormatException if the number at the current position could not be parsed. - */ - Double readDouble(CharScannerRadixHandler radixMode) throws NumberFormatException; - - /** - * This method reads a {@link Float} value from the current position {@link #next() consuming} as many matching - * characters as available. - * - * @return the parsed {@link Float} value or {@code null} if the current current position does not point to a - * {@link Float} number. - * @throws NumberFormatException if the number at the current position could not be parsed. - */ - default Float readFloat() throws NumberFormatException { - - return readFloat(CharScannerRadixMode.ONLY_10); - } - - /** - * This method reads a {@link Float} value from the current position {@link #next() consuming} as many matching - * characters as available. - * - * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. - * @return the parsed {@link Float} value or {@code null} if the current current position does not point to a - * {@link Float} number. - * @throws NumberFormatException if the number at the current position could not be parsed. - */ - Float readFloat(CharScannerRadixHandler radixMode) throws NumberFormatException; - - /** - * @return the consumed {@link Long} value or {@code null} if no number was present and the {@link #getPosition() - * position} remains unchanged. - * @throws NumberFormatException if the current current position points to a number that is not a {@link Long} value. - */ - default Long readLong() throws NumberFormatException { - - return readLong(CharScannerRadixMode.ONLY_10); - } - - /** - * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. 
- * @return the consumed {@link Long} value or {@code null} if no such value was present and the {@link #getPosition() - * position} remains unchanged. - * @throws NumberFormatException if the current current position points to a number that is not a {@link Long} value. - */ - Long readLong(CharScannerRadixHandler radixMode); - - /** - * @return the consumed {@link Integer} value or {@code null} if no such value was present and the - * {@link #getPosition() position} remains unchanged. - * @throws NumberFormatException if the current current position does not point to a {@link Integer} value. - */ - default Integer readInteger() throws NumberFormatException { - - return readInteger(CharScannerRadixMode.ONLY_10); - } - - /** - * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. - * @return the consumed {@link Integer} value or {@code null} if no such value was present and the - * {@link #getPosition() position} remains unchanged. - * @throws NumberFormatException if the current current position does not point to a {@link Long} value. - */ - Integer readInteger(CharScannerRadixHandler radixMode) throws NumberFormatException; - - /** - * Reads a Java {@link Number} literal (e.g. "1L" or "1.3F"). - * - * @return the consumed {@link Number} or {@code null} if no number literal was found and the {@link #getPosition() - * position} remains unchainged. - * @throws NumberFormatException if a number literal was found that has an illegal format. - */ - Number readJavaNumberLiteral(); - - /** - * This method reads the {@link #next() next character} if it is a digit. Else the state remains unchanged. - * - * @return the numeric value of the next Latin digit (e.g. {@code 0} if {@code '0'}) or {@code -1} if the - * {@link #next() next character} is no Latin digit. 
- */ - default int readDigit() { - - return readDigit(10); - } - - /** - * This method reads the {@link #next() next character} if it is a digit within the given {@code radix}. Else the - * state remains unchanged. - * - * @param radix the radix that defines the range of the digits. See {@link Integer#parseInt(String, int)}. E.g. - * {@code 10} to read any Latin digit (see {@link #readDigit()}), {@code 8} to read octal digit, {@code 16} to - * read hex decimal digits. - * @return the numeric value of the next digit within the given {@code radix} or {@code -1} if the {@link #next() next - * character} is no such digit. - */ - int readDigit(int radix); - - /** - * This method reads the long starting at the current position by reading as many Latin digits as available but at - * maximum the given {@code maxDigits} and returns its {@link Long#parseLong(String) parsed} value.
- * ATTENTION:
- * This method does NOT treat signs ({@code +} or {@code -}) to do so, scan them yourself before and negate the result - * as needed. - * - * @param maxDigits is the maximum number of digits that will be read. The value has to be positive (greater than - * zero). Should not be greater than {@code 19} as this will exceed the range of {@code long}. - * @return the parsed number. - * @throws NumberFormatException if the number at the current position could not be parsed. - */ - long readUnsignedLong(int maxDigits) throws NumberFormatException; - - /** - * Reads and parses a Java {@link String} literal value according to JLS 3.10.6.
- * As a complex example for the input "Hi \"\176\477\579\u2022\uuuuu2211\"\n" this scanner would return the - * {@link String} output {@code Hi "~'7/9•∑"} followed by a newline character. - * - * @return the parsed Java {@link String} literal value or {@code null} if not pointing to a {@link String} literal. - */ - default String readJavaStringLiteral() { - - return readJavaStringLiteral(TextFormatMessageType.ERROR); - } - - /** - * Reads and parses a Java {@link String} literal value according to JLS 3.10.6.
- * As a complex example for the input "Hi \"\176\477\579\u2022\uuuuu2211\"\n" this scanner would return the - * {@link String} output {@code Hi "~'7/9•∑"} followed by a newline character. - * - * @param severity the {@link TextFormatMessageType} to use to report invalid escape sequences or missing terminating - * quotation. - * @return the parsed Java {@link String} literal value or {@code null} if not pointing to a {@link String} literal. - */ - String readJavaStringLiteral(TextFormatMessageType severity); - - /** - * Reads and parses a Java {@link Character} literal value according to JLS 3.10.6.
- * Examples are given in the following table: - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
literalresultcomment
{@code 'a'}aregular char
{@code '\''}'escaped char
{@code '\176'}~escaped octal representation
{@code '\u2022'}escaped unicode representation
- * - * @return the parsed Java {@link String} literal value or {@code null} if not pointing to a {@link String} literal. - */ - default Character readJavaCharLiteral() { - - return readJavaCharLiteral(TextFormatMessageType.ERROR); - } - - /** - * Reads and parses a Java {@link Character} literal value according to JLS 3.10.6.
- * Examples are given in the following table: - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
literalresultcomment
{@code 'a'}aregular char
{@code '\''}'escaped char
{@code '\176'}~escaped octal representation
{@code '\u2022'}escaped unicode representation
- * - * @param severity the {@link TextFormatMessageType} to use to report invalid escape sequences or missing terminating - * quotation. - * @return the parsed Java {@link Character} literal value or {@code null} if not pointing to a {@link Character} - * literal. - */ - Character readJavaCharLiteral(TextFormatMessageType severity); - - /** - * This method determines if the given {@code expected} {@link String} is completely present at the current position. - * It will only {@link #next() consume} characters and change the state if the {@code expected} {@link String} was - * found (entirely).
- * Attention:
- * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the - * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer - * capacity given at construction time). Otherwise the method may fail. - * - * @param expected is the expected string. - * @return {@code true} if the {@code expected} string was successfully consumed from this scanner, {@code false} - * otherwise. - * @see #expectUnsafe(String) - */ - default boolean expect(String expected) { - - return expect(expected, false, false, 0); - } - - /** - * This method determines if the given {@code expected} {@link String} is completely present at the current position. - * It will only {@link #next() consume} characters and change the state if the {@code expected} {@link String} was - * found (entirely).
- * Attention:
- * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the - * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer - * capacity given at construction time). Otherwise the method may fail. - * - * @param expected the expected {@link String} to search for. - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. - * @return {@code true} if the {@code expected} string was successfully found and {@link #next() consume} from this - * scanner, {@code false} otherwise. - * @see #expectUnsafe(String, boolean) - */ - default boolean expect(String expected, boolean ignoreCase) { - - return expect(expected, ignoreCase, false, 0); - } - - /** - * This method determines if the given {@code expected} {@link String} is completely present at the current position. - * It will only {@link #next() consume} characters and change the state if {@code lookahead} is {@code false} and the - * {@code expected} {@link String} was found (entirely).
- * Attention:
- * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the - * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer - * capacity given at construction time). Otherwise the method may fail. - * - * @param expected the expected {@link String} to search for. - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. - * @param lookahead - if {@code true} the state of the scanner remains unchanged even if the expected {@link String} - * has been found, {@code false} otherwise (expected {@link String} is consumed on match). - * @return {@code true} if the {@code expected} string was successfully found, {@code false} otherwise. - */ - default boolean expect(String expected, boolean ignoreCase, boolean lookahead) { - - return expect(expected, ignoreCase, lookahead, 0); - } - - /** - * This method determines if the given {@code expected} {@link String} is completely present at the current position. - * It will only {@link #next() consume} characters and change the state if {@code lookahead} is {@code false} and the - * {@code expected} {@link String} was found (entirely).
- * Attention:
- * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the - * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer - * capacity given at construction time). Otherwise the method may fail. - * - * @param expected the expected {@link String} to search for. - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. - * @param lookahead - if {@code true} the state of the scanner remains unchanged even if the expected {@link String} - * has been found, {@code false} otherwise (expected {@link String} is consumed on match). - * @param offset the number of characters that have already been {@link #peek(int) peeked} and after which the given - * {@link String} is expected. Will typically be {@code 0}. If {@code lookahead} is {@code false} and the - * expected {@link String} was found these characters will be {@link #skip(int) skipped} together with the - * expected {@link String}. - * @return {@code true} if the {@code expected} string was successfully found, {@code false} otherwise. - */ - boolean expect(String expected, boolean ignoreCase, boolean lookahead, int offset); - - /** - * This method determines if the given {@code expected} {@link String} is completely present at the current position. - * It will only {@link #next() consume} characters and change the state if {@code lookahead} is {@code false} and the - * {@code expected} {@link String} was found (entirely).
- * Attention:
- * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the - * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer - * capacity given at construction time). Otherwise the method may fail. - * - * @param expected the expected {@link String} to search for. - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. - * @param lookahead - if {@code true} the state of the scanner remains unchanged even if the expected {@link String} - * has been found, {@code false} otherwise (expected {@link String} is consumed on match). - * @param offset the number of characters that have already been {@link #peek(int) peeked} and after which the given - * {@link String} is expected. Will typically be {@code 0}. If {@code lookahead} is {@code false} and the - * expected {@link String} was found these characters will be {@link #skip(int) skipped} together with the - * expected {@link String}. - * @param warning {@code true} to {@link #addWarning(String) add a warning} in case the expected {@link String} was - * not found, {@code false} otherwise. - * @return {@code true} if the {@code expected} string was successfully found, {@code false} otherwise. - */ - default boolean expect(String expected, boolean ignoreCase, boolean lookahead, int offset, boolean warning) { - - boolean found = expect(expected, ignoreCase, lookahead, offset); - if (!found && warning) { - addWarning("Expected '" + expected + "'"); - } - return found; - } - - /** - * This method checks if the {@link #next() next character} is equal to the given {@code expected} character.
- * If the character matched with the {@code expected} character, the parser points to the next character. Otherwise - * its position will remain unchanged. - * - * @param expected is the expected character. - * @return {@code true} if the current character is the same as {@code expected}, {@code false} otherwise. - */ - default boolean expectOne(char expected) { - - return expectOne(expected, false); - } - - /** - * This method checks if the {@link #next() next character} is equal to the given {@code expected} character.
- * If the character matched with the {@code expected} character, the parser points to the next character. Otherwise - * its position will remain unchanged. - * - * @param expected the character to expect as {@link #next() next} in this stream. - * @param warning {@code true} to {@link #addWarning(String) add a warning} in case the expected character was not - * present, {@code false} otherwise. - * @return {@code true} if the expected character was found and consumer, {@code false} otherwise (and this stream - * remains unchanged). - */ - boolean expectOne(char expected, boolean warning); - - /** - * This method checks that the {@link #next() next character} is {@link CharFilter#accept(char) accepted} by the given - * {@link CharFilter}.
- * If the current character was as expected, the parser points to the next character. Otherwise its position will - * remain unchanged. - * - * @param expected is the {@link CharFilter} {@link CharFilter#accept(char) accepting} the expected chars. - * @return {@code true} if the current character is {@link CharFilter#accept(char) accepted}, {@code false} otherwise. - */ - default boolean expectOne(CharFilter expected) { - - if (!hasNext()) { - return false; - } - if (expected.accept(peek())) { - next(); - return true; - } - return false; - } - - /** - * This method skips all {@link #next() next characters} as long as they equal to the according character of the - * {@code expected} {@link String}.
- * If a character differs this method stops and the parser points to the first character that differs from - * {@code expected}. Except for the latter circumstance, this method behaves similar to the following code: - * - *
-   * {@link #read(int) read}(expected.length).equals(expected)
-   * 
- * - * ATTENTION:
- * In most cases you want to prefer {@link #expect(String)} instead of using this method. Only in specific cases and - * for highly optimized performance it may make sense to use it. In such case be careful and consider to combine with - * {@link #getPosition()} to be able to determine whether characters have been consumed if {@code false} was returned - * (e.g. otherwise when doing {@link #expectUnsafe(String) expectUnsafe}("false") and else doing - * {@link #expectUnsafe(String) expectUnsafe}("true") to parse a {@code boolean} literal your code could accept - * "falstrue" as "true"). - * - * @param expected is the expected string. - * @return {@code true} if the {@code expected} string was successfully consumed from this scanner, {@code false} - * otherwise. - * @see #expect(String) - */ - default boolean expectUnsafe(String expected) { - - return expectUnsafe(expected, false); - } - - /** - * This method skips all {@link #next() next characters} as long as they equal to the according character of the - * {@code expected} string.
- * If a character differs this method stops and the parser points to the first character that differs from - * {@code expected}. Except for the latter circumstance, this method behaves similar to the following code: - * - *
-   * {@link #read(int) read}(expected.length).equals[IgnoreCase](expected)
-   * 
- * - * ATTENTION:
- * In most cases you want to prefer {@link #expect(String, boolean)} instead of using this method. See - * {@link #expectUnsafe(String)} for details. - * - * @param expected is the expected string. - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared. - * @return {@code true} if the {@code expected} string was successfully consumed from this scanner, {@code false} - * otherwise. - * @see #expect(String, boolean) - */ - boolean expectUnsafe(String expected, boolean ignoreCase); - - /** - * This method verifies that the {@link #next() next character} is equal to the given {@code expected} character.
- * If the current character was as expected, the parser points to the next character. Otherwise an exception is thrown - * indicating the problem. - * - * @param expected is the expected character. - * @throws IllegalStateException if the required character was not found. - */ - default void requireOne(char expected) throws IllegalStateException { - - if (!hasNext()) { - throw new IllegalStateException("Expecting '" + expected + "' but found end-of-stream."); - } - char next = peek(); - if (next != expected) { - throw new IllegalStateException("Expecting '" + expected + "' but found: " + next); - } - next(); - } - - /** - * This method verifies that the {@code expected} string gets consumed from this scanner with respect to - * {@code ignoreCase}. Otherwise an exception is thrown indicating the problem.
- * This method behaves functionally equivalent to the following code: - * - *
-   * if (!scanner.{@link #expectUnsafe(String, boolean) expectUnsafe}(expected, ignoreCase)) {
-   *   throw new {@link IllegalStateException}(...);
-   * }
-   * 
- * - * @param expected is the expected string. - * @param ignoreCase - if {@code true} the case of the characters is ignored during comparison. - */ - void require(String expected, boolean ignoreCase); - - /** - * @param filter the {@link CharFilter} {@link CharFilter#accept(char) accepting} the expected characters to - * {@link #skipWhile(CharFilter, int) skip}. - * @return the actual number of characters that have been skipped. - * @throws IllegalStateException if less than {@code 1} or more than {@code 1000} {@link CharFilter#accept(char) - * accepted} characters have been consumed. - */ - default int requireOne(CharFilter filter) { - - return require(filter, 1, -1); - } - - /** - * @param filter the {@link CharFilter} {@link CharFilter#accept(char) accepting} the expected characters to - * {@link #skipWhile(CharFilter, int) skip}. - * @return the actual number of characters that have been skipped. - * @throws IllegalStateException if less than {@code 1} or more than {@code 1000} {@link CharFilter#accept(char) - * accepted} characters have been consumed. - */ - default int requireOneOrMore(CharFilter filter) { - - return require(filter, 1); - } - - /** - * @param filter the {@link CharFilter} {@link CharFilter#accept(char) accepting} the expected characters to - * {@link #skipWhile(CharFilter, int) skip}. - * @param min the minimum required number of skipped characters. - * @return the actual number of characters that have been skipped. - * @throws IllegalStateException if less than {@code min} or more than {@code 1000} {@link CharFilter#accept(char) - * accepted} characters have been consumed. - */ - default int require(CharFilter filter, int min) { - - return require(filter, min, 1000); - } - - /** - * @param filter the {@link CharFilter} {@link CharFilter#accept(char) accepting} the expected characters to - * {@link #skipWhile(CharFilter, int) skip}. - * @param min the minimum required number of skipped characters. 
- * @param max the maximum number of skipped characters. - * @return the actual number of characters that have been skipped. - * @throws IllegalStateException if less than {@code min} or more than {@code max} {@link CharFilter#accept(char) - * accepted} characters have been consumed. - */ - default int require(CharFilter filter, int min, int max) { - - if ((min < 0) || ((min > max) && (max != -1))) { - throw new IllegalArgumentException("Invalid range: " + min + "-" + max); - } - int num = max; - if (max == -1) { - num = min; - } - int count = skipWhile(filter, num); - if (count < min) { - invalidCharCount("at least " + min, count, filter); - } - if (count == max) { - char c = peek(); - if (!filter.accept(c)) { - invalidCharCount("up to " + max, count, filter); - } - } - return count; - } - - private IllegalStateException invalidCharCount(String bound, int count, CharFilter filter) { - - String description = filter.getDescription(); - String chars = " character(s)"; - if (!CharFilter.NO_DESCRIPTION.equals(description)) { - chars = " character(s) matching " + description; - } - throw new IllegalStateException("Require " + bound + chars + " but found only " + count); - } - - /** - * This method skips all {@link #next() next characters} until the given {@code stop} character or the end is reached. - * If the {@code stop} character was reached, this scanner will point to the next character after {@code stop} when - * this method returns. - * - * @param stop is the character to read until. - * @return {@code true} if the first occurrence of the given {@code stop} character has been passed, {@code false} if - * there is no such character. - */ - boolean skipUntil(char stop); - - /** - * This method reads all {@link #next() next characters} until the given {@code stop} character or the end of the - * string to parse is reached. 
In advance to {@link #skipUntil(char)}, this method will read over the {@code stop} - * character if it is escaped with the given {@code escape} character. - * - * @param stop is the character to read until. - * @param escape is the character used to escape the stop character (e.g. '\'). - * @return {@code true} if the first occurrence of the given {@code stop} character has been passed, {@code false} if - * there is no such character. - */ - boolean skipUntil(char stop, char escape); - - /** - * This method skips the number of {@link #next() next characters} given by {@code count}. - * - * @param count is the number of characters to skip. You may use {@link Integer#MAX_VALUE} to read until the end of - * data if the data-size is suitable. - * @return a to total number of characters that have been skipped. Typically equal to {@code count}. Will be less in - * case the end of data was reached. - */ - int skip(int count); - - /** - * @return {@code 0} if the {@link #next() next characeter} is not a newline and the stream remains unchanged, - * {@code 1} if the {@link #next() next characeter} was '\n' and has been {@link #skip(int) skipped}, or - * {@code 2} if the{@link #next() next characeters} have been '\r' and '\n' and have been {@link #skip(int) - * skipped}. - */ - int skipNewLine(); - - /** - * This method reads all {@link #next() next characters} until the given {@code substring} has been detected.
- * After the call of this method, the current index will point to the next character after the first occurrence of - * {@code substring} or to the end of data if the given {@code substring} was NOT found.
- * - * @param substring is the substring to search and skip over starting at the current index. - * @return {@code true} if the given {@code substring} occurred and has been passed and {@code false} if the end of - * the string has been reached without any occurrence of the given {@code substring}. - */ - default boolean skipOver(String substring) { - - return skipOver(substring, false); - } - - /** - * This method reads all {@link #next() next characters} until the given {@code substring} has been detected.
- * After the call of this method, the current index will point to the next character after the first occurrence of - * {@code substring} or to the end of data if the given {@code substring} was NOT found.
- * - * @param substring is the substring to search and skip over starting at the current index. - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from - * {@code substring}. - * @return {@code true} if the given {@code substring} occurred and has been passed and {@code false} if the end of - * the string has been reached without any occurrence of the given {@code substring}. - */ - default boolean skipOver(String substring, boolean ignoreCase) { - - return skipOver(substring, ignoreCase, null); - } - - /** - * This method consumes all {@link #next() next characters} until the given {@code substring} has been detected, a - * character was {@link CharFilter#accept(char) accepted} by the given {@link CharFilter} or the end of data was - * reached.
- * After the call of this method this scanner will point to the next character after the first occurrence of - * {@code substring}, to the stop character or to end of data.
- * - * @param substring is the substring to search and skip over starting at the current index. - * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from - * {@code substring}. - * @param stopFilter is the filter used to {@link CharFilter#accept(char) detect} stop characters. If such character - * was detected, the skip is stopped and the parser points to the character after the stop character. The - * {@code substring} should NOT contain a {@link CharFilter#accept(char) stop character}. - * @return {@code true} if the given {@code substring} occurred and has been passed and {@code false} if a stop - * character has been detected or the end of the string has been reached without any occurrence of the given - * {@code substring} or stop character. - */ - boolean skipOver(String substring, boolean ignoreCase, CharFilter stopFilter); - - /** - * This method reads all {@link #next() next characters} that are identical to the character given by {@code c}.
- * E.g. use {@link #skipWhile(char) readWhile(' ')} to skip all blanks from the current index. After the call of this - * method, the current index will point to the next character that is different to the given character {@code c} or to - * the end if NO such character exists. - * - * @param c is the character to read over. - * @return the number of characters that have been skipped. - */ - int skipWhile(char c); - - /** - * This method reads all {@link #next() next characters} that are {@link CharFilter#accept(char) accepted} by the - * given {@code filter}.
- * After the call of this method, the current index will point to the next character that was NOT - * {@link CharFilter#accept(char) accepted} by the given {@code filter} or to the end if NO such character exists. - * - * @see #skipWhile(char) - * - * @param filter is used to {@link CharFilter#accept(char) decide} which characters should be accepted. - * @return the number of characters {@link CharFilter#accept(char) accepted} by the given {@code filter} that have - * been skipped. - */ - default int skipWhile(CharFilter filter) { - - return skipWhile(filter, Integer.MAX_VALUE); - } - - /** - * This method reads all {@link #next() next characters} that are {@link CharFilter#accept(char) accepted} by the - * given {@code filter}.
- * After the call of this method, the current index will point to the next character that was NOT - * {@link CharFilter#accept(char) accepted} by the given {@code filter}. If the next {@code max} characters or the - * characters left until the {@link #hasNext() end} of this scanner are {@link CharFilter#accept(char) accepted}, only - * that amount of characters are skipped. - * - * @see #skipWhile(char) - * - * @param filter is used to {@link CharFilter#accept(char) decide} which characters should be accepted. - * @param max is the maximum number of characters that may be skipped. - * @return the number of skipped characters. - */ - int skipWhile(CharFilter filter, int max); - - /** - * Behaves like the following code: - * - *
-   * {@link #skipWhile(CharFilter) skipWhile}(filter);
-   * return {@link #peek()};
-   * 
- * - * @param filter is used to {@link CharFilter#accept(char) decide} which characters should be accepted. - * @return the first character that was not {@link CharFilter#accept(char) accepted} by the given {@link CharFilter}. - * Only the {@link CharFilter#accept(char) accepted} characters have been consumed, this scanner still points - * to the returned character. - */ - default char skipWhileAndPeek(CharFilter filter) { - - return skipWhileAndPeek(filter, Integer.MAX_VALUE); - } - - /** - * Behaves like the following code: - * - *
-   * {@link #skipWhile(CharFilter, int) skipWhile}(filter, max);
-   * return {@link #peek()};
-   * 
- * - * @param filter is used to {@link CharFilter#accept(char) decide} which characters should be accepted. - * @param max is the maximum number of characters that may be skipped. - * @return the first character that was not {@link CharFilter#accept(char) accepted} by the given {@link CharFilter}. - * Only the {@link CharFilter#accept(char) accepted} characters have been consumed, this scanner still points - * to the returned character. - */ - default char skipWhileAndPeek(CharFilter filter, int max) { - - skipWhile(filter, max); - return peek(); - } - - /** - * @return the {@link String} with the characters that have already been parsed but are still available in the - * underlying buffer. May be used for debugging or error messages. - */ - String getBufferParsed(); - - /** - * @return the {@link String} with the characters that have not yet been parsed but are available in the underlying - * buffer. May be used for debugging or error messages. - */ - String getBufferToParse(); - - @Override - // no checked exception! - void close(); - -} +/* Copyright (c) The m-m-m Team, Licensed under the Apache License, Version 2.0 + * http://www.apache.org/licenses/LICENSE-2.0 */ +package io.github.mmm.scanner; + +import io.github.mmm.base.filter.CharFilter; +import io.github.mmm.base.text.TextFormatMessageType; +import io.github.mmm.base.text.TextFormatProcessor; +import io.github.mmm.scanner.number.CharScannerNumberParser; +import io.github.mmm.scanner.number.CharScannerRadixHandler; +import io.github.mmm.scanner.number.CharScannerRadixMode; + +/** + * This is the interface for a scanner that can be used to parse a stream or sequence of character + * {@link String#codePointAt(int) code-points}. It allows easy but efficient parsing of arbitrary textual data.
+ * ATTENTION:
+ * Implementations are state-ful and NOT thread-safe (intended by design). + */ +public interface CharStreamScanner extends TextFormatProcessor, AutoCloseable { + + /** + * The NULL character {@code '\0'} used to indicate the end of stream (EOS).
+ * ATTENTION: Do not confuse and mix {@code '\0'} with {@code '0'}. + * + * @see #next() + * @see #peek() + */ + int EOS = '\0'; + + /** + * This method determines if there is at least one more character available. + * + * @return {@code true} if there is at least one character available, {@code false} if the end of data has been + * reached. + */ + boolean hasNext(); + + /** + * This method reads the current character from the stream and increments the index stepping to the next character. + * You should {@link #hasNext() check} if a character is available before calling this method. Otherwise if your + * stream may contain the NUL character ('\0') you can not distinguish if the end of the stream was reached or you + * actually read the NUL character. + * + * @return the next {@link String#codePointAt(int) code-point} or {@link #EOS} if none is {@link #hasNext() + * available}. + */ + int next(); + + /** + * This method reads the current character without {@link #next() consuming} characters and will therefore not change + * the state of this scanner. + * + * @return the current character or {@link #EOS} if none is {@link #hasNext() available}. + */ + int peek(); + + /** + * Like {@link #peek()} but with further lookahead.
+ * Attention:
+ * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the given + * {@code lookaheadOffset} shall not exceed the available lookahead size (buffer capacity given at construction time). + * Otherwise the method may fail. + * + * @param lookaheadOffset the lookahead offset. If {@code 0} this method will behave like {@link #peek()}. In case of + * {@code 1} it will return the character after the next one and so forth. + * @return the {@link #peek() peeked} {@link String#codePointAt(int) code-point} at the given {@code lookaheadOffset} + * or {@link #EOS} if no such character exists. + */ + int peek(int lookaheadOffset); + + /** + * This method peeks the number of {@link #peek() next characters} given by {@code count} and returns them as + * {@link String}. If there are less characters {@link #hasNext() available} the returned {@link String} will be + * shorter than {@code count} and only contain the available characters. Unlike {@link #read(int)} this method does + * not {@link #next() consume} the characters and will therefore not change the state of this scanner.
+ * Attention:
+ * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the given + * {@code count} shall not exceed the available lookahead size (buffer capacity given at construction time). Otherwise + * the method may fail. + * + * @param count is the number of characters to peek. You may use {@link Integer#MAX_VALUE} to peek until the end of + * text (EOT) if the data-size is suitable. + * @return a string with the given number of characters or all available characters if less than {@code count}. Will + * be the empty string if no character is {@link #hasNext() available} at all. + */ + String peekString(int count); + + /** + * @param filter the {@link CharFilter} {@link CharFilter#accept(int) accepting} only the characters to peek. + * @param maxLen the maximum number of characters to peek (to get as lookahead without modifying this stream). + * @return a {@link String} with the {@link #peek() peeked} characters of the given {@code maxLen} or less if a + * character was hit that is not {@link CharFilter#accept(int) accepted} by the given {@code filter} + * or the end-of-text has been reached before. The state of this stream remains unchanged. + * @see #readWhile(CharFilter) + * @see #skip(int) + */ + String peekWhile(CharFilter filter, int maxLen); + + /** + * @param stopFilter the {@link CharFilter} that decides which characters to {@link CharFilter#accept(int) accept} as + * stop characters. + * @param maxLen the maximum number of characters to peek (get as lookahead without modifying this stream). + * @return a {@link String} with the {@link #peek() peeked} characters of the given {@code maxLen} or less if a stop + * character was hit or the end-of-text has been reached before. The state of this stream remains unchanged. 
+ * @see #readWhile(CharFilter) + * @see #skip(int) + */ + default String peekUntil(CharFilter stopFilter, int maxLen) { + + return peekWhile(stopFilter.negate(), maxLen); + } + + /** + * This method reads the number of {@link #next() next characters} given by {@code count} and returns them as string. + * If there are fewer characters {@link #hasNext() available} the returned string will be shorter than {@code count} + * and only contain the available characters. + * + * @param count is the number of characters to read. You may use {@link Integer#MAX_VALUE} to read until the end of + * data if the data-size is suitable. + * @return a string with the given number of characters or all available characters if less than {@code count}. Will + * be the empty string if no character is {@link #hasNext() available} at all. + */ + String read(int count); + + /** + * This method reads the number of {@link #next() next characters} given by {@code count} and + * {@link StringBuilder#append(char) appends} them to the given {@link StringBuilder}. If there are fewer characters + * {@link #hasNext() available} then only the remaining characters will be appended resulting in fewer characters than + * {@code count}. + * + * @param count is the number of characters to read. You may use {@link Integer#MAX_VALUE} to read until the end of + * data if the data-size is suitable. + * @param builder the {@link StringBuilder} where to {@link StringBuilder#append(char) append} the characters to read. + */ + void read(int count, StringBuilder builder); + + /** + * @return the position in the sequence to scan or in other words the number of characters that have been read. Will + * initially be {@code 0}. Please note that this API is designed for scanning textual content (for parsers). + * Therefore we consider about 2.1 billion characters as a suitable {@link Integer#MAX_VALUE limit}. 
+ */ + int getPosition(); + + /** + * This method reads all {@link #next() next characters} until the given {@code stop} character or the end is reached. + *
+ * After the call of this method, the current index will point to the next character after the (first) {@code stop} + * character or to the end if NO such character exists. + * + * @param stop is the character to read until. + * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. + * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no + * {@code stop} character and {@code acceptEnd} is {@code false}. + */ + String readUntil(int stop, boolean acceptEnd); + + /** + * This method reads all {@link #next() next characters} until the given (un-escaped) {@code stop} character or the + * end is reached.
+ * In advance to {@link #readUntil(int, boolean)}, this method allows that the {@code stop} character may be used in + * the input-string by adding the given {@code escape} character. After the call of this method, the current index + * will point to the next character after the (first) {@code stop} character or to the end if NO such character + * exists.
+ * This method is especially useful when quoted strings should be parsed. E.g.: + * + *
+   * {@link CharStreamScanner} scanner = getScanner();
+   * doSomething();
+   * int cp = scanner.{@link #next()};
+   * if ((cp == '"') || (cp == '\'')) {
+   *   int escape = cp; // may also be something like '\\'
+   *   String quote = scanner.{@link #readUntil(int, boolean, int) readUntil}(cp, false, escape);
+   * } else {
+   *   doOtherThings();
+   * }
+   * 
+ * + * @param stop is the {@link String#codePointAt(int) code-point} to read until. + * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. + * @param escape is the {@link String#codePointAt(int) code-point} used to escape the {@code stop} character. To add + * an occurrence of the {@code escape} character it has to be duplicated (occur twice). The {@code escape} + * character may also be equal to the {@code stop} character. If other regular characters are escaped the + * {@code escape} character is simply ignored. + * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no + * {@code stop} character and {@code acceptEnd} is {@code false}. + */ + String readUntil(int stop, boolean acceptEnd, int escape); + + /** + * This method reads all {@link #next() next characters} until the given {@code stop} character or the end of the + * string to parse is reached. In advance to {@link #readUntil(int, boolean)}, this method will scan the input using + * the given {@code syntax} which e.g. allows to {@link CharScannerSyntax#getEscape() escape} the stop character.
+ * After the call of this method, the current index will point to the next character after the (first) {@code stop} + * character or to the end of the string if NO such character exists. + * + * @param stop is the {@link String#codePointAt(int) code-point} to read until. + * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. + * @param syntax contains the characters specific for the syntax to read. + * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no + * {@code stop} character. + * @see #readUntil(CharFilter, boolean, CharScannerSyntax) + */ + String readUntil(int stop, boolean acceptEnd, CharScannerSyntax syntax); + + /** + * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(int) + * accepted} by the given {@code filter} or the end is reached.
+ * After the call of this method, the current index will point to the first {@link CharFilter#accept(int) accepted} + * stop character or to the end if NO such character exists. + * + * @param filter is used to {@link CharFilter#accept(int) decide} where to stop. + * @param acceptEnd if {@code true} if end of data should be treated like the {@code stop} character and the rest of + * the text will be returned, {@code false} otherwise (to return {@code null} if the end of data was reached + * and the scanner has been consumed). + * @return the string with all read characters not {@link CharFilter#accept(int) accepted} by the given + * {@link CharFilter} or {@code null} if there was no {@link CharFilter#accept(int) accepted} character and + * {@code acceptEnd} is {@code false}. + */ + String readUntil(CharFilter filter, boolean acceptEnd); + + /** + * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(int) + * accepted} by the given {@code filter}, the given {@code stop} {@link String} or the end is reached.
+ * After the call of this method, the current index will point to the first {@link CharFilter#accept(int) accepted} + * stop character, or to the first character of the given {@code stop} {@link String} or to the end if NO such + * character exists. + * + * @param filter is used to {@link CharFilter#accept(int) decide} where to stop. + * @param acceptEnd if {@code true} if the end of data should be treated like the {@code stop} character and the rest + * of the text will be returned, {@code false} otherwise (to return {@code null} if end of data was reached and + * the scanner has been consumed). + * @param stop the {@link String} where to stop consuming data. Should be at least two characters long (otherwise + * accept by {@link CharFilter} instead). + * @return the string with all read characters not {@link CharFilter#accept(int) accepted} by the given + * {@link CharFilter} or until the given {@code stop} {@link String} was detected. If end of data was reached + * without a stop signal the entire rest of the data is returned or {@code null} if {@code acceptEnd} is + * {@code false}. + */ + default String readUntil(CharFilter filter, boolean acceptEnd, String stop) { + + return readUntil(filter, acceptEnd, stop, false); + } + + /** + * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(int) + * accepted} by the given {@code filter}, the given {@code stop} {@link String} or the end is reached.
+ * After the call of this method, the current index will point to the first {@link CharFilter#accept(int) accepted} + * stop character, or to the first character of the given {@code stop} {@link String} or to the end if NO such + * character exists. + * + * @param filter is used to {@link CharFilter#accept(int) decide} where to stop. + * @param acceptEnd if {@code true} if the end of data should be treated like the {@code stop} character and the rest + * of the text will be returned, {@code false} otherwise (to return {@code null} if the end of data was reached + * and the scanner has been consumed). + * @param stop the {@link String} where to stop consuming data. Should be at least two characters long (otherwise + * accept by {@link CharFilter} instead). + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from + * {@code stop} {@link String}. + * @return the string with all read characters not {@link CharFilter#accept(int) accepted} by the given + * {@link CharFilter} or until the given {@code stop} {@link String} was detected. If the end of data was + * reached without a stop signal the entire rest of the data is returned or {@code null} if {@code acceptEnd} + * is {@code false}. + */ + default String readUntil(CharFilter filter, boolean acceptEnd, String stop, boolean ignoreCase) { + + return readUntil(filter, acceptEnd, stop, ignoreCase, false); + } + + /** + * This method reads all {@link #next() next characters} until the first character {@link CharFilter#accept(int) + * accepted} by the given {@code filter}, the given {@code stop} {@link String} or the end is reached.
+ * After the call of this method, the current index will point to the first {@link CharFilter#accept(int) accepted} + * stop character, or to the first character of the given {@code stop} {@link String} or to the end if NO such + * character exists. + * + * @param filter is used to {@link CharFilter#accept(int) decide} where to stop. + * @param acceptEnd {@code true} if the end of data should be treated like the {@code stop} character and the rest + * of the text will be returned, {@code false} otherwise (to return {@code null} if the end of data was reached + * and the scanner has been consumed). + * @param stop the {@link String} where to stop consuming data. Should be at least two characters long (otherwise + * accept by {@link CharFilter} instead). + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from + * {@code stop} {@link String}. + * @param trim - {@code true} if the result should be {@link String#trim() trimmed}, {@code false} otherwise. + * @return the string with all read characters not {@link CharFilter#accept(int) accepted} by the given + * {@link CharFilter} or until the given {@code stop} {@link String} was detected. If the end of data was + * reached without hitting {@code stop} the entire rest of the data is returned or {@code null} if + * {@code acceptEnd} is {@code false}. The result will be {@link String#trim() trimmed} if {@code trim} is + * {@code true}. + */ + String readUntil(CharFilter filter, boolean acceptEnd, String stop, boolean ignoreCase, boolean trim); + + /** + * This method reads all {@link #next() next characters} until the given {@link CharFilter} + * {@link CharFilter#accept(int) accepts} the current character as stop character or the end of data is reached. In + * advance to {@link #readUntil(int, boolean)}, this method will scan the input using the given {@code syntax} which + * e.g. allows to {@link CharScannerSyntax#getEscape() escape} the stop character.
+ * After the call of this method, the current index will point to the first {@link CharFilter#accept(int) accepted} + * stop character or to the end of the string if NO such character exists. + * + * @param filter is used to {@link CharFilter#accept(int) decide} where to stop. + * @param acceptEnd if {@code true} the end of data will be treated as {@code stop}, too. + * @param syntax contains the characters specific for the syntax to read. + * @return the string with all read characters excluding the {@code stop} character or {@code null} if there was no + * {@code stop} character. + * @see #readUntil(int, boolean, CharScannerSyntax) + */ + String readUntil(CharFilter filter, boolean acceptEnd, CharScannerSyntax syntax); + + /** + * @param stopFilter the {@link CharFilter} that decides which characters to {@link CharFilter#accept(int) accept} as + * stop characters. + * @param min the minimum number of characters expected. + * @param max the (maximum) length of the characters to consume. + * @return the {@link String} with all consumed characters excluding the stop character. If no {@code stop} character + * was found until {@code maxLength} characters have been consumed, this method behaves like {@link #read(int) + * read(maxLength)}. + * @throws IllegalStateException if less than the minimum number of characters have been {@link CharFilter#accept(int) + * rejected}. + * @see #read(int) + * @see #readWhile(CharFilter, int, int) + * @see #peekUntil(CharFilter, int) + */ + default String readUntil(CharFilter stopFilter, int min, int max) { + + return readWhile(stopFilter.negate(), min, max); + } + + /** + * This method reads all {@link #next() next characters} that are {@link CharFilter#accept(int) accepted} by the given + * {@code filter}.
+ * After the call of this method, the current index will point to the next character that was NOT + * {@link CharFilter#accept(int) accepted} by the given {@code filter} or to the end if NO such character exists. + * + * @see #skipWhile(CharFilter) + * + * @param filter used to {@link CharFilter#accept(int) decide} which characters should be accepted. + * @return a string with all characters {@link CharFilter#accept(int) accepted} by the given {@code filter}. Will be + * the empty string if no character was accepted. + */ + default String readWhile(CharFilter filter) { + + return readWhile(filter, 0, Integer.MAX_VALUE); + } + + /** + * This method reads all {@link #next() next characters} that are {@link CharFilter#accept(int) accepted} by the given + * {@code filter}.
+ * After the call of this method, the current index will point to the next character that was NOT + * {@link CharFilter#accept(int) accepted} by the given {@code filter}. If the next {@code max} characters or the + * characters left until the {@link #hasNext() end} of this scanner are {@link CharFilter#accept(int) accepted}, only + * that amount of characters are skipped. + * + * @see #skipWhile(int) + * + * @param filter used to {@link CharFilter#accept(int) decide} which characters should be accepted. + * @param min the minimum number of characters expected. + * @param max the maximum number of characters that should be read. + * @return a string with all characters {@link CharFilter#accept(int) accepted} by the given {@code filter} limited to + * the length of {@code max} and the {@link #hasNext() end} of this scanner. Will be the empty string if no + * character was accepted. + * @throws IllegalStateException if less than the minimum number of characters have been {@link CharFilter#accept(int) + * accepted}. + */ + String readWhile(CharFilter filter, int min, int max); + + /** + * @return a {@link String} with the data until the end of the current line or the end of the data. Will be + * {@code null} if the end has already been reached and {@link #hasNext()} returns {@code false}. + */ + default String readLine() { + + return readLine(false); + } + + /** + * @param trim - {@code true} if the result should be {@link String#trim() trimmed}, {@code false} otherwise. + * @return a {@link String} with the data until the end of the current line ({@link String#trim() trimmed} if + * {@code trim} is {@code true}) or the end of the data. Will be {@code null} if the end has already been + * reached and {@link #hasNext()} returns {@code false}. + */ + String readLine(boolean trim); + + /** + * Reads a {@link Boolean} value from this scanner if available. 
+ * + * @return the consumed {@link Boolean} value or {@code null} if no such value was available and the + * {@link #getPosition() position} remains unchanged. + */ + default Boolean readBoolean() { + + return readBoolean(false, false); + } + + /** + * Reads a {@link Boolean} value from this scanner if available. + * + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise + * (only lower case is accepted). + * @return the consumed {@link Boolean} value or {@code null} if no such value was available and the + * {@link #getPosition() position} remains unchanged. + */ + default Boolean readBoolean(boolean ignoreCase) { + + return readBoolean(ignoreCase, false); + } + + /** + * Reads a {@link Boolean} value from this scanner if available. + * + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise + * (only lower case is accepted). + * @param acceptYesNo - if {@code true} also "yes" is accepted for {@code true} and "no" for {@code false}, + * {@code false} otherwise. + * @return the consumed {@link Boolean} value or {@code null} if no such value was available and the + * {@link #getPosition() position} remains unchanged. + */ + default Boolean readBoolean(boolean ignoreCase, boolean acceptYesNo) { + + if (expect("true", ignoreCase)) { + return Boolean.TRUE; + } else if (expect("false", ignoreCase)) { + return Boolean.FALSE; + } else if (!acceptYesNo) { + return null; + } else if (expect("yes", ignoreCase)) { + return Boolean.TRUE; + } else if (expect("no", ignoreCase)) { + return Boolean.FALSE; + } + return null; + } + + /** + * Generic way to read and parse any kind of {@link Number}. + * + * @param numberParser the {@link CharScannerNumberParser}. 
Can decide if sign, digits, radix, exponent, or even + * specials are accepted. + */ + void readNumber(CharScannerNumberParser numberParser); + + /** + * This method reads the double value (decimal number) starting at the current position by reading as many matching + * characters as available and returns its {@link Double#parseDouble(String) parsed} value.
+ * + * @return the parsed {@code double} number or {@code null} if the current position does not point to a + * number. + * @throws NumberFormatException if the number at the current position could not be parsed. + */ + default Double readDouble() throws NumberFormatException { + + return readDouble(CharScannerRadixMode.ONLY_10); + } + + /** + * This method reads the double value (decimal number) starting at the current position by reading as many matching + * characters as available and returns its {@link Double#parseDouble(String) parsed} value.
+ * + * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. + * @return the parsed {@code double} number or {@code null} if the current position does not point to a + * number. + * @throws NumberFormatException if the number at the current position could not be parsed. + */ + Double readDouble(CharScannerRadixHandler radixMode) throws NumberFormatException; + + /** + * This method reads a {@link Float} value from the current position {@link #next() consuming} as many matching + * characters as available. + * + * @return the parsed {@link Float} value or {@code null} if the current position does not point to a + * {@link Float} number. + * @throws NumberFormatException if the number at the current position could not be parsed. + */ + default Float readFloat() throws NumberFormatException { + + return readFloat(CharScannerRadixMode.ONLY_10); + } + + /** + * This method reads a {@link Float} value from the current position {@link #next() consuming} as many matching + * characters as available. + * + * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. + * @return the parsed {@link Float} value or {@code null} if the current position does not point to a + * {@link Float} number. + * @throws NumberFormatException if the number at the current position could not be parsed. + */ + Float readFloat(CharScannerRadixHandler radixMode) throws NumberFormatException; + + /** + * @return the consumed {@link Long} value or {@code null} if no number was present and the {@link #getPosition() + * position} remains unchanged. + * @throws NumberFormatException if the current position points to a number that is not a {@link Long} value. + */ + default Long readLong() throws NumberFormatException { + + return readLong(CharScannerRadixMode.ONLY_10); + } + + /** + * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. 
+ * @return the consumed {@link Long} value or {@code null} if no such value was present and the {@link #getPosition() + * position} remains unchanged. + * @throws NumberFormatException if the current position points to a number that is not a {@link Long} value. + */ + Long readLong(CharScannerRadixHandler radixMode); + + /** + * @return the consumed {@link Integer} value or {@code null} if no such value was present and the + * {@link #getPosition() position} remains unchanged. + * @throws NumberFormatException if the current position does not point to an {@link Integer} value. + */ + default Integer readInteger() throws NumberFormatException { + + return readInteger(CharScannerRadixMode.ONLY_10); + } + + /** + * @param radixMode the {@link CharScannerRadixHandler} - e.g. {@link CharScannerRadixMode#ALL}. + * @return the consumed {@link Integer} value or {@code null} if no such value was present and the + * {@link #getPosition() position} remains unchanged. + * @throws NumberFormatException if the current position does not point to an {@link Integer} value. + */ + Integer readInteger(CharScannerRadixHandler radixMode) throws NumberFormatException; + + /** + * Reads a Java {@link Number} literal (e.g. "1L" or "1.3F"). + * + * @return the consumed {@link Number} or {@code null} if no number literal was found and the {@link #getPosition() + * position} remains unchanged. + * @throws NumberFormatException if a number literal was found that has an illegal format. + */ + Number readJavaNumberLiteral(); + + /** + * This method reads the {@link #next() next character} if it is a digit. Else the state remains unchanged. + * + * @return the numeric value of the next Latin digit (e.g. {@code 0} if {@code '0'}) or {@code -1} if the + * {@link #next() next character} is no Latin digit. 
+ */ + default int readDigit() { + + return readDigit(10); + } + + /** + * This method reads the {@link #next() next character} if it is a digit within the given {@code radix}. Else the + * state remains unchanged. + * + * @param radix the radix that defines the range of the digits. See {@link Integer#parseInt(String, int)}. E.g. + * {@code 10} to read any Latin digit (see {@link #readDigit()}), {@code 8} to read octal digits, {@code 16} to + * read hexadecimal digits. + * @return the numeric value of the next digit within the given {@code radix} or {@code -1} if the {@link #next() next + * character} is no such digit. + */ + int readDigit(int radix); + + /** + * This method reads the long starting at the current position by reading as many Latin digits as available but at + * maximum the given {@code maxDigits} and returns its {@link Long#parseLong(String) parsed} value.
+ * ATTENTION:
+ * This method does NOT treat signs ({@code +} or {@code -}). To handle them, scan them yourself beforehand and negate the result + * as needed. + * + * @param maxDigits is the maximum number of digits that will be read. The value has to be positive (greater than + * zero). Should not be greater than {@code 19} as this will exceed the range of {@code long}. + * @return the parsed number. + * @throws NumberFormatException if the number at the current position could not be parsed. + */ + long readUnsignedLong(int maxDigits) throws NumberFormatException; + + /** + * Reads and parses a Java {@link String} literal value according to JLS 3.10.6.
+ * As a complex example for the input "Hi \"\176\477\579\u2022\uuuuu2211\"\n" this scanner would return the + * {@link String} output {@code Hi "~'7/9•∑"} followed by a newline character. + * + * @return the parsed Java {@link String} literal value or {@code null} if not pointing to a {@link String} literal. + */ + default String readJavaStringLiteral() { + + return readJavaStringLiteral(TextFormatMessageType.ERROR); + } + + /** + * Reads and parses a Java {@link String} literal value according to JLS 3.10.6.
+ * As a complex example for the input "Hi \"\176\477\579\u2022\uuuuu2211\"\n" this scanner would return the + * {@link String} output {@code Hi "~'7/9•∑"} followed by a newline character. + * + * @param severity the {@link TextFormatMessageType} to use to report invalid escape sequences or missing terminating + * quotation. + * @return the parsed Java {@link String} literal value or {@code null} if not pointing to a {@link String} literal. + */ + String readJavaStringLiteral(TextFormatMessageType severity); + + /** + * Reads and parses a Java {@link Character} literal value according to JLS 3.10.6.
+ * Examples are given in the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
literalresultcomment
{@code 'a'}aregular char
{@code '\''}'escaped char
{@code '\176'}~escaped octal representation
{@code '\u2022'}escaped unicode representation
+ * + * @return the parsed Java {@link Character} literal value or {@code null} if not pointing to a {@link Character} literal. + */ + default Character readJavaCharLiteral() { + + return readJavaCharLiteral(TextFormatMessageType.ERROR); + } + + /** + * Reads and parses a Java {@link Character} literal value according to JLS 3.10.6.
+ * Examples are given in the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
literalresultcomment
{@code 'a'}aregular char
{@code '\''}'escaped char
{@code '\176'}~escaped octal representation
{@code '\u2022'}escaped unicode representation
+ * + * @param severity the {@link TextFormatMessageType} to use to report invalid escape sequences or missing terminating + * quotation. + * @return the parsed Java {@link Character} literal value or {@code null} if not pointing to a {@link Character} + * literal. + */ + Character readJavaCharLiteral(TextFormatMessageType severity); + + /** + * This method determines if the given {@code expected} {@link String} is completely present at the current position. + * It will only {@link #next() consume} characters and change the state if the {@code expected} {@link String} was + * found (entirely).
+ * Attention:
+ * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the + * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer + * capacity given at construction time). Otherwise the method may fail. + * + * @param expected is the expected string. + * @return {@code true} if the {@code expected} string was successfully consumed from this scanner, {@code false} + * otherwise. + * @see #expectUnsafe(String) + */ + default boolean expect(String expected) { + + return expect(expected, false, false, 0); + } + + /** + * This method determines if the given {@code expected} {@link String} is completely present at the current position. + * It will only {@link #next() consume} characters and change the state if the {@code expected} {@link String} was + * found (entirely).
+ * Attention:
+ * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the + * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer + * capacity given at construction time). Otherwise the method may fail. + * + * @param expected the expected {@link String} to search for. + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. + * @return {@code true} if the {@code expected} string was successfully found and {@link #next() consumed} from this + * scanner, {@code false} otherwise. + * @see #expectUnsafe(String, boolean) + */ + default boolean expect(String expected, boolean ignoreCase) { + + return expect(expected, ignoreCase, false, 0); + } + + /** + * This method determines if the given {@code expected} {@link String} is completely present at the current position. + * It will only {@link #next() consume} characters and change the state if {@code lookahead} is {@code false} and the + * {@code expected} {@link String} was found (entirely).
+ * Attention:
+ * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the + * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer + * capacity given at construction time). Otherwise the method may fail. + * + * @param expected the expected {@link String} to search for. + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. + * @param lookahead - if {@code true} the state of the scanner remains unchanged even if the expected {@link String} + * has been found, {@code false} otherwise (expected {@link String} is consumed on match). + * @return {@code true} if the {@code expected} string was successfully found, {@code false} otherwise. + */ + default boolean expect(String expected, boolean ignoreCase, boolean lookahead) { + + return expect(expected, ignoreCase, lookahead, 0); + } + + /** + * This method determines if the given {@code expected} {@link String} is completely present at the current position. + * It will only {@link #next() consume} characters and change the state if {@code lookahead} is {@code false} and the + * {@code expected} {@link String} was found (entirely).
+ * Attention:
+ * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the + * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer + * capacity given at construction time). Otherwise the method may fail. + * + * @param expected the expected {@link String} to search for. + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. + * @param lookahead - if {@code true} the state of the scanner remains unchanged even if the expected {@link String} + * has been found, {@code false} otherwise (expected {@link String} is consumed on match). + * @param offset the number of characters that have already been {@link #peek(int) peeked} and after which the given + * {@link String} is expected. Will typically be {@code 0}. If {@code lookahead} is {@code false} and the + * expected {@link String} was found these characters will be {@link #skip(int) skipped} together with the + * expected {@link String}. + * @return {@code true} if the {@code expected} string was successfully found, {@code false} otherwise. + */ + boolean expect(String expected, boolean ignoreCase, boolean lookahead, int offset); + + /** + * This method determines if the given {@code expected} {@link String} is completely present at the current position. + * It will only {@link #next() consume} characters and change the state if {@code lookahead} is {@code false} and the + * {@code expected} {@link String} was found (entirely).
+ * Attention:
+ * This method requires lookahead. For implementations that are backed by an underlying stream (or reader) the + * {@link String#length() length} of the expected {@link String} shall not exceed the available lookahead size (buffer + * capacity given at construction time). Otherwise the method may fail. + * + * @param expected the expected {@link String} to search for. + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared, {@code false} otherwise. + * @param lookahead - if {@code true} the state of the scanner remains unchanged even if the expected {@link String} + * has been found, {@code false} otherwise (expected {@link String} is consumed on match). + * @param offset the number of characters that have already been {@link #peek(int) peeked} and after which the given + * {@link String} is expected. Will typically be {@code 0}. If {@code lookahead} is {@code false} and the + * expected {@link String} was found these characters will be {@link #skip(int) skipped} together with the + * expected {@link String}. + * @param warning {@code true} to {@link #addWarning(String) add a warning} in case the expected {@link String} was + * not found, {@code false} otherwise. + * @return {@code true} if the {@code expected} string was successfully found, {@code false} otherwise. + */ + default boolean expect(String expected, boolean ignoreCase, boolean lookahead, int offset, boolean warning) { + + boolean found = expect(expected, ignoreCase, lookahead, offset); + if (!found && warning) { + addWarning("Expected '" + expected + "'"); + } + return found; + } + + /** + * This method checks if the {@link #next() next character} is equal to the given {@code expected} character.
+ * If the character matched with the {@code expected} character, the parser points to the next character. Otherwise + * its position will remain unchanged. + * + * @param expected is the expected character. + * @return {@code true} if the current character is the same as {@code expected}, {@code false} otherwise. + */ + default boolean expectOne(int expected) { + + return expectOne(expected, false); + } + + /** + * This method checks if the {@link #next() next character} is equal to the given {@code expected} character.
+ * If the character matched with the {@code expected} character, the parser points to the next character. Otherwise + * its position will remain unchanged. + * + * @param expected the {@link String#codePointAt(int) code-point} to expect as {@link #next() next} in this stream. + * @param warning {@code true} to {@link #addWarning(String) add a warning} in case the expected character was not + * present, {@code false} otherwise. + * @return {@code true} if the expected character was found and consumed, {@code false} otherwise (and this stream + * remains unchanged). + */ + boolean expectOne(int expected, boolean warning); + + /** + * This method checks that the {@link #next() next character} is {@link CharFilter#accept(int) accepted} by the given + * {@link CharFilter}.
+ * If the current character was as expected, the parser points to the next character. Otherwise its position will + * remain unchanged. + * + * @param expected is the {@link CharFilter} {@link CharFilter#accept(int) accepting} the expected chars. + * @return {@code true} if the current character is {@link CharFilter#accept(int) accepted}, {@code false} otherwise. + */ + default boolean expectOne(CharFilter expected) { + + if (!hasNext()) { + return false; + } + if (expected.accept(peek())) { + next(); + return true; + } + return false; + } + + /** + * This method skips all {@link #next() next characters} as long as they equal to the according character of the + * {@code expected} {@link String}.
+ * If a character differs this method stops and the parser points to the first character that differs from + * {@code expected}. Except for the latter circumstance, this method behaves similar to the following code: + * + *
+   * {@link #read(int) read}(expected.length).equals(expected)
+   * 
+ * + * ATTENTION:
+ * In most cases you want to prefer {@link #expect(String)} instead of using this method. Only in specific cases and + * for highly optimized performance it may make sense to use it. In such case be careful and consider to combine with + * {@link #getPosition()} to be able to determine whether characters have been consumed if {@code false} was returned + * (e.g. otherwise when doing {@link #expectUnsafe(String) expectUnsafe}("false") and else doing + * {@link #expectUnsafe(String) expectUnsafe}("true") to parse a {@code boolean} literal your code could accept + * "falstrue" as "true"). + * + * @param expected is the expected string. + * @return {@code true} if the {@code expected} string was successfully consumed from this scanner, {@code false} + * otherwise. + * @see #expect(String) + */ + default boolean expectUnsafe(String expected) { + + return expectUnsafe(expected, false); + } + + /** + * This method skips all {@link #next() next characters} as long as they equal to the according character of the + * {@code expected} string.
+ * If a character differs this method stops and the parser points to the first character that differs from + * {@code expected}. Except for the latter circumstance, this method behaves similar to the following code: + * + *
+   * {@link #read(int) read}(expected.length).equals[IgnoreCase](expected)
+   * 
+ * + * ATTENTION:
+ * In most cases you want to prefer {@link #expect(String, boolean)} instead of using this method. See + * {@link #expectUnsafe(String)} for details. + * + * @param expected is the expected string. + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared. + * @return {@code true} if the {@code expected} string was successfully consumed from this scanner, {@code false} + * otherwise. + * @see #expect(String, boolean) + */ + boolean expectUnsafe(String expected, boolean ignoreCase); + + /** + * This method verifies that the {@link #next() next character} is equal to the given {@code expected} character.
+ * If the current character was as expected, the parser points to the next character. Otherwise an exception is thrown + * indicating the problem. + * + * @param expected is the expected {@link String#codePointAt(int) code-point}. + * @throws IllegalStateException if the required character was not found. + */ + default void requireOne(int expected) throws IllegalStateException { + + if (!hasNext()) { + throw new IllegalStateException("Expecting '" + expected + "' but found end-of-stream."); + } + int next = peek(); + if (next != expected) { + throw new IllegalStateException("Expecting '" + expected + "' but found: " + next); + } + next(); + } + + /** + * This method verifies that the {@code expected} string gets consumed from this scanner with respect to + * {@code ignoreCase}. Otherwise an exception is thrown indicating the problem.
+ * This method behaves functionally equivalent to the following code: + * + *
+   * if (!scanner.{@link #expectUnsafe(String, boolean) expectUnsafe}(expected, ignoreCase)) {
+   *   throw new {@link IllegalStateException}(...);
+   * }
+   * 
+ * + * @param expected is the expected string. + * @param ignoreCase - if {@code true} the case of the characters is ignored during comparison. + */ + void require(String expected, boolean ignoreCase); + + /** + * @param filter the {@link CharFilter} {@link CharFilter#accept(int) accepting} the expected characters to + * {@link #skipWhile(CharFilter, int) skip}. + * @return the actual number of characters that have been skipped. + * @throws IllegalStateException if less than {@code 1} or more than {@code 1000} {@link CharFilter#accept(int) + * accepted} characters have been consumed. + */ + default int requireOne(CharFilter filter) { + + return require(filter, 1, -1); + } + + /** + * @param filter the {@link CharFilter} {@link CharFilter#accept(int) accepting} the expected characters to + * {@link #skipWhile(CharFilter, int) skip}. + * @return the actual number of characters that have been skipped. + * @throws IllegalStateException if less than {@code 1} or more than {@code 1000} {@link CharFilter#accept(int) + * accepted} characters have been consumed. + */ + default int requireOneOrMore(CharFilter filter) { + + return require(filter, 1); + } + + /** + * @param filter the {@link CharFilter} {@link CharFilter#accept(int) accepting} the expected characters to + * {@link #skipWhile(CharFilter, int) skip}. + * @param min the minimum required number of skipped characters. + * @return the actual number of characters that have been skipped. + * @throws IllegalStateException if less than {@code min} or more than {@code 1000} {@link CharFilter#accept(int) + * accepted} characters have been consumed. + */ + default int require(CharFilter filter, int min) { + + return require(filter, min, 1000); + } + + /** + * @param filter the {@link CharFilter} {@link CharFilter#accept(int) accepting} the expected characters to + * {@link #skipWhile(CharFilter, int) skip}. + * @param min the minimum required number of skipped characters. 
+ * @param max the maximum number of skipped characters. + * @return the actual number of characters that have been skipped. + * @throws IllegalStateException if less than {@code min} or more than {@code max} {@link CharFilter#accept(int) + * accepted} characters have been consumed. + */ + default int require(CharFilter filter, int min, int max) { + + if ((min < 0) || ((min > max) && (max != -1))) { + throw new IllegalArgumentException("Invalid range: " + min + "-" + max); + } + int num = max; + if (max == -1) { + num = min; + } + int count = skipWhile(filter, num); + if (count < min) { + invalidCharCount("at least " + min, count, filter); + } + if (count == max) { + int codePoint = peek(); + if (!filter.accept(codePoint)) { + invalidCharCount("up to " + max, count, filter); + } + } + return count; + } + + private IllegalStateException invalidCharCount(String bound, int count, CharFilter filter) { + + String description = filter.getDescription(); + String chars = " character(s)"; + if (!CharFilter.NO_DESCRIPTION.equals(description)) { + chars = " character(s) matching " + description; + } + throw new IllegalStateException("Require " + bound + chars + " but found only " + count); + } + + /** + * This method skips all {@link #next() next code-points} until the given {@code stop} {@link String#codePointAt(int) + * code-point} or the end is reached. If the {@code stop} {@link String#codePointAt(int) code-point} was reached, this + * scanner will point to the next character after {@code stop} when this method returns. + * + * @param stop is the {@link String#codePointAt(int) code-point} to read until. + * @return {@code true} if the first occurrence of the given {@code stop} character has been passed, {@code false} if + * there is no such character. + */ + boolean skipUntil(int stop); + + /** + * This method reads all {@link #next() next characters} until the given {@code stop} character or the end of the + * string to parse is reached. 
In contrast to {@link #skipUntil(int)}, this method will read over the {@code stop} + * character if it is escaped with the given {@code escape} character. + * + * @param stop is the {@link String#codePointAt(int) code-point} to read until. + * @param escape is the {@link String#codePointAt(int) code-point} used to escape the stop character (e.g. '\'). + * @return {@code true} if the first occurrence of the given {@code stop} character has been passed, {@code false} if + * there is no such character. + */ + boolean skipUntil(int stop, int escape); + + /** + * This method skips the number of {@link #next() next characters} given by {@code count}. + * + * @param count is the number of characters to skip. You may use {@link Integer#MAX_VALUE} to read until the end of + * data if the data-size is suitable. + * @return the total number of characters that have been skipped. Typically equal to {@code count}. Will be less in + * case the end of data was reached. + */ + int skip(int count); + + /** + * @return {@code 0} if the {@link #next() next character} is not a newline and the stream remains unchanged, + * {@code 1} if the {@link #next() next character} was '\n' and has been {@link #skip(int) skipped}, or + * {@code 2} if the {@link #next() next characters} have been '\r' and '\n' and have been {@link #skip(int) + * skipped}. + */ + int skipNewLine(); + + /** + * This method reads all {@link #next() next characters} until the given {@code substring} has been detected.
+ * After the call of this method, the current index will point to the next character after the first occurrence of + * {@code substring} or to the end of data if the given {@code substring} was NOT found.
+ * + * @param substring is the substring to search and skip over starting at the current index. + * @return {@code true} if the given {@code substring} occurred and has been passed and {@code false} if the end of + * the string has been reached without any occurrence of the given {@code substring}. + */ + default boolean skipOver(String substring) { + + return skipOver(substring, false); + } + + /** + * This method reads all {@link #next() next characters} until the given {@code substring} has been detected.
+ * After the call of this method, the current index will point to the next character after the first occurrence of + * {@code substring} or to the end of data if the given {@code substring} was NOT found.
+ * + * @param substring is the substring to search and skip over starting at the current index. + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from + * {@code substring}. + * @return {@code true} if the given {@code substring} occurred and has been passed and {@code false} if the end of + * the string has been reached without any occurrence of the given {@code substring}. + */ + default boolean skipOver(String substring, boolean ignoreCase) { + + return skipOver(substring, ignoreCase, null); + } + + /** + * This method consumes all {@link #next() next characters} until the given {@code substring} has been detected, a + * character was {@link CharFilter#accept(int) accepted} by the given {@link CharFilter} or the end of data was + * reached.
+ * After the call of this method this scanner will point to the next character after the first occurrence of + * {@code substring}, to the stop character or to end of data.
+ * + * @param substring is the substring to search and skip over starting at the current index. + * @param ignoreCase - if {@code true} the case of the characters is ignored when compared with characters from + * {@code substring}. + * @param stopFilter is the filter used to {@link CharFilter#accept(int) detect} stop characters. If such character + * was detected, the skip is stopped and the parser points to the character after the stop character. The + * {@code substring} should NOT contain a {@link CharFilter#accept(int) stop character}. + * @return {@code true} if the given {@code substring} occurred and has been passed and {@code false} if a stop + * character has been detected or the end of the string has been reached without any occurrence of the given + * {@code substring} or stop character. + */ + boolean skipOver(String substring, boolean ignoreCase, CharFilter stopFilter); + + /** + * This method reads all {@link #next() next code-points} that are identical to the given + * {@link String#codePointAt(int) code-point}.
+ * E.g. use {@link #skipWhile(int) skipWhile(' ')} to skip all spaces from the current index. After the call of this + * method, the current index will point to the next character that is different to the given + * {@link String#codePointAt(int) code-point} or to the {@link #EOS end} if NO such character exists. + * + * @param codePoint is the {@link String#codePointAt(int) code-point} to read over. + * @return the number of characters that have been skipped. + */ + int skipWhile(int codePoint); + + /** + * This method reads all {@link #next() next code-points} that are {@link CharFilter#accept(int) accepted} by the + * given {@link CharFilter}.
+ * After the call of this method, the current index will point to the next character that was NOT + * {@link CharFilter#accept(int) accepted} by the given {@code filter} or to the {@link #EOS end} if NO such character + * exists. + * + * @see #skipWhile(int) + * + * @param filter is used to {@link CharFilter#accept(int) decide} which characters should be accepted. + * @return the number of characters {@link CharFilter#accept(int) accepted} by the given {@code filter} that have been + * skipped. + */ + default int skipWhile(CharFilter filter) { + + return skipWhile(filter, Integer.MAX_VALUE); + } + + /** + * This method reads all {@link #next() next code-points} that are {@link CharFilter#accept(int) accepted} by the + * given {@code filter}.
+ * After the call of this method, the current index will point to the next character that was NOT + * {@link CharFilter#accept(int) accepted} by the given {@code filter}. If the next {@code max} characters or the + * characters left until the {@link #hasNext() end} of this scanner are {@link CharFilter#accept(int) accepted}, only + * that amount of characters are skipped. + * + * @see #skipWhile(int) + * + * @param filter is used to {@link CharFilter#accept(int) decide} which characters should be accepted. + * @param max is the maximum number of characters that may be skipped. + * @return the number of skipped characters. + */ + int skipWhile(CharFilter filter, int max); + + /** + * Behaves like the following code: + * + *
+   * {@link #skipWhile(CharFilter) skipWhile}(filter);
+   * return {@link #peek()};
+   * 
+ * + * @param filter is used to {@link CharFilter#accept(int) decide} which characters should be accepted. + * @return the first character that was not {@link CharFilter#accept(int) accepted} by the given {@link CharFilter}. + * Only the {@link CharFilter#accept(int) accepted} characters have been consumed, this scanner still points + * to the returned character. + */ + default int skipWhileAndPeek(CharFilter filter) { + + return skipWhileAndPeek(filter, Integer.MAX_VALUE); + } + + /** + * Behaves like the following code: + * + *
+   * {@link #skipWhile(CharFilter, int) skipWhile}(filter, max);
+   * return {@link #peek()};
+   * 
+ * + * @param filter is used to {@link CharFilter#accept(int) decide} which characters should be accepted. + * @param max is the maximum number of characters that may be skipped. + * @return the first character that was not {@link CharFilter#accept(int) accepted} by the given {@link CharFilter}. + * Only the {@link CharFilter#accept(int) accepted} characters have been consumed, this scanner still points + * to the returned character. + */ + default int skipWhileAndPeek(CharFilter filter, int max) { + + skipWhile(filter, max); + return peek(); + } + + /** + * @return the {@link String} with the characters that have already been parsed but are still available in the + * underlying buffer. May be used for debugging or error messages. + */ + String getBufferParsed(); + + /** + * @return the {@link String} with the characters that have not yet been parsed but are available in the underlying + * buffer. May be used for debugging or error messages. + */ + String getBufferToParse(); + + @Override + // no checked exception! + void close(); + +} diff --git a/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParser.java b/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParser.java index 384f9e5..ec67938 100644 --- a/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParser.java +++ b/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParser.java @@ -25,7 +25,7 @@ public interface CharScannerNumberParser extends CharScannerRadixHandler { * @return {@code true} if the given digit is accepted, {@code false} otherwise (exceeds the range of the number to * parse and the digit should not be consumed). Typical implementations should always return {@code true}. 
*/ - boolean digit(int digit, char digitChar); + boolean digit(int digit, int digitChar); /** * @return {@code true} if the decimal dot ('.') shall be accepted, {@code false} otherwise (stop further processing @@ -47,16 +47,17 @@ public interface CharScannerNumberParser extends CharScannerRadixHandler { * special numbers such as "NaN" or "Infinity". So for 'N' it can return "NaN" and for 'I' it can return "Infinity" to * support these special numbers. For a delimiter it can return @{@code other}. Otherwise return {@code null} here. * - * @param other the special charater that was found (no digit, no dot, no exponent). + * @param other the special charater {@link String#codePointAt(int) code-point} that was found (no digit, no dot, no + * exponent). * @return {@code null} to stop without consuming the given character or a {@link String} that is expected (and shall * start with the given special character). If that {@link String} was found in the scanner, * {@link #special(String)} is called. Otherwise again not even the given characters gets consumed. */ - String special(char other); + String special(int other); /** * @param special the special {@link String} that was found and consumed. - * @see #special(char) + * @see #special(int) */ void special(String special); diff --git a/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserBase.java b/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserBase.java index 4545cc5..db6018e 100644 --- a/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserBase.java +++ b/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserBase.java @@ -64,7 +64,7 @@ public abstract class CharScannerNumberParserBase implements CharScannerNumberPa * The constructor. * * @param radixMode the {@link CharScannerRadixHandler} for {@link #radix(int, char)}. - * @param specials the {@link #special(String) special numbers} and {@link #special(char) delimiters}. 
+ * @param specials the {@link #special(String) special numbers} and {@link #special(int) delimiters}. */ public CharScannerNumberParserBase(CharScannerRadixHandler radixMode, CharScannerNumberSpecial... specials) { @@ -75,7 +75,7 @@ public CharScannerNumberParserBase(CharScannerRadixHandler radixMode, CharScanne } /** - * @param delimiters the accepted {@link #special(char) delimiter} characters. + * @param delimiters the accepted {@link #special(int) delimiter} characters. * @param specialNumbers - {@code true} to accept the special numbers {@link #NAN} and {@link #INFINITY}. * @return the resulting {@link String} array to pass to * {@link #CharScannerNumberParserBase(CharScannerRadixHandler, CharScannerNumberSpecial...) constructor}. @@ -94,7 +94,7 @@ public static CharScannerNumberSpecial[] specials(String delimiters, boolean spe } int pos = 0; while (i < len) { - result[i++] = new CharScannerNumberSpecialDelimiter(delimiters.charAt(pos++)); + result[i++] = new CharScannerNumberSpecialDelimiter(delimiters.codePointAt(pos++)); } return result; } @@ -159,10 +159,10 @@ protected void appendRadix() { } @Override - public boolean digit(int digit, char digitChar) { + public boolean digit(int digit, int digitChar) { if (this.builder != null) { - this.builder.append(digitChar); + this.builder.appendCodePoint(digitChar); } if (this.exponentSymbol == 0) { if (digit == 0) { @@ -256,7 +256,7 @@ protected void appendExponent(boolean lazy) { } @Override - public String special(char c) { + public String special(int c) { for (CharScannerNumberSpecial special : this.specials) { if (special.isSpecialStart(c)) { @@ -311,16 +311,16 @@ public String toString() { /** * Interface for handling of a special number syntax. 
* - * @see CharScannerNumberParser#special(char) + * @see CharScannerNumberParser#special(int) * @see CharScannerNumberParser#special(String) */ public static interface CharScannerNumberSpecial { /** - * @param c the {@link CharScannerNumberParser#special(char) special character} to check. + * @param c the {@link CharScannerNumberParser#special(int) special character} to check. * @return {@code true} if the given character shall be handled by this {@link CharScannerNumberSpecial}. */ - boolean isSpecialStart(char c); + boolean isSpecialStart(int c); /** * @return the {@link CharScannerNumberParser#special(String) special} {@link String} to expect. @@ -334,7 +334,7 @@ public static interface CharScannerNumberSpecial { */ public static class CharScannerNumberSpecialDelimiter implements CharScannerNumberSpecial { - private final char delimiter; + private final int delimiter; private final String delimiterString; @@ -343,7 +343,7 @@ public static class CharScannerNumberSpecialDelimiter implements CharScannerNumb * * @param delimiter the delimiter. 
*/ - public CharScannerNumberSpecialDelimiter(char delimiter) { + public CharScannerNumberSpecialDelimiter(int delimiter) { super(); this.delimiter = delimiter; @@ -351,7 +351,7 @@ public CharScannerNumberSpecialDelimiter(char delimiter) { } @Override - public boolean isSpecialStart(char c) { + public boolean isSpecialStart(int c) { if (c == this.delimiter) { // double d = 1_2.0_0e+1_0; @@ -391,7 +391,7 @@ public CharScannerNumberSpecialNonNumber(String nonNumber) { } @Override - public boolean isSpecialStart(char c) { + public boolean isSpecialStart(int c) { if (c == this.first) { return true; // +/- can be followed by "NaN" or "Infinity" but no digits before diff --git a/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserLang.java b/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserLang.java index 4eaa081..d2f22b7 100644 --- a/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserLang.java +++ b/core/src/main/java/io/github/mmm/scanner/number/CharScannerNumberParserLang.java @@ -93,7 +93,7 @@ public CharScannerNumberParserLang(CharScannerRadixHandler radixMode, NumberType * * @param radixMode the {@link CharScannerRadixHandler} for {@link #radix(int, char)}. * @param numberType the {@link NumberType}. - * @param delimiters the accepted {@link #special(char) delimiter} characters. + * @param delimiters the accepted {@link #special(int) delimiter} characters. */ public CharScannerNumberParserLang(CharScannerRadixHandler radixMode, NumberType numberType, String delimiters) { @@ -105,7 +105,7 @@ public CharScannerNumberParserLang(CharScannerRadixHandler radixMode, NumberType * * @param radixMode the {@link CharScannerRadixHandler} for {@link #radix(int, char)}. * @param numberType the {@link NumberType}. - * @param delimiters the accepted {@link #special(char) delimiter} characters. + * @param delimiters the accepted {@link #special(int) delimiter} characters. 
* @param maxNonDecimal the maximum allowed number (e.g. {@link Integer#MAX_VALUE} to parse an {@link Integer} value). */ public CharScannerNumberParserLang(CharScannerRadixHandler radixMode, NumberType numberType, String delimiters, @@ -217,16 +217,16 @@ private void appendZeros(int count2) { } } - private void error(char c) { + private void error(int c) { if (this.builder == null) { - builder().append(c); // if number was not null before, c has already been appended + builder().appendCodePoint(c); // if number was not null before, c has already been appended } this.error = true; } @Override - public boolean digit(int digit, char digitChar) { + public boolean digit(int digit, int digitChar) { super.digit(digit, digitChar); if (this.error) { @@ -352,7 +352,7 @@ private int getRadixBits() { } } - private void preventCase(int digit, char digitChar) { + private void preventCase(int digit, int digitChar) { if ((digit > 9) && (this.builder == null)) { // prevent case of letter digits (e.g. hex) boolean upper = Character.isUpperCase(digitChar); @@ -360,7 +360,7 @@ private void preventCase(int digit, char digitChar) { this.upperCase = Boolean.valueOf(upper); } else if (this.upperCase.booleanValue() != upper) { // mixed case - to preserve original string, we start building what we can otherwise prevent for performance - builder().append(digitChar); + builder().appendCodePoint(digitChar); this.upperCase = null; } } diff --git a/core/src/test/java/io/github/mmm/scanner/AbstractCharStreamScannerTest.java b/core/src/test/java/io/github/mmm/scanner/AbstractCharStreamScannerTest.java index da9ad36..729313a 100644 --- a/core/src/test/java/io/github/mmm/scanner/AbstractCharStreamScannerTest.java +++ b/core/src/test/java/io/github/mmm/scanner/AbstractCharStreamScannerTest.java @@ -688,10 +688,10 @@ public void testNext() { CharStreamScanner scanner = scanner(string); for (int i = 0; i < 10; i++) { assertThat(scanner.hasNext()).isTrue(); - char c = scanner.next(); + int cp = 
scanner.next(); char expected = (char) ('0' + i); // then - assertThat(c).isEqualTo(expected); + assertThat(cp).isEqualTo(expected); } assertThat(scanner.hasNext()).isFalse(); assertThat(scanner.next()).isEqualTo('\0');