Skip to content

Commit

Permalink
Improve parser performance by up to 20%, YMMV
Browse files Browse the repository at this point in the history
  • Loading branch information
garydgregory committed Sep 18, 2024
1 parent da934d0 commit 9f4bf36
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 76 deletions.
3 changes: 2 additions & 1 deletion src/changes/changes.xml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@
<action type="fix" dev="ggregory" due-to="Dávid Szigecsán">Fix documentation for CSVFormat private constructor #466.</action>
<action type="fix" issue="CSV-294" dev="ggregory" due-to="Joern Huxhorn, Gary Gregory">CSVFormat does not support explicit " as escape char.</action>
<action type="fix" issue="CSV-150" dev="ggregory" due-to="dota17, Gary Gregory, Jörn Huxhorn">Escaping is not disableable.</action>
<action type="fix" dev="ggregory" due-to="Gary Gregory">Fix Javadoc warnings on Java 23.</action>
<action type="fix" dev="ggregory" due-to="Gary Gregory">Fix Javadoc warnings on Java 23.</action>
<action type="fix" dev="ggregory" due-to="Gary Gregory">Improve parser performance by up to 20%, YMMV.</action>
<!-- UPDATE -->
<action type="update" dev="ggregory" due-to="Dependabot">Bump commons-codec:commons-codec from 1.16.1 to 1.17.1 #422, #449.</action>
<action type="update" dev="ggregory" due-to="Gary Gregory">Bump org.apache.commons:commons-parent from 69 to 75 #435, #452, #465, #468, #475.</action>
Expand Down
64 changes: 22 additions & 42 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
import static org.apache.commons.csv.Constants.UNDEFINED;
import static org.apache.commons.io.IOUtils.EOF;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;

/**
* A special buffered reader which supports sophisticated read access.
Expand All @@ -35,18 +35,19 @@
* {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}.
* </p>
*/
final class ExtendedBufferedReader extends BufferedReader {
final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {

/** The last char returned */
private int lastChar = UNDEFINED;
private int lastCharMark = UNDEFINED;

/** The count of EOLs (CR/LF/CRLF) seen so far */
private long lineNumber;
private long lineNumberMark;

/** The position, which is the number of characters read so far */
private long position;

private boolean closed;
private long positionMark;

/**
* Constructs a new instance using the default buffer size.
Expand All @@ -55,6 +56,22 @@ final class ExtendedBufferedReader extends BufferedReader {
super(reader);
}

@Override
public void mark(int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
super.mark(readAheadLimit);
}

@Override
public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
super.reset();
}

/**
* Closes the stream.
*
Expand All @@ -64,7 +81,6 @@ final class ExtendedBufferedReader extends BufferedReader {
@Override
public void close() throws IOException {
// Set ivars before calling super close() in case close() throws an IOException.
closed = true;
lastChar = EOF;
super.close();
}
Expand All @@ -74,7 +90,7 @@ public void close() throws IOException {
*
* @return the current line number
*/
long getCurrentLineNumber() {
long getLineNumber() {
// Check if we are at EOL or EOF or just starting
if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == EOF) {
return lineNumber; // counter is accurate
Expand Down Expand Up @@ -103,42 +119,6 @@ long getPosition() {
return this.position;
}

public boolean isClosed() {
return closed;
}

/**
* Returns the next character in the current reader without consuming it. So the next call to {@link #read()} will
* still return this value. Does not affect the line number or the last character.
*
* @return the next character
*
* @throws IOException
* If an I/O error occurs
*/
int peek() throws IOException {
super.mark(1);
final int c = super.read();
super.reset();
return c;
}

/**
* Populates the buffer with the next {@code buf.length} characters in the current reader without consuming them. The next call to {@link #read()} will
* still return the next value. This doesn't affect the line number or the last character.
*
* @param buf the buffer to fill for the look ahead.
* @return The number of characters peeked, or -1 if the end of the stream has been reached.
* @throws IOException If an I/O error occurs
*/
int peek(final char[] buf) throws IOException {
final int n = buf.length;
super.mark(n);
final int c = super.read(buf, 0, n);
super.reset();
return c;
}

@Override
public int read() throws IOException {
final int current = super.read();
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ long getCharacterPosition() {
* @return the current line number
*/
long getCurrentLineNumber() {
return reader.getCurrentLineNumber();
return reader.getLineNumber();
}

String getFirstEol() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,32 +61,32 @@ public void testReadChar() throws Exception {
final int EOLeolct = 9;

try (final ExtendedBufferedReader br = createBufferedReader(test)) {
assertEquals(0, br.getCurrentLineNumber());
assertEquals(0, br.getLineNumber());
int lineCount = 0;
while (br.readLine() != null) {
// consume all
lineCount++;
}
assertEquals(EOLeolct, br.getCurrentLineNumber());
assertEquals(lineCount, br.getCurrentLineNumber());
assertEquals(EOLeolct, br.getLineNumber());
assertEquals(lineCount, br.getLineNumber());
}
try (final ExtendedBufferedReader br = createBufferedReader(test)) {
assertEquals(0, br.getCurrentLineNumber());
assertEquals(0, br.getLineNumber());
int readCount = 0;
while (br.read() != EOF) {
// consume all
readCount++;
}
assertEquals(EOLeolct, br.getCurrentLineNumber());
assertEquals(EOLeolct, br.getLineNumber());
assertEquals(readCount, test.length());
}
try (final ExtendedBufferedReader br = createBufferedReader(test)) {
assertEquals(0, br.getCurrentLineNumber());
assertEquals(0, br.getLineNumber());
final char[] buff = new char[10];
while (br.read(buff, 0, 3) != EOF) {
// consume all
}
assertEquals(EOLeolct, br.getCurrentLineNumber());
assertEquals(EOLeolct, br.getLineNumber());
}
}

Expand All @@ -96,7 +96,7 @@ public void testReadingInDifferentBuffer() throws Exception {
try (ExtendedBufferedReader reader = createBufferedReader("1\r\n2\r\n")) {
reader.read(tmp1, 0, 2);
reader.read(tmp2, 2, 2);
assertEquals(2, reader.getCurrentLineNumber());
assertEquals(2, reader.getLineNumber());
}
}

Expand All @@ -110,28 +110,28 @@ public void testReadLine() throws Exception {
assertNull(br.readLine());
}
try (final ExtendedBufferedReader br = createBufferedReader("foo\n\nhello")) {
assertEquals(0, br.getCurrentLineNumber());
assertEquals(0, br.getLineNumber());
assertEquals("foo", br.readLine());
assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());
assertEquals("", br.readLine());
assertEquals(2, br.getCurrentLineNumber());
assertEquals(2, br.getLineNumber());
assertEquals("hello", br.readLine());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());
assertNull(br.readLine());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());
}
try (final ExtendedBufferedReader br = createBufferedReader("foo\n\nhello")) {
assertEquals('f', br.read());
assertEquals('o', br.peek());
assertEquals("oo", br.readLine());
assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());
assertEquals('\n', br.peek());
assertEquals("", br.readLine());
assertEquals(2, br.getCurrentLineNumber());
assertEquals(2, br.getLineNumber());
assertEquals('h', br.peek());
assertEquals("hello", br.readLine());
assertNull(br.readLine());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());
}
try (final ExtendedBufferedReader br = createBufferedReader("foo\rbaar\r\nfoo")) {
assertEquals("foo", br.readLine());
Expand All @@ -146,58 +146,58 @@ public void testReadLine() throws Exception {
@Test
public void testReadLookahead1() throws Exception {
try (final ExtendedBufferedReader br = createBufferedReader("1\n2\r3\n")) {
assertEquals(0, br.getCurrentLineNumber());
assertEquals(0, br.getLineNumber());
assertEquals('1', br.peek());
assertEquals(UNDEFINED, br.getLastChar());
assertEquals(0, br.getCurrentLineNumber());
assertEquals(0, br.getLineNumber());
assertEquals('1', br.read()); // Start line 1
assertEquals('1', br.getLastChar());

assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());
assertEquals('\n', br.peek());
assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());
assertEquals('1', br.getLastChar());
assertEquals('\n', br.read());
assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());
assertEquals('\n', br.getLastChar());
assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());

assertEquals('2', br.peek());
assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());
assertEquals('\n', br.getLastChar());
assertEquals(1, br.getCurrentLineNumber());
assertEquals(1, br.getLineNumber());
assertEquals('2', br.read()); // Start line 2
assertEquals(2, br.getCurrentLineNumber());
assertEquals(2, br.getLineNumber());
assertEquals('2', br.getLastChar());

assertEquals('\r', br.peek());
assertEquals(2, br.getCurrentLineNumber());
assertEquals(2, br.getLineNumber());
assertEquals('2', br.getLastChar());
assertEquals('\r', br.read());
assertEquals('\r', br.getLastChar());
assertEquals(2, br.getCurrentLineNumber());
assertEquals(2, br.getLineNumber());

assertEquals('3', br.peek());
assertEquals('\r', br.getLastChar());
assertEquals('3', br.read()); // Start line 3
assertEquals('3', br.getLastChar());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());

assertEquals('\n', br.peek());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());
assertEquals('3', br.getLastChar());
assertEquals('\n', br.read());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());
assertEquals('\n', br.getLastChar());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());

assertEquals(EOF, br.peek());
assertEquals('\n', br.getLastChar());
assertEquals(EOF, br.read());
assertEquals(EOF, br.getLastChar());
assertEquals(EOF, br.read());
assertEquals(EOF, br.peek());
assertEquals(3, br.getCurrentLineNumber());
assertEquals(3, br.getLineNumber());

}
}
Expand Down

0 comments on commit 9f4bf36

Please sign in to comment.