-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor to prepare for fixed-width column support. #219
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
package io.deephaven.csv.reading; | ||
|
||
import io.deephaven.csv.containers.ByteSlice; | ||
import io.deephaven.csv.tokenization.RangeTests; | ||
import io.deephaven.csv.util.MutableInt; | ||
|
||
public class ReaderUtil { | ||
/**
 * Build placeholder column names of the form "Column1", "Column2", ..., one per requested header.
 *
 * @param numHeaders The number of names to generate; also the length of the returned array.
 * @return A newly allocated array whose element at index {@code ii} is {@code "Column" + (ii + 1)}.
 */
public static String[] makeSyntheticHeaders(int numHeaders) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be moved / made private to the single callsite? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops. There were supposed to be two call sites. Now there are :-). Thanks, good catch. |
||
final String[] result = new String[numHeaders]; | ||
for (int ii = 0; ii < result.length; ++ii) { | ||
// Names are 1-based even though the array index is 0-based.
result[ii] = "Column" + (ii + 1); | ||
} | ||
return result; | ||
} | ||
|
||
/** | ||
* Trim leading and trailing bytes from the slice for as long as {@link RangeTests#isSpaceOrTab} accepts them. | ||
* NOTE(review): judging by that helper's name, "whitespace" here presumably means only space and tab, not the | ||
* broader Unicode/Java notion of whitespace -- confirm against RangeTests. | ||
* | ||
* @param cs The slice, modified in-place (via {@code reset}) so that its begin/end bounds exclude the trimmed | ||
* bytes. The backing array itself is not changed. | ||
*/ | ||
public static void trimWhitespace(final ByteSlice cs) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This becoming newly public, I'm going to complain about the name "whitespace", as that has a more strict definition, see There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. Renamed to |
||
final byte[] data = cs.data(); | ||
int begin = cs.begin(); | ||
int end = cs.end(); | ||
// Advance the front bound past every leading byte that matches.
while (begin != end && RangeTests.isSpaceOrTab(data[begin])) { | ||
++begin; | ||
} | ||
// Pull the back bound in over trailing matches. The begin != end guard keeps data[end - 1] in range when the
// slice is empty or was entirely consumed by the first loop.
while (begin != end && RangeTests.isSpaceOrTab(data[end - 1])) { | ||
--end; | ||
} | ||
// Same backing array, narrowed bounds.
cs.reset(data, begin, end); | ||
} | ||
|
||
/** | ||
* Get the expected length of a UTF-8 sequence, given its first byte, and its corresponding length in the specified | ||
* units (UTF-16 or UTF-32). | ||
* | ||
* @param firstByte The first byte of the UTF-8 sequence. | ||
* @param numBytes The number of remaining bytes in the input field (including firstByte). If the UTF-8 sequence | ||
* specifies a number of bytes larger than the number of remaining bytes, an exception is thrown. | ||
* @param useUtf32CountingConvention Whether 'charCountResult' should be in units of UTF-32 or UTF-16. | ||
* @param charCountResult The number of UTF-32 or UTF-16 units specified by the UTF-8 character. | ||
* @return The length of the UTF-8 sequence. | ||
*/ | ||
public static int getUtf8LengthAndCharLength( | ||
byte firstByte, int numBytes, | ||
boolean useUtf32CountingConvention, MutableInt charCountResult) { | ||
final int utf8Length = getUtf8Length(firstByte); | ||
if (utf8Length > numBytes) { | ||
throw new RuntimeException(String.format( | ||
"The next UTF-8 character needs %d bytes but there are only %d left in the field", | ||
utf8Length, numBytes)); | ||
} | ||
final int numChars = useUtf32CountingConvention || utf8Length < 4 ? 1 : 2; | ||
charCountResult.setValue(numChars); | ||
return utf8Length; | ||
} | ||
|
||
/** | ||
* Calculate the expected length of a UTF-8 sequence, given its first byte. | ||
* | ||
* @param firstByte The first byte of the sequence. | ||
* @return The length of the sequence, in the range 1..4 inclusive. | ||
*/ | ||
private static int getUtf8Length(byte firstByte) { | ||
if ((firstByte & 0x80) == 0) { | ||
// 0xxxxxxx | ||
// 1-byte UTF-8 character aka ASCII. | ||
// Last code point U+007F | ||
return 1; | ||
} | ||
if ((firstByte & 0xE0) == 0xC0) { | ||
// 110xxxxx | ||
// 2-byte UTF-8 character | ||
// Last code point U+07FF | ||
return 2; | ||
} | ||
if ((firstByte & 0xF0) == 0xE0) { | ||
// 1110xxxx | ||
// 3-byte UTF-8 character | ||
// Last code point U+FFFF | ||
return 3; | ||
} | ||
if ((firstByte & 0xF8) == 0xF0) { | ||
// 11110xxx | ||
// 4-byte UTF-8 character. Note: Java encodes all of these in two "char" variables. | ||
// Last code point U+10FFFF | ||
return 4; | ||
} | ||
throw new IllegalStateException(String.format("0x%x is not a valid starting byte for a UTF-8 sequence", | ||
firstByte)); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
package io.deephaven.csv.reading.cells; | ||
|
||
import io.deephaven.csv.containers.ByteSlice; | ||
import io.deephaven.csv.util.CsvReaderException; | ||
import io.deephaven.csv.util.MutableBoolean; | ||
|
||
/** | ||
* This class is used to traverse over text from a Reader, understanding both field and line delimiters, as well as the | ||
* CSV quoting convention, and breaking the text into cells for use by the calling code. | ||
*/ | ||
public interface CellGrabber { | ||
/** | ||
* Try to grab the next cell from the input, being aware of field delimiters, line delimiters, quoting, and | ||
* trimming. | ||
* | ||
* @param dest The result, as a {@link ByteSlice}. The ByteSlice is invalidated by the next call to grabNext. | ||
* @param lastInRow An out parameter which will be set to true if the cell just read was the last cell in the row, | ||
* otherwise it will be set to false. | ||
* @param endOfInput An out parameter which will be set to true if the cell just read encountered the end of the | ||
* input, otherwise it will be set to false. | ||
*/ | ||
void grabNext(final ByteSlice dest, final MutableBoolean lastInRow, | ||
final MutableBoolean endOfInput) throws CsvReaderException; | ||
|
||
/** | ||
* Returns the "physical" row number, that is the row number of the input file. This differs from the "logical" row | ||
* number, which is the row number of the CSV data being processed. The difference arises when, due to quotation | ||
* marks, a single CSV row can span multiple lines of input. | ||
*/ | ||
int physicalRowNum(); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This appears to be unused (even when reviewing other PR).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks, deleted