diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java index ffe7636..88de1b6 100644 --- a/src/main/java/io/deephaven/csv/CsvSpecs.java +++ b/src/main/java/io/deephaven/csv/CsvSpecs.java @@ -24,7 +24,7 @@ public abstract class CsvSpecs { public interface Builder { /** - * Copy all of the parameters from {@code specs} into {@code this} builder. + * Copy all the parameters from {@code specs} into {@code this} builder. */ Builder from(CsvSpecs specs); @@ -117,6 +117,34 @@ public interface Builder { */ Builder headerValidator(Predicate headerValidator); + /** + * True if the input is organized into fixed width columns rather than delimited by a delimiter. + */ + Builder hasFixedWidthColumns(boolean hasFixedWidthColumns); + + /** + * When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header + * row (provided {@link #hasHeaderRow} is set), or the column widths can be specified explicitly by the caller. + * If the caller wants to specify them explicitly, they can use this method. It is an error to set this + * parameter if {@link #hasFixedWidthColumns} is false. Note that because the library is tolerant of the last + * cell being shorter or wider than expected, the value specified here for the width of the last column is + * simply a placeholder; its value is ignored. + */ + Builder fixedColumnWidths(Iterable fixedColumnWidths); + + /** + * This setting controls what units fixed width columns are measured in. When true, fixed width columns are + * measured in Unicode code points. When false, fixed width columns are measured in UTF-16 units (aka Java + * chars). The difference arises when encountering characters outside the Unicode Basic Multilingual Plane. For + * example, the Unicode code point 💔 (U+1F494) is one Unicode code point, but takes two Java chars to + * represent. 
Along these lines, the string 💔💔💔 would fit in a column of width 3 when this setting is + * true, but would require a column width of at least 6 when this setting is false. The default setting of + * true is arguably more natural for users (the number of characters they see matches the visual width of the + * column). But some programs may want the value of false because they are counting Java chars. It is an error + * to set this parameter if {@link #hasFixedWidthColumns} is false. + */ + Builder useUtf32CountingConvention(boolean useUtf32CountingConvention); + /** * Number of data rows to skip before processing data. This is useful when you want to parse data in chunks. * Typically used together with {@link Builder#numRows}. Defaults to 0. @@ -160,7 +188,7 @@ public interface Builder { /** * The field delimiter character (the character that separates one column from the next). Must be 7-bit ASCII. - * Defaults to {code ','}. + * Defaults to {@code ','}. It is an error to set this parameter if {@link #hasFixedWidthColumns} is true. */ Builder delimiter(char delimiter); @@ -179,6 +207,8 @@ public interface Builder { *
  • hello, there *
  • 456 * + * + * It is an error to set this parameter if {@link #hasFixedWidthColumns} is true. */ Builder quote(char quote); @@ -188,7 +218,8 @@ public interface Builder { Builder ignoreSurroundingSpaces(boolean ignoreSurroundingSpaces); /** - * Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}. + * Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}. It is an + * error to set this parameter if {@link #hasFixedWidthColumns} is true. */ Builder trim(boolean trim); @@ -224,6 +255,38 @@ void check() { if (!hasHeaderRow() && skipHeaderRows() > 0) { problems.add("skipHeaderRows != 0 but hasHeaderRow is not set"); } + + for (final Integer colWidth : fixedColumnWidths()) { + if (colWidth < 1) { + problems.add(String.format("Fixed column width %d is invalid", colWidth)); + } + } + + // Certain items must not be set in fixed-width column mode. Other items must not be set in delimited column + // mode. + if (hasFixedWidthColumns()) { + final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is true"; + if (quote() != defaultQuote) { + problems.add(String.format(format, "quote")); + } + + if (delimiter() != defaultDelimiter) { + problems.add(String.format(format, "delimiter")); + } + + if (trim() != defaultTrim) { + problems.add(String.format(format, "trim")); + } + } else { + final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is false"; + if (fixedColumnWidths().size() != 0) { + problems.add(String.format(format, "fixedColumnWidths")); + } + + if (useUtf32CountingConvention() != defaultUtf32CountingConvention) { + problems.add(String.format(format, "useUtf32CountingConvention")); + } + } if (problems.isEmpty()) { return; } @@ -340,6 +403,32 @@ public Predicate headerValidator() { return c -> true; } + /** + * See {@link Builder#hasFixedWidthColumns}. 
+ */ + @Default + public boolean hasFixedWidthColumns() { + return false; + } + + /** + * See {@link Builder#fixedColumnWidths}. + */ + @Default + public List fixedColumnWidths() { + return Collections.emptyList(); + } + + private static final boolean defaultUtf32CountingConvention = true; + + /** + * See {@link Builder#useUtf32CountingConvention}. + */ + @Default + public boolean useUtf32CountingConvention() { + return defaultUtf32CountingConvention; + } + /** * See {@link Builder#skipRows}. */ @@ -396,20 +485,25 @@ public long skipHeaderRows() { return 0; } + private static final char defaultDelimiter = ','; + /** * See {@link Builder#delimiter}. */ @Default public char delimiter() { - return ','; + return defaultDelimiter; } + + private static final char defaultQuote = '"'; + /** * See {@link Builder#quote}. */ @Default public char quote() { - return '"'; + return defaultQuote; } /** @@ -420,12 +514,14 @@ public boolean ignoreSurroundingSpaces() { return true; } + private static final boolean defaultTrim = false; + /** * See {@link Builder#trim}. 
*/ @Default public boolean trim() { - return false; + return defaultTrim; } /** diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java index 9a9944a..68899ae 100644 --- a/src/main/java/io/deephaven/csv/reading/CsvReader.java +++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java @@ -7,7 +7,9 @@ import io.deephaven.csv.parsers.Parser; import io.deephaven.csv.reading.cells.CellGrabber; import io.deephaven.csv.reading.cells.DelimitedCellGrabber; +import io.deephaven.csv.reading.cells.FixedCellGrabber; import io.deephaven.csv.reading.headers.DelimitedHeaderFinder; +import io.deephaven.csv.reading.headers.FixedHeaderFinder; import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.sinks.SinkFactory; import io.deephaven.csv.util.*; @@ -63,7 +65,8 @@ private CsvReader() {} */ public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException { - return delimitedReadLogic(specs, stream, sinkFactory); + return specs.hasFixedWidthColumns() ? 
fixedReadLogic(specs, stream, sinkFactory) + : delimitedReadLogic(specs, stream, sinkFactory); } private static Result delimitedReadLogic( @@ -97,6 +100,16 @@ private static Result delimitedReadLogic( return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory); } + private static Result fixedReadLogic( + final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException { + final CellGrabber lineGrabber = FixedCellGrabber.makeLineGrabber(stream); + MutableObject columnWidths = new MutableObject<>(); + final String[] headers = FixedHeaderFinder.determineHeadersToUse(specs, lineGrabber, columnWidths); + final int numCols = headers.length; + final CellGrabber grabber = new FixedCellGrabber(lineGrabber, columnWidths.getValue(), + specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention()); + return commonReadLogic(specs, grabber, null, numCols, numCols, headers, sinkFactory); + } private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow, int numInputCols, int numOutputCols, diff --git a/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java new file mode 100644 index 0000000..25ec776 --- /dev/null +++ b/src/main/java/io/deephaven/csv/reading/cells/FixedCellGrabber.java @@ -0,0 +1,113 @@ +package io.deephaven.csv.reading.cells; + +import io.deephaven.csv.containers.ByteSlice; +import io.deephaven.csv.reading.ReaderUtil; +import io.deephaven.csv.util.CsvReaderException; +import io.deephaven.csv.util.MutableBoolean; +import io.deephaven.csv.util.MutableInt; + +import java.io.InputStream; + +/** + * This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream, and then it + * breaks them into fixed-sized cells to return to the caller. 
+ */ +public class FixedCellGrabber implements CellGrabber { + /** + * Makes a degenerate CellGrabber that has no delimiters or quotes and therefore returns whole lines. This is a + * somewhat quick-and-dirty way to reuse the buffering and newline logic in DelimitedCellGrabber without rewriting + * it. + * + * @param stream The underlying stream. + * @return The "line grabber" + */ + public static CellGrabber makeLineGrabber(InputStream stream) { + final byte IllegalUtf8 = (byte) 0xff; + return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false); + } + + private final CellGrabber lineGrabber; + private final int[] columnWidths; + private final boolean ignoreSurroundingSpaces; + private final boolean utf32CountingMode; + private final ByteSlice rowText; + private boolean needsUnderlyingRefresh; + private int colIndex; + private final MutableBoolean dummy1; + private final MutableInt dummy2; + + /** Constructor. */ + public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces, + boolean utf32CountingMode) { + this.lineGrabber = lineGrabber; + this.columnWidths = columnWidths; + this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; + this.utf32CountingMode = utf32CountingMode; + this.rowText = new ByteSlice(); + this.needsUnderlyingRefresh = true; + this.colIndex = 0; + this.dummy1 = new MutableBoolean(); + this.dummy2 = new MutableInt(); + } + + @Override + public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput) + throws CsvReaderException { + if (needsUnderlyingRefresh) { + // Underlying row used up, and all columns provided. Ask underlying CellGrabber for the next line. + lineGrabber.grabNext(rowText, dummy1, endOfInput); + + if (endOfInput.booleanValue()) { + // Set dest to the empty string, and leave 'endOfInput' set to true. 
+ dest.reset(rowText.data(), rowText.end(), rowText.end()); + return; + } + + needsUnderlyingRefresh = false; + colIndex = 0; + } + + // There is data to return. Count off N characters. The final column gets all remaining characters. + final boolean lastCol = colIndex == columnWidths.length - 1; + final int numCharsToTake = lastCol ? Integer.MAX_VALUE : columnWidths[colIndex]; + takeNCharactersInCharset(rowText, dest, numCharsToTake, utf32CountingMode, dummy2); + ++colIndex; + needsUnderlyingRefresh = lastCol || dest.size() == 0; + lastInRow.setValue(needsUnderlyingRefresh); + endOfInput.setValue(false); + + if (ignoreSurroundingSpaces) { + ReaderUtil.trimSpacesAndTabs(dest); + } + } + + private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake, + boolean utf32CountingMode, MutableInt tempInt) { + final byte[] data = src.data(); + final int cellBegin = src.begin(); + int current = cellBegin; + while (numCharsToTake > 0) { + if (current == src.end()) { + break; + } + final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], src.end() - current, + utf32CountingMode, tempInt); + if (numCharsToTake < tempInt.intValue()) { + // There is not enough space left in the field to store this character. + // This can happen if CsvSpecs is set for the UTF16 counting convention, + // there is one unit left in the field, and we encounter a character outside + // the Basic Multilingual Plane, which would require two units. 
+ break; + } + numCharsToTake -= tempInt.intValue(); + current += utf8Length; + } + dest.reset(src.data(), cellBegin, current); + src.reset(src.data(), current, src.end()); + } + + @Override + public int physicalRowNum() { + return lineGrabber.physicalRowNum(); + } +} diff --git a/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java new file mode 100644 index 0000000..c5ac95f --- /dev/null +++ b/src/main/java/io/deephaven/csv/reading/headers/FixedHeaderFinder.java @@ -0,0 +1,199 @@ +package io.deephaven.csv.reading.headers; + +import io.deephaven.csv.CsvSpecs; +import io.deephaven.csv.containers.ByteSlice; +import io.deephaven.csv.reading.ReaderUtil; +import io.deephaven.csv.reading.cells.CellGrabber; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.CsvReaderException; +import io.deephaven.csv.util.MutableBoolean; +import io.deephaven.csv.util.MutableInt; +import io.deephaven.csv.util.MutableObject; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public class FixedHeaderFinder { + /** + * Determine which headers to use. The result comes from either the first row of the file or the user-specified + * overrides. + */ + public static String[] determineHeadersToUse( + final CsvSpecs specs, + final CellGrabber lineGrabber, + MutableObject columnWidthsResult) + throws CsvReaderException { + String[] headersToUse; + // Get user-specified column widths, if any. If none were specified, this will be an array of length 0. + // The column widths are in units of the specified convention (either UTF-16 or UTF-32 units). 
+ int[] columnWidthsToUse = specs.fixedColumnWidths().stream().mapToInt(Integer::intValue).toArray(); + if (specs.hasHeaderRow()) { + long skipCount = specs.skipHeaderRows(); + final ByteSlice headerRow = new ByteSlice(); + MutableBoolean lastInRow = new MutableBoolean(); + MutableBoolean endOfInput = new MutableBoolean(); + while (true) { + lineGrabber.grabNext(headerRow, lastInRow, endOfInput); + if (endOfInput.booleanValue()) { + throw new CsvReaderException( + "Can't proceed because hasHeaderRow is set but input file is empty or shorter than skipHeaderRows"); + } + if (skipCount == 0) { + break; + } + --skipCount; + } + if (columnWidthsToUse.length == 0) { + columnWidthsToUse = inferColumnWidths(headerRow, specs.useUtf32CountingConvention()); + } + + headersToUse = + extractHeaders(headerRow, columnWidthsToUse, specs.useUtf32CountingConvention()); + } else { + if (columnWidthsToUse.length == 0) { + throw new CsvReaderException( + "Can't proceed because hasHeaderRow is false but fixedColumnWidths is unspecified"); + } + headersToUse = ReaderUtil.makeSyntheticHeaders(columnWidthsToUse.length); + } + + // Whether or not the input had headers, maybe override with client-specified headers. + if (specs.headers().size() != 0) { + if (specs.headers().size() != headersToUse.length) { + final String message = String.format("Library determined %d headers; caller overrode with %d headers", + headersToUse.length, specs.headers().size()); + throw new CsvReaderException(message); + } + headersToUse = specs.headers().toArray(new String[0]); + } + + // Apply column specific overrides. + for (Map.Entry entry : specs.headerForIndex().entrySet()) { + headersToUse[entry.getKey()] = entry.getValue(); + } + + columnWidthsResult.setValue(columnWidthsToUse); + return headersToUse; + } + + /** + * Infer the column widths by looking for the transition from delimiter char to non-delimiter char. 
+ * + * @param row The input row + * @param useUtf32CountingConvention The character set convention we are using for units of width (either UTF-32 or + * UTF-16) + * @return The widths of the columns, in the specified character set convention. + */ + private static int[] inferColumnWidths(ByteSlice row, boolean useUtf32CountingConvention) { + // A column start is a non-delimiter character preceded by a delimiter (or present at the start of line). + // If the start of the line is a delimiter, that is an error. + final List columnWidths = new ArrayList<>(); + final MutableInt charCountResult = new MutableInt(); + boolean prevCharIsSpace = false; + final byte[] data = row.data(); + int numChars = 0; + int currentIndex = row.begin(); + while (true) { + if (currentIndex == row.end()) { + columnWidths.add(numChars); + return columnWidths.stream().mapToInt(Integer::intValue).toArray(); + } + // If this character is not a delimiter, but the previous one was, then this is the start of a new column. + byte ch = data[currentIndex]; + boolean thisCharIsSpace = ch == ' '; + if (currentIndex == row.begin() && thisCharIsSpace) { + throw new IllegalArgumentException("Header row cannot start with a space"); + } + if (!thisCharIsSpace && prevCharIsSpace) { + columnWidths.add(numChars); + numChars = 0; + } + prevCharIsSpace = thisCharIsSpace; + final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - currentIndex, + useUtf32CountingConvention, charCountResult); + currentIndex += utf8Length; + numChars += charCountResult.intValue(); + } + } + + /** + * Extract the headers names from 'row'. + * + * @param row The header row + * @param columnWidths The width of the columns, in the UTF-32 or UTF-16 counting convention. 
+ * @param utf32CountingMode Whether we are in the UTF-32 or UTF-16 counting mode + * @return The array of headers + */ + private static String[] extractHeaders(ByteSlice row, int[] columnWidths, boolean utf32CountingMode) { + final int numCols = columnWidths.length; + if (numCols == 0) { + return new String[0]; + } + final int[] byteWidths = new int[numCols]; + final ByteSlice tempSlice = new ByteSlice(); + final int excessBytes = charWidthsToByteWidths(row, columnWidths, utf32CountingMode, byteWidths); + // Our policy is that the last column gets any excess bytes that are in the row. + byteWidths[numCols - 1] += excessBytes; + final String[] result = new String[numCols]; + + int beginByte = row.begin(); + for (int colNum = 0; colNum != numCols; ++colNum) { + final int proposedEndByte = beginByte + byteWidths[colNum]; + final int actualEndByte = Math.min(proposedEndByte, row.end()); + tempSlice.reset(row.data(), beginByte, actualEndByte); + ReaderUtil.trimSpacesAndTabs(tempSlice); + result[colNum] = tempSlice.toString(); + beginByte = actualEndByte; + } + return result; + } + + /** + * Convert character widths to UTF-8 widths. This converts the character widths, which are in the specified + * convention (either UTF-16 or UTF-32), which are fixed for the whole input, and which are determined by reading + * the headers (or specified by the user), into UTF-8 widths, which are specific to this row. For example if a + * charWidth is 2 and the utf32CountingMode is true, then we need to scan the row for the next two Unicode + * characters and count how many UTF-8 bytes that took up. + * + * @param row The row we are processing + * @param charWidths The column widths, in units of UTF-32 or UTF-16 units. + * @param utf32CountingMode Whether we are counting in UTF-32 or UTF-16 mode + * @param byteWidths The corresponding number of UTF-8 bytes corresponding to the charWidths for this row. 
+ * @return The number of excess UTF-8 bytes in this row that go beyond all the charWidths (0 if the row is short). + */ + private static int charWidthsToByteWidths(ByteSlice row, int[] charWidths, boolean utf32CountingMode, + int[] byteWidths) { + int numCols = charWidths.length; + if (byteWidths.length != numCols) { + throw new IllegalArgumentException( + String.format("Expected charWidths.length (%d) == byteWidths.length (%d)", + charWidths.length, byteWidths.length)); + } + final MutableInt charCountResult = new MutableInt(); + final byte[] data = row.data(); + int start = row.begin(); + int current = start; + int colIndex = 0; + int charCount = 0; + while (true) { + if (colIndex == numCols) { + // Excess bytes not claimed by any column + return row.end() - current; + } + if (charCount >= charWidths[colIndex] || current == row.end()) { + byteWidths[colIndex] = current - start; + start = current; + charCount = 0; + ++colIndex; + continue; + } + + final byte ch = data[current]; + final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(ch, row.end() - current, utf32CountingMode, + charCountResult); + current += utf8Length; + charCount += charCountResult.intValue(); + } + } +} diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java index 4f5e586..b42feb3 100644 --- a/src/test/java/io/deephaven/csv/CsvReaderTest.java +++ b/src/test/java/io/deephaven/csv/CsvReaderTest.java @@ -26,6 +26,9 @@ import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Timeout; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; +import org.junit.jupiter.params.provider.ValueSource; import java.io.*; import java.lang.reflect.Array; @@ -1853,12 +1856,6 @@ public void lotsOfDataDoesntChoke() throws CsvReaderException { public void colnumPassedThrough() throws CsvReaderException { final String input = "" + "Col1,Col2,Col3\n" + "1,2,3\n" + "4,5,6\n" + "7,8,9\n"; - final 
ColumnSet expected = - ColumnSet.of( - Column.ofValues("Col1", 1, 4, 7), - Column.ofValues("Col2", 2, 5, 8), - Column.ofValues("Col3", 3, 6, 9)); - final InputStream inputStream = toInputStream(input); final CsvSpecs specs = defaultCsvSpecs(); final SinkFactory sinkFactory = makeBlackholeSinkFactory(); @@ -1873,6 +1870,523 @@ public void colnumPassedThrough() throws CsvReaderException { Assertions.assertThat(bh2Num).isEqualTo(2); } + /** + * Addresses bug #212. A user requested that the library be + * able to read files like this. + */ + @Test + public void bug212() throws CsvReaderException { + final String input = + "" + + "NAME STATUS AGE LABELS\n" + + "argo-events Not Active 2y77d app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events\n" + + "argo-workflows Active 2y77d app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows\n" + + "argocd Active 5y18d kubernetes.io/metadata.name=argocd\n" + + "beta Not Active 4y235d kubernetes.io/metadata.name=beta\n"; + + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true).build(); + + final ColumnSet expected = ColumnSet.of( + Column.ofRefs("NAME", "argo-events", "argo-workflows", "argocd", "beta"), + Column.ofRefs("STATUS", "Not Active", "Active", "Active", "Not Active"), + Column.ofRefs("AGE", "2y77d", "2y77d", "5y18d", "4y235d"), + Column.ofRefs("LABELS", + "app.kubernetes.io/instance=argo-events,kubernetes.io/metadata.name=argo-events", + "app.kubernetes.io/instance=argo-workflows,kubernetes.io/metadata.name=argo-workflows", + "kubernetes.io/metadata.name=argocd", + "kubernetes.io/metadata.name=beta")); + + invokeTest(specs, input, expected); + } + + /** + * A basic test of fixed-width column support. 
+ */ + @Test + public void simpleFixedColumnWidths() throws CsvReaderException { + final String input = + "" + + "Sym Type Price SecurityId\n" + + "GOOG Dividend 0.25 200\n" + + "T Dividend 0.15 300\n" + + "Z Dividend 0.18 500\n"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + final CsvSpecs specs = + defaultCsvBuilder().hasFixedWidthColumns(true).build(); + + invokeTest(specs, input, expected); + } + + /** + * We allow fixed-width data fields to fill the whole cell, without a padding character. + */ + @Test + public void fixedColumnWidthsFullCell() throws CsvReaderException { + final String input = + "" + + "Sym Type Price SecurityId\n" + + "GOOGLEDividend!0.25 200\n" + + "T Dividend 0.15 300\n"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Sym", "GOOGLE", "T"), + Column.ofRefs("Type", "Dividend!", "Dividend"), + Column.ofValues("Price", 0.25, 0.15), + Column.ofValues("SecurityId", 200, 300)); + + final CsvSpecs specs = + defaultCsvBuilder().hasFixedWidthColumns(true).build(); + invokeTest(specs, input, expected); + } + + /** + * Fixed-width cells can keep their padding characters or trim them, via {@link CsvSpecs#ignoreSurroundingSpaces} + * Note that column headers themselves are always trimmed. + */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void fixedColumnsMayIncludeOrExcludeSurroundingSpaces(boolean ignoreSurroundingSpaces) + throws CsvReaderException { + final String input = + "" + + "Sym Type Price SecurityId\n" + + "GOOG Dividend 0.25 200\n" + + "T Dividend 0.15 300\n" + + "Z Coupon 0.18 500\n"; + + final String[] symData = + ignoreSurroundingSpaces ? new String[] {"GOOG", "T", "Z"} : new String[] {"GOOG ", "T ", "Z "}; + + final String[] typeData = ignoreSurroundingSpaces ? 
new String[] {"Dividend", "Dividend", "Coupon"} + : new String[] {"Dividend ", "Dividend ", "Coupon "}; + + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Sym", symData), + Column.ofRefs("Type", typeData), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + final CsvSpecs specs = + defaultCsvBuilder().hasFixedWidthColumns(true).ignoreSurroundingSpaces(ignoreSurroundingSpaces).build(); + + invokeTest(specs, input, expected); + } + + /** + * Like delimited mode, fixed-width mode allows header rows to be skipped. + */ + @Test + public void fixedColumnWidthsSkipHeaderRows() throws CsvReaderException { + final String input = + "" + + "front matter\n" + + "ignore me\n" + + "Sym Type Price SecurityId\n" + + "GOOG Dividend 0.25 200\n" + + "T Dividend 0.15 300\n" + + "Z Dividend 0.18 500\n"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + + final CsvSpecs specs = + defaultCsvBuilder().hasFixedWidthColumns(true).skipHeaderRows(2).build(); + + invokeTest(specs, input, expected); + } + + /** + * Like delimited mode, fixed-width mode allows data rows to be skipped. 
+ */ + @Test + public void fixedColumnWidthsSkipDataRows() throws CsvReaderException { + final String input = + "" + + "Sym Type Price SecurityId\n" + + "GOOG Dividend 0.25 200\n" + + "T Dividend 0.15 300\n" + + "XYZ1 Coupon 0.18 500\n" + + "XYZ2 Coupon 0.37 900\n"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Sym", "T", "XYZ1"), + Column.ofRefs("Type", "Dividend", "Coupon"), + Column.ofValues("Price", 0.15, 0.18), + Column.ofValues("SecurityId", 300, 500)); + + // Skip 1 data row, take 2 data rows + final CsvSpecs specs = + defaultCsvBuilder().hasFixedWidthColumns(true).skipRows(1).numRows(2).build(); + + invokeTest(specs, input, expected); + } + + /** + * Like delimited mode, fixed-width mode allows rows to be short. + */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void fixedColumnWidthsShortRows(boolean allowMissingColumns) throws CsvReaderException { + final String input = + "" + + "Sym Type Price SecurityId\n" + + "GOOG\n" + + "T Dividend 0.15 300\n" + + "Z Dividend 0.18 500\n" + + "QQQ Coupon\n"; + + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z", "QQQ"), + Column.ofRefs("Type", null, "Dividend", "Dividend", "Coupon"), + Column.ofValues("Price", Sentinels.NULL_DOUBLE, 0.15, 0.18, Sentinels.NULL_DOUBLE), + Column.ofValues("SecurityId", Sentinels.NULL_INT, 300, 500, Sentinels.NULL_INT)); + + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) + .allowMissingColumns(allowMissingColumns).build(); + + if (allowMissingColumns) { + invokeTest(specs, input, expected); + } else { + Assertions.assertThatThrownBy(() -> invokeTest(specs, input, expected)) + .hasRootCauseMessage("Row 2 has too few columns (expected 4)"); + } + } + + /** + * Like delimited mode, fixed-width mode allows ignoring empty lines. 
+ */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void fixedColumnWidthsIgnoreEmptyLines(boolean ignoreEmptyLines) throws CsvReaderException { + final String input = + "" + + "Sym Type Price SecurityId\n" + + "GOOG Dividend 0.25 200\n" + + "\n" + + "\n" + + "T Dividend 0.15 300\n" + + "\n" + + "Z Dividend 0.18 500\n"; + + + + final ColumnSet expected; + + if (ignoreEmptyLines) { + expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", "T", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), + Column.ofValues("Price", 0.25, 0.15, 0.18), + Column.ofValues("SecurityId", 200, 300, 500)); + } else { + expected = ColumnSet.of( + Column.ofRefs("Sym", "GOOG", null, null, "T", null, "Z"), + Column.ofRefs("Type", "Dividend", null, null, "Dividend", null, "Dividend"), + Column.ofValues("Price", 0.25, Sentinels.NULL_DOUBLE, Sentinels.NULL_DOUBLE, 0.15, + Sentinels.NULL_DOUBLE, 0.18), + Column.ofValues("SecurityId", 200, Sentinels.NULL_INT, Sentinels.NULL_INT, 300, Sentinels.NULL_INT, + 500)); + } + + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) + .ignoreEmptyLines(ignoreEmptyLines).build(); + + invokeTest(specs, input, expected); + } + + /** + * In fixed width mode, if there is no header row, the caller needs to specify column widths. 
+ */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void noHeaderRowRequiresFixColumnWidthsSpecified(boolean specifyColumnWidths) throws CsvReaderException { + final String input = + "" + + "GOOG Dividend 0.25 200\n" + + "T Dividend 0.15 300\n" + + "Z Coupon 0.18 500\n"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Column1", "GOOG", "T", "Z"), + Column.ofRefs("Column2", "Dividend", "Dividend", "Coupon"), + Column.ofValues("Column3", 0.25, 0.15, 0.18), + Column.ofValues("Column4", 200, 300, 500)); + + final CsvSpecs.Builder specsBase = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false); + + if (specifyColumnWidths) { + final CsvSpecs specs = specsBase.fixedColumnWidths(Arrays.asList(6, 9, 8, 3)).build(); + invokeTest(specs, input, expected); + } else { + final CsvSpecs specs = specsBase.build(); + Assertions.assertThatThrownBy(() -> invokeTest(specs, input, expected)) + .hasMessage("Can't proceed because hasHeaderRow is false but fixedColumnWidths is unspecified"); + } + } + + /** + * Because the library is tolerant of the last cell being shorter or wider than expected, the final entry in + * fixedColumnWidths is just a placeholder. 
+ */ + @ParameterizedTest + @ValueSource(ints = {1, 5000, 34_000_000}) + public void finalFixedColumnWidthEntryIsPlaceholder(int finalEntry) throws CsvReaderException { + final String input = + "" + + "GOOG Dividend 0.25 200\n" + + "T Dividend 0.15 300\n" + + "Z Coupon 0.18 500\n"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Column1", "GOOG", "T", "Z"), + Column.ofRefs("Column2", "Dividend", "Dividend", "Coupon"), + Column.ofValues("Column3", 0.25, 0.15, 0.18), + Column.ofValues("Column4", 200, 300, 500)); + + final CsvSpecs.Builder specsBase = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false); + + final CsvSpecs specs = specsBase.fixedColumnWidths(Arrays.asList(6, 9, 8, finalEntry)).build(); + invokeTest(specs, input, expected); + } + + /** + * Test all the parameters incompatible with delimited mode, all at the same time. + */ + @Test + public void checkParametersIncompatibleWithDelimitedMode() { + final String expectedMessage = + "CsvSpecs failed validation for the following reasons: " + + "Incompatible parameters: can't set fixedColumnWidths when hasFixedWidthColumns is false, " + + "Incompatible parameters: can't set useUtf32CountingConvention when hasFixedWidthColumns is false"; + + Assertions.assertThatThrownBy(() -> defaultCsvBuilder().hasFixedWidthColumns(false) + .useUtf32CountingConvention(false) + .fixedColumnWidths(Arrays.asList(1, 2, 3, 4)).build()).hasMessage(expectedMessage); + } + + /** + * Test all the parameters incompatible with fixed-width mode, all at the same time. 
+ */ + @Test + public void checkParametersIncompatibleWithFixedWidthMode() { + final String expectedMessage = + "CsvSpecs failed validation for the following reasons: " + + "Incompatible parameters: can't set quote when hasFixedWidthColumns is true, " + + "Incompatible parameters: can't set delimiter when hasFixedWidthColumns is true, " + + "Incompatible parameters: can't set trim when hasFixedWidthColumns is true"; + + Assertions.assertThatThrownBy(() -> defaultCsvBuilder().hasFixedWidthColumns(true) + .quote('X').delimiter('Y').trim(true).build()).hasMessage(expectedMessage); + } + + + /** + * Validate the values of fixed-width mode parameters: a negative entry in fixedColumnWidths must be rejected. + */ + @Test + public void validateFixedWidthModeParameters() { + final String expectedMessage = + "CsvSpecs failed validation for the following reasons: " + + "Fixed column width -5 is invalid"; + + Assertions.assertThatThrownBy( + () -> defaultCsvBuilder().hasFixedWidthColumns(true).fixedColumnWidths(Arrays.asList(-5, 3, 8)) + .build()) + .hasMessage(expectedMessage); + } + + /** + * In fixed width mode (as is also true in delimited mode), if there is no header row, the caller may specify column + * names. If they don't, synthetic column names will be generated. + */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void columnNamesMayBeSpecified(boolean specifyColumnNames) throws CsvReaderException { + final String input = + "" + + "GOOG Dividend 0.25 200\n" + + "T Dividend 0.15 300\n" + + "Z Coupon 0.18 500\n"; + + final String[] expectedColumnNames = specifyColumnNames ? 
new String[] {"Sym", "Type", "Price", "SecurityId"} + : new String[] {"Column1", "Column2", "Column3", "Column4"}; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs(expectedColumnNames[0], "GOOG", "T", "Z"), + Column.ofRefs(expectedColumnNames[1], "Dividend", "Dividend", "Coupon"), + Column.ofValues(expectedColumnNames[2], 0.25, 0.15, 0.18), + Column.ofValues(expectedColumnNames[3], 200, 300, 500)); + + CsvSpecs.Builder specsBuilder = defaultCsvBuilder().hasFixedWidthColumns(true).hasHeaderRow(false) + .fixedColumnWidths(Arrays.asList(6, 9, 8, 3)); + + if (specifyColumnNames) { + specsBuilder = specsBuilder.headers(Arrays.asList(expectedColumnNames)); + } + + invokeTest(specsBuilder.build(), input, expected); + } + + /** + * A counting convention test relevant to fixed-width mode. All six Unicode characters β™‘β™₯β₯❦◑╳ are in the Basic + * Multilingual Plane and can all be represented with a single Java char. Therefore, they are counted the same with + * both counting conventions. + */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void countsBMPCharactersTheSame(boolean useUtf32CountingConvention) throws CsvReaderException { + final String input = + "" + + "Sym Type Price SecurityId\n" + + "β™‘β™₯β₯❦◑╳Dividend 0.15 300\n" + + "Z Dividend 0.18 500\n"; + + final ColumnSet expected = + ColumnSet.of( + Column.ofRefs("Sym", "β™‘β™₯β₯❦◑╳", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend"), + Column.ofValues("Price", 0.15, 0.18), + Column.ofValues("SecurityId", 300, 500)); + + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) + .useUtf32CountingConvention(useUtf32CountingConvention).build(); + + invokeTest(specs, input, expected); + } + + /** + * A counting convention test relevant to fixed-width mode. All six Unicode characters πŸ₯°πŸ˜»πŸ§‘πŸ’“πŸ’•πŸ’– are _outside_ + * the Basic Multilingual Plane and all are represented with two Java chars. The Sym column has a width of six. 
They + * will fit in the "Sym" column if the caller uses the UTF-32 counting convention. They will not fit in the column + * if the caller uses the UTF-16 counting convention (because it takes 12 Java chars to express them). + */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void countsNonBMPCharactersDifferently(boolean useUtf32CountingConvention) throws CsvReaderException { + final String input = + "" + + "Sym Type\n" + + "πŸ₯°πŸ˜»πŸ§‘πŸ’“πŸ’•πŸ’–Dividend\n" + + "Z Dividend\n"; + + final ColumnSet expected; + + if (useUtf32CountingConvention) { + expected = ColumnSet.of( + Column.ofRefs("Sym", "πŸ₯°πŸ˜»πŸ§‘πŸ’“πŸ’•πŸ’–", "Z"), + Column.ofRefs("Type", "Dividend", "Dividend")); + } else { + expected = ColumnSet.of( + Column.ofRefs("Sym", "πŸ₯°πŸ˜»πŸ§‘", "Z"), + Column.ofRefs("Type", "πŸ’“πŸ’•πŸ’–Dividend", "Dividend")); + } + + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) + .useUtf32CountingConvention(useUtf32CountingConvention).build(); + + invokeTest(specs, input, expected); + } + + /** + * Using Unicode characters as column headers in fixed-width mode. We give one column a header with characters from + * outside the BMP, and one with characters inside the BMP and show how the behavior differs depending on the + * useUtf32CountingConvention flag. The header πŸ₯°πŸ˜»πŸ§‘ plus trailing space will be counted as width 4 in the UTF-32 + * counting convention, but width 7 in the UTF-16 counting convention. Meanwhile, the header ╔═╀═╗ is counted as width + * 5 in both conventions. + */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void unicodeColumnHeaders(boolean useUtf32CountingConvention) throws CsvReaderException { + // In the UTF-32 counting convention, this is a column of width 4 (three Unicode characters plus the space) + // followed by a column of width 5. The first cell of the data would therefore be "abc", and the next cell + // would be "defgh". 
+ + // In the UTF-16 counting convention, this is a column of width 7 (six UTF-16 units plus the space) + // followed by a column of width 5. The first cell of the data would therefore be "abc def" and the next + // cell would be "gh". + final String input = + "" + + "πŸ₯°πŸ˜»πŸ§‘ ╔═╀═╗\n" + + "abc defgh\n"; + + final ColumnSet expected; + + if (useUtf32CountingConvention) { + expected = ColumnSet.of( + Column.ofRefs("πŸ₯°πŸ˜»πŸ§‘", "abc"), + Column.ofRefs("╔═╀═╗", "defgh")); + } else { + expected = ColumnSet.of( + Column.ofRefs("πŸ₯°πŸ˜»πŸ§‘", "abc def"), + Column.ofRefs("╔═╀═╗", "gh")); + } + + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) + .useUtf32CountingConvention(useUtf32CountingConvention).build(); + + invokeTest(specs, input, expected); + } + + /** + * In fixed-width mode, if the library is configured for the UTF-16 counting convention, and there is only one unit + * of space left in the field, and the next character is a character outside the Basic Multilingual Plane that + * requires two units, the library will include that character in the next field rather than this one. + */ + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void brokenSurrogatePair(boolean useUtf32CountingConvention) throws CsvReaderException { + // This test has a column of width 3 (two characters plus the space) + // followed by a column of width 2. + // + // In the UTF-32 counting convention, the first column will get "πŸ₯°πŸ˜» " and the second column will + // get "πŸ§‘πŸ’“". We turn off ignoreSurroundingSpaces to highlight how this is counted. + // + // In the UTF-16 counting convention, the first column will get πŸ₯° (because πŸ₯°πŸ˜» uses characters + // outside the Basic Multilingual Plane and takes four units to represent, but the first field + // only has space for three). The next column will get "😻 πŸ§‘πŸ’“" (the rest of the row). 
+ final String input = + "" + + "C1 C2\n" + + "πŸ₯°πŸ˜» πŸ§‘πŸ’“\n"; + + final ColumnSet expected; + + if (useUtf32CountingConvention) { + expected = ColumnSet.of( + Column.ofRefs("C1", "πŸ₯°πŸ˜» "), + Column.ofRefs("C2", "πŸ§‘πŸ’“")); + } else { + expected = ColumnSet.of( + Column.ofRefs("C1", "πŸ₯°"), + Column.ofRefs("C2", "😻 πŸ§‘πŸ’“")); + } + + final CsvSpecs specs = defaultCsvBuilder().hasFixedWidthColumns(true) + .ignoreSurroundingSpaces(false).useUtf32CountingConvention(useUtf32CountingConvention).build(); + + invokeTest(specs, input, expected); + } + private static final class RepeatingInputStream extends InputStream { private byte[] data; private final byte[] body;