From 03ee64601199344dd0afe58ad2ecf798f6eddd74 Mon Sep 17 00:00:00 2001 From: Corey Kosak Date: Fri, 8 Nov 2024 18:38:50 -0500 Subject: [PATCH] Custom parsers / null literals should use pre-legalized names (issue #190) --- src/main/java/io/deephaven/csv/CsvSpecs.java | 9 ++- .../io/deephaven/csv/reading/CsvReader.java | 16 +++-- .../java/io/deephaven/csv/CsvReaderTest.java | 65 +++++++++++++++++++ 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/src/main/java/io/deephaven/csv/CsvSpecs.java b/src/main/java/io/deephaven/csv/CsvSpecs.java index 88de1b6..c298cc8 100644 --- a/src/main/java/io/deephaven/csv/CsvSpecs.java +++ b/src/main/java/io/deephaven/csv/CsvSpecs.java @@ -105,9 +105,12 @@ public interface Builder { Builder customTimeZoneParser(Tokenizer.CustomTimeZoneParser customTimeZoneParser); /** - * An optional legalizer for column headers. The legalizer is a function that takes column names (as a - * {@code String[]}) names and returns legal column names (as a {@code String[]}). The legalizer function is - * permitted to reuse its input data structure. Defaults to {@code Function#identity()}. + * An optional legalizer for column headers. The legalizer is a function that takes the column names from the + * input (as a {@code String[]}) and returns "legal" column names (as a {@code String[]}). What constitutes + * "legal" is entirely up to the caller. For example, some applications cannot tolerate punctuation characters + * in column names and need to remove them. The CSV library itself has no limitations with regard to column + * names. The legalizer function is permitted to return the input array (perhaps with some elements modified) as + * the return value. Defaults to {@code Function#identity()}. */ Builder headerLegalizer(Function headerLegalizer); diff --git a/src/main/java/io/deephaven/csv/reading/CsvReader.java b/src/main/java/io/deephaven/csv/reading/CsvReader.java index 68899ae..dd00722 100644 --- a/src/main/java/io/deephaven/csv/reading/CsvReader.java +++ b/src/main/java/io/deephaven/csv/reading/CsvReader.java @@ -95,9 +95,7 @@ private static Result delimitedReadLogic( headersTemp2 = headersTemp; } final int numOutputCols = headersTemp2.length; - final String[] headersToUse = canonicalizeHeaders(specs, headersTemp2); - - return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory); + return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersTemp2, sinkFactory); } private static Result fixedReadLogic( @@ -113,14 +111,17 @@ private static Result fixedReadLogic( private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow, int numInputCols, int numOutputCols, - String[] headersToUse, final SinkFactory sinkFactory) + String[] headersBeforeLegalization, final SinkFactory sinkFactory) throws CsvReaderException { + final String[][] nullValueLiteralsToUse = new String[numOutputCols][]; for (int ii = 0; ii < numOutputCols; ++ii) { nullValueLiteralsToUse[ii] = - calcNullValueLiteralsToUse(specs, headersToUse[ii], ii).toArray(new String[0]); + calcNullValueLiteralsToUse(specs, headersBeforeLegalization[ii], ii).toArray(new String[0]); } + final String[] headersToUse = canonicalizeHeaders(specs, headersBeforeLegalization); + // Create a DenseStorageWriter for each column. The arrays are sized to "numInputCols" but only populated up to // "numOutputCols". The remaining (numInputCols - numOutputCols) are set to null. The code in // parseInputToDenseStorge knows that having a null DenseStorageWriter means that the column is all-empty and @@ -156,7 +157,7 @@ private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, final ArrayList> sinkFutures = new ArrayList<>(); try { for (int ii = 0; ii < numOutputCols; ++ii) { - final List> parsersToUse = calcParsersToUse(specs, headersToUse[ii], ii); + final List> parsersToUse = calcParsersToUse(specs, headersBeforeLegalization[ii], ii); final int iiCopy = ii; final Future fcb = ecs.submit( @@ -230,7 +231,8 @@ private static List calcNullValueLiteralsToUse(final CsvSpecs specs, fin } private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] headers) throws CsvReaderException { - final String[] legalized = specs.headerLegalizer().apply(headers); + // legalizer is allowed to mutate the input in-place, so we clone it before passing it. + final String[] legalized = specs.headerLegalizer().apply(headers.clone()); final Set unique = new HashSet<>(); final List repeats = new ArrayList<>(); final List invalidNames = new ArrayList<>(); diff --git a/src/test/java/io/deephaven/csv/CsvReaderTest.java b/src/test/java/io/deephaven/csv/CsvReaderTest.java index b42feb3..a6c3377 100644 --- a/src/test/java/io/deephaven/csv/CsvReaderTest.java +++ b/src/test/java/io/deephaven/csv/CsvReaderTest.java @@ -41,6 +41,7 @@ import java.util.Collections; import java.util.List; import java.util.function.BiFunction; +import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -206,6 +207,70 @@ public void bug162() throws CsvReaderException { invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected); } + /** + * Reported in Deephaven CSV Issue #190. That + * issue report misidentifies the root cause as having to do with reserved keywords. This is not correct because the + * library doesn't care whether a column header is a reserved keyword. The actual root cause is an interaction + * between the user-supplied "legalizer" and user-specified parsers or null literals that are specified by column + * names. Specifically the question is whether column names mentioned in {@link CsvSpecs.Builder#putParserForName} + * and {@link CsvSpecs.Builder#putNullValueLiteralsForName} should refer to the name that the column had *before* it + * was transformed by the legalizer, or *after*. The expected behavior is "before", but prior to this fix the + * library was doing the "after" behavior. This is a parameterized test that invokes the behavior for {delimited, + * fixed columns} x {without and with a legalizer}. + */ + @ParameterizedTest + @CsvSource({"false,false", "false,true", "true,false", "true,true"}) + public void bug190(boolean hasFixedWidthColumns, boolean invokeLegalizer) throws CsvReaderException { + // +++ is the null value literal for Col1 + // *** is the null value literal for Col2 + // ??? is the null value literal for Col3 + + final String input; + + if (!hasFixedWidthColumns) { + input = "Col1,Col2,Col3\n" + + "+++,20,30\n" + + "100,***,300\n" + + "1000,2000,???\n"; + } else { + input = "Col1 Col2 Col3\n" + + "+++ 20 30\n" + + "100 *** 300\n" + + "1000 2000 ???\n"; + } + + final String[] expectedColumnNames = !invokeLegalizer ? new String[] {"Col1", "Col2", "Col3"} + : new String[] {"xyzCol1", "xyzCol2", "xyzCol3"}; + + final ColumnSet expected = + ColumnSet.of( + Column.ofValues(expectedColumnNames[0], Sentinels.NULL_LONG, (long) 100, (long) 1000), + Column.ofValues(expectedColumnNames[1], (double) 20, Sentinels.NULL_DOUBLE, (double) 2000), + Column.ofRefs(expectedColumnNames[2], "30", "300", null)); + + Function legalizer = in -> { + for (int i = 0; i != in.length; ++i) { + // e.g. transform Col1 to xyzCol1 + in[i] = "xyz" + in[i]; + } + return in; + }; + + CsvSpecs.Builder specsBase = + defaultCsvBuilder().hasFixedWidthColumns(hasFixedWidthColumns).parsers(Parsers.DEFAULT) + .putParserForName("Col1", Parsers.LONG).putParserForName("Col2", Parsers.DOUBLE) + .putParserForName("Col3", Parsers.STRING) + .putNullValueLiteralsForName("Col1", Collections.singletonList("+++")) + .putNullValueLiteralsForName("Col2", Collections.singletonList("***")) + .putNullValueLiteralsForName("Col3", Collections.singletonList("???")); + + if (invokeLegalizer) { + specsBase = specsBase.headerLegalizer(legalizer); + } + + invokeTest(specsBase.build(), input, expected); + } + @Test public void validates() { final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "