From ce0c8e5dafa1e074cda0aa4562f9a7da37fb67de Mon Sep 17 00:00:00 2001 From: Bart Hanssens Date: Thu, 11 Jul 2024 16:12:28 +0100 Subject: [PATCH] GH-5058: additional parser code (WIP) --- .../eclipse/rdf4j/rio/csvw/CSVWParser.java | 34 +++++++++++++++---- .../rio/csvw/parsers/CellParserDate.java | 3 +- core/rio/csvw/src/test/resources/painters.csv | 2 +- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java index 3eafd1e952..190c1249d6 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java @@ -234,7 +234,7 @@ private CellParser getCellParser(Model metadata, Resource column) { Models.getPropertyString(metadata, column, CSVW.GROUP_CHAR).ifPresent(v -> parser.setGroupChar(v)); // mostly for date formats - Models.getPropertyString(metadata, column, CSVW.FORMAT).ifPresent(v -> parser.setFormat(v)); + getFormat(metadata, column).ifPresent(v -> parser.setFormat(v)); Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueURL(v)); @@ -272,6 +272,24 @@ private IRI getDatatypeIRI(Model metadata, Resource column) { return XSD.valueOf(datatype.stringValue().toUpperCase()).getIri(); } + /** + * Get format string + * + * @param metadata + * @param column + * @return + */ + private Optional getFormat(Model metadata, Resource column) { + Optional val = Models.getProperty(metadata, column, CSVW.DATATYPE); + if (val.isPresent() && val.get().isBNode()) { + val = Models.getProperty(metadata, (Resource) val.get(), CSVW.FORMAT); + if (val.isPresent() && val.get().isLiteral()) { + return Optional.of(val.get().stringValue()); + } + } + return Optional.empty(); + } + /** * Get "about" URL template, to be used to create the subject of the triples * @@ -359,18 +377,20 @@ private void parseCSV(Model 
metadata, RDFHandler handler, URI csvFile, CellParse private CSVReader getCSVReader(Model metadata, Resource table, Reader reader) { CSVParserBuilder parserBuilder = new CSVParserBuilder(); CSVReaderBuilder builder = new CSVReaderBuilder(reader); + builder.withSkipLines(1); - Optional dialect = Models.getProperty(metadata, table, CSVW.DIALECT); - if (dialect.isPresent()) { - Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.DELIMITER) + Optional val = Models.getProperty(metadata, table, CSVW.DIALECT); + if (val.isPresent()) { + Resource dialect = (Resource) val.get(); + Models.getPropertyString(metadata, dialect, CSVW.DELIMITER) .ifPresent(v -> parserBuilder.withSeparator(v.charAt(0))); - Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.HEADER) + Models.getPropertyString(metadata, dialect, CSVW.HEADER) .ifPresent(v -> builder.withSkipLines(v.equalsIgnoreCase("false") ? 0 : 1)); - Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.QUOTE_CHAR) + Models.getPropertyString(metadata, dialect, CSVW.QUOTE_CHAR) .ifPresent(v -> parserBuilder.withQuoteChar(v.charAt(0))); } - return new CSVReaderBuilder(reader).withCSVParser(parserBuilder.build()).build(); + return builder.withCSVParser(parserBuilder.build()).build(); } /** diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java index bafee06f24..fb8d491ef8 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java @@ -28,7 +28,7 @@ public class CellParserDate extends CellParser { @Override public void setFormat(String format) { super.setFormat(format); - System.err.println(format); + System.err.println("format = " + format); formatter = DateTimeFormatter.ofPattern(format); } @@ -39,6 +39,7 @@ public Value parse(String cell) { if 
(formatter != null) { s = DateTimeFormatter.ISO_DATE.format(formatter.parse(s)); } + System.err.println("date = " + s); return Values.literal(s, dataType); } diff --git a/core/rio/csvw/src/test/resources/painters.csv b/core/rio/csvw/src/test/resources/painters.csv index 4c3a87048c..096a2eb690 100644 --- a/core/rio/csvw/src/test/resources/painters.csv +++ b/core/rio/csvw/src/test/resources/painters.csv @@ -1,4 +1,4 @@ -"wikidata_id","first_name","last_name,country_id","country_name_nl","country_name_en","date_of_birth","married","languages" +"wikidata_id","first_name","last_name","country_id","country_name_nl","country_name_en","date_of_birth","married","languages" "Q5582","Vincent","van Gogh","Q29999","Nederland","The Netherlands","30/3/1853","No","dutch french" "Q164712","Paul","Delvaux","Q31","België","Belgium","23/9/1897","Yes","french" "Q46408","Georgia","O'Keeffe","Q30","Verenigde Staten","United States","15/11/1887","Yes","english"