From a56cb4b40345fbe92f36b3604d912b3af89fdc51 Mon Sep 17 00:00:00 2001 From: Bart Hanssens Date: Thu, 11 Jul 2024 11:49:33 +0100 Subject: [PATCH] GH-5058: additional parser code (WIP) --- .../eclipse/rdf4j/model/vocabulary/CSVW.java | 16 ++++ .../eclipse/rdf4j/rio/csvw/CSVWParser.java | 38 +++++---- .../rdf4j/rio/csvw/parsers/CellParser.java | 78 +++++++++++++------ .../rio/csvw/parsers/CellParserBoolean.java | 6 +- .../rio/csvw/parsers/CellParserDate.java | 7 +- .../rio/csvw/parsers/CellParserDouble.java | 38 +++++++++ .../rio/csvw/parsers/CellParserFactory.java | 27 +++++-- .../rio/csvw/parsers/CellParserLOng.java | 44 +++++++++++ .../rio/csvw/parsers/CellParserString.java | 43 ++++++++++ .../rdf4j/rio/csvw/CSVWParserTest.java | 11 +-- 10 files changed, 248 insertions(+), 60 deletions(-) create mode 100644 core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDouble.java create mode 100644 core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserLOng.java create mode 100644 core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserString.java diff --git a/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java b/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java index bc96966dfd..827477f1cd 100644 --- a/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java +++ b/core/model-vocabulary/src/main/java/org/eclipse/rdf4j/model/vocabulary/CSVW.java @@ -51,15 +51,24 @@ public class CSVW { /** csvw:datatype */ public static final IRI DATATYPE; + /** csvw:decimalChar */ + public static final IRI DECIMAL_CHAR; + /** csvw:default */ public static final IRI DEFAULT; + /** csvw:delimiter */ + public static final IRI DELIMITER; + /** csvw:dialect */ public static final IRI DIALECT; /** csvw:format */ public static final IRI FORMAT; + /** csvw:groupChar */ + public static final IRI GROUP_CHAR; + /** csvw:header */ public static final IRI HEADER; @@ 
-90,14 +99,20 @@ public class CSVW { /** csvw:valueUrl */ public static final IRI VALUE_URL; + /** csvw:virtual */ + public static final IRI VIRTUAL; + static { ABOUT_URL = Vocabularies.createIRI(NAMESPACE, "aboutUrl"); BASE = Vocabularies.createIRI(NAMESPACE, "base"); COLUMN = Vocabularies.createIRI(NAMESPACE, "column"); DATATYPE = Vocabularies.createIRI(NAMESPACE, "datatype"); + DECIMAL_CHAR = Vocabularies.createIRI(NAMESPACE, "decimalChar"); DEFAULT = Vocabularies.createIRI(NAMESPACE, "default"); + DELIMITER = Vocabularies.createIRI(NAMESPACE, "delimiter"); DIALECT = Vocabularies.createIRI(NAMESPACE, "dialect"); FORMAT = Vocabularies.createIRI(NAMESPACE, "format"); + GROUP_CHAR = Vocabularies.createIRI(NAMESPACE, "groupChar"); HEADER = Vocabularies.createIRI(NAMESPACE, "header"); LANG = Vocabularies.createIRI(NAMESPACE, "lang"); NAME = Vocabularies.createIRI(NAMESPACE, "name"); @@ -108,5 +123,6 @@ public class CSVW { TITLE = Vocabularies.createIRI(NAMESPACE, "title"); URL = Vocabularies.createIRI(NAMESPACE, "url"); VALUE_URL = Vocabularies.createIRI(NAMESPACE, "valueUrl"); + VIRTUAL = Vocabularies.createIRI(NAMESPACE, "virtual"); } } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java index ab0cbce216..62a1fc08ef 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/CSVWParser.java @@ -214,20 +214,21 @@ private CellParser getCellParser(Model metadata, Resource column) { CellParser parser = CellParserFactory.create(datatype); + Models.getPropertyString(metadata, column, CSVW.LANG).ifPresent(v -> parser.setLang(v)); getFormat(metadata, column).ifPresent(v -> parser.setFormat(v.stringValue())); - Models.getProperty(metadata, column, CSVW.NAME) - .ifPresentOrElse(v -> parser.setName(v.stringValue()), - () -> new RDFParseException("Metadata file does not contain name for 
column " + column)); - - Models.getProperty(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v.stringValue())); - Models.getProperty(metadata, column, CSVW.REQUIRED) - .ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v.stringValue()))); - Models.getProperty(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueURL(v.stringValue())); + Models.getPropertyString(metadata, column, CSVW.NAME) + .ifPresentOrElse(v -> parser.setName(v), + () -> { throw new RDFParseException("Metadata file does not contain name for column " + column); }); + + Models.getPropertyString(metadata, column, CSVW.DEFAULT).ifPresent(v -> parser.setDefaultValue(v)); + Models.getPropertyString(metadata, column, CSVW.REQUIRED) + .ifPresent(v -> parser.setIsRequired(Boolean.parseBoolean(v))); + Models.getPropertyString(metadata, column, CSVW.VALUE_URL).ifPresent(v -> parser.setValueURL(v)); // use a property from a vocabulary as predicate, or create a property relative to the namespace of the CSV - Optional propertyURL = Models.getProperty(metadata, column, CSVW.PROPERTY_URL); - String s = propertyURL.isPresent() ? propertyURL.get().stringValue() : "_local:" + parser.getName(); + Optional<String> propertyURL = Models.getPropertyString(metadata, column, CSVW.PROPERTY_URL); + String s = propertyURL.isPresent() ?
propertyURL.get() : "_local:" + parser.getName(); parser.setPropertyURL(metadata.getNamespaces(), s); return parser; @@ -260,7 +261,7 @@ private IRI getDatatypeIRI(Model metadata, Resource column) { } /** - * Get IRI of base or derived datatype + * Get name of the generic datatype or more specific datatype * * @param metadata * @param column @@ -272,7 +273,8 @@ private Optional getFormat(Model metadata, Resource column) { Value datatype = val.get(); // derived datatype if (datatype.isBNode()) { - val = Models.getProperty(metadata, (Resource) datatype, CSVW.FORMAT); + Optional<Value> fmt = Models.getProperty(metadata, (Resource) datatype, CSVW.FORMAT); + val = fmt.filter(Value::isResource).map(f -> Models.getProperty(metadata, (Resource) f, CSVW.BASE)).orElse(fmt); } } return val; @@ -327,10 +329,11 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse String placeholder = (aboutIndex > -1) ? cellParsers[aboutIndex].getName() : null; LOGGER.info("Parsing {}", csvFile); + long line = 0; try (InputStream is = csvFile.toURL().openStream(); BufferedReader reader = new BufferedReader(new InputStreamReader(is)); - CSVReader csv = getCSVReader(metadata, reader)) { + CSVReader csv = getCSVReader(metadata, table, reader)) { String[] cells; while ((cells = csv.readNext()) != null) { @@ -359,8 +362,15 @@ private void parseCSV(Model metadata, RDFHandler handler, URI csvFile, CellParse * @param reader * @return */ - private CSVReader getCSVReader(Model metadata, Reader reader) { + private CSVReader getCSVReader(Model metadata, Resource table, Reader reader) { CSVParser parser = new CSVParserBuilder().build(); + CSVReaderBuilder builder = new CSVReaderBuilder(reader); + + Optional<Value> dialect = Models.getProperty(metadata, table, CSVW.DIALECT); + if (dialect.isPresent()) { + Models.getPropertyString(metadata, (Resource) dialect.get(), CSVW.DELIMITER); + } + return new CSVReaderBuilder(reader).withSkipLines(1).withCSVParser(parser).build(); } diff --git
a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java index 433ece7435..99e99ca902 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParser.java @@ -10,29 +10,29 @@ *******************************************************************************/ package org.eclipse.rdf4j.rio.csvw.parsers; -import java.time.format.DateTimeFormatter; import java.util.Set; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Namespace; import org.eclipse.rdf4j.model.Value; -import org.eclipse.rdf4j.model.util.Literals; import org.eclipse.rdf4j.model.util.Values; -import org.eclipse.rdf4j.rio.RDFParseException; /** * * @author Bart Hanssens */ -public class CellParser { - private String name; +public abstract class CellParser { + protected String name; protected IRI dataType; + protected String lang; protected String defaultValue; - private boolean isRequired; - private IRI propertyIRI; - private String valueUrl; - private String format; - private String separator; + protected boolean isRequired; + protected IRI propertyIRI; + protected String valueUrl; + protected String format; + protected String decimalChar; + protected String groupChar; + protected String separator; /** * @param name @@ -55,6 +55,15 @@ public void setDataType(IRI dataType) { this.dataType = dataType; } + /** + * Set language code + * + * @param lang language code + */ + public void setLang(String lang) { + this.lang = lang; + } + /** * @param defaultValue the defaultValue to set */ @@ -123,6 +132,35 @@ public void setSeparator(String separator) { this.separator = separator; } + /** + * @return the decimal character + */ + public String getDecimalChar() { + return decimalChar; + } + + /** + * @param decimalChar the decimal character to set + */ + public void setDecimalChar(String 
decimalChar) { + this.decimalChar = decimalChar; + } + + + /** + * @return the group character + */ + public String getGroupChar() { + return groupChar; + } + + /** + * @param groupChar the group character to set + */ + public void setGroupChar(String groupChar) { + this.groupChar = groupChar; + } + /** * @param format */ @@ -130,23 +168,19 @@ public void setFormat(String format) { this.format = format; } + protected String getValueOrDefault(String s) { + if ((s == null || s.isEmpty()) && (defaultValue != null)) { + return defaultValue; + } + return s; + } + /** * Get the value from a cell * * @param cell * @return */ - public Value parse(String cell) { - String s = cell; - if ((s == null || s.isEmpty()) && (defaultValue != null)) { - s = defaultValue; - } - if (valueUrl != null && s != null) { - return Values.iri(valueUrl.replace("{" + name + "}", s)); - } - System.err.println(s); - System.err.println(dataType); - return Values.literal(s, dataType); - } + public abstract Value parse(String cell); } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java index 41a4094a90..c99ce66e5d 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserBoolean.java @@ -34,10 +34,8 @@ public void setFormat(String format) { @Override public Value parse(String cell) { - String s = cell; - if ((s == null || s.isEmpty()) && (defaultValue != null)) { - s = defaultValue; - } + String s = getValueOrDefault(cell); + return Values.literal(valueTrue.equals(s) ? 
"true" : "false", dataType); } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java index 82e53871b6..e7416c2cc5 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDate.java @@ -11,6 +11,7 @@ package org.eclipse.rdf4j.rio.csvw.parsers; import java.time.format.DateTimeFormatter; + import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.model.util.Values; @@ -32,10 +33,8 @@ public void setFormat(String format) { @Override public Value parse(String cell) { - String s = cell; - if ((s == null || s.isEmpty()) && (defaultValue != null)) { - s = defaultValue; - } + String s = getValueOrDefault(cell); + if (formatter != null) { s = DateTimeFormatter.ISO_DATE.format(formatter.parse(s)); } diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDouble.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDouble.java new file mode 100644 index 0000000000..3000787e80 --- /dev/null +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserDouble.java @@ -0,0 +1,38 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + *******************************************************************************/ +package org.eclipse.rdf4j.rio.csvw.parsers; + +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.util.Values; + +/** + * + * @author Bart Hanssens + */ +public class CellParserDouble extends CellParser { + + @Override + public Value parse(String cell) { + String s = getValueOrDefault(cell); + + if (s != null && groupChar != null) { + s = s.replace(groupChar, ""); + } + + // always use a '.' in RDF, not the European-style ',' + if (s != null && decimalChar != null && !decimalChar.equals(".")) { + s = s.replace(decimalChar, "."); + } + + return Values.literal(s, dataType); + } + +} diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java index 053df5bc5c..ce6c9e3c95 100644 --- a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserFactory.java @@ -27,20 +27,31 @@ public class CellParserFactory { public static CellParser create(IRI datatype) { CellParser p; - XSD xsdType = XSD.valueOf(datatype.toString()); + XSD xsdType = java.util.Arrays.stream(XSD.values()).filter(x -> x.name().equalsIgnoreCase(datatype.getLocalName())).findFirst().orElse(null); if (xsdType == null) { - p = new CellParser(); + p = new CellParserString(); } else { - switch(xsdType) { - case DATE: - case DATETIME: - p = new CellParserDate(); - break; + switch (xsdType) { case BOOLEAN: p = new CellParserBoolean(); break; + case INTEGER: + case INT: + case SHORT: + case LONG: + p = new CellParserLong(); + break; + case FLOAT: + case DOUBLE: + p = new CellParserDouble(); + p.setDecimalChar("."); + break; + case DATE: + case DATETIME: + p = new CellParserDate(); + break; default: - p = new CellParser(); + p = new CellParserString(); } } p.setDataType(datatype); diff --git
a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserLong.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserLong.java new file mode 100644 index 0000000000..41a4094a90 --- /dev/null +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserLong.java @@ -0,0 +1,38 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + *******************************************************************************/ +package org.eclipse.rdf4j.rio.csvw.parsers; + +import java.util.Set; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Namespace; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.util.Values; + +/** + * Cell parser for the integer datatypes (INTEGER, INT, SHORT, LONG) selected by CellParserFactory. + * @author Bart Hanssens + */ +public class CellParserLong extends CellParser { + + @Override + public Value parse(String cell) { + String s = getValueOrDefault(cell); + + // remove the grouping (thousands) character, if one was configured, before building the literal + if (s != null && groupChar != null) { + s = s.replace(groupChar, ""); + } + + return Values.literal(s, dataType);
+ } + +} diff --git a/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserString.java b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserString.java new file mode 100644 index 0000000000..c079156df3 --- /dev/null +++ b/core/rio/csvw/src/main/java/org/eclipse/rdf4j/rio/csvw/parsers/CellParserString.java @@ -0,0 +1,38 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + *******************************************************************************/ +package org.eclipse.rdf4j.rio.csvw.parsers; + +import java.util.Set; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Namespace; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.util.Values; + +/** + * Default cell parser: returns an IRI built from the valueUrl template when one is set, otherwise a language-tagged or datatyped literal. + * @author Bart Hanssens + */ +public class CellParserString extends CellParser { + @Override + public Value parse(String cell) { + String s = getValueOrDefault(cell); + if (valueUrl != null && s != null) { + return Values.iri(valueUrl.replace("{" + name + "}", s)); + } + + if (lang != null) { + return Values.literal(s, lang); + } + return Values.literal(s, dataType); + } + +} diff --git a/core/rio/csvw/src/test/java/org/eclipse/rdf4j/rio/csvw/CSVWParserTest.java b/core/rio/csvw/src/test/java/org/eclipse/rdf4j/rio/csvw/CSVWParserTest.java index 8bf0d7c022..ad43114c3d 100644 --- a/core/rio/csvw/src/test/java/org/eclipse/rdf4j/rio/csvw/CSVWParserTest.java +++
b/core/rio/csvw/src/test/java/org/eclipse/rdf4j/rio/csvw/CSVWParserTest.java @@ -10,22 +10,15 @@ *******************************************************************************/ package org.eclipse.rdf4j.rio.csvw; -import static org.mockserver.model.HttpRequest.request; -import static org.mockserver.model.HttpResponse.response; - import java.io.FileInputStream; import java.io.IOException; import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.model.impl.LinkedHashModel; -import org.eclipse.rdf4j.rio.RDFFormat; -import org.eclipse.rdf4j.rio.Rio; import org.eclipse.rdf4j.rio.helpers.BasicWriterSettings; import org.eclipse.rdf4j.rio.helpers.StatementCollector; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; -import org.mockserver.client.MockServerClient; import org.mockserver.junit.jupiter.MockServerExtension; /** @@ -43,6 +36,8 @@ public void testCSVWParser() throws IOException { parser.getParserConfig().set(BasicWriterSettings.BASE_DIRECTIVE, true); parser.parse(new FileInputStream("src/test/resources/painters-metadata.json"), getBase() + "/downloads/"); - System.err.println(model); + model.forEach(s -> { + System.err.println(s); + }); } }