diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java index 2d6a554a..afff935c 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java @@ -7,7 +7,7 @@ import org.apache.commons.collections4.map.LRUMap; -import com.github.romualdrousseau.any2json.base.ModelData; +import com.github.romualdrousseau.any2json.modeldata.EmptyModelData; import com.github.romualdrousseau.shuju.preprocessing.Text; import com.github.romualdrousseau.shuju.preprocessing.comparer.RegexComparer; import com.github.romualdrousseau.shuju.types.Tensor; @@ -17,7 +17,7 @@ public class Model { public static final ThreadLocal Default = new ThreadLocal<>() { @Override protected Model initialValue() { - return new ModelBuilder().build(); + return new Model(EmptyModelData.empty()); } }; @@ -27,46 +27,46 @@ public Model(final ModelData modelData) { public Model(final ModelData modelData, final Map modelAttributes) { this.modelData = modelData; - this.attributes = modelAttributes; - this.entities = modelData.getList("entities"); - this.patterns = modelData.getMap("patterns"); - this.filters = modelData.getList("filters"); - this.pivotEntities = modelData.getList("pivotEntityList"); - this.tags = modelData.getList("tags"); - this.requiredTags = modelData.getList("requiredTags"); - this.comparer = new RegexComparer(this.patterns); + this.modelAttributes = modelAttributes; + this.entityList = modelData.getList("entities"); + this.patternMap = modelData.getMap("patterns"); + this.filterList = modelData.getList("filters"); + this.pivotEntityList = modelData.getList("pivotEntityList"); + this.tagList = modelData.getList("tags"); + this.requiredTagList = modelData.getList("requiredTags"); + this.comparer = new RegexComparer(this.patternMap); } public ModelData getData() { return modelData; } - public Map getAttributes() { - return this.attributes; + public Map getModelAttributes() { + return this.modelAttributes; } public List getEntityList() { - return this.entities; + return this.entityList; } public Map getPatternMap() { - return this.patterns; + return this.patternMap; } - public List getFilters() { - return this.filters; + public List getFilterList() { + return this.filterList; } public List getPivotEntityList() { - return this.pivotEntities; + return this.pivotEntityList; } public List getTagList() { - return this.tags; + return this.tagList; } public List getRequiredTagList() { - return this.requiredTags; + return this.requiredTagList; } public String toEntityName(final String value) { @@ -87,17 +87,17 @@ public Optional toEntityValue(final String value, final String entityNam public Tensor toEntityVector(final String value) { return this.toEntityVectorCache.computeIfAbsent(value, v -> Tensor - .of(Text.to_categorical(v, this.entities, this.comparer).stream().mapToDouble(x -> x).toArray())); + .of(Text.to_categorical(v, this.entityList, this.comparer).stream().mapToDouble(x -> x).toArray())); } private final ModelData modelData; - private final Map attributes; - private final List entities; - private final Map patterns; - private final List filters; - private final List pivotEntities; - private final List tags; - private final List requiredTags; + private final Map modelAttributes; + private final List entityList; + private final Map patternMap; + private final List filterList; + private final List pivotEntityList; + private final List tagList; + private final List requiredTagList; private final RegexComparer comparer; private final LRUMap> toEntityValueCache = new LRUMap<>(); diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/ModelBuilder.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/ModelBuilder.java deleted file mode 100644 index 3054f63f..00000000 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/ModelBuilder.java +++ /dev/null @@ -1,149 +0,0 @@ -package com.github.romualdrousseau.any2json; - -import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.file.Path; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import com.github.romualdrousseau.any2json.base.ModelData; - -public class ModelBuilder { - - public ModelBuilder() { - this.reset(); - } - - public ModelBuilder reset() { - this.modelData = ModelData.empty(); - this.entities = Collections.emptyList(); - this.patterns = Collections.emptyMap(); - this.filters = Collections.emptyList(); - this.pivotEntities = Collections.emptyList(); - this.tags = Collections.emptyList(); - this.requiredTags = Collections.emptyList(); - this.tableParser = null; - this.tagClassifier = null; - return this; - } - - public ModelBuilder fromModelData(final ModelData modelData) { - this.modelData = modelData; - this.entities = modelData.getList("entities"); - this.patterns = modelData.getMap("patterns"); - this.filters = modelData.getList("filters"); - this.pivotEntities = modelData.getList("pivotEntityList"); - this.tags = modelData.getList("tags"); - this.requiredTags = modelData.getList("requiredTags"); - return this; - } - - public ModelBuilder fromResource(final Class clazz, final String resourceName) - throws IOException, URISyntaxException { - return this.fromModelData(ModelData.loadFromResource(clazz, resourceName)); - } - - public ModelBuilder fromPath(final Path path) { - return this.fromModelData(ModelData.loadFromPath(path)); - } - - public ModelBuilder fromURI(final String uri) throws IOException, InterruptedException { - return this.fromModelData(ModelData.loadFromWebURL(uri)); - } - - public List getEntityList() { - return this.entities; - } - - public ModelBuilder setEntityList(final List entities) { - this.entities = entities; - return this; - } - - public Map getPatternMap() { - return this.patterns; - } - - public ModelBuilder setPatternMap(final Map patterns) { - this.patterns = patterns; - return this; - } - - public List getFilters() { - return this.filters; - } - - public ModelBuilder setFilters(final List filters) { - this.filters = filters; - return this; - } - - public List getPivotEntityList() { - return this.pivotEntities; - } - - public ModelBuilder setPivotEntityList(final List pivotEntities) { - this.pivotEntities = pivotEntities; - return this; - } - - public List getTagList() { - return this.tags; - } - - public ModelBuilder setTagList(final List tags) { - this.tags = tags; - return this; - } - - public List getRequiredTagList() { - return this.requiredTags; - } - - public ModelBuilder setRequiredTagList(final List requiredTags) { - this.requiredTags = requiredTags; - return this; - } - - public ModelBuilder setTableParser(final TableParser tableParser) { - this.tableParser = tableParser; - return this; - } - - public ModelBuilder setTagClassifier(final TagClassifier tagClassifier) { - this.tagClassifier = tagClassifier; - return this; - } - - public Model build() { - this.updateModelData(); - final var model = new Model(this.modelData); - if (this.tableParser != null) { - this.tableParser.setModel(model); - } - if (this.tagClassifier != null) { - this.tagClassifier.setModel(model); - } - return model; - } - - private void updateModelData() { - this.modelData.setList("entities", this.entities); - this.modelData.setMap("patterns", this.patterns); - this.modelData.setList("filters", this.filters); - this.modelData.setList("pivotEntityList", this.pivotEntities); - this.modelData.setList("tags", this.tags); - this.modelData.setList("requiredTags", this.requiredTags); - } - - private ModelData modelData; - private List entities; - private Map patterns; - private List filters; - private List pivotEntities; - private List tags; - private List requiredTags; - private TableParser tableParser; - private TagClassifier tagClassifier; -} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/ModelData.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/ModelData.java new file mode 100644 index 00000000..4d22ba7f --- /dev/null +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/ModelData.java @@ -0,0 +1,23 @@ +package com.github.romualdrousseau.any2json; + +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +public interface ModelData { + + Optional get(String key); + + ModelData set(String key, T value); + + List getList(String key); + + ModelData setList(String key, List values); + + Map getMap(String key); + + ModelData setMap(String key, Map values); + + void save(Path path); +} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/Sheet.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/Sheet.java index 4814d73e..8bf42adc 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/Sheet.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/Sheet.java @@ -12,6 +12,8 @@ public interface Sheet { int getLastColumnNum(); + void applyTransformations(); + Optional getTableGraph(); Optional getTable(); diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/TagClassifier.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/TagClassifier.java index ad977b5b..d202605a 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/TagClassifier.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/TagClassifier.java @@ -14,11 +14,15 @@ enum TagStyle { TagClassifier setModel(final Model model); - String predict(final String name, final List entities, final List context); + String predict(final Table table, final Header header); TagClassifier.TagStyle getTagStyle(); TagClassifier setTagStyle(final TagClassifier.TagStyle mode); + List getLexicon(); + + TagClassifier setLexicon(final List lexion); + String ensureTagStyle(final String text); } diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java index 21fa3ef0..085a53e6 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java @@ -16,6 +16,7 @@ import com.github.romualdrousseau.any2json.parser.table.SimpleTableParser; import com.github.romualdrousseau.any2json.readdir.GutenbergDiagonal; import com.github.romualdrousseau.any2json.transform.op.StitchRows; +import com.github.romualdrousseau.shuju.strings.StringUtils; public abstract class BaseDocument implements Document { @@ -87,6 +88,9 @@ public Document setRawHints(final EnumSet hints) { @Override public String getRecipe() { + if (StringUtils.isBlank(this.recipe)) { + this.recipe = String.join("\n", this.model.getData().getList("recipe")); + } return this.recipe; } @@ -143,19 +147,25 @@ public void updateParsersAndClassifiers() { final var capa = this.getIntelliCapabilities(); if (capa.contains(Document.Hint.INTELLI_EXTRACT) && this.hints.contains(Document.Hint.INTELLI_EXTRACT)) { - this.sheetParser = new SheetBitmapParser(); + if (this.sheetParser instanceof SimpleSheetParser) { + this.sheetParser = new SheetBitmapParser(); + } } if (capa.contains(Document.Hint.INTELLI_LAYOUT) && this.hints.contains(Document.Hint.INTELLI_LAYOUT)) { - this.tableParser = DynamicPackages.GetElementParserFactory() - .map(x -> x.newInstance(this.model, this.tableParser.getParserOptions())) - .orElseGet(() -> new SimpleTableParser(this.model, null)); + if (this.tableParser instanceof SimpleTableParser) { + this.tableParser = DynamicPackages.GetElementParserFactory() + .map(x -> x.newInstance(this.model, this.tableParser.getParserOptions())) + .orElseGet(() -> new SimpleTableParser(this.model, null)); + } } if (capa.contains(Document.Hint.INTELLI_TAG) && this.hints.contains(Document.Hint.INTELLI_TAG)) { - this.tagClassifier = DynamicPackages.GetTagClassifierFactory() - .map(x -> x.newInstance(this.model, this.tagClassifier.getTagStyle())) - .orElseGet(() -> new SimpleTagClassifier(this.model, this.tagClassifier.getTagStyle())); + if (this.tagClassifier instanceof SimpleTagClassifier) { + this.tagClassifier = DynamicPackages.GetTagClassifierFactory() + .map(x -> x.newInstance(this.model, this.tagClassifier.getTagStyle())) + .orElseGet(() -> new SimpleTagClassifier(this.model, this.tagClassifier.getTagStyle())); + } } } diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java index 6fa069a0..814e77a0 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java @@ -72,6 +72,18 @@ public void addSheetListener(final SheetListener listener) { this.listeners.add(listener); } + @Override + public void applyTransformations() { + if (this.transfoApplied) { + return; + } + if (this.getLastRowNum() <= 0 || this.getLastColumnNum() <= 0) { + return; + } + TransformableSheet.of(this).applyAll(); + this.transfoApplied = true; + } + @Override public Optional getTableGraph() { @@ -81,9 +93,9 @@ public Optional getTableGraph() { return Optional.empty(); } - // Apply recipes + // Apply transformations - TransformableSheet.of(this).applyAll(); + this.applyTransformations(); if (!this.notifyStepCompleted(new SheetPreparedEvent(this))) { return Optional.empty(); } @@ -418,6 +430,7 @@ private int computeLastColumnNum() { private final List columnMask; private final int storeLastColumnNum; + private boolean transfoApplied = false; private boolean unmergedAll = false; private float capillarityThreshold = Settings.DEFAULT_CAPILLARITY_THRESHOLD; private boolean pivotEnabled; diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/ModelData.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/ModelData.java deleted file mode 100644 index ebb1677b..00000000 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/ModelData.java +++ /dev/null @@ -1,106 +0,0 @@ -package com.github.romualdrousseau.any2json.base; - -import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; -import java.net.URL; -import java.net.http.HttpClient; -import java.net.http.HttpRequest; -import java.net.http.HttpResponse; -import java.nio.file.Path; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import com.github.romualdrousseau.shuju.json.JSON; -import com.github.romualdrousseau.shuju.json.JSONObject; - -public class ModelData { - - public static ModelData empty() { - return new ModelData(JSON.newObject()); - } - - public static ModelData loadFromJSON(final JSONObject jsonObject) { - return new ModelData(jsonObject); - } - - public static ModelData loadFromResource(final Class clazz, final String resourceName) - throws IOException, URISyntaxException { - final URL resourceUrl = clazz.getResource(resourceName); - if (resourceUrl == null) { - throw new IOException("Error loading model"); - } - return new ModelData(JSON.loadObject(Path.of(resourceUrl.toURI()))); - } - - public static ModelData loadFromPath(final Path path) { - return new ModelData(JSON.loadObject(path)); - } - - public static ModelData loadFromWebURL(final String url) throws IOException, InterruptedException { - final var client = HttpClient.newHttpClient(); - final var request = HttpRequest.newBuilder().uri(URI.create(url)).build(); - final var response = client.send(request, HttpResponse.BodyHandlers.ofString()); - if (response.statusCode() != 200) { - throw new IOException("Error loading model"); - } - return new ModelData(JSON.objectOf(response.body())); - } - - private ModelData(final JSONObject backstore) { - this.backstore = backstore; - } - - public boolean hasKey(final String key) { - return this.backstore.get(key).isPresent(); - } - - public int getInt(final String key) { - return this.backstore.getInt(key); - } - - public void setInt(final String key, final int value) { - this.backstore.setInt(key, value); - } - - public void setString(final String key, final String value) { - this.backstore.setString(key, value); - } - - public String getString(final String key) { - return this.backstore.getString(key); - } - - public List getList(final String key) { - return JSON.streamOf(this.backstore.getArray(key)).collect(Collectors.toUnmodifiableList()); - } - - public List getMutableList(final String key) { - return JSON.streamOf(this.backstore.getArray(key)).collect(Collectors.toList()); - } - - public void setList(final String key, final List values) { - this.backstore.setArray(key, JSON.arrayOf(values)); - } - - public Map getMap(final String key) { - return JSON.streamOf(this.backstore.getArray(key)) - .collect(Collectors.toUnmodifiableMap(x -> x.getString("key"), x -> x.getString("value"))); - } - - public Map getMutableMap(final String key) { - return JSON.streamOf(this.backstore.getArray(key)) - .collect(Collectors.toMap(x -> x.getString("key"), x -> x.getString("value"))); - } - - public void setMap(final String key, final Map values) { - this.backstore.setArray(key, JSON.arrayOf(values)); - } - - public void save(final Path path) { - JSON.saveObject(this.backstore, path); - } - - private final JSONObject backstore; -} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java index 504b6964..add58b30 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java @@ -3,7 +3,9 @@ import java.util.List; import java.util.regex.Pattern; +import com.github.romualdrousseau.any2json.Header; import com.github.romualdrousseau.any2json.Model; +import com.github.romualdrousseau.any2json.Table; import com.github.romualdrousseau.any2json.TagClassifier; import com.github.romualdrousseau.shuju.preprocessing.tokenizer.ShingleTokenizer; import com.github.romualdrousseau.shuju.strings.StringUtils; @@ -18,10 +20,10 @@ public SimpleTagClassifier(final Model model, final TagClassifier.TagStyle tagSt this.model = model; this.tagStyle = tagStyle; - final List lexicon = (model != null && model.getData().hasKey("lexicon")) + this.lexicon =(model != null && model.getData().get("lexicon").isPresent()) ? model.getData().getList("lexicon") : StringUtils.getSymbols().stream().toList(); - this.tagTokenizer = new ShingleTokenizer(lexicon, 1); + this.tagTokenizer = new ShingleTokenizer(this.getLexicon(), 1); } @Override @@ -56,6 +58,17 @@ public TagClassifier setTagStyle(final TagClassifier.TagStyle mode) { return this; } + @Override + public List getLexicon() { + return lexicon; + } + + @Override + public TagClassifier setLexicon(final List lexicon) { + this.lexicon = lexicon; + return this; + } + @Override public String ensureTagStyle(final String text) { if (this.tagStyle == TagClassifier.TagStyle.SNAKE) { @@ -76,7 +89,8 @@ public String ensureTagStyle(final String text) { } @Override - public String predict(String name, List entities, List context) { + public String predict(final Table table, final Header header) { + final var name = header.getName(); final var m = pattern.matcher(name); if (m.find()) { return m.group(1); @@ -90,4 +104,5 @@ public String predict(String name, List entities, List context) private Model model; private TagClassifier.TagStyle tagStyle; + private List lexicon; } diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java index 3f529dc6..bb120e7a 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java @@ -41,7 +41,7 @@ public String getValue() { } @Override - public List entities() { + public Iterable entities() { if (this.entities == null) { this.entities = (this.getColumnIndex() < 0 || this.getColumnIndex() >= this.getTable().getNumberOfColumns()) ? Collections.emptyList() @@ -74,8 +74,7 @@ public void updateTag() { this.tag = HeaderTag.None; } else { final var classifier = this.getTable().getSheet().getDocument().getTagClassifier(); - final var context = this.getTable().getHeaderNames(); - final String tagValue = classifier.predict(this.getName(), this.entities(), context); + final String tagValue = classifier.predict(this.getTable(), this); this.tag = new HeaderTag(tagValue); } } diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java index c7261e7f..6fbc717b 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Optional; +import java.util.stream.StreamSupport; import com.github.romualdrousseau.any2json.Row; import com.github.romualdrousseau.any2json.base.BaseCell; @@ -22,7 +23,7 @@ public IntelliHeader(final BaseHeader header, final boolean disableAutoName) { if (header.isColumnEmpty()) { this.name = ""; } else { - this.name = this.entities().stream().findAny().map(x -> this.getEntitiesAsString()) + this.name = StreamSupport.stream(this.entities().spliterator(), false).findAny().map(x -> this.getEntitiesAsString()) .orElse(Settings.PIVOT_VALUE_SUFFIX); } } else if (this.isPivotHeader() || !disableAutoName) { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelBuilder.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelBuilder.java new file mode 100644 index 00000000..b17bf5cb --- /dev/null +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelBuilder.java @@ -0,0 +1,82 @@ +package com.github.romualdrousseau.any2json.modeldata; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; + +import com.github.romualdrousseau.any2json.Model; +import com.github.romualdrousseau.shuju.yaml.YAML; +import com.github.romualdrousseau.shuju.yaml.YAMLObject; + +public class DataContractModelBuilder { + + public DataContractModelBuilder() { + this.reset(); + } + + public DataContractModelBuilder reset() { + this.modelData = DataContractModelData.empty(); + this.lexicon = Collections.emptyList(); + return this; + } + + public DataContractModelBuilder fromModelData(final DataContractModelData modelData) { + this.modelData = modelData; + return this; + } + + public DataContractModelBuilder fromYAML(final YAMLObject yamlObject) { + return this.fromModelData(new DataContractModelData(yamlObject)); + } + + public DataContractModelBuilder fromResource(final Class clazz, final String resourceName) + throws IOException, URISyntaxException { + final URL resourceUrl = clazz.getResource(resourceName); + if (resourceUrl == null) { + throw new IOException("Error loading model"); + } + return this.fromModelData(new DataContractModelData(YAML.loadObject(Path.of(resourceUrl.toURI())))); + } + + public DataContractModelBuilder fromPath(final Path path) { + return this.fromModelData(new DataContractModelData(YAML.loadObject(path))); + } + + public DataContractModelBuilder fromURL(final String url) throws IOException, InterruptedException { + final var client = HttpClient.newHttpClient(); + final var request = HttpRequest.newBuilder().uri(URI.create(url)).build(); + final var response = client.send(request, HttpResponse.BodyHandlers.ofString()); + if (response.statusCode() != 200) { + throw new IOException("Error loading model"); + } + return this.fromModelData(new DataContractModelData(YAML.objectOf(response.body()))); + } + + public DataContractModelBuilder setLexicon(final String lexicon) throws IOException, URISyntaxException { + final URL resourceUrl = this.getClass().getResource("/lexicon/" + lexicon + ".json"); + if (resourceUrl == null) { + throw new IOException("Error loading lexicon"); + } + this.lexicon = YAML.loadArray(Path.of(resourceUrl.toURI())).stream().toList(); + return this; + } + + public Model build() { + this.updateModelData(); + return new Model(this.modelData); + } + + private void updateModelData() { + this.modelData.setList("lexicon", this.lexicon); + } + + private DataContractModelData modelData; + private List lexicon; +} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelData.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelData.java new file mode 100644 index 00000000..1fd71521 --- /dev/null +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelData.java @@ -0,0 +1,145 @@ +package com.github.romualdrousseau.any2json.modeldata; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import com.github.romualdrousseau.any2json.ModelData; +import com.github.romualdrousseau.shuju.commons.Pair; +import com.github.romualdrousseau.shuju.yaml.YAML; +import com.github.romualdrousseau.shuju.yaml.YAMLArray; +import com.github.romualdrousseau.shuju.yaml.YAMLObject; + +public class DataContractModelData implements ModelData { + + public static DataContractModelData empty() { + return new DataContractModelData(null); + } + + public DataContractModelData(final YAMLObject backstore) { + this.backstore = backstore; + + if (backstore == null) { + return; + } + + final var entities = this.backstore.get("entities").get(); + + this.entityList = StreamSupport.stream(entities.keys().spliterator(), false).toList(); + + this.patternMap = StreamSupport.stream(entities.keys().spliterator(), false).flatMap(k -> { + final var entity = entities.get(k).get(); + final var patterns = entity.get("patterns").get(); + return patterns.stream().map(v -> new Pair(v, k)); + }).collect(Collectors.toUnmodifiableMap(s -> s.getKey(), s -> s.getValue())); + + this.pivotEntityList = StreamSupport.stream(entities.keys().spliterator(), false).filter(k -> { + final var entity = entities.get(k).get(); + return entity.get("pivot").orElse(false); + }).toList(); + + this.filterList = YAML.queryStream(this.backstore, "extracts.cleanser.filters").toList(); + this.recipe = YAML.queryStream(this.backstore, "extracts.cleanser.recipe").toList(); + + this.metaLayexList = YAML.queryStream(this.backstore, "extracts.parser.meta").toList(); + this.dataLayexList = YAML.queryStream(this.backstore, "extracts.parser.data").toList(); + + final var tags = this.backstore.get("definitions").get(); + + this.tagList = StreamSupport.stream(tags.keys().spliterator(), false).toList(); + + this.requiredTagList = StreamSupport.stream(tags.keys().spliterator(), false).filter(k -> { + final var tag = tags.get(k).get(); + return tag.get("required").orElse(false); + }).toList(); + + this.definitions = List.of(tags.toString(false).split("\n")); + + this.lexicon = Collections.emptyList(); + } + + @Override + public Optional get(final String key) { + return Optional.empty(); + } + + @Override + public ModelData set(final String key, final T value) { + return this; + } + + @Override + public List getList(final String key) { + if ("definitions".equals(key)) { + return this.definitions; + } else if ("recipe".equals(key)) { + return this.recipe; + } else if ("entities".equals(key)) { + return this.entityList; + } else if ("filters".equals(key)) { + return this.filterList; + } else if ("pivotEntityList".equals(key)) { + return this.pivotEntityList; + } else if ("metaLayexes".equals(key)) { + return this.metaLayexList; + } else if ("dataLayexes".equals(key)) { + return this.dataLayexList; + } else if ("tags".equals(key)) { + return this.tagList; + } else if ("requiredTags".equals(key)) { + return this.requiredTagList; + } else if ("lexicon".equals(key)) { + return this.lexicon; + } else { + return Collections.emptyList(); + } + } + + @Override + public ModelData setList(final String key, final List values) { + if ("metaLayexes".equals(key)) { + this.metaLayexList = values; + } else if ("dataLayexes".equals(key)) { + this.dataLayexList = values; + } else if ("lexicon".equals(key)) { + this.lexicon = values; + } + return this; + } + + @Override + public Map getMap(final String key) { + if ("patterns".equals(key)) { + return this.patternMap; + } else { + return Collections.emptyMap(); + } + } + + @Override + public ModelData setMap(final String key, final Map values) { + return this; + } + + @Override + public void save(final Path path) { + throw new UnsupportedOperationException("Unimplemented method 'save'"); + } + + private final YAMLObject backstore; + private List filterList; + private List entityList; + private Map patternMap; + private List pivotEntityList; + private List recipe; + private List metaLayexList; + private List dataLayexList; + private List tagList; + private List requiredTagList; + private List definitions; + private List lexicon; +} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/EmptyModelData.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/EmptyModelData.java new file mode 100644 index 00000000..2fd494ef --- /dev/null +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/EmptyModelData.java @@ -0,0 +1,51 @@ +package com.github.romualdrousseau.any2json.modeldata; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import com.github.romualdrousseau.any2json.ModelData; + +public class EmptyModelData implements ModelData { + + public static EmptyModelData empty() { + return new EmptyModelData(); + } + + @Override + public Optional get(String key) { + return Optional.empty(); + } + + @Override + public ModelData set(String key, T value) { + return this; + } + + @Override + public List getList(final String key) { + return Collections.emptyList(); + } + + @Override + public ModelData setList(final String key, final List values) { + return this; + } + + @Override + public Map getMap(final String key) { + return Collections.emptyMap(); + } + + @Override + public ModelData setMap(final String key, final Map values) { + return this; + } + + @Override + public void save(final Path path) { + throw new UnsupportedOperationException("Unimplemented method 'save'"); + } +} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelBuilder.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelBuilder.java new file mode 100644 index 00000000..d81d54d5 --- /dev/null +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelBuilder.java @@ -0,0 +1,162 @@ +package com.github.romualdrousseau.any2json.modeldata; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import com.github.romualdrousseau.any2json.Model; +import com.github.romualdrousseau.any2json.TableParser; +import com.github.romualdrousseau.any2json.TagClassifier; +import com.github.romualdrousseau.shuju.json.JSON; +import com.github.romualdrousseau.shuju.json.JSONObject; +import com.github.romualdrousseau.shuju.yaml.YAML; + +public class JsonModelBuilder { + + public JsonModelBuilder() { + this.reset(); + } + + public JsonModelBuilder reset() { + this.modelData = JsonModelData.empty(); + this.entities = Collections.emptyList(); + this.patterns = Collections.emptyMap(); + this.filters = Collections.emptyList(); + this.pivotEntities = Collections.emptyList(); + this.tags = Collections.emptyList(); + this.requiredTags = Collections.emptyList(); + this.lexicon = Collections.emptyList(); + this.tableParser = null; + this.tagClassifier = null; + return this; + } + + public JsonModelBuilder fromModelData(final JsonModelData modelData) { + this.modelData = modelData; + this.entities = modelData.getList("entities"); + this.patterns = modelData.getMap("patterns"); + this.filters = modelData.getList("filters"); + this.pivotEntities = modelData.getList("pivotEntityList"); + this.tags = modelData.getList("tags"); + this.requiredTags = modelData.getList("requiredTags"); + this.lexicon = modelData.getList("lexicon"); + return this; + } + + public JsonModelBuilder fromJSON(final JSONObject jsonObject) { + return this.fromModelData(new JsonModelData(jsonObject)); + } + + public JsonModelBuilder fromResource(final Class clazz, final String resourceName) + throws IOException, URISyntaxException { + final URL resourceUrl = clazz.getResource(resourceName); + if (resourceUrl == null) { + throw new IOException("Error loading model"); + } + return this.fromModelData(new JsonModelData(JSON.loadObject(Path.of(resourceUrl.toURI())))); + } + + public JsonModelBuilder fromPath(final Path path) { + return this.fromModelData(new JsonModelData(JSON.loadObject(path))); + } + + public JsonModelBuilder fromURL(final String url) throws IOException, InterruptedException { + final var client = HttpClient.newHttpClient(); + final var request = HttpRequest.newBuilder().uri(URI.create(url)).build(); + final var response = client.send(request, HttpResponse.BodyHandlers.ofString()); + if (response.statusCode() != 200) { + throw new IOException("Error loading model"); + } + return this.fromModelData(new JsonModelData(JSON.objectOf(response.body()))); + } + + public JsonModelBuilder setEntityList(final List entities) { + this.entities = entities; + return this; + } + + public JsonModelBuilder setPatternMap(final Map patterns) { + this.patterns = patterns; + return this; + } + + public JsonModelBuilder setFilters(final List filters) { + this.filters = filters; + return this; + } + + public JsonModelBuilder setPivotEntityList(final List pivotEntities) { + this.pivotEntities = pivotEntities; + return this; + } + + public JsonModelBuilder setTagList(final List tags) { + this.tags = tags; + return this; + } + + public JsonModelBuilder setRequiredTagList(final List requiredTags) { + this.requiredTags = requiredTags; + return this; + } + + public JsonModelBuilder setTableParser(final TableParser tableParser) { + this.tableParser = tableParser; + return this; + } + + public JsonModelBuilder setTagClassifier(final TagClassifier tagClassifier) { + this.tagClassifier = tagClassifier; + return this; + } + + public JsonModelBuilder setLexicon(final String lexicon) throws IOException, URISyntaxException { + final URL resourceUrl = this.getClass().getResource("/lexicon/" + lexicon + ".json"); + if (resourceUrl == null) { + throw new IOException("Error loading lexicon"); + } + this.lexicon = YAML.loadArray(Path.of(resourceUrl.toURI())).stream().toList(); + return this; + } + + public Model build() { + this.updateModelData(); + final var model = new Model(this.modelData); + if (this.tableParser != null) { + this.tableParser.setModel(model); + } + if (this.tagClassifier != null) { + this.tagClassifier.setModel(model); + } + return model; + } + + private void updateModelData() { + this.modelData.setList("entities", this.entities); + this.modelData.setMap("patterns", this.patterns); + this.modelData.setList("filters", this.filters); + this.modelData.setList("pivotEntityList", this.pivotEntities); + this.modelData.setList("tags", this.tags); + this.modelData.setList("requiredTags", this.requiredTags); + this.modelData.setList("lexicon", this.lexicon); + } + + private JsonModelData modelData; + private List entities; + private Map patterns; + private List filters; + private List pivotEntities; + private List tags; + private List requiredTags; + private TableParser tableParser; + private TagClassifier tagClassifier; + private List lexicon; +} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelData.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelData.java new file mode 100644 index 00000000..e8eb5338 --- /dev/null +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelData.java @@ -0,0 +1,68 @@ +package com.github.romualdrousseau.any2json.modeldata; + +import java.nio.file.Path; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import com.github.romualdrousseau.any2json.ModelData; +import com.github.romualdrousseau.shuju.json.JSON; +import com.github.romualdrousseau.shuju.json.JSONArray; +import com.github.romualdrousseau.shuju.json.JSONCollector; +import com.github.romualdrousseau.shuju.json.JSONObject; + +public class JsonModelData implements ModelData { + + public static JsonModelData empty() { + return new JsonModelData(JSON.newObject()); + } + + public JsonModelData(final JSONObject backstore) { + this.backstore = backstore; + } + + @Override + public Optional get(final String key) { + return this.backstore.get(key); + } + + @Override + public ModelData set(final String key, final T value) { + this.backstore.set(key, value); + return this; + } + + @Override + public List getList(final String key) { + return this.backstore.get(key) + .map(x -> x.stream().toList()) + .orElse(Collections.emptyList()); + } + + @Override + public ModelData setList(final String key, final List values) { + this.backstore.set(key, JSON.arrayOf(values)); + return this; + } + + @Override + public Map getMap(final String key) { + return this.backstore.get(key) + .map(x -> x.stream().collect(JSONCollector.toUnmodifiableMap("key", "value"))) + .orElse(Collections.emptyMap()); + } + + @Override + public ModelData setMap(final String key, final Map values) { + this.backstore.set(key, JSON.arrayOf(values)); + return this; + } + + @Override + public void save(final Path path) { + JSON.saveObject(this.backstore, path); + } + + private final JSONObject backstore; +} diff --git a/any2json/src/main/resources/lexicon/english.json b/any2json/src/main/resources/lexicon/english.json new file mode 100644 index 00000000..554c89da --- /dev/null +++ b/any2json/src/main/resources/lexicon/english.json @@ -0,0 +1,126 @@ +[ + "account", + "accrual", + "address,addresses,addr", + "amount", + "area", + "atc", + "audit", + "base", + "basic", + "batch", + "begin,beg", + "billing,bill", + "bonus", + "brand", + "brick", + "calculation", + "campaign", + "category", + "center", + "channel", + "charge", + "chinese,chin", + "city", + "client", + "code", + "company", + "condition", + "cost", + "counting", + "country", + "credit", + "cube", + "currency", + "customer,cust", + "date", + "delivery", + "depot", + "description,desc", + "direct", + "discount", + "district", + "division", + "end", + "english,eng", + "ethical", + "exchange", + "expiration", + "expiry", + "factor", + "flavour", + "general", + "gimmick", + "gross", + "group", + "header", + "indicator", + "industry", + "instruction", + "invoice,inv", + "item", + "key", + "line", + "list", + "local", + "location", + "manufacture", + "margin", + "material", + "memo", + "month", + "name", + "number", + "old", + "order", + "other", + "package,pack", + "payer", + "payment", + "period", + "person", + "pivot", + "plant", + "price", + "principal", + "product,products,prod", + "profit", + "property", + "purchase", + "quantity,quantities,qty", + "rate", + "reason", + "region", + "request,requested", + "restriction", + "return", + "sales", + "sample", + "search", + "sector", + "selling", + "serial", + "ship,shp", + "sold", + "special", + "step", + "strength", + "team", + "tender", + "term", + "territory,territories", + "time", + "town", + "transaction,transactions", + "type,types", + "unit,units", + "usage", + "user", + "value,values", + "vendor", + "volume", + "week", + "weight", + "year", + "zip,zp" +] diff --git a/any2json/src/test/java/com/github/romualdrousseau/any2json/ModelDB.java b/any2json/src/test/java/com/github/romualdrousseau/any2json/ModelDB.java index d4edf2b5..83f3fab6 100644 --- a/any2json/src/test/java/com/github/romualdrousseau/any2json/ModelDB.java +++ b/any2json/src/test/java/com/github/romualdrousseau/any2json/ModelDB.java @@ -1,25 +1,19 @@ package com.github.romualdrousseau.any2json; +import java.io.IOException; import java.net.URISyntaxException; -import java.net.URL; -import java.nio.file.Path; + +import com.github.romualdrousseau.any2json.modeldata.JsonModelBuilder; public class ModelDB { public static Model createConnection(final String modelName) { - return new ModelBuilder() - .fromPath(ModelDB.getResourcePath(String.format("/data/%s.json", modelName))) - .build(); - } - - private static Path getResourcePath(final String resourceName) { try { - final URL resourceUrl = new ModelDB().getClass().getResource(resourceName); - assert resourceUrl != null : resourceName + " not found"; - return Path.of(resourceUrl.toURI()); - } catch (final URISyntaxException x) { - assert false : x.getMessage(); - return null; + return new JsonModelBuilder() + .fromResource(ModelDB.class, String.format("/data/%s.json", modelName)) + .build(); + } catch (IOException | URISyntaxException e) { + throw new RuntimeException(e); } } } diff --git a/any2json/src/test/java/com/github/romualdrousseau/any2json/Test_Any2Json.java b/any2json/src/test/java/com/github/romualdrousseau/any2json/Test_Any2Json.java index 706b9ec0..c0bc6937 100644 --- a/any2json/src/test/java/com/github/romualdrousseau/any2json/Test_Any2Json.java +++ b/any2json/src/test/java/com/github/romualdrousseau/any2json/Test_Any2Json.java @@ -11,41 +11,98 @@ */ public class Test_Any2Json { - private static final Model model = ModelDB.createConnection("sales-english"); + class SimpleHeader implements Header { + + private final String name; + + public SimpleHeader(String name) { + this.name = name; + } + + @Override + public String getName() { + return this.name; + } + + @Override + public Cell getCellAtRow(Row row) { + throw new UnsupportedOperationException("Unimplemented method 'getCellAtRow'"); + } + + @Override + public Cell getCellAtRow(Row row, boolean merged) { + throw new UnsupportedOperationException("Unimplemented method 'getCellAtRow'"); + } + + @Override + public boolean hasTag() { + throw new UnsupportedOperationException("Unimplemented method 'hasTag'"); + } + + @Override + public HeaderTag getTag() { + throw new UnsupportedOperationException("Unimplemented method 'getTag'"); + } + + @Override + public Iterable entities() { + throw new UnsupportedOperationException("Unimplemented method 'entities'"); + } + + @Override + public String getEntitiesAsString() { + throw new UnsupportedOperationException("Unimplemented method 'getEntitiesAsString'"); + } + + @Override + public boolean isColumnEmpty() { + throw new UnsupportedOperationException("Unimplemented method 'isColumnEmpty'"); + } + + @Override + public boolean isColumnMerged() { + throw new UnsupportedOperationException("Unimplemented method 'isColumnMerged'"); + } + } + + private static final Model model; + static { + model = ModelDB.createConnection("sales-english"); + } @Test public void testSimpleClassifierWithEnclosedTag() throws Exception { try (final var classifer = new SimpleTagClassifier(model, TagClassifier.TagStyle.NONE)) { - assertEquals("customerName", classifer.predict("customer name ($customerName)", null, null)); + assertEquals("customerName", classifer.predict(null, new SimpleHeader("customer name ($customerName)"))); } } @Test public void testSimpleClassifierWithSnake() throws Exception { try (final var classifer = new SimpleTagClassifier(model, TagClassifier.TagStyle.SNAKE)) { - assertEquals("customer_name_u", classifer.predict("customer name_u", null, null)); - assertEquals("customer_name_u", classifer.predict("customer_name_u", null, null)); - assertEquals("customer_name_u", classifer.predict("customerName_u", null, null)); - assertEquals("customer_name_u", classifer.predict("customername_u", null, null)); + assertEquals("customer_name_u", classifer.predict(null, new SimpleHeader("customer name_u"))); + assertEquals("customer_name_u", classifer.predict(null, new SimpleHeader("customer_name_u"))); + assertEquals("customer_name_u", classifer.predict(null, new SimpleHeader("customerName_u"))); + assertEquals("customer_name_u", classifer.predict(null, new SimpleHeader("customername_u"))); } } @Test public void testSimpleClassifierWithCamel() throws Exception { try (final var classifer = new SimpleTagClassifier(model, TagClassifier.TagStyle.CAMEL)) { - assertEquals("customerNameu", classifer.predict("customer name_u", null, null)); - assertEquals("customerNameu", classifer.predict("customer_name_u", null, null)); - assertEquals("customerNameu", classifer.predict("customerName_u", null, null)); - assertEquals("customerNameu", classifer.predict("customername_u", null, null)); + assertEquals("customerNameu", classifer.predict(null, new SimpleHeader("customer name_u"))); + assertEquals("customerNameu", classifer.predict(null, new SimpleHeader("customer_name_u"))); + assertEquals("customerNameu", classifer.predict(null, new SimpleHeader("customerName_u"))); + assertEquals("customerNameu", classifer.predict(null, new SimpleHeader("customername_u"))); } } @Test public void testSimpleClassifierCompatible() throws Exception { try (final var classifer = new SimpleTagClassifier(model, TagClassifier.TagStyle.NONE)) { - assertEquals("customer_name", classifer.predict("customer name", null, null)); - assertEquals("customer_name", classifer.predict("customer_name", null, null)); - assertEquals("customerName", classifer.predict("customername", null, null)); + assertEquals("customer_name", classifer.predict(null, new SimpleHeader("customer name"))); + assertEquals("customer_name", classifer.predict(null, new SimpleHeader("customer_name"))); + assertEquals("customerName", classifer.predict(null, new SimpleHeader("customername"))); } } }