diff --git a/any2json-commons/pom.xml b/any2json-commons/pom.xml new file mode 100644 index 00000000..6bd3eec3 --- /dev/null +++ b/any2json-commons/pom.xml @@ -0,0 +1,58 @@ + + 4.0.0 + + + com.github.romualdrousseau + any2json-monorepo + 2.45-SNAPSHOT + + + com.github.romualdrousseau + any2json-commons + 2.45-SNAPSHOT + jar + + any2json-commons + + Any2Json plugin to tag tabular output implementing embeddings. + + https://github.com/romualdrousseau/any2json-monorepo + + + + + org.furyio + fury-core + ${fury.version} + + + org.furyio + fury-format + ${fury.version} + + + org.xerial.snappy + snappy-java + ${snappy.version} + + + + org.reflections + reflections + ${reflections.version} + + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + test + + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j.version} + test + + + diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/Chunk.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/Chunk.java new file mode 100644 index 00000000..3f72e015 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/Chunk.java @@ -0,0 +1,42 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.util.ArrayList; +import java.util.List; + +public class Chunk { + + private final int batchSize; + private final List batches; + + private Row[] rows; + + public Chunk(final int batchSize) { + this.batchSize = batchSize; + this.batches = new ArrayList<>(); + this.rows = new Row[this.batchSize]; + } + + public int getBatchSize() { + return this.batchSize; + } + + public List getBatches() { + return this.batches; + } + + public Row[] getRows() { + return this.rows; + } + + public void setRows(final Row[] rows) { + this.rows = rows; + } + + public void setRow(final int idx, final Row row) { + this.rows[idx] = row; + } + + public Row getRow(final int idx) { + return this.rows[idx]; + } +} 
diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkMetaData.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkMetaData.java new file mode 100644 index 00000000..bc7d5499 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkMetaData.java @@ -0,0 +1,24 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +public class ChunkMetaData { + + private final long position; + private final int length; + + private ChunkMetaData(final long position, final int length) { + this.position = position; + this.length = length; + } + + public long position() { + return position; + } + + public int length() { + return length; + } + + public static ChunkMetaData of(final long position, final int length) { + return new ChunkMetaData(position, length); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkSerializer.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkSerializer.java new file mode 100644 index 00000000..543b2fdf --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkSerializer.java @@ -0,0 +1,10 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.io.IOException; + +public interface ChunkSerializer { + + byte[] serialize(Row[] batch) throws IOException; + + Row[] deserialize(byte[] bytes) throws IOException; +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkSerializerFactory.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkSerializerFactory.java new file mode 100644 index 00000000..abc057c4 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/ChunkSerializerFactory.java @@ -0,0 +1,114 @@ 
+package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; +import java.util.Properties; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.github.romualdrousseau.any2json.commons.bigdata.serializer.ChunkSerializerFury; +import com.github.romualdrousseau.any2json.commons.bigdata.serializer.ChunkSerializerJava; + +public class ChunkSerializerFactory { + + public enum SerializerType { + DEFAULT, // DEFAULT IS FURY + JAVA, + FURY + } + + private static final Logger LOGGER = LoggerFactory.getLogger(ChunkSerializerFactory.class); + + private static final ThreadLocal CONTEXT = new ThreadLocal<>(); + + public static ChunkSerializer newInstance() { + return ChunkSerializerFactory.newInstance(SerializerType.DEFAULT); + } + + public static ChunkSerializer newInstance(final SerializerType type) { + if (CONTEXT.get() == null) { + CONTEXT.set(new ChunkSerializerFactory(type).createSerializerInstance()); + } + return CONTEXT.get(); + } + + private final SerializerType type; + + private ChunkSerializerFactory(final SerializerType type) { + try { + if (type.equals(SerializerType.DEFAULT)) { + final var prop = new Properties(); + prop.load(this.openDefaultPropertiesInputStream()); + final var typeVal = prop.getProperty("serializer"); + if (typeVal != null) { + this.type = Enum.valueOf(SerializerType.class, typeVal); + } else { + this.type = type; + } + } else { + this.type = type; + } + LOGGER.info("ChunkSerializerFactor set to {}", this.type); + } catch (final IOException x) { + LOGGER.error("Error during ChunkSerializerFactor initialization: {}", x.getMessage()); + throw new UncheckedIOException(x); + } + } + + private ChunkSerializer createSerializerInstance() { + switch (this.type) { + case JAVA: + return new ChunkSerializerJava(); + case FURY: + return new 
ChunkSerializerFury(); + default: + return new ChunkSerializerFury(); + } + } + + private InputStream openDefaultPropertiesInputStream() throws IOException { + return this.openPropertiesInputStream("chunk-serializer.properties") + .or(() -> this.openPropertiesInputStream("batch-serializer.properties")) + .orElseGet(InputStream::nullInputStream); + } + + private Optional openPropertiesInputStream(final String fileName) { + final var userDir = System.getProperty("user.dir"); + return this.getPathIfExists(Path.of(userDir, fileName)) + .or(() -> this.getPathIfExists(Path.of(userDir, "classes", fileName))) + .flatMap(this::pathToStream) + .or(() -> this.resolveResourceAsStream(fileName)); + } + + private Optional pathToStream(final Path x) { + try { + return Optional.of(Files.newInputStream(x)); + } catch (final IOException e) { + return Optional.empty(); + } + } + + private Optional resolveResourceAsStream(final String resourceName) { + final InputStream resource = this.getClass().getClassLoader().getResourceAsStream(resourceName); + if (resource == null) { + LOGGER.debug("module: {} not found", resourceName); + return Optional.empty(); + } + LOGGER.debug("module: {} found at {}", resourceName, this.getClass().getClassLoader().getResource(resourceName)); + return Optional.of(resource); + } + + private Optional getPathIfExists(final Path path) { + if (!path.toFile().exists()) { + LOGGER.debug("module: {} not found at {}", path.getFileName(), path); + return Optional.empty(); + } + LOGGER.debug("module: {} found at {}", path.getFileName(), path); + return Optional.of(path); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrame.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrame.java new file mode 100644 index 00000000..2ea58d69 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrame.java @@ -0,0 +1,131 @@ 
+package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.io.Closeable; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.EnumSet; +import java.util.Iterator; +import java.util.Objects; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DataFrame implements Closeable, Iterable { + private final Logger logger = LoggerFactory.getLogger(DataFrame.class); + + private final ChunkSerializer serializer = ChunkSerializerFactory.newInstance(); + + private final Chunk chunk; + private final Path storePath; + private final int rowCount; + private final int columnCount; + private final FileChannel fileChannel; + private final MappedByteBuffer mappedBuffer; + + private int currentChunkIdx; + private boolean isClosed; + + public DataFrame(final Chunk chunk, final Path storePath, final int rowCount, final int columnCount) + throws IOException { + this.chunk = chunk; + this.storePath = storePath; + this.rowCount = rowCount; + this.columnCount = columnCount; + this.fileChannel = (FileChannel) Files.newByteChannel(this.storePath, + EnumSet.of(StandardOpenOption.READ, StandardOpenOption.DELETE_ON_CLOSE)); + if (this.fileChannel.size() <= Integer.MAX_VALUE) { + this.mappedBuffer = this.fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, this.fileChannel.size()); + } else { + this.mappedBuffer = null; + } + + this.currentChunkIdx = -1; + this.isClosed = false; + + this.logger.info("DataFrame initialized with Mapped Buffer: {}", this.isMappedBuffer()); + } + + @Override + public void close() throws IOException { + if (this.isClosed) { + return; + } + this.fileChannel.close(); + this.isClosed = true; + } + + public DataView view(final int rowStart, final int columnStart, final int rowCount, final int columnCount) 
{ + Objects.checkFromToIndex(rowStart, rowStart + rowCount - 1, this.rowCount); + Objects.checkFromToIndex(columnStart, columnStart + columnCount - 1, this.columnCount); + return new DataView(this, rowStart, columnStart, rowCount, columnCount); + } + + public int getRowCount() { + return this.rowCount; + } + + public int getColumnCount() { + return this.columnCount; + } + + public int getColumnCount(final int row) { + Objects.checkIndex(row, this.rowCount); + final var r = this.getRow(row); + return r.size(); + } + + public Row getRow(final int row) { + Objects.checkIndex(row, this.rowCount); + final int idx = row / this.chunk.getBatchSize(); + if (this.currentChunkIdx != idx) { + this.chunk.setRows(this.loadOneBatch(this.chunk.getBatches().get(idx))); + this.currentChunkIdx = idx; + } + return this.chunk.getRow(row % this.chunk.getBatchSize()); + } + + public String getCell(final int row, final int column) { + Objects.checkIndex(row, this.rowCount); + Objects.checkIndex(column, this.columnCount); + return this.getRow(row).get(column); + } + + @Override + public Iterator iterator() { + return new DataFrameIterator(this); + } + + private Row[] loadOneBatch(final ChunkMetaData batch) { + final long startTime = System.currentTimeMillis(); + try { + + if (this.isMappedBuffer()) { + final var bytes = new byte[batch.length()]; + this.mappedBuffer.position((int) batch.position()); + this.mappedBuffer.get(bytes); + return serializer.deserialize(bytes); + } else { + final var bytes = ByteBuffer.allocate(batch.length()); + this.fileChannel.position(batch.position()); + this.fileChannel.read(bytes); + return serializer.deserialize(bytes.array()); + } + } catch (final IOException x) { + throw new UncheckedIOException(x); + } finally { + final var stopTime = System.currentTimeMillis(); + final var executionTimeInMS = (int) (stopTime - startTime); + this.logger.debug("Load a chunk in memory offset: {}, lenght: {}. 
Took {}ms", batch.position(), batch.length(), executionTimeInMS); + } + } + + private boolean isMappedBuffer() { + return this.mappedBuffer != null; + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrameIterator.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrameIterator.java new file mode 100644 index 00000000..eea533fd --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrameIterator.java @@ -0,0 +1,28 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.util.Iterator; +import java.util.Spliterator; +import java.util.Spliterators; + +public class DataFrameIterator implements Iterator { + private final DataFrame dataFrame; + + private int curr; + + public DataFrameIterator(final DataFrame dataFrame) { + this.dataFrame = dataFrame; + this.curr = 0; + } + + public boolean hasNext() { + return this.curr < this.dataFrame.getRowCount(); + } + + public Row next() { + return this.dataFrame.getRow(this.curr++); + } + + public Spliterator spliterator() { + return Spliterators.spliterator(this, this.dataFrame.getRowCount(), Spliterator.IMMUTABLE); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrameWriter.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrameWriter.java new file mode 100644 index 00000000..8a8d14af --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataFrameWriter.java @@ -0,0 +1,88 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.EnumSet; + +public class 
DataFrameWriter implements Closeable { + + private final ChunkSerializer serializer = ChunkSerializerFactory.newInstance(); + + private final Chunk chunk; + private final Path storePath; + private final FileChannel fileChannel; + + private int columnCount; + private int rowCount; + private boolean isClosed; + + public DataFrameWriter(final int chunkSize) throws IOException { + this(chunkSize, 0, null); + } + + public DataFrameWriter(final int chunkSize, final int columnCount) throws IOException { + this(chunkSize, columnCount, null); + } + + public DataFrameWriter(final int batchSize, final Path path) throws IOException { + this(batchSize, 0, path); + } + + public DataFrameWriter(final int chunkSize, final int columnCount, final Path path) throws IOException { + this.chunk = new Chunk(chunkSize); + this.storePath = (path == null) ? Files.createTempFile(null, null) : Files.createTempFile(path, null, null); + this.storePath.toFile().deleteOnExit(); + this.fileChannel = (FileChannel) Files.newByteChannel(this.storePath, + EnumSet.of(StandardOpenOption.CREATE, StandardOpenOption.WRITE)); + this.columnCount = columnCount; + this.rowCount = 0; + this.isClosed = false; + } + + @Override + public void close() throws IOException { + if (this.isClosed) { + return; + } + + if ((this.rowCount % this.chunk.getBatchSize()) > 0) { + this.flushCurrentChunk(); + } + + this.fileChannel.close(); + this.isClosed = true; + } + + public int getRowCount() { + return this.rowCount; + } + + public int getColumnCount() { + return this.columnCount; + } + + public DataFrame getDataFrame() throws IOException { + this.close(); + return new DataFrame(this.chunk, this.storePath, this.rowCount, this.columnCount); + } + + public void write(final Row data) throws IOException { + this.chunk.setRow(this.rowCount % this.chunk.getBatchSize(), data); + this.columnCount = Math.max(this.columnCount, data.size()); + this.rowCount++; + if ((this.rowCount % this.chunk.getBatchSize()) == 0) { + 
this.flushCurrentChunk(); + } + } + + private void flushCurrentChunk() throws IOException { + final var bytes = serializer.serialize(this.chunk.getRows()); + this.chunk.getBatches().add(ChunkMetaData.of(this.fileChannel.position(), bytes.length)); + this.fileChannel.write(ByteBuffer.wrap(bytes)); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataView.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataView.java new file mode 100644 index 00000000..07a4d6c3 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataView.java @@ -0,0 +1,69 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.util.Iterator; + +public class DataView implements Iterable { + private final DataFrame dataFrame; + private final int rowStart; + private final int columnStart; + private final int rowCount; + private final int columnCount; + + public DataView(final DataFrame dataFrame, final int rowStart, final int columnStart, final int rowCount, final int columnCount) { + this.dataFrame = dataFrame; + this.rowStart = rowStart; + this.columnStart = columnStart; + this.rowCount = rowCount; + this.columnCount = columnCount; + } + + public DataFrame getDataFrame() { + return dataFrame; + } + + public int getRowStart() { + return rowStart; + } + + public int getRowCount() { + return this.rowCount; + } + + public int getColumnStart() { + return columnStart; + } + + public int getColumnCount() { + return columnCount; + } + + public Row getRow(int row) { + this.checkRowIndex(row); + return this.dataFrame.getRow(this.rowStart + row).view(this.columnStart, this.columnCount); + } + + public String getCell(final int row, final int column) { + this.checkRowIndex(row); + this.checkColumnIndex(column); + return this.dataFrame.getCell(this.rowStart + row, this.columnStart + column); + } + + @Override + public Iterator iterator() { + 
return new DataViewIterator(this); + } + + private void checkRowIndex(final int index) { + if (index < 0 || index >= this.rowCount) + throw new IndexOutOfBoundsException(this.outOfBoundsMsg(index, this.rowCount)); + } + + private void checkColumnIndex(final int index) { + if (index < 0 || index >= this.columnCount) + throw new IndexOutOfBoundsException(this.outOfBoundsMsg(index, this.columnCount)); + } + + private String outOfBoundsMsg(final int index, final int count) { + return "Index: " + index + ", Size: " + count; + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataViewIterator.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataViewIterator.java new file mode 100644 index 00000000..586e9622 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/DataViewIterator.java @@ -0,0 +1,28 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.util.Iterator; +import java.util.Spliterator; +import java.util.Spliterators; + +public class DataViewIterator implements Iterator { + private final DataView view; + + private int curr; + + public DataViewIterator(final DataView view) { + this.view = view; + this.curr = 0; + } + + public boolean hasNext() { + return this.curr < this.view.getRowCount(); + } + + public Row next() { + return this.view.getRow(this.curr++); + } + + public Spliterator spliterator() { + return Spliterators.spliterator(this, this.view.getRowCount(), Spliterator.IMMUTABLE); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/Row.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/Row.java new file mode 100644 index 00000000..9ffaabf2 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/Row.java @@ -0,0 +1,72 @@ +package 
// Source file: any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/Row.java

/**
 * A row of string cells, or a column window on another row.
 *
 * <p>A window shares the backing array of the row it was taken from, so {@link #set(int, String)}
 * through a window is visible in the original row. Indices given to {@link #get(int)},
 * {@link #set(int, String)} and {@link #view(int, int)} are relative to the receiver's own
 * first column.
 */
public class Row implements Iterable<String>, Serializable {

    private final int columnStart;
    private final int columnCount;
    private final String[] data;

    /** Shared zero-column row; it must not be modified. */
    public static final Row Empty = new Row(0);

    /**
     * Creates a row of {@code columnCount} cells, all {@code null}.
     *
     * @param columnCount number of cells
     */
    public Row(final int columnCount) {
        this.columnStart = 0;
        this.columnCount = columnCount;
        this.data = new String[columnCount];
    }

    private Row(final String[] data) {
        this.columnStart = 0;
        this.columnCount = data.length;
        this.data = data;
    }

    private Row(final int columnStart, final int columnCount, final String[] data) {
        this.columnStart = columnStart;
        this.columnCount = columnCount;
        this.data = data;
    }

    /**
     * Wraps the given cells in a row. The array is used as-is, not copied.
     *
     * @param data the cells
     * @return a row backed by {@code data}
     */
    public static Row of(String... data) {
        return new Row(data);
    }

    /**
     * Creates a column window on this row.
     *
     * @param columnStart first column of the window, relative to this row
     * @param columnCount number of columns in the window
     * @return a row sharing this row's cells
     * @throws IndexOutOfBoundsException if the window is rejected by the bound check
     */
    public Row view(final int columnStart, final int columnCount) {
        // NOTE(review): the end passed here is the last *inclusive* index while the API expects
        // an exclusive end, so a window overshooting by one column is accepted (get() then
        // yields null for that cell) and an empty window is rejected. Kept unchanged because
        // get()/set() explicitly tolerate positions past the backing array — confirm intent.
        Objects.checkFromToIndex(columnStart, columnStart + columnCount - 1, this.columnCount);
        // Offsets accumulate: a window taken on a window used to restart from the backing
        // array's column 0 and so returned the wrong cells.
        return new Row(this.columnStart + columnStart, columnCount, this.data);
    }

    /** @return the number of columns addressable through this row or window */
    public int getColumnCount() {
        return this.columnCount;
    }

    /** @return the length of the backing array, which for a window is the width of the source row */
    public int size() {
        return this.data.length;
    }

    /**
     * @param index column index relative to this row
     * @return the cell value, or {@code null} when the cell is unset or lies past the backing
     *         array
     * @throws IndexOutOfBoundsException if {@code index} is not in {@code [0, getColumnCount())}
     */
    public String get(final int index) {
        Objects.checkIndex(index, this.columnCount);
        final int position = this.columnStart + index;
        return (position < this.data.length) ? this.data[position] : null;
    }

    /**
     * Sets a cell; a position past the backing array is silently ignored.
     *
     * @param index   column index relative to this row
     * @param element the new value
     * @return this row, for chaining
     * @throws IndexOutOfBoundsException if {@code index} is not in {@code [0, getColumnCount())}
     */
    public Row set(final int index, final String element) {
        assert this != Row.Empty : "Row.Empty is not mutable";
        Objects.checkIndex(index, this.columnCount);
        final int position = this.columnStart + index;
        if (position < this.data.length) {
            this.data[position] = element;
        }
        return this;
    }

    /**
     * Iterates over exactly the cells addressable through {@link #get(int)}, in column order.
     * For a plain row this is the whole backing array; for a window it is the window only
     * (it used to be the whole backing array in both cases).
     */
    @Override
    public Iterator<String> iterator() {
        return new Iterator<String>() {
            private int cursor = 0;

            @Override
            public boolean hasNext() {
                return this.cursor < Row.this.columnCount;
            }

            @Override
            public String next() {
                if (!this.hasNext()) {
                    throw new java.util.NoSuchElementException();
                }
                return Row.this.get(this.cursor++);
            }
        };
    }
}
new file mode 100644 index 00000000..d5991394 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/RowIterator.java @@ -0,0 +1,28 @@ +package com.github.romualdrousseau.any2json.commons.bigdata; + +import java.util.Iterator; +import java.util.Spliterator; +import java.util.Spliterators; + +public class RowIterator implements Iterator { + private final String[] row; + + private int curr; + + public RowIterator(final String[] row) { + this.row = row; + this.curr = 0; + } + + public boolean hasNext() { + return this.curr < this.row.length; + } + + public String next() { + return this.row[this.curr++]; + } + + public Spliterator spliterator() { + return Spliterators.spliterator(this, this.row.length, Spliterator.IMMUTABLE); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/serializer/ChunkSerializerFury.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/serializer/ChunkSerializerFury.java new file mode 100644 index 00000000..888be015 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/serializer/ChunkSerializerFury.java @@ -0,0 +1,36 @@ +package com.github.romualdrousseau.any2json.commons.bigdata.serializer; + +import java.io.IOException; + +import org.xerial.snappy.Snappy; + +import com.github.romualdrousseau.any2json.commons.bigdata.ChunkSerializer; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; + +import io.fury.Fury; +import io.fury.config.Language; + +public class ChunkSerializerFury implements ChunkSerializer { + + private final Fury fury; + + public ChunkSerializerFury() { + this.fury = Fury.builder() + .withLanguage(Language.JAVA) + .build(); + this.fury.register(String.class); + this.fury.register(String[].class); + this.fury.register(Row.class); + this.fury.register(Row[].class); + } + + @Override + public byte[] serialize(Row[] batch) throws 
IOException { + return Snappy.compress(this.fury.serializeJavaObject(batch)); + } + + @Override + public Row[] deserialize(byte[] bytes) throws IOException { + return this.fury.deserializeJavaObject(Snappy.uncompress(bytes), Row[].class); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/serializer/ChunkSerializerJava.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/serializer/ChunkSerializerJava.java new file mode 100644 index 00000000..39dcfe4d --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/bigdata/serializer/ChunkSerializerJava.java @@ -0,0 +1,32 @@ +package com.github.romualdrousseau.any2json.commons.bigdata.serializer; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; + +import com.github.romualdrousseau.any2json.commons.bigdata.ChunkSerializer; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; + +public class ChunkSerializerJava implements ChunkSerializer { + + @Override + public byte[] serialize(Row[] batch) throws IOException { + try ( + final var byteArrayOutputStream = new ByteArrayOutputStream(); + final var objectOutputStream = new ObjectOutputStream(byteArrayOutputStream)) { + objectOutputStream.writeObject(batch); + return byteArrayOutputStream.toByteArray(); + } + } + + @Override + public Row[] deserialize(byte[] bytes) throws IOException { + try (ObjectInputStream o = new ObjectInputStream(new ByteArrayInputStream(bytes))) { + return (Row[]) o.readObject(); + } catch (final ClassNotFoundException e) { + throw new RuntimeException(e); + } + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/Filter.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/Filter.java new file mode 100644 index 
00000000..db976713 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/Filter.java @@ -0,0 +1,80 @@ +package com.github.romualdrousseau.any2json.commons.cv; + +public class Filter { + + public Filter(Template filter) { + this.filter = filter; + } + + public void apply(ISearchBitmap searchBitmap, double threshold) { + for (int y = 0; y < searchBitmap.getHeight(); y++) { + for (int x = 0; x < searchBitmap.getWidth(); x++) { + float acc = this.filter.sobel(searchBitmap, x, y); + if (acc < threshold) { + searchBitmap.set(x, y, 0); + } + } + } + } + + public void apply(ISearchBitmap searchBitmap, int[] clip, double threshold) { + for (int y = clip[1]; y < clip[3]; y++) { + for (int x = clip[0]; x < clip[2]; x++) { + float acc = this.filter.sobel(searchBitmap, x, y); + if (acc < threshold) { + searchBitmap.set(x, y, 0); + } + } + } + } + + public void applyNeg(ISearchBitmap searchBitmap, double threshold) { + for (int y = 0; y < searchBitmap.getHeight(); y++) { + for (int x = 0; x < searchBitmap.getWidth(); x++) { + float acc = this.filter.sobel(searchBitmap, x, y); + if (acc >= threshold) { + searchBitmap.set(x, y, 1); + } + } + } + } + + public void applyNeg(ISearchBitmap searchBitmap, int[] clip, double threshold) { + for (int y = clip[1]; y < clip[3]; y++) { + for (int x = clip[0]; x < clip[2]; x++) { + float acc = this.filter.sobel(searchBitmap, x, y); + if (acc >= threshold) { + searchBitmap.set(x, y, 1); + } + } + } + } + + public void apply(ISearchBitmap sourceBitmap, ISearchBitmap destBitmap, double threshold) { + for (int y = 0; y < sourceBitmap.getHeight(); y++) { + for (int x = 0; x < sourceBitmap.getWidth(); x++) { + float acc = this.filter.sobel(sourceBitmap, x, y); + if (acc < threshold) { + destBitmap.set(x, y, 0); + } else { + destBitmap.set(x, y, 1); + } + } + } + } + + public void apply(ISearchBitmap sourceBitmap, ISearchBitmap destBitmap, int[] clip, double threshold) { + for (int y = clip[1]; y < clip[3]; 
y++) { + for (int x = clip[0]; x < clip[2]; x++) { + float acc = this.filter.sobel(sourceBitmap, x, y); + if (acc < threshold) { + destBitmap.set(x, y, 0); + } else { + destBitmap.set(x, y, 1); + } + } + } + } + + private Template filter; +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/ISearchBitmap.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/ISearchBitmap.java new file mode 100644 index 00000000..af07063f --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/ISearchBitmap.java @@ -0,0 +1,14 @@ +package com.github.romualdrousseau.any2json.commons.cv; + +public interface ISearchBitmap { + + int getWidth(); + + int getHeight(); + + int get(int x, int y); + + void set(int x, int y, int v); + + ISearchBitmap clone(); +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/IShapeExtractor.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/IShapeExtractor.java new file mode 100644 index 00000000..46dcce5b --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/IShapeExtractor.java @@ -0,0 +1,10 @@ +package com.github.romualdrousseau.any2json.commons.cv; + +import java.util.List; + +public abstract class IShapeExtractor { + + public abstract List extractAll(ISearchBitmap bitmap); + + public abstract SearchPoint[] extractBest(ISearchBitmap bitmap); +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/SearchPoint.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/SearchPoint.java new file mode 100644 index 00000000..6f87181e --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/SearchPoint.java @@ -0,0 +1,324 @@ +package com.github.romualdrousseau.any2json.commons.cv; + +import java.util.ArrayList; +import 
java.util.Collections; +import java.util.Comparator; +import java.util.List; + +public class SearchPoint { + + public SearchPoint(int x, int y) { + this.x = x; + this.y = y; + this.sad = 0; + } + + public SearchPoint(int x, int y, float sad) { + this.x = x; + this.y = y; + this.sad = sad; + } + + public int getX() { + return this.x; + } + + public void setX(int x) { + this.x = x; + } + + public int getY() { + return this.y; + } + + public void setY(int y) { + this.y = y; + } + + public float getSAD() { + return this.sad; + } + + public void setSAD(int sad) { + this.sad = sad; + } + + public boolean equals(SearchPoint o) { + return this.x == o.x && this.y == o.y; + } + + public static boolean isValid(SearchPoint[] s) { + return s[1].getX() >= s[0].getX() && s[1].getY() >= s[0].getY(); + } + + public static int GetArea(SearchPoint[] s) { + return (s[1].getX() - s[0].getX()) * (s[1].getY() - s[0].getY()); + } + + public static boolean IsOverlap(SearchPoint[] s1, SearchPoint[] s2) { + return s1[1].getX() >= s2[0].getX() && s1[0].getX() <= s2[1].getX() && s1[1].getY() >= s2[0].getY() + && s1[0].getY() <= s2[1].getY(); + } + + public static boolean IsInside(SearchPoint[] points, int x, int y) { + return points[0].getX() <= x && x <= points[1].getX() && points[0].getY() <= y && y <= points[1].getY(); + } + + public static boolean IsDuplicate(SearchPoint[] shape1, List shapes) { + boolean foundDuplicate = false; + for (SearchPoint[] shape2 : shapes) { + if (shape1[0].equals(shape2[0]) && shape1[1].equals(shape2[1])) { + foundDuplicate = true; + } + } + return foundDuplicate; + } + + public static List RemoveDuplicates(List shapes) { + ArrayList result = new ArrayList(); + for (SearchPoint[] shape1 : shapes) { + if (!SearchPoint.IsDuplicate(shape1, result)) { + result.add(shape1); + } + } + return result; + } + + public static List RemoveOverlaps(List shapes) { + if (shapes.size() < 2) { + return shapes; + } + + Collections.sort(shapes, new Comparator() { + public int 
compare(SearchPoint[] o1, SearchPoint[] o2) { + return SearchPoint.GetArea(o1) - SearchPoint.GetArea(o2); + } + }); + + List result = new ArrayList(); + result.addAll(shapes); + for (SearchPoint[] shape1 : shapes) { + ArrayList tmp = new ArrayList(); + for (SearchPoint[] shape2 : result) { + tmp.addAll(SearchPoint.Clipping(shape1, shape2)); + } + result = SearchPoint.RemoveDuplicates(tmp); + } + + return SearchPoint.RemoveDuplicates(result); + } + + public static List MergeInX(List shapes) { + if (shapes.size() < 2) { + return shapes; + } + + ArrayList result = new ArrayList(); + for (SearchPoint[] shape1 : shapes) + if (shape1[0] != null && shape1[1] != null) { + for (SearchPoint[] shape2 : shapes) + if (shape1 != shape2 && shape2[0] != null && shape2[1] != null) { + if (shape1[0].getY() == shape2[0].getY() && shape1[1].getY() == shape2[1].getY()) { + shape1[0].setX(Math.min(shape1[0].getX(), shape2[0].getX())); + shape1[1].setX(Math.max(shape1[1].getX(), shape2[1].getX())); + shape2[0] = null; + shape2[1] = null; + } + } + result.add(shape1); + } + return result; + } + + public static List MergeInY(List shapes) { + if (shapes.size() < 2) { + return shapes; + } + + ArrayList result = new ArrayList(); + for (SearchPoint[] shape1 : shapes) + if (shape1[0] != null && shape1[1] != null) { + for (SearchPoint[] shape2 : shapes) + if (shape1 != shape2 && shape2[0] != null && shape2[1] != null) { + if (shape1[0].getX() == shape2[0].getX() && shape1[1].getX() == shape2[1].getX()) { + shape1[0].setY(Math.min(shape1[0].getY(), shape2[0].getY())); + shape1[1].setY(Math.max(shape1[1].getY(), shape2[1].getY())); + shape2[0] = null; + shape2[1] = null; + } + } + result.add(shape1); + } + return result; + } + + public static List TrimInX(List shapes, ISearchBitmap bitmap) { + for (SearchPoint[] shape : shapes) { + for (int i = shape[0].getX(); i <= shape[1].getX(); i++) { + if (SearchPoint.columnIsEmpty(shape, i, bitmap)) { + shape[0].setX(i + 1); + } else { + break; + } + } + + 
for (int i = shape[1].getX(); i >= shape[0].getX(); i--) { + if (SearchPoint.columnIsEmpty(shape, i, bitmap)) { + shape[1].setX(i - 1); + } else { + break; + } + } + } + + return shapes; + } + + public static List TrimInY(List shapes, ISearchBitmap bitmap) { + for (SearchPoint[] shape : shapes) { + for (int i = shape[0].getY(); i <= shape[1].getY(); i++) { + if (rowIsEmpty(shape, i, bitmap)) { + shape[0].setY(i + 1); + } else { + break; + } + } + + for (int i = shape[1].getY(); i >= shape[0].getY(); i--) { + if (rowIsEmpty(shape, i, bitmap)) { + shape[1].setY(i - 1); + } else { + break; + } + } + } + + return shapes; + } + + public static List ExpandInX(List shapes, ISearchBitmap bitmap) { + for (SearchPoint[] shape : shapes) { + for (int i = shape[0].getX() - 1; i > 0; i--) { + if (!SearchPoint.columnIsEmpty(shape, i, bitmap)) { + shape[0].setX(i - 1); + } else { + break; + } + } + + for (int i = shape[1].getX(); i < bitmap.getWidth(); i++) { + if (!SearchPoint.columnIsEmpty(shape, i, bitmap)) { + shape[1].setX(i + 1); + } else { + break; + } + } + } + + return shapes; + } + + public static List ExpandInY(List shapes, ISearchBitmap bitmap) { + for (SearchPoint[] shape : shapes) { + for (int i = shape[0].getY() - 1; i > 0; i--) { + if (!SearchPoint.rowIsEmpty(shape, i, bitmap)) { + shape[0].setY(i - 1); + } else { + break; + } + } + + for (int i = shape[1].getY(); i < bitmap.getHeight(); i++) { + if (!SearchPoint.rowIsEmpty(shape, i, bitmap)) { + shape[1].setY(i + 1); + } else { + break; + } + } + } + + return shapes; + } + + public static List Clipping(SearchPoint[] master, SearchPoint[] slave) { + ArrayList result = new ArrayList(); + + result.add(slave); + + // Top to bottom + for (int i = 0; i < 2; i++) { + ArrayList tmp = new ArrayList(); + for (SearchPoint[] r : result) { + if (!SearchPoint.IsOverlap(r, master)) { + tmp.add(r); + } else if (r[0].getY() < master[0].getY()) { + int d = master[0].getY() - r[0].getY(); + tmp.add(new SearchPoint[] { new 
SearchPoint(r[0].getX(), r[0].getY()), + new SearchPoint(r[1].getX(), r[0].getY() + d - 1) }); + tmp.add(new SearchPoint[] { new SearchPoint(r[0].getX(), r[0].getY() + d), + new SearchPoint(r[1].getX(), r[1].getY()) }); + } else if (r[1].getY() > master[1].getY()) { + int d = r[1].getY() - master[1].getY(); + tmp.add(new SearchPoint[] { new SearchPoint(r[0].getX(), r[0].getY()), + new SearchPoint(r[1].getX(), r[1].getY() - d) }); + tmp.add(new SearchPoint[] { new SearchPoint(r[0].getX(), r[1].getY() - d + 1), + new SearchPoint(r[1].getX(), r[1].getY()) }); + } else { + tmp.add(r); + } + } + result = tmp; + } + + // Left to right + for (int i = 0; i < 2; i++) { + ArrayList tmp = new ArrayList(); + for (SearchPoint[] r : result) { + if (!SearchPoint.IsOverlap(r, master)) { + tmp.add(r); + } else if (r[0].getX() < master[0].getX()) { + tmp.add(new SearchPoint[] { new SearchPoint(r[0].getX(), r[0].getY()), + new SearchPoint(master[0].getX() - 1, r[1].getY()) }); + tmp.add(new SearchPoint[] { new SearchPoint(master[0].getX(), r[0].getY()), + new SearchPoint(r[1].getX(), r[1].getY()) }); + } else if (r[1].getX() > master[1].getX()) { + tmp.add(new SearchPoint[] { new SearchPoint(r[0].getX(), r[0].getY()), + new SearchPoint(master[1].getX(), r[1].getY()) }); + tmp.add(new SearchPoint[] { new SearchPoint(master[1].getX() + 1, r[0].getY()), + new SearchPoint(r[1].getX(), r[1].getY()) }); + } + } + result = tmp; + } + + result.add(master); + + return result; + } + + private static boolean columnIsEmpty(SearchPoint[] table, int colIndex, ISearchBitmap bitmap) { + boolean isEmpty = true; + for (int i = table[0].getY(); i <= table[1].getY(); i++) { + if (bitmap.get(colIndex, i) > 0) { + isEmpty = false; + break; + } + } + return isEmpty; + } + + private static boolean rowIsEmpty(SearchPoint[] table, int rowIndex, ISearchBitmap bitmap) { + boolean isEmpty = true; + for (int i = table[0].getX(); i <= table[1].getX(); i++) { + if (bitmap.get(i, rowIndex) > 0) { + isEmpty = false; 
+ break; + } + } + return isEmpty; + } + + private int x; + private int y; + private float sad; +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/Template.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/Template.java new file mode 100644 index 00000000..0aa1c8e3 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/Template.java @@ -0,0 +1,54 @@ +package com.github.romualdrousseau.any2json.commons.cv; + +public class Template { + + public Template(float[][] data) { + this.data = data; + this.inv_area = 1.0f / Float.valueOf(data[0].length * data.length); + } + + public int getWidth() { + return this.data[0].length; + } + + public int getHeight() { + return this.data.length; + } + + public float get(int x, int y) { + return this.data[y][x]; + } + + public float sobel(ISearchBitmap searchBitmap, int x, int y) { + int w = (this.data[0].length - 1) / 2; + int h = (this.data.length - 1) / 2; + float acc = 0; + for (int i = 0; i < this.data.length; i++) { + for (int j = 0; j < this.data[i].length; j++) { + acc += this.data[i][j] * Float.valueOf(searchBitmap.get(x - w + j, y - h + i)); + } + } + return acc; + } + + public float sad(ISearchBitmap searchBitmap, int x, int y) { + int hw = this.data[0].length / 2; + int hh = this.data.length / 2; + float acc = 0.0f; + for (int i = 0; i < this.data.length; i++) { + for (int j = 0; j < data[i].length; j++) { + float searchPixel = Float.valueOf(searchBitmap.get(x + j - hw, y + i - hh)); + float templatePixel = this.data[i][j]; + acc += Math.abs(searchPixel - templatePixel); + } + } + return acc; + } + + public float normalize(float v) { + return 1.0f - v * this.inv_area; + } + + private float[][] data; + private float inv_area; +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/TemplateMatcher.java 
b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/TemplateMatcher.java new file mode 100644 index 00000000..51295805 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/TemplateMatcher.java @@ -0,0 +1,56 @@ +package com.github.romualdrousseau.any2json.commons.cv; + +import java.util.ArrayList; +import java.util.List; + +public class TemplateMatcher { + + public TemplateMatcher(Template template) { + this.template = template; + } + + public List matchAll(ISearchBitmap searchBitmap, int x, int y, int w, int h, double threshold) { + ArrayList result = new ArrayList(); + for (int i = y; i < y + h; i++) { + for (int j = x; j < x + w; j++) { + float sad = this.template.sad(searchBitmap, j, i); + float score = this.template.normalize(sad); + if (score > threshold) { + result.add(new SearchPoint(j, i, sad)); + } + } + } + return result; + } + + public SearchPoint matchFirst(ISearchBitmap searchBitmap, int x, int y, int w, int h, double threshold) { + for (int i = y; i < y + h; i++) { + for (int j = x; j < x + w; j++) { + float sad = this.template.sad(searchBitmap, j, i); + float score = this.template.normalize(sad); + if (score > threshold) { + return new SearchPoint(j, i, sad); + } + } + } + return null; + } + + public SearchPoint matchBest(ISearchBitmap searchBitmap, int x, int y, int w, int h) { + SearchPoint result = null; + double maxScore = 0.0; + for (int i = y; i < y + h; i++) { + for (int j = x; j < x + w; j++) { + float sad = this.template.sad(searchBitmap, j, i); + float score = this.template.normalize(sad); + if (score > maxScore) { + maxScore = score; + result = new SearchPoint(j, i, sad); + } + } + } + return result; + } + + private Template template; +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/filter/EdgeFilter.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/filter/EdgeFilter.java new file mode 
100644 index 00000000..00a240d1 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/filter/EdgeFilter.java @@ -0,0 +1,48 @@ +package com.github.romualdrousseau.any2json.commons.cv.filter; + +import com.github.romualdrousseau.any2json.commons.cv.Filter; +import com.github.romualdrousseau.any2json.commons.cv.ISearchBitmap; +import com.github.romualdrousseau.any2json.commons.cv.Template; + +public class EdgeFilter extends Filter { + + public EdgeFilter() { + super(null); + } + + public void apply(ISearchBitmap sourceBitmap, ISearchBitmap destBitmap, double threshold) { + for (int y = 0; y < sourceBitmap.getHeight(); y++) { + for (int x = 0; x < sourceBitmap.getWidth(); x++) { + float lx = this.edgeX.sobel(sourceBitmap, x, y); + float ly = this.edgeY.sobel(sourceBitmap, x, y); + double acc = Math.sqrt(lx * lx + ly * ly); + // double phi = Math.atan2(ly, lx); + if (acc < threshold) { + destBitmap.set(x, y, 0); + } else { + destBitmap.set(x, y, 1); + } + } + } + } + + public void apply(ISearchBitmap sourceBitmap, ISearchBitmap destBitmap, int[] clip, double threshold) { + for (int y = clip[1]; y < clip[3]; y++) { + for (int x = clip[0]; x < clip[2]; x++) { + float lx = this.edgeX.sobel(sourceBitmap, x, y); + float ly = this.edgeY.sobel(sourceBitmap, x, y); + double acc = Math.sqrt(lx * lx + ly * ly); + // double phi = Math.atan2(ly, lx); + if (acc < threshold) { + destBitmap.set(x, y, 0); + } else { + destBitmap.set(x, y, 1); + } + } + } + } + + private Template edgeX = new Template(new float[][] { { 1, 0, -1 }, { 2, 0, -2 }, { 1, 0, -1 } }); + + private Template edgeY = new Template(new float[][] { { 1, 2, 1 }, { 0, 0, 0 }, { -1, -2, -1 } }); +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/shapeextractor/RectangleExtractor.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/shapeextractor/RectangleExtractor.java new file mode 100644 index 
00000000..52998865 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/cv/shapeextractor/RectangleExtractor.java @@ -0,0 +1,170 @@ +package com.github.romualdrousseau.any2json.commons.cv.shapeextractor; + +import java.util.ArrayList; +import java.util.List; + +import com.github.romualdrousseau.any2json.commons.cv.ISearchBitmap; +import com.github.romualdrousseau.any2json.commons.cv.IShapeExtractor; +import com.github.romualdrousseau.any2json.commons.cv.SearchPoint; +import com.github.romualdrousseau.any2json.commons.cv.Template; +import com.github.romualdrousseau.any2json.commons.cv.TemplateMatcher; + +public class RectangleExtractor extends IShapeExtractor { + + @Override + public List extractAll(ISearchBitmap searchBitmap) { + ArrayList result = new ArrayList(); + + ArrayList> allCorners = new ArrayList>(); + allCorners.add( + cornerTopLeft.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + allCorners.add( + cornerTopRight.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + allCorners.add( + cornerBottomRight.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + allCorners.add( + cornerBottomLeft.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + + // Simple version of Hought transformation with 4 pre-defined rotations + for (int phi = 0; phi < allCorners.size(); phi++) { + for (SearchPoint corner : allCorners.get(phi)) { + SearchPoint[] a = houghTransform(phi, corner, allCorners); + if (count(a) < (a.length - 1)) { + continue; + } + + SearchPoint[] bbox = minmax(phi, a); + + if (searchBitmap.get(bbox[0].getX(), bbox[0].getY()) > 0 + && searchBitmap.get(bbox[1].getX(), bbox[0].getY()) > 0 + && searchBitmap.get(bbox[1].getX(), bbox[1].getY()) > 0 + && searchBitmap.get(bbox[0].getX(), bbox[1].getY()) > 0) { + if(SearchPoint.isValid(bbox) && !SearchPoint.IsDuplicate(bbox, 
result)) { + result.add(bbox); + } + } + } + } + if (result.size() > 1) { + return SearchPoint.RemoveOverlaps(result); + } else { + return result; + } + } + + @Override + public SearchPoint[] extractBest(ISearchBitmap searchBitmap) { + SearchPoint[] result = null; + int maxArea = 0; + + ArrayList> allCorners = new ArrayList>(); + allCorners.add( + cornerTopLeft.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + allCorners.add( + cornerTopRight.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + allCorners.add( + cornerBottomRight.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + allCorners.add( + cornerBottomLeft.matchAll(searchBitmap, 0, 0, searchBitmap.getWidth(), searchBitmap.getHeight(), 0.8)); + + // Simple version of Hought transformation with 4 pre-defined rotations + for (int phi = 0; phi < allCorners.size(); phi++) { + for (SearchPoint corner : allCorners.get(phi)) { + SearchPoint[] a = houghTransform(phi, corner, allCorners); + if (count(a) < (a.length - 1)) { + continue; + } + + SearchPoint[] bbox = minmax(phi, a); + + if (searchBitmap.get(bbox[0].getX(), bbox[0].getY()) > 0 + && searchBitmap.get(bbox[1].getX(), bbox[0].getY()) > 0 + && searchBitmap.get(bbox[1].getX(), bbox[1].getY()) > 0 + && searchBitmap.get(bbox[0].getX(), bbox[1].getY()) > 0) { + int area = SearchPoint.GetArea(bbox); + if (area > maxArea) { + maxArea = area; + result = bbox; + } + } + } + } + return result; + } + + private SearchPoint[] houghTransform(int phi, SearchPoint locus, List> points) { + SearchPoint[] a = { null, null, null, null }; + + a[phi] = locus; + + for (int j = 0; j < points.size(); j++) { + if (j != phi) { + for (SearchPoint point : points.get(j)) { + int[] g = gradient(locus, point); + if (g[0] == R[phi][j][0] && g[1] == R[phi][j][1]) { + if (a[j] == null || distance(locus, point) < distance(locus, a[j])) { + a[j] = point; + } + } + } + } + } + + return 
a; + } + + private int[] gradient(SearchPoint p1, SearchPoint p2) { + int vx = p2.getX() - p1.getX(); + vx = (vx == 0) ? 0 : ((vx > 0) ? 1 : -1); + int vy = p2.getY() - p1.getY(); + vy = (vy == 0) ? 0 : ((vy > 0) ? 1 : -1); + return new int[] { vx, vy }; + } + + private double distance(SearchPoint p1, SearchPoint p2) { + double vx = p1.getX() - p2.getX(); + double vy = p1.getY() - p2.getY(); + return Math.sqrt(vx * vx + vy * vy); + } + + private int count(SearchPoint[] points) { + int count = 0; + for (int k = 0; k < 4; k++) { + if (points[k] != null) { + count++; + } + } + return count; + } + + private SearchPoint[] minmax(int phi, SearchPoint[] points) { + int minX = points[phi].getX(); + int minY = points[phi].getY(); + int maxX = points[phi].getX(); + int maxY = points[phi].getY(); + for (int k = 0; k < 4; k++) { + if (k != phi && points[k] != null) { + minX = Math.min(minX, points[k].getX()); + minY = Math.min(minY, points[k].getY()); + maxX = Math.max(maxX, points[k].getX()); + maxY = Math.max(maxY, points[k].getY()); + } + } + return new SearchPoint[] { new SearchPoint(minX, minY), new SearchPoint(maxX, maxY) }; + } + + private int R[][][] = { { { 0, 0 }, { 1, 0 }, { 2, 2 }, { 0, 1 } }, { { -1, 0 }, { 0, 0 }, { 0, 1 }, { -2, 2 } }, + { { -2, -2 }, { 0, -1 }, { 0, 0 }, { -1, 0 } }, { { 0, -1 }, { 2, -2 }, { 1, 0 }, { 0, 0 } } }; + + private TemplateMatcher cornerTopLeft = new TemplateMatcher( + new Template(new float[][] { { 0, 0, 0 }, { 0, 1, 1 }, { 0, 1, 1 } })); + + private TemplateMatcher cornerTopRight = new TemplateMatcher( + new Template(new float[][] { { 0, 0, 0 }, { 1, 1, 0 }, { 1, 1, 0 } })); + + private TemplateMatcher cornerBottomLeft = new TemplateMatcher( + new Template(new float[][] { { 0, 1, 1 }, { 0, 1, 1 }, { 0, 0, 0 } })); + + private TemplateMatcher cornerBottomRight = new TemplateMatcher( + new Template(new float[][] { { 1, 1, 0 }, { 1, 1, 0 }, { 0, 0, 0 } })); +} diff --git 
a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSON.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSON.java new file mode 100644 index 00000000..ec6a0f23 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSON.java @@ -0,0 +1,134 @@ +package com.github.romualdrousseau.any2json.commons.json; + +import java.lang.reflect.InvocationTargetException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Stream; + +import org.reflections.Reflections; + +public class JSON { + public final static String PACKAGE_LOADER_PREFIX = "com.github.romualdrousseau.shuju.json"; + + private static JSONFactory Factory; + static { + final Reflections reflections = new Reflections(PACKAGE_LOADER_PREFIX); + JSON.Factory = reflections.getSubTypesOf(JSONFactory.class).stream() + .map(JSON::newFactoryInstance) + .findFirst() + .get(); + } + + private static JSONFactory newFactoryInstance(Class clazz) { + try { + return (JSONFactory) clazz.getConstructor().newInstance(); + } catch (InstantiationException | IllegalAccessException + | IllegalArgumentException | InvocationTargetException + | NoSuchMethodException | SecurityException e) { + throw new RuntimeException(e); + } + } + + public static JSONArray newArray() { + return JSON.Factory.newArray(); + } + + public static JSONArray arrayOf(String data) { + return JSON.Factory.parseArray(data); + } + + public static JSONArray arrayOf(Object object) { + return JSON.Factory.parseArray(object); + } + + public static JSONArray arrayOf(final List l) { + final JSONArray array = JSON.newArray(); + l.forEach(s -> array.append(s)); + return array; + } + + public static JSONArray arrayOf(final Stream l) { + final JSONArray array = JSON.newArray(); + l.forEach(s -> array.append(s)); + return array; + } + + public static JSONArray 
arrayOf(final Map m) { + final JSONArray array = JSON.newArray(); + m.forEach((k, v) -> { + JSONObject pair = JSON.newObject(); + pair.set("key", k); + pair.set("value", v); + array.append(pair); + }); + return array; + } + + public static JSONArray loadArray(Path filePath) { + return JSON.Factory.loadArray(filePath); + } + + public static void saveArray(JSONArray a, Path filePath) { + JSON.Factory.saveArray(a, filePath, false); + } + + public static void saveArray(JSONArray a, Path filePath, final boolean pretty) { + JSON.Factory.saveArray(a, filePath, pretty); + } + + public static JSONObject newObject() { + return JSON.Factory.newObject(); + } + + public static JSONObject objectOf(String data) { + return JSON.Factory.parseObject(data); + } + + public static JSONObject objectOf(Object object) { + return JSON.Factory.parseObject(object); + } + + public static JSONObject objectOf(final Map m) { + final JSONObject object = JSON.newObject(); + m.forEach((k, v) -> object.set(k, v)); + return object; + } + + public static JSONObject loadObject(Path filePath) { + return JSON.Factory.loadObject(filePath); + } + + public static void saveObject(JSONObject o, Path filePath) { + JSON.Factory.saveObject(o, filePath, false); + } + + public static void saveObject(JSONObject o, Path filePath, final boolean pretty) { + JSON.Factory.saveObject(o, filePath, pretty); + } + + @SuppressWarnings("unchecked") + public static Optional query(final Object a, final String q) { + Object curr = a; + for(String token: Arrays.asList(q.split("\\."))) { + if (curr instanceof JSONArray) { + int i = Integer.parseInt(token); + curr = ((JSONArray) curr).get(i).orElse(null); + } else if (curr instanceof JSONObject) { + curr = ((JSONObject) curr).get(token).orElse(null); + } else { + curr = null; + } + } + return Optional.ofNullable((T) curr); + } + + public static Stream queryStream(final Object a, final String q) { + return JSON.query(a, q) + .filter(o -> o instanceof JSONArray) + .map(o -> 
/**
 * An ordered, index-addressable JSON array.
 */
public interface JSONArray {

    /** @return the number of elements */
    int size();

    /**
     * @param i index of the element
     * @return the element at {@code i}, cast by the implementation to the type the caller asks for
     */
    <T> Optional<T> get(int i);

    /** Replaces the element at {@code i}; returns an array so that calls can be chained. */
    <T> JSONArray set(int i, T o);

    /** Adds {@code o} at the end; returns an array so that calls can be chained. */
    <T> JSONArray append(T o);

    /** Removes the element at {@code i}; returns an array so that calls can be chained. */
    JSONArray remove(int i);

    String toString(final boolean pretty);

    String toString();

    /**
     * Streams the elements in index order.
     *
     * <p>{@link #size()} is re-read before every element, and each element is unwrapped with
     * {@link Optional#get()}, so an element for which {@link #get(int)} returns an empty Optional
     * raises {@link java.util.NoSuchElementException} when the stream reaches it.
     *
     * @return a sequential stream over the elements
     */
    default <T> Stream<T> stream() {
        final Iterable<T> it = () -> new Iterator<T>() {
            private int idx = 0;

            @Override
            public boolean hasNext() {
                return idx < JSONArray.this.size();
            }

            @Override
            public T next() {
                // Iterator contract: signal exhaustion explicitly instead of relying on get(i).
                if (!hasNext()) {
                    throw new java.util.NoSuchElementException("index " + idx);
                }
                return JSONArray.this.<T>get(idx++).get();
            }
        };
        return StreamSupport.stream(it.spliterator(), false);
    }
}
public static Collector> toUnmodifiableMap(final String key, final String value) { + return Collectors.toUnmodifiableMap(x -> x.get(key).get(), x -> x.get(value).get()); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSONFactory.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSONFactory.java new file mode 100644 index 00000000..a94334b0 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSONFactory.java @@ -0,0 +1,26 @@ +package com.github.romualdrousseau.any2json.commons.json; + +import java.nio.file.Path; + +public interface JSONFactory { + + JSONArray newArray(); + + JSONArray parseArray(String data); + + JSONArray parseArray(Object object); + + JSONArray loadArray(Path filePath); + + void saveArray(JSONArray a, Path filePath, boolean pretty); + + JSONObject newObject(); + + JSONObject parseObject(String data); + + JSONObject parseObject(Object object); + + JSONObject loadObject(Path filePath); + + void saveObject(JSONObject o, Path filePath, boolean pretty); +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSONObject.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSONObject.java new file mode 100644 index 00000000..925873d0 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/json/JSONObject.java @@ -0,0 +1,18 @@ +package com.github.romualdrousseau.any2json.commons.json; + +import java.util.Optional; + +public interface JSONObject { + + Iterable keys(); + + Optional get(String k); + + JSONObject set(String k, T o); + + JSONObject remove(String k); + + String toString(final boolean pretty); + + String toString(); +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/Text.java 
b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/Text.java new file mode 100644 index 00000000..490a1cff --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/Text.java @@ -0,0 +1,149 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing; + +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import com.github.romualdrousseau.any2json.commons.types.CollectionUtils; +import com.github.romualdrousseau.any2json.commons.json.JSON; +import com.github.romualdrousseau.any2json.commons.json.JSONArray; +import com.github.romualdrousseau.any2json.commons.preprocessing.hasher.DefaultHasher; +import com.github.romualdrousseau.any2json.commons.preprocessing.tokenizer.DefaultTokenizer; +import com.github.romualdrousseau.any2json.commons.preprocessing.comparer.DefaultComparer; + +public class Text { + + public interface ITokenizer extends Function> { + } + + public interface IHasher extends Function { + } + + public interface IComparer extends BiFunction, Boolean> { + String anonymize(String v); + String anonymize(final String v, final String pattern); + Optional find(String v); + Optional find(final String v, final String pattern); + } + + public static ITokenizer DefaultTokenizer = new DefaultTokenizer(); + + public static IHasher DefaultHasher = new DefaultHasher(); + + public static IComparer DefaultComparer = new DefaultComparer(); + + public static List DefaultFilters = List.of("[\\\\!\"#$%&()*+,-./:;<=>?@\\[\\]^_`{|}~\\t\\n]"); + + public static Comparator ComparatorByLength = (a, b) -> b.length() - a.length(); + + public static Map> get_lexicon(List lexicon) { + return lexicon.stream() + .map(w -> List.of(w.split(","))) + 
.collect(Collectors.toMap( + w -> w.get(0), + w -> w.stream().distinct().sorted(Text.ComparatorByLength).toList())); + } + + public static List all_words(final List documents) { + return Text.all_words(documents, Text.DefaultFilters); + } + + public static List all_words(final List documents, final List filters) { + return Text.all_words(documents, filters, Text.DefaultTokenizer); + } + + public static List all_words(final List documents, final List filters, final ITokenizer tokenizer) { + return documents.stream() + .flatMap(d -> d != null ? Text.to_words(d, filters, tokenizer).stream() : Stream.empty()) + .distinct().sorted().toList(); + } + + public static List to_words(final String text) { + return Text.to_words(text, Text.DefaultFilters); + } + + public static List to_words(final String text, final List filters) { + return Text.to_words(text, filters, Text.DefaultTokenizer); + } + + public static List to_words(final String text, final List filters, final ITokenizer tokenizer) { + return tokenizer.apply(filters.stream().reduce(text, (a, x) -> a.replaceAll("(?i)" + x, " "))); + } + + public static List to_categorical(final String label, final List classes) { + return Text.to_categorical(label, classes, Text.DefaultComparer); + } + + public static List to_categorical(final String label, final List classes, + final IComparer comparer) { + return Text.to_categorical(List.of(label), classes, comparer); + } + + public static List to_categorical(final List labels, final List classes) { + return Text.to_categorical(labels, classes, Text.DefaultComparer); + } + + public static List to_categorical(final List labels, final List classes, + final IComparer comparer) { + return classes.stream().map(c -> comparer.apply(c, labels) ? 
1 : 0).toList(); + } + + public static String anonymize(final String label, final IComparer comparer) { + return Text.anonymize(List.of(label), comparer).get(0); + } + + public static List anonymize(final List labels, final IComparer comparer) { + return labels.stream().map(l -> comparer.anonymize(l)).toList(); + } + + public static List one_hot(final String text) { + return Text.one_hot(text, Text.DefaultFilters, Text.DefaultTokenizer, Text.DefaultHasher); + } + + public static List one_hot(final String text, final List filters) { + return Text.one_hot(text, filters, Text.DefaultTokenizer, Text.DefaultHasher); + } + + public static List one_hot(final String text, final List filters, final ITokenizer tokenizer) { + return Text.one_hot(text, filters, tokenizer, Text.DefaultHasher); + } + + public static List one_hot(final String text, final List filters, final ITokenizer tokenizer, IHasher hasher) { + return Text.to_words(text, filters, tokenizer).stream().map(hasher).toList(); + } + + public static List pad_sequence(final List sequence, final int maxLen) { + return Text.pad_sequence(sequence, maxLen, 0); + } + + public static List pad_sequence(final List sequence, final int maxLen, final int value) { + final IntStream padding = IntStream.range(sequence.size(), maxLen).map(x -> value); + return Stream.concat(sequence.stream(), padding.boxed()).toList(); + } + + public static List mutate_sequence(final List sequence) { + return Text.mutate_sequence(sequence, 0.1f, 0); + } + + public static List mutate_sequence(final List sequence, final float p) { + return Text.mutate_sequence(sequence, p, 0); + } + + public static List mutate_sequence(final List sequence, final float p, final int value) { + final var shuffler = CollectionUtils.shuffle(CollectionUtils.mutableRange(0, sequence.size())); + final Function mutator = x -> Math.random() < p ? 
value : sequence.get(x); + return shuffler.stream().map(mutator).filter(x -> x != value).toList(); + } + + public static JSONArray json_sequence(final List sequence) { + JSONArray result = JSON.newArray(); + sequence.forEach(x -> result.append(x)); + return result; + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/comparer/DefaultComparer.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/comparer/DefaultComparer.java new file mode 100644 index 00000000..55df0523 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/comparer/DefaultComparer.java @@ -0,0 +1,34 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing.comparer; + +import java.util.List; +import java.util.Optional; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; + +public class DefaultComparer implements Text.IComparer { + + @Override + public Boolean apply(final String a, final List b) { + return b.contains(a); + } + + @Override + public String anonymize(final String v) { + return v; + } + + @Override + public String anonymize(final String v, final String pattern) { + return v; + } + + @Override + public Optional find(final String v) { + return Optional.empty(); + } + + @Override + public Optional find(final String v, final String pattern) { + return Optional.empty(); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/comparer/RegexComparer.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/comparer/RegexComparer.java new file mode 100644 index 00000000..8100580a --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/comparer/RegexComparer.java @@ -0,0 +1,72 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing.comparer; + +import 
java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; + +public class RegexComparer implements Text.IComparer { + + private final Map patterns; + private final Map compiledPatterns; + + public RegexComparer(final Map patterns) { + this.patterns = patterns; + this.compiledPatterns = patterns.keySet().stream() + .collect(Collectors.toUnmodifiableMap(r -> r, this::compileRegex)); + } + + @Override + public Boolean apply(final String a, final List b) { + return (a == null) ? false + : this.patterns.entrySet().stream() + .filter(p -> p.getValue().equals(a)) + .map(p -> this.compiledPatterns.get(p.getKey()).matcher("")) + .anyMatch(m -> b.stream().anyMatch(v -> v != null && m.reset(v).find())); + } + + @Override + public String anonymize(final String v) { + return (v == null) ? null + : this.patterns.entrySet().stream() + .reduce(v, (r, e) -> this.compiledPatterns.get(e.getKey()).matcher(r).replaceAll(e.getValue()), + (res1, res2) -> res1); + } + + @Override + public String anonymize(final String v, final String filter) { + return (v == null) ? null + : this.patterns.entrySet().stream() + .filter(e -> e.getValue().equals(filter)) + .reduce(v, (r, e) -> this.compiledPatterns.get(e.getKey()).matcher(r).replaceAll(e.getValue()), + (res1, res2) -> res1); + } + + @Override + public Optional find(final String v) { + return (v == null) ? Optional.empty() + : this.compiledPatterns.values().stream() + .map(e -> e.matcher(v)) + .filter(m -> m.find()) + .map(m -> m.group()) + .findFirst(); + } + + @Override + public Optional find(final String v, final String filter) { + return (v == null) ? 
Optional.empty() + : this.patterns.entrySet().stream() + .filter(p -> p.getValue().equals(filter)) + .map(p -> this.compiledPatterns.get(p.getKey()).matcher(v)) + .filter(m -> m.find()) + .map(m -> m.group()) + .findFirst(); + } + + private Pattern compileRegex(final String r) { + return Pattern.compile(r, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/hasher/DefaultHasher.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/hasher/DefaultHasher.java new file mode 100644 index 00000000..08121d84 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/hasher/DefaultHasher.java @@ -0,0 +1,11 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing.hasher; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; + +public class DefaultHasher implements Text.IHasher { + + @Override + public Integer apply(final String w) { + return w.hashCode(); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/hasher/VocabularyHasher.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/hasher/VocabularyHasher.java new file mode 100644 index 00000000..c051e5e1 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/hasher/VocabularyHasher.java @@ -0,0 +1,20 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing.hasher; + +import java.util.Collections; +import java.util.List; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; + +public class VocabularyHasher implements Text.IHasher { + + private final List vocabulary; + + public VocabularyHasher(final List vocabulary) { + this.vocabulary = vocabulary; + } + + @Override + public Integer apply(final String w) { + 
return Math.max(0, Collections.binarySearch(this.vocabulary, w) + 1); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/DefaultTokenizer.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/DefaultTokenizer.java new file mode 100644 index 00000000..19e06c94 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/DefaultTokenizer.java @@ -0,0 +1,15 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing.tokenizer; + +import java.util.Arrays; +import java.util.List; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; + +public class DefaultTokenizer implements Text.ITokenizer { + @Override + public List apply(final String w) { + final String s = StringUtils.normalizeWhiteSpaces(w).toLowerCase(); + return Arrays.asList(s.split("\s+")); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/NgramTokenizer.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/NgramTokenizer.java new file mode 100644 index 00000000..b1e8529b --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/NgramTokenizer.java @@ -0,0 +1,37 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing.tokenizer; + +import java.util.ArrayList; +import java.util.List; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; + +public class NgramTokenizer implements Text.ITokenizer { + + private final int n; + + public NgramTokenizer(final int n) { + this.n = n; + } + + @Override + public List apply(final String w) { + String s = 
StringUtils.normalizeWhiteSpaces(w); + + // Join by space and underscore + s = s.replaceAll("[\\s_]+", "").trim(); + + // Fill up with ? to have at least one token + while (s.length() < this.n) { + s += "?"; + } + + final ArrayList result = new ArrayList(); + for (int i = 0; i < s.length() - this.n + 1; i++) { + final String ss = s.substring(i, i + this.n); + result.add(ss.toLowerCase()); + } + + return result; + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/ShingleTokenizer.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/ShingleTokenizer.java new file mode 100644 index 00000000..fb7d90aa --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/preprocessing/tokenizer/ShingleTokenizer.java @@ -0,0 +1,87 @@ +package com.github.romualdrousseau.any2json.commons.preprocessing.tokenizer; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; + +public class ShingleTokenizer implements Text.ITokenizer { + + private static final int MIN_SIZE = 2; + + private static final ThreadLocal CAMEL_PATTERN = new ThreadLocal() { + @Override + protected Pattern initialValue() { + return Pattern.compile("(?>> variants; + private final int minSize; + + private boolean lemmatization; + + public ShingleTokenizer(final List lexicon) { + this(lexicon, MIN_SIZE); + } + + public ShingleTokenizer(final List lexicon, final int minSize) { + this(lexicon, minSize, true); + } + + public ShingleTokenizer(final List lexicon, final int minSize, final boolean lemmatization) { + this.variants = Text.get_lexicon(lexicon).entrySet().stream() + .sorted((a, b) -> b.getKey().length() - 
a.getKey().length()).toList(); + this.minSize = minSize; + this.lemmatization = lemmatization; + } + + public void enableLemmatization() { + this.lemmatization = true; + } + + public void disableLemmatization() { + this.lemmatization = false; + } + + @Override + public List apply(final String w) { + var s = StringUtils.normalizeWhiteSpaces(w); + + // Split using a lexicon of known words if any and prioritize longest variant + + final var lexems = this.variants.stream().collect(Collectors.toList()); + while (lexems.size() > 0) { + final var lexem = lexems.remove(0); + for (final String variant : lexem.getValue()) { + if (s.toLowerCase().contains(variant)) { + final var replacement = this.lemmatization ? lexem.getKey() : variant; + s = s.replaceAll("(?i)" + variant, " " + replacement + " "); + lexems.removeIf(x -> x.getValue().stream().anyMatch(y -> replacement.contains(y))); + break; + } + } + } + + // Clean by space and underscore + + s = s.replaceAll("[\\s_]+", " ").trim(); + + // Split by space and then by Camel notation words + + final ArrayList result = new ArrayList(); + for (final String ss : s.split(" ")) { + for (final String sss : CAMEL_PATTERN.get().split(ss)) { + if (sss.length() > 0 && (sss.length() > (minSize - 1) || !Character.isAlphabetic(sss.charAt(0)))) { + result.add(sss.toLowerCase()); + } + } + } + + return result; + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/python/PythonManager.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/python/PythonManager.java new file mode 100644 index 00000000..d974cb93 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/python/PythonManager.java @@ -0,0 +1,201 @@ +package com.github.romualdrousseau.any2json.commons.python; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collection; +import java.util.List; 
+import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.stream.Stream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PythonManager { + private static final Logger LOGGER = LoggerFactory.getLogger(PythonManager.class); + + public PythonManager(final String moduleName) throws IOException { + final var prop = new Properties(); + prop.load(this.findPropertiesFile()); + + this.modulePath = this.getModulePath(prop.getProperty(moduleName + ".module-path")); + this.mainEntry = prop.getProperty(moduleName + ".module-main", "main.py"); + this.hasVirtualEnv = prop.getProperty(moduleName + ".virtual-env", "false").equals("true"); + this.virtualEnvPath = this.getVirtualEnvPath(prop.getProperty(moduleName + ".virtual-env-path", ".venv")); + this.hasDependencies = prop.getProperty(moduleName + ".dependencies", "false").equals("true"); + } + + public PythonManager enableVirtualEnv() throws IOException, InterruptedException { + if (this.virtualEnvPath.toFile().exists()) { + return this; + } + + LOGGER.info("venv: Create a new virtual environment"); + + final ProcessBuilder processBuilder = new ProcessBuilder("python", "-m", "venv", this.virtualEnvPath.toString()); + processBuilder.directory(this.modulePath.toFile()); + processBuilder.inheritIO(); + processBuilder.redirectErrorStream(true); + processBuilder.start().waitFor(); + return this; + } + + public PythonManager installDependencies() throws IOException, InterruptedException { + if (this.isRequirementsInstalled()) { + return this; + } + + LOGGER.info("pip: Install and update all dependencies"); + + final ProcessBuilder processBuilder = new ProcessBuilder(this.getPipScript(), "install", "-r", + "requirements.txt"); + processBuilder.directory(this.modulePath.toFile()); + processBuilder.inheritIO(); + processBuilder.redirectErrorStream(true); + processBuilder.start().waitFor(); + + final var lockFile = this.modulePath.resolve("requirements.lock").toFile(); 
+ lockFile.createNewFile(); + + return this; + } + + public PythonManager setEnviroment(final Map environment) { + this.environment = environment; + return this; + } + + public Process run(final String... args) throws IOException, InterruptedException { + if (this.hasVirtualEnv) { + this.enableVirtualEnv(); + } + + if (this.hasDependencies) { + this.installDependencies(); + } + + final var command = Stream.of(List.of(this.getPythonScript(), this.mainEntry), List.of(args)) + .flatMap(Collection::stream).toList(); + final ProcessBuilder processBuilder = new ProcessBuilder(command); + processBuilder.directory(this.modulePath.toFile()); + processBuilder.redirectErrorStream(true); + + if (this.environment != null || this.environment.size() > 0) { + final var env = processBuilder.environment(); + this.environment.forEach((k, v) -> env.put(k, v)); + } + + LOGGER.info("python: Call {} with args: {}", this.mainEntry, args); + + return processBuilder.start(); + } + + private InputStream findPropertiesFile() throws IOException { + final var userDir = System.getProperty("user.dir"); + return this.getPathIfExists(Path.of(userDir, "python4j.properties")) + .or(() -> this.getPathIfExists(Path.of(userDir, "classes", "python4j.properties"))) + .flatMap(this::pathToStream) + .or(() -> this.resolveResourceAsStream("python4j.properties")) + .orElseThrow(() -> PythonManager.panicAndAbort("python4j.properties")); + } + + private boolean isRequirementsInstalled() throws IOException { + final var requireFile = this.modulePath.resolve("requirements.txt").toFile(); + if (!requireFile.exists()) { + return false; + } + + final var lockFile = this.modulePath.resolve("requirements.lock").toFile(); + if (lockFile.exists()) { + if (requireFile.lastModified() < lockFile.lastModified()) { + return true; + } + lockFile.delete(); + } + + return false; + } + + private String getPythonScript() { + if (this.hasVirtualEnv) { + return this.getScriptPath("bin/python") + .or(() -> 
this.getScriptPath("Scripts/python.exe")) + .orElseThrow(() -> PythonManager.panicAndAbort("python")) + .toString(); + } else { + return "python"; + } + } + + private String getPipScript() { + if (this.hasVirtualEnv) { + return this.getScriptPath("bin/pip") + .or(() -> this.getScriptPath("Scripts/pip.exe")) + .orElseThrow(() -> PythonManager.panicAndAbort("pip")) + .toString(); + } else { + return "pip"; + } + } + + private Optional getScriptPath(final String pathName) { + return this.getPathIfExists(this.virtualEnvPath.resolve(pathName)); + } + + private Path getModulePath(final String moduleName) { + final var userDir = System.getProperty("user.dir"); + return this.getPathIfExists(Path.of(userDir, moduleName)) + .or(() -> this.getPathIfExists(Path.of(userDir, "classes", moduleName))) + .orElseThrow(() -> PythonManager.panicAndAbort(moduleName)); + } + + private Path getVirtualEnvPath(String virtualEnvPath) { + if (Path.of(virtualEnvPath).isAbsolute()) { + return Path.of(virtualEnvPath); + } else { + return this.modulePath.resolve(virtualEnvPath); + } + } + + private Optional pathToStream(final Path x) { + try { + return Optional.of(Files.newInputStream(x)); + } catch (final IOException e) { + return Optional.empty(); + } + } + + private Optional resolveResourceAsStream(final String resourceName) { + final InputStream resource = this.getClass().getClassLoader().getResourceAsStream(resourceName); + if (resource == null) { + LOGGER.debug("module: {} not found", resourceName); + return Optional.empty(); + } + LOGGER.debug("module: {} found at {}", resourceName, resource); + return Optional.of(resource); + } + + private Optional getPathIfExists(final Path path) { + if (!path.toFile().exists()) { + LOGGER.debug("module: {} not found at {}", path.getFileName(), path); + return Optional.empty(); + } + LOGGER.debug("module: {} found at {}", path.getFileName(), path); + return Optional.of(path); + } + + private static RuntimeException panicAndAbort(final String name) { + 
/**
 * {@link SimpleDateFormat} built from Python {@code strftime}-style patterns, plus converters
 * between the two pattern syntaxes.
 */
public class PythonSimpleDateFormat extends SimpleDateFormat {

    /**
     * Java pattern tokens and their Python directives. For a given letter the longer token comes
     * first so that the scan in {@link #toPython(String)} always takes the longest match.
     */
    private static final String[][] JAVA_TO_PYTHON = {
        {"YYYY", "%G"},
        {"yyyy", "%Y"},
        {"yy", "%y"},
        {"y", "%-y"},
        {"MMMMM", "%B"},
        {"MMM", "%b"},
        {"MM", "%m"},
        {"M", "%-m"},
        {"DDD", "%j"},
        {"dd", "%d"},
        {"d", "%-d"},
        {"EEEEE", "%A"},
        {"EEE", "%a"},
        {"ww", "%W"},
        {"u", "%u"},
        {"HH", "%H"},
        {"H", "%-H"},
        {"hh", "%I"},
        {"h", "%-I"},
        {"mm", "%M"},
        {"m", "%-M"},
        {"ss", "%S"},
        {"s", "%-S"},
    };

    public PythonSimpleDateFormat() {
        super();
    }

    /**
     * @param pattern a Python-style pattern, converted with {@link #toJava(String)}
     */
    public PythonSimpleDateFormat(final String pattern) {
        super(PythonSimpleDateFormat.toJava(pattern));
    }

    /**
     * @param pattern a Python-style pattern, converted with {@link #toJava(String)}
     * @param formatSymbols the date format symbols to use
     */
    public PythonSimpleDateFormat(final String pattern, DateFormatSymbols formatSymbols) {
        super(PythonSimpleDateFormat.toJava(pattern), formatSymbols);
    }

    /**
     * @param pattern a Python-style pattern, converted with {@link #toJava(String)}
     * @param locale the locale whose date format symbols are used
     */
    public PythonSimpleDateFormat(final String pattern, Locale locale) {
        super(PythonSimpleDateFormat.toJava(pattern), locale);
    }

    /**
     * Converts a Java date pattern into its Python equivalent.
     *
     * <p>The pattern is scanned once from left to right, so text produced by one substitution
     * is never rewritten by a later one (for example {@code dd} yields {@code %d}, and the
     * {@code d} of that directive is left alone). Characters that start no known token are
     * copied unchanged.
     *
     * @param javaPattern the Java pattern
     * @return the Python pattern
     */
    public static String toPython(final String javaPattern) {
        final StringBuilder python = new StringBuilder(javaPattern.length() * 2);
        int pos = 0;
        while (pos < javaPattern.length()) {
            final String[] rule = findRuleAt(javaPattern, pos);
            if (rule == null) {
                python.append(javaPattern.charAt(pos));
                pos++;
            } else {
                python.append(rule[1]);
                pos += rule[0].length();
            }
        }
        return python.toString();
    }

    /**
     * Converts a Python date pattern into its Java equivalent.
     *
     * @param pythonPattern the Python pattern
     * @return the Java pattern
     */
    public static String toJava(final String pythonPattern) {
        // No replacement contains '%', so chaining literal replacements cannot re-match its own output.
        return pythonPattern
                .replace("%G", "YYYY")
                .replace("%Y", "yyyy")
                .replace("%y", "yy")
                .replace("%-y", "y")
                .replace("%B", "MMMMM")
                .replace("%b", "MMM")
                .replace("%m", "MM")
                .replace("%-m", "M")
                .replace("%j", "DDD")
                .replace("%d", "dd")
                .replace("%-d", "d")
                .replace("%A", "EEEEE")
                .replace("%a", "EEE")
                .replace("%W", "ww")
                .replace("%w", "u")
                .replace("%u", "u")
                .replace("%U", "ww")
                .replace("%V", "ww")
                .replace("%H", "HH")
                .replace("%-H", "H")
                .replace("%I", "hh")
                .replace("%-I", "h")
                .replace("%M", "mm")
                .replace("%-M", "m")
                .replace("%S", "ss")
                .replace("%-S", "s");
    }

    /** Returns the first conversion rule whose Java token starts at {@code pos}, or {@code null}. */
    private static String[] findRuleAt(final String javaPattern, final int pos) {
        for (final String[] rule : JAVA_TO_PYTHON) {
            if (javaPattern.startsWith(rule[0], pos)) {
                return rule;
            }
        }
        return null;
    }
}
a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/redux/Reducer.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/redux/Reducer.java new file mode 100644 index 00000000..a4aaf65c --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/redux/Reducer.java @@ -0,0 +1,5 @@ +package com.github.romualdrousseau.any2json.commons.redux; + +import java.util.function.BiFunction; + +public interface Reducer extends BiFunction {} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/redux/Store.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/redux/Store.java new file mode 100644 index 00000000..c92a1658 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/redux/Store.java @@ -0,0 +1,36 @@ +package com.github.romualdrousseau.any2json.commons.redux; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class Store { + + private final Map>> subscribers = new HashMap<>(); + private final List> reducers = new ArrayList<>(); + private S state; + + public Store(final S state) { + this.state = state; + } + + public S getState() { + return this.state; + } + + public void addSubscriber(final A action, final Subscriber subscriber) { + this.subscribers.computeIfAbsent(action, x -> new ArrayList<>()).add(subscriber); + } + + public void addReducer(final Reducer reducer) { + this.reducers.add(reducer); + } + + public void dispatch(final A action) { + @SuppressWarnings("unchecked") final var result = (A) action.get(); + this.state = reducers.stream().reduce(this.state, (x, y) -> y.apply(x, result), (x, y) -> y); + this.subscribers.getOrDefault(result, Collections.emptyList()).forEach(x -> x.accept(this, result)); + } +} diff --git 
/**
 * Character-based string similarity measures. Every score lies in {@code [0, 1]}, where
 * {@code 1} means identical.
 */
public class StringFuzzy {

    /**
     * Exponentially-scaled Hamming similarity: {@code exp(matches - length)}.
     *
     * @return {@code 0} when the strings differ in length or are empty, otherwise a value in
     *         {@code (0, 1]} that reaches {@code 1} only when all positions agree
     */
    public static float Hamming(String s, String t) {
        if (s.length() != t.length() || s.isEmpty()) {
            return 0.0f;
        }

        int same = 0;
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) == t.charAt(i)) {
                same++;
            }
        }
        // exp(same) / exp(length) == exp(same - length); the single exponent is never positive,
        // so long strings cannot overflow to Infinity / Infinity = NaN.
        return (float) Math.exp(same - s.length());
    }

    /**
     * Jaro similarity of the two strings (no prefix bonus is applied).
     *
     * @return {@code 1} for two empty strings, {@code 0} when no character matches
     */
    public static float JaroWinkler(String s, String t) {
        final int sLen = s.length();
        final int tLen = t.length();

        if (sLen == 0 && tLen == 0) {
            return 1.0f;
        }

        // Clamp to 0: for strings of length 1 the raw formula yields -1, which would empty the
        // search window and make identical single characters score 0.
        final int matchDistance = Math.max(0, Math.max(sLen, tLen) / 2 - 1);

        final boolean[] sMatches = new boolean[sLen];
        final boolean[] tMatches = new boolean[tLen];

        int matches = 0;
        for (int i = 0; i < sLen; i++) {
            final int start = Math.max(0, i - matchDistance);
            final int end = Math.min(i + matchDistance + 1, tLen);

            for (int j = start; j < end; j++) {
                if (tMatches[j] || s.charAt(i) != t.charAt(j)) {
                    continue;
                }
                sMatches[i] = true;
                tMatches[j] = true;
                matches++;
                break;
            }
        }

        if (matches == 0) {
            return 0.0f;
        }

        // Count matched characters that appear in a different order in the two strings.
        int transpositions = 0;
        int k = 0;
        for (int i = 0; i < sLen; i++) {
            if (!sMatches[i]) {
                continue;
            }
            while (!tMatches[k]) {
                k++;
            }
            if (s.charAt(i) != t.charAt(k)) {
                transpositions++;
            }
            k++;
        }

        final float m = matches;
        return (m / sLen + m / tLen + (m - transpositions / 2.0f) / m) / 3.0f;
    }

    /**
     * Jaccard index of the character sets of the two strings.
     *
     * @return {@code |intersection| / |union|}; {@code 1} when both strings are empty
     */
    public static float Jaccard(String s1, String s2) {
        final int unionSize = StringFuzzy.union(s1, s2).length();
        if (unionSize == 0) {
            // Two empty sets are identical; avoids 0 / 0 = NaN.
            return 1.0f;
        }
        return (float) StringFuzzy.intersect(s1, s2).length() / (float) unionSize;
    }

    /** Returns the distinct characters of {@code s1} then {@code s2}, in order of first appearance. */
    public static String union(String s1, String s2) {
        final StringBuilder result = new StringBuilder();
        appendDistinct(result, s1);
        appendDistinct(result, s2);
        return result.toString();
    }

    /** Returns the distinct elements of {@code s1} then {@code s2}, in order of first appearance. */
    public static String[] union(String[] s1, String[] s2) {
        final List<String> result = new ArrayList<>(s1.length + s2.length);

        for (String v : s1) {
            if (!result.contains(v)) {
                result.add(v);
            }
        }

        for (String v : s2) {
            if (!result.contains(v)) {
                result.add(v);
            }
        }

        return result.toArray(new String[0]);
    }

    /** Returns the distinct characters of {@code s1} that also occur in {@code s2}, in {@code s1} order. */
    public static String intersect(String s1, String s2) {
        final StringBuilder result = new StringBuilder();

        for (char c : s1.toCharArray()) {
            if (result.indexOf(String.valueOf(c)) < 0 && s2.indexOf(c) >= 0) {
                result.append(c);
            }
        }

        return result.toString();
    }

    /** Returns the distinct elements of {@code s1} that also occur in {@code s2}, in {@code s1} order. */
    public static String[] intersect(String[] s1, String[] s2) {
        final List<String> result = new ArrayList<>(s1.length + s2.length);
        final List<String> other = Arrays.asList(s2);

        for (String v : s1) {
            if (!result.contains(v) && other.contains(v)) {
                result.add(v);
            }
        }

        return result.toArray(new String[0]);
    }

    /** Appends to {@code target} each character of {@code source} that it does not hold yet. */
    private static void appendDistinct(final StringBuilder target, final String source) {
        for (char c : source.toCharArray()) {
            if (target.indexOf(String.valueOf(c)) < 0) {
                target.append(c);
            }
        }
    }
}
a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/strings/StringUtils.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/strings/StringUtils.java new file mode 100644 index 00000000..02981951 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/strings/StringUtils.java @@ -0,0 +1,153 @@ +package com.github.romualdrousseau.any2json.commons.strings; + +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.regex.Pattern; + +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; + +public class StringUtils { + public static final String WHITE_SPACES = "\\s\\u00A0\\u3000"; + public static final String WRONG_UNICODE = "\\uFFFD"; + public static final char WRONG_UNICODE_CHAR = '\uFFFD'; + public static final String BOM = "\\uFEFF"; + public static final char BOM_CHAR = '\uFEFF'; + + public static final Map symbols = Map.of( + "%+", "percent", + "\\$+", "dollar" + ); + + private static final ThreadLocal CLEAN_TOKEN_REGEX1 = new ThreadLocal<>() { + @Override + protected Pattern initialValue() { + return Pattern.compile("[" + StringUtils.WHITE_SPACES + "]+"); + } + }; + private static final ThreadLocal CLEAN_TOKEN_REGEX2 = new ThreadLocal<>() { + @Override + protected Pattern initialValue() { + return Pattern.compile("^[\" ]+|[\" ]+$"); + } + }; + + public static boolean isBlank(final String s) { + return s == null || StringUtils.trim(s).equals(""); + } + + public static boolean isFastBlank(final String s) { + return s == null || s.isBlank(); + } + + public static String trim(final String s) { + return trim(s, StringUtils.WHITE_SPACES); + } + + public static String trim(final String s, final String whiteSpaces) { + if (s == null) { + return null; + } + return s.replaceAll("^[" + whiteSpaces + "]+", "").replaceAll("[" + whiteSpaces + "]+$", ""); + } + + public static String normalizeWhiteSpaces(final 
String s) { + return normalizeWhiteSpaces(s, StringUtils.WHITE_SPACES); + } + + public static String normalizeWhiteSpaces(final String s, final String whiteSpaces) { + if (s == null) { + return null; + } + return s.replaceAll("[" + whiteSpaces + "]", " "); + } + + public static String removeWhiteSpaces(final String s) { + return removeWhiteSpaces(s, StringUtils.WHITE_SPACES); + } + + public static String removeWhiteSpaces(final String s, final String whiteSpaces) { + if (s == null) { + return null; + } + return s.replaceAll("[" + whiteSpaces + "]", ""); + } + + public static String singleWhiteSpaces(final String s) { + return singleWhiteSpaces(s, StringUtils.WHITE_SPACES); + } + + public static String singleWhiteSpaces(final String s, final String whiteSpaces) { + if (s == null) { + return null; + } + return s.replaceAll("[" + whiteSpaces + "]+", " "); + } + + public static String capitalize(final String s) { + if (s == null) { + return null; + } + if (s.length() <= 1) { + return s.toLowerCase(); + } else { + return Character.toUpperCase(s.charAt(0)) + s.substring(1).toLowerCase(); + } + } + + public static String uncapitalize(final String s) { + if (s == null) { + return null; + } + if (s.length() <= 1) { + return s.toLowerCase(); + } else { + return Character.toLowerCase(s.charAt(0)) + s.substring(1); + } + } + + public static boolean checkIfGoodEncoding(final String s) { + if (s == null) { + return false; + } + return !Pattern.compile(StringUtils.WRONG_UNICODE).matcher(s).find(); + } + + public static String cleanToken(final String s) { + if (s == null) { + return null; + } + var ss = CLEAN_TOKEN_REGEX1.get().matcher(s).replaceAll(" ").trim(); + if (ss.startsWith("\"") && ss.endsWith("\"")) { + ss = CLEAN_TOKEN_REGEX2.get().matcher(ss).replaceAll(""); + } + return ss; + } + + public static String toSnake(final String w, final Text.ITokenizer tokenizer) { + return String.join("_", tokenizer.apply(StringUtils.encodeSymbols(w).replaceAll("\\W+", " 
"))).toLowerCase(); + } + + public static String toCamel(final String w, final Text.ITokenizer tokenizer) { + return uncapitalize( + String.join("", tokenizer.apply(StringUtils.encodeSymbols(w).replaceAll("\\W+", " ")).stream() + .map(StringUtils::capitalize).toArray(String[]::new))); + } + + public static String encodeSymbols(final String s) { + var tmp = s; + for(var e: symbols.entrySet()) { + tmp = tmp.replaceAll(e.getKey(), e.getValue()); + } + return tmp; + } + + public static Set getSymbols() { + return symbols.keySet(); + } + + public static Optional merge(final String sep, final List values) { + return values.stream().reduce((a, x) -> !a.contains(x) ? String.join(sep, a, x) : a); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/types/CollectionUtils.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/types/CollectionUtils.java new file mode 100644 index 00000000..e469889b --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/types/CollectionUtils.java @@ -0,0 +1,21 @@ +package com.github.romualdrousseau.any2json.commons.types; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class CollectionUtils { + + public static List mutableRange(int a, int b) { + List result = new ArrayList(); + for (int i = a; i < b; i++) { + result.add(i); + } + return result; + } + + public static List shuffle(List l) { + Collections.shuffle(l); + return l; + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/types/Pair.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/types/Pair.java new file mode 100644 index 00000000..1d2328a3 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/types/Pair.java @@ -0,0 +1,28 @@ +package com.github.romualdrousseau.any2json.commons.types; + +import java.util.Map; + +public 
/**
 * Minimal one-dimensional tensor backed by a {@code float[]}.
 *
 * <p>The in-place operations ({@link #iadd(Tensor)} and {@link #if_lt_then(float, float, float)})
 * write directly into {@link #data} and return {@code this} so calls can be chained.
 */
public class Tensor {

    /** Shared empty tensor (size 0). */
    public static final Tensor Null = Tensor.zeros(0);

    /**
     * Builds a tensor from {@code double} values, narrowing each one to {@code float}.
     *
     * @param data the values to copy
     * @return a new tensor holding the narrowed values
     */
    public static Tensor of(final double... data) {
        final var floats = new float[data.length];
        for (int i = 0; i < data.length; i++) {
            floats[i] = (float) data[i];
        }
        return new Tensor(floats);
    }

    /**
     * Wraps the given array; the array is not copied, so later writes to it are visible
     * through the tensor and vice versa.
     *
     * @param data the backing values
     * @return a tensor sharing {@code data}
     */
    public static Tensor of(final float... data) {
        return new Tensor(data);
    }

    /**
     * Creates a tensor of {@code size} zeros.
     *
     * @param size number of elements
     * @return a new zero-filled tensor
     */
    public static Tensor zeros(final int size) {
        // A freshly allocated float[] is already zero-filled; no explicit loop is needed.
        return new Tensor(new float[size]);
    }

    /** Number of elements; always equal to {@code data.length}. */
    public final int size;

    /** Backing values, exposed for direct access. */
    public final float[] data;

    /**
     * Wraps the given array without copying it.
     *
     * @param data the backing values
     */
    public Tensor(final float[] data) {
        this.data = data;
        this.size = data.length;
    }

    /**
     * Adds {@code t} element-wise into this tensor.
     *
     * @param t the tensor to add; expected to have the same size (checked by an assertion only)
     * @return this tensor
     */
    public Tensor iadd(final Tensor t) {
        assert this.size == t.size;
        for (int i = 0; i < this.size; i++) {
            this.data[i] += t.data[i];
        }
        return this;
    }

    /**
     * Replaces every element in place: elements strictly less than {@code n} become {@code f},
     * all others become {@code g}.
     *
     * @param n the threshold
     * @param f the value written where the element is below the threshold
     * @param g the value written otherwise
     * @return this tensor
     */
    public Tensor if_lt_then(final float n, final float f, final float g) {
        for (int i = 0; i < this.data.length; i++) {
            this.data[i] = (this.data[i] < n) ? f : g;
        }
        return this;
    }

    /**
     * Returns the index of the largest element.
     *
     * <p>Ties resolve to the lowest index. An empty tensor yields 0, the value the former
     * placeholder returned for every input, so callers never receive a negative index.
     * {@code NaN} never compares greater than another value, so a {@code NaN} is only
     * selected when it sits at index 0 and nothing replaces it.
     *
     * @return the index of the first maximum, or 0 when the tensor is empty
     */
    public int argmax() {
        int best = 0;
        for (int i = 1; i < this.size; i++) {
            if (this.data[i] > this.data[best]) {
                best = i;
            }
        }
        return best;
    }
}
= YAML.newArray(); + l.forEach(s -> array.append(s)); + return array; + } + + public static YAMLArray arrayOf(final Map m) { + final var array = YAML.newArray(); + m.forEach((k, v) -> { + final var pair = YAML.newObject(); + pair.set("key", k); + pair.set("value", v); + array.append(pair); + }); + return array; + } + + public static YAMLArray loadArray(final Path filePath) { + return YAML.Factory.loadArray(filePath); + } + + public static void saveArray(final YAMLArray a, final Path filePath) { + YAML.Factory.saveArray(a, filePath, false); + } + + public static void saveArray(final YAMLArray a, final Path filePath, final boolean pretty) { + YAML.Factory.saveArray(a, filePath, pretty); + } + + public static YAMLObject newObject() { + return YAML.Factory.newObject(); + } + + public static YAMLObject objectOf(final String data) { + return YAML.Factory.parseObject(data); + } + + public static YAMLObject objectOf(final Object object) { + return YAML.Factory.parseObject(object); + } + + public static YAMLObject objectOf(final Map m) { + final YAMLObject object = YAML.newObject(); + m.forEach((k, v) -> object.set(k, v)); + return object; + } + + public static YAMLObject loadObject(final Path filePath) { + return YAML.Factory.loadObject(filePath); + } + + public static void saveObject(final YAMLObject o, final Path filePath) { + YAML.Factory.saveObject(o, filePath, false); + } + + public static void saveObject(final YAMLObject o, final Path filePath, final boolean pretty) { + YAML.Factory.saveObject(o, filePath, pretty); + } + + @SuppressWarnings("unchecked") + public static Optional query(final Object a, final String q) { + Object curr = a; + for(final var token: Arrays.asList(q.split("\\."))) { + if (curr instanceof YAMLArray) { + final int i = Integer.parseInt(token); + curr = ((YAMLArray) curr).get(i).orElse(null); + } else if (curr instanceof YAMLObject) { + curr = ((YAMLObject) curr).get(token).orElse(null); + } else { + curr = null; + } + } + return 
/**
 * Ordered, index-addressable YAML sequence.
 *
 * <p>Element access is generic: the caller chooses the element type at the call site and the
 * implementation is expected to hand back the stored value as that type.
 */
public interface YAMLArray {

    /** @return the number of elements in this array */
    int size();

    /**
     * @param i zero-based index
     * @param <T> type the caller expects the element to have
     * @return the element at {@code i}, wrapped in an {@link Optional}
     */
    <T> Optional<T> get(int i);

    /**
     * @param i zero-based index
     * @param o value to store
     * @param <T> type of the stored value
     * @return an array, allowing call chaining
     */
    <T> YAMLArray set(int i, T o);

    /**
     * @param o value to add at the end
     * @param <T> type of the stored value
     * @return an array, allowing call chaining
     */
    <T> YAMLArray append(T o);

    /**
     * @param i zero-based index of the element to remove
     * @return an array, allowing call chaining
     */
    YAMLArray remove(int i);

    /**
     * @param pretty whether the textual form should be pretty-printed
     * @return the textual form of this array
     */
    String toString(final boolean pretty);

    @Override
    String toString();

    /**
     * Streams the elements in index order.
     *
     * <p>{@link #size()} is re-read on every step, so the stream observes the array lazily.
     * Each element is unwrapped with {@link Optional#get()}; an index for which
     * {@link #get(int)} returns an empty {@code Optional} therefore fails the stream with a
     * {@code NoSuchElementException}.
     *
     * @param <T> type the caller expects the elements to have
     * @return a sequential stream over the elements
     */
    default <T> Stream<T> stream() {
        final Iterable<T> elements = () -> new Iterator<T>() {
            private int idx = 0;

            @Override
            public boolean hasNext() {
                return idx < YAMLArray.this.size();
            }

            @Override
            public T next() {
                // Honour the Iterator contract explicitly instead of depending on how
                // get() reacts to an out-of-range index.
                if (!hasNext()) {
                    throw new java.util.NoSuchElementException("No element at index " + idx);
                }
                return YAMLArray.this.<T>get(idx++).get();
            }
        };
        return StreamSupport.stream(elements.spliterator(), false);
    }
}
+public class YAMLCollector { + + public static Collector> toMap(final String key, final String value) { + return Collectors.toMap(x -> x.get(key).get(), x -> x.get(value).get()); + } + + public static Collector> toUnmodifiableMap(final String key, final String value) { + return Collectors.toUnmodifiableMap(x -> x.get(key).get(), x -> x.get(value).get()); + } +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/yaml/YAMLFactory.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/yaml/YAMLFactory.java new file mode 100644 index 00000000..5e8ddd29 --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/yaml/YAMLFactory.java @@ -0,0 +1,26 @@ +package com.github.romualdrousseau.any2json.commons.yaml; + +import java.nio.file.Path; + +public interface YAMLFactory { + + YAMLArray newArray(); + + YAMLArray parseArray(String data); + + YAMLArray parseArray(Object object); + + YAMLArray loadArray(Path filePath); + + void saveArray(YAMLArray a, Path filePath, boolean pretty); + + YAMLObject newObject(); + + YAMLObject parseObject(String data); + + YAMLObject parseObject(Object object); + + YAMLObject loadObject(Path filePath); + + void saveObject(YAMLObject o, Path filePath, boolean pretty); +} diff --git a/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/yaml/YAMLObject.java b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/yaml/YAMLObject.java new file mode 100644 index 00000000..2cf73c4c --- /dev/null +++ b/any2json-commons/src/main/java/com/github/romualdrousseau/any2json/commons/yaml/YAMLObject.java @@ -0,0 +1,18 @@ +package com.github.romualdrousseau.any2json.commons.yaml; + +import java.util.Optional; + +public interface YAMLObject { + + Iterable keys(); + + Optional get(String k); + + YAMLObject set(String k, T o); + + YAMLObject remove(String k); + + String toString(final boolean pretty); + + String 
toString(); +} diff --git a/any2json-commons/src/site/markdown/index.md b/any2json-commons/src/site/markdown/index.md new file mode 100644 index 00000000..5bf7a78d --- /dev/null +++ b/any2json-commons/src/site/markdown/index.md @@ -0,0 +1,20 @@ +# About Any2Json Commons + +Any2Json Commons. + +## Description + +In today's data-driven landscape, navigating the complexities of semi-structured documents poses a significant challenge +for organizations. These documents, characterized by diverse formats and a lack of standardization, often require +specialized skills for effective manipulation and analysis. However, we propose a novel framework to address this +challenge. By leveraging innovative algorithms and machine learning techniques, [Any2Json](https://github.com/RomualdRousseau/Any2Json) +offers a solution that transcends manual coding, providing enhanced accessibility to users across diverse skill levels. +Moreover, by automating the extraction process, it not only saves time but also minimizes errors, particularly beneficial +for industries dealing with large volumes of such documents. Crucially, this framework integrates seamlessly with machine +learning workflows, unlocking new possibilities for data enrichment and predictive modeling. Aligned with the paradigm of +data as a service, it offers a scalable and efficient means of managing semi-structured data, thereby expanding the toolkit +of data services available to organizations. + +## Getting Started + +You will find articles and tutorials [here](https://romualdrousseau.github.io/Any2Json-Documents/). 
diff --git a/any2json-commons/src/site/resources/css/site.css b/any2json-commons/src/site/resources/css/site.css new file mode 100644 index 00000000..c48367c3 --- /dev/null +++ b/any2json-commons/src/site/resources/css/site.css @@ -0,0 +1,3 @@ +#bodyColumn { + max-width: 1000px; +} \ No newline at end of file diff --git a/any2json-commons/src/site/resources/images/any2json-logo.png b/any2json-commons/src/site/resources/images/any2json-logo.png new file mode 100644 index 00000000..bc971a5f Binary files /dev/null and b/any2json-commons/src/site/resources/images/any2json-logo.png differ diff --git a/any2json-commons/src/site/site.xml b/any2json-commons/src/site/site.xml new file mode 100644 index 00000000..14a3aded --- /dev/null +++ b/any2json-commons/src/site/site.xml @@ -0,0 +1,34 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.11.1 + + + + Any2Json Commons + images/any2json-logo.png + https://romualdrousseau.github.io/Any2Json-Commons/ + + + + +
+ + + + + + romualdrousseau/Any2Json-Commons + right + gray + + pull-right + + + + diff --git a/any2json-csv/pom.xml b/any2json-csv/pom.xml index dd226eb8..0d0fc46b 100644 --- a/any2json-csv/pom.xml +++ b/any2json-csv/pom.xml @@ -14,7 +14,7 @@ any2json-csv Convert any text file in Json - https://github.com/romualdrousseau/any2json-csv + https://github.com/romualdrousseau/any2json-monorepo @@ -24,12 +24,6 @@ ${project.version} - - com.github.romualdrousseau - shuju-jackson-json - ${shuju.version} - test - com.github.romualdrousseau any2json-net-classifier diff --git a/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvDocument.java b/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvDocument.java index 9aa6d06b..41ca2692 100644 --- a/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvDocument.java +++ b/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvDocument.java @@ -13,7 +13,7 @@ import com.github.romualdrousseau.any2json.base.BaseSheet; import com.github.romualdrousseau.any2json.transform.op.DropColumnsWhenFillRatioLessThan; import com.github.romualdrousseau.any2json.util.Disk; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class CsvDocument extends BaseDocument { diff --git a/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvSheet.java b/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvSheet.java index c040296d..f5b931ba 100644 --- a/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvSheet.java +++ b/any2json-csv/src/main/java/com/github/romualdrousseau/any2json/loader/csv/CsvSheet.java @@ -6,11 +6,11 @@ import java.util.ArrayList; import com.github.romualdrousseau.any2json.base.PatcheableSheetStore; -import com.github.romualdrousseau.shuju.bigdata.DataFrame; -import 
com.github.romualdrousseau.shuju.bigdata.DataFrameWriter; -import com.github.romualdrousseau.shuju.bigdata.Row; -import com.github.romualdrousseau.shuju.strings.StringUtils; -import com.github.romualdrousseau.shuju.types.Tensor; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrame; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrameWriter; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.types.Tensor; class CsvSheet extends PatcheableSheetStore implements Closeable { @@ -204,7 +204,7 @@ private String guessSeparator(final String sample) { } } - final var i = (int) Tensor.of(v).argmax(0).item(0); + final var i = (int) Tensor.of(v).argmax(); return (i == 0) ? null : SEPARATORS[i - 1]; } diff --git a/any2json-dbf/pom.xml b/any2json-dbf/pom.xml index c76af609..65400408 100644 --- a/any2json-dbf/pom.xml +++ b/any2json-dbf/pom.xml @@ -14,7 +14,7 @@ any2json-dbf Convert any text file in Json - https://github.com/romualdrousseau/any2json-dbf + https://github.com/romualdrousseau/any2json-monorepo @@ -30,23 +30,11 @@ ${javadbf.version} - - com.github.romualdrousseau - shuju-jackson-json - ${shuju.version} - test - com.github.romualdrousseau any2json-net-classifier ${project.version} test - - com.github.romualdrousseau - any2json-layex-parser - ${project.version} - test - diff --git a/any2json-dbf/src/main/java/com/github/romualdrousseau/any2json/loader/dbf/DbfSheet.java b/any2json-dbf/src/main/java/com/github/romualdrousseau/any2json/loader/dbf/DbfSheet.java index f066e0a8..b3529eab 100644 --- a/any2json-dbf/src/main/java/com/github/romualdrousseau/any2json/loader/dbf/DbfSheet.java +++ b/any2json-dbf/src/main/java/com/github/romualdrousseau/any2json/loader/dbf/DbfSheet.java @@ -9,10 +9,10 @@ import java.util.List; import com.github.romualdrousseau.any2json.base.PatcheableSheetStore; -import 
com.github.romualdrousseau.shuju.bigdata.DataFrame; -import com.github.romualdrousseau.shuju.bigdata.DataFrameWriter; -import com.github.romualdrousseau.shuju.bigdata.Row; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrame; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrameWriter; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; import com.linuxense.javadbf.DBFField; import com.linuxense.javadbf.DBFReader; diff --git a/any2json-examples/pom.xml b/any2json-examples/pom.xml index 0ff4e984..a7ba0eca 100644 --- a/any2json-examples/pom.xml +++ b/any2json-examples/pom.xml @@ -17,20 +17,9 @@ This package contains a number of examples that demonstrates how you can use the Any2Json to load documents from "real life". - https://github.com/romualdrousseau/any2json-examples + https://github.com/romualdrousseau/any2json-monorepo - - - com.github.romualdrousseau - shuju - ${shuju.version} - - - com.github.romualdrousseau - shuju-jackson-json - ${shuju.version} - com.github.romualdrousseau diff --git a/any2json-excel/pom.xml b/any2json-excel/pom.xml index ccbf354e..1156d95a 100644 --- a/any2json-excel/pom.xml +++ b/any2json-excel/pom.xml @@ -14,7 +14,7 @@ any2json-excel Convert any text file in Json - https://github.com/romualdrousseau/any2json-excel + https://github.com/romualdrousseau/any2json-monorepo @@ -45,12 +45,6 @@ ${poi.version} - - com.github.romualdrousseau - shuju-jackson-json - ${shuju.version} - test - com.github.romualdrousseau any2json-layex-parser diff --git a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsDocument.java b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsDocument.java index 26ecc0d2..f38df046 100644 --- 
a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsDocument.java +++ b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsDocument.java @@ -13,7 +13,7 @@ import com.github.romualdrousseau.any2json.base.BaseDocument; import com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; import com.github.romualdrousseau.any2json.Document; import com.github.romualdrousseau.any2json.Sheet; diff --git a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsSheet.java b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsSheet.java index 6b4b34f6..24362fe8 100644 --- a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsSheet.java +++ b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xls/XlsSheet.java @@ -6,7 +6,7 @@ import java.util.List; import com.github.romualdrousseau.any2json.base.PatcheableSheetStore; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.CellType; diff --git a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/ContentHandler.java b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/ContentHandler.java index dc565671..e0c07ca6 100644 --- a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/ContentHandler.java +++ b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/ContentHandler.java @@ -6,10 +6,10 @@ import java.util.ArrayList; import java.util.List; -import com.github.romualdrousseau.shuju.bigdata.DataFrame; -import 
com.github.romualdrousseau.shuju.bigdata.DataFrameWriter; -import com.github.romualdrousseau.shuju.bigdata.Row; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrame; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrameWriter; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; import org.apache.poi.ss.usermodel.BorderStyle; import org.apache.poi.ss.usermodel.CellStyle; diff --git a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxDocument.java b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxDocument.java index fe83e4df..722fb570 100644 --- a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxDocument.java +++ b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxDocument.java @@ -12,7 +12,7 @@ import com.github.romualdrousseau.any2json.Sheet; import com.github.romualdrousseau.any2json.base.BaseDocument; import com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; diff --git a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxSheet.java b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxSheet.java index 6081188b..f062f713 100644 --- a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxSheet.java +++ b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xlsx/XlsxSheet.java @@ -10,8 +10,8 @@ import javax.xml.parsers.SAXParserFactory; import 
com.github.romualdrousseau.any2json.base.PatcheableSheetStore; -import com.github.romualdrousseau.shuju.bigdata.DataFrame; -import com.github.romualdrousseau.shuju.bigdata.DataFrameWriter; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrame; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrameWriter; import org.apache.poi.ss.util.CellRangeAddress; import org.apache.poi.xssf.model.SharedStrings; diff --git a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xml/XmlSheet.java b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xml/XmlSheet.java index 73804249..da8b99b6 100644 --- a/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xml/XmlSheet.java +++ b/any2json-excel/src/main/java/com/github/romualdrousseau/any2json/loader/excel/xml/XmlSheet.java @@ -1,7 +1,7 @@ package com.github.romualdrousseau.any2json.loader.excel.xml; import com.github.romualdrousseau.any2json.base.PatcheableSheetStore; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; import nl.fountain.xelem.excel.Cell; import nl.fountain.xelem.excel.Row; diff --git a/any2json-layex-parser/pom.xml b/any2json-layex-parser/pom.xml index 6591c888..422a7899 100644 --- a/any2json-layex-parser/pom.xml +++ b/any2json-layex-parser/pom.xml @@ -16,7 +16,7 @@ Any2Json plugin to parse complex layout implementing Table Layout Regular Expression - Layex - https://github.com/romualdrousseau/any2json-layex-parser + https://github.com/romualdrousseau/any2json-monorepo diff --git a/any2json-llm-classifier/pom.xml b/any2json-llm-classifier/pom.xml index 60e0d533..565bcde4 100644 --- a/any2json-llm-classifier/pom.xml +++ b/any2json-llm-classifier/pom.xml @@ -16,15 +16,10 @@ Any2Json plugin to tag tabular output implementing embeddings. 
- https://github.com/romualdrousseau/any2json-llm-classifier + https://github.com/romualdrousseau/any2json-monorepo - - com.github.romualdrousseau - shuju - ${shuju.version} - com.github.romualdrousseau any2json @@ -49,18 +44,6 @@ ${log4j.version} test - - com.github.romualdrousseau - shuju-jackson-json - ${shuju.version} - test - - - com.github.romualdrousseau - shuju-jackson-yaml - ${shuju.version} - test - com.github.romualdrousseau any2json-layex-parser diff --git a/any2json-llm-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/LLMTagClassifier.java b/any2json-llm-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/LLMTagClassifier.java index 7565df9d..e8ede4e7 100644 --- a/any2json-llm-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/LLMTagClassifier.java +++ b/any2json-llm-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/LLMTagClassifier.java @@ -7,8 +7,8 @@ import java.util.List; import java.util.stream.StreamSupport; -import com.github.romualdrousseau.shuju.json.JSON; -import com.github.romualdrousseau.shuju.json.JSONObject; +import com.github.romualdrousseau.any2json.commons.json.JSON; +import com.github.romualdrousseau.any2json.commons.json.JSONObject; import com.github.romualdrousseau.any2json.Header; import com.github.romualdrousseau.any2json.Model; import com.github.romualdrousseau.any2json.Table; diff --git a/any2json-llm-classifier/src/site/site.xml b/any2json-llm-classifier/src/site/site.xml index 5fdefa69..a5389361 100644 --- a/any2json-llm-classifier/src/site/site.xml +++ b/any2json-llm-classifier/src/site/site.xml @@ -1,7 +1,7 @@ + name="Any2Json LLM Classifier"> org.apache.maven.skins @@ -23,7 +23,7 @@ - romualdrousseau/Any2Json-LLM-Classifierr + romualdrousseau/Any2Json-LLM-Classifier right gray diff --git a/any2json-net-classifier/pom.xml b/any2json-net-classifier/pom.xml index 1f1f37b3..40afcf09 100644 --- a/any2json-net-classifier/pom.xml +++ 
b/any2json-net-classifier/pom.xml @@ -16,9 +16,15 @@ Any2Json plugin to tag tabular output implementing embeddings. - https://github.com/romualdrousseau/any2json-net-classifier + https://github.com/romualdrousseau/any2json-monorepo + + + org.tensorflow + tensorflow-core-platform + ${tensorflow.version} + com.github.romualdrousseau diff --git a/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/NetTagClassifier.java b/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/NetTagClassifier.java index 529b0d96..31f42dff 100644 --- a/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/NetTagClassifier.java +++ b/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/NetTagClassifier.java @@ -19,6 +19,8 @@ import org.tensorflow.SessionFunction; import org.tensorflow.Signature; import org.tensorflow.exceptions.TensorFlowException; +import org.tensorflow.ndarray.Shape; +import org.tensorflow.ndarray.buffer.DataBuffers; import org.tensorflow.types.TFloat32; import com.github.romualdrousseau.any2json.Header; @@ -28,13 +30,13 @@ import com.github.romualdrousseau.any2json.TagClassifier; import com.github.romualdrousseau.any2json.util.Disk; import com.github.romualdrousseau.any2json.util.TempFile; -import com.github.romualdrousseau.shuju.types.Tensor; -import com.github.romualdrousseau.shuju.commons.PythonManager; -import com.github.romualdrousseau.shuju.json.JSON; -import com.github.romualdrousseau.shuju.preprocessing.Text; -import com.github.romualdrousseau.shuju.preprocessing.hasher.VocabularyHasher; -import com.github.romualdrousseau.shuju.preprocessing.tokenizer.NgramTokenizer; -import com.github.romualdrousseau.shuju.preprocessing.tokenizer.ShingleTokenizer; +import com.github.romualdrousseau.any2json.commons.types.Tensor; +import com.github.romualdrousseau.any2json.commons.python.PythonManager; +import 
com.github.romualdrousseau.any2json.commons.json.JSON; +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; +import com.github.romualdrousseau.any2json.commons.preprocessing.hasher.VocabularyHasher; +import com.github.romualdrousseau.any2json.commons.preprocessing.tokenizer.NgramTokenizer; +import com.github.romualdrousseau.any2json.commons.preprocessing.tokenizer.ShingleTokenizer; public class NetTagClassifier extends SimpleTagClassifier implements Trainable { @@ -119,12 +121,12 @@ public String predict(final Table table, final Header header) { .toArray(); final Map inputs = Map.of( - "entity_input", Tensor.of(entityInput).reshape(1, -1).toTFloat32(), - "name_input", Tensor.of(nameInput).reshape(1, -1).toTFloat32(), - "context_input", Tensor.of(contextInput).reshape(1, -1).toTFloat32()); + "entity_input", this.toTFloat32(Tensor.of(entityInput)), + "name_input", this.toTFloat32(Tensor.of(nameInput)), + "context_input", this.toTFloat32(Tensor.of(contextInput))); - final var result = Tensor.of((TFloat32) this.tagClassifierFunc.call(inputs).get("tag_output").get()); - return this.getModel().getTagList().get((int) result.argmax(1).item(0)); + final var result = this.fromTFloat32((TFloat32) this.tagClassifierFunc.call(inputs).get("tag_output").get()); + return this.getModel().getTagList().get(result.argmax()); } @Override @@ -234,4 +236,13 @@ private Path unserializeModelML(final String modelString) { throw new UncheckedIOException(x); } } + + public Tensor fromTFloat32(final TFloat32 v) { + final double[] data = v.streamOfObjects().mapToDouble(i -> (double) i).toArray(); + return Tensor.of(data); + } + + public TFloat32 toTFloat32(final Tensor tensor) { + return TFloat32.tensorOf(Shape.of(1L, (long) tensor.size), DataBuffers.of(tensor.data)); + } } diff --git a/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/TrainingSetBuilder.java 
b/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/TrainingSetBuilder.java index 3bcb84d7..e06688f6 100644 --- a/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/TrainingSetBuilder.java +++ b/any2json-net-classifier/src/main/java/com/github/romualdrousseau/any2json/classifier/TrainingSetBuilder.java @@ -10,8 +10,8 @@ import java.util.stream.IntStream; import java.util.stream.Stream; -import com.github.romualdrousseau.shuju.commons.CollectionUtils; -import com.github.romualdrousseau.shuju.preprocessing.Text; +import com.github.romualdrousseau.any2json.commons.types.CollectionUtils; +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; public class TrainingSetBuilder { private static final float TRAININGSET_AUGMENT_COEF = 0.5f; diff --git a/any2json-parquet/pom.xml b/any2json-parquet/pom.xml index f288f963..f611b274 100644 --- a/any2json-parquet/pom.xml +++ b/any2json-parquet/pom.xml @@ -14,7 +14,7 @@ any2json-parquet Convert any text file in Parquet - https://github.com/romualdrousseau/any2json-parquet + https://github.com/romualdrousseau/any2json-monorepo @@ -54,12 +54,6 @@ ${parquet.version} - - com.github.romualdrousseau - shuju-jackson-json - ${shuju.version} - test - com.github.romualdrousseau any2json-net-classifier diff --git a/any2json-parquet/src/main/java/com/github/romualdrousseau/any2json/loader/parquet/ParquetSheet.java b/any2json-parquet/src/main/java/com/github/romualdrousseau/any2json/loader/parquet/ParquetSheet.java index d6cf2c28..91e02187 100644 --- a/any2json-parquet/src/main/java/com/github/romualdrousseau/any2json/loader/parquet/ParquetSheet.java +++ b/any2json-parquet/src/main/java/com/github/romualdrousseau/any2json/loader/parquet/ParquetSheet.java @@ -7,10 +7,10 @@ import org.apache.parquet.hadoop.ParquetReader; import com.github.romualdrousseau.any2json.base.PatcheableSheetStore; -import com.github.romualdrousseau.shuju.bigdata.DataFrame; -import 
com.github.romualdrousseau.shuju.bigdata.DataFrameWriter; -import com.github.romualdrousseau.shuju.bigdata.Row; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrame; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrameWriter; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; class ParquetSheet extends PatcheableSheetStore implements Closeable { diff --git a/any2json-pdf/pom.xml b/any2json-pdf/pom.xml index e3d1ef4a..be92b5c7 100644 --- a/any2json-pdf/pom.xml +++ b/any2json-pdf/pom.xml @@ -16,7 +16,7 @@ Any2Json plugin to load PDF file. - https://github.com/romualdrousseau/any2json-pdf + https://github.com/romualdrousseau/any2json-monorepo diff --git a/any2json-pdf/src/main/java/com/github/romualdrousseau/any2json/loader/pdf/PdfSheet.java b/any2json-pdf/src/main/java/com/github/romualdrousseau/any2json/loader/pdf/PdfSheet.java index 7b712326..e0f113c5 100644 --- a/any2json-pdf/src/main/java/com/github/romualdrousseau/any2json/loader/pdf/PdfSheet.java +++ b/any2json-pdf/src/main/java/com/github/romualdrousseau/any2json/loader/pdf/PdfSheet.java @@ -8,10 +8,10 @@ import org.apache.pdfbox.pdmodel.PDDocument; import com.github.romualdrousseau.any2json.base.PatcheableSheetStore; -import com.github.romualdrousseau.shuju.bigdata.DataFrame; -import com.github.romualdrousseau.shuju.bigdata.DataFrameWriter; -import com.github.romualdrousseau.shuju.bigdata.Row; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrame; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrameWriter; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; import technology.tabula.ObjectExtractor; import technology.tabula.Page; diff --git 
a/any2json/pom.xml b/any2json/pom.xml index a082f98f..8def13e7 100644 --- a/any2json/pom.xml +++ b/any2json/pom.xml @@ -16,7 +16,7 @@ A java API to manipulate semi structured documents and extract data from them. - https://github.com/romualdrousseau/any2json + https://github.com/romualdrousseau/any2json-monorepo @@ -31,18 +31,11 @@ jython-standalone ${jython.version} - + com.github.romualdrousseau - shuju - ${shuju.version} - - - - com.github.romualdrousseau - shuju-jackson-json - ${shuju.version} - test + any2json-commons + ${project.version} diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java index afff935c..2b7e4ec4 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/Model.java @@ -8,9 +8,9 @@ import org.apache.commons.collections4.map.LRUMap; import com.github.romualdrousseau.any2json.modeldata.EmptyModelData; -import com.github.romualdrousseau.shuju.preprocessing.Text; -import com.github.romualdrousseau.shuju.preprocessing.comparer.RegexComparer; -import com.github.romualdrousseau.shuju.types.Tensor; +import com.github.romualdrousseau.any2json.commons.preprocessing.Text; +import com.github.romualdrousseau.any2json.commons.preprocessing.comparer.RegexComparer; +import com.github.romualdrousseau.any2json.commons.types.Tensor; public class Model { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/TransformableSheet.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/TransformableSheet.java index 979c03da..4724f03a 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/TransformableSheet.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/TransformableSheet.java @@ -17,7 +17,7 @@ import com.github.romualdrousseau.any2json.transform.op.RepeatColumnCell; import 
com.github.romualdrousseau.any2json.transform.op.RepeatRowCell; import com.github.romualdrousseau.any2json.transform.op.SwapRows; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; /** * TransformableSheet Class is responsible to apply transformations to a sheet diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseCell.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseCell.java index e918e400..a4c3fc2f 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseCell.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseCell.java @@ -6,8 +6,8 @@ import java.util.stream.IntStream; import com.github.romualdrousseau.any2json.Cell; -import com.github.romualdrousseau.shuju.strings.StringUtils; -import com.github.romualdrousseau.shuju.types.Tensor; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.types.Tensor; public class BaseCell implements Cell, Symbol { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java index 085a53e6..b3b1d9cd 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseDocument.java @@ -16,7 +16,7 @@ import com.github.romualdrousseau.any2json.parser.table.SimpleTableParser; import com.github.romualdrousseau.any2json.readdir.GutenbergDiagonal; import com.github.romualdrousseau.any2json.transform.op.StitchRows; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public abstract class BaseDocument implements Document { diff --git 
a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java index ad7d05d0..c4219276 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/base/BaseSheet.java @@ -22,7 +22,7 @@ import com.github.romualdrousseau.any2json.event.TableReadyEvent; import com.github.romualdrousseau.any2json.intelli.IntelliTable; import com.github.romualdrousseau.any2json.TransformableSheet; -import com.github.romualdrousseau.shuju.commons.CollectionUtils; +import com.github.romualdrousseau.any2json.commons.types.CollectionUtils; public class BaseSheet implements Sheet { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java index add58b30..edeac21c 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/classifier/SimpleTagClassifier.java @@ -7,8 +7,8 @@ import com.github.romualdrousseau.any2json.Model; import com.github.romualdrousseau.any2json.Table; import com.github.romualdrousseau.any2json.TagClassifier; -import com.github.romualdrousseau.shuju.preprocessing.tokenizer.ShingleTokenizer; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.preprocessing.tokenizer.ShingleTokenizer; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class SimpleTagClassifier implements TagClassifier { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java index bb120e7a..a3b9b731 100644 --- 
a/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/header/DataTableHeader.java @@ -10,8 +10,8 @@ import com.github.romualdrousseau.any2json.base.BaseRow; import com.github.romualdrousseau.any2json.base.BaseTable; import com.github.romualdrousseau.any2json.config.Settings; -import com.github.romualdrousseau.shuju.strings.StringUtils; -import com.github.romualdrousseau.shuju.types.Tensor; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.types.Tensor; public class DataTableHeader extends BaseHeader { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java index 69e04416..870f623c 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliHeader.java @@ -10,7 +10,7 @@ import com.github.romualdrousseau.any2json.base.BaseHeader; import com.github.romualdrousseau.any2json.config.Settings; import com.github.romualdrousseau.any2json.header.DataTableHeader; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class IntelliHeader extends DataTableHeader { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliRow.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliRow.java index aa6e0c08..30dbf4fe 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliRow.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliRow.java @@ -2,7 +2,7 @@ import com.github.romualdrousseau.any2json.base.BaseRow; import 
com.github.romualdrousseau.any2json.base.BaseTable; -import com.github.romualdrousseau.shuju.bigdata.Row; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; public class IntelliRow extends BaseRow { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliTable.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliTable.java index 2774e32c..7bb7cbec 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliTable.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/intelli/IntelliTable.java @@ -14,10 +14,10 @@ import com.github.romualdrousseau.any2json.base.RowGroup; import com.github.romualdrousseau.any2json.header.PivotEntry; import com.github.romualdrousseau.any2json.header.PivotKeyHeader; -import com.github.romualdrousseau.shuju.bigdata.DataFrame; -import com.github.romualdrousseau.shuju.bigdata.DataFrameWriter; -import com.github.romualdrousseau.shuju.bigdata.Row; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrame; +import com.github.romualdrousseau.any2json.commons.bigdata.DataFrameWriter; +import com.github.romualdrousseau.any2json.commons.bigdata.Row; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class IntelliTable extends DataTable { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelBuilder.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelBuilder.java index b17bf5cb..2bce14d7 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelBuilder.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelBuilder.java @@ -12,8 +12,8 @@ import java.util.List; import com.github.romualdrousseau.any2json.Model; -import com.github.romualdrousseau.shuju.yaml.YAML; -import 
com.github.romualdrousseau.shuju.yaml.YAMLObject; +import com.github.romualdrousseau.any2json.commons.yaml.YAML; +import com.github.romualdrousseau.any2json.commons.yaml.YAMLObject; public class DataContractModelBuilder { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelData.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelData.java index 1fd71521..e69caa14 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelData.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/DataContractModelData.java @@ -9,10 +9,10 @@ import java.util.stream.StreamSupport; import com.github.romualdrousseau.any2json.ModelData; -import com.github.romualdrousseau.shuju.commons.Pair; -import com.github.romualdrousseau.shuju.yaml.YAML; -import com.github.romualdrousseau.shuju.yaml.YAMLArray; -import com.github.romualdrousseau.shuju.yaml.YAMLObject; +import com.github.romualdrousseau.any2json.commons.types.Pair; +import com.github.romualdrousseau.any2json.commons.yaml.YAML; +import com.github.romualdrousseau.any2json.commons.yaml.YAMLArray; +import com.github.romualdrousseau.any2json.commons.yaml.YAMLObject; public class DataContractModelData implements ModelData { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelBuilder.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelBuilder.java index 677f824e..1112f1ad 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelBuilder.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelBuilder.java @@ -15,9 +15,9 @@ import com.github.romualdrousseau.any2json.Model; import com.github.romualdrousseau.any2json.TableParser; import com.github.romualdrousseau.any2json.TagClassifier; -import com.github.romualdrousseau.shuju.json.JSON; -import 
com.github.romualdrousseau.shuju.json.JSONObject; -import com.github.romualdrousseau.shuju.yaml.YAML; +import com.github.romualdrousseau.any2json.commons.json.JSON; +import com.github.romualdrousseau.any2json.commons.json.JSONObject; +import com.github.romualdrousseau.any2json.commons.yaml.YAML; public class JsonModelBuilder { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelData.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelData.java index e8eb5338..30dd6776 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelData.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/modeldata/JsonModelData.java @@ -7,10 +7,10 @@ import java.util.Optional; import com.github.romualdrousseau.any2json.ModelData; -import com.github.romualdrousseau.shuju.json.JSON; -import com.github.romualdrousseau.shuju.json.JSONArray; -import com.github.romualdrousseau.shuju.json.JSONCollector; -import com.github.romualdrousseau.shuju.json.JSONObject; +import com.github.romualdrousseau.any2json.commons.json.JSON; +import com.github.romualdrousseau.any2json.commons.json.JSONArray; +import com.github.romualdrousseau.any2json.commons.json.JSONCollector; +import com.github.romualdrousseau.any2json.commons.json.JSONObject; public class JsonModelData implements ModelData { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmap.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmap.java index e0ec5136..ba9051cd 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmap.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmap.java @@ -1,7 +1,7 @@ package com.github.romualdrousseau.any2json.parser.sheet; import com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.cv.ISearchBitmap; 
+import com.github.romualdrousseau.any2json.commons.cv.ISearchBitmap; public class SheetBitmap implements ISearchBitmap { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmapParser.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmapParser.java index e743b662..5ab70543 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmapParser.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/parser/sheet/SheetBitmapParser.java @@ -7,11 +7,11 @@ import com.github.romualdrousseau.any2json.base.BaseSheet; import com.github.romualdrousseau.any2json.base.BaseTable; import com.github.romualdrousseau.any2json.event.BitmapGeneratedEvent; -import com.github.romualdrousseau.shuju.cv.Filter; -import com.github.romualdrousseau.shuju.cv.ISearchBitmap; -import com.github.romualdrousseau.shuju.cv.SearchPoint; -import com.github.romualdrousseau.shuju.cv.Template; -import com.github.romualdrousseau.shuju.cv.shapeextractor.RectangleExtractor; +import com.github.romualdrousseau.any2json.commons.cv.Filter; +import com.github.romualdrousseau.any2json.commons.cv.ISearchBitmap; +import com.github.romualdrousseau.any2json.commons.cv.SearchPoint; +import com.github.romualdrousseau.any2json.commons.cv.Template; +import com.github.romualdrousseau.any2json.commons.cv.shapeextractor.RectangleExtractor; public class SheetBitmapParser implements SheetParser { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/AutoCrop.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/AutoCrop.java index 90517831..ecee922f 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/AutoCrop.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/AutoCrop.java @@ -1,7 +1,7 @@ package com.github.romualdrousseau.any2json.transform.op; import 
com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class AutoCrop { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenEntropyLessThan.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenEntropyLessThan.java index 2d176b51..5234f899 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenEntropyLessThan.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenEntropyLessThan.java @@ -4,7 +4,7 @@ import java.util.Map.Entry; import com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class DropColumnsWhenEntropyLessThan { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenFillRatioLessThan.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenFillRatioLessThan.java index 10f9279c..ffe5391c 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenFillRatioLessThan.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropColumnsWhenFillRatioLessThan.java @@ -1,7 +1,7 @@ package com.github.romualdrousseau.any2json.transform.op; import com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class DropColumnsWhenFillRatioLessThan { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenEntropyLessThan.java 
b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenEntropyLessThan.java index 7d59724a..553ec285 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenEntropyLessThan.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenEntropyLessThan.java @@ -4,7 +4,7 @@ import java.util.Map.Entry; import com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class DropRowsWhenEntropyLessThan { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenFillRatioLessThan.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenFillRatioLessThan.java index 3e3a0089..3fc27096 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenFillRatioLessThan.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/DropRowsWhenFillRatioLessThan.java @@ -1,7 +1,7 @@ package com.github.romualdrousseau.any2json.transform.op; import com.github.romualdrousseau.any2json.base.BaseSheet; -import com.github.romualdrousseau.shuju.strings.StringUtils; +import com.github.romualdrousseau.any2json.commons.strings.StringUtils; public class DropRowsWhenFillRatioLessThan { diff --git a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/StitchRows.java b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/StitchRows.java index f38fe6e0..fa2b780d 100644 --- a/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/StitchRows.java +++ b/any2json/src/main/java/com/github/romualdrousseau/any2json/transform/op/StitchRows.java @@ -4,7 +4,7 @@ import com.github.romualdrousseau.any2json.base.BaseSheet; import com.github.romualdrousseau.any2json.config.Settings; -import 
com.github.romualdrousseau.shuju.strings.StringFuzzy; +import com.github.romualdrousseau.any2json.commons.strings.StringFuzzy; public class StitchRows { diff --git a/pom.xml b/pom.xml index 59e5ae5d..d514d4e9 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ https://github.com/romualdrousseau/any2json-monorepo + any2json-commons any2json any2json-layex-parser any2json-net-classifier @@ -54,9 +55,9 @@ 3.6.3 - 1.30-SNAPSHOT 2.23.1 - 1.1.10.6 + 0.4.1 + 1.1.10.7 2.17.2 4.4 2.7.4 @@ -66,6 +67,8 @@ 1.14.2 1.0.5 0.18.2 + 0.10.2 + 0.5.0 4.13.2 1.10.2