Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/set sheet name #14

Merged
merged 6 commits into from
Sep 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,17 @@ protected EnumSet<Hint> getIntelliCapabilities() {
}

@Override
public boolean open(final File txtFile, final String encoding, final String password) {
public boolean open(final File txtFile, final String encoding, final String password, final String sheetName) {
if (txtFile == null) {
throw new IllegalArgumentException();
}

this.sheet = null;

if (encoding != null && this.openWithEncoding(txtFile, encoding)) {
final var sheetName2 = (sheetName == null) ? Disk.removeExtension(txtFile.getName()) : sheetName;
if (encoding != null && this.openWithEncoding(txtFile, encoding, sheetName2)) {
return true;
} else if (this.openWithEncoding(txtFile, "UTF-8")) {
} else if (this.openWithEncoding(txtFile, "UTF-8", sheetName2)) {
return true;
} else {
this.close();
Expand Down Expand Up @@ -82,13 +83,12 @@ public void autoRecipe(final BaseSheet sheet) {
}
}

private boolean openWithEncoding(final File txtFile, final String encoding) {
private boolean openWithEncoding(final File txtFile, final String encoding, final String sheetName) {
try {
final var reader = new BufferedReader(new InputStreamReader(new FileInputStream(txtFile), encoding));
if (encoding.startsWith("UTF-")) {
this.processUtfBOM(reader);
}
final var sheetName = Disk.removeExtension(txtFile.getName());
this.sheet = new CsvSheet(sheetName, reader);
this.sheet.checkDataEncoding();
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ protected EnumSet<Hint> getIntelliCapabilities() {
}

@Override
public boolean open(final File dbfFile, final String encoding, final String password) {
public boolean open(final File dbfFile, final String encoding, final String password, final String sheetName) {
if (dbfFile == null) {
throw new IllegalArgumentException();
}
Expand All @@ -39,9 +39,10 @@ public boolean open(final File dbfFile, final String encoding, final String pass
return false;
}

if (encoding != null && this.openWithEncoding(dbfFile, encoding)) {
final var sheetName2 = (sheetName == null) ? Disk.removeExtension(dbfFile.getName()) : sheetName;
if (encoding != null && this.openWithEncoding(dbfFile, encoding, sheetName2)) {
return true;
} else if (this.openWithEncoding(dbfFile, "ISO-8859-1")) {
} else if (this.openWithEncoding(dbfFile, "ISO-8859-1", sheetName2)) {
return true;
} else {
this.close();
Expand Down Expand Up @@ -82,10 +83,9 @@ public Sheet getSheetAt(final int i) {
public void autoRecipe(final BaseSheet sheet) {
}

private boolean openWithEncoding(final File dbfFile, final String encoding) {
private boolean openWithEncoding(final File dbfFile, final String encoding, final String sheetName) {
try {
final var reader = new DBFReader(new FileInputStream(dbfFile), Charset.forName(encoding));
final var sheetName = Disk.removeExtension(dbfFile.getName());
this.sheet = new DbfSheet(sheetName, reader);
return true;
} catch (final IOException | UnsupportedCharsetException x) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ protected EnumSet<Hint> getIntelliCapabilities() {
}

@Override
public boolean open(final File excelFile, final String encoding, final String password) {
public boolean open(final File excelFile, final String encoding, final String password, final String sheetName) {
if (excelFile == null) {
throw new IllegalArgumentException();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ protected EnumSet<Hint> getIntelliCapabilities() {
}

@Override
public boolean open(final File excelFile, final String encoding, final String password) {
public boolean open(final File excelFile, final String encoding, final String password, final String sheetName) {
if (excelFile == null) {
throw new IllegalArgumentException();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ protected EnumSet<Hint> getIntelliCapabilities() {
}

@Override
public boolean open(final File excelFile, final String encoding, final String password) {
public boolean open(final File excelFile, final String encoding, final String password, final String sheetName) {
if (excelFile == null) {
throw new IllegalArgumentException();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ protected EnumSet<Hint> getIntelliCapabilities() {
}

@Override
public boolean open(final File parquetFile, final String encoding, final String password) {
public boolean open(final File parquetFile, final String encoding, final String password, final String sheetName) {
if (parquetFile == null) {
throw new IllegalArgumentException();
}
Expand All @@ -48,8 +48,8 @@ public boolean open(final File parquetFile, final String encoding, final String
final var config = new Configuration();
final var file = HadoopInputFile.fromPath(path, config);
final var reader = AvroParquetReader.<GenericRecord>builder(file).disableCompatibility().build();
final var sheetName = Disk.removeExtension(parquetFile.getName());
this.sheet = new ParquetSheet(sheetName, reader);
final var sheetName2 = (sheetName == null) ? Disk.removeExtension(parquetFile.getName()) : sheetName;
this.sheet = new ParquetSheet(sheetName2, reader);
return true;
} catch (IOException x) {
this.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ protected EnumSet<Hint> getIntelliCapabilities() {
}

@Override
public boolean open(final File pdfFile, final String encoding, final String password) {
public boolean open(final File pdfFile, final String encoding, final String password, final String sheetName) {
if (pdfFile == null) {
throw new IllegalArgumentException();
}
Expand All @@ -41,9 +41,10 @@ public boolean open(final File pdfFile, final String encoding, final String pass
return false;
}

if (encoding != null && this.openWithEncoding(pdfFile, encoding)) {
final var sheetName2 = (sheetName == null) ? Disk.removeExtension(pdfFile.getName()) : sheetName;
if (encoding != null && this.openWithEncoding(pdfFile, encoding, sheetName2)) {
return true;
} else if (this.openWithEncoding(pdfFile, "ISO-8859-1")) {
} else if (this.openWithEncoding(pdfFile, "ISO-8859-1", sheetName2)) {
return true;
} else {
this.close();
Expand Down Expand Up @@ -80,10 +81,9 @@ public Sheet getSheetAt(final int i) {
return new BaseSheet(this, this.sheet.getName(), this.sheet.ensureDataLoaded());
}

private boolean openWithEncoding(final File pdfFile, final String encoding) {
private boolean openWithEncoding(final File pdfFile, final String encoding, final String sheetName) {
try {
final var reader = PDDocument.load(new FileInputStream(pdfFile));
final var sheetName = Disk.removeExtension(pdfFile.getName());
this.sheet = new PdfSheet(sheetName, reader);
return true;
} catch (final IOException | UnsupportedCharsetException x) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ class PdfSheet extends PatcheableSheetStore implements Closeable {

private static final int BATCH_SIZE = 50000;
private static final int MAX_COLUMNS = 100;
private static final double MARGIN = 6.0;
// Gap, measured in space widths, that must be exceeded between two consecutive text elements
// for the gap to count as a column separator; also the divisor used to turn the left offset
// of a line into a margin count.
private static final int LATICE_SPACES = 3;
// Minimum margin count for a line to be considered the beginning of a table row.
private static final int LATICE_MARGINS = 1;
// Minimum number of column separators for a line to be considered a table row.
private static final int LATICE_COLUMN_SEPARATORS = 4;

private final String name;

Expand Down Expand Up @@ -119,7 +121,8 @@ private DataFrame processRows(final PDDocument reader, final DataFrameWriter wri
return writer.getDataFrame();
}

private void processRowsTabular(final SpreadsheetExtractionAlgorithm sea, final Page page, final DataFrameWriter writer) throws IOException {
private void processRowsTabular(final SpreadsheetExtractionAlgorithm sea, final Page page,
final DataFrameWriter writer) throws IOException {
final var tables = sea.extract(page);
for (final var table : tables) {
final var rows = table.getRows();
Expand All @@ -135,7 +138,9 @@ private void processRowsTabular(final SpreadsheetExtractionAlgorithm sea, final
}
}

private void processRowsLatice(BasicExtractionAlgorithm bea, Page page, DataFrameWriter writer) throws IOException {
private void processRowsLatice(final BasicExtractionAlgorithm bea, final Page page, final DataFrameWriter writer)
throws IOException {
final var tableRows = new ArrayList<String>();
final var tables = bea.extract(page);
for (final var table : tables) {
final var rows = table.getRows();
Expand All @@ -148,22 +153,65 @@ private void processRowsLatice(BasicExtractionAlgorithm bea, Page page, DataFram
writer.write(Row.of(""));
writer.write(Row.of(""));
}
final var cells = new ArrayList<String>();
for (final var text : this.getCells(elements)) {
cells.add(StringUtils.cleanToken(text));
}
writer.write(Row.of(this.getCells(elements)));
tableRows.add(this.getTableRow(elements));
isPreviousTableRow = true;
} else {
if (tableRows.size() > 0) {
this.processTableLatice(tableRows, writer);
tableRows.clear();
}
if (isPreviousTableRow) {
writer.write(Row.of(""));
writer.write(Row.of(""));
}
writer.write(Row.of(StringUtils.cleanToken(this.getText(elements))));
isPreviousTableRow = false;
}
} else {
isPreviousTableRow = false;
}
}
}
if (tableRows.size() > 0) {
this.processTableLatice(tableRows, writer);
tableRows.clear();
}
}

/**
 * Splits a group of consecutive table lines into cells and writes one row per line.
 *
 * <p>A character column is treated as a boundary when, for every line, the line is either
 * shorter than that column or holds a character accepted by {@code isLaticeSpace}. Adjacent
 * boundary columns are collapsed so that only the last column of each run is kept, and
 * {@code maxLength - 1} is always appended as the closing boundary. Each line is then cut
 * between consecutive boundaries; text located before the first boundary is not emitted.
 *
 * @param rows   the text lines of one table; an empty list writes nothing
 * @param writer the destination for the extracted rows
 * @throws IOException declared for failures propagated from {@code writer.write}
 */
private void processTableLatice(final ArrayList<String> rows, final DataFrameWriter writer) throws IOException {
    // Nothing to emit, and max() below would yield an empty OptionalInt.
    if (rows.isEmpty()) {
        return;
    }

    final int maxLength = rows.stream().mapToInt(String::length).max().getAsInt();

    // Collect the boundary columns shared by all lines.
    final var tabs = new ArrayList<Integer>();
    var lastTab = -1;
    for (int i = 0; i < maxLength; i++) {
        final int tab = i;
        final var allBlanks = rows.stream()
                .allMatch(x -> tab >= x.length() || this.isLaticeSpace(x.charAt(tab)));
        if (allBlanks) {
            // Contiguous boundary columns form one separator: keep only its last column.
            if (lastTab >= 0 && (tab - lastTab) == 1) {
                tabs.remove(tabs.size() - 1);
            }
            tabs.add(tab);
            lastTab = tab;
        }
    }
    // Closing boundary so the last cell extends to the end of the longest line.
    tabs.add(maxLength - 1);

    for (final var row : rows) {
        final var cells = new ArrayList<String>();
        for (int i = 0; i < tabs.size() - 1; i++) {
            final int begin = tabs.get(i);
            if (begin < row.length()) {
                final int end = tabs.get(i + 1);
                if (end < row.length() - 1) {
                    cells.add(StringUtils.cleanToken(row.substring(begin, end)));
                } else {
                    // Next boundary is at or past the last character: take the remainder.
                    cells.add(StringUtils.cleanToken(row.substring(begin)));
                }
            } else {
                // Line is shorter than this cell's start: pad with an empty cell.
                cells.add("");
            }
        }
        writer.write(Row.of(cells.toArray(new String[0])));
    }
}

Expand All @@ -172,51 +220,48 @@ private List<TextElement> getElements(final List<RectangularTextContainer> row)
final var elements = new ArrayList<TextElement>();
for (final var cell : row) {
for (final var element : cell.getTextElements()) {
if (element instanceof TextElement){
if (element instanceof TextElement) {
elements.add((TextElement) element);
}
}
}
return elements;
}

private boolean isTableRow(List<TextElement> elements, boolean isPreviousTableRow) {
var margin = Math.floor(Math.max(elements.get(0).getX() / elements.get(0).getWidthOfSpace() - 4, 0) / 4);
var separators = 0.0;
// var symbols = 0.0;
private boolean isTableRow(final List<TextElement> elements, final boolean isPreviousTableRow) {
final var margins = (int) Math
.floor(Math.max(elements.get(0).getX() / elements.get(0).getWidthOfSpace(), 0) / LATICE_SPACES);

var separators = 0;
var x = elements.get(0).getX();
for (final TextElement element: elements) {
// if (element.getText().isBlank()) {
// symbols += 1.0;
// }
separators += Math.floor(Math.max((element.getX() - x) / element.getWidthOfSpace() - 4, 0) / 4);
for (final TextElement element : elements) {
final var spacing = Math.max((element.getX() - x) / element.getWidthOfSpace() - LATICE_SPACES, 0);
if (spacing > 0) {
separators++;
}
x = element.getX();
}

// Very naive Naive Bayes
final var pRow = pRowMargin(margin) * pRowSeparators(separators);
final var pNotRow = pNotRowMargin(margin) * pNotRowSeparators(separators);
return (!isPreviousTableRow) ? pRow > pNotRow : pRow >= pNotRow;
final var pRow = 0.5 * pRowMargin(margins) + 0.5 * pRowSeparators(separators);
return (!isPreviousTableRow) ? pRow == 1.0 : pRow >= 0.5; // Give a bit of lax if we are in a table, i.e. the
// previous row was a table row
}

private String[] getCells(List<TextElement> elements) {
var x = 0.0;
private String getTableRow(final List<TextElement> elements) {
var text = "";
for (final TextElement element: elements) {
final var spacing = Math.max((element.getX() - x) / element.getWidthOfSpace() - 4, 0);
for (final TextElement element : elements) {
final var spacing = Math.max(element.getX() / element.getWidthOfSpace() - 1, 0) - text.length();
for (int i = 0; i < spacing; i++) {
text += " ";
}
text += element.getText();
x = element.getX();
}
return text.split(" +");
return text;
}

private String getText(List<TextElement> elements) {
private String getText(final List<TextElement> elements) {
var text = "";
for (final TextElement element: elements) {
for (final TextElement element : elements) {
text += element.getText();
}
return text;
Expand All @@ -236,19 +281,15 @@ private String getCellAt(final int colIndex, final int rowIndex) {
return row.get(colIndex);
}

private float pRowMargin(final double margin) {
return margin > MARGIN ? 1.0f : 0.0f;
}

private float pNotRowMargin(final double margin) {
return 1.0f - pRowMargin(margin);
/**
 * Scores the left margin of a line as evidence that the line is a table row.
 *
 * @param margins the margin count computed for the line
 * @return {@code 1.0f} when {@code margins} reaches {@code LATICE_MARGINS}, otherwise {@code 0.0f}
 */
private float pRowMargin(final int margins) {
    if (margins >= LATICE_MARGINS) {
        return 1.0f;
    }
    return 0.0f;
}

private float pRowSeparators(final double separators) {
return separators > 0.0 ? 1.0f : 0.0f;
/**
 * Scores the number of column separators of a line as evidence that the line is a table row.
 *
 * @param separators the number of column separators counted in the line
 * @return {@code 1.0f} when {@code separators} reaches {@code LATICE_COLUMN_SEPARATORS},
 *         otherwise {@code 0.0f}
 */
private float pRowSeparators(final int separators) {
    if (separators >= LATICE_COLUMN_SEPARATORS) {
        return 1.0f;
    }
    return 0.0f;
}

private float pNotRowSeparators(final double separators) {
return 1.0f - this.pRowSeparators(separators);
/**
 * Tells whether a character counts as blank when looking for column boundaries.
 *
 * @param c the character to test
 * @return {@code true} for a space, {@code '-'}, {@code '_'} or {@code '|'}; {@code false} otherwise
 */
private boolean isLaticeSpace(final char c) {
    return c == ' ' || c == '-' || c == '_' || c == '|';
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ enum Hint {

TagClassifier getTagClassifier();

boolean open(final File file, final String encoding, final String password);
boolean open(final File file, final String encoding, final String password, final String sheetName);

void close();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,24 @@
public class DocumentFactory {

public static Document createInstance(final String filePath, final String encoding) {
return DocumentFactory.createInstance(new File(filePath), encoding, null);
return DocumentFactory.createInstance(new File(filePath), encoding, null, null);
}

public static Document createInstance(final String filePath, final String encoding, final String password) {
return DocumentFactory.createInstance(new File(filePath), encoding, password);
return DocumentFactory.createInstance(new File(filePath), encoding, password, null);
}

public static Document createInstance(final File file, final String encoding) {
return DocumentFactory.createInstance(file, encoding, null);
return DocumentFactory.createInstance(file, encoding, null, null);
}

public static Document createInstance(final File file, final String encoding, final String password) {
public static Document createInstance(final File file, final String encoding, final String password, final String sheetName) {
if (file == null) {
throw new IllegalArgumentException();
}
return DynamicPackages.GetDocumentFactories().stream()
.map(DocumentClass::newInstance)
.filter(x -> x.open(file, encoding, password))
.filter(x -> x.open(file, encoding, password, sheetName))
.findFirst()
.orElseThrow(() -> new UnknownFormatConversionException(file.toString()));
}
Expand Down
Loading