Skip to content

Commit

Permalink
#5237 - Allow tuning PDF import parameters via settings
Browse files Browse the repository at this point in the history
- Added configuration options (undocumented)
  • Loading branch information
reckart committed Jan 20, 2025
1 parent 2c24d87 commit 6043581
Show file tree
Hide file tree
Showing 7 changed files with 437 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package de.tudarmstadt.ukp.inception.pdfeditor2.config;

import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

Expand All @@ -29,6 +30,7 @@
* Provides support for a PDF-oriented annotation editor.
*/
@Configuration
@EnableConfigurationProperties(PdfFormatPropertiesImpl.class)
public class PdfAnnotationEditor2SupportAutoConfiguration
{
@ConditionalOnProperty(prefix = "ui.pdf", name = "enabled", havingValue = "true", matchIfMissing = true)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.pdfeditor2.config;

import de.tudarmstadt.ukp.inception.pdfeditor2.visual.VisualPDFTextStripper;

/**
 * Read-only view of the settings that tune how text is extracted from PDF files on import. Each
 * accessor corresponds to the setter on the {@code VisualPDFTextStripper} named in its
 * {@code @see} tag.
 */
public interface PdfFormatProperties
{
    /**
     * Controls the ordering of the extracted text.
     *
     * @return {@code true} to sort the text the way it appears on screen, {@code false} to keep
     *         the order in which it is stored in the PDF file. Sorting may help with PDFs
     *         produced by writers that emit text grouped by style, but it may cause problems
     *         with other kinds of PDFs, e.g. those containing watermark text in the background.
     *
     * @see VisualPDFTextStripper#setSortByPosition(boolean)
     */
    boolean isSortByPosition();

    /**
     * Controls the removal of duplicate overlapping text.
     *
     * @return {@code true} if the text stripper should try to remove text that overlaps other
     *         text (e.g. Word paints the same character several times to make it look bold).
     *         With {@code false}, all text is extracted: certain sections will be duplicated,
     *         but extraction performs better.
     *
     * @see VisualPDFTextStripper#setSuppressDuplicateOverlappingText(boolean)
     */
    boolean isSuppressDuplicateOverlappingText();

    /**
     * Controls grouping of the output by beads.
     *
     * @return whether the text stripper should group the text output by a list of beads.
     *
     * @see VisualPDFTextStripper#setShouldSeparateByBeads(boolean)
     */
    boolean isShouldSeparateByBeads();

    /**
     * Controls additional text formatting.
     *
     * @return whether some additional text formatting will be added.
     *
     * @see VisualPDFTextStripper#setAddMoreFormatting(boolean)
     */
    boolean isAddMoreFormatting();

    /**
     * Indentation threshold used for paragraph detection.
     *
     * @return the multiple of whitespace character widths (for the current text) by which the
     *         current line start may be indented relative to the previous line start; beyond
     *         this, the current line start is considered to be a paragraph start.
     *
     * @see VisualPDFTextStripper#setIndentThreshold(float)
     */
    float getIndentThreshold();

    /**
     * Vertical drop threshold used for paragraph detection.
     *
     * @return the minimum whitespace, as a multiple of the maximum height of the current
     *         characters, beyond which the current line start is considered to be a paragraph
     *         start.
     *
     * @see VisualPDFTextStripper#setDropThreshold(float)
     */
    float getDropThreshold();

    /**
     * Character-width-based tolerance for inserting spaces.
     *
     * @return the character width-based tolerance value used to estimate where spaces should be
     *         added to the text. The default value was determined by trial and error. A larger
     *         value reduces the number of spaces added.
     *
     * @see VisualPDFTextStripper#setAverageCharTolerance(float)
     */
    float getAverageCharTolerance();

    /**
     * Space-width-based tolerance for inserting spaces.
     *
     * @return the space width-based tolerance value used to estimate where spaces should be
     *         added to the text. The default value was determined by trial and error. A larger
     *         value reduces the number of spaces added.
     *
     * @see VisualPDFTextStripper#setSpacingTolerance(float)
     */
    float getSpacingTolerance();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.pdfeditor2.config;

import org.springframework.boot.context.properties.ConfigurationProperties;

/**
 * Mutable bean implementing {@link PdfFormatProperties}, annotated to be bound to configuration
 * properties under the {@code format.pdf} prefix. Each property has a getter (from the interface)
 * and a matching setter; the field initializers below provide the values used when a property is
 * never set.
 */
@ConfigurationProperties("format.pdf")
public class PdfFormatPropertiesImpl
    implements PdfFormatProperties
{
    // Text ordering / de-duplication / bead grouping
    private boolean sortByPosition = false;
    private boolean suppressDuplicateOverlappingText = true;
    private boolean shouldSeparateByBeads = true;

    // Additional formatting and paragraph detection thresholds
    private boolean addMoreFormatting = true;
    private float indentThreshold = 2.0f;
    private float dropThreshold = 2.5f;

    // Tolerances used when estimating where spaces belong
    private float averageCharTolerance = 0.3f;
    private float spacingTolerance = 0.5f;

    /** {@inheritDoc} Initial value: {@code false}. */
    @Override
    public boolean isSortByPosition()
    {
        return this.sortByPosition;
    }

    /**
     * @param aSortByPosition
     *            new value for the sort-by-position flag.
     */
    public void setSortByPosition(boolean aSortByPosition)
    {
        this.sortByPosition = aSortByPosition;
    }

    /** {@inheritDoc} Initial value: {@code true}. */
    @Override
    public boolean isSuppressDuplicateOverlappingText()
    {
        return this.suppressDuplicateOverlappingText;
    }

    /**
     * @param aSuppressDuplicateOverlappingText
     *            new value for the suppress-duplicate-overlapping-text flag.
     */
    public void setSuppressDuplicateOverlappingText(boolean aSuppressDuplicateOverlappingText)
    {
        this.suppressDuplicateOverlappingText = aSuppressDuplicateOverlappingText;
    }

    /** {@inheritDoc} Initial value: {@code true}. */
    @Override
    public boolean isShouldSeparateByBeads()
    {
        return this.shouldSeparateByBeads;
    }

    /**
     * @param aShouldSeparateByBeads
     *            new value for the separate-by-beads flag.
     */
    public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
    {
        this.shouldSeparateByBeads = aShouldSeparateByBeads;
    }

    /** {@inheritDoc} Initial value: {@code true}. */
    @Override
    public boolean isAddMoreFormatting()
    {
        return this.addMoreFormatting;
    }

    /**
     * @param aAddMoreFormatting
     *            new value for the add-more-formatting flag.
     */
    public void setAddMoreFormatting(boolean aAddMoreFormatting)
    {
        this.addMoreFormatting = aAddMoreFormatting;
    }

    /** {@inheritDoc} Initial value: {@code 2.0}. */
    @Override
    public float getIndentThreshold()
    {
        return this.indentThreshold;
    }

    /**
     * @param aIndentThreshold
     *            new indent threshold.
     */
    public void setIndentThreshold(float aIndentThreshold)
    {
        this.indentThreshold = aIndentThreshold;
    }

    /** {@inheritDoc} Initial value: {@code 2.5}. */
    @Override
    public float getDropThreshold()
    {
        return this.dropThreshold;
    }

    /**
     * @param aDropThreshold
     *            new drop threshold.
     */
    public void setDropThreshold(float aDropThreshold)
    {
        this.dropThreshold = aDropThreshold;
    }

    /** {@inheritDoc} Initial value: {@code 0.3}. */
    @Override
    public float getAverageCharTolerance()
    {
        return this.averageCharTolerance;
    }

    /**
     * @param aAverageCharTolerance
     *            new average character tolerance.
     */
    public void setAverageCharTolerance(float aAverageCharTolerance)
    {
        this.averageCharTolerance = aAverageCharTolerance;
    }

    /** {@inheritDoc} Initial value: {@code 0.5}. */
    @Override
    public float getSpacingTolerance()
    {
        return this.spacingTolerance;
    }

    /**
     * @param aSpacingTolerance
     *            new spacing tolerance.
     */
    public void setSpacingTolerance(float aSpacingTolerance)
    {
        this.spacingTolerance = aSpacingTolerance;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,82 @@ public class VisualPdfReader
* @see VisualPDFTextStripper#setSortByPosition(boolean)
*/
public static final String PARAM_SORT_BY_POSITION = "sortByPosition";
@ConfigurationParameter(name = PARAM_SORT_BY_POSITION, mandatory = false, defaultValue = "false")
@ConfigurationParameter(name = PARAM_SORT_BY_POSITION, defaultValue = "false")
private boolean sortByPosition;

/**
* Whether to suppress duplicate overlapping text (default: {@code true}). By default the text
* stripper will attempt to remove text that overlaps other text. Word paints the same character
* several times in order to make it look bold. By setting this to false, all text will be
* extracted, which means that certain sections will be duplicated, but better performance will
* be noticed.
*
* @see VisualPDFTextStripper#setSuppressDuplicateOverlappingText(boolean)
*/
public static final String PARAM_SUPPRESS_DUPLICATE_OVERLAPPING_TEXT = "suppressDuplicateOverlappingText";
@ConfigurationParameter(name = PARAM_SUPPRESS_DUPLICATE_OVERLAPPING_TEXT, defaultValue = "true")
private boolean suppressDuplicateOverlappingText;

/**
* Whether the text stripper should group the text output by a list of beads (default:
* {@code true}).
*
* @see VisualPDFTextStripper#setShouldSeparateByBeads(boolean)
*/
public static final String PARAM_SHOULD_SEPARATE_BY_BEADS = "shouldSeparateByBeads";
@ConfigurationParameter(name = PARAM_SHOULD_SEPARATE_BY_BEADS, defaultValue = "true")
private boolean shouldSeparateByBeads;

/**
* Whether some additional text formatting will be added (default: {@code true}).
*
* @see VisualPDFTextStripper#setAddMoreFormatting(boolean)
*/
public static final String PARAM_ADD_MORE_FORMATTING = "addMoreFormatting";
@ConfigurationParameter(name = PARAM_ADD_MORE_FORMATTING, defaultValue = "true")
private boolean addMoreFormatting;

/**
* Sets the multiple of whitespace character widths for the current text which the current line
* start can be indented from the previous line start beyond which the current line start is
* considered to be a paragraph start (default: {@code 2.0}).
*
* @see VisualPDFTextStripper#setIndentThreshold(float)
*/
public static final String PARAM_INDENT_THRESHOLD = "indentThreshold";
@ConfigurationParameter(name = PARAM_INDENT_THRESHOLD, defaultValue = "2.0")
private float indentThreshold;

/**
* Sets the minimum whitespace, as a multiple of the max height of the current characters beyond
* which the current line start is considered to be a paragraph start (default: {@code 2.5}).
*
* @see VisualPDFTextStripper#setDropThreshold(float)
*/
public static final String PARAM_DROP_THRESHOLD = "dropThreshold";
@ConfigurationParameter(name = PARAM_DROP_THRESHOLD, defaultValue = "2.5")
private float dropThreshold;

/**
* Sets the character width-based tolerance value that is used to estimate where spaces in text
* should be added (default: {@code 0.3}). Note that the default value for this has been
* determined from trial and error. Setting this value larger will reduce the number of spaces
* added.
*
* @see VisualPDFTextStripper#setAverageCharTolerance(float)
*/
public static final String PARAM_AVERAGE_CHAR_TOLERANCE = "averageCharTolerance";
@ConfigurationParameter(name = PARAM_AVERAGE_CHAR_TOLERANCE, defaultValue = "0.3")
private float averageCharTolerance;

/**
* Sets the space width-based tolerance value that is used to estimate where spaces in text
* should be added (default: {@code 0.5}). Note that the default value for this has been
* determined from trial and error. Setting this value larger will reduce the number of spaces
* added.
*
* @see VisualPDFTextStripper#setSpacingTolerance(float)
*/
public static final String PARAM_SPACING_TOLERANCE = "spacingTolerance";
@ConfigurationParameter(name = PARAM_SPACING_TOLERANCE, defaultValue = "0.5")
private float spacingTolerance;

@Override
public void getNext(JCas aJCas) throws IOException, CollectionException
{
Expand All @@ -67,8 +140,21 @@ public void getNext(JCas aJCas) throws IOException, CollectionException
try (var is = resource.getInputStream()) {
try (var doc = Loader.loadPDF(IOUtils.toByteArray(is))) {
var stripper = new VisualPDFTextStripper();

stripper.setSortByPosition(sortByPosition);

stripper.setSuppressDuplicateOverlappingText(suppressDuplicateOverlappingText);
stripper.setShouldSeparateByBeads(shouldSeparateByBeads);
stripper.setAddMoreFormatting(addMoreFormatting);

stripper.setDropThreshold(dropThreshold);
stripper.setIndentThreshold(indentThreshold);

stripper.setAverageCharTolerance(averageCharTolerance);
stripper.setSpacingTolerance(spacingTolerance);

stripper.writeText(doc, textBuffer);

vModel = stripper.getVisualModel();
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.pdfeditor2.visual;

/**
 * Callback interface for document, page and paragraph events.
 * <p>
 * NOTE(review): judging by the method names, these callbacks are presumably fired while a PDF is
 * being processed, around the start and end markers of the document, its pages and its
 * paragraphs. The meaning of the {@link CharSequence} argument is not visible here; confirm both
 * against the code that invokes this interface.
 */
public interface PdfEventHandler
{
    // ---- document-level events ----

    /** Called for the "document start" event. */
    void documentStart();

    /** Called for the "document end" event. */
    void documentEnd();

    // ---- page-level events ----

    /**
     * Called for the "before start page" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void beforeStartPage(CharSequence aCharSequence);

    /**
     * Called for the "after start page" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void afterStartPage(CharSequence aCharSequence);

    /**
     * Called for the "before end page" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void beforeEndPage(CharSequence aCharSequence);

    /**
     * Called for the "after end page" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void afterEndPage(CharSequence aCharSequence);

    // ---- paragraph-level events ----

    /**
     * Called for the "before start paragraph" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void beforeStartParagraph(CharSequence aCharSequence);

    /**
     * Called for the "after start paragraph" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void afterStartParagraph(CharSequence aCharSequence);

    /**
     * Called for the "before end paragraph" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void beforeEndParagraph(CharSequence aCharSequence);

    /**
     * Called for the "after end paragraph" event.
     *
     * @param aCharSequence
     *            character data associated with the event.
     */
    void afterEndParagraph(CharSequence aCharSequence);
}
Loading

0 comments on commit 6043581

Please sign in to comment.