Skip to content

Commit

Permalink
Add new OCR parameter to normalize the result text
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed Sep 22, 2023
1 parent 191bd04 commit f1764a1
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 0 deletions.
1 change: 1 addition & 0 deletions i18n/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"kraken-segmentation-model-label": "Kraken segmentation model",
"langs-placeholder": "Leave blank for automatic language detection.",
"langs-param-error": "The following {{PLURAL:$1|language is|languages are}} not supported by the OCR engine: $2",
"normalize-ocr-text": "Normalize the text from OCR",
"tesseract-options": "Tesseract options",
"tesseract-psm-label": "Page segmentation method",
"tesseract-psm-help": "Try \"Sparse text\" for better multi-column support.",
Expand Down
11 changes: 11 additions & 0 deletions src/Controller/OcrController.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class OcrController extends AbstractController {
'image' => '',
'engine' => self::DEFAULT_ENGINE,
'langs' => [],
'normalize' => false,
'psm' => TesseractEngine::DEFAULT_PSM,
'crop' => [],
'line_id' => TranskribusEngine::DEFAULT_LINEID,
Expand Down Expand Up @@ -114,6 +115,7 @@ private function setup(): void {
}
static::$params['langs'] = $this->getLangs( $this->request );
static::$params['image_hosts'] = $this->engine->getImageHosts();
static::$params['normalize'] = $this->request->query->get( 'normalize' );
$crop = $this->request->query->get( 'crop' );
if ( !is_array( $crop ) ) {
$crop = [];
Expand Down Expand Up @@ -240,6 +242,12 @@ public function homeAction(): Response {
* @OA\Schema(type="array", @OA\Items(type="string"))
* )
* @OA\Parameter(
* name="normalize",
* in="query",
* description="Normalize OCR text.",
* @OA\Schema(type="boolean")
* )
* @OA\Parameter(
* name="segmentation_model",
* in="query",
* description="The segmentation model for kraken.",
Expand Down Expand Up @@ -424,6 +432,9 @@ private function getResult( string $invalidLangsMode ): EngineResult {
if ( !$result instanceof EngineResult ) {
throw new Exception( 'Incorrect (possibly cached) result: ' . var_export( $result, true ) );
}
if ( static::$params['normalize'] ) {
$result->normalize();
}
return $result;
}
}
14 changes: 14 additions & 0 deletions src/Engine/EngineResult.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,18 @@ public function getText(): string {
public function getWarnings(): array {
	// Hand back the warnings stored on this result, unchanged.
	$collected = $this->warnings;
	return $collected;
}

/**
* Normalize the result text in place by replacing some historic characters
* with modern ones (for example the long s 'ſ' becomes 's').
*
* Overwrites $this->text with the output of strtr(); nothing is returned.
*
* NOTE(review): every key of the replacement map except 'ſ' renders as an
* empty string in this view. They are presumably private-use or combining
* code points that are not displayed - confirm against the raw file bytes.
* If the keys really are empty, PHP keeps only the last of the duplicate
* keys, and strtr() ignores an empty-string key on PHP 8 (older versions
* return false instead), so only the 'ſ' => 's' pair would take effect.
*/
public function normalize() {
// Array form of strtr(): all pairs are applied in a single pass over the text.
$this->text = strtr( $this->text, [
'ſ' => 's',
'' => 'r',
'' => 'M',
'' => 'ä',
'' => 'ö',
'' => 'ü',
] );
}
}
4 changes: 4 additions & 0 deletions templates/output.html.twig
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@
</select>
{% include '_transkribus_help.html.twig' with {engine: engine} %}
</div>
<div class="form-group">
<input type="checkbox" id="normalize" name="normalize" value="1">
<label for="normalize">{{ msg('normalize-ocr-text') }}</label>
</div>
</fieldset>

{% include '_kraken_options.html.twig' with {engine: engine} %}
Expand Down

0 comments on commit f1764a1

Please sign in to comment.