diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7ea6835..b049a7b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -90,3 +90,23 @@ Run container ``` ./docker/run.sh ``` + +## Structure of models.json + +The engines' model and language information is stored in `/public/models.json`, +from where it's read and returned by the `/api/available_langs` API endpoint. + +OCR engines take zero or more model names (often called 'languages' because +there's a direct mapping to those, but we're moving away from this nomenclature +now because it doesn't always hold true). + +`models.json` is first grouped by engine, and then each engine has a list of models. +These are identified by a 'model code', which is what the user provides in the `langs[]` parameter. +For some engines these are passed through to the actual engine process or API, +but others don't have convenient model names and so we invent them +and add whatever extra info is needed as additional properties within `models.json`. + +In addition to the model code, every model needs to have at least a `title` and `languages` property. + +* `title`: This is what's shown (unlocalized) to the user. +* `languages`: An array of ISO 639 language codes. This is (or will be) what's used to group models when the user is browsing them. 
diff --git a/public/models.json b/public/models.json index 3da2aa8..5fc455d 100644 --- a/public/models.json +++ b/public/models.json @@ -21,7 +21,7 @@ "title": "azərbaycanca" }, "az-cyrl": { - "languages": ["az","Cyrl"], + "languages": ["az"], "title": "Azərbaycan (qədim yazı)" }, "be": { @@ -459,8 +459,8 @@ "title": "azərbaycanca" }, "aze_cyrl": { - "languages": ["aze","cyrl"], - "title": "Azerbaijani (Cyrillic)" + "languages": ["aze"], + "title": "Azərbaycan (qədim yazı)" }, "bel": { "languages": ["be"], diff --git a/src/Controller/OcrController.php b/src/Controller/OcrController.php index e433034..c6075e8 100644 --- a/src/Controller/OcrController.php +++ b/src/Controller/OcrController.php @@ -175,8 +175,8 @@ public function homeAction(): Response { $this->setup(); // Pre-supply available langs for autocompletion in the form. - static::$params['available_langs'] = $this->engine->getValidModels(); - sort( static::$params['available_langs'] ); + static::$params['available_langs'] = $this->engine->getValidModels( true ); + ksort( static::$params['available_langs'] ); // set empty array to avoid errors while rendering template on non-transkribus engines static::$params['available_line_ids'] = []; @@ -295,7 +295,7 @@ public function apiAction(): JsonResponse { } /** - * Get a list of languages available for use with a specific OCR engine. + * Get a list of models available for use with a specific OCR engine. 
* * @Route("/api/available_langs", name="apiLangs", methods={"GET"}) * @OA\Parameter( @@ -305,7 +305,7 @@ public function apiAction(): JsonResponse { * example="tesseract", * @OA\Schema(type="string") * ) - * @OA\Response(response=200, description="List of available language codes and names, in JSON format.") + * @OA\Response(response=200, description="List of available model codes and names, in JSON format.") * @return JsonResponse */ public function apiAvailableLangsAction(): JsonResponse { diff --git a/src/Engine/EngineBase.php b/src/Engine/EngineBase.php index 4d638dc..4586e00 100644 --- a/src/Engine/EngineBase.php +++ b/src/Engine/EngineBase.php @@ -34,79 +34,6 @@ abstract class EngineBase { /** @var string[][] Local PHP array copy of models.json */ protected $modelList; - /** @var string[] Additional localized names for non-standard language codes. */ - public const LANG_NAMES = [ - 'Fraktur' => 'Fraktur script', - 'Latin' => 'Latin script', - 'az-cyrl' => 'Azərbaycan (qədim yazı)', - 'bali' => 'Balinese palm-leaf manuscripts 16th century', - 'ben-print' => 'Bengali Printed Books +150 New', - 'cs-space' => 'Old Czech Handwriting (with spaces)', - 'cs-no-space' => 'Old Czech Handwriting (without spaces)', - 'da-goth' => '19th century Danish Gothic handwriting v.1.1', - 'da-goth-print' => 'Danish gothic print 1859-1888 v4', - 'da-gjen' => 'Gjentofte 1881-1913 Denmark', - 'de-frk' => 'Deutsch (Fraktur)', - 'de-17' => 'Dutch_XVII_Century', - 'de-hd-m1' => 'Transkribus Dutch Handwriting M1', - 'dev' => 'Devanagari Mixed M1A', - 'el-ligo' => 'Ligorio 0.3 PyL', - 'el-print' => 'Noscemus GM 6', - 'en-b2022' => 'Transkribus B2022 English Model M4', - 'en-handwritten-m3' => 'Transkribus English Handwriting M3', - 'en-print-m1' => 'Transkribus Print M1', - 'en-typewriter' => 'Transkribus Typewriter', - 'enm' => 'Middle English (1100-1500)', - 'es-md' => 'Diario de Madrid 1788-1825', - 'es-old' => 'español (viejo)', - 'es-redonda-extended-v1_2' => 
'SpanishRedonda_sXVI-XVII_extended_v1.2', - 'et-court' => 'Estonian Court Records 19thC', - 'fin' => 'NLF_Newseye_GT_FI_M2+', - 'fr-m1' => 'Transkribus French Model 1', - 'frm' => 'moyen français (1400-1600)', - 'fro' => 'Franceis, François, Romanz (1400-1600)', - 'ger-hd-m1' => 'Transkribus German handwriting M1', - 'ger-15' => '15th-16th century German', - 'he-dijest' => 'Hebrew DiJeSt 2.0', - 'hu-hand-19' => 'Hungarian handwriting 19th–20th cent.', - 'it-old' => 'italiano antico', - 'it-hd-m1' => 'Transkribus Italian Handwriting M1', - 'jv-01' => 'Javanese model v0.1 b06/24', - 'ka-old' => 'ქართული (ძველი)', - 'ko-vert' => '한국어 (세로)', - 'kur' => 'کوردی', - 'la-caro' => 'Carolingian Minuscule Model CMM 9th-11th c.', - 'la-in' => 'Latin Incunabula (Reichenau)', - 'la-med' => 'UCL–University of Toronto #7', - 'la-neo' => 'Pylaia_NeoLatin_Ravenstein', - 'nl-1605' => 'Admiraliteit Zeeland 1605-1609 compleet', - 'nl-mount' => 'Dutch Mountains (18th Century)', - 'nl-news' => 'Dutch newspapers 17th century', - 'no-1820' => 'NorHand 1820-1940', - 'no-1874' => 'Sunnhordland Partition Protocols ', - 'osd' => 'Orientation and script detection module', - 'pl-m2' => 'Transkribus Polish M2', - 'pt-m1' => 'General Portuguese M1', - 'pt-17' => 'SPJCL17C V4.2', - 'pt-hd' => 'Portuguese Handwriting 16th-19th century', - 'ro-print' => 'RTA2 (Romanian Transition Alphabet)', - 'rus-hd-2' => 'Russian generic handwriting 2', - 'rus-print' => 'Russian print of the 18th century', - 'ru-petr1708' => 'Русский (старая орфография)', - 'san' => 'Devanagari Mixed M1A', - 'sl-hand-18' => 'Slovenian 18th century manuscript', - 'sk-hand' => 'Handwritten Glagolitic', - 'sr-latn' => 'Српски (латиница)', - 'swe-3' => 'Stockholm Notaries 1700 3.0', - 'swe-lion-i' => 'The Swedish Lion I', - 'syr' => 'leššānā Suryāyā', - 'uz-cyrl' => 'oʻzbekcha', - 'uk-20th-print' => 'Printed Ukrainian 20th century', - 'uk-generic-handwriting-1' => 'Ukrainian generic handwriting 1', - 'uk-wikisource-print' => 
'Ukrainian Wikisource Print', - 'yi-hd' => 'The Dybbuk for Yiddish Handwriting' - ]; - /** * EngineBase constructor. * @param Intuition $intuition @@ -185,9 +112,6 @@ public function getModelTitle( ?string $model = null ): string { if ( isset( $this->getModelList()[ $model ]['title'] ) ) { return $this->getModelList()[ $model ]['title']; } - if ( isset( static::LANG_NAMES[$model] ) ) { - return static::LANG_NAMES[$model]; - } return $this->intuition->getLangName( $model ) ?: ''; } diff --git a/templates/output.html.twig b/templates/output.html.twig index dc0abb1..cbcc566 100644 --- a/templates/output.html.twig +++ b/templates/output.html.twig @@ -48,9 +48,9 @@ data-placeholder="{{ msg('langs-placeholder') }}" {% endif %} > - {% for lang in available_langs %} - {% endfor %} diff --git a/tests/Engine/EngineBaseTest.php b/tests/Engine/EngineBaseTest.php index fa44899..d86b3b0 100644 --- a/tests/Engine/EngineBaseTest.php +++ b/tests/Engine/EngineBaseTest.php @@ -143,7 +143,7 @@ public function testLangNames(): void { // From Intuition. static::assertSame( 'français', $this->tesseractEngine->getModelTitle( 'fr' ) ); - // From EngineBase::LANG_NAMES + // From models.json static::assertSame( 'moyen français (1400-1600)', $this->tesseractEngine->getModelTitle( 'frm' ) ); // Make sure every language has a name. 
diff --git a/tests/Twig/AppExtensionTest.php b/tests/Twig/AppExtensionTest.php index c51f52b..33b453c 100644 --- a/tests/Twig/AppExtensionTest.php +++ b/tests/Twig/AppExtensionTest.php @@ -44,8 +44,8 @@ public function setUp(): void { * @covers AppExtension::getOcrLangName */ public function testOcrLangName(): void { - // Non-standard language code with name defined in EngineBase::LANG_NAMES - static::assertSame( 'Azərbaycan (qədim yazı)', $this->ext->getOcrLangName( 'az-cyrl' ) ); + // Non-standard language code with name defined in models.json + static::assertSame( 'Azərbaycan (qədim yazı)', $this->ext->getOcrLangName( 'aze_cyrl' ) ); // Standard language code (name provided by Intuition) static::assertSame( 'English', $this->ext->getOcrLangName( 'en' ) );