diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7ea6835..b049a7b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -90,3 +90,23 @@ Run container
```
./docker/run.sh
```
+
+## Structure of models.json
+
+Each engine's model and language information is stored in `/public/models.json`,
+which is read and returned by the `/api/available_langs` API endpoint.
+
+OCR engines take zero or more model names (often called 'languages' because
+there's usually a direct mapping between the two, but we're moving away from this
+nomenclature because that mapping doesn't always hold true).
+
+`models.json` is first grouped by engine, and then each engine has a set of models.
+Each model is keyed by a 'model code', which is what the user provides in the `langs[]` parameter.
+For some engines these are passed through to the actual engine process or API,
+but others don't have convenient model names and so we invent them
+and add whatever extra info is needed as additional properties within `models.json`.
+
+In addition to the model code, every model needs to have at least a `title` and a `languages` property.
+
+* `title`: This is what's shown (unlocalized) to the user.
+* `languages`: An array of ISO 639 language codes. This is (or will be) what's used to group models when the user is browsing them.
diff --git a/public/models.json b/public/models.json
index 3da2aa8..5fc455d 100644
--- a/public/models.json
+++ b/public/models.json
@@ -21,7 +21,7 @@
"title": "azərbaycanca"
},
"az-cyrl": {
- "languages": ["az","Cyrl"],
+ "languages": ["az"],
"title": "Azərbaycan (qədim yazı)"
},
"be": {
@@ -459,8 +459,8 @@
"title": "azərbaycanca"
},
"aze_cyrl": {
- "languages": ["aze","cyrl"],
- "title": "Azerbaijani (Cyrillic)"
+ "languages": ["aze"],
+ "title": "Azərbaycan (qədim yazı)"
},
"bel": {
"languages": ["be"],
diff --git a/src/Controller/OcrController.php b/src/Controller/OcrController.php
index e433034..c6075e8 100644
--- a/src/Controller/OcrController.php
+++ b/src/Controller/OcrController.php
@@ -175,8 +175,8 @@ public function homeAction(): Response {
$this->setup();
// Pre-supply available langs for autocompletion in the form.
- static::$params['available_langs'] = $this->engine->getValidModels();
- sort( static::$params['available_langs'] );
+ static::$params['available_langs'] = $this->engine->getValidModels( true );
+ ksort( static::$params['available_langs'] );
// set empty array to avoid errors while rendering template on non-transkribus engines
static::$params['available_line_ids'] = [];
@@ -295,7 +295,7 @@ public function apiAction(): JsonResponse {
}
/**
- * Get a list of languages available for use with a specific OCR engine.
+ * Get a list of models available for use with a specific OCR engine.
*
* @Route("/api/available_langs", name="apiLangs", methods={"GET"})
* @OA\Parameter(
@@ -305,7 +305,7 @@ public function apiAction(): JsonResponse {
* example="tesseract",
* @OA\Schema(type="string")
* )
- * @OA\Response(response=200, description="List of available language codes and names, in JSON format.")
+ * @OA\Response(response=200, description="List of available model codes and names, in JSON format.")
* @return JsonResponse
*/
public function apiAvailableLangsAction(): JsonResponse {
diff --git a/src/Engine/EngineBase.php b/src/Engine/EngineBase.php
index 4d638dc..4586e00 100644
--- a/src/Engine/EngineBase.php
+++ b/src/Engine/EngineBase.php
@@ -34,79 +34,6 @@ abstract class EngineBase {
/** @var string[][] Local PHP array copy of models.json */
protected $modelList;
- /** @var string[] Additional localized names for non-standard language codes. */
- public const LANG_NAMES = [
- 'Fraktur' => 'Fraktur script',
- 'Latin' => 'Latin script',
- 'az-cyrl' => 'Azərbaycan (qədim yazı)',
- 'bali' => 'Balinese palm-leaf manuscripts 16th century',
- 'ben-print' => 'Bengali Printed Books +150 New',
- 'cs-space' => 'Old Czech Handwriting (with spaces)',
- 'cs-no-space' => 'Old Czech Handwriting (without spaces)',
- 'da-goth' => '19th century Danish Gothic handwriting v.1.1',
- 'da-goth-print' => 'Danish gothic print 1859-1888 v4',
- 'da-gjen' => 'Gjentofte 1881-1913 Denmark',
- 'de-frk' => 'Deutsch (Fraktur)',
- 'de-17' => 'Dutch_XVII_Century',
- 'de-hd-m1' => 'Transkribus Dutch Handwriting M1',
- 'dev' => 'Devanagari Mixed M1A',
- 'el-ligo' => 'Ligorio 0.3 PyL',
- 'el-print' => 'Noscemus GM 6',
- 'en-b2022' => 'Transkribus B2022 English Model M4',
- 'en-handwritten-m3' => 'Transkribus English Handwriting M3',
- 'en-print-m1' => 'Transkribus Print M1',
- 'en-typewriter' => 'Transkribus Typewriter',
- 'enm' => 'Middle English (1100-1500)',
- 'es-md' => 'Diario de Madrid 1788-1825',
- 'es-old' => 'español (viejo)',
- 'es-redonda-extended-v1_2' => 'SpanishRedonda_sXVI-XVII_extended_v1.2',
- 'et-court' => 'Estonian Court Records 19thC',
- 'fin' => 'NLF_Newseye_GT_FI_M2+',
- 'fr-m1' => 'Transkribus French Model 1',
- 'frm' => 'moyen français (1400-1600)',
- 'fro' => 'Franceis, François, Romanz (1400-1600)',
- 'ger-hd-m1' => 'Transkribus German handwriting M1',
- 'ger-15' => '15th-16th century German',
- 'he-dijest' => 'Hebrew DiJeSt 2.0',
- 'hu-hand-19' => 'Hungarian handwriting 19th–20th cent.',
- 'it-old' => 'italiano antico',
- 'it-hd-m1' => 'Transkribus Italian Handwriting M1',
- 'jv-01' => 'Javanese model v0.1 b06/24',
- 'ka-old' => 'ქართული (ძველი)',
- 'ko-vert' => '한국어 (세로)',
- 'kur' => 'کوردی',
- 'la-caro' => 'Carolingian Minuscule Model CMM 9th-11th c.',
- 'la-in' => 'Latin Incunabula (Reichenau)',
- 'la-med' => 'UCL–University of Toronto #7',
- 'la-neo' => 'Pylaia_NeoLatin_Ravenstein',
- 'nl-1605' => 'Admiraliteit Zeeland 1605-1609 compleet',
- 'nl-mount' => 'Dutch Mountains (18th Century)',
- 'nl-news' => 'Dutch newspapers 17th century',
- 'no-1820' => 'NorHand 1820-1940',
- 'no-1874' => 'Sunnhordland Partition Protocols ',
- 'osd' => 'Orientation and script detection module',
- 'pl-m2' => 'Transkribus Polish M2',
- 'pt-m1' => 'General Portuguese M1',
- 'pt-17' => 'SPJCL17C V4.2',
- 'pt-hd' => 'Portuguese Handwriting 16th-19th century',
- 'ro-print' => 'RTA2 (Romanian Transition Alphabet)',
- 'rus-hd-2' => 'Russian generic handwriting 2',
- 'rus-print' => 'Russian print of the 18th century',
- 'ru-petr1708' => 'Русский (старая орфография)',
- 'san' => 'Devanagari Mixed M1A',
- 'sl-hand-18' => 'Slovenian 18th century manuscript',
- 'sk-hand' => 'Handwritten Glagolitic',
- 'sr-latn' => 'Српски (латиница)',
- 'swe-3' => 'Stockholm Notaries 1700 3.0',
- 'swe-lion-i' => 'The Swedish Lion I',
- 'syr' => 'leššānā Suryāyā',
- 'uz-cyrl' => 'oʻzbekcha',
- 'uk-20th-print' => 'Printed Ukrainian 20th century',
- 'uk-generic-handwriting-1' => 'Ukrainian generic handwriting 1',
- 'uk-wikisource-print' => 'Ukrainian Wikisource Print',
- 'yi-hd' => 'The Dybbuk for Yiddish Handwriting'
- ];
-
/**
* EngineBase constructor.
* @param Intuition $intuition
@@ -185,9 +112,6 @@ public function getModelTitle( ?string $model = null ): string {
if ( isset( $this->getModelList()[ $model ]['title'] ) ) {
return $this->getModelList()[ $model ]['title'];
}
- if ( isset( static::LANG_NAMES[$model] ) ) {
- return static::LANG_NAMES[$model];
- }
return $this->intuition->getLangName( $model ) ?: '';
}
diff --git a/templates/output.html.twig b/templates/output.html.twig
index dc0abb1..cbcc566 100644
--- a/templates/output.html.twig
+++ b/templates/output.html.twig
@@ -48,9 +48,9 @@
data-placeholder="{{ msg('langs-placeholder') }}"
{% endif %}
>
- {% for lang in available_langs %}
-
{% endfor %}
diff --git a/tests/Engine/EngineBaseTest.php b/tests/Engine/EngineBaseTest.php
index fa44899..d86b3b0 100644
--- a/tests/Engine/EngineBaseTest.php
+++ b/tests/Engine/EngineBaseTest.php
@@ -143,7 +143,7 @@ public function testLangNames(): void {
// From Intuition.
static::assertSame( 'français', $this->tesseractEngine->getModelTitle( 'fr' ) );
- // From EngineBase::LANG_NAMES
+ // From models.json
static::assertSame( 'moyen français (1400-1600)', $this->tesseractEngine->getModelTitle( 'frm' ) );
// Make sure every language has a name.
diff --git a/tests/Twig/AppExtensionTest.php b/tests/Twig/AppExtensionTest.php
index c51f52b..33b453c 100644
--- a/tests/Twig/AppExtensionTest.php
+++ b/tests/Twig/AppExtensionTest.php
@@ -44,8 +44,8 @@ public function setUp(): void {
* @covers AppExtension::getOcrLangName
*/
public function testOcrLangName(): void {
- // Non-standard language code with name defined in EngineBase::LANG_NAMES
- static::assertSame( 'Azərbaycan (qədim yazı)', $this->ext->getOcrLangName( 'az-cyrl' ) );
+ // Non-standard language code with name defined in models.json
+ static::assertSame( 'Azərbaycan (qədim yazı)', $this->ext->getOcrLangName( 'aze_cyrl' ) );
// Standard language code (name provided by Intuition)
static::assertSame( 'English', $this->ext->getOcrLangName( 'en' ) );