generated from OCR-D/gt-repo-template
-
Notifications
You must be signed in to change notification settings - Fork 3
/
gt_structure_text_METADATA_htr_united.yml
54 lines (54 loc) · 1.55 KB
/
gt_structure_text_METADATA_htr_united.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Catalogue metadata for the gt_structure_text ground-truth repository.
# `schema` points at the JSON schema this document is written against.
schema: https://htr-united.github.io/schema/2023-06-27/schema.json
# Dataset title and the repository it lives in.
title: gt_structure_text
url: https://github.com/OCR-D/gt_structure_text
# Contributors to the dataset. Each list entry is one person; the person's
# surname, ORCID and roles must be nested inside that entry, otherwise they
# are read as unrelated top-level keys.
authors:
  - name: Matthias
    surname: Boenig
    # Quoted so the identifier is always loaded as a string.
    orcid: '0000-0003-4615-4753'
    roles:
      - transcriber
      - aligner
      - project-manager
      - quality-control
      - digitization
      - support
# Explicit empty list: no institutions are recorded.
institutions: []
# Folded scalar (`>-`): the indented lines below are joined with single
# spaces into one line and the trailing newline is stripped. The body must be
# indented deeper than the key to belong to the scalar.
# NOTE(review): "2015 -2017" and "within annotations of the text and
# structure include" read like typos; wording is left unchanged here because
# it is published data - confirm the intended text with the maintainers.
description: >-
  The OCR-D Ground Truth text and structure corpus was created between
  2015 -2017. In the years since 2017, this corpus has been further curated
  and supplemented with metadata where appropriate. The corpus includes page
  XML files within annotations of the text and structure include. The data
  is based on transcription data stored in the German Text Archive (DTA)
  (https://www.deutschestextarchiv.de/).
# Project under which the dataset was produced, and its website.
project-name: OCR-D
project-website: https://ocr-d.de/

# Language codes listed for the corpus, one per entry.
language:
  - eng
  - fra
  - deu
  - heb
  - lat

# Tool named as the production software, followed by the flag stating
# whether the data was aligned automatically.
production-software: Aletheia
automatically-aligned: false
# Scripts listed for the corpus; each entry carries a single `iso` code.
script:
  - iso: Latn
  - iso: Latf
# Value kept on the key's own line so it cannot become detached from the key
# (a bare `script-type:` followed by an unindented scalar leaves the key null).
script-type: only-typed
# Date bounds of the material. Both bounds are quoted so they load as
# strings, and both must be nested under `time`.
time:
  notAfter: '1900'
  notBefore: '1500'
# Information about the hands in the corpus; `count` and `precision` belong
# to `hands`, not to the top-level mapping.
hands:
  count: unknown
  precision: exact
# Licence of the dataset. `name` and `url` must stay nested under `license`:
# left at top level, this `url` would duplicate the repository `url` key and
# a last-wins parser would silently replace the repository address.
license:
  name: CC-BY-SA-4.0
  url: https://creativecommons.org/licenses/by-sa/4.0/
# File format of the ground-truth data.
format: Page-XML
# Size of the corpus. Each entry pairs a `count` with the `metric` it
# measures, so both keys must sit inside the same list item.
volume:
  - count: 221156
    metric: characters
  - count: 217
    metric: files
  - count: 6609
    metric: lines
  - count: 1889
    metric: regions
# Link to the repository's citation file.
citation-file-link: https://github.com/OCR-D/gt_structure_text/blob/main/CITATION.cff
# Folded scalar (`>-`): the body must be indented deeper than the key to be
# part of the value; the trailing newline is stripped.
transcription-guidelines: >-
  OCR-D Ground Truth Guidelines https://ocr-d.de/en/gt-guidelines/trans/