zdenop committed Mar 1, 2024
2 parents 3785914 + 4526831 commit 4f11ccb
Showing 5 changed files with 59 additions and 103 deletions.
8 changes: 5 additions & 3 deletions .gitignore
@@ -1,13 +1,12 @@
data/ground-truth/*
data/*ground-truth/*
data/langdata/*
!data/ground-truth/.gitkeep
data/all-*
data/list.*
data/unicharset
dta19-reduced
dta19-reduced.tar.gz
*.built
tesseract-*
leptonica-*
*.BAK
/usr
data/checkpoints
@@ -21,3 +20,6 @@ master.zip
main.zip
plot/*.LOG
plot/ocrd*

# ignore temporary training data
*checkpoints*
114 changes: 38 additions & 76 deletions Makefile
@@ -47,15 +47,6 @@ LAST_CHECKPOINT = $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)_checkpoint
# Name of the proto model. Default: '$(PROTO_MODEL)'
PROTO_MODEL = $(OUTPUT_DIR)/$(MODEL_NAME).traineddata

# No of cores to use for compiling leptonica/tesseract. Default: $(CORES)
CORES = 4

# Leptonica version. Default: $(LEPTONICA_VERSION)
LEPTONICA_VERSION := 1.83.0

# Tesseract commit. Default: $(TESSERACT_VERSION)
TESSERACT_VERSION := 5.3.0

# Tesseract model repo to use. Default: $(TESSDATA_REPO)
TESSDATA_REPO = _best

@@ -80,6 +71,13 @@ endif
# Network specification. Default: $(NET_SPEC)
NET_SPEC := [1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx192 O1c\#\#\#]
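Any of these variables can be overridden on the `make` command line. As an illustration only, the network specification could be swapped out for a single run (assuming a hypothetical model named `foo`; the spec shown is just the default, with the Makefile's escaped `\#` written as a literal `#` in the shell):

```sh
# Override the network spec for one training run (illustrative values only).
make training MODEL_NAME=foo NET_SPEC='[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx192 O1c###]'
```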

TESSERACT_SCRIPTS := Arabic Armenian Bengali Bopomofo Canadian_Aboriginal Cherokee Cyrillic
TESSERACT_SCRIPTS += Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi
TESSERACT_SCRIPTS += Hangul Han Hebrew Hiragana Kannada Katakana Khmer Lao Latin
TESSERACT_SCRIPTS += Malayalam Myanmar Ogham Oriya Runic Sinhala Syriac Tamil Telugu Thai

TESSERACT_LANGDATA = $(LANGDATA_DIR)/radical-stroke.txt $(TESSERACT_SCRIPTS:%=$(LANGDATA_DIR)/%.unicharset)

# Language Type - Indic, RTL or blank. Default: '$(LANG_TYPE)'
LANG_TYPE ?=

@@ -121,7 +119,7 @@ endif

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
help: default
@echo ""
@echo " Targets"
@echo ""
@@ -131,9 +129,6 @@ help:
@echo " training Start training"
@echo " traineddata Create best and fast .traineddata files from each .checkpoint file"
@echo " proto-model Build the proto model"
@echo " leptonica Build leptonica"
@echo " tesseract Build tesseract"
@echo " tesseract-langs Download minimal stock models"
@echo " tesseract-langdata Download stock unicharsets"
@echo " clean-box Clean generated .box files"
@echo " clean-lstmf Clean generated .lstmf files"
@@ -154,9 +149,6 @@ help:
@echo " PUNC_FILE Optional Punc file for Punctuation dawg. Default: $(PUNC_FILE)"
@echo " START_MODEL Name of the model to continue from. Default: '$(START_MODEL)'"
@echo " PROTO_MODEL Name of the proto model. Default: '$(PROTO_MODEL)'"
@echo " CORES No of cores to use for compiling leptonica/tesseract. Default: $(CORES)"
@echo " LEPTONICA_VERSION Leptonica version. Default: $(LEPTONICA_VERSION)"
@echo " TESSERACT_VERSION Tesseract commit. Default: $(TESSERACT_VERSION)"
@echo " TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: $(TESSDATA_REPO)"
@echo " MAX_ITERATIONS Max iterations. Default: $(MAX_ITERATIONS)"
@echo " EPOCHS Set max iterations based on the number of lines for the training. Default: none"
@@ -171,20 +163,28 @@ help:

# END-EVAL

.PRECIOUS: $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)*_checkpoint
default:
ifeq (4.2, $(firstword $(sort $(MAKE_VERSION) 4.2)))
# check that GNU make is recent enough (4.2 or newer)
@echo " You are using make version: $(MAKE_VERSION)"
else
$(error This version of GNU Make is too low ($(MAKE_VERSION)). Check your path, or upgrade to 4.2 or newer.)
endif
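The `default` target checks the running GNU Make version by sorting it together with `4.2` and testing whether `4.2` still comes first. A rough shell equivalent of that comparison, purely for illustration (like make's `$(sort)`, the comparison below is lexicographic, which works for the released 4.x version numbers):

```sh
# Print the running make version and emulate the Makefile's lexicographic check.
ver=$(make --version | awk 'NR==1 {print $3}')
if [ "$(printf '%s\n' "$ver" 4.2 | sort | head -n 1)" = "4.2" ]; then
  echo "make $ver is new enough"
else
  echo "make $ver is too old; 4.2 or newer is required"
fi
```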

.PRECIOUS: $(LAST_CHECKPOINT)

.PHONY: clean help leptonica lists proto-model tesseract tesseract-langs tesseract-langdata training unicharset charfreq
.PHONY: default clean help lists proto-model tesseract-langdata training unicharset charfreq

ALL_FILES = $(and $(wildcard $(GROUND_TRUTH_DIR)),$(shell find -L $(GROUND_TRUTH_DIR) -name '*.gt.txt'))
unexport ALL_FILES # prevent adding this to envp in recipes (which can cause E2BIG if too long; cf. make #44853)
ALL_GT = $(OUTPUT_DIR)/all-gt
ALL_LSTMF = $(OUTPUT_DIR)/all-lstmf

# Create unicharset
unicharset: $(OUTPUT_DIR)/unicharset
unicharset: default $(OUTPUT_DIR)/unicharset

# Show character histogram
charfreq: $(ALL_GT)
charfreq: default $(ALL_GT)
LC_ALL=C.UTF-8 grep -o . $< | sort | uniq -c | sort -rn
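The `charfreq` recipe is a plain shell pipeline over the concatenated ground truth; run by hand against a single hypothetical ground-truth file it looks like this:

```sh
# Count each character in a ground-truth line and list the most frequent first.
LC_ALL=C.UTF-8 grep -o . data/foo-ground-truth/line_001.gt.txt | sort | uniq -c | sort -rn
```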

# Create lists of lstmf filenames for training and eval
@@ -203,12 +203,14 @@ $(OUTPUT_DIR)/list.train: $(ALL_LSTMF) | $(OUTPUT_DIR)
test "$$eval" = "0" && \
echo "Error: missing ground truth for evaluation" && exit 1; \
set -x; \
head -n "$$train" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.train"; \
tail -n "$$eval" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.eval"; \
test "$(OS)" == "Windows_NT" && \
dos2unix "$(ALL_LSTMF)"; \
dos2unix "$(OUTPUT_DIR)/list.train"; \
dos2unix "$(OUTPUT_DIR)/list.eval";
head -n "$$train" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.train" && \
tail -n "$$eval" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.eval"
ifeq (Windows_NT, $(OS))
dos2unix "$(ALL_LSTMF)"
dos2unix "$(OUTPUT_DIR)/list.train"
dos2unix "$(OUTPUT_DIR)/list.eval"
endif
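The recipe above splits the collected `.lstmf` list into a training and an evaluation portion with `head` and `tail` (and aborts if the evaluation half would be empty). A standalone sketch of the same idea, assuming a hypothetical model directory `data/foo` and a 90/10 split computed with `bc`:

```sh
# Split all-lstmf into list.train and list.eval (90% training, 10% evaluation).
total=$(wc -l < data/foo/all-lstmf)
train=$(echo "$total * 0.90 / 1" | bc)
evalcount=$(echo "$total - $train" | bc)
head -n "$train" data/foo/all-lstmf > data/foo/list.train
tail -n "$evalcount" data/foo/all-lstmf > data/foo/list.eval
```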


ifdef START_MODEL
$(DATA_DIR)/$(START_MODEL)/$(MODEL_NAME).lstm-unicharset:
@@ -224,7 +226,7 @@ $(OUTPUT_DIR)/unicharset: $(ALL_GT) | $(OUTPUT_DIR)
endif

# Start training
training: $(OUTPUT_DIR).traineddata
training: default $(OUTPUT_DIR).traineddata

$(ALL_GT): $(ALL_FILES) | $(OUTPUT_DIR)
$(if $^,,$(error found no $(GROUND_TRUTH_DIR)/*.gt.txt for $@))
@@ -254,23 +256,18 @@ $(ALL_LSTMF): $(ALL_FILES:%.gt.txt=%.lstmf)

.PRECIOUS: %.lstmf
%.lstmf: %.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.bin.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.nrm.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.raw.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.tif %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

CHECKPOINT_FILES := $(wildcard $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)*.checkpoint)
@@ -301,6 +298,14 @@ $(OUTPUT_DIR)/tessdata_fast/%.traineddata: $(OUTPUT_DIR)/checkpoints/%.checkpoin
proto-model: $(PROTO_MODEL)

$(PROTO_MODEL): $(OUTPUT_DIR)/unicharset $(TESSERACT_LANGDATA)
ifeq (Windows_NT, $(OS))
dos2unix "$(NUMBERS_FILE)"
dos2unix "$(PUNC_FILE)"
dos2unix "$(WORDLIST_FILE)"
dos2unix "$(LANGDATA_DIR)/$(MODEL_NAME)/$(MODEL_NAME).config"
endif
$(if $(filter-out $(realpath $@),$(realpath $(DATA_DIR)/$(MODEL_NAME)/$(MODEL_NAME).traineddata)),\
$(error $@!=$(DATA_DIR)/$(MODEL_NAME)/$(MODEL_NAME).traineddata -- consider setting different values for DATA_DIR, OUTPUT_DIR, or PROTO_MODEL))
combine_lang_model \
--input_unicharset $(OUTPUT_DIR)/unicharset \
--script_dir $(LANGDATA_DIR) \
@@ -356,55 +361,12 @@ $(OUTPUT_DIR).traineddata: $(LAST_CHECKPOINT)
--model_output $@
endif

TESSERACT_SCRIPTS := Arabic Armenian Bengali Bopomofo Canadian_Aboriginal Cherokee Cyrillic
TESSERACT_SCRIPTS += Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi
TESSERACT_SCRIPTS += Hangul Han Hebrew Hiragana Kannada Katakana Khmer Lao Latin
TESSERACT_SCRIPTS += Malayalam Myanmar Ogham Oriya Runic Sinhala Syriac Tamil Telugu Thai

TESSERACT_LANGDATA = $(LANGDATA_DIR)/radical-stroke.txt $(TESSERACT_SCRIPTS:%=$(LANGDATA_DIR)/%.unicharset)

tesseract-langdata: $(TESSERACT_LANGDATA)

$(TESSERACT_LANGDATA):
@mkdir -p $(@D)
wget -O $@ 'https://github.com/tesseract-ocr/langdata_lstm/raw/main/$(@F)'
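Each unicharset (and `radical-stroke.txt`) is fetched individually from the `langdata_lstm` repository. Fetching one script by hand follows the same URL pattern; for example, assuming the default `data/langdata` directory:

```sh
# Download the Latin script unicharset into the langdata directory.
mkdir -p data/langdata
wget -O data/langdata/Latin.unicharset \
  'https://github.com/tesseract-ocr/langdata_lstm/raw/main/Latin.unicharset'
```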

# Build leptonica
leptonica: leptonica.built

leptonica.built: leptonica-$(LEPTONICA_VERSION)
cd $< ; \
./configure --prefix=$(LOCAL) && \
make -j$(CORES) install SUBDIRS=src && \
date > "$@"

leptonica-$(LEPTONICA_VERSION): leptonica-$(LEPTONICA_VERSION).tar.gz
tar xf "$<"

leptonica-$(LEPTONICA_VERSION).tar.gz:
wget 'http://www.leptonica.org/source/$@'

# Build tesseract
tesseract: tesseract.built tesseract-langs

tesseract.built: tesseract-$(TESSERACT_VERSION)
cd $< && \
sh autogen.sh && \
PKG_CONFIG_PATH="$(LOCAL)/lib/pkgconfig" \
./configure --prefix=$(LOCAL) && \
LDFLAGS="-L$(LOCAL)/lib"\
make -j$(CORES) install && \
LDFLAGS="-L$(LOCAL)/lib"\
make -j$(CORES) training-install && \
date > "$@"

tesseract-$(TESSERACT_VERSION):
wget https://github.com/tesseract-ocr/tesseract/archive/$(TESSERACT_VERSION).zip
unzip $(TESSERACT_VERSION).zip

# Download tesseract-langs
tesseract-langs: $(TESSDATA)/eng.traineddata

$(TESSDATA)/%.traineddata:
wget -O $@ 'https://github.com/tesseract-ocr/tessdata$(TESSDATA_REPO)/raw/main/$(@F)'
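With the default `TESSDATA_REPO = _best`, the stock model download above resolves to the `tessdata_best` repository; a manual download of the English model into the default `./usr/share/tessdata` directory would look like this:

```sh
# Fetch the 'best' English model used as a starting point for fine-tuning.
mkdir -p usr/share/tessdata
wget -O usr/share/tessdata/eng.traineddata \
  'https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata'
```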

@@ -424,4 +386,4 @@ clean-output:
rm -rf $(OUTPUT_DIR)

# Clean all generated files
clean: clean-box clean-lstmf clean-output
clean: default clean-box clean-lstmf clean-output
36 changes: 14 additions & 22 deletions README.md
@@ -1,38 +1,35 @@
# tesstrain

> Training workflow for Tesseract 4 as a Makefile for dependency tracking and building the required software from source.
> Training workflow for Tesseract 5 as a Makefile for dependency tracking.
## Install

### Auxiliaries

You will need at least GNU `make`, `wget`, `find`, `bash`, `unzip` and `bc`.
You will need at least GNU `make` (version 4.2 or newer), `wget`, `find`, `bash`, `unzip` and `bc`.

### Leptonica, Tesseract

You will need a recent version (>= 4.0.0beta1) of tesseract built with the
You will need a recent version (>= 5.3) of tesseract built with the
training tools and matching leptonica bindings.
[Build](https://github.com/tesseract-ocr/tesseract/wiki/Compiling)
[instructions](https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation)
and more can be found in the [Tesseract project
wiki](https://github.com/tesseract-ocr/tesseract/wiki/).
[Build](https://tesseract-ocr.github.io/tessdoc/Compiling)
[instructions](https://tesseract-ocr.github.io/tessdoc/Compiling-%E2%80%93-GitInstallation)
and more can be found in the [Tesseract User Manual](https://tesseract-ocr.github.io/tessdoc/).

Alternatively, you can build leptonica and tesseract within this project and install it to a subdirectory `./usr` in the repo:
#### Windows

```sh
make leptonica tesseract
```

Tesseract will be built from the git repository, which requires CMake,
autotools (including autotools-archive) and some additional libraries for the
training tools. See the [installation notes in the tesseract
repository](https://github.com/tesseract-ocr/tesseract/blob/main/INSTALL.GIT.md).
1. Install the latest tesseract (e.g. from https://digi.bib.uni-mannheim.de/tesseract/) and make sure that tesseract is added to your PATH.
2. Install [Python 3](https://www.python.org/downloads/)
3. Install [Git for Windows](https://gitforwindows.org/); it provides many Linux utilities on Windows (e.g. `find`, `unzip`, `rm`). Put `C:\Program Files\Git\usr\bin` at the beginning of your PATH variable (temporarily you can do this in `cmd` with `set PATH=C:\Program Files\Git\usr\bin;%PATH%`). Unfortunately, several Windows tools have the same names as the Linux tools (`find`, `sort`) but different behaviour, and they must be avoided during training.
4. Install winget/[Windows Package Manager](https://github.com/microsoft/winget-cli/releases/) and then run `winget install GnuWin32.Make` and `winget install wget` to install the missing tools.
5. Download [bc and dc calculator for Windows](https://embedeo.org/ws/command_line/bc_dc_calculator_windows/) and unzip `bc.exe` somewhere on your PATH (e.g. `unzip -j bc-1.07.1-win32-embedeo-02.zip "bc-1.07.1-win32-embedeo-02/bin/bc.exe" -d "c:\Program Files\Tools"`). A quick sanity check is sketched right after this list.
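After these steps, confirm that each required tool resolves from your PATH and reports a version; all of the commands below should succeed from `cmd` or Git Bash:

```sh
# All of these must be found on PATH before training on Windows.
tesseract --version
make --version
wget --version
bc --version
python --version
```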

### Python

You need a recent version of Python 3.x. For image processing the Python library `Pillow` is used.
If you don't have a global installation, install the dependencies from the provided requirements file: `pip install -r requirements.txt`.
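If you prefer to keep the training dependencies isolated, a virtual environment works just as well (a minimal sketch, assuming Python 3 on PATH and the repository's `requirements.txt`):

```sh
# Create an isolated environment and install Pillow and friends into it.
python3 -m venv venv
. venv/bin/activate
pip install -r requirements.txt
```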


### Language data

Tesseract expects some configuration data (a file `radical-stroke.txt` and `*.unicharset` for all scripts) in `DATA_DIR`.
@@ -43,6 +40,7 @@ To fetch them:
(This step is only needed once and already included implicitly in the `training` target,
but you might want to run it explicitly in advance.)
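The target in question is `tesseract-langdata` (see the target list under `make help`); running it once ahead of training looks like this, assuming the default `DATA_DIR=data`:

```sh
# Fetch radical-stroke.txt and the per-script unicharsets into data/langdata.
make tesseract-langdata
```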


## Choose model name

Choose a name for your model. By convention, Tesseract stack models including
@@ -100,9 +98,6 @@ Run `make help` to see all the possible targets and variables:
training Start training
traineddata Create best and fast .traineddata files from each .checkpoint file
proto-model Build the proto model
leptonica Build leptonica
tesseract Build tesseract
tesseract-langs Download minimal stock models
tesseract-langdata Download stock unicharsets
clean Clean all generated files
@@ -117,9 +112,6 @@ Run `make help` to see all the possible targets and variables:
DATA_DIR Data directory for output files, proto model, start model, etc. Default: data
OUTPUT_DIR Output directory for generated files. Default: DATA_DIR/MODEL_NAME
GROUND_TRUTH_DIR Ground truth directory. Default: OUTPUT_DIR-ground-truth
CORES No of cores to use for compiling leptonica/tesseract. Default: 4
LEPTONICA_VERSION Leptonica version. Default: 1.78.0
TESSERACT_VERSION Tesseract commit. Default: 4.1.1
TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: _best
TESSDATA Path to the .traineddata directory to start finetuning from. Default: ./usr/share/tessdata
MAX_ITERATIONS Max iterations. Default: 10000
2 changes: 1 addition & 1 deletion plot/plot_cer.py
@@ -58,7 +58,7 @@ def annot_min(boxcolor, xpos, ypos, x,y):

ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independant x-axes now span the same range
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
2 changes: 1 addition & 1 deletion plot/plot_cer_validation.py
@@ -57,7 +57,7 @@ def annot_min(boxcolor, xpos, ypos, x,y):

ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independant x-axes now span the same range
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
