zdenop committed Mar 1, 2024
2 parents 3785914 + 4526831 commit 4f11ccb
Showing 5 changed files with 59 additions and 103 deletions.
8 changes: 5 additions & 3 deletions .gitignore
@@ -1,13 +1,12 @@
data/ground-truth/*
data/*ground-truth/*
data/langdata/*
!data/ground-truth/.gitkeep
data/all-*
data/list.*
data/unicharset
dta19-reduced
dta19-reduced.tar.gz
*.built
tesseract-*
leptonica-*
*.BAK
/usr
data/checkpoints
@@ -21,3 +20,6 @@ master.zip
main.zip
plot/*.LOG
plot/ocrd*

# ignore temporary training data
*checkpoints*
114 changes: 38 additions & 76 deletions Makefile
@@ -47,15 +47,6 @@ LAST_CHECKPOINT = $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)_checkpoint
# Name of the proto model. Default: '$(PROTO_MODEL)'
PROTO_MODEL = $(OUTPUT_DIR)/$(MODEL_NAME).traineddata

# No of cores to use for compiling leptonica/tesseract. Default: $(CORES)
CORES = 4

# Leptonica version. Default: $(LEPTONICA_VERSION)
LEPTONICA_VERSION := 1.83.0

# Tesseract commit. Default: $(TESSERACT_VERSION)
TESSERACT_VERSION := 5.3.0

# Tesseract model repo to use. Default: $(TESSDATA_REPO)
TESSDATA_REPO = _best

@@ -80,6 +71,13 @@ endif
# Network specification. Default: $(NET_SPEC)
NET_SPEC := [1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx192 O1c\#\#\#]
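Any of these variables can be overridden on the `make` command line. As an illustration only, the network specification could be swapped out for a single run (assuming a hypothetical model named `foo`; the spec shown is just the default, with the Makefile's escaped `\#` written as a literal `#` in the shell):

```sh
# Override the network spec for one training run (illustrative values only).
make training MODEL_NAME=foo NET_SPEC='[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx192 O1c###]'
```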

TESSERACT_SCRIPTS := Arabic Armenian Bengali Bopomofo Canadian_Aboriginal Cherokee Cyrillic
TESSERACT_SCRIPTS += Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi
TESSERACT_SCRIPTS += Hangul Han Hebrew Hiragana Kannada Katakana Khmer Lao Latin
TESSERACT_SCRIPTS += Malayalam Myanmar Ogham Oriya Runic Sinhala Syriac Tamil Telugu Thai

TESSERACT_LANGDATA = $(LANGDATA_DIR)/radical-stroke.txt $(TESSERACT_SCRIPTS:%=$(LANGDATA_DIR)/%.unicharset)

# Language Type - Indic, RTL or blank. Default: '$(LANG_TYPE)'
LANG_TYPE ?=

@@ -121,7 +119,7 @@ endif

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
help: default
@echo ""
@echo " Targets"
@echo ""
@@ -131,9 +129,6 @@ help:
@echo " training Start training"
@echo " traineddata Create best and fast .traineddata files from each .checkpoint file"
@echo " proto-model Build the proto model"
@echo " leptonica Build leptonica"
@echo " tesseract Build tesseract"
@echo " tesseract-langs Download minimal stock models"
@echo " tesseract-langdata Download stock unicharsets"
@echo " clean-box Clean generated .box files"
@echo " clean-lstmf Clean generated .lstmf files"
@@ -154,9 +149,6 @@ help:
@echo " PUNC_FILE Optional Punc file for Punctuation dawg. Default: $(PUNC_FILE)"
@echo " START_MODEL Name of the model to continue from. Default: '$(START_MODEL)'"
@echo " PROTO_MODEL Name of the proto model. Default: '$(PROTO_MODEL)'"
@echo " CORES No of cores to use for compiling leptonica/tesseract. Default: $(CORES)"
@echo " LEPTONICA_VERSION Leptonica version. Default: $(LEPTONICA_VERSION)"
@echo " TESSERACT_VERSION Tesseract commit. Default: $(TESSERACT_VERSION)"
@echo " TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: $(TESSDATA_REPO)"
@echo " MAX_ITERATIONS Max iterations. Default: $(MAX_ITERATIONS)"
@echo " EPOCHS Set max iterations based on the number of lines for the training. Default: none"
@@ -171,20 +163,28 @@ help:

# END-EVAL

.PRECIOUS: $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)*_checkpoint
default:
ifeq (4.2, $(firstword $(sort $(MAKE_VERSION) 4.2)))
# check that GNU make is recent enough (4.2 or newer)
@echo " You are using make version: $(MAKE_VERSION)"
else
$(error This version of GNU Make is too low ($(MAKE_VERSION)). Check your path, or upgrade to 4.2 or newer.)
endif
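The `default` target checks the running GNU Make version by sorting it together with `4.2` and testing whether `4.2` still comes first. A rough shell equivalent of that comparison, purely for illustration (like make's `$(sort)`, the comparison below is lexicographic, which works for the released 4.x version numbers):

```sh
# Print the running make version and emulate the Makefile's lexicographic check.
ver=$(make --version | awk 'NR==1 {print $3}')
if [ "$(printf '%s\n' "$ver" 4.2 | sort | head -n 1)" = "4.2" ]; then
  echo "make $ver is new enough"
else
  echo "make $ver is too old; 4.2 or newer is required"
fi
```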

.PRECIOUS: $(LAST_CHECKPOINT)

.PHONY: clean help leptonica lists proto-model tesseract tesseract-langs tesseract-langdata training unicharset charfreq
.PHONY: default clean help lists proto-model tesseract-langdata training unicharset charfreq

ALL_FILES = $(and $(wildcard $(GROUND_TRUTH_DIR)),$(shell find -L $(GROUND_TRUTH_DIR) -name '*.gt.txt'))
unexport ALL_FILES # prevent adding this to envp in recipes (which can cause E2BIG if too long; cf. make #44853)
ALL_GT = $(OUTPUT_DIR)/all-gt
ALL_LSTMF = $(OUTPUT_DIR)/all-lstmf

# Create unicharset
unicharset: $(OUTPUT_DIR)/unicharset
unicharset: default $(OUTPUT_DIR)/unicharset

# Show character histogram
charfreq: $(ALL_GT)
charfreq: default $(ALL_GT)
LC_ALL=C.UTF-8 grep -o . $< | sort | uniq -c | sort -rn
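The `charfreq` recipe is a plain shell pipeline over the concatenated ground truth; run by hand against a single hypothetical ground-truth file it looks like this:

```sh
# Count each character in a ground-truth line and list the most frequent first.
LC_ALL=C.UTF-8 grep -o . data/foo-ground-truth/line_001.gt.txt | sort | uniq -c | sort -rn
```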

# Create lists of lstmf filenames for training and eval
@@ -203,12 +203,14 @@ $(OUTPUT_DIR)/list.train: $(ALL_LSTMF) | $(OUTPUT_DIR)
test "$$eval" = "0" && \
echo "Error: missing ground truth for evaluation" && exit 1; \
set -x; \
head -n "$$train" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.train"; \
tail -n "$$eval" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.eval"; \
test "$(OS)" == "Windows_NT" && \
dos2unix "$(ALL_LSTMF)"; \
dos2unix "$(OUTPUT_DIR)/list.train"; \
dos2unix "$(OUTPUT_DIR)/list.eval";
head -n "$$train" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.train" && \
tail -n "$$eval" $(ALL_LSTMF) > "$(OUTPUT_DIR)/list.eval"
ifeq (Windows_NT, $(OS))
dos2unix "$(ALL_LSTMF)"
dos2unix "$(OUTPUT_DIR)/list.train"
dos2unix "$(OUTPUT_DIR)/list.eval"
endif
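The recipe above splits the collected `.lstmf` list into a training and an evaluation portion with `head` and `tail` (and aborts if the evaluation half would be empty). A standalone sketch of the same idea, assuming a hypothetical model directory `data/foo` and a 90/10 split computed with `bc`:

```sh
# Split all-lstmf into list.train and list.eval (90% training, 10% evaluation).
total=$(wc -l < data/foo/all-lstmf)
train=$(echo "$total * 0.90 / 1" | bc)
evalcount=$(echo "$total - $train" | bc)
head -n "$train" data/foo/all-lstmf > data/foo/list.train
tail -n "$evalcount" data/foo/all-lstmf > data/foo/list.eval
```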


ifdef START_MODEL
$(DATA_DIR)/$(START_MODEL)/$(MODEL_NAME).lstm-unicharset:
@@ -224,7 +226,7 @@ $(OUTPUT_DIR)/unicharset: $(ALL_GT) | $(OUTPUT_DIR)
endif

# Start training
training: $(OUTPUT_DIR).traineddata
training: default $(OUTPUT_DIR).traineddata

$(ALL_GT): $(ALL_FILES) | $(OUTPUT_DIR)
$(if $^,,$(error found no $(GROUND_TRUTH_DIR)/*.gt.txt for $@))
@@ -254,23 +256,18 @@ $(ALL_LSTMF): $(ALL_FILES:%.gt.txt=%.lstmf)

.PRECIOUS: %.lstmf
%.lstmf: %.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.bin.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.nrm.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.raw.png %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

%.lstmf: %.tif %.box
set -x; \
tesseract "$<" $* --psm $(PSM) lstm.train

CHECKPOINT_FILES := $(wildcard $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)*.checkpoint)
@@ -301,6 +298,14 @@ $(OUTPUT_DIR)/tessdata_fast/%.traineddata: $(OUTPUT_DIR)/checkpoints/%.checkpoin
proto-model: $(PROTO_MODEL)

$(PROTO_MODEL): $(OUTPUT_DIR)/unicharset $(TESSERACT_LANGDATA)
ifeq (Windows_NT, $(OS))
dos2unix "$(NUMBERS_FILE)"
dos2unix "$(PUNC_FILE)"
dos2unix "$(WORDLIST_FILE)"
dos2unix "$(LANGDATA_DIR)/$(MODEL_NAME)/$(MODEL_NAME).config"
endif
$(if $(filter-out $(realpath $@),$(realpath $(DATA_DIR)/$(MODEL_NAME)/$(MODEL_NAME).traineddata)),\
$(error $@!=$(DATA_DIR)/$(MODEL_NAME)/$(MODEL_NAME).traineddata -- consider setting different values for DATA_DIR, OUTPUT_DIR, or PROTO_MODEL))
combine_lang_model \
--input_unicharset $(OUTPUT_DIR)/unicharset \
--script_dir $(LANGDATA_DIR) \
@@ -356,55 +361,12 @@ $(OUTPUT_DIR).traineddata: $(LAST_CHECKPOINT)
--model_output $@
endif

TESSERACT_SCRIPTS := Arabic Armenian Bengali Bopomofo Canadian_Aboriginal Cherokee Cyrillic
TESSERACT_SCRIPTS += Devanagari Ethiopic Georgian Greek Gujarati Gurmukhi
TESSERACT_SCRIPTS += Hangul Han Hebrew Hiragana Kannada Katakana Khmer Lao Latin
TESSERACT_SCRIPTS += Malayalam Myanmar Ogham Oriya Runic Sinhala Syriac Tamil Telugu Thai

TESSERACT_LANGDATA = $(LANGDATA_DIR)/radical-stroke.txt $(TESSERACT_SCRIPTS:%=$(LANGDATA_DIR)/%.unicharset)

tesseract-langdata: $(TESSERACT_LANGDATA)

$(TESSERACT_LANGDATA):
@mkdir -p $(@D)
wget -O $@ 'https://github.com/tesseract-ocr/langdata_lstm/raw/main/$(@F)'
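Each unicharset (and `radical-stroke.txt`) is fetched individually from the `langdata_lstm` repository. Fetching one script by hand follows the same URL pattern; for example, assuming the default `data/langdata` directory:

```sh
# Download the Latin script unicharset into the langdata directory.
mkdir -p data/langdata
wget -O data/langdata/Latin.unicharset \
  'https://github.com/tesseract-ocr/langdata_lstm/raw/main/Latin.unicharset'
```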

# Build leptonica
leptonica: leptonica.built

leptonica.built: leptonica-$(LEPTONICA_VERSION)
cd $< ; \
./configure --prefix=$(LOCAL) && \
make -j$(CORES) install SUBDIRS=src && \
date > "$@"

leptonica-$(LEPTONICA_VERSION): leptonica-$(LEPTONICA_VERSION).tar.gz
tar xf "$<"

leptonica-$(LEPTONICA_VERSION).tar.gz:
wget 'http://www.leptonica.org/source/$@'

# Build tesseract
tesseract: tesseract.built tesseract-langs

tesseract.built: tesseract-$(TESSERACT_VERSION)
cd $< && \
sh autogen.sh && \
PKG_CONFIG_PATH="$(LOCAL)/lib/pkgconfig" \
./configure --prefix=$(LOCAL) && \
LDFLAGS="-L$(LOCAL)/lib"\
make -j$(CORES) install && \
LDFLAGS="-L$(LOCAL)/lib"\
make -j$(CORES) training-install && \
date > "$@"

tesseract-$(TESSERACT_VERSION):
wget https://github.com/tesseract-ocr/tesseract/archive/$(TESSERACT_VERSION).zip
unzip $(TESSERACT_VERSION).zip

# Download tesseract-langs
tesseract-langs: $(TESSDATA)/eng.traineddata

$(TESSDATA)/%.traineddata:
wget -O $@ 'https://github.com/tesseract-ocr/tessdata$(TESSDATA_REPO)/raw/main/$(@F)'
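With the default `TESSDATA_REPO = _best`, the stock model download above resolves to the `tessdata_best` repository; a manual download of the English model into the default `./usr/share/tessdata` directory would look like this:

```sh
# Fetch the 'best' English model used as a starting point for fine-tuning.
mkdir -p usr/share/tessdata
wget -O usr/share/tessdata/eng.traineddata \
  'https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata'
```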

@@ -424,4 +386,4 @@ clean-output:
rm -rf $(OUTPUT_DIR)

# Clean all generated files
clean: clean-box clean-lstmf clean-output
clean: default clean-box clean-lstmf clean-output
36 changes: 14 additions & 22 deletions README.md
@@ -1,38 +1,35 @@
# tesstrain

> Training workflow for Tesseract 4 as a Makefile for dependency tracking and building the required software from source.
> Training workflow for Tesseract 5 as a Makefile for dependency tracking.
## Install

### Auxiliaries

You will need at least GNU `make`, `wget`, `find`, `bash`, `unzip` and `bc`.
You will need at least GNU `make` (version 4.2 or newer), `wget`, `find`, `bash`, `unzip` and `bc`.

### Leptonica, Tesseract

You will need a recent version (>= 4.0.0beta1) of tesseract built with the
You will need a recent version (>= 5.3) of tesseract built with the
training tools and matching leptonica bindings.
[Build](https://github.com/tesseract-ocr/tesseract/wiki/Compiling)
[instructions](https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation)
and more can be found in the [Tesseract project
wiki](https://github.com/tesseract-ocr/tesseract/wiki/).
[Build](https://tesseract-ocr.github.io/tessdoc/Compiling)
[instructions](https://tesseract-ocr.github.io/tessdoc/Compiling-%E2%80%93-GitInstallation)
and more can be found in the [Tesseract User Manual](https://tesseract-ocr.github.io/tessdoc/).

Alternatively, you can build leptonica and tesseract within this project and install it to a subdirectory `./usr` in the repo:
#### Windows

```sh
make leptonica tesseract
```

Tesseract will be built from the git repository, which requires CMake,
autotools (including autotools-archive) and some additional libraries for the
training tools. See the [installation notes in the tesseract
repository](https://github.com/tesseract-ocr/tesseract/blob/main/INSTALL.GIT.md).
1. Install the latest tesseract (e.g. from https://digi.bib.uni-mannheim.de/tesseract/) and make sure that tesseract is added to your PATH.
2. Install [Python 3](https://www.python.org/downloads/)
3. Install [Git for Windows](https://gitforwindows.org/); it provides many Linux utilities on Windows (e.g. `find`, `unzip`, `rm`). Put `C:\Program Files\Git\usr\bin` at the beginning of your PATH variable (temporarily you can do this in `cmd` with `set PATH=C:\Program Files\Git\usr\bin;%PATH%`). Unfortunately, several Windows tools have the same names as the Linux tools (`find`, `sort`) but different behaviour, and they must be avoided during training.
4. Install winget/[Windows Package Manager](https://github.com/microsoft/winget-cli/releases/) and then run `winget install GnuWin32.Make` and `winget install wget` to install the missing tools.
5. Download [bc and dc calculator for Windows](https://embedeo.org/ws/command_line/bc_dc_calculator_windows/) and unzip `bc.exe` somewhere on your PATH (e.g. `unzip -j bc-1.07.1-win32-embedeo-02.zip "bc-1.07.1-win32-embedeo-02/bin/bc.exe" -d "c:\Program Files\Tools"`). A quick sanity check is sketched right after this list.
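After these steps, confirm that each required tool resolves from your PATH and reports a version; all of the commands below should succeed from `cmd` or Git Bash:

```sh
# All of these must be found on PATH before training on Windows.
tesseract --version
make --version
wget --version
bc --version
python --version
```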

### Python

You need a recent version of Python 3.x. For image processing the Python library `Pillow` is used.
If you don't have a global installation, install the dependencies from the provided requirements file: `pip install -r requirements.txt`.
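If you prefer to keep the training dependencies isolated, a virtual environment works just as well (a minimal sketch, assuming Python 3 on PATH and the repository's `requirements.txt`):

```sh
# Create an isolated environment and install Pillow and friends into it.
python3 -m venv venv
. venv/bin/activate
pip install -r requirements.txt
```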


### Language data

Tesseract expects some configuration data (a file `radical-stroke.txt` and `*.unicharset` for all scripts) in `DATA_DIR`.
@@ -43,6 +40,7 @@ To fetch them:
(This step is only needed once and already included implicitly in the `training` target,
but you might want to run it explicitly in advance.)
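The target in question is `tesseract-langdata` (see the target list under `make help`); running it once ahead of training looks like this, assuming the default `DATA_DIR=data`:

```sh
# Fetch radical-stroke.txt and the per-script unicharsets into data/langdata.
make tesseract-langdata
```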


## Choose model name

Choose a name for your model. By convention, Tesseract stack models including
@@ -100,9 +98,6 @@ Run `make help` to see all the possible targets and variables:
training Start training
traineddata Create best and fast .traineddata files from each .checkpoint file
proto-model Build the proto model
leptonica Build leptonica
tesseract Build tesseract
tesseract-langs Download minimal stock models
tesseract-langdata Download stock unicharsets
clean Clean all generated files
@@ -117,9 +112,6 @@ Run `make help` to see all the possible targets and variables:
DATA_DIR Data directory for output files, proto model, start model, etc. Default: data
OUTPUT_DIR Output directory for generated files. Default: DATA_DIR/MODEL_NAME
GROUND_TRUTH_DIR Ground truth directory. Default: OUTPUT_DIR-ground-truth
CORES No of cores to use for compiling leptonica/tesseract. Default: 4
LEPTONICA_VERSION Leptonica version. Default: 1.78.0
TESSERACT_VERSION Tesseract commit. Default: 4.1.1
TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: _best
TESSDATA Path to the .traineddata directory to start finetuning from. Default: ./usr/share/tessdata
MAX_ITERATIONS Max iterations. Default: 10000
2 changes: 1 addition & 1 deletion plot/plot_cer.py
@@ -58,7 +58,7 @@ def annot_min(boxcolor, xpos, ypos, x,y):

ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independant x-axes now span the same range
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
2 changes: 1 addition & 1 deletion plot/plot_cer_validation.py
@@ -57,7 +57,7 @@ def annot_min(boxcolor, xpos, ypos, x,y):

ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independant x-axes now span the same range
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
