Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

script to plot CER from training logfile #203

Merged
merged 6 commits into from
Nov 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ data/*.traineddata
wackenroder_herzensergiessungen_*.gt.txt
wackenroder_herzensergiessungen_*.tif
master.zip
plot/*.LOG
plot/ocrd*
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,24 @@ It is also possible to create models for selected checkpoints only. Examples:

Add `MODEL_NAME` and `OUTPUT_DIR` and replace `data/foo` by the output directory if needed.

## Plotting CER (experimental)

Training and evaluation CER can be plotted using matplotlib. A couple of scripts are provided
in the `plot` subdirectory as a starting point for plotting different training scenarios. The
training log is expected to be saved as `plot/TESSTRAIN.LOG`.

As an example, use the training data provided in
[ocrd-testset.zip](./ocrd-testset.zip) to do training and generate the plots.
Plotting can also be done while training is still running, to show the training progress up to that point.
```
unzip ocrd-testset.zip -d data/ocrd-ground-truth
nohup make training MODEL_NAME=ocrd START_MODEL=frk TESSDATA=~/tessdata_best MAX_ITERATIONS=10000 > plot/TESSTRAIN.LOG &
```
```
cd ./plot
./plot_cer.sh
```

## License

Software is provided under the terms of the `Apache 2.0` license.
Expand Down
67 changes: 67 additions & 0 deletions plot/plot_cer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

maxticks=10

# CER data extracted from the training log (tab-separated, produced by plot_cer.sh).
dataframe = pd.read_csv("plot_cer.csv",sep='\t', encoding='utf-8')

# The TrainingIteration column may have gaps (NaN).  Round-trip through the
# sentinel -2 so values render as integer strings on the secondary x-axis
# while the gaps stay blank.
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].fillna(-2)
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].astype(int)
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].astype(str)
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].replace('-2', np.nan)

t = dataframe['TrainingIteration']   # training iterations (string labels, secondary axis)
x = dataframe['LearningIteration']   # learning iterations (primary x-axis)
y = dataframe.IterationCER           # CER sampled every 100 training iterations
c = dataframe.CheckpointCER          # CER of the best model checkpoints
e = dataframe.EvalCER                # evaluation CER

# Upper y-limit: 2 points above the worst checkpoint CER.  Series.max()
# skips NaN rows, unlike c[np.argmax(c)], which can misbehave when the
# column contains NaN entries (expected here — see the dropna() checks below).
cmax = c.max()
maxCERtoDisplay=cmax+2

def annot_min(boxcolor, xpos, ypos, x, y):
    """Mark the minimum of series *y* on ax1 with an arrowed, colored text box.

    boxcolor: fill color of the annotation box.
    xpos/ypos: offset of the box from the minimum, in points.
    x, y: the plotted series; the minimum of y is located positionally.
    """
    best = np.argmin(y)          # position of the lowest CER
    lowest = y.min()
    where = x[best]
    note = "{:.3f}% at Learning Iteration {:.0f}" .format(lowest, where)
    ax1.annotate(note,
                 xy=(where, lowest),
                 xytext=(xpos, ypos),
                 textcoords='offset points',
                 arrowprops=dict(shrinkA=0.05, shrinkB=1, fc='black', ec='white',
                                 connectionstyle="arc3"),
                 bbox=dict(boxstyle='round,pad=0.2', fc=boxcolor, alpha=0.3))

PlotTitle="Tesseract LSTM training and Evaluation Character Error Rates (-1 to " + str(maxCERtoDisplay) + "%)"

fig = plt.figure(figsize=(11,8.5)) #size is in inches
ax1 = fig.add_subplot()
# BUGFIX: the title must be set AFTER the figure is created.  Calling
# plt.title() first attaches it to an implicit throwaway figure, so the
# saved plot had no title.  (The validation script already does this right.)
plt.title(label=PlotTitle)
ax1.set_ylim([-1,maxCERtoDisplay])
ax1.set_xlim([-1000,30000])  # fixed range; adjust for longer trainings
ax1.set_xlabel('Learning Iterations')
ax1.set_ylabel('Character Error Rate (%)')
ax1.set_xticks(x)
ax1.tick_params(axis='x', rotation=45, labelsize='small')
ax1.locator_params(axis='x', nbins=maxticks) # limit ticks on x-axis
ax1.grid(True)

# Best-model checkpoint CER, only if present in the log.
if not c.dropna().empty: # not NaN or empty
    ax1.scatter(x, c, c='gold', s=50, label='Best Model Checkpoints CER')
    ax1.plot(x, c, 'gold')
    annot_min('gold',-150,-30,x,c)

# CER sampled every 100 training iterations (always present).
ax1.scatter(x, y, s=3, c='teal', label='CER every 100 Training Iterations')
ax1.plot(x, y, 'teal', linewidth=0.7)

# Evaluation CER, only if evaluation was run during training.
if not e.dropna().empty: # not NaN or empty
    ax1.plot(x, e, 'magenta')
    ax1.scatter(x, e, c='magenta', s=50, label='Evaluation CER')
    annot_min('magenta',-150,40,x,e)

plt.legend(loc='upper right')

# Secondary x-axis on top: same tick positions, labelled with Training Iterations.
ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
ax2.locator_params(axis='x', nbins=maxticks) # limit ticks on secondary x-axis

plt.savefig("plot_cer.png")
66 changes: 66 additions & 0 deletions plot/plot_cer_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

maxticks=10

# CER data extracted from the training/validation log (tab-separated).
dataframe = pd.read_csv("plot_cer_validation.csv",sep='\t', encoding='utf-8')
t = dataframe['TrainingIteration']   # training iterations (secondary axis labels)
x = dataframe['LearningIteration']   # learning iterations (primary x-axis)
v = dataframe.ValidationCER          # validation CER
c = dataframe.CheckpointCER          # CER of the best model checkpoints

# Upper y-limit: 2 points above the worst (largest) CER in either series.
# Series.max() skips NaN rows, unlike c[np.argmax(c)] / v[np.argmax(v)],
# which can misbehave when a column contains NaN entries (expected here —
# see the dropna() checks below).
cmax = c.max()
vmax = v.max()
if vmax > cmax:
    maxCERtoDisplay=vmax+2
else:
    maxCERtoDisplay=cmax+2

def annot_min(boxcolor, xpos, ypos, x, y):
    """Mark the minimum of series *y* on ax1 with a colored text box.

    boxcolor: fill color of the annotation box.
    xpos/ypos: offset of the box from the minimum, in points.
    x, y: the plotted series; the minimum of y is located positionally.
    """
    best = np.argmin(y)          # position of the lowest CER
    lowest = y.min()
    where = x[best]
    note = "{:.3f}% at Learning Iteration {:}" .format(lowest, where)
    ax1.annotate(note,
                 xy=(where, lowest),
                 xytext=(xpos, ypos),
                 textcoords='offset points',
                 bbox=dict(boxstyle='round,pad=0.2', fc=boxcolor, alpha=0.3))

# Build the figure: primary x-axis = learning iterations (bottom),
# secondary x-axis = training iterations (top), shared y-axis = CER %.
PlotTitle="Tesseract LSTM Training and Validation Character Error Rate %"
fig = plt.figure(figsize=(11,8.5)) #size is in inches
ax1 = fig.add_subplot()
ax1.set_ylim([-1,maxCERtoDisplay])
ax1.set_xlabel('Learning Iterations')
ax1.set_ylabel('Character Error Rate (%)')
ax1.set_xticks(x)
ax1.tick_params(axis='x', rotation=45, labelsize='small')
ax1.locator_params(axis='x', nbins=maxticks) # limit ticks on x-axis
ax1.grid(True)

# Best-model checkpoint CER, only if present in the log.
if not c.dropna().empty: # not NaN or empty
    ax1.scatter(x, c, c='gold', s=50, label='Best Model Checkpoints CER')
    ax1.plot(x, c, 'gold')
    annot_min('gold',-100,-30,x,c)

# Validation CER, only if validation was run.
if not v.dropna().empty: # not NaN or empty
    ax1.plot(x, v, 'blue')
    ax1.scatter(x, v, c='blue', s=50, label='Validation CER')
    annot_min('blue',-100,-30,x,v)

# CER of START_MODEL using same eval list, drawn as a red text marker at the
# left edge of the plot for reference.
dflang = pd.read_csv("plot_cer_lang.csv",sep='\t', encoding='utf-8')
ax1.text(x.min(),dflang.LangCER[0],
"{:.3f}% for START_MODEL {}" .format(dflang.LangCER[0],dflang.Name[0]),
color='red')

plt.title(label=PlotTitle)
plt.legend(loc='upper right')

# Secondary x-axis on top: same tick positions, labelled with Training Iterations.
ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
ax2.locator_params(axis='x', nbins=maxticks) # limit ticks on secondary x-axis

plt.savefig("plot_cer_validation.png")