Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

script to plot CER from training logfile #203

Merged
merged 6 commits into from
Nov 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ data/*.traineddata
wackenroder_herzensergiessungen_*.gt.txt
wackenroder_herzensergiessungen_*.tif
master.zip
plot/*.LOG
plot/ocrd*
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,24 @@ It is also possible to create models for selected checkpoints only. Examples:

Add `MODEL_NAME` and `OUTPUT_DIR` and replace `data/foo` by the output directory if needed.

## Plotting CER (experimental)

Training and evaluation CER can be plotted using matplotlib. A couple of scripts are provided
in the `plot` subdirectory as a starting point for plotting different training scenarios. The
training log is expected to be saved as `plot/TESSTRAIN.LOG`.

As an example, use the training data provided in
[ocrd-testset.zip](./ocrd-testset.zip) to do training and generate the plots.
Plotting can also be done while training is still running, to show the training progress up to that point.
```
unzip ocrd-testset.zip -d data/ocrd-ground-truth
nohup make training MODEL_NAME=ocrd START_MODEL=frk TESSDATA=~/tessdata_best MAX_ITERATIONS=10000 > plot/TESSTRAIN.LOG &
```
```
cd ./plot
./plot_cer.sh
```

## License

Software is provided under the terms of the `Apache 2.0` license.
Expand Down
67 changes: 67 additions & 0 deletions plot/plot_cer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

maxticks=10

# CER data extracted from the training log (tab-separated, produced by plot_cer.sh).
dataframe = pd.read_csv("plot_cer.csv",sep='\t', encoding='utf-8')

# The TrainingIteration column may have gaps (NaN).  Round-trip through the
# sentinel -2 so values render as integer strings on the secondary x-axis
# while the gaps stay blank.
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].fillna(-2)
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].astype(int)
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].astype(str)
dataframe['TrainingIteration'] = dataframe['TrainingIteration'].replace('-2', np.nan)

t = dataframe['TrainingIteration']   # training iterations (string labels, secondary axis)
x = dataframe['LearningIteration']   # learning iterations (primary x-axis)
y = dataframe.IterationCER           # CER sampled every 100 training iterations
c = dataframe.CheckpointCER          # CER of the best model checkpoints
e = dataframe.EvalCER                # evaluation CER

# Upper y-limit: 2 points above the worst checkpoint CER.  Series.max()
# skips NaN rows, unlike c[np.argmax(c)], which can misbehave when the
# column contains NaN entries (expected here — see the dropna() checks below).
cmax = c.max()
maxCERtoDisplay=cmax+2

def annot_min(boxcolor, xpos, ypos, x, y):
    """Mark the minimum of series *y* on ax1 with an arrowed, colored text box.

    boxcolor: fill color of the annotation box.
    xpos/ypos: offset of the box from the minimum, in points.
    x, y: the plotted series; the minimum of y is located positionally.
    """
    best = np.argmin(y)          # position of the lowest CER
    lowest = y.min()
    where = x[best]
    note = "{:.3f}% at Learning Iteration {:.0f}" .format(lowest, where)
    ax1.annotate(note,
                 xy=(where, lowest),
                 xytext=(xpos, ypos),
                 textcoords='offset points',
                 arrowprops=dict(shrinkA=0.05, shrinkB=1, fc='black', ec='white',
                                 connectionstyle="arc3"),
                 bbox=dict(boxstyle='round,pad=0.2', fc=boxcolor, alpha=0.3))

PlotTitle="Tesseract LSTM training and Evaluation Character Error Rates (-1 to " + str(maxCERtoDisplay) + "%)"

fig = plt.figure(figsize=(11,8.5)) #size is in inches
ax1 = fig.add_subplot()
# BUGFIX: the title must be set AFTER the figure is created.  Calling
# plt.title() first attaches it to an implicit throwaway figure, so the
# saved plot had no title.  (The validation script already does this right.)
plt.title(label=PlotTitle)
ax1.set_ylim([-1,maxCERtoDisplay])
ax1.set_xlim([-1000,30000])  # fixed range; adjust for longer trainings
ax1.set_xlabel('Learning Iterations')
ax1.set_ylabel('Character Error Rate (%)')
ax1.set_xticks(x)
ax1.tick_params(axis='x', rotation=45, labelsize='small')
ax1.locator_params(axis='x', nbins=maxticks) # limit ticks on x-axis
ax1.grid(True)

# Best-model checkpoint CER, only if present in the log.
if not c.dropna().empty: # not NaN or empty
    ax1.scatter(x, c, c='gold', s=50, label='Best Model Checkpoints CER')
    ax1.plot(x, c, 'gold')
    annot_min('gold',-150,-30,x,c)

# CER sampled every 100 training iterations (always present).
ax1.scatter(x, y, s=3, c='teal', label='CER every 100 Training Iterations')
ax1.plot(x, y, 'teal', linewidth=0.7)

# Evaluation CER, only if evaluation was run during training.
if not e.dropna().empty: # not NaN or empty
    ax1.plot(x, e, 'magenta')
    ax1.scatter(x, e, c='magenta', s=50, label='Evaluation CER')
    annot_min('magenta',-150,40,x,e)

plt.legend(loc='upper right')

# Secondary x-axis on top: same tick positions, labelled with Training Iterations.
ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
ax2.locator_params(axis='x', nbins=maxticks) # limit ticks on secondary x-axis

plt.savefig("plot_cer.png")
66 changes: 66 additions & 0 deletions plot/plot_cer_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

maxticks=10

# CER data extracted from the training/validation log (tab-separated).
dataframe = pd.read_csv("plot_cer_validation.csv",sep='\t', encoding='utf-8')
t = dataframe['TrainingIteration']   # training iterations (secondary axis labels)
x = dataframe['LearningIteration']   # learning iterations (primary x-axis)
v = dataframe.ValidationCER          # validation CER
c = dataframe.CheckpointCER          # CER of the best model checkpoints

# Upper y-limit: 2 points above the worst (largest) CER in either series.
# Series.max() skips NaN rows, unlike c[np.argmax(c)] / v[np.argmax(v)],
# which can misbehave when a column contains NaN entries (expected here —
# see the dropna() checks below).
cmax = c.max()
vmax = v.max()
if vmax > cmax:
    maxCERtoDisplay=vmax+2
else:
    maxCERtoDisplay=cmax+2

def annot_min(boxcolor, xpos, ypos, x, y):
    """Mark the minimum of series *y* on ax1 with a colored text box.

    boxcolor: fill color of the annotation box.
    xpos/ypos: offset of the box from the minimum, in points.
    x, y: the plotted series; the minimum of y is located positionally.
    """
    best = np.argmin(y)          # position of the lowest CER
    lowest = y.min()
    where = x[best]
    note = "{:.3f}% at Learning Iteration {:}" .format(lowest, where)
    ax1.annotate(note,
                 xy=(where, lowest),
                 xytext=(xpos, ypos),
                 textcoords='offset points',
                 bbox=dict(boxstyle='round,pad=0.2', fc=boxcolor, alpha=0.3))

# Build the figure: primary x-axis = learning iterations (bottom),
# secondary x-axis = training iterations (top), shared y-axis = CER %.
PlotTitle="Tesseract LSTM Training and Validation Character Error Rate %"
fig = plt.figure(figsize=(11,8.5)) #size is in inches
ax1 = fig.add_subplot()
ax1.set_ylim([-1,maxCERtoDisplay])
ax1.set_xlabel('Learning Iterations')
ax1.set_ylabel('Character Error Rate (%)')
ax1.set_xticks(x)
ax1.tick_params(axis='x', rotation=45, labelsize='small')
ax1.locator_params(axis='x', nbins=maxticks) # limit ticks on x-axis
ax1.grid(True)

# Best-model checkpoint CER, only if present in the log.
if not c.dropna().empty: # not NaN or empty
    ax1.scatter(x, c, c='gold', s=50, label='Best Model Checkpoints CER')
    ax1.plot(x, c, 'gold')
    annot_min('gold',-100,-30,x,c)

# Validation CER, only if validation was run.
if not v.dropna().empty: # not NaN or empty
    ax1.plot(x, v, 'blue')
    ax1.scatter(x, v, c='blue', s=50, label='Validation CER')
    annot_min('blue',-100,-30,x,v)

# CER of START_MODEL using same eval list, drawn as a red text marker at the
# left edge of the plot for reference.
dflang = pd.read_csv("plot_cer_lang.csv",sep='\t', encoding='utf-8')
ax1.text(x.min(),dflang.LangCER[0],
"{:.3f}% for START_MODEL {}" .format(dflang.LangCER[0],dflang.Name[0]),
color='red')

plt.title(label=PlotTitle)
plt.legend(loc='upper right')

# Secondary x-axis on top: same tick positions, labelled with Training Iterations.
ax2 = ax1.twiny() # ax1 and ax2 share y-axis
ax2.set_xlabel("Training Iterations")
ax2.set_xlim(ax1.get_xlim()) # ensure the independent x-axes now span the same range
ax2.set_xticks(x) # copy over the locations of the x-ticks from Learning Iterations
ax2.tick_params(axis='x', rotation=45, labelsize='small')
ax2.set_xticklabels(t) # But give value of Training Iterations
ax2.locator_params(axis='x', nbins=maxticks) # limit ticks on secondary x-axis

plt.savefig("plot_cer_validation.png")