apptainer-train.sh
#!/bin/bash
set -e
VERSION=1.2.10
# HTR mode selection (set HTRLOGHI to 1 to run Loghi HTR training below)
HTRLOGHI=1
# Model configuration
HTRLOGHIMODELHEIGHT=64
HTRBASEMODEL=/proj/berzelius-2024-46/users/x_zhyan/loghi/generic-2023-02-15
# Set to 1 to train from the base model above, 0 to train without it
USEBASEMODEL=1
# Define a VGSL model. If this is changed, the model must be retrained from scratch; finetuning an existing model will not work.
HTRNEWMODEL="None,64,None,3 Cr3,3,24 Bn Mp2,2,2,2 Cr3,3,48 Bn Mp2,2,2,2 Cr3,3,96 Bn Cr3,3,96 Bn Mp2,2,2,2 Rc Bl256 Bl256 Bl256 Bl256 Bl256 O1s92"
# Set channels to 1 to process input as grayscale, 3 for color, 4 for color and mask
channels=4
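# Note: channels=4 matches the --use_mask flag passed to main.py below
# (assumption: the mask is carried in the fourth channel, so with
# channels=1 or 3 you would likely drop --use_mask).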
GPU=0
# Dataset and training configuration
# These should be generated automatically by the apptainer-create-new.sh script
listdir=/proj/berzelius-2024-46/users/x_zhyan/All_training
trainlist=$listdir/training_all_train.txt
validationlist=$listdir/training_all_val.txt
datadir=./scratch/republicprint
# Set to the charlist file in the model folder
charlist=/proj/berzelius-2024-46/users/x_zhyan/loghi/generic-2023-02-15/charlist.txt
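# Optional sanity check (a minimal addition, not part of the original script):
# fail early if the charlist is missing rather than inside the container.
[[ -f "$charlist" ]] || { echo "charlist not found: $charlist" >&2; exit 1; }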
# Training configuration
# 40 epochs is a good starting point for training from scratch, 15 for transfer learning, 7-10 for finetuning.
epochs=10
height=$HTRLOGHIMODELHEIGHT
multiply=1
# Output directory
outdir=/proj/berzelius-2024-46/users/x_zhyan/models
# Batch sizes of 64/128/256 fit comfortably on an A100.
batch_size=64
model_name=myfirstmodel
learning_rate=0.0003
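# Rule of thumb (an assumption, not from the Loghi docs): keep 0.0003 when
# training from scratch; use a smaller rate, e.g. 1e-5, when finetuning.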
# Create the output directory; change the name if needed.
mkdir -p $outdir/output_freeze_recurrent_dense
# DO NOT EDIT BELOW THIS LINE
tmpdir=$(mktemp -d)
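# Remove the temp directory on exit (assumption: tmpdir is scratch space only).
trap 'rm -rf "$tmpdir"' EXIT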
BASEMODEL=""
BASEMODELDIR=""
if [[ $USEBASEMODEL -eq 1 ]]; then
BASEMODEL=" --existing_model "$HTRBASEMODEL
BASEMODELDIR="-B $(dirname "${HTRBASEMODEL}"):$(dirname "${HTRBASEMODEL}")"
fi
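# With USEBASEMODEL=1 the variables above expand roughly to (illustrative, paths abbreviated):
#   BASEMODEL=" --existing_model /proj/.../generic-2023-02-15"
#   BASEMODELDIR="-B /proj/.../loghi:/proj/.../loghi"
# i.e. the base model directory is bind-mounted into the container at the same path.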
# Loghi-HTR invocation
# Some arguments are not used here but may be useful:
# --optimizer: Optimizer to use, default is adam
# --decay_steps and --decay_rate: Learning rate decay; defaults are -1 (no decay) and 0.99. These are in beta stage.
# --use_float32: Use float32 instead of float16; float16 is the default for training.
# --replace_recurrent_layer: Replace the recurrent layers of a loaded model with newly defined ones.
# --freeze_conv_layers / --freeze_dense_layers / --freeze_recurrent_layers: Freeze those layers of the model. This is useful for transfer learning.
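# Example (using the flags listed above): for transfer learning you might append
#   --freeze_conv_layers
# to the command below so only the recurrent and dense layers are updated.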
if [[ $HTRLOGHI -eq 1 ]]; then
echo "Starting Loghi HTR"
apptainer exec --nv\
$BASEMODELDIR \
--bind $outdir:$outdir \
--bind $listdir:$listdir \
htr.sif python3 /src/loghi-htr/src/main.py \
--do_train \
--train_list $trainlist \
--do_validate \
--validation_list $validationlist \
--learning_rate $learning_rate \
--channels $channels \
--batch_size $batch_size \
--epochs $epochs \
--gpu $GPU \
--height $height \
--use_mask \
--seed 1 \
--beam_width 1 \
--model "$HTRNEWMODEL" \
--multiply $multiply \
--output $listdir \
--model_name $model_name \
--output_charlist $outdir/output_charlist.charlist \
--output $outdir/output_freeze_recurrent_dense $BASEMODEL
fi
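# Typical invocation on a SLURM cluster such as Berzelius (hypothetical wrapper;
# adjust account/partition/time flags to your site):
#   sbatch --gpus=1 -t 24:00:00 apptainer-train.sh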
echo "Results can be found at:"
echo $outdir