# -*-makefile-*-
#--------------------------------------------------------------------
# Joerg Tiedemann
#
# general variables for the OPUS corpus project
#--------------------------------------------------------------------
SHELL := bash
ifeq (${shell hostname --domain 2>/dev/null},bullx)
HPC_HOST = puhti
endif
## CSC specific information
CSC_WORK_PROJECT ?= project_2003093
CSC_RELEASE_PROJECT ?= project_2000661
CSC_OPUSMT_PROJECT ?= project_2003288
## CSC default billing project
CSC_PROJECT ?= ${CSC_WORK_PROJECT}
## NLPL space
NLPLHOME ?= /projappl/nlpl
OPUSNLPL := ${NLPLHOME}/data/OPUS
## Object storage container names
WORK_CONTAINER_BASENAME = project-OPUS
RELEASE_CONTAINER_BASENAME = OPUS
## scratch/work space
## TODO: remove hard-coded scratch space location
ifneq (${wildcard /scratch/project_2000661/tmp},)
TMPDIR := /scratch/project_2000661/tmp
endif
TMPDIR ?= /tmp
ifdef LOCAL_SCRATCH
TMPDIR := ${LOCAL_SCRATCH}
endif
WORKDIR ?= ${TMPDIR}
OPUSHOME ?= $(shell pwd | sed 's!\(.*\)\(OPUS\|OPUS-ingest\)/.*!\1\2!')
LOCAL_OPUSHOME ?= ${WORKDIR}/OPUS
CORPUS ?= RF
# OPUSCORPORA ..... home directory of all corpora in OPUS
# OPUSPUB ......... public www data (public_html)
# OPUSRELEASE ..... homedir of released data packages
OPUSCORPORA = ${OPUSHOME}/corpus
OPUSPUB = ${OPUSHOME}/public_html
OPUSGITHOME = ${OPUSHOME}/OPUS
OPUSRELEASE = ${OPUSHOME}/releases
# OPUSRELEASE = ${OPUSNLPL}
# corpus-specific directories
CORPUSHOME = ${OPUSCORPORA}/${CORPUS}
CORPUSSRC = ${CORPUSHOME}/src
CORPUSXML = ${CORPUSHOME}/xml
CORPUSRAW = ${CORPUSHOME}/raw
CORPUSDIC = ${CORPUSHOME}/dic
CORPUSPARSED = ${CORPUSHOME}/parsed
CORPUSPUB = ${OPUSPUB}/${CORPUS}/${VERSION}
CORPUSRELEASE = ${OPUSRELEASE}/${CORPUS}/${VERSION}
CORPUSWORDALIGN = ${CORPUSHOME}/wordalign
## TODO: rename this to CORPUSVERSION?
VERSION ?= latest
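## illustration (not part of the build): with the defaults CORPUS=RF and
## VERSION=latest, the directory variables above expand roughly to
##   CORPUSHOME    = ${OPUSHOME}/corpus/RF
##   CORPUSSRC     = ${OPUSHOME}/corpus/RF/src
##   CORPUSRELEASE = ${OPUSHOME}/releases/RF/latest
## both can be overridden on the command line, e.g.
##   make CORPUS=Europarl VERSION=v8 ...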
## DEPRECATED?
OPUSDATA = ${OPUSHOME}/data
OPUSHTML = ${OPUSHOME}/download
OPUSWORDALIGN = ${OPUSHOME}/wordalign
CORPUSHTML = ${OPUSHTML}/${CORPUS}
# CWB home for standard annotation
OPUSCWB = ${OPUSHOME}/cwb
CWBDATA = ${OPUSCWB}/data
CWBREG = ${OPUSCWB}/reg
# CWB home for UD-parsed data
UDCWB = ${OPUSHOME}/cwb-ud
UDCWBDATA = ${UDCWB}/data
UDCWBREG = ${UDCWB}/reg
# LANGUAGES ....... sub-directories in CORPUSRAW
# alternatively, sub-directories in CORPUSSRC
#
# TODO: check that this really corresponds to valid lang-IDs!
ifndef LANGUAGES
LANGUAGES = $(sort $(notdir ${shell if [ -e ${CORPUSRAW} ]; then \
find ${CORPUSRAW} -mindepth 1 -maxdepth 1 -type d -printf "%f "; fi}))
endif
ifeq (${LANGUAGES},)
LANGUAGES = $(sort $(notdir ${shell if [ -e ${CORPUSSRC} ]; then \
find ${CORPUSSRC} -mindepth 1 -maxdepth 1 -type d -printf "%f "; fi}))
endif
## all language pairs, sorted by language ID
## (each unordered pair appears only once, with source < target)
LANGPAIRS = ${shell \
for s in ${LANGUAGES}; do \
for t in ${LANGUAGES}; do \
if [[ "$$s" < "$$t" ]]; then \
echo "$$s-$$t"; \
fi \
done \
done }
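## worked example (assuming LANGUAGES were set to "de en sv" by hand):
##   LANGPAIRS = de-en de-sv en-sv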
## all pairs for a specific source language
## (this is handy for working with all bitexts of the current source language)
SRCLANGPAIRS = ${shell \
for t in ${LANGUAGES}; do \
if [[ "${SRC}" < "$$t" ]]; then \
echo "${SRC}-$$t"; \
fi \
done }
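## worked example: with SRC=en and LANGUAGES = "de en sv" this yields
##   SRCLANGPAIRS = en-sv
## (note that only pairs in which SRC sorts first are generated)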
# general file extensions of files in the corpus
# (including .gz in the extension means all files will be compressed)
XMLEXT = xml.gz
TXTEXT = txt.gz
ALGEXT = xml.gz
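## these can be overridden on the command line, e.g. to keep the XML
## files uncompressed (hypothetical invocation):
##   make XMLEXT=xml ...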
#---------------------------------------------------------------
# general tools that we will use for pre-processing & alignment
#---------------------------------------------------------------
## number of cores available and number of threads to be used
## restrict threads to 4 on login nodes
## (TODO: should we restrict threads in sort etc to leave some cores
## for other tasks as well?)
CORES := ${shell nproc}
THREADS ?= $(shell if [ `hostname | grep login | wc -l` -gt 0 ]; then echo 4; else echo ${CORES}; fi )
## use pigz for multi-threaded (de-)compression
## TODO: should we optimise some parameters?
GZIP := ${shell which pigz 2>/dev/null || echo gzip}
GZCAT := ${GZIP} -cd
ZCAT := gzip -cd
SORT := sort -T ${TMPDIR} -S1G --parallel=${THREADS}
# SORT := sort -T ${TMPDIR} -S1G --parallel=${THREADS} --compress-program=${GZIP}
UNIQ := ${SORT} -u
MERGE := ${SORT} -m -u
## I don't really trust parsort ...
# ifneq (${shell which parsort 2>/dev/null},)
# SORT := parsort -T ${TMPDIR} -S1G
# endif
## explicitly setting the thread count seems to be necessary on HPC nodes
## (nproc is not reliable?)
ifneq (${GZIP},gzip)
GZIP += -p ${THREADS}
endif
# UNICODE_CLEANUP := perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;'
UNICODE_CLEANUP := ftfy
## use GNU parallel if available
## TODO: should we reduce number of threads?
## TODO: should we optimise other settings?
PARALLEL_ARGS := --max-procs 25% \
--pipe --keep-order -q
# --max-procs ${THREADS} \
# --compress --compress-program ${GZIP}
PARALLEL := ${shell if [ `which parallel 2>/dev/null | wc -l` -gt 0 ]; then echo 'parallel ${PARALLEL_ARGS}'; fi }
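## usage sketch (hypothetical pipeline; ${PARALLEL} expands to nothing if
## GNU parallel is not installed, so the filter then runs single-threaded):
##   ${GZCAT} input.txt.gz | ${PARALLEL} ${UNICODE_CLEANUP} | ${GZIP} -c > output.txt.gz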
## convenient function to reverse a list
reverse = $(if $(wordlist 2,2,$(1)),$(call reverse,$(wordlist 2,$(words $(1)),$(1))) $(firstword $(1)),$(1))
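## worked example: $(call reverse,a b c) expands to "c b a"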
TOOLS = ${OPUSHOME}/tools
OPUSTOOLS = ${TOOLS}
CORPUSTOOLS = ${OPUSTOOLS}/${CORPUS}
ISO639 = opus-iso639
## TODO: get rid of local UPLUG
## or create git submodules or install pre-requisites
## even better: get rid of Uplug ....
UPLUGHOME = ${TOOLS}/public/uplug/uplug-main
UPLUGCWB = ${TOOLS}/public/uplug/uplug-cwb
UPLUGTOOLS = ${UPLUGHOME}/tools
UPLUG = $(shell which uplug)
## SMT tools via Moses
## TODO: adjust to NLPL
## moses as git submodule?
MOSESHOME := ${patsubst %/bin/moses,%,${shell which moses 2>/dev/null}}
MOSESSCRIPTS := ${MOSESHOME}/scripts
SCRIPTS_ROOTDIR := ${MOSESSCRIPTS}
## TODO: locally compiled version of SALM!
## this will not work everywhere
SALMHOME := ${OPUSTOOLS}/SALM
## Word alignment tools (eflomal and atools from fast-align)
EFLOMAL = align.py
ATOOLS = atools
ifeq (${HPC_HOST},puhti)
EFLOMAL = module use -a /projappl/nlpl/software/modules/etc && module load nlpl-efmaral && align_eflomal.py
EFMARAL = module use -a /projappl/nlpl/software/modules/etc && module load nlpl-efmaral && align.py
endif
## word alignment priors
ifneq (${wildcard ${NLPLHOME}/data/translation/eflomal},)
EFLOMAL_PRIORS = ${NLPLHOME}/data/translation/eflomal
endif
EFLOMAL_PRIORS ?= $(OPUSHOME)/eflomal
## max size of a bitext to be aligned
## (the makefile will automatically split before aligning)
MAX_EFLOMAL_CORPUS_SIZE = 5000000
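## e.g. a bitext of 12,000,000 sentence pairs would be split into three
## chunks (5M + 5M + 2M) and aligned chunk by chunk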
#--------------------------------------------------------------------
# individual pre-processing steps (using Uplug)
#
# PAR ........ paragraph markup
# SENT ....... sentence boundary detection
# TOK ........ tokenization
# PRE_ALL .... make everything (par, sent, tok)
# PRE_SENT ... make only par & sent (no tokenization)
#--------------------------------------------------------------------
POLYGLOT_SENT = ${OPUSTOOLS}/opus-polyglot-sentence-splitter.py -l ${LANGUAGE}
POLYGLOT_TOK = ${OPUSTOOLS}/opus-polyglot-tokenize.py -l ${LANGUAGE}
UPLUG_PAR = ${UPLUG} pre/markup
UPLUG_SENT = ${UPLUG} pre/sent -l ${LANGUAGE}
UPLUG_TOK = ${UPLUG} pre/tok -l ${LANGUAGE}
UPLUG_PRE = ${UPLUG} pre/par-sent
UPLUG_PRE_ALL = ${UPLUG} pre/basic
TOKSIMPLE = ${UPLUG} pre/tok-simple
TOKMOSES = ${UPLUG} pre/tok -l ${LANGUAGE}
PAR = ${UPLUG_PAR}
SENT = ${POLYGLOT_SENT}
TOK = ${POLYGLOT_TOK}
PRE = ${UPLUG_PRE}
PRE_ALL = ${UPLUG_PRE_ALL}
# language specific tools if they exist
# 1) all pre-processing steps
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}-all 2>/dev/null}))
PRE_ALL=${UPLUG} pre/${LANGUAGE}-all
endif
# 2) a language-specific tokenizer
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}/tok 2>/dev/null}))
TOK=${UPLUG} pre/${LANGUAGE}/tok
endif
# 3) a tokenizer + tagger tool
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}/toktag 2>/dev/null}))
TOK=${UPLUG} pre/${LANGUAGE}/toktag
endif
# 4) a language-specific sentence boundary detection tool
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}/sent 2>/dev/null}))
SENT=${UPLUG} pre/${LANGUAGE}/sent
endif
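## (how the detection above works: 'uplug -e <module>' is expected to print
## the path of the module file, and $(wildcard ...) tests that it exists;
## the variables keep their generic defaults otherwise)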
#--------------------------------------------------------------------
# sentence alignment
# sent ... Gale & Church (original C code via Uplug)
# hun .... hunalign (called from Uplug) - default
#--------------------------------------------------------------------
SENTALIGN = ${UPLUG} align/hun
# SENTALIGN = ${UPLUG} align/sent
## always load the generic makefile for submitting HPC jobs
## TODO: is it good to do that here?
include ${OPUSCORPORA}/Makefile.submit