# -*-makefile-*-
#--------------------------------------------------------------------
# Joerg Tiedemann
#
# general variables for the OPUS corpus project
#--------------------------------------------------------------------
SHELL := bash
ifeq (${shell hostname --domain 2>/dev/null},bullx)
HPC_HOST = puhti
endif
## CSC specific information
CSC_WORK_PROJECT ?= project_2003093
CSC_RELEASE_PROJECT ?= project_2000661
CSC_OPUSMT_PROJECT ?= project_2003288
## CSC default billing project
CSC_PROJECT ?= ${CSC_WORK_PROJECT}
## NLPL space
NLPLHOME ?= /projappl/nlpl
OPUSNLPL := ${NLPLHOME}/data/OPUS
## Object storage container names
WORK_CONTAINER_BASENAME = project-OPUS
RELEASE_CONTAINER_BASENAME = OPUS
## scratch/work space
## TODO: remove hard-coded scratch space location
ifneq (${wildcard /scratch/project_2000661/tmp},)
TMPDIR := /scratch/project_2000661/tmp
endif
TMPDIR ?= /tmp
ifdef LOCAL_SCRATCH
TMPDIR := ${LOCAL_SCRATCH}
endif
WORKDIR ?= ${TMPDIR}
OPUSHOME ?= $(shell pwd | sed 's!\(.*\)\(OPUS\|OPUS-ingest\)/.*!\1\2!')
LOCAL_OPUSHOME ?= ${WORKDIR}/OPUS
CORPUS ?= RF
# OPUSCORPORA ..... home directory of all corpora in OPUS
# OPUSPUB ......... public www data (public_html)
# OPUSRELEASE ..... homedir of released data packages
OPUSCORPORA = ${OPUSHOME}/corpus
OPUSPUB = ${OPUSHOME}/public_html
OPUSGITHOME = ${OPUSHOME}/OPUS
OPUSRELEASE = ${OPUSHOME}/releases
# OPUSRELEASE = ${OPUSNLPL}
# corpus-specific directories
CORPUSHOME = ${OPUSCORPORA}/${CORPUS}
CORPUSSRC = ${CORPUSHOME}/src
CORPUSXML = ${CORPUSHOME}/xml
CORPUSRAW = ${CORPUSHOME}/raw
CORPUSDIC = ${CORPUSHOME}/dic
CORPUSPARSED = ${CORPUSHOME}/parsed
CORPUSPUB = ${OPUSPUB}/${CORPUS}/${VERSION}
CORPUSRELEASE = ${OPUSRELEASE}/${CORPUS}/${VERSION}
CORPUSWORDALIGN = ${CORPUSHOME}/wordalign
## TODO: rename this to CORPUSVERSION?
VERSION ?= latest
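## illustration (not part of the build): with the defaults CORPUS=RF and
## VERSION=latest, the directory variables above expand roughly to
##   CORPUSHOME    = ${OPUSHOME}/corpus/RF
##   CORPUSSRC     = ${OPUSHOME}/corpus/RF/src
##   CORPUSRELEASE = ${OPUSHOME}/releases/RF/latest
## both can be overridden on the command line, e.g.
##   make CORPUS=Europarl VERSION=v8 ...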
## DEPRECATED?
OPUSDATA = ${OPUSHOME}/data
OPUSHTML = ${OPUSHOME}/download
OPUSWORDALIGN = ${OPUSHOME}/wordalign
CORPUSHTML = ${OPUSHTML}/${CORPUS}
# CWB home for standard annotation
OPUSCWB = ${OPUSHOME}/cwb
CWBDATA = ${OPUSCWB}/data
CWBREG = ${OPUSCWB}/reg
# CWB home for UD-parsed data
UDCWB = ${OPUSHOME}/cwb-ud
UDCWBDATA = ${UDCWB}/data
UDCWBREG = ${UDCWB}/reg
# LANGUAGES ....... sub-directories in CORPUSRAW
# alternatively, sub-directories in CORPUSSRC
#
# TODO: check that this really corresponds to valid lang-IDs!
ifndef LANGUAGES
LANGUAGES = $(sort $(notdir ${shell if [ -e ${CORPUSRAW} ]; then \
find ${CORPUSRAW} -mindepth 1 -maxdepth 1 -type d -printf "%f "; fi}))
endif
ifeq (${LANGUAGES},)
LANGUAGES = $(sort $(notdir ${shell if [ -e ${CORPUSSRC} ]; then \
find ${CORPUSSRC} -mindepth 1 -maxdepth 1 -type d -printf "%f "; fi}))
endif
## all language pairs, sorted by language ID
## (each unordered pair appears only once, with source < target)
LANGPAIRS = ${shell \
for s in ${LANGUAGES}; do \
for t in ${LANGUAGES}; do \
if [[ "$$s" < "$$t" ]]; then \
echo "$$s-$$t"; \
fi \
done \
done }
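## worked example (assuming LANGUAGES were set to "de en sv" by hand):
##   LANGPAIRS = de-en de-sv en-sv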
## all pairs for a specific source language
## (this is handy for working with all bitexts of the current source language)
SRCLANGPAIRS = ${shell \
for t in ${LANGUAGES}; do \
if [[ "${SRC}" < "$$t" ]]; then \
echo "${SRC}-$$t"; \
fi \
done }
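## worked example: with SRC=en and LANGUAGES = "de en sv" this yields
##   SRCLANGPAIRS = en-sv
## (note that only pairs in which SRC sorts first are generated)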
# general file extensions of files in the corpus
# (including .gz in the extension means all files will be compressed)
XMLEXT = xml.gz
TXTEXT = txt.gz
ALGEXT = xml.gz
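## these can be overridden on the command line, e.g. to keep the XML
## files uncompressed (hypothetical invocation):
##   make XMLEXT=xml ...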
#---------------------------------------------------------------
# general tools that we will use for pre-processing & alignment
#---------------------------------------------------------------
## number of cores available and number of threads to be used
## restrict threads to 4 on login nodes
## (TODO: should we restrict threads in sort etc to leave some cores
## for other tasks as well?)
CORES := ${shell nproc}
THREADS ?= $(shell if [ `hostname | grep login | wc -l` -gt 0 ]; then echo 4; else echo ${CORES}; fi )
## use pigz for multi-threaded (de-)compression
## TODO: should we optimise some parameters?
GZIP := ${shell which pigz 2>/dev/null || echo gzip}
GZCAT := ${GZIP} -cd
ZCAT := gzip -cd
SORT := sort -T ${TMPDIR} -S1G --parallel=${THREADS}
# SORT := sort -T ${TMPDIR} -S1G --parallel=${THREADS} --compress-program=${GZIP}
UNIQ := ${SORT} -u
MERGE := ${SORT} -m -u
## I don't really trust parsort ...
# ifneq (${shell which parsort 2>/dev/null},)
# SORT := parsort -T ${TMPDIR} -S1G
# endif
## explicitly setting the thread count seems to be necessary on HPC nodes
## (nproc is not reliable?)
ifneq (${GZIP},gzip)
GZIP += -p ${THREADS}
endif
# UNICODE_CLEANUP := perl -CS -pe 'tr[\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}][]cd;'
UNICODE_CLEANUP := ftfy
## use GNU parallel if available
## TODO: should we reduce number of threads?
## TODO: should we optimise other settings?
PARALLEL_ARGS := --max-procs 25% \
--pipe --keep-order -q
# --max-procs ${THREADS} \
# --compress --compress-program ${GZIP}
PARALLEL := ${shell if [ `which parallel 2>/dev/null | wc -l` -gt 0 ]; then echo 'parallel ${PARALLEL_ARGS}'; fi }
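## usage sketch (hypothetical pipeline; ${PARALLEL} expands to nothing if
## GNU parallel is not installed, so the filter then runs single-threaded):
##   ${GZCAT} input.txt.gz | ${PARALLEL} ${UNICODE_CLEANUP} | ${GZIP} -c > output.txt.gz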
## convenient function to reverse a list
reverse = $(if $(wordlist 2,2,$(1)),$(call reverse,$(wordlist 2,$(words $(1)),$(1))) $(firstword $(1)),$(1))
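## worked example: $(call reverse,a b c) expands to "c b a"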
TOOLS = ${OPUSHOME}/tools
OPUSTOOLS = ${TOOLS}
CORPUSTOOLS = ${OPUSTOOLS}/${CORPUS}
ISO639 = opus-iso639
## TODO: get rid of local UPLUG
## or create git submodules or install pre-requisites
## even better: get rid of Uplug ....
UPLUGHOME = ${TOOLS}/public/uplug/uplug-main
UPLUGCWB = ${TOOLS}/public/uplug/uplug-cwb
UPLUGTOOLS = ${UPLUGHOME}/tools
UPLUG = $(shell which uplug)
## SMT tools via Moses
## TODO: adjust to NLPL
## moses as git submodule?
MOSESHOME := ${patsubst %/bin/moses,%,${shell which moses 2>/dev/null}}
MOSESSCRIPTS := ${MOSESHOME}/scripts
SCRIPTS_ROOTDIR := ${MOSESSCRIPTS}
## TODO: locally compiled version of SALM!
## this will not work everywhere
SALMHOME := ${OPUSTOOLS}/SALM
## Word alignment tools (eflomal and atools from fast-align)
EFLOMAL = align.py
ATOOLS = atools
ifeq (${HPC_HOST},puhti)
EFLOMAL = module use -a /projappl/nlpl/software/modules/etc && module load nlpl-efmaral && align_eflomal.py
EFMARAL = module use -a /projappl/nlpl/software/modules/etc && module load nlpl-efmaral && align.py
endif
## word alignment priors
ifneq (${wildcard ${NLPLHOME}/data/translation/eflomal},)
EFLOMAL_PRIORS = ${NLPLHOME}/data/translation/eflomal
endif
EFLOMAL_PRIORS ?= $(OPUSHOME)/eflomal
## max size of a bitext to be aligned
## (the makefile will automatically split before aligning)
MAX_EFLOMAL_CORPUS_SIZE = 5000000
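## e.g. a bitext of 12,000,000 sentence pairs would be split into three
## chunks (5M + 5M + 2M) and aligned chunk by chunk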
#--------------------------------------------------------------------
# individual pre-processing steps (using Uplug)
#
# PAR ........ paragraph markup
# SENT ....... sentence boundary detection
# TOK ........ tokenization
# PRE_ALL .... make everything (par, sent, tok)
# PRE_SENT ... make only par & sent (no tokenization)
#--------------------------------------------------------------------
POLYGLOT_SENT = ${OPUSTOOLS}/opus-polyglot-sentence-splitter.py -l ${LANGUAGE}
POLYGLOT_TOK = ${OPUSTOOLS}/opus-polyglot-tokenize.py -l ${LANGUAGE}
UPLUG_PAR = ${UPLUG} pre/markup
UPLUG_SENT = ${UPLUG} pre/sent -l ${LANGUAGE}
UPLUG_TOK = ${UPLUG} pre/tok -l ${LANGUAGE}
UPLUG_PRE = ${UPLUG} pre/par-sent
UPLUG_PRE_ALL = ${UPLUG} pre/basic
TOKSIMPLE = ${UPLUG} pre/tok-simple
TOKMOSES = ${UPLUG} pre/tok -l ${LANGUAGE}
PAR = ${UPLUG_PAR}
SENT = ${POLYGLOT_SENT}
TOK = ${POLYGLOT_TOK}
PRE = ${UPLUG_PRE}
PRE_ALL = ${UPLUG_PRE_ALL}
# language specific tools if they exist
# 1) all pre-processing steps
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}-all 2>/dev/null}))
PRE_ALL=${UPLUG} pre/${LANGUAGE}-all
endif
# 2) a language-specific tokenizer
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}/tok 2>/dev/null}))
TOK=${UPLUG} pre/${LANGUAGE}/tok
endif
# 3) a tokenizer + tagger tool
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}/toktag 2>/dev/null}))
TOK=${UPLUG} pre/${LANGUAGE}/toktag
endif
# 4) a language-specific sentence boundary detection tool
ifneq (,$(wildcard ${shell uplug -e pre/${LANGUAGE}/sent 2>/dev/null}))
SENT=${UPLUG} pre/${LANGUAGE}/sent
endif
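## (how the detection above works: 'uplug -e <module>' is expected to print
## the path of the module file, and $(wildcard ...) tests that it exists;
## the variables keep their generic defaults otherwise)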
#--------------------------------------------------------------------
# sentence alignment
# sent ... Gale & Church (original C code via Uplug)
# hun .... hunalign (called from Uplug) - default
#--------------------------------------------------------------------
SENTALIGN = ${UPLUG} align/hun
# SENTALIGN = ${UPLUG} align/sent
## always load the generic makefile for submitting HPC jobs
## TODO: is it good to do that here?
include ${OPUSCORPORA}/Makefile.submit