forked from facebookresearch/XLM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-data-xnli.sh
executable file
·69 lines (55 loc) · 2.29 KB
/
get-data-xnli.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
#
# Usage: ./get-data-xnli.sh
#
# Abort on the first failing command and on any use of an unset variable.
set -eu
# Also fail a pipeline when any of its stages fails, not only the last one;
# otherwise a broken awk/cut/tokenizer stage silently yields truncated data.
# Not every sh implements pipefail, so enable it only where it is accepted.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# data paths (all relative to the directory the script is started from)
MAIN_PATH="$PWD"  # NOTE(review): not referenced in this script - confirm before removing
OUTPATH="$PWD/data/xnli"
XNLI_PATH="$OUTPATH/XNLI-1.0"

# tools paths
TOOLS_PATH="$PWD/tools"
TOKENIZE="$TOOLS_PATH/tokenize.sh"
LOWER_REMOVE_ACCENT="$TOOLS_PATH/lowercase_and_remove_accent.py"

# install tools (install-tools.sh is looked up in the current directory)
./install-tools.sh

# create directories
mkdir -p "$OUTPATH"
# download data

# Download archive $1 (a file name such as XNLI-1.0.zip) from the XNLI
# download location into $OUTPATH and extract it there.
# Nothing is done when the directory named after the archive (file name
# without ".zip") already exists in $OUTPATH.
# Globals:   OUTPATH (read)
# Arguments: $1 - archive file name
# Returns:   non-zero if the download or the extraction fails
fetch_xnli_archive() {
  if [ -d "$OUTPATH/${1%.zip}" ]; then
    return 0
  fi
  # A previous interrupted run can leave a partial archive behind. Fetch when
  # the file is missing OR fails unzip's integrity test; "wget -c" then
  # resumes the partial file instead of starting over.
  if [ ! -f "$OUTPATH/$1" ] || ! unzip -tq "$OUTPATH/$1" > /dev/null 2>&1; then
    wget -c "https://dl.fbaipublicfiles.com/XNLI/$1" -P "$OUTPATH"
  fi
  unzip "$OUTPATH/$1" -d "$OUTPATH"
}

fetch_xnli_archive XNLI-MT-1.0.zip
fetch_xnli_archive XNLI-1.0.zip
# English train set
echo "*** Preparing English train set ****"
# Copy the English MultiNLI training file to $XNLI_PATH/en.train, rewriting
# every tab-preceded "contradictory" to "contradiction".
# A literal tab is used because "\t" inside a sed expression is a GNU
# extension and is not treated as a tab by every sed implementation.
TAB=$(printf '\t')
sed "s/${TAB}contradictory/${TAB}contradiction/g" \
  "$OUTPATH/XNLI-MT-1.0/multinli/multinli.train.en.tsv" > "$XNLI_PATH/en.train"
# validation and test sets
#
# For every language, build $XNLI_PATH/<lg>.valid (from xnli.dev.tsv) and
# $XNLI_PATH/<lg>.test (from xnli.test.tsv). Each output file starts with the
# header "premise<TAB>hypo<TAB>label" followed by one row per example, with
# premise and hypothesis tokenized and passed through the lowercase /
# accent-removal script.

# Print tab-separated column $3 of the rows of $XNLI_PATH/xnli.$1.tsv whose
# first field equals the language code $2.
# Arguments: $1 - split name (dev or test), $2 - language code, $3 - column
xnli_column() {
  awk -v lg="$2" '$1==lg' "$XNLI_PATH/xnli.$1.tsv" | cut -f"$3"
}

# Same as xnli_column, with the text additionally run through the tokenizer
# for language $2 and the lowercase / accent-removal script.
xnli_text_column() {
  xnli_column "$1" "$2" "$3" | "$TOKENIZE" "$2" | python "$LOWER_REMOVE_ACCENT"
}

for lg in ar bg de el en es fr hi ru sw th tr ur vi zh; do
  echo "*** Preparing $lg validation and test sets ***"
  for split in dev test; do
    # The "dev" input split is written to the ".valid" output file.
    if [ "$split" = dev ]; then
      out="$XNLI_PATH/$lg.valid"
    else
      out="$XNLI_PATH/$lg.test"
    fi

    # label
    xnli_column "$split" "$lg" 2 > "$XNLI_PATH/$split.f2"
    # premise/hypothesis
    xnli_text_column "$split" "$lg" 7 > "$XNLI_PATH/$split.f7"
    xnli_text_column "$split" "$lg" 8 > "$XNLI_PATH/$split.f8"

    # printf instead of "echo -e": some sh implementations print the "-e"
    # literally, which would corrupt the header line.
    printf 'premise\thypo\tlabel\n' > "$out"
    paste "$XNLI_PATH/$split.f7" "$XNLI_PATH/$split.f8" "$XNLI_PATH/$split.f2" >> "$out"

    # Remove exactly the intermediate files created above.
    rm "$XNLI_PATH/$split.f2" "$XNLI_PATH/$split.f7" "$XNLI_PATH/$split.f8"
  done
done