-
Notifications
You must be signed in to change notification settings - Fork 0
/
fad-words.sh
executable file
·56 lines (45 loc) · 1.83 KB
/
fad-words.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/bin/bash
# Find words with src=fad from smenob/nobsme, and pair them with their
# frequency in the fad-corpus.
#
# Usage: fad-words.sh [NOCONVERT]
#   NOCONVERT  give it some (non-empty) argument to skip convert2xml.
#
# Reads GTHOME, GTFREE, GTBOUND and SUM_NOB; none of them is assigned in
# this file (presumably they come from the environment or functions.sh --
# TODO confirm).

# Explicit empty default, so the assignment is safe under `set -u` even
# when the script is started without arguments.
noconvert=${1:-}

# Strict mode is enabled *before* changing directory: a failing cd must
# abort the script instead of letting `source` pick up some other
# functions.sh relative to the wrong directory.
set -e -u
cd "$(dirname "$0")"
source functions.sh

# `kill 0` signals every process in the current process group when the
# script exits (presumably to stop leftover process substitutions).
trap 'kill 0' EXIT
# Print, one per line, the text of every //e[@src="fad"]/mg/tg/t node
# found in the smenob dictionary sources.
# Globals:   GTHOME (read) - directory containing words/dicts/smenob/src
# Outputs:   the selected text nodes on stdout
smenob_fad () {
    # $GTHOME is quoted so a path with whitespace is not word-split; the
    # trailing *.xml is deliberately left unquoted so it still globs.
    xmlstarlet sel -t -m '//e[@src="fad"]/mg/tg/t/text()' -c . -n \
        "$GTHOME"/words/dicts/smenob/src/*.xml
}
# Print, one per line, the text of every //e[@src="fad"]/lg/l node found
# in the nobsme dictionary sources.
# Globals:   GTHOME (read) - directory containing words/dicts/nobsme/src
# Outputs:   the selected text nodes on stdout
nobsme_fad () {
    # $GTHOME is quoted so a path with whitespace is not word-split; the
    # trailing *.xml is deliberately left unquoted so it still globs.
    xmlstarlet sel -t -m '//e[@src="fad"]/lg/l/text()' -c . -n \
        "$GTHOME"/words/dicts/nobsme/src/*.xml
}
# All words with src=fad in words/dicts/{smenob,nobsme}: the output of
# smenob_fad followed by nobsme_fad, byte-wise sorted with duplicates
# removed.
# Outputs:   the sorted, de-duplicated word list on stdout
dicts_fad () {
    unset LC_ALL
    # sort reads both producers directly; only sort itself runs with
    # LC_ALL=C, the two producers run with LC_ALL unset.
    LC_ALL=C sort -u <(smenob_fad) <(nobsme_fad)
}
# Print the corpus text for one language: the ccat output for the
# GTFREE sections followed by the ccat output for the GTBOUND sections
# (admin, facta, laws, news, science). Unless noconvert is non-empty,
# convert2xml is run over the corresponding orig/ directories first.
# Globals:   noconvert (read), GTFREE (read), GTBOUND (read)
# Arguments: $1 - language code used in the paths and passed to ccat -l
# Outputs:   corpus text on stdout, progress messages on stderr
corp_fad () {
    # local: do not clobber a caller's $lang.
    local lang=$1
    # ${noconvert:-} keeps the function usable under `set -u` even if the
    # variable was never assigned.
    if [[ -z ${noconvert:-} ]]; then
        echo "Converting corpus to XML first …" >&2
        # Only the prefix is quoted; the {a,b,...} list must stay outside
        # the quotes so that brace expansion still happens.
        convert2xml "$GTFREE/orig/$lang"/{admin,facta,laws,news,science}
        convert2xml "$GTBOUND/orig/$lang"/{admin,facta,laws,news,science}
    else
        echo "Assuming corpus already converted …" >&2
    fi
    cat <(ccat -a -l "$lang" "$GTFREE/converted/$lang"/{admin,facta,laws,news,science}) \
        <(ccat -a -l "$lang" "$GTBOUND/converted/$lang"/{admin,facta,laws,news,science})
}
# Print a frequency list for the corpus of one language as
# "<count><TAB><word>" lines, byte-wise sorted on the word column, and
# store the sum of all counts in the file named by SUM_NOB.
# Globals:   SUM_NOB (read) - path of the file that receives the sum
# Arguments: $1 - language code, handed to corp_fad and preproc
# Outputs:   the frequency list on stdout; the sum in "$SUM_NOB"
hitparade_fad () {
    unset LC_ALL
    # local: do not clobber a caller's $lang.
    local lang=$1
    # The sum is written by the same awk process that passes the lines
    # through, and the file is closed before awk ends its output. A
    # `tee >(awk ... > file)` would write the sum asynchronously, so a
    # reader of "$SUM_NOB" could see it empty or incomplete.
    corp_fad "$lang" | preproc "$lang" \
        | sort | uniq -c | sed $'s/^ *//;s/ /\t/' \
        | awk -F'\t' -v sumfile="${SUM_NOB}" '
            { sum += $1; print }
            END { print sum > sumfile; close(sumfile) }' \
        | LC_ALL=C sort -k2 -t$'\t'
}
# Join the dictionary words (field 1 of dicts_fad) with the frequency
# list (field 2 of hitparade_fad), keeping dictionary words without a
# match as well; those get a count of 0. Output is "<count><TAB><word>".
LC_ALL=C join -t $'\t' -a 1 -1 1 -2 2 <(dicts_fad) <(hitparade_fad nob) \
    | awk 'BEGIN { FS = OFS = "\t" } $2 == "" { $2 = 0 } { print $2, $1 }'

# Final line: the total stored in SUM_NOB.
printf 'sum\t'
cat "${SUM_NOB}"
# This total is larger than the sum of the word frequencies printed
# above, because only words with a src=fad match are printed.
# bzcat corpora/*.nb.txt.bz2 ~/corpora/Copyright/bokmål.txt.bz2 | preproc nob |sort |uniq -c|sed $'s/^ *//;s/ /\t/'|tee >(awk -F'\t' '{sum+=$1} END{print sum}' > SUM_NOB) |LC_ALL=C sort -k2 -t$'\t' >hitparade_nonfad.tsv