forked from pjheslin/diogenes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mk.wordlists
70 lines (52 loc) · 2.41 KB
/
mk.wordlists
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# We make a Latin wordlist by iterating over three corpora: PHI,
# Perseus and DigiLibLT. We make a Greek wordlist by iterating over
# the Perseus corpus and adding that to the TLG wordlist.
include mk.common
# Top-level goal: the merged Latin and Greek wordlists.
# Declared phony so that a file named `all` can never make it look up to date.
.PHONY: all
all: $(BUILD)/lat.words $(BUILD)/grc.words
# Download Perseus corpora and make wordlists
# GREEKLIT and LATINLIT are the directories, under $(BUILD), that hold the
# clones of the PerseusDL canonical-greekLit and canonical-latinLit
# repositories.
GREEKLIT = $(BUILD)/canonical-greekLit
LATINLIT = $(BUILD)/canonical-latinLit
# Clone the Perseus canonical-greekLit repository into $(BUILD).
# The target is the clone's .git/HEAD file and has no prerequisites, so the
# recipe runs only when that file is missing.  $(BUILD) is created first
# because the `cd` below would otherwise fail on a fresh tree, and any
# leftover directory is removed before cloning into it.
# NOTE(review): `git checkout` is given no revision, so it does not pin the
# corpus to a particular commit -- confirm whether a revision was intended.
$(GREEKLIT)/.git/HEAD:
	mkdir -p $(BUILD)
	rm -rf $(GREEKLIT)
	cd $(BUILD) && git clone https://github.com/PerseusDL/canonical-greekLit
	cd $(GREEKLIT) && git checkout
# Clone the Perseus canonical-latinLit repository into $(BUILD).
# The target is the clone's .git/HEAD file and has no prerequisites, so the
# recipe runs only when that file is missing.  $(BUILD) is created first
# because the `cd` below would otherwise fail on a fresh tree, and any
# leftover directory is removed before cloning into it.
# NOTE(review): `git checkout` is given no revision, so it does not pin the
# corpus to a particular commit -- confirm whether a revision was intended.
$(LATINLIT)/.git/HEAD:
	mkdir -p $(BUILD)
	rm -rf $(LATINLIT)
	cd $(BUILD) && git clone https://github.com/PerseusDL/canonical-latinLit
	cd $(LATINLIT) && git checkout
# Latin wordlist extracted from the Perseus canonical-latinLit clone.
# Rebuilt when the extraction script or the clone's .git/HEAD is newer than
# the list.  The script ($<) writes to a temporary file that is renamed onto
# the target only on success, so a failed run cannot leave behind a truncated
# list that looks up to date.
$(BUILD)/lat.words-perseus: utils/make_latin_wordlist_perseus.sh $(LATINLIT)/.git/HEAD
	mkdir -p $(BUILD)
	./$< $(LATINLIT) > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Greek wordlist extracted from the Perseus canonical-greekLit clone.
# Rebuilt when the extraction script or the clone's .git/HEAD is newer than
# the list.  The script ($<) writes to a temporary file that is renamed onto
# the target only on success, so a failed run cannot leave behind a truncated
# list that looks up to date.
$(BUILD)/grc.words-perseus: utils/make_greek_wordlist_perseus.sh $(GREEKLIT)/.git/HEAD
	mkdir -p $(BUILD)
	./$< $(GREEKLIT) > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# DigiLibLT is not fetched by this makefile: the corpus has to be downloaded
# separately after registering for an account.  DIGILIBDIR points at the
# local copy; an alternative location is kept below, commented out.
#DIGILIBDIR = $(HOME)/digiliblt
DIGILIBDIR = ../../diogenes-web/app/static/texts/DigiLibLT/

# Latin wordlist from the DigiLibLT texts.  Only the extraction script ($<)
# is a prerequisite, so changes inside $(DIGILIBDIR) do not trigger a rebuild.
$(BUILD)/lat.words-digilib: utils/make_latin_wordlist_digilib.sh
	mkdir -p $(BUILD)
	./$< $(DIGILIBDIR) > $@
# PHI and TLG wordlists
# Locations of the local PHI and TLG data directories.
PHIDIR = $(HOME)/Classics-Data/phi
TLGDIR = $(HOME)/Classics-Data/tlg_e

# Latin wordlist from the PHI data.  Before the list is generated, a SHA-256
# checksum of every regular file under $(PHIDIR) is recorded in
# $(DATA)/phisums; the sed step strips the $(PHIDIR) prefix from each path.
# Only the extraction script ($<) is a prerequisite.
$(BUILD)/lat.words-phi: utils/make_latin_wordlist.pl
	mkdir -p $(BUILD) $(DATA)
	find $(PHIDIR) -type f -exec sha256sum {} \; \
	  | sed 's: $(PHIDIR): :g' > $(DATA)/phisums
	./$< $(PHIDIR) > $@
# Greek wordlist from the TLG data.  Before the list is generated, a SHA-256
# checksum of every regular file under $(TLGDIR) is recorded in
# $(DATA)/tlgsums; the sed step strips the $(TLGDIR) prefix from each path.
# Only the extraction script ($<) is a prerequisite.
$(BUILD)/grc.words-tlg: utils/make_greek_wordlist.pl
	mkdir -p $(BUILD) $(DATA)
	find $(TLGDIR) -type f -exec sha256sum {} \; \
	  | sed 's: $(TLGDIR): :g' > $(DATA)/tlgsums
	./$< $(TLGDIR) > $@
# Merged Latin wordlist: the Perseus, DigiLibLT and PHI lists ($^) combined,
# sorted in the C locale and de-duplicated.
# A single `sort -u` replaces the former `cat | sort | uniq` pipeline, whose
# exit status was that of `uniq` alone and so hid failures of the earlier
# stages.  The result is written to a temporary file and renamed only on
# success, so a failed merge cannot leave a truncated list in place.
$(BUILD)/lat.words: $(BUILD)/lat.words-perseus $(BUILD)/lat.words-digilib $(BUILD)/lat.words-phi
	LC_ALL=C sort -u $^ > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Merged Greek wordlist: the Perseus and TLG lists ($^) combined, sorted in
# the C locale and de-duplicated.
# A single `sort -u` replaces the former `cat | sort | uniq` pipeline, whose
# exit status was that of `uniq` alone and so hid failures of the earlier
# stages.  The result is written to a temporary file and renamed only on
# success, so a failed merge cannot leave a truncated list in place.
$(BUILD)/grc.words: $(BUILD)/grc.words-perseus $(BUILD)/grc.words-tlg
	LC_ALL=C sort -u $^ > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Remove the generated wordlists (per-corpus and merged) from $(BUILD).
# The cloned Perseus repositories and the checksum files written under
# $(DATA) are not touched.
# Declared phony so that a file named `clean` cannot disable the target.
.PHONY: clean
clean:
	rm -f $(BUILD)/lat.words-perseus \
	      $(BUILD)/lat.words-digilib \
	      $(BUILD)/lat.words-phi \
	      $(BUILD)/lat.words \
	      $(BUILD)/grc.words-perseus \
	      $(BUILD)/grc.words-tlg \
	      $(BUILD)/grc.words