forked from christiancasey/iip-word-lists
-
Notifications
You must be signed in to change notification settings - Fork 1
/
pseudocode.txt
90 lines (73 loc) · 3.19 KB
/
pseudocode.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
###############################################################################
# This document is NOT COMPLETE and NOT COMPREHENSIVE #
###############################################################################
# Entry point. Extracts every word occurrence from the input files, writes a
# plaintext copy of each transcription/translation, groups the occurrences by
# lemma and language, and optionally renders the HTML word lists.
#   args.files         input files to process (the ".xml" -> ".txt" renaming
#                      below suggests XML; confirm against the real program)
#   args.flat          directory that receives the plaintext output
#   args.html_general  when set, the HTML word lists are generated
FUNCTION main(args)
    # The accumulators are created once, before the file loop, so results from
    # every file are kept instead of being reset on each iteration.
    occurrences = NEW list()
    languages = NEW set()
    word_dict = NEW dictionary() [FOOTNOTE 1]

    FOR file IN args.files
        FOR div IN file WHERE div.type == edition AND div.subtype IN ["transcription", "translation"]
            new_occurrences = get_words_from_element(div)
            # Open exactly one output file per div; a translation must not
            # open (and possibly clobber) the transcription's plaintext file.
            # NOTE(review): replace("xml", "") leaves the trailing "." in the
            # name, and "_tranl" may be a typo for "_transl" -- confirm.
            IF div.subtype == translation
                plaintext_file = open(args.flat + "/" + file.replace("xml", "") + "_tranl.txt")
            ELSE
                plaintext_file = open(args.flat + "/" + file.replace("xml", "txt"))
            FOR occurrence IN new_occurrences
                occurrence.text >> plaintext_file
            close(plaintext_file)
            # Extend the overall list (not the loop variable) with this div's words.
            occurrences += new_occurrences

    FOR occurrence IN occurrences
        languages.add(occurrence.language)
        word = word_dict[occurrence.lemma][occurrence.language]
        IF word.is_empty()
            word = NEW iip_word()
            # Store the new entry; otherwise it is discarded after this
            # iteration and its occurrences are lost.
            word_dict[occurrence.lemma][occurrence.language] = word
        word.occurences.append(occurrence)

    print(occurrences)
    IF args.html_general
        word_list_to_html(word_dict, languages)
# Walks the XML beneath `element` one step at a time and splits its text into
# word occurrences.
#   element  the XML element whose contents are scanned
# Returns the list of iip_occurence objects found, in document order.
FUNCTION get_words_from_element(element)
    walker = NEW xml_walker(element)
    occurences = NEW list()
    new_occurence = NEW iip_occurence()
    within = NEW list()    # tags currently open at the walker's position
    FOR step IN walker
        FOR tag IN step.opening
            within.append(tag)
        FOR tag IN step.closing
            within.remove(tag)
        # is_word_terminating is declared with two parameters, so the list of
        # open tags is passed along with the step.
        IF is_word_terminating(step, within)
            # Consecutive terminators (e.g. a space followed by a line break)
            # must not produce empty occurrences.
            IF NOT new_occurence.text.is_empty()
                occurences.append(new_occurence)
                new_occurence = NEW iip_occurence()
        ELSE
            # NOTE(review): reads the same per-step character that
            # is_word_terminating inspects -- confirm the attribute name.
            new_occurence.text += step.character
    # Flush the word still being built when the walk ends without a terminator.
    IF NOT new_occurence.text.is_empty()
        occurences.append(new_occurence)
    RETURN occurences
# Decides whether the current walker step ends the word being built.
#   step    the walker step under inspection
#   within  tags open at this position (currently unused; kept so the
#           signature stays as declared)
# Returns true when the step ends a word, false otherwise.
FUNCTION is_word_terminating(step, within)
    # A self-closing <lb> ends the word unless its break attribute is "false".
    # NOTE(review): TEI/EpiDoc marks a word-internal line break with
    # break="no" -- confirm which value the source documents actually use.
    IF "lb" IN [x.tag_text FOR x IN step.self_closing IF x.break != "false"]
        RETURN true
    IF step.character.is_whitespace()
        # Indentation whitespace never ends a word.
        IF step.character.is_indentation()
            RETURN false
        # A newline directly after one of the listed tags does not end a word.
        IF step.character == "\n" AND step.previous_tag IN include_trailing_line_breaks
            RETURN false
        RETURN true
    # Any non-whitespace character continues the current word.
    RETURN false
# Renders the HTML word lists: one info page per (lemma, language) entry and
# one index page per language under "docs/<language>/".
#   word_dict  lemma -> language -> iip_word mapping (see FOOTNOTE 1)
#   languages  set of every language seen in the occurrences
FUNCTION word_list_to_html(word_dict, languages)
    FOR language IN languages
        mkdir "docs/" + language

    # One info page per iip_word: pass the language-specific entry, not the
    # whole per-lemma dictionary.
    FOR lemma IN word_dict
        FOR language IN word_dict[lemma]
            write_word_info_page(word_dict[lemma][language])

    FOR language IN languages
        # Start from a fresh copy of the template so one language's index does
        # not accumulate the words added for the previous language.
        index = COPY OF index_template
        FOR lemma IN word_dict
            IF exists(word_dict[lemma][language])
                index.add(word_dict[lemma][language])
        # Write once per language, after all of its words have been added.
        write(index, "docs/" + language + "/index.html")
# Called by word_list_to_html for the entries of word_dict. The body is not
# specified in this document (which is marked as not complete).
# NOTE(review): presumably writes the HTML info page for one word -- confirm
# against the real implementation.
FUNCTION write_word_info_page(word)
══════════════════════════════════════════════════════════════════════════════
[FOOTNOTE 1]
word_dict is a dictionary mapping lemmas to dictionaries mapping languages to
iip_word objects in the following manner:
                     ┌──▶ English ──▶ iip_word | occurences = [a, b, c]
            ┌──▶ no ─┤
word_dict ──┤        └──▶ Spanish ──▶ iip_word | occurences = [d]
            │
            └──▶ dog ────▶ English ──▶ iip_word | occurences = [e, f, g]
ex) word_dict["no"]["Spanish"].occurences == [d]
══════════════════════════════════════════════════════════════════════════════