-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnicot.py
executable file
·64 lines (55 loc) · 1.98 KB
/
nicot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re
from glob import glob
# Patterns are compiled once at import time instead of on every call/line.
_BODY_RE = re.compile(r'<body>')
_HEAD_RE = re.compile(r'<div1 type="entry')
_HEAD_ID_RE = re.compile(r'<div1 type="entry" id="([^"]+)')
_AFTER_COMMA_RE = re.compile(r'(?u)([^,]+),.+')
_ENTRY_END_RE = re.compile(r'</div1')
_LEADING_BR_RE = re.compile(r'^\s*<br>')


def _decode_line(raw):
    """Decode one raw line as UTF-8, falling back to Latin-1."""
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a code point, so this cannot fail.
        return raw.decode('latin-1')


def _split_headwords(raw_id):
    """Normalise an entry ``id`` value into its list of headwords."""
    headword = raw_id.lower()
    # Keep only what precedes the first comma (e.g. "abbaye, f." -> "abbaye").
    headword = _AFTER_COMMA_RE.sub('\\1', headword)
    headword = headword.replace('.', '')
    # An id may name several variants joined by " ou " or " et "; " ou " is
    # checked first and only one of the two separators is ever applied.
    if ' ou ' in headword:
        return headword.split(' ou ')
    if ' et ' in headword:
        return headword.split(' et ')
    return [headword]


def parser(dico_path):
    """Build a headword -> definitions mapping from the files in ``nicot/``.

    Every file matched by ``dico_path + 'nicot/*'`` is read line by line.
    Lines before the first one containing ``<body>`` are ignored.  A line
    containing ``<div1 type="entry"`` opens an entry: its ``id`` attribute is
    lower-cased, truncated at the first comma, stripped of periods and split
    on " ou " / " et " into headwords.  The following lines, with ``<p>``
    turned into ``<br>`` and ``</p>``, ``<b>``, ``</b>`` removed, are
    accumulated until a line containing ``</div1``; the accumulated text
    (minus one leading ``<br>``) is then appended to every headword's list.
    The header line and the closing line themselves are not stored.

    Args:
        dico_path: Directory prefix that is concatenated directly with
            ``'nicot/*'``, so it must already end with a path separator
            (or be empty).

    Returns:
        dict mapping each headword to the list of its definition strings,
        in the order the definitions were encountered.

    Raises:
        ValueError: if an entry header line carries no ``id`` attribute in
            the expected position.
    """
    dico = {}
    # glob order is filesystem-dependent; sort so that definitions are
    # accumulated in a reproducible order.
    for chunk in sorted(glob(dico_path + 'nicot/*')):
        headwords = []
        parts = []
        begin = False
        # Read bytes and decode explicitly: the files are not guaranteed to
        # be valid UTF-8, and this works on both Python 2 and Python 3.
        with open(chunk, 'rb') as handle:
            for raw in handle:
                line = _decode_line(raw)
                if not begin:
                    if not _BODY_RE.search(line):
                        continue
                    begin = True

                if _HEAD_RE.search(line):
                    match = _HEAD_ID_RE.search(line)
                    if match is None:
                        raise ValueError(
                            'entry header without an id attribute in %s: %r'
                            % (chunk, line))
                    headwords = _split_headwords(match.group(1))
                    for word in headwords:
                        dico.setdefault(word, [])
                    # Drop anything collected before this header (the
                    # <body> line, text between entries) so it does not
                    # leak into this entry's definition.
                    parts = []
                    continue

                line = line.replace('</p>', '')
                line = line.replace('<p>', '<br>')
                line = line.replace('<b>', '')
                line = line.replace('</b>', '')

                if _ENTRY_END_RE.search(line):
                    entry = _LEADING_BR_RE.sub('', ''.join(parts))
                    for word in headwords:
                        dico[word].append(entry)
                    parts = []
                    continue

                parts.append(line)
    return dico