forked from TechnionTDK/regex-ml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
labeled_function.py
108 lines (98 loc) · 2.64 KB
/
labeled_function.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re
from utility import SHMOT_PRAKIM
from utility import MASACHTOT_BAVLI
from utility import ABSTAIN
from utility import REF
from utility import NO_REF
from snorkel.labeling import labeling_function
@labeling_function()
def masechet_then_parans(x):
    """Vote REF when the text names a masechet and matches the closing-parens pattern.

    Two conditions must both hold for a REF vote:

    * at least one entry of ``MASACHTOT_BAVLI`` occurs as a substring of
      ``x.text``;
    * ``x.text`` matches a pattern requiring a ``(...)`` group at its end.

    In every other case the function abstains.
    """
    text = x.text
    # Substring test against every known masechet name.
    names_masechet = any(name in text for name in MASACHTOT_BAVLI)
    # The pattern requires a "(" ... ")" group right before the end.
    closes_with_parens = re.match('.*([(].*[)])$', text) is not None
    if names_masechet and closes_with_parens:
        return REF
    return ABSTAIN
'''
@labeling_function()
def mashechet_and_sham(x):
"""check if data contain mashechet and שם """
count = 0
"""check if contains mashechet"""
for mashechet in MASACHTOT_BAVLI:
if mashechet in x.text:
count = 1
"""finish with שם """
pattern1 = '.*(שם)$'
result = re.match(pattern1, x.text)
if result:
count += 1
if count >= 2:
return REF
return ABSTAIN
'''
@labeling_function()
def perek_then_parans(x):
    """Vote REF when the text names a perek and matches the closing-parens pattern.

    Two conditions must both hold for a REF vote:

    * at least one entry of ``SHMOT_PRAKIM`` occurs as a substring of
      ``x.text``;
    * ``x.text`` matches a pattern requiring a ``(...)`` group at its end.

    In every other case the function abstains.
    """
    text = x.text
    # Substring test against every known perek name.
    names_perek = any(name in text for name in SHMOT_PRAKIM)
    # The pattern requires a "(" ... ")" group right before the end.
    closes_with_parens = re.match('.*([(].*[)])$', text) is not None
    if names_perek and closes_with_parens:
        return REF
    return ABSTAIN
'''
@labeling_function()
def perek_and_sham(x):
"""check if data contain perek and שם """
count = 0
"""check if contains perek"""
for perek in SHMOT_PRAKIM:
if perek in x.text:
count = 1
"""finish with שם """
pattern1 = '.*(שם)$'
result = re.match(pattern1, x.text)
if result:
count += 1
if count >= 2:
return REF
return ABSTAIN
'''
@labeling_function()
def daf_in_parntes(x):
    """Vote REF when the text ends with a parenthesised group mentioning daf.

    The pattern requires a ``(...)`` group containing the Hebrew word for
    "daf" at the end of ``x.text``, optionally followed by a single colon.
    Abstains when the pattern does not match.
    """
    daf_in_closing_parens = re.match('.*(([(].*דף.*[)])[:]?)$', x.text)
    return REF if daf_in_closing_parens else ABSTAIN
@labeling_function()
def no_double_parans(x):
    """Vote NO_REF when parentheses in the text are repeated or unbalanced.

    Returns NO_REF if ``x.text`` contains more than one "(" or more than
    one ")", or if the number of "(" differs from the number of ")".
    Abstains otherwise.
    """
    opening = x.text.count("(")
    closing = x.text.count(")")
    if opening > 1 or closing > 1:
        return NO_REF
    # Compare opening against closing counts. The earlier version compared
    # the "(" count with itself, so this balance check could never fire.
    if opening != closing:
        return NO_REF
    return ABSTAIN
@labeling_function()
def no_mishna(x):
    """Vote NO_REF when the text contains the Hebrew word for "mishna".

    The check is a plain substring test on ``x.text``; abstains when the
    word is absent.
    """
    return NO_REF if 'משנה' in x.text else ABSTAIN