Skip to content

Commit

Permalink
Merge pull request #29 from JarbasAl/feat/get_gender_pt
Browse files Browse the repository at this point in the history
improve get_gender PT
  • Loading branch information
krisgesling authored Oct 15, 2019
2 parents b4d97d9 + 7ebe136 commit aa04301
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 18 deletions.
15 changes: 15 additions & 0 deletions lingua_franca/lang/common_data_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,21 @@

_ARTICLES_PT = ["o", "a", "os", "as"]

# Word rules for gender: suffixes that usually mark a word as
# feminine or masculine.
_FEMALE_ENDINGS_PT = ["a", "as"]
_MALE_ENDINGS_PT = ["o", "os"]

# Special cases: word lookup for words not covered by the ending rules above.
_GENDERS_PT = {
    "mulher": "f",
    "mulheres": "f",
    "homem": "m",
    # the plural of "homem" is irregular, so stripping a trailing "s"
    # never reduces it to the singular entry
    "homens": "m",
}

# Context rules for gender: determinants that assign their gender to the
# word that follows them. Both lists carry the same singular/plural pairs.
_MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"]
_FEMALE_DETERMINANTS_PT = ["a", "as", "esta", "estas", "essa", "essas"]

_NUMBERS_PT = {
"zero": 0,
"um": 1,
Expand Down
46 changes: 31 additions & 15 deletions lingua_franca/lang/parse_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
from dateutil.relativedelta import relativedelta
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions
from lingua_franca.lang.common_data_pt import _FRACTION_STRING_PT, \
_ARTICLES_PT, _NUMBERS_PT
_ARTICLES_PT, _NUMBERS_PT, _FEMALE_DETERMINANTS_PT, _FEMALE_ENDINGS_PT,\
_MALE_DETERMINANTS_PT, _MALE_ENDINGS_PT, _GENDERS_PT


def isFractional_pt(input_str):
Expand Down Expand Up @@ -667,7 +668,7 @@ def date_found():
dayOffset -= 2
elif wordNext == "ante" and wordNextNext == "ontem":
dayOffset -= 2
elif (wordNext == "ante" and wordNext == "ante" and
elif (wordNext == "ante" and wordNextNext == "ante" and
wordNextNextNext == "ontem"):
dayOffset -= 3
elif wordNext in days:
Expand Down Expand Up @@ -1122,18 +1123,33 @@ def pt_pruning(text, symbols=True, accents=True, agressive=True):
return text


def get_gender_pt(word, text=""):
    """Guess the grammatical gender of a Portuguese word.

    Rules are tried in order of precedence:

    1. Context: if ``word`` appears in ``text`` right after a known
       determinant, the determinant decides the gender.
    2. Lookup: the word, or the word with trailing "s" characters
       stripped, is searched in ``_GENDERS_PT``.
    3. Ending: the word's suffix is matched against the feminine and
       then the masculine ending lists.

    Args:
        word (str): the word to classify; matched case-insensitively.
        text (str): optional sentence that contains ``word``.

    Returns:
        str or None: "m" or "f", or None when no rule applies.
    """
    word = word.lower()
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces never produce empty tokens that would hide the determinant
    words = (text or "").lower().split()

    # in portuguese usually the previous word (a determinant)
    # assigns gender to the next word
    for previous, current in zip(words, words[1:]):
        if current != word:
            continue
        if previous in _MALE_DETERMINANTS_PT:
            return "m"
        if previous in _FEMALE_DETERMINANTS_PT:
            return "f"

    # no usable context: see if this word has the gender explicitly defined
    if word in _GENDERS_PT:
        return _GENDERS_PT[word]
    singular = word.rstrip("s")
    if singular in _GENDERS_PT:
        return _GENDERS_PT[singular]

    # in portuguese the last vowel usually defines the gender of a word;
    # the gender of the determinant takes precedence over this rule
    if word.endswith(tuple(_FEMALE_ENDINGS_PT)):
        return "f"
    if word.endswith(tuple(_MALE_ENDINGS_PT)):
        return "m"
    return None
18 changes: 15 additions & 3 deletions test/test_parse_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class TestNormalize(unittest.TestCase):
"""
Test cases for Portuguese parsing
"""

def test_articles_pt(self):
self.assertEqual(normalize(u"isto é o teste",
lang="pt", remove_articles=True),
Expand Down Expand Up @@ -244,15 +245,26 @@ def test_extractdatetime_default_pt(self):
anchor, lang='pt-pt', default_time=default)
self.assertEqual(default, res[0].time())


class TestExtractGender(unittest.TestCase):
    """Checks for Portuguese grammatical gender extraction."""

    def test_gender_pt(self):
        """Exercise the ending, lookup and context rules in turn."""
        # words with well defined grammatical gender rules
        self.assertEqual(get_gender("vaca", lang="pt"), "f")
        self.assertEqual(get_gender("cavalo", lang="pt"), "m")
        self.assertEqual(get_gender("vacas", lang="pt"), "f")

        # words specifically defined in a lookup dictionary
        self.assertEqual(get_gender("homem", lang="pt"), "m")
        self.assertEqual(get_gender("mulher", lang="pt"), "f")
        # "homems" is not the real plural; it only resolves because
        # trailing "s" characters are stripped before the lookup
        self.assertEqual(get_gender("homems", lang="pt"), "m")
        self.assertEqual(get_gender("mulheres", lang="pt"), "f")

        # words where gender rules do not work but context does
        self.assertEqual(get_gender("boi", lang="pt"), None)
        self.assertEqual(get_gender("boi", "o boi come erva", lang="pt"), "m")
        self.assertEqual(get_gender("homem", "este homem come bois",
                                    lang="pt"), "m")
        self.assertEqual(get_gender("ponte", lang="pt"), None)
        self.assertEqual(get_gender("ponte", "essa ponte caiu",
                                    lang="pt"), "f")

Expand Down

0 comments on commit aa04301

Please sign in to comment.