Skip to content
This repository has been archived by the owner on Sep 8, 2024. It is now read-only.

Commit

Permalink
Merge pull request #722 from SoloVeniaASaludar/feature/issue-721
Browse files Browse the repository at this point in the history
i18n: Spanish normalize_es
  • Loading branch information
kfezer authored May 5, 2017
2 parents 24b545b + d85583a commit 8240e54
Show file tree
Hide file tree
Showing 2 changed files with 239 additions and 1 deletion.
181 changes: 180 additions & 1 deletion mycroft/util/parse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@

# -*- coding: iso-8859-15 -*-

# Copyright 2017 Mycroft AI, Inc.
#
# This file is part of Mycroft Core.
Expand Down Expand Up @@ -28,8 +31,12 @@ def normalize(text, lang="en-us", remove_articles=True):
Returns:
(str): The normalized string.
"""
if str(lang).lower().startswith("en"):

lang_lower = str(lang).lower()
if lang_lower.startswith("en"):
return normalize_en(text, remove_articles)
elif lang_lower.startswith("es"):
return normalize_es(text, remove_articles)

# TODO: Normalization for other languages
return text
Expand Down Expand Up @@ -102,3 +109,175 @@ def normalize_en(text, remove_articles):
normalized += " " + word

return normalized[1:] # strip the initial space


####################################################################
# Spanish normalization
#
# TODO: numbers greater than 999999
####################################################################

# Indefinite articles ["un", "una", "unos", "unas"] cannot be suppressed:
# in Spanish, "un caballo" means both "a horse" and "one horse".
# Only the definite articles below are candidates for removal.
es_articles = ["el", "la", "los", "las"]

# Spanish number words -> integer value.  Masculine and feminine forms
# (e.g. "doscientos" / "doscientas") map to the same value.  "mil" is not
# listed here: it is handled as a multiplier by es_parse().
es_numbers_xlat = {
    "cero": 0,
    "un": 1,
    "uno": 1,
    "una": 1,
    "dos": 2,
    "tres": 3,
    "cuatro": 4,
    "cinco": 5,
    "seis": 6,
    "siete": 7,
    "ocho": 8,
    "nueve": 9,
    "diez": 10,
    "once": 11,
    "doce": 12,
    "trece": 13,
    "catorce": 14,
    "quince": 15,
    u"dieciséis": 16,
    "diecisiete": 17,
    "dieciocho": 18,
    "diecinueve": 19,
    "veinte": 20,
    "veintiuno": 21,
    u"veintiún": 21,
    "veintiuna": 21,
    u"veintidós": 22,
    u"veintitrés": 23,
    "veinticuatro": 24,
    "veinticinco": 25,
    u"veintiséis": 26,
    "veintisiete": 27,
    "veintiocho": 28,
    "veintinueve": 29,
    "treinta": 30,
    "cuarenta": 40,
    "cincuenta": 50,
    "sesenta": 60,
    "setenta": 70,
    "ochenta": 80,
    "noventa": 90,
    "cien": 100,
    "ciento": 100,
    "doscientos": 200,
    "doscientas": 200,
    "trescientos": 300,
    "trescientas": 300,
    "cuatrocientos": 400,
    "cuatrocientas": 400,
    "quinientos": 500,
    "quinientas": 500,
    "seiscientos": 600,
    "seiscientas": 600,
    "setecientos": 700,
    "setecientas": 700,
    "ochocientos": 800,
    "ochocientas": 800,
    "novecientos": 900,
    "novecientas": 900}


def es_parse(words, i):
    """Parse a Spanish cardinal number (0..999999) starting at words[i].

    Grammar accepted (each element is one entry of ``words``):

        number  := "cero" | [n1_999] "mil" [n1_999] | n1_999
        n1_999  := hundreds [n1_99] | n1_99
        n1_99   := word(1..29) | tens ["y" word(1..9)]

    Args:
        words (list): the tokens of the utterance
        i (int): index of the token where the number is expected to start
    Returns:
        (tuple): ``(value, next_index)`` where ``next_index`` is the index
        of the first token after the number, or None when no number
        starts at ``words[i]``.
    """
    def es_cte(i, s):
        # Match the literal token ``s`` at position i.
        if i < len(words) and s == words[i]:
            return s, i+1
        return None

    def es_number_word(i, mi, ma):
        # Match a single number word whose value lies in [mi, ma].
        if i < len(words):
            v = es_numbers_xlat.get(words[i])
            # Compare against None explicitly: 0 ("cero") is a valid
            # value and must not be rejected for being falsy.
            if v is not None and mi <= v <= ma:
                return v, i+1
        return None

    def es_number_1_99(i):
        # 1..29 are single words
        r1 = es_number_word(i, 1, 29)
        if r1:
            return r1

        # 30..99: tens, optionally followed by "y" and a unit
        r1 = es_number_word(i, 30, 90)
        if r1:
            v1, i1 = r1
            r2 = es_cte(i1, "y")
            if r2:
                r3 = es_number_word(r2[1], 1, 9)
                if r3:
                    v3, i3 = r3
                    return v1+v3, i3
            # A dangling "y" is not consumed: it belongs to the text.
            return r1
        return None

    def es_number_1_999(i):
        # [1-9]cientos [1-99]?
        r1 = es_number_word(i, 100, 900)
        if r1:
            v1, i1 = r1
            r2 = es_number_1_99(i1)
            if r2:
                v2, i2 = r2
                return v1+v2, i2
            return r1

        # [1-99]
        return es_number_1_99(i)

    def es_number(i):
        # check for cero
        r0 = es_number_word(i, 0, 0)
        if r0:
            return r0

        # check for [1-999]? (mil [1-999]?)?
        r1 = es_number_1_999(i)
        if r1:
            v1, i1 = r1
        else:
            # A bare "mil" means one thousand ("mil quinientos" = 1500).
            v1, i1 = 1, i

        r2 = es_cte(i1, "mil")
        if not r2:
            # No thousands part: plain 1..999, or None if nothing matched.
            return r1

        i2 = r2[1]
        r3 = es_number_1_999(i2)
        if r3:
            v3, i3 = r3
            return v1*1000+v3, i3
        return v1*1000, i2

    return es_number(i)


def normalize_es(text, remove_articles):
    """Spanish string normalization.

    Splits ``text`` on whitespace, optionally drops the definite articles
    listed in ``es_articles`` and replaces every run of number words that
    ``es_parse`` recognises with its digits.

    Args:
        text (str): the string to normalize
        remove_articles (bool): whether to remove definite articles
    Returns:
        (str): The normalized string, tokens separated by single spaces.
    """
    tokens = text.split()  # split() also collapses repeated whitespace
    total = len(tokens)

    pieces = []
    pos = 0
    while pos < total:
        token = tokens[pos]

        if remove_articles and token in es_articles:
            pos += 1
            continue

        # Convert numbers into digits; es_parse reports where to resume.
        parsed = es_parse(tokens, pos)
        if parsed:
            value, pos = parsed
            pieces.append(str(value))
        else:
            pieces.append(token)
            pos += 1

    return " ".join(pieces)
59 changes: 59 additions & 0 deletions test/util/test_parse.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@

# -*- coding: iso-8859-15 -*-

import unittest
from mycroft.util.parse import normalize

Expand Down Expand Up @@ -193,6 +196,62 @@ def test_combinations(self):

self.assertEqual(normalize("whats 8 + 4"), "what is 8 + 4")

#
# Spanish
#
def test_articles_es(self):
    """Definite articles are dropped from Spanish text; the rest stays."""
    expectations = [
        ("esta es la prueba", "esta es prueba"),
        ("y otra prueba", "y otra prueba"),
    ]
    for utterance, wanted in expectations:
        got = normalize(utterance, lang="es", remove_articles=True)
        self.assertEqual(got, wanted)

def test_numbers_es(self):
    """Spanish number words are replaced by digits, up to 999999."""
    # (utterance, expected normalization), checked in this order.
    expectations = [
        ("esto es un uno una", "esto es 1 1 1"),
        ("esto es dos tres prueba", "esto es 2 3 prueba"),
        ("esto es cuatro cinco seis prueba", "esto es 4 5 6 prueba"),
        ("siete más ocho más nueve", "7 más 8 más 9"),
        ("diez once doce trece catorce quince", "10 11 12 13 14 15"),
        (u"dieciséis diecisiete", "16 17"),
        (u"dieciocho diecinueve", "18 19"),
        (u"veinte treinta cuarenta", "20 30 40"),
        (u"treinta y dos caballos", "32 caballos"),
        (u"cien caballos", "100 caballos"),
        (u"ciento once caballos", "111 caballos"),
        (u"había cuatrocientas una vacas", u"había 401 vacas"),
        (u"dos mil", "2000"),
        (u"dos mil trescientas cuarenta y cinco", "2345"),
        (u"ciento veintitrés mil cuatrocientas cincuenta y seis",
         "123456"),
        (u"quinientas veinticinco mil", "525000"),
        (u"novecientos noventa y nueve mil novecientos noventa y nueve",
         "999999"),
    ]
    for utterance, wanted in expectations:
        self.assertEqual(normalize(utterance, lang="es"), wanted)


# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 8240e54

Please sign in to comment.