scrapinghub · AmPhIbIaN26 · Apr 10, 2021 · Apr 14, 2021 · Apr 14, 2021 · May 3, 2021
diff --git a/number_parser/data/rom.py b/number_parser/data/rom.py
@@ -0,0 +1,47 @@
+# Lookup tables for Roman numerals, keyed by lowercase numeral fragment.
+# Each table covers one decimal place (units, ten, tens, hundreds,
+# thousands) and maps the fragment to its integer value.
+info = {
+    "UNIT_NUMBERS": {
+        "i": 1,
+        "ii": 2,
+        "iii": 3,
+        "iv": 4,
+        "v": 5,
+        "vi": 6,
+        "vii": 7,
+        "viii": 8,
+        "ix": 9
+    },
+    "DIRECT_NUMBERS": {
+        "x": 10,
+    },
+    "TENS": {
+        "xx": 20,
+        "xxx": 30,
+        "xl": 40,
+        "l": 50,
+        "lx": 60,
+        "lxx": 70,
+        "lxxx": 80,
+        "xc": 90
+    },
+    "HUNDREDS": {
+        "c": 100,
+        "cc": 200,
+        "ccc": 300,
+        "cd": 400,
+        "d": 500,
+        "dc": 600,
+        "dcc": 700,
+        "dccc": 800,
+        "cm": 900
+    },
+    "BIG_POWERS_OF_TEN": {
+        "m": 1000,
+        "mm": 2000,
+        "mmm": 3000
+    },
+    "SKIP_TOKENS": [],
+    "USE_LONG_SCALE": False
+}
diff --git a/number_parser/parser.py b/number_parser/parser.py
@@ -2,7 +2,7 @@
 from importlib import import_module
 import unicodedata
 SENTENCE_SEPARATORS = [".", ","]
-SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru']
+SUPPORTED_LANGUAGES = ['en', 'es', 'hi', 'ru', 'rom']
 RE_BUG_LANGUAGES = ['hi']
 
 
@@ -141,6 +141,8 @@ def _build_number(token_list, lang_data):
 
 def _tokenize(input_string, language):
     """Breaks string on any non-word character."""
+    if language == 'rom':
+        return re.split("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", input_string.lower())
     input_string = input_string.replace('\xad', '')
     if language in RE_BUG_LANGUAGES:
         return re.split(r'(\s+)', input_string)
@@ -310,6 +312,14 @@ def parse(input_string, language=None):
 
     tokens = _tokenize(input_string, language)
 
+    if language == 'rom':
+        tokens = _tokenize(input_string, language=None)
+        for token in tokens:
+            if re.search("^(m{0,3})(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$", token.lower()):
+                tokens[tokens.index(token)] = str(parse_number(token, language='rom'))
+        final_sentance = ''.join(tokens)
+        return final_sentance
+
     final_sentence = []
     current_sentence = []
     tokens_taken = []