From 9f707b5e5d3e2bbd3cf14d56f79305f14c14b718 Mon Sep 17 00:00:00 2001 From: Anoop Sharma Date: Mon, 18 Mar 2024 15:31:06 +0530 Subject: [PATCH 1/5] Added utils file --- llama_parse/utils.py | 109 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 llama_parse/utils.py diff --git a/llama_parse/utils.py b/llama_parse/utils.py new file mode 100644 index 0000000..bd1fc23 --- /dev/null +++ b/llama_parse/utils.py @@ -0,0 +1,109 @@ +from enum import Enum + +# Asyncio error messages +nest_asyncio_err = "cannot be called from a running event loop" +nest_asyncio_msg = "The event loop is already running. Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue." + +class ResultType(str, Enum): + """The result type for the parser.""" + TXT = "text" + MD = "markdown" + +class Language(str, Enum): + """Language of the document to be parsed""" + BAZA = "abq" + ADYGHE = "ady" + AFRIKAANS = "af" + ANGIKA = "ang" + ARABIC = "ar" + ASSAMESE = "as" + AVAR = "ava" + AZERBAIJANI = "az" + BELARUSIAN = "be" + BULGARIAN = "bg" + BIHARI = "bh" + BHOJPURI = "bho" + BENGALI = "bn" + BOSNIAN = "bs" + SIMPLIFIED_CHINESE = "ch_sim" + TRADITIONAL_CHINESE = "ch_tra" + CHECHEN = "che" + CZECH = "cs" + WELSH = "cy" + DANISH = "da" + DARGWA = "dar" + GERMAN = "de" + ENGLISH = "en" + SPANISH = "es" + ESTONIAN = "et" + PERSIAN_FARSI = "fa" + FRENCH = "fr" + IRISH = "ga" + GOAN_KONKANI = "gom" + HINDI = "hi" + CROATIAN = "hr" + HUNGARIAN = "hu" + INDONESIAN = "id" + INGUSH = "inh" + ICELANDIC = "is" + ITALIAN = "it" + JAPANESE = "ja" + KABARDIAN = "kbd" + KANNADA = "kn" + KOREAN = "ko" + KURDISH = "ku" + LATIN = "la" + LAK = "lbe" + LEZGHIAN = "lez" + LITHUANIAN = "lt" + LATVIAN = "lv" + MAGAHI = "mah" + MAITHILI = "mai" + MAORI = "mi" + MONGOLIAN = "mn" + MARATHI = "mr" + MALAY = "ms" + MALTESE = "mt" + NEPALI = "ne" + NEWARI = "new" + DUTCH = "nl" + NORWEGIAN = "no" + OCCITAN = "oc" + PALI = "pi" + POLISH = "pl" + PORTUGUESE = "pt" + ROMANIAN = "ro" + RUSSIAN = "ru" + SERBIAN_CYRILLIC = "rs_cyrillic" + SERBIAN_LATIN = "rs_latin" + NAGPURI = "sck" + SLOVAK = "sk" + SLOVENIAN = "sl" + ALBANIAN = "sq" + SWEDISH = "sv" + SWAHILI = "sw" + TAMIL = "ta" + TABASSARAN = "tab" + TELUGU = "te" + THAI = "th" + TAJIK = "tjk" + TAGALOG = "tl" + TURKISH = "tr" + UYGHUR = "ug" + UKRANIAN = "uk" + URDU = "ur" + UZBEK = "uz" + VIETNAMESE = "vi" + + +SUPPORTED_FILE_TYPES = [ + ".pdf", + ".xml" + ".doc", + ".docx", + ".pptx", + ".rtf", + ".pages", + ".key", + ".epub" +] From 49d558dd8620313bf60146ed4a212cbf33cbcef4 Mon Sep 17 00:00:00 2001 From: Anoop Sharma Date: Mon, 18 Mar 2024 15:42:05 +0530 Subject: [PATCH 2/5] imported utils --- llama_parse/base.py | 111 +------------------------------------------- 1 file changed, 2 insertions(+), 109 deletions(-) diff --git a/llama_parse/base.py b/llama_parse/base.py index 8341418..b985129 100644 --- a/llama_parse/base.py +++ b/llama_parse/base.py @@ -12,114 +12,7 @@ from llama_index.core.constants import DEFAULT_BASE_URL from llama_index.core.readers.base import BasePydanticReader from llama_index.core.schema import Document - - -nest_asyncio_err = "cannot be called from a running event loop" -nest_asyncio_msg = "The event loop is already running. Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue." - -class ResultType(str, Enum): - """The result type for the parser.""" - TXT = "text" - MD = "markdown" - -class Language(str, Enum): - BAZA = "abq" - ADYGHE = "ady" - AFRIKAANS = "af" - ANGIKA = "ang" - ARABIC = "ar" - ASSAMESE = "as" - AVAR = "ava" - AZERBAIJANI = "az" - BELARUSIAN = "be" - BULGARIAN = "bg" - BIHARI = "bh" - BHOJPURI = "bho" - BENGALI = "bn" - BOSNIAN = "bs" - SIMPLIFIED_CHINESE = "ch_sim" - TRADITIONAL_CHINESE = "ch_tra" - CHECHEN = "che" - CZECH = "cs" - WELSH = "cy" - DANISH = "da" - DARGWA = "dar" - GERMAN = "de" - ENGLISH = "en" - SPANISH = "es" - ESTONIAN = "et" - PERSIAN_FARSI = "fa" - FRENCH = "fr" - IRISH = "ga" - GOAN_KONKANI = "gom" - HINDI = "hi" - CROATIAN = "hr" - HUNGARIAN = "hu" - INDONESIAN = "id" - INGUSH = "inh" - ICELANDIC = "is" - ITALIAN = "it" - JAPANESE = "ja" - KABARDIAN = "kbd" - KANNADA = "kn" - KOREAN = "ko" - KURDISH = "ku" - LATIN = "la" - LAK = "lbe" - LEZGHIAN = "lez" - LITHUANIAN = "lt" - LATVIAN = "lv" - MAGAHI = "mah" - MAITHILI = "mai" - MAORI = "mi" - MONGOLIAN = "mn" - MARATHI = "mr" - MALAY = "ms" - MALTESE = "mt" - NEPALI = "ne" - NEWARI = "new" - DUTCH = "nl" - NORWEGIAN = "no" - OCCITAN = "oc" - PALI = "pi" - POLISH = "pl" - PORTUGUESE = "pt" - ROMANIAN = "ro" - RUSSIAN = "ru" - SERBIAN_CYRILLIC = "rs_cyrillic" - SERBIAN_LATIN = "rs_latin" - NAGPURI = "sck" - SLOVAK = "sk" - SLOVENIAN = "sl" - ALBANIAN = "sq" - SWEDISH = "sv" - SWAHILI = "sw" - TAMIL = "ta" - TABASSARAN = "tab" - TELUGU = "te" - THAI = "th" - TAJIK = "tjk" - TAGALOG = "tl" - TURKISH = "tr" - UYGHUR = "ug" - UKRANIAN = "uk" - URDU = "ur" - UZBEK = "uz" - VIETNAMESE = "vi" - - -SUPPORTED_FILE_TYPES = [ - ".pdf", - ".xml" - ".doc", - ".docx", - ".pptx", - ".rtf", - ".pages", - ".key", - ".epub" -] - +from llama_parse.utils import nest_asyncio_err, nest_asyncio_msg, ResultType, Language, SUPPORTED_FILE_TYPES class LlamaParse(BasePydanticReader): """A smart-parser for files.""" @@ -359,4 +252,4 @@ def get_images(self, json_result: list[dict], download_path: str) -> List[dict]: return images except Exception as e: print(f"Error while downloading images from the parsed result:", e) - return [] \ No newline at end of file + return [] From c845de7c4e9a0bab81516fc37cefd0de41cafbf8 Mon Sep 17 00:00:00 2001 From: anoopshrma Date: Mon, 18 Mar 2024 16:20:51 +0530 Subject: [PATCH 3/5] removed unused import --- llama_parse/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_parse/base.py b/llama_parse/base.py index b985129..90cbf10 100644 --- a/llama_parse/base.py +++ b/llama_parse/base.py @@ -3,7 +3,6 @@ import httpx import mimetypes import time -from enum import Enum from pathlib import Path from typing import List, Optional, Union From 92c54fb735d1f70250d65a8be13a94ddc69db719 Mon Sep 17 00:00:00 2001 From: Anoop Sharma Date: Wed, 27 Mar 2024 20:12:51 +0530 Subject: [PATCH 4/5] synced base.py updates --- llama_parse/utils.py | 53 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/llama_parse/utils.py b/llama_parse/utils.py index bd1fc23..63acff7 100644 --- a/llama_parse/utils.py +++ b/llama_parse/utils.py @@ -8,9 +8,9 @@ class ResultType(str, Enum): """The result type for the parser.""" TXT = "text" MD = "markdown" + JSON = "json" class Language(str, Enum): - """Language of the document to be parsed""" BAZA = "abq" ADYGHE = "ady" AFRIKAANS = "af" @@ -98,12 +98,59 @@ class Language(str, Enum): SUPPORTED_FILE_TYPES = [ ".pdf", - ".xml" + # Microsoft word - all versions ".doc", ".docx", - ".pptx", + ".docm", + ".dot", + ".dotx", + ".dotm", + # Rich text format ".rtf", + # Microsoft Works + ".wps", + # Word Perfect + ".wpd", + + # Open Office + ".sxw", + ".stw", + ".sxg", + + # Apple ".pages", + + # Mac Write + ".mw", + ".mcw", + + + # Unified Office Format text + ".uot", + ".uof", + ".uos", + ".uop", + + # Microsoft powerpoints + ".ppt", + ".pptx", + ".pot", + ".pptm", + ".potx", + ".potm", + + + # Apple keynote ".key", + + # Open Office Presentations + ".odp", + ".odg", + ".otp", + ".fopd", + ".sxi", + ".sti", + + # ebook ".epub" ] From 0ef157e66c63b97f4c2d08c87ae045c02019760d Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Sun, 14 Apr 2024 13:29:45 -0600 Subject: [PATCH 5/5] linting --- llama_parse/utils.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/llama_parse/utils.py b/llama_parse/utils.py index 63acff7..5e6cd01 100644 --- a/llama_parse/utils.py +++ b/llama_parse/utils.py @@ -4,12 +4,15 @@ nest_asyncio_err = "cannot be called from a running event loop" nest_asyncio_msg = "The event loop is already running. Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue." + class ResultType(str, Enum): """The result type for the parser.""" + TXT = "text" MD = "markdown" JSON = "json" + class Language(str, Enum): BAZA = "abq" ADYGHE = "ady" @@ -90,7 +93,7 @@ class Language(str, Enum): TAGALOG = "tl" TURKISH = "tr" UYGHUR = "ug" - UKRANIAN = "uk" + UKRAINIAN = "uk" URDU = "ur" UZBEK = "uz" VIETNAMESE = "vi" @@ -111,26 +114,20 @@ class Language(str, Enum): ".wps", # Word Perfect ".wpd", - # Open Office ".sxw", - ".stw", + ".stw", ".sxg", - # Apple ".pages", - # Mac Write ".mw", ".mcw", - - # Unified Office Format text ".uot", ".uof", ".uos", ".uop", - # Microsoft powerpoints ".ppt", ".pptx", @@ -138,19 +135,15 @@ class Language(str, Enum): ".pptm", ".potx", ".potm", - - # Apple keynote ".key", - # Open Office Presentations ".odp", ".odg", ".otp", ".fopd", - ".sxi", + ".sxi", ".sti", - # ebook - ".epub" + ".epub", ]