From 0431a5b159b57705658edc9add83f4be4cfa2102 Mon Sep 17 00:00:00 2001
From: Vignesh Aigal
Date: Thu, 14 Nov 2024 13:25:43 -0800
Subject: [PATCH] Add markdown support

---
Review notes (text between "---" and the diff is ignored by git am):
- Line breaks and indentation were rebuilt from a whitespace-collapsed copy
  of this patch. Context-line indentation, especially in the last hunk, is
  inferred; verify it against the target file or apply with
  `git apply --ignore-whitespace`.
- The post-image hash on the "index" line predates the changes listed below.
- Changes relative to the original patch: removed the debug print() from
  Page.markdown; table_html_to_markdown() tolerates missing HTML and emits
  the header delimiter row; code snippets get fences on their own lines; the
  list marker is "- "; page breaks use the "\f" escape instead of a raw
  control character (assumed to be a form feed - please confirm).

 .../common/utils/text_extraction_service.py   | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/llmstack/common/utils/text_extraction_service.py b/llmstack/common/utils/text_extraction_service.py
index 170c7dec2fe..d922e3cdfc7 100644
--- a/llmstack/common/utils/text_extraction_service.py
+++ b/llmstack/common/utils/text_extraction_service.py
@@ -31,6 +31,34 @@ def table_html_to_text(table_html: str) -> str:
     return text
 
 
+def table_html_to_markdown(table_html: str) -> str:
+    """Render an HTML table as a GitHub-flavored Markdown table.
+
+    The first row is used as the header and is followed by the ``---``
+    delimiter row Markdown requires. Returns "" when ``table_html`` is
+    empty/None or contains no cells.
+    """
+    if not table_html:
+        return ""
+
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(table_html, "html.parser")
+    lines = []
+    for row in soup.find_all("tr"):
+        # Collapse whitespace and escape pipes so a cell cannot break its row.
+        cells = [
+            " ".join(cell.get_text().split()).replace("|", "\\|")
+            for cell in row.find_all(["td", "th"])
+        ]
+        if not cells:
+            continue
+        lines.append("| " + " | ".join(cells) + " |")
+        if len(lines) == 1:
+            lines.append("|" + " --- |" * len(cells))
+    return "\n".join(lines) + "\n" if lines else ""
+
+
 class TextCanvas:
     def __init__(self, width: int, height: int):
         self.width = width
@@ -150,6 +178,51 @@ def formatted_text(self):
         else:
             return "\n".join([element.formatted_text for element in self.elements])
 
+    @property
+    def markdown(self):
+        """Return this page's elements rendered as Markdown text."""
+        parts = []
+        for element in self.elements:
+            kind = element.element_type
+            if kind in ("Formula", "NarrativeText"):
+                parts.append(f"{element.text}\n\n")
+            elif kind == "FigureCaption":
+                parts.append(f"**{element.text}**\n\n")
+            elif kind == "ListItem":
+                # Only the bullet marker is emitted and no newline is added, so
+                # the next element continues on the same line.
+                # NOTE(review): element.text is not emitted here - confirm the
+                # item body really arrives as the following element.
+                parts.append("- ")
+                continue
+            elif kind == "Title":
+                parts.append(f"# {element.text}")
+            elif kind in ("Address", "EmailAddress"):
+                parts.append(f"{element.text}")
+            elif kind == "Image":
+                parts.append(f"![Image metadata: {element.text}](#)\n\n")
+            elif kind == "PageBreak":
+                parts.append("\f")
+            elif kind == "Table":
+                html = None
+                if element.provider_data and element.provider_data.get("type") == "Table":
+                    html = (element.provider_data.get("metadata") or {}).get("text_as_html")
+                # Fall back to the plain text when there is no usable table HTML.
+                parts.append(table_html_to_markdown(html) or f"{element.text}")
+            elif kind in ("Header", "Footer"):
+                parts.append(f"## {element.text}")
+            elif kind == "CodeSnippet":
+                # Fences on their own lines so multi-line snippets stay fenced.
+                parts.append(f"```\n{element.text}\n```")
+            elif kind == "PageNumber":
+                parts.append(f"Page No. {element.text}")
+            elif kind == "UncategorizedText":
+                parts.append(f"{element.text}\n")
+            else:
+                parts.append(element.text)
+            parts.append("\n")
+        return "".join(parts)
+
 
 class TextractResponse(BaseModel):
     pages: List[Page] = []
@@ -171,6 +244,11 @@ def formatted_text(self):
             text += f"\n--- Page Break (Pg {page.page_no})---\n"
         return text
 
+    @property
+    def markdown(self):
+        """Return every page's Markdown, each page followed by a form feed."""
+        return "".join(f"{page.markdown}\f" for page in self.pages)
+
 
 class TextExtractionService(ABC):
     def __init__(self, provider) -> None:
@@ -242,6 +320,7 @@ def extract_from_bytes(self, file: bytes, **kwargs) -> TextractResponse:
                     bottom_right=(box[2].x, box[2].y),
                     bottom_left=(box[3].x, box[3].y),
                 ),
+                element_type="UncategorizedText",
             )
             page_element.set_midpoint_normalized(page_width, page_height)
             page.elements.append(page_element)