Skip to content

Commit

Permalink
Add markdown support
Browse files Browse the repository at this point in the history
  • Loading branch information
vegito22 committed Nov 14, 2024
1 parent 6da7179 commit 0431a5b
Showing 1 changed file with 65 additions and 0 deletions.
65 changes: 65 additions & 0 deletions llmstack/common/utils/text_extraction_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,18 @@ def table_html_to_text(table_html: str) -> str:
return text


def table_html_to_markdown(table_html: str) -> str:
    """Convert an HTML table into a GitHub-flavored Markdown table.

    Every ``<tr>`` becomes one pipe-delimited line built from the text of its
    ``<td>``/``<th>`` cells. A ``---`` delimiter row is inserted after the
    first row, because Markdown renderers only recognise a table when the
    header line is followed by one.

    Args:
        table_html: HTML markup containing the table. ``None`` or an empty
            string is tolerated and yields an empty result.

    Returns:
        The Markdown table terminated by a newline, or ``""`` when there is
        no markup or no row with cells.
    """
    # The page-level markdown renderer looks the HTML up with chained
    # ``.get`` calls, so a missing value arrives here as None.
    if not table_html:
        return ""

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(table_html, "html.parser")
    lines = []
    for row in soup.find_all("tr"):
        cells = [
            # Collapse internal whitespace/newlines and escape pipes so a
            # cell can never split or terminate its own row.
            " ".join(cell.get_text().split()).replace("|", "\\|")
            for cell in row.find_all(["td", "th"])
        ]
        if not cells:
            continue
        lines.append("| " + " | ".join(cells) + " |")
        if len(lines) == 1:
            # Delimiter row: sized from the first (header) row.
            lines.append("| " + " | ".join(["---"] * len(cells)) + " |")

    if not lines:
        return ""
    return "\n".join(lines) + "\n"


class TextCanvas:
def __init__(self, width: int, height: int):
self.width = width
Expand Down Expand Up @@ -150,6 +162,50 @@ def formatted_text(self):
else:
return "\n".join([element.formatted_text for element in self.elements])

@property
def markdown(self):
    """Render the elements of this page as one Markdown string.

    Each element in ``self.elements`` is formatted according to its
    ``element_type`` and followed by a newline. Element types without a
    dedicated rule contribute their raw ``text``.

    Returns:
        The concatenated Markdown text; ``""`` when there are no elements.
    """
    chunks = []
    for element in self.elements:
        kind = element.element_type
        if kind in ("Formula", "NarrativeText"):
            chunks.append(f"{element.text}\n\n")
        elif kind == "FigureCaption":
            chunks.append(f"**{element.text}**\n\n")
        elif kind == "ListItem":
            # Emit the bullet together with the item text; a bare "-" with
            # no text and no line break loses the item's content.
            chunks.append(f"- {element.text}")
        elif kind == "Title":
            chunks.append(f"# {element.text}")
        elif kind in ("Address", "EmailAddress"):
            chunks.append(f"{element.text}")
        elif kind == "Image":
            chunks.append(f"![Image metadata: {element.text}](#)\n\n")
        elif kind == "PageBreak":
            chunks.append('<div class="pagebreak" />')
        elif kind == "Table":
            table_html = None
            if element.provider_data and element.provider_data.get("type") == "Table":
                # ``metadata`` may be absent or None; treat both as "no HTML".
                metadata = element.provider_data.get("metadata") or {}
                table_html = metadata.get("text_as_html")
            if table_html:
                chunks.append(table_html_to_markdown(table_html))
            else:
                # No HTML available: fall back to the element's plain text
                # instead of handing None to the converter.
                chunks.append(f"{element.text}")
        elif kind in ("Header", "Footer"):
            chunks.append(f"## {element.text}")
        elif kind == "CodeSnippet":
            chunks.append(f"```{element.text}```")
        elif kind == "PageNumber":
            chunks.append(f"Page No. {element.text}")
        elif kind == "UncategorizedText":
            chunks.append(f"{element.text}\n")
        else:
            chunks.append(element.text)
        chunks.append("\n")
    return "".join(chunks)


class TextractResponse(BaseModel):
pages: List[Page] = []
Expand All @@ -171,6 +227,14 @@ def formatted_text(self):
text += f"\n--- Page Break (Pg {page.page_no})---\n"
return text

@property
def markdown(self):
    """Render every page as Markdown, each followed by a page-break marker.

    Returns:
        The ``markdown`` of each page in ``self.pages``, in order, each one
        followed by a ``<div class="pagebreak" />`` marker and a blank line;
        ``""`` when there are no pages.
    """
    # The blank line after the marker matters: a raw HTML block in Markdown
    # runs until the next blank line, so without it the first line of the
    # following page (e.g. "# Title") would be absorbed into the <div>.
    page_break = '<div class="pagebreak" />\n\n'
    return "".join(page.markdown + page_break for page in self.pages)


class TextExtractionService(ABC):
def __init__(self, provider) -> None:
Expand Down Expand Up @@ -242,6 +306,7 @@ def extract_from_bytes(self, file: bytes, **kwargs) -> TextractResponse:
bottom_right=(box[2].x, box[2].y),
bottom_left=(box[3].x, box[3].y),
),
element_type="UncategorizedText",
)
page_element.set_midpoint_normalized(page_width, page_height)
page.elements.append(page_element)
Expand Down

0 comments on commit 0431a5b

Please sign in to comment.