diff --git a/.azdo/pipelines/azure-dev.yml b/.azdo/pipelines/azure-dev.yml index 3495e06a45..8c61d4acaf 100644 --- a/.azdo/pipelines/azure-dev.yml +++ b/.azdo/pipelines/azure-dev.yml @@ -120,6 +120,7 @@ steps: DEPLOYMENT_TARGET: $(DEPLOYMENT_TARGET) AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: $(AZURE_CONTAINER_APPS_WORKLOAD_PROFILE) USE_CHAT_HISTORY_BROWSER: $(USE_CHAT_HISTORY_BROWSER) + USE_MEDIA_DESCRIBER_AZURE_CU: $(USE_MEDIA_DESCRIBER_AZURE_CU) - task: AzureCLI@2 displayName: Deploy Application inputs: diff --git a/.github/workflows/azure-dev.yml b/.github/workflows/azure-dev.yml index 798e589413..860a13cbfa 100644 --- a/.github/workflows/azure-dev.yml +++ b/.github/workflows/azure-dev.yml @@ -13,7 +13,7 @@ on: # To configure required secrets for connecting to Azure, simply run `azd pipeline config` # Set up permissions for deploying with secretless Azure federated credentials -# https://learn.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication +# https://learn.microsoft.com/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication permissions: id-token: write contents: read @@ -103,6 +103,7 @@ jobs: DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }} AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }} USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }} + USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }} steps: - name: Checkout uses: actions/checkout@v4 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4b54675d56..9fa92346c9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -122,6 +122,8 @@ If you followed the steps above to install the pre-commit hooks, then you can ju When adding new azd environment variables, please remember to update: +1. [main.parameters.json](./infra/main.parameters.json) +1. [appEnvVariables in main.bicep](./infra/main.bicep) 1. App Service's [azure.yaml](./azure.yaml) 1. [ADO pipeline](.azdo/pipelines/azure-dev.yml). 1. [GitHub workflows](.github/workflows/azure-dev.yml) diff --git a/README.md b/README.md index 1fb97c6784..49d5ea84b3 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,9 @@ However, you can try the [Azure pricing calculator](https://azure.com/e/e3490de2 - Azure AI Document Intelligence: S0 (Standard) tier using pre-built layout. Pricing per document page, sample documents have 261 pages total. [Pricing](https://azure.microsoft.com/pricing/details/form-recognizer/) - Azure AI Search: Basic tier, 1 replica, free level of semantic search. Pricing per hour. [Pricing](https://azure.microsoft.com/pricing/details/search/) - Azure Blob Storage: Standard tier with ZRS (Zone-redundant storage). Pricing per storage and read operations. [Pricing](https://azure.microsoft.com/pricing/details/storage/blobs/) -- Azure Cosmos DB: Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/) +- Azure Cosmos DB: Only provisioned if you enabled [chat history with Cosmos DB](docs/deploy_features.md#enabling-persistent-chat-history-with-azure-cosmos-db). Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/) +- Azure AI Vision: Only provisioned if you enabled [GPT-4 with vision](docs/gpt4v.md). Pricing per 1K transactions.
[Pricing](https://azure.microsoft.com/pricing/details/cognitive-services/computer-vision/) +- Azure AI Content Understanding: Only provisioned if you enabled [media description](docs/deploy_features.md#enabling-media-description-with-azure-content-understanding). Pricing per 1K images. [Pricing](https://azure.microsoft.com/pricing/details/content-understanding/) - Azure Monitor: Pay-as-you-go tier. Costs based on data ingested. [Pricing](https://azure.microsoft.com/pricing/details/monitor/) To reduce costs, you can switch to free SKUs for various services, but those SKUs have limitations. diff --git a/app/backend/gunicorn.conf.py b/app/backend/gunicorn.conf.py index 4518587695..9144e3cc00 100644 --- a/app/backend/gunicorn.conf.py +++ b/app/backend/gunicorn.conf.py @@ -7,7 +7,7 @@ bind = "0.0.0.0" timeout = 230 -# https://learn.microsoft.com/en-us/troubleshoot/azure/app-service/web-apps-performance-faqs#why-does-my-request-time-out-after-230-seconds +# https://learn.microsoft.com/troubleshoot/azure/app-service/web-apps-performance-faqs#why-does-my-request-time-out-after-230-seconds num_cpus = multiprocessing.cpu_count() if os.getenv("WEBSITE_SKU") == "LinuxFree": diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index d62a42f8cf..57cfe52e6f 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -7,6 +7,7 @@ from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider +from rich.logging import RichHandler from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager @@ -158,8 +159,10 @@ def setup_file_processors( local_pdf_parser: bool = False, local_html_parser: bool = False, search_images: bool = False, + use_content_understanding: bool = False, + content_understanding_endpoint: Union[str, None] = None, ): - sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images) + sentence_text_splitter = SentenceTextSplitter() doc_int_parser: Optional[DocumentAnalysisParser] = None # check if Azure Document Intelligence credentials are provided @@ -170,6 +173,8 @@ def setup_file_processors( doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=content_understanding_endpoint, ) pdf_parser: Optional[Parser] = None @@ -294,10 +299,10 @@ async def main(strategy: Strategy, setup_index: bool = True): args = parser.parse_args() if args.verbose: - logging.basicConfig(format="%(message)s") + logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]) # We only set the level to INFO for our logger, # to avoid seeing the noisy INFO level logs from the Azure SDKs - logger.setLevel(logging.INFO) + logger.setLevel(logging.DEBUG) load_azd_env() @@ -309,6 +314,7 @@ async def main(strategy: Strategy, setup_index: bool = True): use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true" use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" + use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true" # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments. 
if tenant_id := os.getenv("AZURE_TENANT_ID"): @@ -406,6 +412,8 @@ async def main(strategy: Strategy, setup_index: bool = True): local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", search_images=use_gptvision, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), ) image_embeddings_service = setup_image_embeddings_service( azure_credential=azd_credential, @@ -424,6 +432,8 @@ async def main(strategy: Strategy, setup_index: bool = True): search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), use_acls=use_acls, category=args.category, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), ) loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall)) diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index e9f18e795a..e8d01dda52 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -171,7 +171,7 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str: @classmethod def blob_image_name_from_file_page(cls, filename, page=0) -> str: - return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".png" + return os.path.splitext(os.path.basename(filename))[0] + f"-{page+1}" + ".png" @classmethod def blob_name_from_file_name(cls, filename) -> str: diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index 55b24b6f3a..3748f67a09 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -1,10 +1,13 @@ import logging from typing import List, Optional +from azure.core.credentials import AzureKeyCredential + from .blobmanager import BlobManager from .embeddings import ImageEmbeddings, OpenAIEmbeddings from .fileprocessor import FileProcessor from .listfilestrategy import File, ListFileStrategy +from .mediadescriber import ContentUnderstandingDescriber from .searchmanager import SearchManager, Section from .strategy import DocumentAction, SearchInfo, Strategy @@ -50,6 +53,8 @@ def __init__( search_analyzer_name: Optional[str] = None, use_acls: bool = False, category: Optional[str] = None, + use_content_understanding: bool = False, + content_understanding_endpoint: Optional[str] = None, ): self.list_file_strategy = list_file_strategy self.blob_manager = blob_manager @@ -61,6 +66,8 @@ def __init__( self.search_info = search_info self.use_acls = use_acls self.category = category + self.use_content_understanding = use_content_understanding + self.content_understanding_endpoint = content_understanding_endpoint async def setup(self): search_manager = SearchManager( @@ -73,6 +80,16 @@ async def setup(self): ) await search_manager.create_index() + if self.use_content_understanding: + if self.content_understanding_endpoint is None: + raise ValueError("Content Understanding is enabled but no endpoint was provided") + if isinstance(self.search_info.credential, AzureKeyCredential): + raise ValueError( + "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" + ) + cu_manager = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.search_info.credential) + await cu_manager.create_analyzer() + async def run(self): search_manager = SearchManager( self.search_info, self.search_analyzer_name, 
self.use_acls, False, self.embeddings diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py new file mode 100644 index 0000000000..5aae79232e --- /dev/null +++ b/app/backend/prepdocslib/mediadescriber.py @@ -0,0 +1,107 @@ +import logging +from abc import ABC + +import aiohttp +from azure.core.credentials_async import AsyncTokenCredential +from azure.identity.aio import get_bearer_token_provider +from rich.progress import Progress +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed + +logger = logging.getLogger("scripts") + + +class MediaDescriber(ABC): + + async def describe_image(self, image_bytes) -> str: + raise NotImplementedError # pragma: no cover + + +class ContentUnderstandingDescriber(MediaDescriber): + CU_API_VERSION = "2024-12-01-preview" + + analyzer_schema = { + "analyzerId": "image_analyzer", + "name": "Image understanding", + "description": "Extract detailed structured information from images extracted from documents.", + "baseAnalyzerId": "prebuilt-image", + "scenario": "image", + "config": {"returnDetails": False}, + "fieldSchema": { + "name": "ImageInformation", + "description": "Description of image.", + "fields": { + "Description": { + "type": "string", + "description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.", + }, + }, + }, + } + + def __init__(self, endpoint: str, credential: AsyncTokenCredential): + self.endpoint = endpoint + self.credential = credential + + async def poll_api(self, session, poll_url, headers): + + @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError)) + async def poll(): + async with session.get(poll_url, headers=headers) as response: + response.raise_for_status() + response_json = await response.json() + if response_json["status"] == "Failed": + raise Exception("Failed") + if response_json["status"] == "Running": + raise ValueError("Running") + return response_json + + return await poll() + + async def create_analyzer(self): + logger.info("Creating analyzer '%s'...", self.analyzer_schema["analyzerId"]) + + token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default") + token = await token_provider() + headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} + params = {"api-version": self.CU_API_VERSION} + analyzer_id = self.analyzer_schema["analyzerId"] + cu_endpoint = f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_id}" + async with aiohttp.ClientSession() as session: + async with session.put( + url=cu_endpoint, params=params, headers=headers, json=self.analyzer_schema + ) as response: + if response.status == 409: + logger.info("Analyzer '%s' already exists.", analyzer_id) + return + elif response.status != 201: + data = await response.text() + raise Exception("Error creating analyzer", data) + else: + poll_url = response.headers.get("Operation-Location") + + with Progress() as progress: + progress.add_task("Creating analyzer...", total=None, start=False) + await self.poll_api(session, poll_url, headers) + + async def describe_image(self, image_bytes: bytes) -> str: + logger.info("Sending image to Azure Content Understanding service...") + async with
aiohttp.ClientSession() as session: + token = await self.credential.get_token("https://cognitiveservices.azure.com/.default") + headers = {"Authorization": "Bearer " + token.token} + params = {"api-version": self.CU_API_VERSION} + analyzer_name = self.analyzer_schema["analyzerId"] + async with session.post( + url=f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_name}:analyze", + params=params, + headers=headers, + data=image_bytes, + ) as response: + response.raise_for_status() + poll_url = response.headers["Operation-Location"] + + with Progress() as progress: + progress.add_task("Processing...", total=None, start=False) + results = await self.poll_api(session, poll_url, headers) + + fields = results["result"]["contents"][0]["fields"] + return fields["Description"]["valueString"] diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index f12fe70b94..857235c571 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -3,7 +3,7 @@ class Page: A single page from a document Attributes: - page_num (int): Page number + page_num (int): Page number (0-indexed) offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") text (str): The text of the page """ @@ -17,6 +17,10 @@ def __init__(self, page_num: int, offset: int, text: str): class SplitPage: """ A section of a page that has been split into a smaller chunk. + + Attributes: + page_num (int): Page number (0-indexed) + text (str): The text of the section """ def __init__(self, page_num: int, text: str): diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 6604110020..1fcbbc9531 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -1,13 +1,23 @@ import html +import io import logging +from enum import Enum from typing import IO, AsyncGenerator, Union +import pymupdf from azure.ai.documentintelligence.aio import DocumentIntelligenceClient -from azure.ai.documentintelligence.models import DocumentTable +from azure.ai.documentintelligence.models import ( + AnalyzeDocumentRequest, + AnalyzeResult, + DocumentFigure, + DocumentTable, +) from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential +from PIL import Image from pypdf import PdfReader +from .mediadescriber import ContentUnderstandingDescriber from .page import Page from .parser import Parser @@ -39,11 +49,18 @@ class DocumentAnalysisParser(Parser): """ def __init__( - self, endpoint: str, credential: Union[AsyncTokenCredential, AzureKeyCredential], model_id="prebuilt-layout" + self, + endpoint: str, + credential: Union[AsyncTokenCredential, AzureKeyCredential], + model_id="prebuilt-layout", + use_content_understanding=True, + content_understanding_endpoint: Union[str, None] = None, ): self.model_id = model_id self.endpoint = endpoint self.credential = credential + self.use_content_understanding = use_content_understanding + self.content_understanding_endpoint = content_understanding_endpoint async def parse(self, content: IO) -> AsyncGenerator[Page, None]: logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name) @@ -51,47 +68,126 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: async with DocumentIntelligenceClient( endpoint=self.endpoint, 
credential=self.credential ) as document_intelligence_client: - poller = await document_intelligence_client.begin_analyze_document( - model_id=self.model_id, analyze_request=content, content_type="application/octet-stream" - ) - form_recognizer_results = await poller.result() + if self.use_content_understanding: + if self.content_understanding_endpoint is None: + raise ValueError("Content Understanding is enabled but no endpoint was provided") + if isinstance(self.credential, AzureKeyCredential): + raise ValueError( + "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" + ) + cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) + content_bytes = content.read() + poller = await document_intelligence_client.begin_analyze_document( + model_id="prebuilt-layout", + analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes), + output=["figures"], + features=["ocrHighResolution"], + output_content_format="markdown", + ) + doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes)) + else: + poller = await document_intelligence_client.begin_analyze_document( + model_id=self.model_id, analyze_request=content, content_type="application/octet-stream" + ) + analyze_result: AnalyzeResult = await poller.result() offset = 0 - for page_num, page in enumerate(form_recognizer_results.pages): + for page in analyze_result.pages: tables_on_page = [ table - for table in (form_recognizer_results.tables or []) - if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1 + for table in (analyze_result.tables or []) + if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number ] + figures_on_page = [] + if self.use_content_understanding: + figures_on_page = [ + figure + for figure in (analyze_result.figures or []) + if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number + ] + + class ObjectType(Enum): + NONE = -1 + TABLE = 0 + FIGURE = 1 - # mark all positions of the table spans in the page page_offset = page.spans[0].offset page_length = page.spans[0].length - table_chars = [-1] * page_length - for table_id, table in enumerate(tables_on_page): + mask_chars: list[tuple[ObjectType, Union[int, None]]] = [(ObjectType.NONE, None)] * page_length + # mark all positions of the table spans in the page + for table_idx, table in enumerate(tables_on_page): for span in table.spans: # replace all table spans with "table_id" in table_chars array for i in range(span.length): idx = span.offset - page_offset + i if idx >= 0 and idx < page_length: - table_chars[idx] = table_id + mask_chars[idx] = (ObjectType.TABLE, table_idx) + # mark all positions of the figure spans in the page + for figure_idx, figure in enumerate(figures_on_page): + for span in figure.spans: + # replace all figure spans with "figure_id" in figure_chars array + for i in range(span.length): + idx = span.offset - page_offset + i + if idx >= 0 and idx < page_length: + mask_chars[idx] = (ObjectType.FIGURE, figure_idx) # build page text by replacing characters in table spans with table html page_text = "" - added_tables = set() - for idx, table_id in enumerate(table_chars): - if table_id == -1: - page_text += form_recognizer_results.content[page_offset + idx] - elif table_id not in added_tables: - page_text += DocumentAnalysisParser.table_to_html(tables_on_page[table_id]) - added_tables.add(table_id) - - yield Page(page_num=page_num, offset=offset, text=page_text) + added_objects = set() 
# set of (ObjectType, index) tuples already added; TODO: tighten the typing for mypy + for idx, mask_char in enumerate(mask_chars): + object_type, object_idx = mask_char + if object_type == ObjectType.NONE: + page_text += analyze_result.content[page_offset + idx] + elif object_type == ObjectType.TABLE: + if object_idx is None: + raise ValueError("Expected object_idx to be set") + if mask_char not in added_objects: + page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) + added_objects.add(mask_char) + elif object_type == ObjectType.FIGURE: + if cu_describer is None: + raise ValueError("cu_describer should not be None, unable to describe figure") + if object_idx is None: + raise ValueError("Expected object_idx to be set") + if mask_char not in added_objects: + figure_html = await DocumentAnalysisParser.figure_to_html( + doc_for_pymupdf, figures_on_page[object_idx], cu_describer + ) + page_text += figure_html + added_objects.add(mask_char) + # We remove these comments since they are not needed and skew the page numbers + page_text = page_text.replace("<!-- PageBreak -->", "") + # We remove excess newlines at the beginning and end of the page + page_text = page_text.strip() + yield Page(page_num=page.page_number - 1, offset=offset, text=page_text) offset += len(page_text) - @classmethod - def table_to_html(cls, table: DocumentTable): - table_html = "<table>" + @staticmethod + async def figure_to_html( + doc: pymupdf.Document, figure: DocumentFigure, cu_describer: ContentUnderstandingDescriber + ) -> str: + figure_title = (figure.caption and figure.caption.content) or "" + logger.info("Describing figure %s with title '%s'", figure.id, figure_title) + if not figure.bounding_regions: + return f"<figure><figcaption>{figure_title}</figcaption></figure>" + if len(figure.bounding_regions) > 1: + logger.warning("Figure %s has more than one bounding region, using the first one", figure.id) + first_region = figure.bounding_regions[0] + # To learn more about bounding regions, see https://aka.ms/bounding-region + bounding_box = ( + first_region.polygon[0], # x0 (left) + first_region.polygon[1], # y0 (top) + first_region.polygon[4], # x1 (right) + first_region.polygon[5], # y1 (bottom) + ) + page_number = first_region["pageNumber"] # 1-indexed + cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) + figure_description = await cu_describer.describe_image(cropped_img) + return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>" + + @staticmethod + def table_to_html(table: DocumentTable): + table_html = "<table>"
" rows = [ sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) for i in range(table.row_count) @@ -107,5 +203,32 @@ def table_to_html(cls, table: DocumentTable): cell_spans += f" rowSpan={cell.row_span}" table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}" table_html += "" - table_html += "
" + table_html += "" return table_html + + @staticmethod + def crop_image_from_pdf_page( + doc: pymupdf.Document, page_number: int, bounding_box: tuple[float, float, float, float] + ) -> bytes: + """ + Crops a region from a given page in a PDF and returns it as an image. + + :param pdf_path: Path to the PDF file. + :param page_number: The page number to crop from (0-indexed). + :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box. + :return: A PIL Image of the cropped area. + """ + page = doc.load_page(page_number) + + # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1). + bbx = [x * 72 for x in bounding_box] + rect = pymupdf.Rect(bbx) + # Bounding box is scaled to 72 dots per inch + # We assume the PDF has 300 DPI + # The matrix is used to convert between these 2 units + pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect) + + img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples) + bytes_io = io.BytesIO() + img.save(bytes_io, format="PNG") + return bytes_io.getvalue() diff --git a/app/backend/prepdocslib/textsplitter.py b/app/backend/prepdocslib/textsplitter.py index 30b0c1ad77..2c39dff850 100644 --- a/app/backend/prepdocslib/textsplitter.py +++ b/app/backend/prepdocslib/textsplitter.py @@ -87,14 +87,13 @@ class SentenceTextSplitter(TextSplitter): Class that splits pages into smaller chunks. This is required because embedding models may not be able to analyze an entire page at once """ - def __init__(self, has_image_embeddings: bool, max_tokens_per_section: int = 500): + def __init__(self, max_tokens_per_section: int = 500): self.sentence_endings = STANDARD_SENTENCE_ENDINGS + CJK_SENTENCE_ENDINGS self.word_breaks = STANDARD_WORD_BREAKS + CJK_WORD_BREAKS self.max_section_length = DEFAULT_SECTION_LENGTH self.sentence_search_limit = 100 self.max_tokens_per_section = max_tokens_per_section self.section_overlap = int(self.max_section_length * DEFAULT_OVERLAP_PERCENT / 100) - self.has_image_embeddings = has_image_embeddings def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]: """ @@ -192,15 +191,15 @@ def find_page(offset): section_text = all_text[start:end] yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text) - last_table_start = section_text.rfind(" 2 * self.sentence_search_limit and last_table_start > section_text.rfind(" 2 * self.sentence_search_limit and last_figure_start > section_text.rfind( + "=1.3.7 numpy>=1,<2.1.0 # Used by openai embeddings.create to optimize embeddings (but not required) tiktoken tenacity -azure-ai-documentintelligence +azure-ai-documentintelligence==1.0.0b4 azure-cognitiveservices-speech azure-cosmos azure-search-documents==11.6.0b6 @@ -31,3 +31,4 @@ types-beautifulsoup4 msgraph-sdk==1.1.0 openai-messages-token-helper python-dotenv +rich diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 2efe32b484..c148bb04f8 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # # pip-compile requirements.in @@ -24,7 +24,7 @@ asgiref==3.8.1 # via opentelemetry-instrumentation-asgi attrs==24.2.0 # via aiohttp -azure-ai-documentintelligence==1.0.0b3 +azure-ai-documentintelligence==1.0.0b4 # via -r requirements.in azure-cognitiveservices-speech==1.40.0 # via -r requirements.in @@ -155,11 
+155,15 @@ jinja2==3.1.4 # quart jiter==0.5.0 # via openai +markdown-it-py==3.0.0 + # via rich markupsafe==2.1.5 # via # jinja2 # quart # werkzeug +mdurl==0.1.2 + # via markdown-it-py microsoft-kiota-abstractions==1.3.3 # via # microsoft-kiota-authentication-azure @@ -338,6 +342,8 @@ pydantic==2.8.2 # via openai pydantic-core==2.20.1 # via pydantic +pygments==2.18.0 + # via rich pyjwt[crypto]==2.9.0 # via # -r requirements.in @@ -372,6 +378,8 @@ requests==2.32.3 # tiktoken requests-oauthlib==2.0.0 # via msrest +rich==13.9.4 + # via -r requirements.in six==1.16.0 # via # azure-core diff --git a/azure.yaml b/azure.yaml index fd673f48e0..d72dc2ff13 100644 --- a/azure.yaml +++ b/azure.yaml @@ -115,6 +115,7 @@ pipeline: - DEPLOYMENT_TARGET - AZURE_CONTAINER_APPS_WORKLOAD_PROFILE - USE_CHAT_HISTORY_BROWSER + - USE_MEDIA_DESCRIBER_AZURE_CU secrets: - AZURE_SERVER_APP_SECRET - AZURE_CLIENT_APP_SECRET diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index db18bd10d3..1c8d2138ff 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -69,7 +69,7 @@ A [recent change](https://github.com/Azure-Samples/azure-search-openai-demo/pull You may want to remove documents from the index. For example, if you're using the sample data, you may want to remove the documents that are already in the index before adding your own. -To remove all documents, use `scripts/prepdocs.sh --removeall` or `scripts/prepdocs.ps1 --removeall`. +To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/prepdocs.ps1 --removeall`. You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `scripts/prepdocs.sh --remove` or `scripts/prepdocs.ps1 --remove`. diff --git a/docs/deploy_features.md b/docs/deploy_features.md index b1291a00b4..ea0c7e8288 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -7,6 +7,7 @@ You should typically enable these features before running `azd up`. Once you've * [Using GPT-4](#using-gpt-4) * [Using text-embedding-3 models](#using-text-embedding-3-models) * [Enabling GPT-4 Turbo with Vision](#enabling-gpt-4-turbo-with-vision) +* [Enabling media description with Azure Content Understanding](#enabling-media-description-with-azure-content-understanding) * [Enabling client-side chat history](#enabling-client-side-chat-history) * [Enabling persistent chat history with Azure Cosmos DB](#enabling-persistent-chat-history-with-azure-cosmos-db) * [Enabling language picker](#enabling-language-picker) @@ -149,8 +150,31 @@ If you have already deployed: ## Enabling GPT-4 Turbo with Vision +⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization). + This section covers the integration of GPT-4 Vision with Azure AI Search. Learn how to enhance your search capabilities with the power of image and text indexing, enabling advanced search functionalities over diverse document types. For a detailed guide on setup and usage, visit our [Enabling GPT-4 Turbo with Vision](gpt4v.md) page. +## Enabling media description with Azure Content Understanding + +⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization). +It is compatible with [GPT vision integration](./gpt4v.md), but the features provide similar functionality. 
+ +By default, if your documents contain image-like figures, the data ingestion process will ignore those figures, +so users will not be able to ask questions about them. + +You can optionally enable the description of media content using Azure Content Understanding. When enabled, the data ingestion process will send figures to Azure Content Understanding and replace the figure with the description in the indexed document. +To learn more about this process and compare it to the GPT-4 Vision integration, see [this guide](./data_ingestion.md#media-description). + +To enable media description with Azure Content Understanding, run: + +```shell +azd env set USE_MEDIA_DESCRIBER_AZURE_CU true +``` + +If you have already run `azd up`, you will need to run `azd provision` to create the new Content Understanding service. +If you have already indexed your documents and want to re-index them with the media descriptions, +first [remove the existing documents](./data_ingestion.md#removing-documents) and then [re-ingest the data](./data_ingestion.md#indexing-additional-documents). + ## Enabling client-side chat history This feature allows users to view the chat history of their conversation, stored in the browser using [IndexedDB](https://developer.mozilla.org/docs/Web/API/IndexedDB_API). That means the chat history will be available only on the device where the chat was initiated. To enable browser-stored chat history, run: @@ -215,6 +239,8 @@ azd env set USE_SPEECH_OUTPUT_BROWSER true ## Enabling Integrated Vectorization +⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md). + Azure AI Search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in-azure-ai-search/3960809). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies. To enable integrated vectorization with this sample: @@ -238,8 +264,6 @@ To enable integrated vectorization with this sample: 4. You can view the resources such as the indexer and skillset in Azure Portal and monitor the status of the vectorization process. -⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md). ## Enabling authentication By default, the deployed Azure web app will have no authentication or access restrictions enabled, meaning anyone with routable network access to the web app can chat with your indexed data. If you'd like to automatically set up authentication and user login as part of the `azd up` process, see [this guide](./login_and_acl.md).
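For reference, the flow behind this flag can be exercised on its own with the `ContentUnderstandingDescriber` added above in `app/backend/prepdocslib/mediadescriber.py`. A minimal sketch, assuming keyless auth and a placeholder endpoint (the standalone-script framing and endpoint value are illustrative assumptions, not part of this change):

```python
# Sketch: describe a single image with the new ContentUnderstandingDescriber.
# The endpoint is a placeholder; AzureKeyCredential is rejected, so keyless
# (token-based) auth is required, matching the checks added in filestrategy.py.
import asyncio

from azure.identity.aio import AzureDeveloperCliCredential

from prepdocslib.mediadescriber import ContentUnderstandingDescriber


async def describe_one(image_path: str) -> str:
    async with AzureDeveloperCliCredential() as credential:
        describer = ContentUnderstandingDescriber(
            endpoint="https://YOUR-CU-RESOURCE.cognitiveservices.azure.com",  # placeholder
            credential=credential,
        )
        await describer.create_analyzer()  # returns early (409) if the analyzer already exists
        with open(image_path, "rb") as image_file:
            return await describer.describe_image(image_file.read())


if __name__ == "__main__":
    print(asyncio.run(describe_one("figure.png")))
```

During ingestion, `DocumentAnalysisParser.figure_to_html` makes this same `describe_image` call once per figure detected by Document Intelligence and embeds the returned description in the page text as a `<figure>` block.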
diff --git a/infra/abbreviations.json b/infra/abbreviations.json index 5084711603..3673672a7e 100644 --- a/infra/abbreviations.json +++ b/infra/abbreviations.json @@ -29,6 +29,7 @@ "containerInstanceContainerGroups": "ci", "containerRegistryRegistries": "cr", "containerServiceManagedClusters": "aks-", + "cognitiveServicesContentUnderstanding": "cu-", "databricksWorkspaces": "dbw-", "dataFactoryFactories": "adf-", "dataLakeAnalyticsAccounts": "dla", diff --git a/infra/main.bicep b/infra/main.bicep index 0630519c29..5c181cd525 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -119,6 +119,9 @@ param computerVisionResourceGroupName string = '' // Set in main.parameters.json param computerVisionResourceGroupLocation string = '' // Set in main.parameters.json param computerVisionSkuName string // Set in main.parameters.json +param contentUnderstandingServiceName string = '' // Set in main.parameters.json +param contentUnderstandingResourceGroupName string = '' // Set in main.parameters.json + param chatGptModelName string = '' param chatGptDeploymentName string = '' param chatGptDeploymentVersion string = '' @@ -218,6 +221,9 @@ param useVectors bool = false @description('Use Built-in integrated Vectorization feature of AI Search to vectorize and ingest documents') param useIntegratedVectorization bool = false +@description('Use media description feature with Azure Content Understanding during ingestion') +param useMediaDescriberAzureCU bool = true + @description('Enable user document upload feature') param useUserUpload bool = false param useLocalPdfParser bool = false @@ -278,6 +284,10 @@ resource computerVisionResourceGroup 'Microsoft.Resources/resourceGroups@2021-04 name: !empty(computerVisionResourceGroupName) ? computerVisionResourceGroupName : resourceGroup.name } +resource contentUnderstandingResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(contentUnderstandingResourceGroupName)) { + name: !empty(contentUnderstandingResourceGroupName) ? contentUnderstandingResourceGroupName : resourceGroup.name +} + resource searchServiceResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(searchServiceResourceGroupName)) { name: !empty(searchServiceResourceGroupName) ? searchServiceResourceGroupName : resourceGroup.name } @@ -401,6 +411,8 @@ var appEnvVariables = { AZURE_DOCUMENTINTELLIGENCE_SERVICE: documentIntelligence.outputs.name USE_LOCAL_PDF_PARSER: useLocalPdfParser USE_LOCAL_HTML_PARSER: useLocalHtmlParser + USE_MEDIA_DESCRIBER_AZURE_CU: useMediaDescriberAzureCU + AZURE_CONTENTUNDERSTANDING_ENDPOINT: useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : '' RUNNING_IN_PRODUCTION: 'true' } @@ -634,6 +646,28 @@ module computerVision 'br/public:avm/res/cognitive-services/account:0.7.2' = if } } + +module contentUnderstanding 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useMediaDescriberAzureCU) { + name: 'content-understanding' + scope: contentUnderstandingResourceGroup + params: { + name: !empty(contentUnderstandingServiceName) + ? contentUnderstandingServiceName + : '${abbrs.cognitiveServicesContentUnderstanding}${resourceToken}' + kind: 'AIServices' + networkAcls: { + defaultAction: 'Allow' + } + customSubDomainName: !empty(contentUnderstandingServiceName) + ? 
contentUnderstandingServiceName + : '${abbrs.cognitiveServicesContentUnderstanding}${resourceToken}' + // Hard-coding to westus for now, due to limited availability and no overlap with Document Intelligence + location: 'westus' + tags: tags + sku: 'S0' + } +} + module speech 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useSpeechOutputAzure) { name: 'speech-service' scope: speechResourceGroup @@ -1160,6 +1194,7 @@ output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.re output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : '' output AZURE_VISION_ENDPOINT string = useGPT4V ? computerVision.outputs.endpoint : '' +output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : '' output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name output AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP string = documentIntelligenceResourceGroup.name diff --git a/infra/main.parameters.json b/infra/main.parameters.json index a7ba80373e..54541ca8a6 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -35,6 +35,12 @@ "computerVisionSkuName": { "value": "${AZURE_COMPUTER_VISION_SKU=S1}" }, + "contentUnderstandingServiceName": { + "value": "${AZURE_CONTENT_UNDERSTANDING_SERVICE}" + }, + "contentUnderstandingResourceGroupName": { + "value": "${AZURE_CONTENT_UNDERSTANDING_RESOURCE_GROUP}" + }, "documentIntelligenceServiceName": { "value": "${AZURE_DOCUMENTINTELLIGENCE_SERVICE}" }, @@ -289,6 +295,9 @@ }, "azureContainerAppsWorkloadProfile": { "value": "${AZURE_CONTAINER_APPS_WORKLOAD_PROFILE=Consumption}" + }, + "useMediaDescriberAzureCU": { + "value": "${USE_MEDIA_DESCRIBER_AZURE_CU=false}" } } } diff --git a/infra/private-endpoints.bicep b/infra/private-endpoints.bicep index 6053519cae..58fe14177e 100644 --- a/infra/private-endpoints.bicep +++ b/infra/private-endpoints.bicep @@ -84,7 +84,7 @@ module monitorDnsZones './core/networking/private-dns-zones.bicep' = [for monito var dnsZoneBlobIndex = filter(flatten(privateEndpointInfo), info => info.groupId == 'blob')[0].dnsZoneIndex // Azure Monitor Private Link Scope -// https://learn.microsoft.com/en-us/azure/azure-monitor/logs/private-link-security +// https://learn.microsoft.com/azure/azure-monitor/logs/private-link-security resource monitorPrivateLinkScope 'microsoft.insights/privateLinkScopes@2021-07-01-preview' = { name: 'mpls${resourceToken}' location: 'global' diff --git a/pyproject.toml b/pyproject.toml index 1e21fddfd4..9e12e399f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,5 +32,6 @@ module = [ "kiota.*", "azure.cognitiveservices.*", "azure.cognitiveservices.speech.*", + "pymupdf.*", ] ignore_missing_imports = true diff --git a/tests/conftest.py b/tests/conftest.py index cfc5326f31..157770b186 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -54,7 +54,7 @@ async def mock_search(self, *args, **kwargs): @pytest.fixture -def mock_compute_embeddings_call(monkeypatch): +def mock_azurehttp_calls(monkeypatch): def mock_post(*args, **kwargs): if kwargs.get("url").endswith("computervision/retrieval:vectorizeText"): return mock_computervision_response() @@ -327,7 +327,7 @@ async def client( mock_openai_embedding, mock_acs_search, mock_blob_container_client, - mock_compute_embeddings_call, + mock_azurehttp_calls, ): quart_app = app.create_app() @@ -346,7 +346,7 @@ async def client_with_expiring_token( mock_openai_embedding, mock_acs_search, mock_blob_container_client, - 
mock_compute_embeddings_call, + mock_azurehttp_calls, ): quart_app = app.create_app() diff --git a/tests/mocks.py b/tests/mocks.py index 13dc82ac6e..788823941c 100644 --- a/tests/mocks.py +++ b/tests/mocks.py @@ -151,12 +151,10 @@ def by_page(self): class MockResponse: - def __init__(self, text, status): - self.text = text + def __init__(self, status, text=None, headers=None): + self._text = text or "" self.status = status - - async def text(self): - return self._text + self.headers = headers or {} async def __aexit__(self, exc_type, exc, tb): pass @@ -164,8 +162,15 @@ async def __aexit__(self, exc_type, exc, tb): async def __aenter__(self): return self + async def text(self): + return self._text + async def json(self): - return json.loads(self.text) + return json.loads(self._text) + + def raise_for_status(self): + if self.status != 200: + raise Exception(f"HTTP status {self.status}") class MockEmbeddingsClient: diff --git a/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json b/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json new file mode 100644 index 0000000000..72bddc1dbe --- /dev/null +++ b/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json @@ -0,0 +1,50 @@ +[ + { + "text": "# Financial Market Analysis Report 2023\n\nAn In-Depth Exploration of Stocks, Cryptocurrencies, and Commodities\nPrepared by: Contoso Financial Analytics## Executive Summary\n\n. In this comprehensive report, Contoso Financial Analytics provides a\ndeep dive into the financial markets of 2023, focusing on the trends\nand fluctuations within stocks, cryptocurrencies, and commodities.\nOur analysis covers historical patterns, current market conditions, and\nfuture predictions, offering valuable insights for investors, analysts,\nand financial enthusiasts. This report leverages advanced data\nanalytics to present a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023)
This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category.", + "page_num": 0 + }, + { + "text": "a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023)
This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category. Stocks are shown in blue, Bonds in orange, Cryptocurrencies in gray, and Commodities in yellow. The chart visually indicates the proportion of each investment type within a portfolio.

CategoryColor
StocksBlue
BondsOrange
CryptocurrenciesGray
CommoditiesYellow
\n\n\nThe global financial market is a vast and intricate network of\nexchanges, instruments, and assets, ranging from traditional stocks\nand bonds to modern cryptocurrencies and commodities. Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.", + "page_num": 1 + }, + { + "text": " Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.\nThis section provides an overview of these segments and sets the\nstage for a detailed analysis## Stock Market Overview\n\n\n

5-Year Trend of the S&P 500 Index

This line chart shows the trend of the S&P 500 Index over a five-year period from 2018 to 2022. The index starts at around 2500 in 2018, rises steadily to a peak of about 4500 in 2021, and then declines slightly to approximately 4000 in 2022.

YearS&P 500 Index
20182500
20193000
20203500
20214500
20224000
\n\n\nThe stock market is often considered the economy's\nheartbeat, reflecting corporate health and investor\nsentiment. Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events.", + "page_num": 2 + }, + { + "text": " Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events. This\noverview examines the key factors that have influenced\nthe stock market's performance and what they indicate\nabout the economy's state## Cryptocurrency Market Dynamics\n\n\n
Price Fluctuations of Bitcoin and Ethereum (Last 12 Months)

This line graph shows two data series over the months from January to December. The blue line represents a data series that starts at around 32,500 in January, peaks in May at about 42,500, dips in July, and then rises steadily to approximately 47,500 in December. The orange line represents a much lower data series, remaining relatively flat throughout the year, starting at around 2,500 in January and ending slightly above 2,500 in December.

\n\n\nCryptocurrencies have emerged as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the ", + "page_num": 3 + }, + { + "text": "\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the factors driving their volatility and the\nimplications for the broader financial market.\n\n\n

The image shows a legend with two colored lines and labels. A blue line is labeled \"Bitconin\" and an orange line is labeled \"Ethereum.\" This legend is likely used to differentiate between two data sets or categories in a chart or graph, with \"Bitconin\" and \"Ethereum\" representing different entities or variables.
### Commodity Market Fluctuations\n\n\n
Price Changes of Oil, Gold, and Wheat
This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.", + "page_num": 4 + }, + { + "text": "
### Commodity Market Fluctuations\n\n\n
Price Changes of Oil, Gold, and Wheat
This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.\n\n\n\n\n\n\n\n\n\n\n\n\n
YearWheatGoldOil
20225%2%0%
20213%4%30%
20201%5%-20%
20192%3%10%
20180%1%15%
20174%2%5%
20163%6%-5%
20151%0%10%
20142%5%-10%
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the ", + "page_num": 4 + }, + { + "text": "20151%0%10%\n20142%5%-10%\n
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices. This section\ndelves into the trends and factors\naffecting commodity prices, including\ngeopolitical events, supply-chain\ndisruptions, and environmental factors,\nproviding a comprehensive view of this\ncrucial market segment.### Interplay Between Different Market Segments\n\n\n
\n\n
S&P 500NASDAQBitcoinEthereumOilGold
S&P 5001
NASDAQ0.951
Bitcoin0.30.41
Ethereum0.350.450.91
Oil0.60.650.20.251
Gold-0.", + "page_num": 5 + }, + { + "text": "### Interplay Between Different Market Segments\n\n\n
S&P 500NASDAQBitcoinEthereumOilGold
S&P 5001
NASDAQ0.951
Bitcoin0.30.41
Ethereum0.350.450.91
Oil0.60.650.20.251
Gold-0.2-0.15-0.1-0.05-0.31
\n\n\nFinancial markets are interconnected, with movements in one segment often influencing others. This\nsection examines the correlations between stock indices, cryptocurrency prices, and commodity prices,\nrevealing how changes in one market can have ripple effects across the financial ecosystem.### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial ", + "page_num": 6 + }, + { + "text": "### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial Markets

The image is a line graph titled \"On Financial Markets\" showing the trends of Interest Rates %, Inflation Data %, and GDP Growth % from 2018 to 2023. The graph has three lines representing each of these metrics over the years.

\n\n\n\n\n\n\n\n\n\n\n\n\n\n
YearInterest Rates %Inflation Data %GDP Growth %
2018223
201922.52
202011.5-4
20211.533
202223.52
20232.532.5
\n\n

The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.", + "page_num": 7 + }, + { + "text": "5

\n\n

The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.

\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial markets.\nThis section analyzes how these factors\nhave influenced stock, cryptocurrency,\nand commodity markets over recent\nyears, providing insights into the\ncomplex relationship between the\neconomy and financial market\nperformance.## Future Predictions and Trends\n\n\n
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)

Prices (2024 Indexed to 100)

\n

This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.

\n\n\n\n\n\n\n", + "page_num": 7 + }, + { + "text": "## Future Predictions and Trends\n\n\n
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)

Prices (2024 Indexed to 100)

\n

This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.

\n
YearOilBitcoinS&P 500
2024100100100
\n\n\n\n\n\n\n\n\n\n\n
YearOilBitcoinS&P 500
2024100100100
2025105110108
2026110115112
2027115120116
2028120125120
\n\n\nBased on historical data, current trends,\nand economic indicators, this section\npresents predictions for the future of\nfinancial markets. We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\n", + "page_num": 8 + }, + { + "text": " We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold.## Conclusions\n\n. In conclusion, this report has traversed the multifaceted landscape of\nfinancial markets, shedding light on the intricate patterns and\ninterdependencies that define their behavior. From the volatility of\ncryptocurrencies to the steadiness of commodities, each segment\ntells a part of the story of our global economy. As Contoso Financial\nAnalytics, we are committed to providing our clients with the most\ncomprehensive and nuanced analysis, empowering them to make\ninformed financial decisions in an ever-evolving market.", + "page_num": 8 + } +] \ No newline at end of file diff --git a/tests/test-data/Financial Market Analysis Report 2023.pdf b/tests/test-data/Financial Market Analysis Report 2023.pdf new file mode 100644 index 0000000000..eef17aad75 Binary files /dev/null and b/tests/test-data/Financial Market Analysis Report 2023.pdf differ diff --git a/tests/test-data/Financial Market Analysis Report 2023_page2_figure.png b/tests/test-data/Financial Market Analysis Report 2023_page2_figure.png new file mode 100644 index 0000000000..b5dd8ae577 Binary files /dev/null and b/tests/test-data/Financial Market Analysis Report 2023_page2_figure.png differ diff --git a/tests/test-data/Simple Figure.pdf b/tests/test-data/Simple Figure.pdf new file mode 100644 index 0000000000..416f9fd61b Binary files /dev/null and b/tests/test-data/Simple Figure.pdf differ diff --git a/tests/test-data/Simple Figure_content.txt b/tests/test-data/Simple Figure_content.txt new file mode 100644 index 0000000000..3e8db25dbb --- /dev/null +++ b/tests/test-data/Simple Figure_content.txt @@ -0,0 +1,13 @@ +# Simple Figure + +This text is before the figure and NOT part of it. + + +
<figure> + +9 + +</figure>
+ + +This is text after the figure that's not part of it. diff --git a/tests/test-data/Simple Table.pdf b/tests/test-data/Simple Table.pdf new file mode 100644 index 0000000000..0a5ae23e90 Binary files /dev/null and b/tests/test-data/Simple Table.pdf differ diff --git a/tests/test-data/Simple Table_content.txt b/tests/test-data/Simple Table_content.txt new file mode 100644 index 0000000000..cca5a0ed77 --- /dev/null +++ b/tests/test-data/Simple Table_content.txt @@ -0,0 +1,17 @@ +# Simple HTML Table + + + + + + + + + + + + + + + +
<table> +<tr><th>Header 1</th><th>Header 2</th></tr> +<tr><td>Cell 1</td><td>Cell 2</td></tr> +<tr><td>Cell 3</td><td>Cell 4</td></tr> +</table>
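These fixtures exercise the splitting change in `textsplitter.py` above: section boundaries now avoid cutting inside an unclosed `<figure>` block, just as they previously did for `<table>`. A small illustrative check, assuming the splitter's existing `split_pages` API (the sample text here is invented):

```python
# Sketch: confirm chunks keep <figure>...</figure> blocks together where possible.
from prepdocslib.page import Page
from prepdocslib.textsplitter import SentenceTextSplitter

splitter = SentenceTextSplitter(max_tokens_per_section=500)
text = "Intro sentence. " * 150 + "<figure><figcaption>Chart</figcaption></figure> " + "Outro sentence. " * 150
for chunk in splitter.split_pages([Page(page_num=0, offset=0, text=text)]):
    balanced = chunk.text.count("<figure") == chunk.text.count("</figure")
    print(f"page {chunk.page_num}: {len(chunk.text)} chars, figure tags balanced: {balanced}")
```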
diff --git a/tests/test-data/pages_with_figures.json b/tests/test-data/pages_with_figures.json
new file mode 100644
index 0000000000..0b157c7f0c
--- /dev/null
+++ b/tests/test-data/pages_with_figures.json
@@ -0,0 +1 @@
+[{"page_num": 0, "offset": 0, "text": "# Financial Market Analysis Report 2023\n\nAn In-Depth Exploration of Stocks, Cryptocurrencies, and Commodities\nPrepared by: Contoso Financial Analytics"}, {"page_num": 1, "offset": 150, "text": "## Executive Summary\n\n. In this comprehensive report, Contoso Financial Analytics provides a\ndeep dive into the financial markets of 2023, focusing on the trends\nand fluctuations within stocks, cryptocurrencies, and commodities.\nOur analysis covers historical patterns, current market conditions, and\nfuture predictions, offering valuable insights for investors, analysts,\nand financial enthusiasts. This report leverages advanced data\nanalytics to present a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories"}, {"page_num": 2, "offset": 716, "text": "## Introduction to Financial Markets\n\n\n<figure><figcaption>Global Financial Market Distribution (2023)</figcaption>This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category. Stocks are shown in blue, Bonds in orange, Cryptocurrencies in gray, and Commodities in yellow. The chart visually indicates the proportion of each investment type within a portfolio.<table><tr><th>Category</th><th>Color</th></tr><tr><td>Stocks</td><td>Blue</td></tr><tr><td>Bonds</td><td>Orange</td></tr><tr><td>Cryptocurrencies</td><td>Gray</td></tr><tr><td>Commodities</td><td>Yellow</td></tr></table></figure>\n\n\nThe global financial market is a vast and intricate network of\nexchanges, instruments, and assets, ranging from traditional stocks\nand bonds to modern cryptocurrencies and commodities. Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.\nThis section provides an overview of these segments and sets the\nstage for a detailed analysis"}, {"page_num": 3, "offset": 1897, "text": "## Stock Market Overview\n\n\n<figure><figcaption>5-Year Trend of the S&P 500 Index</figcaption>This line chart shows the trend of the S&P 500 Index over a five-year period from 2018 to 2022. The index starts at around 2500 in 2018, rises steadily to a peak of about 4500 in 2021, and then declines slightly to approximately 4000 in 2022.<table><tr><th>Year</th><th>S&P 500 Index</th></tr><tr><td>2018</td><td>2500</td></tr><tr><td>2019</td><td>3000</td></tr><tr><td>2020</td><td>3500</td></tr><tr><td>2021</td><td>4500</td></tr><tr><td>2022</td><td>4000</td></tr></table></figure>\n\n\nThe stock market is often considered the economy's\nheartbeat, reflecting corporate health and investor\nsentiment. Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events. This\noverview examines the key factors that have influenced\nthe stock market's performance and what they indicate\nabout the economy's state"}, {"page_num": 4, "offset": 2937, "text": "## Cryptocurrency Market Dynamics\n\n\n<figure><figcaption>Price Fluctuations of Bitcoin and Ethereum (Last 12 Months)</figcaption>This line graph shows two data series over the months from January to December. The blue line represents a data series that starts at around 32,500 in January, peaks in May at about 42,500, dips in July, and then rises steadily to approximately 47,500 in December. The orange line represents a much lower data series, remaining relatively flat throughout the year, starting at around 2,500 in January and ending slightly above 2,500 in December.</figure>\n\n\nCryptocurrencies have emerged as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the factors driving their volatility and the\nimplications for the broader financial market.\n\n\n<figure><figcaption></figcaption>The image shows a legend with two colored lines and labels. A blue line is labeled \"Bitconin\" and an orange line is labeled \"Ethereum.\" This legend is likely used to differentiate between two data sets or categories in a chart or graph, with \"Bitconin\" and \"Ethereum\" representing different entities or variables.</figure>"}, {"page_num": 5, "offset": 4243, "text": "### Commodity Market Fluctuations\n\n\n<figure><figcaption>Price Changes of Oil, Gold, and Wheat</figcaption>This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.\n\n\n\n\n\n\n\n\n\n\n\n\n<table><tr><th>Year</th><th>Wheat</th><th>Gold</th><th>Oil</th></tr><tr><td>2022</td><td>5%</td><td>2%</td><td>0%</td></tr><tr><td>2021</td><td>3%</td><td>4%</td><td>30%</td></tr><tr><td>2020</td><td>1%</td><td>5%</td><td>-20%</td></tr><tr><td>2019</td><td>2%</td><td>3%</td><td>10%</td></tr><tr><td>2018</td><td>0%</td><td>1%</td><td>15%</td></tr><tr><td>2017</td><td>4%</td><td>2%</td><td>5%</td></tr><tr><td>2016</td><td>3%</td><td>6%</td><td>-5%</td></tr><tr><td>2015</td><td>1%</td><td>0%</td><td>10%</td></tr><tr><td>2014</td><td>2%</td><td>5%</td><td>-10%</td></tr></table></figure>\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices. This section\ndelves into the trends and factors\naffecting commodity prices, including\ngeopolitical events, supply-chain\ndisruptions, and environmental factors,\nproviding a comprehensive view of this\ncrucial market segment."}, {"page_num": 6, "offset": 5673, "text": "### Interplay Between Different Market Segments\n\n\n<table><tr><th></th><th>S&P 500</th><th>NASDAQ</th><th>Bitcoin</th><th>Ethereum</th><th>Oil</th><th>Gold</th></tr><tr><th>S&P 500</th><td>1</td><td></td><td></td><td></td><td></td><td></td></tr><tr><th>NASDAQ</th><td>0.95</td><td>1</td><td></td><td></td><td></td><td></td></tr><tr><th>Bitcoin</th><td>0.3</td><td>0.4</td><td>1</td><td></td><td></td><td></td></tr><tr><th>Ethereum</th><td>0.35</td><td>0.45</td><td>0.9</td><td>1</td><td></td><td></td></tr><tr><th>Oil</th><td>0.6</td><td>0.65</td><td>0.2</td><td>0.25</td><td>1</td><td></td></tr><tr><th>Gold</th><td>-0.2</td><td>-0.15</td><td>-0.1</td><td>-0.05</td><td>-0.3</td><td>1</td></tr></table>\n\n\nFinancial markets are interconnected, with movements in one segment often influencing others. This\nsection examines the correlations between stock indices, cryptocurrency prices, and commodity prices,\nrevealing how changes in one market can have ripple effects across the financial ecosystem."}, {"page_num": 7, "offset": 6695, "text": "### Impact of Macroeconomic Factors\n\n\n<figure><figcaption>Impact of Interest Rates, Inflation, and GDP Growth on Financial Markets</figcaption>The image is a line graph titled \"On Financial Markets\" showing the trends of Interest Rates %, Inflation Data %, and GDP Growth % from 2018 to 2023. The graph has three lines representing each of these metrics over the years.\n\n\n\n\n\n\n\n\n\n\n\n\n\n<table><tr><th>Year</th><th>Interest Rates %</th><th>Inflation Data %</th><th>GDP Growth %</th></tr><tr><td>2018</td><td>2</td><td>2</td><td>3</td></tr><tr><td>2019</td><td>2</td><td>2.5</td><td>2</td></tr><tr><td>2020</td><td>1</td><td>1.5</td><td>-4</td></tr><tr><td>2021</td><td>1.5</td><td>3</td><td>3</td></tr><tr><td>2022</td><td>2</td><td>3.5</td><td>2</td></tr><tr><td>2023</td><td>2.5</td><td>3</td><td>2.5</td></tr></table>\n\nThe graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.</figure>\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial markets.\nThis section analyzes how these factors\nhave influenced stock, cryptocurrency,\nand commodity markets over recent\nyears, providing insights into the\ncomplex relationship between the\neconomy and financial market\nperformance."}, {"page_num": 8, "offset": 8102, "text": "## Future Predictions and Trends\n\n\n<figure><figcaption>Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)</figcaption>Prices (2024 Indexed to 100)\nThis bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.\n\n\n\n\n\n\n\n\n\n\n\n<table><tr><th>Year</th><th>Oil</th><th>Bitcoin</th><th>S&P 500</th></tr><tr><td>2024</td><td>100</td><td>100</td><td>100</td></tr><tr><td>2025</td><td>105</td><td>110</td><td>108</td></tr><tr><td>2026</td><td>110</td><td>115</td><td>112</td></tr><tr><td>2027</td><td>115</td><td>120</td><td>116</td></tr><tr><td>2028</td><td>120</td><td>125</td><td>120</td></tr></table></figure>\n\n\nBased on historical data, current trends,\nand economic indicators, this section\npresents predictions for the future of\nfinancial markets. We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold."}, {"page_num": 9, "offset": 9281, "text": "## Conclusions\n\n. In conclusion, this report has traversed the multifaceted landscape of\nfinancial markets, shedding light on the intricate patterns and\ninterdependencies that define their behavior. From the volatility of\ncryptocurrencies to the steadiness of commodities, each segment\ntells a part of the story of our global economy. As Contoso Financial\nAnalytics, we are committed to providing our clients with the most\ncomprehensive and nuanced analysis, empowering them to make\ninformed financial decisions in an ever-evolving market."}]
diff --git a/tests/test_app.py b/tests/test_app.py
index 9c20b9421a..580e31e5d1 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -15,7 +15,7 @@
 def fake_response(http_code):
     return Response(http_code, request=Request(method="get", url="https://foo.bar/"))
 
-# See https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter
+# See https://learn.microsoft.com/azure/ai-services/openai/concepts/content-filter
 filtered_response = BadRequestError(
     message="The response was filtered",
     body={
diff --git a/tests/test_mediadescriber.py b/tests/test_mediadescriber.py
new file mode 100644
index 0000000000..117a186281
--- /dev/null
+++ b/tests/test_mediadescriber.py
@@ -0,0 +1,135 @@
+import json
+import logging
+
+import aiohttp
+import pytest
+
+from prepdocslib.mediadescriber import ContentUnderstandingDescriber
+
+from .mocks import MockAzureCredential, MockResponse
+
+
+@pytest.mark.asyncio
+async def test_contentunderstanding_analyze(monkeypatch, caplog):
+
+    def mock_post(*args, **kwargs):
+        if kwargs.get("url").find("badanalyzer") > 0:
+            return MockResponse(
+                status=200,
+                headers={
+                    "Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/badanalyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
+                },
+            )
+        if kwargs.get("url").endswith("contentunderstanding/analyzers/image_analyzer:analyze"):
+            return MockResponse(
+                status=200,
+                headers={
+                    "Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview"
+                },
+            )
+        else:
+            raise Exception("Unexpected URL for mock call to ClientSession.post()")
+
+    monkeypatch.setattr(aiohttp.ClientSession, "post", mock_post)
+
+    num_poll_calls = 0
+
+    def mock_get(self, url, **kwargs):
+        if url.endswith(
+            "contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview"
+        ):
+            return MockResponse(
+                status=200,
+                text=json.dumps(
+                    {
+                        "id": "f8c4c1c0-71c3-410c-a723-d223e0a84a88",
+                        "status": "Succeeded",
+                        "result": {
+                            "analyzerId": "image_analyzer",
+                            "apiVersion": "2024-12-01-preview",
+                            "createdAt": "2024-12-05T17:33:04Z",
+                            "warnings": [],
+                            "contents": [
+                                {
+                                    "markdown": "![image](image)\n",
+                                    "fields": {
+                                        "Description": {
+                                            "type": "string",
+                                            "valueString": "The bar chart titled 'Prices (2024 Indexed to 100)' compares the indexed prices of Oil, Bitcoin, and S&P 500 from 2024 to 2028. Each year is represented by a set of three horizontal bars, with Oil in gray, Bitcoin in orange, and S&P 500 in blue. The index is based on the year 2024, where all values start at 100. Over the years, Bitcoin shows the most significant increase, reaching around 130 by 2028, while Oil and S&P 500 show moderate increases.\n\n<table><tr><th>Year</th><th>Oil</th><th>Bitcoin</th><th>S&P 500</th></tr><tr><td>2024</td><td>100</td><td>100</td><td>100</td></tr><tr><td>2025</td><td>105</td><td>110</td><td>108</td></tr><tr><td>2026</td><td>110</td><td>115</td><td>112</td></tr><tr><td>2027</td><td>115</td><td>120</td><td>116</td></tr><tr><td>2028</td><td>120</td><td>130</td><td>120</td></tr></table>",
+                                        }
+                                    },
+                                    "kind": "document",
+                                    "startPageNumber": 1,
+                                    "endPageNumber": 1,
+                                    "unit": "pixel",
+                                    "pages": [{"pageNumber": 1}],
+                                }
+                            ],
+                        },
+                    }
+                ),
+            )
+        elif url.endswith(
+            "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/badanalyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
+        ):
+            return MockResponse(status=200, text=json.dumps({"status": "Failed"}))
+        elif url.endswith(
+            "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
+        ):
+            nonlocal num_poll_calls
+            num_poll_calls += 1
+            if num_poll_calls == 1:
+                return MockResponse(status=200, text=json.dumps({"status": "Running"}))
+            elif num_poll_calls > 1:
+                return MockResponse(status=200, text=json.dumps({"status": "Succeeded"}))
+        else:
+            raise Exception("Unexpected URL for mock call to ClientSession.get()")
+
+    monkeypatch.setattr(aiohttp.ClientSession, "get", mock_get)
+
+    def mock_put(self, *args, **kwargs):
+        if kwargs.get("url").find("existinganalyzer") > 0:
+            return MockResponse(status=409)
+        if kwargs.get("url").find("wrongservicename") > 0:
+            return MockResponse(
+                status=404,
+                text=json.dumps(
+                    {"error": {"code": "ResourceNotFound", "message": "The specified resource does not exist."}}
+                ),
+            )
+        elif kwargs.get("url").endswith("contentunderstanding/analyzers/image_analyzer"):
+            return MockResponse(
+                status=201,
+                headers={
+                    "Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
+                },
+            )
+        else:
+            raise Exception("Unexpected URL for mock call to ClientSession.put()")
+
+    monkeypatch.setattr(aiohttp.ClientSession, "put", mock_put)
+
+    describer = ContentUnderstandingDescriber(
+        endpoint="https://testcontentunderstanding.cognitiveservices.azure.com", credential=MockAzureCredential()
+    )
+    await describer.create_analyzer()
+    await describer.describe_image(b"imagebytes")
+
+    describer_wrong_endpoint = ContentUnderstandingDescriber(
+        endpoint="https://wrongservicename.cognitiveservices.azure.com", credential=MockAzureCredential()
+    )
+    with pytest.raises(Exception):
+        await describer_wrong_endpoint.create_analyzer()
+
+    describer_existing_analyzer = ContentUnderstandingDescriber(
+        endpoint="https://existinganalyzer.cognitiveservices.azure.com", credential=MockAzureCredential()
+    )
+    with caplog.at_level(logging.INFO):
+        await describer_existing_analyzer.create_analyzer()
+    assert "Analyzer 'image_analyzer' already exists." in caplog.text
+
+    describer_bad_analyze = ContentUnderstandingDescriber(
+        endpoint="https://badanalyzer.cognitiveservices.azure.com", credential=MockAzureCredential()
+    )
+    with pytest.raises(Exception):
+        await describer_bad_analyze.describe_image(b"imagebytes")
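For orientation, the flow these mocks exercise is: PUT to create an analyzer, POST an image to analyzers/image_analyzer:analyze, then GET the URL returned in the Operation-Location header until the operation succeeds or fails. A minimal illustrative sketch of that pattern follows; the helper name and header handling are assumptions, not the prepdocslib.mediadescriber API:

import asyncio
import json

import aiohttp


async def describe_image_sketch(endpoint: str, token: str, image_bytes: bytes) -> dict:
    # Submit the image for analysis; the service replies with an
    # Operation-Location header pointing at the result resource.
    analyze_url = f"{endpoint}/contentunderstanding/analyzers/image_analyzer:analyze?api-version=2024-12-01-preview"
    async with aiohttp.ClientSession(headers={"Authorization": f"Bearer {token}"}) as session:
        async with session.post(url=analyze_url, data=image_bytes) as response:
            response.raise_for_status()
            poll_url = response.headers["Operation-Location"]
        # Poll until the operation leaves the "Running" state.
        while True:
            async with session.get(poll_url) as poll_response:
                result = json.loads(await poll_response.text())
            if result["status"] == "Succeeded":
                return result.get("result", {})
            if result["status"] == "Failed":
                raise RuntimeError("Content Understanding analysis failed")
            await asyncio.sleep(1)  # back off between polls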
" + "" + "" + "
Header 1Header 2
Cell 1Cell 2
" + ) + + result_html = DocumentAnalysisParser.table_to_html(table) + assert result_html == expected_html + + +def test_table_to_html_with_spans(): + table = DocumentTable( + row_count=2, + column_count=2, + cells=[ + DocumentTableCell(row_index=0, column_index=0, content="Header 1", kind="columnHeader", column_span=2), + DocumentTableCell(row_index=1, column_index=0, content="Cell 1", row_span=2), + DocumentTableCell(row_index=1, column_index=1, content="Cell 2"), + ], + ) + + expected_html = ( + "
" + "" + "" + "
Header 1
Cell 1Cell 2
" + ) + + result_html = DocumentAnalysisParser.table_to_html(table) + assert result_html == expected_html + + +@pytest.mark.asyncio +async def test_figure_to_html_without_bounding_regions(): + doc = MagicMock() + figure = DocumentFigure(id="1", caption=None, bounding_regions=None) + cu_describer = MagicMock() + + result_html = await DocumentAnalysisParser.figure_to_html(doc, figure, cu_describer) + expected_html = "
" + + assert result_html == expected_html + + +@pytest.mark.asyncio +async def test_figure_to_html_with_bounding_regions(monkeypatch, caplog): + doc = MagicMock() + figure = DocumentFigure( + id="1", + caption=DocumentCaption(content="Figure 1"), + bounding_regions=[ + BoundingRegion(page_number=1, polygon=[1.4703, 2.8371, 5.5409, 2.8415, 5.5381, 6.6022, 1.4681, 6.5978]), + BoundingRegion(page_number=2, polygon=[1.4703, 2.8371, 5.5409, 2.8415, 5.5381, 6.6022, 1.4681, 6.5978]), + ], + ) + cu_describer = AsyncMock() + + async def mock_describe_image(image_bytes): + assert image_bytes == b"image_bytes" + return "Described Image" + + monkeypatch.setattr(cu_describer, "describe_image", mock_describe_image) + + def mock_crop_image_from_pdf_page(doc, page_number, bounding_box) -> bytes: + assert page_number == 0 + assert bounding_box == (1.4703, 2.8371, 5.5381, 6.6022) + return b"image_bytes" + + monkeypatch.setattr(DocumentAnalysisParser, "crop_image_from_pdf_page", mock_crop_image_from_pdf_page) + + with caplog.at_level(logging.WARNING): + result_html = await DocumentAnalysisParser.figure_to_html(doc, figure, cu_describer) + expected_html = "
Figure 1
Described Image
" + assert result_html == expected_html + assert "Figure 1 has more than one bounding region, using the first one" in caplog.text + + +@pytest.mark.asyncio +async def test_parse_simple(monkeypatch): + mock_poller = MagicMock() + + async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): + return mock_poller + + async def mock_poller_result(): + return AnalyzeResult( + content="Page content", + pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])], + tables=[], + figures=[], + ) + + monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) + monkeypatch.setattr(mock_poller, "result", mock_poller_result) + + parser = DocumentAnalysisParser( + endpoint="https://example.com", credential=MockAzureCredential(), use_content_understanding=False + ) + content = io.BytesIO(b"pdf content bytes") + content.name = "test.pdf" + pages = [page async for page in parser.parse(content)] + + assert len(pages) == 1 + assert pages[0].page_num == 0 + assert pages[0].offset == 0 + assert pages[0].text == "Page content" + + +@pytest.mark.asyncio +async def test_parse_doc_with_tables(monkeypatch): + mock_poller = MagicMock() + + async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): + return mock_poller + + async def mock_poller_result(): + content = open(TEST_DATA_DIR / "Simple Table_content.txt").read() + return AnalyzeResult( + content=content, + pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=172)])], + tables=[ + DocumentTable( + bounding_regions=[ + BoundingRegion( + page_number=1, polygon=[0.4394, 1.0459, 4.2509, 1.0449, 4.2524, 1.9423, 0.4408, 1.9432] + ) + ], + row_count=3, + column_count=2, + cells=[ + DocumentTableCell( + row_index=0, + column_index=0, + content="Header 1", + kind="columnHeader", + spans=[DocumentSpan(offset=39, length=8)], + ), + DocumentTableCell( + row_index=0, + column_index=1, + content="Header 2", + kind="columnHeader", + spans=[DocumentSpan(offset=57, length=8)], + ), + DocumentTableCell( + row_index=1, column_index=0, content="Cell 1", spans=[DocumentSpan(offset=86, length=6)] + ), + DocumentTableCell( + row_index=1, column_index=1, content="Cell 2", spans=[DocumentSpan(offset=102, length=6)] + ), + DocumentTableCell( + row_index=2, column_index=0, content="Cell 3", spans=[DocumentSpan(offset=129, length=6)] + ), + DocumentTableCell( + row_index=2, column_index=1, content="Cell 4", spans=[DocumentSpan(offset=145, length=6)] + ), + ], + spans=[DocumentSpan(offset=22, length=149)], + ) + ], + figures=[], + ) + + monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) + monkeypatch.setattr(mock_poller, "result", mock_poller_result) + + parser = DocumentAnalysisParser( + endpoint="https://example.com", credential=MockAzureCredential(), use_content_understanding=False + ) + with open(TEST_DATA_DIR / "Simple Table.pdf", "rb") as f: + content = io.BytesIO(f.read()) + content.name = "Simple Table.pdf" + pages = [page async for page in parser.parse(content)] + + assert len(pages) == 1 + assert pages[0].page_num == 0 + assert pages[0].offset == 0 + assert ( + pages[0].text + == "# Simple HTML Table\n\n\n
+
+
+@pytest.mark.asyncio
+async def test_parse_simple(monkeypatch):
+    mock_poller = MagicMock()
+
+    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+        return mock_poller
+
+    async def mock_poller_result():
+        return AnalyzeResult(
+            content="Page content",
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
+            tables=[],
+            figures=[],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    parser = DocumentAnalysisParser(
+        endpoint="https://example.com", credential=MockAzureCredential(), use_content_understanding=False
+    )
+    content = io.BytesIO(b"pdf content bytes")
+    content.name = "test.pdf"
+    pages = [page async for page in parser.parse(content)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert pages[0].text == "Page content"
+
+
+@pytest.mark.asyncio
+async def test_parse_doc_with_tables(monkeypatch):
+    mock_poller = MagicMock()
+
+    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+        return mock_poller
+
+    async def mock_poller_result():
+        content = open(TEST_DATA_DIR / "Simple Table_content.txt").read()
+        return AnalyzeResult(
+            content=content,
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=172)])],
+            tables=[
+                DocumentTable(
+                    bounding_regions=[
+                        BoundingRegion(
+                            page_number=1, polygon=[0.4394, 1.0459, 4.2509, 1.0449, 4.2524, 1.9423, 0.4408, 1.9432]
+                        )
+                    ],
+                    row_count=3,
+                    column_count=2,
+                    cells=[
+                        DocumentTableCell(
+                            row_index=0,
+                            column_index=0,
+                            content="Header 1",
+                            kind="columnHeader",
+                            spans=[DocumentSpan(offset=39, length=8)],
+                        ),
+                        DocumentTableCell(
+                            row_index=0,
+                            column_index=1,
+                            content="Header 2",
+                            kind="columnHeader",
+                            spans=[DocumentSpan(offset=57, length=8)],
+                        ),
+                        DocumentTableCell(
+                            row_index=1, column_index=0, content="Cell 1", spans=[DocumentSpan(offset=86, length=6)]
+                        ),
+                        DocumentTableCell(
+                            row_index=1, column_index=1, content="Cell 2", spans=[DocumentSpan(offset=102, length=6)]
+                        ),
+                        DocumentTableCell(
+                            row_index=2, column_index=0, content="Cell 3", spans=[DocumentSpan(offset=129, length=6)]
+                        ),
+                        DocumentTableCell(
+                            row_index=2, column_index=1, content="Cell 4", spans=[DocumentSpan(offset=145, length=6)]
+                        ),
+                    ],
+                    spans=[DocumentSpan(offset=22, length=149)],
+                )
+            ],
+            figures=[],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    parser = DocumentAnalysisParser(
+        endpoint="https://example.com", credential=MockAzureCredential(), use_content_understanding=False
+    )
+    with open(TEST_DATA_DIR / "Simple Table.pdf", "rb") as f:
+        content = io.BytesIO(f.read())
+        content.name = "Simple Table.pdf"
+        pages = [page async for page in parser.parse(content)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert (
+        pages[0].text
+        == "# Simple HTML Table\n\n\n<table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table>"
+    )
+
+
+@pytest.mark.asyncio
+async def test_parse_doc_with_figures(monkeypatch):
+    mock_poller = MagicMock()
+
+    async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs):
+        return mock_poller
+
+    async def mock_poller_result():
+        content = open(TEST_DATA_DIR / "Simple Figure_content.txt").read()
+        return AnalyzeResult(
+            content=content,
+            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=148)])],
+            figures=[
+                DocumentFigure(
+                    id="1.1",
+                    caption=DocumentCaption(content="Figure 1"),
+                    bounding_regions=[
+                        BoundingRegion(
+                            page_number=1, polygon=[0.4295, 1.3072, 1.7071, 1.3076, 1.7067, 2.6088, 0.4291, 2.6085]
+                        )
+                    ],
+                    spans=[DocumentSpan(offset=70, length=22)],
+                )
+            ],
+        )
+
+    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
+    monkeypatch.setattr(mock_poller, "result", mock_poller_result)
+
+    async def mock_describe_image(self, image_bytes):
+        return "Pie chart"
+
+    monkeypatch.setattr(ContentUnderstandingDescriber, "describe_image", mock_describe_image)
+
+    parser = DocumentAnalysisParser(
+        endpoint="https://example.com",
+        credential=MockAzureCredential(),
+        use_content_understanding=True,
+        content_understanding_endpoint="https://example.com",
+    )
+
+    with open(TEST_DATA_DIR / "Simple Figure.pdf", "rb") as f:
+        content = io.BytesIO(f.read())
+        content.name = "Simple Figure.pdf"
+
+    pages = [page async for page in parser.parse(content)]
+
+    assert len(pages) == 1
+    assert pages[0].page_num == 0
+    assert pages[0].offset == 0
+    assert (
+        pages[0].text
+        == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n<figure><figcaption>Figure 1</figcaption>Pie chart</figure>\n\n\nThis is text after the figure that's not part of it."
+    )
diff --git a/tests/test_prepdocslib_textsplitter.py b/tests/test_prepdocslib_textsplitter.py
index 87049d4dad..c71e15c826 100644
--- a/tests/test_prepdocslib_textsplitter.py
+++ b/tests/test_prepdocslib_textsplitter.py
@@ -17,13 +17,13 @@
 
 
 def test_sentencetextsplitter_split_empty_pages():
-    t = SentenceTextSplitter(has_image_embeddings=False)
+    t = SentenceTextSplitter()
 
     assert list(t.split_pages([])) == []
 
 
 def test_sentencetextsplitter_split_small_pages():
-    t = SentenceTextSplitter(has_image_embeddings=False)
+    t = SentenceTextSplitter()
 
     split_pages = list(t.split_pages(pages=[Page(page_num=0, offset=0, text="Not a large page")]))
     assert len(split_pages) == 1
@@ -33,7 +33,7 @@
 
 @pytest.mark.asyncio
 async def test_sentencetextsplitter_list_parse_and_split(tmp_path, snapshot):
-    text_splitter = SentenceTextSplitter(has_image_embeddings=False)
+    text_splitter = SentenceTextSplitter()
     pdf_parser = LocalPdfParser()
     for pdf in Path("data").glob("*.pdf"):
         shutil.copy(str(pdf.absolute()), tmp_path)
@@ -98,7 +98,7 @@
 
 @pytest.mark.asyncio
 async def test_sentencetextsplitter_multilang(test_doc, tmp_path):
-    text_splitter = SentenceTextSplitter(has_image_embeddings=False)
+    text_splitter = SentenceTextSplitter()
     bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
     pdf_parser = LocalPdfParser()
 
@@ -133,7 +133,7 @@
 
 
 def test_split_tables():
-    t = SentenceTextSplitter(has_image_embeddings=False)
+    t = SentenceTextSplitter()
 
    test_text_without_table = """Contoso Electronics is a leader in the aerospace industry, providing advanced electronic
    components for both commercial and military aircraft. We specialize in creating cutting-
@@ -166,3 +166,23 @@ def test_split_tables():
    assert "