diff --git a/.azdo/pipelines/azure-dev.yml b/.azdo/pipelines/azure-dev.yml
index 3495e06a45..8c61d4acaf 100644
--- a/.azdo/pipelines/azure-dev.yml
+++ b/.azdo/pipelines/azure-dev.yml
@@ -120,6 +120,7 @@ steps:
DEPLOYMENT_TARGET: $(DEPLOYMENT_TARGET)
AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: $(AZURE_CONTAINER_APPS_WORKLOAD_PROFILE)
USE_CHAT_HISTORY_BROWSER: $(USE_CHAT_HISTORY_BROWSER)
+ USE_MEDIA_DESCRIBER_AZURE_CU: $(USE_MEDIA_DESCRIBER_AZURE_CU)
- task: AzureCLI@2
displayName: Deploy Application
inputs:
diff --git a/.github/workflows/azure-dev.yml b/.github/workflows/azure-dev.yml
index 798e589413..860a13cbfa 100644
--- a/.github/workflows/azure-dev.yml
+++ b/.github/workflows/azure-dev.yml
@@ -13,7 +13,7 @@ on:
# To configure required secrets for connecting to Azure, simply run `azd pipeline config`
# Set up permissions for deploying with secretless Azure federated credentials
-# https://learn.microsoft.com/en-us/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication
+# https://learn.microsoft.com/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication
permissions:
id-token: write
contents: read
@@ -103,6 +103,7 @@ jobs:
DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }}
AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }}
USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }}
+ USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }}
steps:
- name: Checkout
uses: actions/checkout@v4
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4b54675d56..9fa92346c9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -122,6 +122,8 @@ If you followed the steps above to install the pre-commit hooks, then you can ju
When adding new azd environment variables, please remember to update:
+1. [main.parameters.json](./infra/main.parameters.json)
+1. [appEnvVariables in main.bicep](./infra/main.bicep)
1. App Service's [azure.yaml](./azure.yaml)
1. [ADO pipeline](.azdo/pipelines/azure-dev.yml).
1. [Github workflows](.github/workflows/azure-dev.yml)
diff --git a/README.md b/README.md
index 1fb97c6784..49d5ea84b3 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,9 @@ However, you can try the [Azure pricing calculator](https://azure.com/e/e3490de2
- Azure AI Document Intelligence: SO (Standard) tier using pre-built layout. Pricing per document page, sample documents have 261 pages total. [Pricing](https://azure.microsoft.com/pricing/details/form-recognizer/)
- Azure AI Search: Basic tier, 1 replica, free level of semantic search. Pricing per hour. [Pricing](https://azure.microsoft.com/pricing/details/search/)
- Azure Blob Storage: Standard tier with ZRS (Zone-redundant storage). Pricing per storage and read operations. [Pricing](https://azure.microsoft.com/pricing/details/storage/blobs/)
-- Azure Cosmos DB: Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/)
+- Azure Cosmos DB: Only provisioned if you enabled [chat history with Cosmos DB](docs/deploy_features.md#enabling-persistent-chat-history-with-azure-cosmos-db). Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/)
+- Azure AI Vision: Only provisioned if you enabled [GPT-4 with vision](docs/gpt4v.md). Pricing per 1K transactions. [Pricing](https://azure.microsoft.com/pricing/details/cognitive-services/computer-vision/)
+- Azure AI Content Understanding: Only provisioned if you enabled [media description](docs/deploy_features.md#enabling-media-description-with-azure-content-understanding). Pricing per 1K images. [Pricing](https://azure.microsoft.com/pricing/details/content-understanding/)
- Azure Monitor: Pay-as-you-go tier. Costs based on data ingested. [Pricing](https://azure.microsoft.com/pricing/details/monitor/)
To reduce costs, you can switch to free SKUs for various services, but those SKUs have limitations.
diff --git a/app/backend/gunicorn.conf.py b/app/backend/gunicorn.conf.py
index 4518587695..9144e3cc00 100644
--- a/app/backend/gunicorn.conf.py
+++ b/app/backend/gunicorn.conf.py
@@ -7,7 +7,7 @@
bind = "0.0.0.0"
timeout = 230
-# https://learn.microsoft.com/en-us/troubleshoot/azure/app-service/web-apps-performance-faqs#why-does-my-request-time-out-after-230-seconds
+# https://learn.microsoft.com/troubleshoot/azure/app-service/web-apps-performance-faqs#why-does-my-request-time-out-after-230-seconds
num_cpus = multiprocessing.cpu_count()
if os.getenv("WEBSITE_SKU") == "LinuxFree":
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
index d62a42f8cf..57cfe52e6f 100644
--- a/app/backend/prepdocs.py
+++ b/app/backend/prepdocs.py
@@ -7,6 +7,7 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
+from rich.logging import RichHandler
from load_azd_env import load_azd_env
from prepdocslib.blobmanager import BlobManager
@@ -158,8 +159,10 @@ def setup_file_processors(
local_pdf_parser: bool = False,
local_html_parser: bool = False,
search_images: bool = False,
+ use_content_understanding: bool = False,
+ content_understanding_endpoint: Union[str, None] = None,
):
- sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
+ sentence_text_splitter = SentenceTextSplitter()
doc_int_parser: Optional[DocumentAnalysisParser] = None
# check if Azure Document Intelligence credentials are provided
@@ -170,6 +173,8 @@ def setup_file_processors(
doc_int_parser = DocumentAnalysisParser(
endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/",
credential=documentintelligence_creds,
+ use_content_understanding=use_content_understanding,
+ content_understanding_endpoint=content_understanding_endpoint,
)
pdf_parser: Optional[Parser] = None
@@ -294,10 +299,10 @@ async def main(strategy: Strategy, setup_index: bool = True):
args = parser.parse_args()
if args.verbose:
- logging.basicConfig(format="%(message)s")
+ logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)])
# We only set the level to INFO for our logger,
# to avoid seeing the noisy INFO level logs from the Azure SDKs
- logger.setLevel(logging.INFO)
+ logger.setLevel(logging.DEBUG)
load_azd_env()
@@ -309,6 +314,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true"
use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None
dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false"
+ use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true"
# Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments.
if tenant_id := os.getenv("AZURE_TENANT_ID"):
@@ -406,6 +412,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true",
local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true",
search_images=use_gptvision,
+ use_content_understanding=use_content_understanding,
+ content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
)
image_embeddings_service = setup_image_embeddings_service(
azure_credential=azd_credential,
@@ -424,6 +432,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
use_acls=use_acls,
category=args.category,
+ use_content_understanding=use_content_understanding,
+ content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
)
loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall))
diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py
index e9f18e795a..e8d01dda52 100644
--- a/app/backend/prepdocslib/blobmanager.py
+++ b/app/backend/prepdocslib/blobmanager.py
@@ -171,7 +171,7 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str:
@classmethod
def blob_image_name_from_file_page(cls, filename, page=0) -> str:
- return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".png"
+ return os.path.splitext(os.path.basename(filename))[0] + f"-{page+1}" + ".png"
@classmethod
def blob_name_from_file_name(cls, filename) -> str:
diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py
index 55b24b6f3a..3748f67a09 100644
--- a/app/backend/prepdocslib/filestrategy.py
+++ b/app/backend/prepdocslib/filestrategy.py
@@ -1,10 +1,13 @@
import logging
from typing import List, Optional
+from azure.core.credentials import AzureKeyCredential
+
from .blobmanager import BlobManager
from .embeddings import ImageEmbeddings, OpenAIEmbeddings
from .fileprocessor import FileProcessor
from .listfilestrategy import File, ListFileStrategy
+from .mediadescriber import ContentUnderstandingDescriber
from .searchmanager import SearchManager, Section
from .strategy import DocumentAction, SearchInfo, Strategy
@@ -50,6 +53,8 @@ def __init__(
search_analyzer_name: Optional[str] = None,
use_acls: bool = False,
category: Optional[str] = None,
+ use_content_understanding: bool = False,
+ content_understanding_endpoint: Optional[str] = None,
):
self.list_file_strategy = list_file_strategy
self.blob_manager = blob_manager
@@ -61,6 +66,8 @@ def __init__(
self.search_info = search_info
self.use_acls = use_acls
self.category = category
+ self.use_content_understanding = use_content_understanding
+ self.content_understanding_endpoint = content_understanding_endpoint
async def setup(self):
search_manager = SearchManager(
@@ -73,6 +80,16 @@ async def setup(self):
)
await search_manager.create_index()
+ if self.use_content_understanding:
+ if self.content_understanding_endpoint is None:
+ raise ValueError("Content Understanding is enabled but no endpoint was provided")
+ if isinstance(self.search_info.credential, AzureKeyCredential):
+ raise ValueError(
+ "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead"
+ )
+ cu_manager = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.search_info.credential)
+ await cu_manager.create_analyzer()
+
async def run(self):
search_manager = SearchManager(
self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings
diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py
new file mode 100644
index 0000000000..5aae79232e
--- /dev/null
+++ b/app/backend/prepdocslib/mediadescriber.py
@@ -0,0 +1,107 @@
+import logging
+from abc import ABC
+
+import aiohttp
+from azure.core.credentials_async import AsyncTokenCredential
+from azure.identity.aio import get_bearer_token_provider
+from rich.progress import Progress
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
+
+logger = logging.getLogger("scripts")
+
+
+class MediaDescriber(ABC):
+
+ async def describe_image(self, image_bytes) -> str:
+ raise NotImplementedError # pragma: no cover
+
+
+class ContentUnderstandingDescriber:
+ CU_API_VERSION = "2024-12-01-preview"
+
+ analyzer_schema = {
+ "analyzerId": "image_analyzer",
+ "name": "Image understanding",
+ "description": "Extract detailed structured information from images extracted from documents.",
+ "baseAnalyzerId": "prebuilt-image",
+ "scenario": "image",
+ "config": {"returnDetails": False},
+ "fieldSchema": {
+ "name": "ImageInformation",
+ "descriptions": "Description of image.",
+ "fields": {
+ "Description": {
+ "type": "string",
+ "description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.",
+ },
+ },
+ },
+ }
+
+ def __init__(self, endpoint: str, credential: AsyncTokenCredential):
+ self.endpoint = endpoint
+ self.credential = credential
+
+ async def poll_api(self, session, poll_url, headers):
+
+ @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError))
+ async def poll():
+ async with session.get(poll_url, headers=headers) as response:
+ response.raise_for_status()
+ response_json = await response.json()
+ if response_json["status"] == "Failed":
+ raise Exception("Failed")
+ if response_json["status"] == "Running":
+ raise ValueError("Running")
+ return response_json
+
+ return await poll()
+
+ async def create_analyzer(self):
+ logger.info("Creating analyzer '%s'...", self.analyzer_schema["analyzerId"])
+
+ token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default")
+ token = await token_provider()
+ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+ params = {"api-version": self.CU_API_VERSION}
+ analyzer_id = self.analyzer_schema["analyzerId"]
+ cu_endpoint = f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_id}"
+ async with aiohttp.ClientSession() as session:
+ async with session.put(
+ url=cu_endpoint, params=params, headers=headers, json=self.analyzer_schema
+ ) as response:
+ if response.status == 409:
+ logger.info("Analyzer '%s' already exists.", analyzer_id)
+ return
+ elif response.status != 201:
+ data = await response.text()
+ raise Exception("Error creating analyzer", data)
+ else:
+ poll_url = response.headers.get("Operation-Location")
+
+ with Progress() as progress:
+ progress.add_task("Creating analyzer...", total=None, start=False)
+ await self.poll_api(session, poll_url, headers)
+
+ async def describe_image(self, image_bytes: bytes) -> str:
+ logger.info("Sending image to Azure Content Understanding service...")
+ async with aiohttp.ClientSession() as session:
+ token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
+ headers = {"Authorization": "Bearer " + token.token}
+ params = {"api-version": self.CU_API_VERSION}
+ analyzer_name = self.analyzer_schema["analyzerId"]
+ async with session.post(
+ url=f"{self.endpoint}/contentunderstanding/analyzers/{analyzer_name}:analyze",
+ params=params,
+ headers=headers,
+ data=image_bytes,
+ ) as response:
+ response.raise_for_status()
+ poll_url = response.headers["Operation-Location"]
+
+ with Progress() as progress:
+ progress.add_task("Processing...", total=None, start=False)
+ results = await self.poll_api(session, poll_url, headers)
+
+ fields = results["result"]["contents"][0]["fields"]
+ return fields["Description"]["valueString"]
diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py
index f12fe70b94..857235c571 100644
--- a/app/backend/prepdocslib/page.py
+++ b/app/backend/prepdocslib/page.py
@@ -3,7 +3,7 @@ class Page:
A single page from a document
Attributes:
- page_num (int): Page number
+ page_num (int): Page number (0-indexed)
offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow")
text (str): The text of the page
"""
@@ -17,6 +17,10 @@ def __init__(self, page_num: int, offset: int, text: str):
class SplitPage:
"""
A section of a page that has been split into a smaller chunk.
+
+ Attributes:
+ page_num (int): Page number (0-indexed)
+ text (str): The text of the section
"""
def __init__(self, page_num: int, text: str):
diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py
index 6604110020..1fcbbc9531 100644
--- a/app/backend/prepdocslib/pdfparser.py
+++ b/app/backend/prepdocslib/pdfparser.py
@@ -1,13 +1,23 @@
import html
+import io
import logging
+from enum import Enum
from typing import IO, AsyncGenerator, Union
+import pymupdf
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
-from azure.ai.documentintelligence.models import DocumentTable
+from azure.ai.documentintelligence.models import (
+ AnalyzeDocumentRequest,
+ AnalyzeResult,
+ DocumentFigure,
+ DocumentTable,
+)
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
+from PIL import Image
from pypdf import PdfReader
+from .mediadescriber import ContentUnderstandingDescriber
from .page import Page
from .parser import Parser
@@ -39,11 +49,18 @@ class DocumentAnalysisParser(Parser):
"""
def __init__(
- self, endpoint: str, credential: Union[AsyncTokenCredential, AzureKeyCredential], model_id="prebuilt-layout"
+ self,
+ endpoint: str,
+ credential: Union[AsyncTokenCredential, AzureKeyCredential],
+ model_id="prebuilt-layout",
+ use_content_understanding=True,
+ content_understanding_endpoint: Union[str, None] = None,
):
self.model_id = model_id
self.endpoint = endpoint
self.credential = credential
+ self.use_content_understanding = use_content_understanding
+ self.content_understanding_endpoint = content_understanding_endpoint
async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name)
@@ -51,47 +68,126 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
async with DocumentIntelligenceClient(
endpoint=self.endpoint, credential=self.credential
) as document_intelligence_client:
- poller = await document_intelligence_client.begin_analyze_document(
- model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
- )
- form_recognizer_results = await poller.result()
+ if self.use_content_understanding:
+ if self.content_understanding_endpoint is None:
+ raise ValueError("Content Understanding is enabled but no endpoint was provided")
+ if isinstance(self.credential, AzureKeyCredential):
+ raise ValueError(
+ "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead"
+ )
+ cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential)
+ content_bytes = content.read()
+ poller = await document_intelligence_client.begin_analyze_document(
+ model_id="prebuilt-layout",
+ analyze_request=AnalyzeDocumentRequest(bytes_source=content_bytes),
+ output=["figures"],
+ features=["ocrHighResolution"],
+ output_content_format="markdown",
+ )
+ doc_for_pymupdf = pymupdf.open(stream=io.BytesIO(content_bytes))
+ else:
+ poller = await document_intelligence_client.begin_analyze_document(
+ model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
+ )
+ analyze_result: AnalyzeResult = await poller.result()
offset = 0
- for page_num, page in enumerate(form_recognizer_results.pages):
+ for page in analyze_result.pages:
tables_on_page = [
table
- for table in (form_recognizer_results.tables or [])
- if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1
+ for table in (analyze_result.tables or [])
+ if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number
]
+ figures_on_page = []
+ if self.use_content_understanding:
+ figures_on_page = [
+ figure
+ for figure in (analyze_result.figures or [])
+ if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number
+ ]
+
+ class ObjectType(Enum):
+ NONE = -1
+ TABLE = 0
+ FIGURE = 1
- # mark all positions of the table spans in the page
page_offset = page.spans[0].offset
page_length = page.spans[0].length
- table_chars = [-1] * page_length
- for table_id, table in enumerate(tables_on_page):
+ mask_chars: list[tuple[ObjectType, Union[int, None]]] = [(ObjectType.NONE, None)] * page_length
+ # mark all positions of the table spans in the page
+ for table_idx, table in enumerate(tables_on_page):
for span in table.spans:
# replace all table spans with "table_id" in table_chars array
for i in range(span.length):
idx = span.offset - page_offset + i
if idx >= 0 and idx < page_length:
- table_chars[idx] = table_id
+ mask_chars[idx] = (ObjectType.TABLE, table_idx)
+ # mark all positions of the figure spans in the page
+ for figure_idx, figure in enumerate(figures_on_page):
+ for span in figure.spans:
+ # mark every character covered by a figure span as (ObjectType.FIGURE, figure_idx) in mask_chars
+ for i in range(span.length):
+ idx = span.offset - page_offset + i
+ if idx >= 0 and idx < page_length:
+ mask_chars[idx] = (ObjectType.FIGURE, figure_idx)
# build page text by replacing characters in table spans with table html
page_text = ""
- added_tables = set()
- for idx, table_id in enumerate(table_chars):
- if table_id == -1:
- page_text += form_recognizer_results.content[page_offset + idx]
- elif table_id not in added_tables:
- page_text += DocumentAnalysisParser.table_to_html(tables_on_page[table_id])
- added_tables.add(table_id)
-
- yield Page(page_num=page_num, offset=offset, text=page_text)
+ added_objects = set() # (object type, index) pairs already rendered into page_text; TODO: annotate for mypy
+ for idx, mask_char in enumerate(mask_chars):
+ object_type, object_idx = mask_char
+ if object_type == ObjectType.NONE:
+ page_text += analyze_result.content[page_offset + idx]
+ elif object_type == ObjectType.TABLE:
+ if object_idx is None:
+ raise ValueError("Expected object_idx to be set")
+ if mask_char not in added_objects:
+ page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx])
+ added_objects.add(mask_char)
+ elif object_type == ObjectType.FIGURE:
+ if cu_describer is None:
+ raise ValueError("cu_describer should not be None, unable to describe figure")
+ if object_idx is None:
+ raise ValueError("Expected object_idx to be set")
+ if mask_char not in added_objects:
+ figure_html = await DocumentAnalysisParser.figure_to_html(
+ doc_for_pymupdf, figures_on_page[object_idx], cu_describer
+ )
+ page_text += figure_html
+ added_objects.add(mask_char)
+ # We remove these comments since they are not needed and skew the page numbers
+ page_text = page_text.replace("", "")
+ # We remove excess newlines at the beginning and end of the page
+ page_text = page_text.strip()
+ yield Page(page_num=page.page_number - 1, offset=offset, text=page_text)
offset += len(page_text)
- @classmethod
- def table_to_html(cls, table: DocumentTable):
- table_html = "
"
+ @staticmethod
+ async def figure_to_html(
+ doc: pymupdf.Document, figure: DocumentFigure, cu_describer: ContentUnderstandingDescriber
+ ) -> str:
+ figure_title = (figure.caption and figure.caption.content) or ""
+ logger.info("Describing figure %s with title '%s'", figure.id, figure_title)
+ if not figure.bounding_regions:
+ return f""
+ if len(figure.bounding_regions) > 1:
+ logger.warning("Figure %s has more than one bounding region, using the first one", figure.id)
+ first_region = figure.bounding_regions[0]
+ # To learn more about bounding regions, see https://aka.ms/bounding-region
+ bounding_box = (
+ first_region.polygon[0], # x0 (left)
+ first_region.polygon[1], # y0 (top)
+ first_region.polygon[4], # x1 (right)
+ first_region.polygon[5], # y1 (bottom)
+ )
+ page_number = first_region["pageNumber"] # 1-indexed
+ cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
+ figure_description = await cu_describer.describe_image(cropped_img)
+ return f""
+
+ @staticmethod
+ def table_to_html(table: DocumentTable):
+ table_html = "
"
rows = [
sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index)
for i in range(table.row_count)
@@ -107,5 +203,32 @@ def table_to_html(cls, table: DocumentTable):
cell_spans += f" rowSpan={cell.row_span}"
table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}{tag}>"
table_html += ""
- table_html += "
"
+ table_html += "
"
return table_html
+
+ @staticmethod
+ def crop_image_from_pdf_page(
+ doc: pymupdf.Document, page_number: int, bounding_box: tuple[float, float, float, float]
+ ) -> bytes:
+ """
+ Crops a region from a given page in a PDF and returns it as an image.
+
+ :param doc: The PyMuPDF document to crop from.
+ :param page_number: The page number to crop from (0-indexed).
+ :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
+ :return: The cropped area encoded as PNG bytes.
+ """
+ page = doc.load_page(page_number)
+
+ # Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
+ bbx = [x * 72 for x in bounding_box]
+ rect = pymupdf.Rect(bbx)
+ # Bounding box is scaled to 72 dots per inch
+ # We assume the PDF has 300 DPI
+ # The matrix is used to convert between these 2 units
+ pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
+
+ img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+ bytes_io = io.BytesIO()
+ img.save(bytes_io, format="PNG")
+ return bytes_io.getvalue()
diff --git a/app/backend/prepdocslib/textsplitter.py b/app/backend/prepdocslib/textsplitter.py
index 30b0c1ad77..2c39dff850 100644
--- a/app/backend/prepdocslib/textsplitter.py
+++ b/app/backend/prepdocslib/textsplitter.py
@@ -87,14 +87,13 @@ class SentenceTextSplitter(TextSplitter):
Class that splits pages into smaller chunks. This is required because embedding models may not be able to analyze an entire page at once
"""
- def __init__(self, has_image_embeddings: bool, max_tokens_per_section: int = 500):
+ def __init__(self, max_tokens_per_section: int = 500):
self.sentence_endings = STANDARD_SENTENCE_ENDINGS + CJK_SENTENCE_ENDINGS
self.word_breaks = STANDARD_WORD_BREAKS + CJK_WORD_BREAKS
self.max_section_length = DEFAULT_SECTION_LENGTH
self.sentence_search_limit = 100
self.max_tokens_per_section = max_tokens_per_section
self.section_overlap = int(self.max_section_length * DEFAULT_OVERLAP_PERCENT / 100)
- self.has_image_embeddings = has_image_embeddings
def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]:
"""
@@ -192,15 +191,15 @@ def find_page(offset):
section_text = all_text[start:end]
yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text)
- last_table_start = section_text.rfind("
2 * self.sentence_search_limit and last_table_start > section_text.rfind("
2 * self.sentence_search_limit and last_figure_start > section_text.rfind(
+ "=1.3.7
numpy>=1,<2.1.0 # Used by openai embeddings.create to optimize embeddings (but not required)
tiktoken
tenacity
-azure-ai-documentintelligence
+azure-ai-documentintelligence==1.0.0b4
azure-cognitiveservices-speech
azure-cosmos
azure-search-documents==11.6.0b6
@@ -31,3 +31,4 @@ types-beautifulsoup4
msgraph-sdk==1.1.0
openai-messages-token-helper
python-dotenv
+rich
diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
index 2efe32b484..c148bb04f8 100644
--- a/app/backend/requirements.txt
+++ b/app/backend/requirements.txt
@@ -1,5 +1,5 @@
#
-# This file is autogenerated by pip-compile with Python 3.11
+# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile requirements.in
@@ -24,7 +24,7 @@ asgiref==3.8.1
# via opentelemetry-instrumentation-asgi
attrs==24.2.0
# via aiohttp
-azure-ai-documentintelligence==1.0.0b3
+azure-ai-documentintelligence==1.0.0b4
# via -r requirements.in
azure-cognitiveservices-speech==1.40.0
# via -r requirements.in
@@ -155,11 +155,15 @@ jinja2==3.1.4
# quart
jiter==0.5.0
# via openai
+markdown-it-py==3.0.0
+ # via rich
markupsafe==2.1.5
# via
# jinja2
# quart
# werkzeug
+mdurl==0.1.2
+ # via markdown-it-py
microsoft-kiota-abstractions==1.3.3
# via
# microsoft-kiota-authentication-azure
@@ -338,6 +342,8 @@ pydantic==2.8.2
# via openai
pydantic-core==2.20.1
# via pydantic
+pygments==2.18.0
+ # via rich
pyjwt[crypto]==2.9.0
# via
# -r requirements.in
@@ -372,6 +378,8 @@ requests==2.32.3
# tiktoken
requests-oauthlib==2.0.0
# via msrest
+rich==13.9.4
+ # via -r requirements.in
six==1.16.0
# via
# azure-core
diff --git a/azure.yaml b/azure.yaml
index fd673f48e0..d72dc2ff13 100644
--- a/azure.yaml
+++ b/azure.yaml
@@ -115,6 +115,7 @@ pipeline:
- DEPLOYMENT_TARGET
- AZURE_CONTAINER_APPS_WORKLOAD_PROFILE
- USE_CHAT_HISTORY_BROWSER
+ - USE_MEDIA_DESCRIBER_AZURE_CU
secrets:
- AZURE_SERVER_APP_SECRET
- AZURE_CLIENT_APP_SECRET
diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md
index db18bd10d3..1c8d2138ff 100644
--- a/docs/data_ingestion.md
+++ b/docs/data_ingestion.md
@@ -69,7 +69,7 @@ A [recent change](https://github.com/Azure-Samples/azure-search-openai-demo/pull
You may want to remove documents from the index. For example, if you're using the sample data, you may want to remove the documents that are already in the index before adding your own.
-To remove all documents, use `scripts/prepdocs.sh --removeall` or `scripts/prepdocs.ps1 --removeall`.
+To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/prepdocs.ps1 --removeall`.
You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `scripts/prepdocs.sh --remove` or `scripts/prepdocs.ps1 --remove`.
diff --git a/docs/deploy_features.md b/docs/deploy_features.md
index b1291a00b4..ea0c7e8288 100644
--- a/docs/deploy_features.md
+++ b/docs/deploy_features.md
@@ -7,6 +7,7 @@ You should typically enable these features before running `azd up`. Once you've
* [Using GPT-4](#using-gpt-4)
* [Using text-embedding-3 models](#using-text-embedding-3-models)
* [Enabling GPT-4 Turbo with Vision](#enabling-gpt-4-turbo-with-vision)
+* [Enabling media description with Azure Content Understanding](#enabling-media-description-with-azure-content-understanding)
* [Enabling client-side chat history](#enabling-client-side-chat-history)
* [Enabling persistent chat history with Azure Cosmos DB](#enabling-persistent-chat-history-with-azure-cosmos-db)
* [Enabling language picker](#enabling-language-picker)
@@ -149,8 +150,31 @@ If you have already deployed:
## Enabling GPT-4 Turbo with Vision
+⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization).
+
This section covers the integration of GPT-4 Vision with Azure AI Search. Learn how to enhance your search capabilities with the power of image and text indexing, enabling advanced search functionalities over diverse document types. For a detailed guide on setup and usage, visit our [Enabling GPT-4 Turbo with Vision](gpt4v.md) page.
+## Enabling media description with Azure Content Understanding
+
+⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization).
+It is compatible with [GPT vision integration](./gpt4v.md), but the features provide similar functionality.
+
+By default, if your documents contain image-like figures, the data ingestion process will ignore those figures,
+so users will not be able to ask questions about them.
+
+You can optionally enable the description of media content using Azure Content Understanding. When enabled, the data ingestion process will send figures to Azure Content Understanding and replace the figure with the description in the indexed document.
+To learn more about this process and compare it to the GPT-4 vision integration, see [this guide](./data_ingestion.md#media-description).
+
+To enable media description with Azure Content Understanding, run:
+
+```shell
+azd env set USE_MEDIA_DESCRIBER_AZURE_CU true
+```
+
+If you have already run `azd up`, you will need to run `azd provision` to create the new Content Understanding service.
+If you have already indexed your documents and want to re-index them with the media descriptions,
+first [remove the existing documents](./data_ingestion.md#removing-documents) and then [re-ingest the data](./data_ingestion.md#indexing-additional-documents).
+
## Enabling client-side chat history
This feature allows users to view the chat history of their conversation, stored in the browser using [IndexedDB](https://developer.mozilla.org/docs/Web/API/IndexedDB_API). That means the chat history will be available only on the device where the chat was initiated. To enable browser-stored chat history, run:
@@ -215,6 +239,8 @@ azd env set USE_SPEECH_OUTPUT_BROWSER true
## Enabling Integrated Vectorization
+⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md).
+
Azure AI search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in-azure-ai-search/3960809). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies.
To enable integrated vectorization with this sample:
@@ -238,8 +264,6 @@ To enable integrated vectorization with this sample:
4. You can view the resources such as the indexer and skillset in Azure Portal and monitor the status of the vectorization process.
-⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md).
-
## Enabling authentication
By default, the deployed Azure web app will have no authentication or access restrictions enabled, meaning anyone with routable network access to the web app can chat with your indexed data. If you'd like to automatically setup authentication and user login as part of the `azd up` process, see [this guide](./login_and_acl.md).
diff --git a/infra/abbreviations.json b/infra/abbreviations.json
index 5084711603..3673672a7e 100644
--- a/infra/abbreviations.json
+++ b/infra/abbreviations.json
@@ -29,6 +29,7 @@
"containerInstanceContainerGroups": "ci",
"containerRegistryRegistries": "cr",
"containerServiceManagedClusters": "aks-",
+ "cognitiveServicesContentUnderstanding": "cu-",
"databricksWorkspaces": "dbw-",
"dataFactoryFactories": "adf-",
"dataLakeAnalyticsAccounts": "dla",
diff --git a/infra/main.bicep b/infra/main.bicep
index 0630519c29..5c181cd525 100644
--- a/infra/main.bicep
+++ b/infra/main.bicep
@@ -119,6 +119,9 @@ param computerVisionResourceGroupName string = '' // Set in main.parameters.json
param computerVisionResourceGroupLocation string = '' // Set in main.parameters.json
param computerVisionSkuName string // Set in main.parameters.json
+param contentUnderstandingServiceName string = '' // Set in main.parameters.json
+param contentUnderstandingResourceGroupName string = '' // Set in main.parameters.json
+
param chatGptModelName string = ''
param chatGptDeploymentName string = ''
param chatGptDeploymentVersion string = ''
@@ -218,6 +221,9 @@ param useVectors bool = false
@description('Use Built-in integrated Vectorization feature of AI Search to vectorize and ingest documents')
param useIntegratedVectorization bool = false
+@description('Use media description feature with Azure Content Understanding during ingestion')
+param useMediaDescriberAzureCU bool = false // Opt-in; set from USE_MEDIA_DESCRIBER_AZURE_CU in main.parameters.json
+
@description('Enable user document upload feature')
param useUserUpload bool = false
param useLocalPdfParser bool = false
@@ -278,6 +284,10 @@ resource computerVisionResourceGroup 'Microsoft.Resources/resourceGroups@2021-04
name: !empty(computerVisionResourceGroupName) ? computerVisionResourceGroupName : resourceGroup.name
}
+resource contentUnderstandingResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(contentUnderstandingResourceGroupName)) {
+ name: !empty(contentUnderstandingResourceGroupName) ? contentUnderstandingResourceGroupName : resourceGroup.name
+}
+
resource searchServiceResourceGroup 'Microsoft.Resources/resourceGroups@2021-04-01' existing = if (!empty(searchServiceResourceGroupName)) {
name: !empty(searchServiceResourceGroupName) ? searchServiceResourceGroupName : resourceGroup.name
}
@@ -401,6 +411,8 @@ var appEnvVariables = {
AZURE_DOCUMENTINTELLIGENCE_SERVICE: documentIntelligence.outputs.name
USE_LOCAL_PDF_PARSER: useLocalPdfParser
USE_LOCAL_HTML_PARSER: useLocalHtmlParser
+ USE_MEDIA_DESCRIBER_AZURE_CU: useMediaDescriberAzureCU
+ AZURE_CONTENTUNDERSTANDING_ENDPOINT: useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : ''
RUNNING_IN_PRODUCTION: 'true'
}
@@ -634,6 +646,28 @@ module computerVision 'br/public:avm/res/cognitive-services/account:0.7.2' = if
}
}
+
+module contentUnderstanding 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useMediaDescriberAzureCU) {
+ name: 'content-understanding'
+ scope: contentUnderstandingResourceGroup
+ params: {
+ name: !empty(contentUnderstandingServiceName)
+ ? contentUnderstandingServiceName
+ : '${abbrs.cognitiveServicesContentUnderstanding}${resourceToken}'
+ kind: 'AIServices'
+ networkAcls: {
+ defaultAction: 'Allow'
+ }
+ customSubDomainName: !empty(contentUnderstandingServiceName)
+ ? contentUnderstandingServiceName
+ : '${abbrs.cognitiveServicesContentUnderstanding}${resourceToken}'
+ // Hard-coding to westus for now, due to limited availability and no overlap with Document Intelligence
+ location: 'westus'
+ tags: tags
+ sku: 'S0'
+ }
+}
+
module speech 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useSpeechOutputAzure) {
name: 'speech-service'
scope: speechResourceGroup
@@ -1160,6 +1194,7 @@ output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.re
output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : ''
output AZURE_VISION_ENDPOINT string = useGPT4V ? computerVision.outputs.endpoint : ''
+output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : ''
output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name
output AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP string = documentIntelligenceResourceGroup.name
diff --git a/infra/main.parameters.json b/infra/main.parameters.json
index a7ba80373e..54541ca8a6 100644
--- a/infra/main.parameters.json
+++ b/infra/main.parameters.json
@@ -35,6 +35,12 @@
"computerVisionSkuName": {
"value": "${AZURE_COMPUTER_VISION_SKU=S1}"
},
+ "contentUnderstandingServiceName": {
+ "value": "${AZURE_CONTENT_UNDERSTANDING_SERVICE}"
+ },
+ "contentUnderstandingResourceGroupName": {
+ "value": "${AZURE_CONTENT_UNDERSTANDING_RESOURCE_GROUP}"
+ },
"documentIntelligenceServiceName": {
"value": "${AZURE_DOCUMENTINTELLIGENCE_SERVICE}"
},
@@ -289,6 +295,9 @@
},
"azureContainerAppsWorkloadProfile": {
"value": "${AZURE_CONTAINER_APPS_WORKLOAD_PROFILE=Consumption}"
+ },
+ "useMediaDescriberAzureCU": {
+ "value": "${USE_MEDIA_DESCRIBER_AZURE_CU=false}"
}
}
}
diff --git a/infra/private-endpoints.bicep b/infra/private-endpoints.bicep
index 6053519cae..58fe14177e 100644
--- a/infra/private-endpoints.bicep
+++ b/infra/private-endpoints.bicep
@@ -84,7 +84,7 @@ module monitorDnsZones './core/networking/private-dns-zones.bicep' = [for monito
var dnsZoneBlobIndex = filter(flatten(privateEndpointInfo), info => info.groupId == 'blob')[0].dnsZoneIndex
// Azure Monitor Private Link Scope
-// https://learn.microsoft.com/en-us/azure/azure-monitor/logs/private-link-security
+// https://learn.microsoft.com/azure/azure-monitor/logs/private-link-security
resource monitorPrivateLinkScope 'microsoft.insights/privateLinkScopes@2021-07-01-preview' = {
name: 'mpls${resourceToken}'
location: 'global'
diff --git a/pyproject.toml b/pyproject.toml
index 1e21fddfd4..9e12e399f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,5 +32,6 @@ module = [
"kiota.*",
"azure.cognitiveservices.*",
"azure.cognitiveservices.speech.*",
+ "pymupdf.*",
]
ignore_missing_imports = true
diff --git a/tests/conftest.py b/tests/conftest.py
index cfc5326f31..157770b186 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -54,7 +54,7 @@ async def mock_search(self, *args, **kwargs):
@pytest.fixture
-def mock_compute_embeddings_call(monkeypatch):
+def mock_azurehttp_calls(monkeypatch):
def mock_post(*args, **kwargs):
if kwargs.get("url").endswith("computervision/retrieval:vectorizeText"):
return mock_computervision_response()
@@ -327,7 +327,7 @@ async def client(
mock_openai_embedding,
mock_acs_search,
mock_blob_container_client,
- mock_compute_embeddings_call,
+ mock_azurehttp_calls,
):
quart_app = app.create_app()
@@ -346,7 +346,7 @@ async def client_with_expiring_token(
mock_openai_embedding,
mock_acs_search,
mock_blob_container_client,
- mock_compute_embeddings_call,
+ mock_azurehttp_calls,
):
quart_app = app.create_app()
diff --git a/tests/mocks.py b/tests/mocks.py
index 13dc82ac6e..788823941c 100644
--- a/tests/mocks.py
+++ b/tests/mocks.py
@@ -151,12 +151,10 @@ def by_page(self):
class MockResponse:
- def __init__(self, text, status):
- self.text = text
+ def __init__(self, status, text=None, headers=None):
+ self._text = text or ""
self.status = status
-
- async def text(self):
- return self._text
+ self.headers = headers or {}
async def __aexit__(self, exc_type, exc, tb):
pass
@@ -164,8 +162,15 @@ async def __aexit__(self, exc_type, exc, tb):
async def __aenter__(self):
return self
+ async def text(self):
+ return self._text
+
async def json(self):
- return json.loads(self.text)
+ return json.loads(self._text)
+
+ def raise_for_status(self):
+ if self.status != 200:
+ raise Exception(f"HTTP status {self.status}")
class MockEmbeddingsClient:
diff --git a/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json b/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json
new file mode 100644
index 0000000000..72bddc1dbe
--- /dev/null
+++ b/tests/snapshots/test_prepdocslib_textsplitter/test_pages_with_figures/split_pages_with_figures.json
@@ -0,0 +1,50 @@
+[
+ {
+ "text": "# Financial Market Analysis Report 2023\n\nAn In-Depth Exploration of Stocks, Cryptocurrencies, and Commodities\nPrepared by: Contoso Financial Analytics## Executive Summary\n\n. In this comprehensive report, Contoso Financial Analytics provides a\ndeep dive into the financial markets of 2023, focusing on the trends\nand fluctuations within stocks, cryptocurrencies, and commodities.\nOur analysis covers historical patterns, current market conditions, and\nfuture predictions, offering valuable insights for investors, analysts,\nand financial enthusiasts. This report leverages advanced data\nanalytics to present a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023) This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category.",
+ "page_num": 0
+ },
+ {
+ "text": "a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023) This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category. Stocks are shown in blue, Bonds in orange, Cryptocurrencies in gray, and Commodities in yellow. The chart visually indicates the proportion of each investment type within a portfolio.
Category
Color
Stocks
Blue
Bonds
Orange
Cryptocurrencies
Gray
Commodities
Yellow
\n\n\nThe global financial market is a vast and intricate network of\nexchanges, instruments, and assets, ranging from traditional stocks\nand bonds to modern cryptocurrencies and commodities. Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.",
+ "page_num": 1
+ },
+ {
+ "text": " Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.\nThis section provides an overview of these segments and sets the\nstage for a detailed analysis## Stock Market Overview\n\n\n
5-Year Trend of the S&P 500 Index
This line chart shows the trend of the S&P 500 Index over a five-year period from 2018 to 2022. The index starts at around 2500 in 2018, rises steadily to a peak of about 4500 in 2021, and then declines slightly to approximately 4000 in 2022.
Year
S&P 500 Index
2018
2500
2019
3000
2020
3500
2021
4500
2022
4000
\n\n\nThe stock market is often considered the economy's\nheartbeat, reflecting corporate health and investor\nsentiment. Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events.",
+ "page_num": 2
+ },
+ {
+ "text": " Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events. This\noverview examines the key factors that have influenced\nthe stock market's performance and what they indicate\nabout the economy's state## Cryptocurrency Market Dynamics\n\n\n\n\n\nCryptocurrencies have emerged as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the ",
+ "page_num": 3
+ },
+ {
+ "text": "\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the factors driving their volatility and the\nimplications for the broader financial market.\n\n\n### Commodity Market Fluctuations\n\n\n### Commodity Market Fluctuations\n\n\n
Price Changes of Oil, Gold, and Wheat This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.\n\n
\n
Year
Wheat
Gold
Oil
\n
2022
5%
2%
0%
\n
2021
3%
4%
30%
\n
2020
1%
5%
-20%
\n
2019
2%
3%
10%
\n
2018
0%
1%
15%
\n
2017
4%
2%
5%
\n
2016
3%
6%
-5%
\n
2015
1%
0%
10%
\n
2014
2%
5%
-10%
\n
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the ",
+ "page_num": 4
+ },
+ {
+ "text": "
2015
1%
0%
10%
\n
2014
2%
5%
-10%
\n
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices. This section\ndelves into the trends and factors\naffecting commodity prices, including\ngeopolitical events, supply-chain\ndisruptions, and environmental factors,\nproviding a comprehensive view of this\ncrucial market segment.### Interplay Between Different Market Segments\n\n\n
S&P 500
NASDAQ
Bitcoin
Ethereum
Oil
Gold
S&P 500
1
NASDAQ
0.95
1
Bitcoin
0.3
0.4
1
Ethereum
0.35
0.45
0.9
1
Oil
0.6
0.65
0.2
0.25
1
Gold
-0.",
+ "page_num": 5
+ },
+ {
+ "text": "### Interplay Between Different Market Segments\n\n\n
S&P 500
NASDAQ
Bitcoin
Ethereum
Oil
Gold
S&P 500
1
NASDAQ
0.95
1
Bitcoin
0.3
0.4
1
Ethereum
0.35
0.45
0.9
1
Oil
0.6
0.65
0.2
0.25
1
Gold
-0.2
-0.15
-0.1
-0.05
-0.3
1
\n\n\nFinancial markets are interconnected, with movements in one segment often influencing others. This\nsection examines the correlations between stock indices, cryptocurrency prices, and commodity prices,\nrevealing how changes in one market can have ripple effects across the financial ecosystem.### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial ",
+ "page_num": 6
+ },
+ {
+ "text": "### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial Markets
The image is a line graph titled \"On Financial Markets\" showing the trends of Interest Rates %, Inflation Data %, and GDP Growth % from 2018 to 2023. The graph has three lines representing each of these metrics over the years.
\n\n
\n\n
Year
Interest Rates %
Inflation Data %
GDP Growth %
\n\n\n
2018
2
2
3
\n
2019
2
2.5
2
\n
2020
1
1.5
-4
\n
2021
1.5
3
3
\n
2022
2
3.5
2
\n
2023
2.5
3
2.5
\n\n
\n\n
The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.",
+ "page_num": 7
+ },
+ {
+ "text": "5
\n\n
\n\n
The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.
\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial markets.\nThis section analyzes how these factors\nhave influenced stock, cryptocurrency,\nand commodity markets over recent\nyears, providing insights into the\ncomplex relationship between the\neconomy and financial market\nperformance.## Future Predictions and Trends\n\n\n
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)
Prices (2024 Indexed to 100)
\n
This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)
Prices (2024 Indexed to 100)
\n
This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.
\n
\n\n
Year
Oil
Bitcoin
S&P 500
\n\n\n
2024
100
100
100
\n
2025
105
110
108
\n
2026
110
115
112
\n
2027
115
120
116
\n
2028
120
125
120
\n\n
\n\n\nBased on historical data, current trends,\nand economic indicators, this section\npresents predictions for the future of\nfinancial markets. We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\n",
+ "page_num": 8
+ },
+ {
+ "text": " We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold.## Conclusions\n\n. In conclusion, this report has traversed the multifaceted landscape of\nfinancial markets, shedding light on the intricate patterns and\ninterdependencies that define their behavior. From the volatility of\ncryptocurrencies to the steadiness of commodities, each segment\ntells a part of the story of our global economy. As Contoso Financial\nAnalytics, we are committed to providing our clients with the most\ncomprehensive and nuanced analysis, empowering them to make\ninformed financial decisions in an ever-evolving market.",
+ "page_num": 8
+ }
+]
\ No newline at end of file
diff --git a/tests/test-data/Financial Market Analysis Report 2023.pdf b/tests/test-data/Financial Market Analysis Report 2023.pdf
new file mode 100644
index 0000000000..eef17aad75
Binary files /dev/null and b/tests/test-data/Financial Market Analysis Report 2023.pdf differ
diff --git a/tests/test-data/Financial Market Analysis Report 2023_page2_figure.png b/tests/test-data/Financial Market Analysis Report 2023_page2_figure.png
new file mode 100644
index 0000000000..b5dd8ae577
Binary files /dev/null and b/tests/test-data/Financial Market Analysis Report 2023_page2_figure.png differ
diff --git a/tests/test-data/Simple Figure.pdf b/tests/test-data/Simple Figure.pdf
new file mode 100644
index 0000000000..416f9fd61b
Binary files /dev/null and b/tests/test-data/Simple Figure.pdf differ
diff --git a/tests/test-data/Simple Figure_content.txt b/tests/test-data/Simple Figure_content.txt
new file mode 100644
index 0000000000..3e8db25dbb
--- /dev/null
+++ b/tests/test-data/Simple Figure_content.txt
@@ -0,0 +1,13 @@
+# Simple Figure
+
+This text is before the figure and NOT part of it.
+
+
+
+
+
+This is text after the figure that's not part of it.
diff --git a/tests/test-data/Simple Table.pdf b/tests/test-data/Simple Table.pdf
new file mode 100644
index 0000000000..0a5ae23e90
Binary files /dev/null and b/tests/test-data/Simple Table.pdf differ
diff --git a/tests/test-data/Simple Table_content.txt b/tests/test-data/Simple Table_content.txt
new file mode 100644
index 0000000000..cca5a0ed77
--- /dev/null
+++ b/tests/test-data/Simple Table_content.txt
@@ -0,0 +1,17 @@
+# Simple HTML Table
+
+
+
+
+
Header 1
+
Header 2
+
+
+
Cell 1
+
Cell 2
+
+
+
Cell 3
+
Cell 4
+
+
diff --git a/tests/test-data/pages_with_figures.json b/tests/test-data/pages_with_figures.json
new file mode 100644
index 0000000000..0b157c7f0c
--- /dev/null
+++ b/tests/test-data/pages_with_figures.json
@@ -0,0 +1 @@
+[{"page_num": 0, "offset": 0, "text": "# Financial Market Analysis Report 2023\n\nAn In-Depth Exploration of Stocks, Cryptocurrencies, and Commodities\nPrepared by: Contoso Financial Analytics"}, {"page_num": 1, "offset": 150, "text": "## Executive Summary\n\n. In this comprehensive report, Contoso Financial Analytics provides a\ndeep dive into the financial markets of 2023, focusing on the trends\nand fluctuations within stocks, cryptocurrencies, and commodities.\nOur analysis covers historical patterns, current market conditions, and\nfuture predictions, offering valuable insights for investors, analysts,\nand financial enthusiasts. This report leverages advanced data\nanalytics to present a clear picture of the complex interplay between\ndifferent financial markets and their potential trajectories"}, {"page_num": 2, "offset": 716, "text": "## Introduction to Financial Markets\n\n\n
Global Financial Market Distribution (2023) This pie chart represents the distribution of investments across four categories: Stocks, Bonds, Cryptocurrencies, and Commodities. The chart is divided into four colored sections, each representing a different category. Stocks are shown in blue, Bonds in orange, Cryptocurrencies in gray, and Commodities in yellow. The chart visually indicates the proportion of each investment type within a portfolio.
Category
Color
Stocks
Blue
Bonds
Orange
Cryptocurrencies
Gray
Commodities
Yellow
\n\n\nThe global financial market is a vast and intricate network of\nexchanges, instruments, and assets, ranging from traditional stocks\nand bonds to modern cryptocurrencies and commodities. Each\nsegment plays a crucial role in the overall economy, and their\ninteractions can have profound effects on global financial stability.\nThis section provides an overview of these segments and sets the\nstage for a detailed analysis"}, {"page_num": 3, "offset": 1897, "text": "## Stock Market Overview\n\n\n
5-Year Trend of the S&P 500 Index
This line chart shows the trend of the S&P 500 Index over a five-year period from 2018 to 2022. The index starts at around 2500 in 2018, rises steadily to a peak of about 4500 in 2021, and then declines slightly to approximately 4000 in 2022.
Year
S&P 500 Index
2018
2500
2019
3000
2020
3500
2021
4500
2022
4000
\n\n\nThe stock market is often considered the economy's\nheartbeat, reflecting corporate health and investor\nsentiment. Over the past five years, the S&P 500 index has\nexperienced significant volatility, with notable peaks and\ntroughs corresponding to various economic events. This\noverview examines the key factors that have influenced\nthe stock market's performance and what they indicate\nabout the economy's state"}, {"page_num": 4, "offset": 2937, "text": "## Cryptocurrency Market Dynamics\n\n\n\n\n\nCryptocurrencies have emerged as a new asset\nclass, captivating investors with their potential for\nhigh returns and their role in the future of finance.\nThis section explores the price dynamics of major\ncryptocurrencies like Bitcoin and Ethereum,\nanalyzing the factors driving their volatility and the\nimplications for the broader financial market.\n\n\n"}, {"page_num": 5, "offset": 4243, "text": "### Commodity Market Fluctuations\n\n\n
Price Changes of Oil, Gold, and Wheat This is a horizontal bar chart showing the annual percentage change in prices for Wheat, Gold, and Oil from 2014 to 2022. The chart uses different colors to represent each commodity: gray for Wheat, orange for Gold, and blue for Oil. The x-axis represents the percentage change, ranging from -25% to 35%, while the y-axis lists the years from 2014 to 2022.\n\n
\n
Year
Wheat
Gold
Oil
\n
2022
5%
2%
0%
\n
2021
3%
4%
30%
\n
2020
1%
5%
-20%
\n
2019
2%
3%
10%
\n
2018
0%
1%
15%
\n
2017
4%
2%
5%
\n
2016
3%
6%
-5%
\n
2015
1%
0%
10%
\n
2014
2%
5%
-10%
\n
\n\n\nCommodities such as oil, gold, and\nwheat are fundamental to the global\neconomy, influencing everything from\nenergy costs to food prices. This section\ndelves into the trends and factors\naffecting commodity prices, including\ngeopolitical events, supply-chain\ndisruptions, and environmental factors,\nproviding a comprehensive view of this\ncrucial market segment."}, {"page_num": 6, "offset": 5673, "text": "### Interplay Between Different Market Segments\n\n\n
S&P 500
NASDAQ
Bitcoin
Ethereum
Oil
Gold
S&P 500
1
NASDAQ
0.95
1
Bitcoin
0.3
0.4
1
Ethereum
0.35
0.45
0.9
1
Oil
0.6
0.65
0.2
0.25
1
Gold
-0.2
-0.15
-0.1
-0.05
-0.3
1
\n\n\nFinancial markets are interconnected, with movements in one segment often influencing others. This\nsection examines the correlations between stock indices, cryptocurrency prices, and commodity prices,\nrevealing how changes in one market can have ripple effects across the financial ecosystem."}, {"page_num": 7, "offset": 6695, "text": "### Impact of Macroeconomic Factors\n\n\n
Impact of Interest Rates, Inflation, and GDP Growth on Financial Markets
The image is a line graph titled \"On Financial Markets\" showing the trends of Interest Rates %, Inflation Data %, and GDP Growth % from 2018 to 2023. The graph has three lines representing each of these metrics over the years.
\n\n
\n\n
Year
Interest Rates %
Inflation Data %
GDP Growth %
\n\n\n
2018
2
2
3
\n
2019
2
2.5
2
\n
2020
1
1.5
-4
\n
2021
1.5
3
3
\n
2022
2
3.5
2
\n
2023
2.5
3
2.5
\n\n
\n\n
The graph shows that GDP Growth % experienced a significant drop in 2020, while Inflation Data % and Interest Rates % remained relatively stable with slight fluctuations over the years.
\n\n\nMacroeconomic factors such as interest\nrates, inflation, and GDP growth play a\npivotal role in shaping financial markets.\nThis section analyzes how these factors\nhave influenced stock, cryptocurrency,\nand commodity markets over recent\nyears, providing insights into the\ncomplex relationship between the\neconomy and financial market\nperformance."}, {"page_num": 8, "offset": 8102, "text": "## Future Predictions and Trends\n\n\n
Relative Growth Trends for S&P 500, Bitcoin, and Oil Prices (2024 Indexed to 100)
Prices (2024 Indexed to 100)
\n
This bar chart compares the indexed prices of Oil, Bitcoin, and the S&P 500 from 2024 to 2028, with 2024 set as the base year (indexed to 100). The chart shows the relative price changes over the years for each asset.
\n
\n\n
Year
Oil
Bitcoin
S&P 500
\n\n\n
2024
100
100
100
\n
2025
105
110
108
\n
2026
110
115
112
\n
2027
115
120
116
\n
2028
120
125
120
\n\n
\n\n\nBased on historical data, current trends,\nand economic indicators, this section\npresents predictions for the future of\nfinancial markets. We explore potential\ntrajectories for stock indices,\ncryptocurrency values, and commodity\nprices, offering investors and analysts\nforesight into what the coming years\nmight hold."}, {"page_num": 9, "offset": 9281, "text": "## Conclusions\n\n. In conclusion, this report has traversed the multifaceted landscape of\nfinancial markets, shedding light on the intricate patterns and\ninterdependencies that define their behavior. From the volatility of\ncryptocurrencies to the steadiness of commodities, each segment\ntells a part of the story of our global economy. As Contoso Financial\nAnalytics, we are committed to providing our clients with the most\ncomprehensive and nuanced analysis, empowering them to make\ninformed financial decisions in an ever-evolving market."}]
diff --git a/tests/test_app.py b/tests/test_app.py
index 9c20b9421a..580e31e5d1 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -15,7 +15,7 @@ def fake_response(http_code):
return Response(http_code, request=Request(method="get", url="https://foo.bar/"))
-# See https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter
+# See https://learn.microsoft.com/azure/ai-services/openai/concepts/content-filter
filtered_response = BadRequestError(
message="The response was filtered",
body={
diff --git a/tests/test_mediadescriber.py b/tests/test_mediadescriber.py
new file mode 100644
index 0000000000..117a186281
--- /dev/null
+++ b/tests/test_mediadescriber.py
@@ -0,0 +1,135 @@
+import json
+import logging
+
+import aiohttp
+import pytest
+
+from prepdocslib.mediadescriber import ContentUnderstandingDescriber
+
+from .mocks import MockAzureCredential, MockResponse
+
+
+@pytest.mark.asyncio
+async def test_contentunderstanding_analyze(monkeypatch, caplog):
+
+ def mock_post(*args, **kwargs):
+ if kwargs.get("url").find("badanalyzer") > 0:
+ return MockResponse(
+ status=200,
+ headers={
+ "Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/badanalyzer/operations/7f313e00-4da1-4b19-a25e-53f121c24d10?api-version=2024-12-01-preview"
+ },
+ )
+ if kwargs.get("url").endswith("contentunderstanding/analyzers/image_analyzer:analyze"):
+ return MockResponse(
+ status=200,
+ headers={
+ "Operation-Location": "https://testcontentunderstanding.cognitiveservices.azure.com/contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview"
+ },
+ )
+ else:
+ raise Exception("Unexpected URL for mock call to ClientSession.post()")
+
+ monkeypatch.setattr(aiohttp.ClientSession, "post", mock_post)
+
+ num_poll_calls = 0
+
+ def mock_get(self, url, **kwargs):
+ if url.endswith(
+ "contentunderstanding/analyzers/image_analyzer/results/53e4c016-d2c0-48a9-a9f4-38891f7d45f0?api-version=2024-12-01-preview"
+ ):
+ return MockResponse(
+ status=200,
+ text=json.dumps(
+ {
+ "id": "f8c4c1c0-71c3-410c-a723-d223e0a84a88",
+ "status": "Succeeded",
+ "result": {
+ "analyzerId": "image_analyzer",
+ "apiVersion": "2024-12-01-preview",
+ "createdAt": "2024-12-05T17:33:04Z",
+ "warnings": [],
+ "contents": [
+ {
+ "markdown": "![image](image)\n",
+ "fields": {
+ "Description": {
+ "type": "string",
+ "valueString": "The bar chart titled 'Prices (2024 Indexed to 100)' compares the indexed prices of Oil, Bitcoin, and S&P 500 from 2024 to 2028. Each year is represented by a set of three horizontal bars, with Oil in gray, Bitcoin in orange, and S&P 500 in blue. The index is based on the year 2024, where all values start at 100. Over the years, Bitcoin shows the most significant increase, reaching around 130 by 2028, while Oil and S&P 500 show moderate increases.\n\n