-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathapp.py
116 lines (91 loc) · 3.96 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""Example of using pydantic_ai with OpenAI's multimodal LLM for invoice processing."""
import asyncio
import base64
import mimetypes
import os
from dataclasses import dataclass

from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from pydantic_ai import Agent, RunContext
# Load environment variables at import time, before any service is built:
# MultimodalLLMService.__init__ reads OPENAI_API_KEY via os.getenv.
# (presumably load_dotenv pulls them from a local .env file — confirm)
load_dotenv()
class LineItem(BaseModel):
    """Structured representation of a line item in an invoice."""

    # NOTE(review): the class docstring and every Field description are kept
    # verbatim; pydantic presumably publishes them in the model's JSON schema,
    # which would make them part of what the LLM sees — confirm before rewording.

    # Free-text label of the billed item.
    description: str = Field(
        description="Description of the line item.",
    )
    # Number of units billed (integer only).
    quantity: int = Field(
        description="Quantity of the line item.",
    )
    # Price of a single unit.
    unit_price: float = Field(
        description="Unit price of the line item.",
    )
    # Price of the whole line as extracted; not recomputed or validated here.
    total_price: float = Field(
        description="Total price for the line item.",
    )
class InvoiceExtractionResult(BaseModel):
    """Structured response for invoice extraction."""

    # NOTE(review): the class docstring and every Field description are kept
    # verbatim; pydantic presumably publishes them in the model's JSON schema,
    # which would make them part of what the LLM sees — confirm before rewording.

    # Grand total of the invoice.
    total_amount: float = Field(
        description="The total amount extracted from the invoice image.",
    )
    # Issuer of the invoice, as plain text.
    sender: str = Field(
        description="The sender of the invoice.",
    )
    # Invoice date kept as a raw string; no date parsing happens in this model.
    date: str = Field(
        description="The date of the invoice.",
    )
    # One entry per billed line.
    line_items: list[LineItem] = Field(
        description="The list of line items in the invoice.",
    )
class MultimodalLLMService:
    """Service to interact with OpenAI multimodal LLMs.

    Holds a synchronous ``OpenAI`` client and the name of the model to use.
    ``perform_task`` is a coroutine, so the blocking work it needs (reading
    the image from disk and the HTTP request made by the synchronous client)
    is pushed to a worker thread to keep the event loop responsive.
    """

    def __init__(self, model: str):
        """Create the client from ``OPENAI_API_KEY`` and remember *model*.

        Args:
            model: Name of the OpenAI model used for every request.
        """
        self.client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        self.model = model

    def get_model_name(self) -> str:
        """Return the name of the model."""
        return self.model

    @staticmethod
    def _image_to_data_url(image_path: str) -> str:
        """Read *image_path* and return its content as a base64 ``data:`` URL.

        The MIME type is inferred from the file name so that JPEG, WebP, etc.
        are labelled correctly. When the type is unknown or is not an image
        type, ``image/png`` is used, which matches the previous hard-coded
        behaviour.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        guessed_type, _ = mimetypes.guess_type(image_path)
        if guessed_type is not None and guessed_type.startswith("image/"):
            mime_type = guessed_type
        else:
            mime_type = "image/png"

        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("ascii")
        return f"data:{mime_type};base64,{encoded}"

    async def perform_task(self, image_path: str, response_model: type, max_tokens: int = 5000):
        """Send an image and prompt to the LLM and return structured output.

        Args:
            image_path: Path of the invoice image to upload.
            response_model: Class passed as ``response_format`` to the
                OpenAI structured-output parser.
            max_tokens: Upper bound on generated tokens for the completion.

        Returns:
            The ``parsed`` attribute of the first choice's message.
            NOTE(review): the OpenAI SDK may leave ``parsed`` as ``None``
            (e.g. on a refusal) — callers should confirm and handle that.

        Raises:
            OSError: If the image file cannot be read.
        """
        # File I/O is blocking; run it off the event loop.
        data_url = await asyncio.to_thread(self._image_to_data_url, image_path)

        messages = [
            {"role": "system", "content": "You are an assistant that extracts details from invoices."},
            {"role": "user", "content": [
                {"type": "text", "text": "Extract the details from this invoice."},
                {"type": "image_url", "image_url": {"url": data_url}},
            ]},
        ]

        # self.client is the synchronous OpenAI client, so calling it directly
        # inside a coroutine would stall every other task for the duration of
        # the HTTP request. Run it in a worker thread instead.
        response = await asyncio.to_thread(
            self.client.beta.chat.completions.parse,
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            response_format=response_model,
        )
        return response.choices[0].message.parsed
@dataclass
class InvoiceProcessingDependencies:
    """Bundle of objects injected into the invoice processing agent's tools."""

    # Service whose ``perform_task`` coroutine the extraction tool awaits.
    llm_service: MultimodalLLMService

    # Path of the invoice image that the extraction tool passes to the service.
    invoice_image_path: str
# Agent that turns an invoice image into an InvoiceExtractionResult.
# Its tools receive an InvoiceProcessingDependencies instance as ``ctx.deps``.
invoice_processing_agent = Agent(
    "openai:gpt-4o-mini",
    system_prompt="Extract the total amount, sender, date, and line items from the given invoice image.",
    result_type=InvoiceExtractionResult,
    deps_type=InvoiceProcessingDependencies,
)
# Agent that produces a plain-text summary; it declares no dependencies.
summary_agent = Agent(
    "openai:gpt-4o-mini",
    system_prompt="Summarize the extracted invoice details into a few sentences.",
    result_type=str,
)
@invoice_processing_agent.tool
async def extract_invoice_details(
    ctx: RunContext[InvoiceProcessingDependencies]
) -> InvoiceExtractionResult:
    """Custom tool to extract details from an invoice image."""
    # NOTE(review): the docstring above is kept verbatim on purpose;
    # pydantic_ai presumably forwards tool docstrings to the model as the
    # tool description — confirm before rewording it.
    deps = ctx.deps

    # Delegate the actual vision call to the injected service, asking it to
    # parse the reply straight into InvoiceExtractionResult.
    extracted = await deps.llm_service.perform_task(
        image_path=deps.invoice_image_path,
        response_model=InvoiceExtractionResult,
    )
    return extracted
async def main():
    """Extract structured data from the sample invoice, then summarize it.

    Runs the invoice processing agent against ``images/invoice_sample.png``,
    prints the structured result, and then asks the summary agent for a short
    summary using the messages produced by the first run as history.
    """
    service = MultimodalLLMService(model="gpt-4o-mini")
    dependencies = InvoiceProcessingDependencies(
        llm_service=service,
        invoice_image_path="images/invoice_sample.png",
    )

    # Step 1: structured extraction.
    extraction_prompt = "Extract the total amount, sender, date, and line items from this invoice."
    extraction = await invoice_processing_agent.run(extraction_prompt, deps=dependencies)
    print("Structured Result:", extraction.data)
    print("=" * 100)

    # Step 2: summary, seeded with the messages from the extraction run.
    summary_prompt = "Summarize the invoice details in a few sentences."
    summary_run = await summary_agent.run(
        summary_prompt,
        message_history=extraction.new_messages(),
    )
    print("Summary:", summary_run.data)
# Script entry point: run the async workflow only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())