-
Notifications
You must be signed in to change notification settings - Fork 120
/
workflow.py
86 lines (69 loc) · 2.8 KB
/
workflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from indexify import RemoteGraph
from indexify.functions_sdk.data_objects import File
from indexify.functions_sdk.graph import Graph
from indexify.functions_sdk.indexify_functions import indexify_function
from images import http_client_image
import httpx
@indexify_function(image=http_client_image)
def download_pdf(url: str) -> File:
    """Fetch the document at ``url`` and wrap its bytes in a ``File``.

    Redirects are followed. The returned ``File`` is always tagged with the
    ``application/pdf`` mime type, whatever content type the server reports.

    Args:
        url: Address of the PDF to download.

    Returns:
        A ``File`` holding the raw response body.

    Raises:
        httpx.HTTPStatusError: If the final response has a 4xx/5xx status.
    """
    # NOTE(review): httpx is imported here as well as at module level --
    # presumably so the name resolves when only this function is executed
    # inside its image; confirm before removing.
    import httpx

    response = httpx.get(url, follow_redirects=True)
    response.raise_for_status()
    pdf_bytes = response.content
    return File(data=pdf_bytes, mime_type="application/pdf")
# This graph downloads a PDF, extracts text and image embeddings from it,
# and writes them to the vector database.
def create_graph() -> Graph:
    """Build the workflow graph whose start node is ``download_pdf``.

    The downloaded PDF is handed to ``PDFParser``; the parser output feeds
    both ``chunk_text`` (followed by ``TextEmbeddingExtractor``) and
    ``ImageEmbeddingExtractor``, and both extractors feed ``ChromaDBWriter``.

    Returns:
        The assembled ``Graph``.
    """
    # Project-local node definitions are imported lazily, at call time.
    from embedding import ImageEmbeddingExtractor, TextEmbeddingExtractor, chunk_text
    from chromadb_writer import ChromaDBWriter
    from pdf_parser import PDFParser

    workflow = Graph(
        "Extract_pages_tables_images_pdf",
        start_node=download_pdf,
    )

    # (source, destination) pairs, registered in this order.
    edges = (
        # Parse the PDF which was downloaded
        (download_pdf, PDFParser),
        (PDFParser, chunk_text),
        # Embed all the text chunks and the images in the PDF
        (chunk_text, TextEmbeddingExtractor),
        (PDFParser, ImageEmbeddingExtractor),
        # Write all the embeddings to the vector database
        (TextEmbeddingExtractor, ChromaDBWriter),
        (ImageEmbeddingExtractor, ChromaDBWriter),
    )
    for source, destination in edges:
        workflow.add_edge(source, destination)

    return workflow
# This graph extracts text and image embeddings from the PDF
# and writes them to the vector database
def create_graph_1() -> Graph:
    """Build the workflow graph whose start node is ``PDFParser``.

    The parser output feeds both ``chunk_text`` (followed by
    ``TextEmbeddingExtractor``) and ``ImageEmbeddingExtractor``; both
    extractors feed ``ChromaDBWriter``. No download step is included.

    Returns:
        The assembled ``Graph``.
    """
    # Project-local node definitions are imported lazily, at call time.
    from embedding import ImageEmbeddingExtractor, TextEmbeddingExtractor, chunk_text
    from chromadb_writer import ChromaDBWriter
    from pdf_parser import PDFParser

    workflow = Graph(
        "Extract_pages_tables_images_pdf",
        start_node=PDFParser,
    )

    # (source, destination) pairs, registered in this order.
    edges = (
        (PDFParser, chunk_text),
        # Embed all the text chunks and the images in the PDF
        (chunk_text, TextEmbeddingExtractor),
        (PDFParser, ImageEmbeddingExtractor),
        # Write all the embeddings to the vector database
        (TextEmbeddingExtractor, ChromaDBWriter),
        (ImageEmbeddingExtractor, ChromaDBWriter),
    )
    for source, destination in edges:
        workflow.add_edge(source, destination)

    return workflow
if __name__ == "__main__":
    # Build the variant whose start node is PDFParser; it is invoked below
    # with a File (the `file=` argument) rather than a URL.
    graph: Graph = create_graph_1()

    # To run a graph locally instead of deploying it, uncomment the call below.
    # NOTE(review): the `url=` argument matches create_graph(), whose start
    # node is download_pdf(url); the graph built above is fed `file=` --
    # confirm which input applies before uncommenting.
    # invocation_id = graph.run(block_until_done=True, url="https://arxiv.org/pdf/2302.12854")

    import common_objects
    import images

    # Deploy the graph, passing the local helper modules as additional modules.
    deployed = RemoteGraph.deploy(
        graph,
        additional_modules=[common_objects, images],
    )

    import httpx

    # Download the input PDF here and hand its bytes to the deployed graph.
    pdf_url = "https://arxiv.org/pdf/1706.03762"
    response = httpx.get(pdf_url, follow_redirects=True)
    response.raise_for_status()
    pdf_file = File(data=response.content, mime_type="application/pdf")

    run_id = deployed.run(block_until_done=True, file=pdf_file)
    print(f"Invocation ID: {run_id}")