diff --git a/.github/workflows/publish_executor_containers.yaml b/.github/workflows/publish_executor_containers.yaml index 0d253b2c2..7dc958582 100644 --- a/.github/workflows/publish_executor_containers.yaml +++ b/.github/workflows/publish_executor_containers.yaml @@ -29,11 +29,7 @@ jobs: - run: | pip install indexify -U indexify-cli build-default-image - indexify-cli build-image examples/pdf_document_extraction/embedding.py - indexify-cli build-image examples/pdf_document_extraction/lancedb_functions.py - indexify-cli build-image examples/pdf_document_extraction/chromadb_writer.py - indexify-cli build-image examples/pdf_document_extraction/pdf_parser.py - indexify-cli build-image examples/pdf_document_extraction/workflow.py + indexify-cli build-image examples/pdf_document_extraction/images.py docker push tensorlake/indexify-executor-default:3.10 docker push tensorlake/indexify-executor-default:3.11 docker push tensorlake/tensorlake/pdf-blueprint-st diff --git a/examples/pdf_document_extraction/chromadb_writer.py b/examples/pdf_document_extraction/chromadb_writer.py index 760472377..f7a5a47f1 100644 --- a/examples/pdf_document_extraction/chromadb_writer.py +++ b/examples/pdf_document_extraction/chromadb_writer.py @@ -1,13 +1,11 @@ from indexify.functions_sdk.indexify_functions import IndexifyFunction -from indexify import Image from typing import Union from common_objects import ImageWithEmbedding, TextChunk - -image = Image(python="3.11").name("tensorlake/blueprints-chromadb").run("pip install chromadb").run("pip install pillow") +from images import chroma_image class ChromaDBWriter(IndexifyFunction): name = "chroma_db_writer" - image = image + image = chroma_image def __init__(self): import chromadb diff --git a/examples/pdf_document_extraction/embedding.py b/examples/pdf_document_extraction/embedding.py index 6bf46059e..624fcc32f 100644 --- a/examples/pdf_document_extraction/embedding.py +++ b/examples/pdf_document_extraction/embedding.py @@ -1,23 +1,14 @@ from typing import Any, List -from indexify import Image from indexify.functions_sdk.indexify_functions import IndexifyFunction, indexify_function from sentence_transformers import SentenceTransformer from common_objects import ImageWithEmbedding, TextChunk from inkwell.api.document import Document from inkwell.api.page import PageFragmentType import base64 +from images import st_image -image = ( - Image(python="3.11") - .name("tensorlake/pdf-blueprint-st") - .run("pip install sentence-transformers") - .run("pip install langchain") - .run("pip install pillow") - .run("pip install py-inkwell") -) - -@indexify_function(image=image) +@indexify_function(image=st_image) def chunk_text(document: Document) -> List[TextChunk]: """ Extract chunks from document @@ -54,7 +45,7 @@ class TextEmbeddingExtractor(IndexifyFunction): description = "Extractor class that captures an embedding model" system_dependencies = [] input_mime_types = ["text"] - image = image + image = st_image def __init__(self): super().__init__() @@ -69,7 +60,7 @@ def run(self, input: TextChunk) -> TextChunk: class ImageEmbeddingExtractor(IndexifyFunction): name = "image-embedding" description = "Extractor class that captures an embedding model" - image=image + image=st_image def __init__(self): super().__init__() diff --git a/examples/pdf_document_extraction/images.py b/examples/pdf_document_extraction/images.py new file mode 100644 index 000000000..1697bd486 --- /dev/null +++ b/examples/pdf_document_extraction/images.py @@ -0,0 +1,41 @@ +from indexify import Image + + +chroma_image = Image(python="3.11").name("tensorlake/blueprints-chromadb").run("pip install chromadb").run("pip install pillow") + +st_image = ( + Image(python="3.11") + .name("tensorlake/pdf-blueprint-st") + .run("pip install sentence-transformers") + .run("pip install langchain") + .run("pip install pillow") + .run("pip install py-inkwell") +) + + +lance_image = Image(python="3.11").name("tensorlake/pdf-blueprint-lancdb").run("pip install lancedb") + +inkwell_image = ( + Image(python="3.11") + .name("tensorlake/pdf-blueprint-pdf-parser") + .run("apt update") + .run("apt install -y libgl1-mesa-glx git g++") + .run("pip install torch") + .run("pip install numpy") + .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6") + .run("apt install -y tesseract-ocr") + .run("apt install -y libtesseract-dev") + .run("pip install \"py-inkwell[inference]\"") +) + +inkwell_image_gpu = ( + Image() + .name("tensorlake/pdf-blueprint-pdf-parser-gpu") + .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime") + .run("apt update") + .run("apt install -y libgl1-mesa-glx git g++") + .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6") + .run("apt install -y tesseract-ocr") + .run("apt install -y libtesseract-dev") + .run("pip install \"py-inkwell[inference]\"") +) \ No newline at end of file diff --git a/examples/pdf_document_extraction/lancedb_functions.py b/examples/pdf_document_extraction/lancedb_functions.py index 48e77dd61..aef9c49a8 100644 --- a/examples/pdf_document_extraction/lancedb_functions.py +++ b/examples/pdf_document_extraction/lancedb_functions.py @@ -1,12 +1,11 @@ from typing import Union -from indexify import Image from indexify.functions_sdk.indexify_functions import IndexifyFunction from common_objects import ImageWithEmbedding, TextChunk import lancedb from lancedb.pydantic import LanceModel, Vector +from images import lance_image -image = Image(python="3.11").name("tensorlake/pdf-blueprint-lancdb").run("pip install lancedb") class ImageEmbeddingTable(LanceModel): vector: Vector(512) @@ -20,7 +19,7 @@ class TextEmbeddingTable(LanceModel): class LanceDBWriter(IndexifyFunction): name = "lancedb_writer" - image = image + image = lance_image def __init__(self): super().__init__() diff --git a/examples/pdf_document_extraction/pdf_parser.py b/examples/pdf_document_extraction/pdf_parser.py index 36684d569..8371031e0 100644 --- a/examples/pdf_document_extraction/pdf_parser.py +++ b/examples/pdf_document_extraction/pdf_parser.py @@ -1,39 +1,14 @@ from indexify.functions_sdk.data_objects import File from indexify.functions_sdk.indexify_functions import IndexifyFunction -from indexify import Image from inkwell.api.document import Document - -image = ( - Image(python="3.11") - .name("tensorlake/pdf-blueprint-pdf-parser") - .run("apt update") - .run("apt install -y libgl1-mesa-glx git g++") - .run("pip install torch") - .run("pip install numpy") - .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6") - .run("apt install -y tesseract-ocr") - .run("apt install -y libtesseract-dev") - .run("pip install \"py-inkwell[inference]\"") -) - -gpu_image = ( - Image() - .name("tensorlake/pdf-blueprint-pdf-parser-gpu") - .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime") - .run("apt update") - .run("apt install -y libgl1-mesa-glx git g++") - .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6") - .run("apt install -y tesseract-ocr") - .run("apt install -y libtesseract-dev") - .run("pip install \"py-inkwell[inference]\"") -) +from images import inkwell_image_gpu class PDFParser(IndexifyFunction): name = "pdf-parse" description = "Parser class that captures a pdf file" # Change to gpu_image to use GPU - image = gpu_image + image = inkwell_image_gpu def __init__(self): super().__init__() diff --git a/examples/pdf_document_extraction/workflow.py b/examples/pdf_document_extraction/workflow.py index 98b471b1f..93276294a 100644 --- a/examples/pdf_document_extraction/workflow.py +++ b/examples/pdf_document_extraction/workflow.py @@ -48,8 +48,9 @@ def create_graph() -> Graph: # Uncomment this to run the graph locally #invocation_id = graph.run(block_until_done=True, url="https://arxiv.org/pdf/2302.12854") import common_objects + import images - remote_graph = RemoteGraph.deploy(graph, additional_modules=[common_objects]) + remote_graph = RemoteGraph.deploy(graph, additional_modules=[common_objects, images]) invocation_id = remote_graph.run( block_until_done=True, url="https://arxiv.org/pdf/1706.03762" )