update images

tensorlakeai · Oct 19, 2024 · d1d35a7 · d1d35a7
1 parent 01f9fa0
commit d1d35a7
Show file tree

Hide file tree

Showing 7 changed files with 54 additions and 53 deletions.
diff --git a/.github/workflows/publish_executor_containers.yaml b/.github/workflows/publish_executor_containers.yaml
@@ -29,11 +29,7 @@ jobs:
       - run: |
           pip install indexify -U
           indexify-cli build-default-image 
-          indexify-cli build-image examples/pdf_document_extraction/embedding.py
-          indexify-cli build-image examples/pdf_document_extraction/lancedb_functions.py
-          indexify-cli build-image examples/pdf_document_extraction/chromadb_writer.py
-          indexify-cli build-image examples/pdf_document_extraction/pdf_parser.py
-          indexify-cli build-image examples/pdf_document_extraction/workflow.py
+          indexify-cli build-image examples/pdf_document_extraction/images.py
           docker push tensorlake/indexify-executor-default:3.10
           docker push tensorlake/indexify-executor-default:3.11
           docker push tensorlake/tensorlake/pdf-blueprint-st

diff --git a/examples/pdf_document_extraction/chromadb_writer.py b/examples/pdf_document_extraction/chromadb_writer.py
@@ -1,13 +1,11 @@
 from indexify.functions_sdk.indexify_functions import IndexifyFunction
-from indexify import Image
 from typing import Union
 from common_objects import ImageWithEmbedding, TextChunk
-
-image = Image(python="3.11").name("tensorlake/blueprints-chromadb").run("pip install chromadb").run("pip install pillow")
+from images import chroma_image
 
 class ChromaDBWriter(IndexifyFunction):
     name = "chroma_db_writer"
-    image = image
+    image = chroma_image
 
     def __init__(self):
         import chromadb

diff --git a/examples/pdf_document_extraction/embedding.py b/examples/pdf_document_extraction/embedding.py
@@ -1,23 +1,14 @@
 from typing import Any, List
 
-from indexify import Image
 from indexify.functions_sdk.indexify_functions import IndexifyFunction, indexify_function
 from sentence_transformers import SentenceTransformer
 from common_objects import ImageWithEmbedding, TextChunk
 from inkwell.api.document import Document
 from inkwell.api.page import PageFragmentType
 import base64
+from images import st_image
 
-image = (
-    Image(python="3.11")
-    .name("tensorlake/pdf-blueprint-st")
-    .run("pip install sentence-transformers")
-    .run("pip install langchain")
-    .run("pip install pillow")
-    .run("pip install py-inkwell")
-)
-
-@indexify_function(image=image)
+@indexify_function(image=st_image)
 def chunk_text(document: Document) -> List[TextChunk]:
     """
     Extract chunks from document
@@ -54,7 +45,7 @@ class TextEmbeddingExtractor(IndexifyFunction):
     description = "Extractor class that captures an embedding model"
     system_dependencies = []
     input_mime_types = ["text"]
-    image = image
+    image = st_image
 
     def __init__(self):
         super().__init__()
@@ -69,7 +60,7 @@ def run(self, input: TextChunk) -> TextChunk:
 class ImageEmbeddingExtractor(IndexifyFunction):
     name = "image-embedding"
     description = "Extractor class that captures an embedding model"
-    image=image
+    image=st_image
 
     def __init__(self):
         super().__init__()

diff --git a/examples/pdf_document_extraction/images.py b/examples/pdf_document_extraction/images.py
@@ -0,0 +1,41 @@
+from indexify import Image
+
+
+chroma_image = Image(python="3.11").name("tensorlake/blueprints-chromadb").run("pip install chromadb").run("pip install pillow")
+
+st_image = (
+    Image(python="3.11")
+    .name("tensorlake/pdf-blueprint-st")
+    .run("pip install sentence-transformers")
+    .run("pip install langchain")
+    .run("pip install pillow")
+    .run("pip install py-inkwell")
+)
+
+
+lance_image = Image(python="3.11").name("tensorlake/pdf-blueprint-lancdb").run("pip install lancedb")
+
+inkwell_image = (
+    Image(python="3.11")
+    .name("tensorlake/pdf-blueprint-pdf-parser")
+    .run("apt update")
+    .run("apt install -y libgl1-mesa-glx git g++")
+    .run("pip install torch")
+    .run("pip install numpy")
+    .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6")
+    .run("apt install -y tesseract-ocr")
+    .run("apt install -y libtesseract-dev")
+    .run("pip install \"py-inkwell[inference]\"")
+)
+
+inkwell_image_gpu = (
+    Image()
+    .name("tensorlake/pdf-blueprint-pdf-parser-gpu")
+    .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime")
+    .run("apt update")
+    .run("apt install -y libgl1-mesa-glx git g++")
+    .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6")
+    .run("apt install -y tesseract-ocr")
+    .run("apt install -y libtesseract-dev")
+    .run("pip install \"py-inkwell[inference]\"")
+)
diff --git a/examples/pdf_document_extraction/lancedb_functions.py b/examples/pdf_document_extraction/lancedb_functions.py
@@ -1,12 +1,11 @@
 from typing import Union
 
-from indexify import Image
 from indexify.functions_sdk.indexify_functions import IndexifyFunction
 from common_objects import ImageWithEmbedding, TextChunk
 import lancedb
 from lancedb.pydantic import LanceModel, Vector
+from images import lance_image
 
-image = Image(python="3.11").name("tensorlake/pdf-blueprint-lancdb").run("pip install lancedb")
 
 class ImageEmbeddingTable(LanceModel):
     vector: Vector(512)
@@ -20,7 +19,7 @@ class TextEmbeddingTable(LanceModel):
 
 class LanceDBWriter(IndexifyFunction):
     name = "lancedb_writer"
-    image = image
+    image = lance_image
 
     def __init__(self):
         super().__init__()

diff --git a/examples/pdf_document_extraction/pdf_parser.py b/examples/pdf_document_extraction/pdf_parser.py
@@ -1,39 +1,14 @@
 from indexify.functions_sdk.data_objects import File
 from indexify.functions_sdk.indexify_functions import IndexifyFunction
-from indexify import Image
 
 from inkwell.api.document import Document
-
-image = (
-    Image(python="3.11")
-    .name("tensorlake/pdf-blueprint-pdf-parser")
-    .run("apt update")
-    .run("apt install -y libgl1-mesa-glx git g++")
-    .run("pip install torch")
-    .run("pip install numpy")
-    .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6")
-    .run("apt install -y tesseract-ocr")
-    .run("apt install -y libtesseract-dev")
-    .run("pip install \"py-inkwell[inference]\"")
-)
-
-gpu_image = (
-    Image()
-    .name("tensorlake/pdf-blueprint-pdf-parser-gpu")
-    .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime")
-    .run("apt update")
-    .run("apt install -y libgl1-mesa-glx git g++")
-    .run("pip install git+https://github.com/facebookresearch/detectron2.git@v0.6")
-    .run("apt install -y tesseract-ocr")
-    .run("apt install -y libtesseract-dev")
-    .run("pip install \"py-inkwell[inference]\"")
-)
+from images import inkwell_image_gpu
 
 class PDFParser(IndexifyFunction):
     name = "pdf-parse"
     description = "Parser class that captures a pdf file"
     # Change to gpu_image to use GPU
-    image = gpu_image
+    image = inkwell_image_gpu
 
     def __init__(self):
         super().__init__()

diff --git a/examples/pdf_document_extraction/workflow.py b/examples/pdf_document_extraction/workflow.py
@@ -48,8 +48,9 @@ def create_graph() -> Graph:
     # Uncomment this to run the graph locally
     #invocation_id = graph.run(block_until_done=True, url="https://arxiv.org/pdf/2302.12854")
     import common_objects
+    import images
 
-    remote_graph = RemoteGraph.deploy(graph, additional_modules=[common_objects])
+    remote_graph = RemoteGraph.deploy(graph, additional_modules=[common_objects, images])
     invocation_id = remote_graph.run(
         block_until_done=True, url="https://arxiv.org/pdf/1706.03762"
     )