Skip to content

Commit

Permalink
update images
Browse files Browse the repository at this point in the history
  • Loading branch information
diptanu committed Oct 19, 2024
1 parent 01f9fa0 commit d1d35a7
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 53 deletions.
6 changes: 1 addition & 5 deletions .github/workflows/publish_executor_containers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,7 @@ jobs:
- run: |
pip install indexify -U
indexify-cli build-default-image
indexify-cli build-image examples/pdf_document_extraction/embedding.py
indexify-cli build-image examples/pdf_document_extraction/lancedb_functions.py
indexify-cli build-image examples/pdf_document_extraction/chromadb_writer.py
indexify-cli build-image examples/pdf_document_extraction/pdf_parser.py
indexify-cli build-image examples/pdf_document_extraction/workflow.py
indexify-cli build-image examples/pdf_document_extraction/images.py
docker push tensorlake/indexify-executor-default:3.10
docker push tensorlake/indexify-executor-default:3.11
docker push tensorlake/tensorlake/pdf-blueprint-st
Expand Down
6 changes: 2 additions & 4 deletions examples/pdf_document_extraction/chromadb_writer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
from indexify.functions_sdk.indexify_functions import IndexifyFunction
from indexify import Image
from typing import Union
from common_objects import ImageWithEmbedding, TextChunk

image = Image(python="3.11").name("tensorlake/blueprints-chromadb").run("pip install chromadb").run("pip install pillow")
from images import chroma_image

class ChromaDBWriter(IndexifyFunction):
name = "chroma_db_writer"
image = image
image = chroma_image

def __init__(self):
import chromadb
Expand Down
17 changes: 4 additions & 13 deletions examples/pdf_document_extraction/embedding.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
from typing import Any, List

from indexify import Image
from indexify.functions_sdk.indexify_functions import IndexifyFunction, indexify_function
from sentence_transformers import SentenceTransformer
from common_objects import ImageWithEmbedding, TextChunk
from inkwell.api.document import Document
from inkwell.api.page import PageFragmentType
import base64
from images import st_image

image = (
Image(python="3.11")
.name("tensorlake/pdf-blueprint-st")
.run("pip install sentence-transformers")
.run("pip install langchain")
.run("pip install pillow")
.run("pip install py-inkwell")
)

@indexify_function(image=image)
@indexify_function(image=st_image)
def chunk_text(document: Document) -> List[TextChunk]:
"""
Extract chunks from document
Expand Down Expand Up @@ -54,7 +45,7 @@ class TextEmbeddingExtractor(IndexifyFunction):
description = "Extractor class that captures an embedding model"
system_dependencies = []
input_mime_types = ["text"]
image = image
image = st_image

def __init__(self):
super().__init__()
Expand All @@ -69,7 +60,7 @@ def run(self, input: TextChunk) -> TextChunk:
class ImageEmbeddingExtractor(IndexifyFunction):
name = "image-embedding"
description = "Extractor class that captures an embedding model"
image=image
image=st_image

def __init__(self):
super().__init__()
Expand Down
41 changes: 41 additions & 0 deletions examples/pdf_document_extraction/images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from indexify import Image


def _apply_runs(image, commands):
    """Call ``image.run(command)`` for each command in order and return the result."""
    for command in commands:
        image = image.run(command)
    return image


# NOTE(review): the detectron2 URL ends in "[email protected]", which looks like
# an e-mail-obfuscation artifact rather than a real git ref -- confirm the
# intended value before building. It is reproduced verbatim here.
_DETECTRON2_INSTALL = "pip install git+https://github.com/facebookresearch/[email protected]"

# System packages installed first in both PDF-parser images.
_PARSER_APT_PREREQS = (
    "apt update",
    "apt install -y libgl1-mesa-glx git g++",
)

# Steps that close out both PDF-parser images: detectron2, tesseract, inkwell.
_PARSER_TAIL = (
    _DETECTRON2_INSTALL,
    "apt install -y tesseract-ocr",
    "apt install -y libtesseract-dev",
    'pip install "py-inkwell[inference]"',
)

# Python 3.11 image with chromadb and pillow installed via pip.
chroma_image = _apply_runs(
    Image(python="3.11").name("tensorlake/blueprints-chromadb"),
    ["pip install chromadb", "pip install pillow"],
)

# Python 3.11 image with sentence-transformers, langchain, pillow and py-inkwell.
st_image = _apply_runs(
    Image(python="3.11").name("tensorlake/pdf-blueprint-st"),
    [
        "pip install sentence-transformers",
        "pip install langchain",
        "pip install pillow",
        "pip install py-inkwell",
    ],
)

# Python 3.11 image with lancedb installed via pip.
lance_image = _apply_runs(
    Image(python="3.11").name("tensorlake/pdf-blueprint-lancdb"),
    ["pip install lancedb"],
)

# CPU PDF-parser image: installs torch and numpy explicitly before detectron2.
inkwell_image = _apply_runs(
    Image(python="3.11").name("tensorlake/pdf-blueprint-pdf-parser"),
    [
        *_PARSER_APT_PREREQS,
        "pip install torch",
        "pip install numpy",
        *_PARSER_TAIL,
    ],
)

# GPU PDF-parser image: starts from a pytorch CUDA runtime base image, so it
# has no separate torch/numpy install steps.
inkwell_image_gpu = _apply_runs(
    Image()
    .name("tensorlake/pdf-blueprint-pdf-parser-gpu")
    .base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime"),
    [*_PARSER_APT_PREREQS, *_PARSER_TAIL],
)
5 changes: 2 additions & 3 deletions examples/pdf_document_extraction/lancedb_functions.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from typing import Union

from indexify import Image
from indexify.functions_sdk.indexify_functions import IndexifyFunction
from common_objects import ImageWithEmbedding, TextChunk
import lancedb
from lancedb.pydantic import LanceModel, Vector
from images import lance_image

image = Image(python="3.11").name("tensorlake/pdf-blueprint-lancdb").run("pip install lancedb")

class ImageEmbeddingTable(LanceModel):
vector: Vector(512)
Expand All @@ -20,7 +19,7 @@ class TextEmbeddingTable(LanceModel):

class LanceDBWriter(IndexifyFunction):
name = "lancedb_writer"
image = image
image = lance_image

def __init__(self):
super().__init__()
Expand Down
29 changes: 2 additions & 27 deletions examples/pdf_document_extraction/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,14 @@
from indexify.functions_sdk.data_objects import File
from indexify.functions_sdk.indexify_functions import IndexifyFunction
from indexify import Image

from inkwell.api.document import Document

image = (
Image(python="3.11")
.name("tensorlake/pdf-blueprint-pdf-parser")
.run("apt update")
.run("apt install -y libgl1-mesa-glx git g++")
.run("pip install torch")
.run("pip install numpy")
.run("pip install git+https://github.com/facebookresearch/[email protected]")
.run("apt install -y tesseract-ocr")
.run("apt install -y libtesseract-dev")
.run("pip install \"py-inkwell[inference]\"")
)

gpu_image = (
Image()
.name("tensorlake/pdf-blueprint-pdf-parser-gpu")
.base_image("pytorch/pytorch:2.4.1-cuda11.8-cudnn9-runtime")
.run("apt update")
.run("apt install -y libgl1-mesa-glx git g++")
.run("pip install git+https://github.com/facebookresearch/[email protected]")
.run("apt install -y tesseract-ocr")
.run("apt install -y libtesseract-dev")
.run("pip install \"py-inkwell[inference]\"")
)
from images import inkwell_image_gpu

class PDFParser(IndexifyFunction):
name = "pdf-parse"
description = "Parser class that captures a pdf file"
# Runs on the GPU image; to run on CPU, import inkwell_image from images and assign it here
image = gpu_image
image = inkwell_image_gpu

def __init__(self):
super().__init__()
Expand Down
3 changes: 2 additions & 1 deletion examples/pdf_document_extraction/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ def create_graph() -> Graph:
# Uncomment this to run the graph locally
#invocation_id = graph.run(block_until_done=True, url="https://arxiv.org/pdf/2302.12854")
import common_objects
import images

remote_graph = RemoteGraph.deploy(graph, additional_modules=[common_objects])
remote_graph = RemoteGraph.deploy(graph, additional_modules=[common_objects, images])
invocation_id = remote_graph.run(
block_until_done=True, url="https://arxiv.org/pdf/1706.03762"
)
Expand Down

0 comments on commit d1d35a7

Please sign in to comment.