From 6a79d57d348d29bed5f1ae5bddb0170c70ec0451 Mon Sep 17 00:00:00 2001 From: Matt Seddon Date: Fri, 18 Oct 2024 13:54:57 +1100 Subject: [PATCH 1/2] patch unstructured embeddings gen example --- examples/llm_and_nlp/unstructured-embeddings-gen.py | 12 +++++++----- pyproject.toml | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py index 0c0dc5f0e..23b5043cc 100644 --- a/examples/llm_and_nlp/unstructured-embeddings-gen.py +++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py @@ -12,11 +12,11 @@ group_broken_paragraphs, replace_unicode_quotes, ) -from unstructured.embed.huggingface import ( +from unstructured.partition.pdf import partition_pdf +from unstructured_ingest.embed.huggingface import ( HuggingFaceEmbeddingConfig, HuggingFaceEmbeddingEncoder, ) -from unstructured.partition.pdf import partition_pdf from datachain import C, DataChain, DataModel, File @@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]: chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast") # Clean the chunks and add new columns + chunks_cleaned = [] for chunk in chunks: chunk.apply( lambda text: clean( @@ -51,16 +52,17 @@ def process_pdf(file: File) -> Iterator[Chunk]: ) chunk.apply(replace_unicode_quotes) chunk.apply(group_broken_paragraphs) + chunks_cleaned.append({"text": chunk.text}) # create embeddings - chunks_embedded = embedding_encoder.embed_documents(chunks) + chunks_embedded = embedding_encoder.embed_documents(chunks_cleaned) # Add new rows to DataChain for chunk in chunks_embedded: yield Chunk( key=file.path, - text=chunk.text, - embeddings=chunk.embeddings, + text=chunk.get("text"), + embeddings=chunk.get("embeddings"), ) diff --git a/pyproject.toml b/pyproject.toml index 2781714d6..7fb03291f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,8 @@ examples = [ "numpy>=1,<2", "defusedxml", "accelerate", - "unstructured[pdf, embed-huggingface]", + "unstructured-ingest[embed-huggingface]", + "unstructured[pdf]", "pdfplumber==0.11.4", "huggingface_hub[hf_transfer]", "onnx==1.16.1" From fe93b130f2d98192a29cd85b9685f12ef6d86b6e Mon Sep 17 00:00:00 2001 From: Matt Seddon Date: Fri, 18 Oct 2024 19:26:00 +1100 Subject: [PATCH 2/2] reset example and set upper bound for unstructured --- examples/llm_and_nlp/unstructured-embeddings-gen.py | 12 +++++------- pyproject.toml | 3 +-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py index 23b5043cc..0c0dc5f0e 100644 --- a/examples/llm_and_nlp/unstructured-embeddings-gen.py +++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py @@ -12,11 +12,11 @@ group_broken_paragraphs, replace_unicode_quotes, ) -from unstructured.partition.pdf import partition_pdf -from unstructured_ingest.embed.huggingface import ( +from unstructured.embed.huggingface import ( HuggingFaceEmbeddingConfig, HuggingFaceEmbeddingEncoder, ) +from unstructured.partition.pdf import partition_pdf from datachain import C, DataChain, DataModel, File @@ -43,7 +43,6 @@ def process_pdf(file: File) -> Iterator[Chunk]: chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast") # Clean the chunks and add new columns - chunks_cleaned = [] for chunk in chunks: chunk.apply( lambda text: clean( @@ -52,17 +51,16 @@ def process_pdf(file: File) -> Iterator[Chunk]: ) chunk.apply(replace_unicode_quotes) chunk.apply(group_broken_paragraphs) - chunks_cleaned.append({"text": chunk.text}) # create embeddings - chunks_embedded = embedding_encoder.embed_documents(chunks_cleaned) + chunks_embedded = embedding_encoder.embed_documents(chunks) # Add new rows to DataChain for chunk in chunks_embedded: yield Chunk( key=file.path, - text=chunk.get("text"), - embeddings=chunk.get("embeddings"), + text=chunk.text, + embeddings=chunk.embeddings, ) diff --git a/pyproject.toml b/pyproject.toml index 7fb03291f..3733a267c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,8 +106,7 @@ examples = [ "numpy>=1,<2", "defusedxml", "accelerate", - "unstructured-ingest[embed-huggingface]", - "unstructured[pdf]", + "unstructured[pdf,embed-huggingface]<0.16.0", "pdfplumber==0.11.4", "huggingface_hub[hf_transfer]", "onnx==1.16.1"