From 6a79d57d348d29bed5f1ae5bddb0170c70ec0451 Mon Sep 17 00:00:00 2001
From: Matt Seddon <mattseddon@hotmail.com>
Date: Fri, 18 Oct 2024 13:54:57 +1100
Subject: [PATCH 1/2] patch unstructured embeddings gen example to use unstructured-ingest

---
 examples/llm_and_nlp/unstructured-embeddings-gen.py | 12 +++++++-----
 pyproject.toml                                      |  3 ++-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py
index 0c0dc5f0e..23b5043cc 100644
--- a/examples/llm_and_nlp/unstructured-embeddings-gen.py
+++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py
@@ -12,11 +12,11 @@
     group_broken_paragraphs,
     replace_unicode_quotes,
 )
-from unstructured.embed.huggingface import (
+from unstructured.partition.pdf import partition_pdf
+from unstructured_ingest.embed.huggingface import (
     HuggingFaceEmbeddingConfig,
     HuggingFaceEmbeddingEncoder,
 )
-from unstructured.partition.pdf import partition_pdf
 
 from datachain import C, DataChain, DataModel, File
 
@@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
         chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
 
     # Clean the chunks and add new columns
+    chunks_cleaned = []
     for chunk in chunks:
         chunk.apply(
             lambda text: clean(
@@ -51,16 +52,17 @@ def process_pdf(file: File) -> Iterator[Chunk]:
         )
         chunk.apply(replace_unicode_quotes)
         chunk.apply(group_broken_paragraphs)
+        chunks_cleaned.append({"text": chunk.text})
 
     # create embeddings
-    chunks_embedded = embedding_encoder.embed_documents(chunks)
+    chunks_embedded = embedding_encoder.embed_documents(chunks_cleaned)
 
     # Add new rows to DataChain
     for chunk in chunks_embedded:
         yield Chunk(
             key=file.path,
-            text=chunk.text,
-            embeddings=chunk.embeddings,
+            text=chunk.get("text"),
+            embeddings=chunk.get("embeddings"),
         )
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 2781714d6..7fb03291f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,7 +106,8 @@ examples = [
   "numpy>=1,<2",
   "defusedxml",
   "accelerate",
-  "unstructured[pdf, embed-huggingface]",
+  "unstructured-ingest[embed-huggingface]",
+  "unstructured[pdf]",
   "pdfplumber==0.11.4",
   "huggingface_hub[hf_transfer]",
   "onnx==1.16.1"

From fe93b130f2d98192a29cd85b9685f12ef6d86b6e Mon Sep 17 00:00:00 2001
From: Matt Seddon <mattseddon@hotmail.com>
Date: Fri, 18 Oct 2024 19:26:00 +1100
Subject: [PATCH 2/2] reset example to unstructured.embed and pin unstructured<0.16.0

---
 examples/llm_and_nlp/unstructured-embeddings-gen.py | 12 +++++-------
 pyproject.toml                                      |  3 +--
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py
index 23b5043cc..0c0dc5f0e 100644
--- a/examples/llm_and_nlp/unstructured-embeddings-gen.py
+++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py
@@ -12,11 +12,11 @@
     group_broken_paragraphs,
     replace_unicode_quotes,
 )
-from unstructured.partition.pdf import partition_pdf
-from unstructured_ingest.embed.huggingface import (
+from unstructured.embed.huggingface import (
     HuggingFaceEmbeddingConfig,
     HuggingFaceEmbeddingEncoder,
 )
+from unstructured.partition.pdf import partition_pdf
 
 from datachain import C, DataChain, DataModel, File
 
@@ -43,7 +43,6 @@ def process_pdf(file: File) -> Iterator[Chunk]:
         chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
 
     # Clean the chunks and add new columns
-    chunks_cleaned = []
     for chunk in chunks:
         chunk.apply(
             lambda text: clean(
@@ -52,17 +51,16 @@ def process_pdf(file: File) -> Iterator[Chunk]:
         )
         chunk.apply(replace_unicode_quotes)
         chunk.apply(group_broken_paragraphs)
-        chunks_cleaned.append({"text": chunk.text})
 
     # create embeddings
-    chunks_embedded = embedding_encoder.embed_documents(chunks_cleaned)
+    chunks_embedded = embedding_encoder.embed_documents(chunks)
 
     # Add new rows to DataChain
     for chunk in chunks_embedded:
         yield Chunk(
             key=file.path,
-            text=chunk.get("text"),
-            embeddings=chunk.get("embeddings"),
+            text=chunk.text,
+            embeddings=chunk.embeddings,
         )
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 7fb03291f..3733a267c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,8 +106,7 @@ examples = [
   "numpy>=1,<2",
   "defusedxml",
   "accelerate",
-  "unstructured-ingest[embed-huggingface]",
-  "unstructured[pdf]",
+  "unstructured[pdf,embed-huggingface]<0.16.0",
   "pdfplumber==0.11.4",
   "huggingface_hub[hf_transfer]",
   "onnx==1.16.1"