From e12a09d764ef4c7986c0ad6af3f23f434e284ba8 Mon Sep 17 00:00:00 2001 From: aminediro Date: Tue, 12 Nov 2024 15:19:06 +0100 Subject: [PATCH] megaparse in registry + chunk_size --- .../implementations/megaparse_processor.py | 10 ++-- core/quivr_core/processor/registry.py | 53 ++++++++++--------- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py index 81162a9b29fa..8a8456209231 100644 --- a/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/core/quivr_core/processor/implementations/megaparse_processor.py @@ -91,9 +91,7 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]: document = Document( page_content=response["result"], ) - if len(response) > self.splitter_config.chunk_size: - docs = self.text_splitter.split_documents([document]) - for doc in docs: - doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} - return docs - return [document] + docs = self.text_splitter.split_documents([document]) + for doc in docs: + doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} + return docs diff --git a/core/quivr_core/processor/registry.py b/core/quivr_core/processor/registry.py index 860c4d23e1e0..79399a8e8eb9 100644 --- a/core/quivr_core/processor/registry.py +++ b/core/quivr_core/processor/registry.py @@ -1,5 +1,6 @@ import importlib import logging +from os import getenv import types from dataclasses import dataclass, field from heapq import heappop, heappush @@ -119,32 +120,32 @@ def defaults_to_proc_entries( priority=None, ) - # TODO(@aminediro): Megaparse should register itself - # Append Megaparse - _append_proc_mapping( - mapping=base_processors, - file_exts=[ - FileExtension.pdf, - FileExtension.docx, - FileExtension.doc, - FileExtension.pptx, - FileExtension.xls, - FileExtension.xlsx, - FileExtension.csv, - FileExtension.epub, - FileExtension.bib, - FileExtension.odt, - FileExtension.html, - FileExtension.py, - FileExtension.markdown, - FileExtension.md, - FileExtension.mdx, - FileExtension.ipynb, - ], - cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor", - errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor", - priority=None, - ) + # TODO(@aminediro @chloedia): Megaparse should register itself + if getenv("MEGAPARSE_URL") and getenv("MEGAPARSE_API_KEY"): + _append_proc_mapping( + mapping=base_processors, + file_exts=[ + FileExtension.pdf, + FileExtension.docx, + FileExtension.doc, + FileExtension.pptx, + FileExtension.xls, + FileExtension.xlsx, + FileExtension.csv, + FileExtension.epub, + FileExtension.bib, + FileExtension.odt, + FileExtension.html, + FileExtension.py, + FileExtension.markdown, + FileExtension.md, + FileExtension.mdx, + FileExtension.ipynb, + ], + cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor", + errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor", + priority=None, + ) return base_processors