diff --git a/poetry.lock b/poetry.lock index e03ece8a0..ba0bbd188 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,17 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. + +[[package]] +name = "aiofiles" +version = "24.1.0" +description = "File support for asyncio." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5"}, + {file = "aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c"}, +] [[package]] name = "aiohappyeyeballs" @@ -195,7 +208,7 @@ description = "Timeout context manager for asyncio programs" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"vector-db-based\" and python_version == \"3.10\"" +markers = "extra == \"vector-db-based\" and python_version < \"3.11\"" files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, @@ -349,7 +362,7 @@ description = "Python module to generate and modify bytecode" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"manifest-server\" and python_version <= \"3.13.0\"" +markers = "extra == \"manifest-server\"" files = [ {file = "bytecode-0.16.2-py3-none-any.whl", hash = "sha256:0a7dea0387ec5cae5ec77578690c5ca7470c8a202c50ce64a426d86380cddd7f"}, {file = "bytecode-0.16.2.tar.gz", hash = "sha256:f05020b6dc1f48cdadd946f7c3a03131ba0f312bd103767c5d75559de5c308f8"}, @@ -487,19 +500,6 @@ files = [ [package.dependencies] pycparser = "*" -[[package]] -name = "chardet" -version = "5.2.0" -description = "Universal encoding detector for Python 3" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"file-based\"" -files = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, -] - [[package]] name = "charset-normalizer" version = "3.4.1" @@ -1126,7 +1126,7 @@ bytecode = [ {version = ">=0.16.0", markers = "python_version >= \"3.13.0\""}, {version = ">=0.15.1", markers = "python_version ~= \"3.12.0\""}, {version = ">=0.14.0", markers = "python_version ~= \"3.11.0\""}, - {version = ">=0.13.0", markers = "python_version < \"3.11\""}, + {version = ">=0.13.0", markers = "python_version < \"3.11.0\""}, ] envier = ">=0.6.1,<0.7.0" legacy-cgi = {version = ">=2.0.0", markers = "python_version >= \"3.13.0\""} @@ -1245,6 +1245,22 @@ files = [ {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, ] +[[package]] +name = "eval-type-backport" +version = "0.2.2" +description = "Like `typing._eval_type`, but lets older Python versions use newer typing features." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a"}, + {file = "eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1"}, +] + +[package.extras] +tests = ["pytest"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -1252,7 +1268,7 @@ description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" groups = ["main", "dev"] -markers = "python_version == \"3.10\"" +markers = "python_version < \"3.11.0\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -1648,7 +1664,7 @@ google-auth = ">=2.14.1,<3.0.0" googleapis-common-protos = ">=1.56.2,<2.0.0" grpcio = [ {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, - {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, + {version = ">=1.33.2,<2.0dev", optional = true, markers = "extra == \"grpc\""}, ] grpcio-status = [ {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, @@ -1712,7 +1728,7 @@ google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0" grpc-google-iam-v1 = ">=0.14.0,<1.0.0" proto-plus = [ {version = ">=1.25.0,<2.0.0", markers = "python_version >= \"3.13\""}, - {version = ">=1.22.3,<2.0.0", markers = "python_version < \"3.13\""}, + {version = ">=1.22.3,<2.0.0"}, ] protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<7.0.0" @@ -1785,7 +1801,7 @@ description = "Lightweight in-process concurrent programming" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "(platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and (extra == \"vector-db-based\" or extra == \"sql\")" +markers = "(extra == \"vector-db-based\" or extra == \"sql\") and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")" files = [ {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"}, {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"}, @@ -1998,6 +2014,29 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "html5lib" +version = "1.1" +description = "HTML parser based on the WHATWG HTML specification" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, + {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, +] + +[package.dependencies] +six = ">=1.9" +webencodings = "*" + +[package.extras] +all = ["chardet (>=2.2)", "genshi", "lxml ; platform_python_implementation == \"CPython\""] +chardet = ["chardet (>=2.2)"] +genshi = ["genshi"] +lxml = ["lxml ; platform_python_implementation == \"CPython\""] + [[package]] name = "httpcore" version = "1.0.9" @@ -2523,7 +2562,7 @@ description = "Fork of the standard library cgi and cgitb modules removed in Pyt optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"manifest-server\" and python_version == \"3.13\"" +markers = "python_version >= \"3.13.0\" and extra == \"manifest-server\"" files = [ {file = "legacy_cgi-2.6.3-py3-none-any.whl", hash = "sha256:6df2ea5ae14c71ef6f097f8b6372b44f6685283dc018535a75c924564183cdab"}, {file = "legacy_cgi-2.6.3.tar.gz", hash = "sha256:4c119d6cb8e9d8b6ad7cc0ddad880552c62df4029622835d06dfd18f438a8154"}, @@ -3189,6 +3228,19 @@ files = [ ] markers = {main = "extra == \"vector-db-based\" or extra == \"file-based\""} +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = true +python-versions = ">=3.5" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + [[package]] name = "nltk" version = "3.9.1" @@ -3261,6 +3313,22 @@ files = [ {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] +[[package]] +name = "olefile" +version = "0.47" +description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "olefile-0.47-py2.py3-none-any.whl", hash = "sha256:543c7da2a7adadf21214938bb79c83ea12b473a4b6ee4ad4bf854e7715e13d1f"}, + {file = "olefile-0.47.zip", hash = "sha256:599383381a0bf3dfbd932ca0ca6515acd174ed48870cbf7fee123d698c192c1c"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "openai" version = "0.27.9" @@ -4323,6 +4391,30 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pypdf" +version = "6.0.0" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "pypdf-6.0.0-py3-none-any.whl", hash = "sha256:56ea60100ce9f11fc3eec4f359da15e9aec3821b036c1f06d2b660d35683abb8"}, + {file = "pypdf-6.0.0.tar.gz", hash = "sha256:282a99d2cc94a84a3a3159f0d9358c0af53f85b4d28d76ea38b96e9e5ac2a08d"}, +] + +[package.dependencies] +typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +crypto = ["cryptography"] +cryptodome = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit", "pytest-cov", "pytest-socket", "pytest-timeout", "pytest-xdist", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow (>=8.0.0)", "cryptography"] +image = ["Pillow (>=8.0.0)"] + [[package]] name = "pyproject-flake8" version = "6.1.0" @@ -4640,21 +4732,41 @@ files = [ {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, ] +[[package]] +name = "python-oxmsg" +version = "0.0.2" +description = "Extract attachments from Outlook .msg files." +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "python_oxmsg-0.0.2-py3-none-any.whl", hash = "sha256:22be29b14c46016bcd05e34abddfd8e05ee82082f53b82753d115da3fc7d0355"}, + {file = "python_oxmsg-0.0.2.tar.gz", hash = "sha256:a6aff4deb1b5975d44d49dab1d9384089ffeec819e19c6940bc7ffbc84775fad"}, +] + +[package.dependencies] +click = "*" +olefile = "*" +typing_extensions = ">=4.9.0" + [[package]] name = "python-pptx" -version = "0.6.21" -description = "Generate and manipulate Open XML PowerPoint (.pptx) files" +version = "1.0.2" +description = "Create, read, and update PowerPoint 2007+ (.pptx) files." optional = true -python-versions = "*" +python-versions = ">=3.8" groups = ["main"] markers = "extra == \"file-based\"" files = [ - {file = "python-pptx-0.6.21.tar.gz", hash = "sha256:7798a2aaf89563565b3c7120c0acfe9aff775db0db3580544e3bf4840c2e378f"}, + {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"}, + {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"}, ] [package.dependencies] lxml = ">=3.1.0" Pillow = ">=3.3.2" +typing-extensions = ">=4.9.0" XlsxWriter = ">=0.5.7" [[package]] @@ -5710,22 +5822,6 @@ typing-extensions = {version = ">=4.10.0", markers = "python_version < \"3.13\"" [package.extras] full = ["httpx (>=0.27.0,<0.29.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.18)", "pyyaml"] -[[package]] -name = "tabulate" -version = "0.9.0" -description = "Pretty-print tabular data" -optional = true -python-versions = ">=3.7" -groups = ["main"] -markers = "extra == \"file-based\"" -files = [ - {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, - {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, -] - -[package.extras] -widechars = ["wcwidth"] - [[package]] name = "tenacity" version = "8.5.0" @@ -5900,7 +5996,7 @@ files = [ {file = "tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc"}, {file = "tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff"}, ] -markers = {main = "python_version == \"3.10\""} +markers = {main = "python_version < \"3.11.0\""} [[package]] name = "tqdm" @@ -6032,6 +6128,22 @@ files = [ mypy-extensions = ">=0.3.0" typing-extensions = ">=3.7.4" +[[package]] +name = "typing-inspection" +version = "0.4.1" +description = "Runtime typing introspection tools" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "typing_inspection-0.4.1-py3-none-any.whl", hash = "sha256:389055682238f53b04f7badcb49b989835495a96700ced5dab2d8feae4b26f51"}, + {file = "typing_inspection-0.4.1.tar.gz", hash = "sha256:6ae134cc0203c33377d43188d4064e9b357dba58cff3185f22924610e70a9d28"}, +] + +[package.dependencies] +typing-extensions = ">=4.12.0" + [[package]] name = "tzdata" version = "2025.1" @@ -6091,85 +6203,87 @@ files = [ [[package]] name = "unstructured" -version = "0.10.27" +version = "0.18.14" description = "A library that prepares raw documents for downstream ML tasks." optional = true -python-versions = ">=3.7.0" +python-versions = ">=3.10.0" groups = ["main"] markers = "extra == \"file-based\"" files = [ - {file = "unstructured-0.10.27-py3-none-any.whl", hash = "sha256:3a8a8e44302388ddc39c184059e8b4458f1cdc58032540b9af7d85f6c3eca3be"}, - {file = "unstructured-0.10.27.tar.gz", hash = "sha256:f567b5c4385993a9ab48db5563dd7b413aac4f2002bb22e6250496ea8f440f5e"}, + {file = "unstructured-0.18.14-py3-none-any.whl", hash = "sha256:cc6fadcf2f84fb6d910dd87bcbd54d8b6e2593ce29b851af167bc786d9263f73"}, + {file = "unstructured-0.18.14.tar.gz", hash = "sha256:c23760dd38dd6eca924a2803a005318f4ddc8af6a69388a7ce9d5b7fbc4b51bf"}, ] [package.dependencies] backoff = "*" beautifulsoup4 = "*" -chardet = "*" +charset-normalizer = "*" dataclasses-json = "*" emoji = "*" filetype = "*" +html5lib = "*" langdetect = "*" lxml = "*" nltk = "*" numpy = "*" -python-docx = {version = ">=1.0.1", optional = true, markers = "extra == \"docx\""} +psutil = "*" +python-docx = {version = ">=1.1.2", optional = true, markers = "extra == \"docx\""} python-iso639 = "*" python-magic = "*" -python-pptx = {version = "<=0.6.21", optional = true, markers = "extra == \"pptx\""} +python-oxmsg = "*" +python-pptx = {version = ">=1.0.1", optional = true, markers = "extra == \"pptx\""} rapidfuzz = "*" requests = "*" -tabulate = "*" +tqdm = "*" typing-extensions = "*" +unstructured-client = "*" +wrapt = "*" [package.extras] -airtable = ["pyairtable"] -all-docs = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] -azure = ["adlfs", "fsspec (==2023.9.1)"] -azure-cognitive-search = ["azure-search-documents"] -bedrock = ["boto3", "langchain"] -biomed = ["bs4"] -box = ["boxfs", "fsspec (==2023.9.1)"] -confluence = ["atlassian-python-api"] +all-docs = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] csv = ["pandas"] -delta-table = ["deltalake", "fsspec (==2023.9.1)"] -discord = ["discord-py"] -doc = ["python-docx (>=1.0.1)"] -docx = ["python-docx (>=1.0.1)"] -dropbox = ["dropboxdrivefs", "fsspec (==2023.9.1)"] -elasticsearch = ["elasticsearch", "jq"] -embed-huggingface = ["huggingface", "langchain", "sentence-transformers"] +doc = ["python-docx (>=1.1.2)"] +docx = ["python-docx (>=1.1.2)"] epub = ["pypandoc"] -gcs = ["bs4", "fsspec (==2023.9.1)", "gcsfs"] -github = ["pygithub (>1.58.0)"] -gitlab = ["python-gitlab"] -google-drive = ["google-api-python-client"] huggingface = ["langdetect", "sacremoses", "sentencepiece", "torch", "transformers"] -image = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -jira = ["atlassian-python-api"] -local-inference = ["markdown", "msg-parser", "networkx", "onnx", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pypandoc", "python-docx (>=1.0.1)", "python-pptx (<=0.6.21)", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] +image = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)"] +local-inference = ["effdet", "google-cloud-vision", "markdown", "msoffcrypto-tool", "networkx", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "openpyxl", "pandas", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypandoc", "pypdf", "python-docx (>=1.1.2)", "python-pptx (>=1.0.1)", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)", "xlrd"] md = ["markdown"] -msg = ["msg-parser"] -notion = ["htmlBuilder", "notion-client"] -odt = ["pypandoc", "python-docx (>=1.0.1)"] -onedrive = ["Office365-REST-Python-Client (<2.4.3)", "bs4", "msal"] -openai = ["langchain", "openai", "tiktoken"] +odt = ["pypandoc", "python-docx (>=1.1.2)"] org = ["pypandoc"] -outlook = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -paddleocr = ["unstructured.paddleocr (==2.6.1.3)"] -pdf = ["onnx", "pdf2image", "pdfminer.six", "unstructured-inference (==0.7.10)", "unstructured.pytesseract (>=0.3.12)"] -ppt = ["python-pptx (<=0.6.21)"] -pptx = ["python-pptx (<=0.6.21)"] -reddit = ["praw"] +paddleocr = ["paddlepaddle (>=3.0.0b1)", "unstructured.paddleocr (==2.10.0)"] +pdf = ["effdet", "google-cloud-vision", "onnx (>=1.17.0)", "onnxruntime (>=1.19.0)", "pdf2image", "pdfminer.six", "pi-heif", "pikepdf", "pypdf", "unstructured-inference (>=1.0.5)", "unstructured.pytesseract (>=0.3.12)"] +ppt = ["python-pptx (>=1.0.1)"] +pptx = ["python-pptx (>=1.0.1)"] rst = ["pypandoc"] rtf = ["pypandoc"] -s3 = ["fsspec (==2023.9.1)", "s3fs"] -salesforce = ["simple-salesforce"] -sharepoint = ["Office365-REST-Python-Client (<2.4.3)", "msal"] -slack = ["slack-sdk"] tsv = ["pandas"] -wikipedia = ["wikipedia"] -xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] +xlsx = ["msoffcrypto-tool", "networkx", "openpyxl", "pandas", "xlrd"] + +[[package]] +name = "unstructured-client" +version = "0.32.3" +description = "Python Client SDK for Unstructured API" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "unstructured_client-0.32.3-py3-none-any.whl", hash = "sha256:50b8198a3c3f984bdb53d848be7665d352093a99841858976f596cc2105903ec"}, + {file = "unstructured_client-0.32.3.tar.gz", hash = "sha256:1426d03325f7b93daad524ad2b954f1e7cceb0c15e67a4f4e88b49220dd2472c"}, +] + +[package.dependencies] +aiofiles = ">=24.1.0" +cryptography = ">=3.1" +eval-type-backport = ">=0.2.0" +httpx = ">=0.27.0" +nest-asyncio = ">=1.6.0" +pydantic = ">=2.10.3" +pypdf = ">=4.0" +python-dateutil = ">=2.8.2" +requests-toolbelt = ">=1.0.0" +typing-inspection = ">=0.4.0" [[package]] name = "unstructured-pytesseract" @@ -6257,6 +6371,19 @@ files = [ [package.dependencies] bracex = ">=2.1.1" +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"file-based\"" +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + [[package]] name = "werkzeug" version = "3.1.3" @@ -6367,7 +6494,7 @@ description = "Module for decorators, wrappers and monkey patching." optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"manifest-server\"" +markers = "extra == \"manifest-server\" or extra == \"file-based\"" files = [ {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04"}, {file = "wrapt-1.17.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6b13af258d6a9ad602d57d889f83b9d5543acd471eee12eb51f5b01f8eb1bc2"}, @@ -6606,4 +6733,4 @@ vector-db-based = ["cohere", "langchain", "openai", "tiktoken"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.14" -content-hash = "f8d4c72b0acd484de56c2ec8f09c760949c2ce009103daec3c50b5963e72ea98" +content-hash = "88225f4198485b6b998d5d7c5a83c7d9409a5ffdf4d6c92ef64dc7e93fa9fcfc" diff --git a/pyproject.toml b/pyproject.toml index e00ecb13f..4b105c642 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,7 +79,7 @@ nltk = { version = "3.9.1", optional = true } # This will ensure that even when you run poetry install or pip install, the compatible version of numpy will always be chosen. # airbyte-ci will try to install latest version when --use-local-cdk is used, resulting in the conflict. numpy = "<2" -unstructured = { version = "0.10.27", extras = ["docx", "pptx"], optional = true } +unstructured = { version = "0.18.14", extras = ["docx", "pptx"], optional = true } "unstructured.pytesseract" = { version = ">=0.3.12", optional = true } pyjwt = "^2.8.0" cryptography = ">=44.0.0,<45.0.0" # Constrained as transitive dependency due to a bug in newer versions