diff --git a/.changeset/long-camels-sell.md b/.changeset/long-camels-sell.md new file mode 100644 index 0000000000..ba3029a270 --- /dev/null +++ b/.changeset/long-camels-sell.md @@ -0,0 +1,5 @@ +--- +"chatbot": minor +--- + +"Add Presidio to detect and mask PII entities" diff --git a/apps/chatbot/.env.example b/apps/chatbot/.env.example index 654f93e55c..275f18527b 100644 --- a/apps/chatbot/.env.example +++ b/apps/chatbot/.env.example @@ -4,12 +4,16 @@ PYTHONPATH=app-path LOG_LEVEL=DEBUG CHB_AWS_ACCESS_KEY_ID=... CHB_AWS_SECRET_ACCESS_KEY=... -CHB_AWS_DEFAULT_REGION=eu-west-3 +CHB_AWS_DEFAULT_REGION=eu-south-1 +CHB_AWS_BEDROCK_REGION=eu-west-3 CHB_AWS_S3_BUCKET=... CHB_AWS_GUARDRAIL_ID=... CHB_AWS_GUARDRAIL_VERSION=... CHB_REDIS_URL=... CHB_WEBSITE_URL=... +CHB_REDIS_INDEX_NAME=... +CHB_LLAMAINDEX_INDEX_ID=... +CHB_DOCUMENTATION_DIR=... CHB_GOOGLE_API_KEY=... CHB_PROVIDER=... CHB_MODEL_ID=... diff --git a/apps/chatbot/README.md b/apps/chatbot/README.md index 0b174f0a19..0014889baa 100644 --- a/apps/chatbot/README.md +++ b/apps/chatbot/README.md @@ -1,10 +1,15 @@ # PagoPA Chatbot -This folder contains all the details to build a RAG using the documentation provided in [`PagoPA Developer Portal`](https://developer.pagopa.it/). The retriver chosen is the `Auto Merging Retriver` one and it was implemented using [`llama-index`](https://docs.llamaindex.ai/en/stable/). Check out `src/modules/retriever.py`. +This folder contains all the details to build a RAG using the documentation provided in [`PagoPA Developer Portal`](https://developer.pagopa.it/). -This chatbot uses [`AWS Bedrock`](https://aws.amazon.com/bedrock/) as provider, so be sure to have installed [`aws-cli`](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and stored your credential in `~/.aws/credentials`. +This chatbot uses [Google](https://ai.google.dev/) or [AWS Bedrock](https://aws.amazon.com/bedrock/) as provider. +Even though the provider is the Google one, we stored its API key in AWS. So, be sure to have installed [aws-cli](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) and stored your credential in `~/.aws/credentials`. -All the parameters and prompts used to build the Retrieval-Augmented Generation (RAG) are available in `config`. +The Retrieval-Augmented Generation (RAG) was implemented using [llama-index](https://docs.llamaindex.ai/en/stable/). All the parameters and prompts used are stored in `config`. + +## Environment Variables + +Create a `.env` file inside this folder and store the environment variables listed in `.env.example`. ## Virtual environment @@ -27,40 +32,17 @@ The working directory is `/developer-portal/apps/chatbot`. So, to set the `PYTHO In this way, `PYTHONPATH` points to where the Python packages and modules are, not where your checkouts are. -## File for Environment Variables - -Create a `.env` file inside the folder and write to the file the following environment variables: - - CHB_AWS_ACCESS_KEY_ID=... - CHB_AWS_SECRET_ACCESS_KEY=... - CHB_AWS_DEFAULT_REGION=... - CHB_AWS_S3_BUCKET=... - CHB_AWS_GUARDRAIL_ID=... - CHB_AWS_GUARDRAIL_VERSION=... - CHB_REDIS_URL=... - CHB_REDIS_INDEX_NAME=... - CHB_WEBSITE_URL=... - CHB_GOOGLE_API_KEY=... - CHB_PROVIDER=... - CHB_MODEL_ID=... - CHB_MODEL_TEMPERATURE=... - CHB_MODEL_MAXTOKENS=... - CHB_EMBED_MODEL_ID=... - CHB_ENGINE_SIMILARITY_TOPK=... - CHB_ENGINE_SIMILARITY_CUTOFF=... - CHB_ENGINE_USE_ASYNC=... - CHB_ENGINE_USE_STREAMING=... - -## Knowledge vector database +## Knowledge index vector database To reach the remote redis instance, it is necessary to open a tunnel: + ``` ./scripts/redis-tunnel.sh ``` Verify that the HTML files that compose the Developer Portal documentation exist in a directory. Otherwise create the documentation. Once you have the documentation directory ready, put its path in `params` and, in the end, create the vector index doing: -``` +``` python src/modules/create_vector_index.py --params config/params.yaml ``` diff --git a/apps/chatbot/config/params.yaml b/apps/chatbot/config/params.yaml index 0880ff1d81..c91ad849bf 100644 --- a/apps/chatbot/config/params.yaml +++ b/apps/chatbot/config/params.yaml @@ -5,9 +5,57 @@ vector_index: path: index chunk_sizes: [2816, 704, 176] chunk_overlap: 20 - use_redis: True - use_s3: False engine: response_mode: compact verbose: False + +config_presidio: + nlp_engine_name: spacy + models: + - + lang_code: en + model_name: en_core_web_md + - + lang_code: it + model_name: it_core_news_md + # - + # lang_code: de + # model_name: de_core_news_md + # - + # lang_code: es + # model_name: es_core_news_md + # - + # lang_code: fr + # model_name: fr_core_news_md + ner_model_configuration: + labels_to_ignore: + - ORDINAL + - QUANTITY + - ORGANIZATION + - ORG + - LANGUAGE + - PRODUCT + - MONEY + - PERCENT + - O + - CARDINAL + - EVENT + - WORK_OF_ART + - LAW + - MISC + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP + low_confidence_score_multiplier: 0.4 + low_score_entity_names: + - ORGANIZATION + - ORG + default_score: 0.8 diff --git a/apps/chatbot/config/prompts.yaml b/apps/chatbot/config/prompts.yaml index 20c570949e..daf7b34161 100644 --- a/apps/chatbot/config/prompts.yaml +++ b/apps/chatbot/config/prompts.yaml @@ -1,6 +1,6 @@ qa_prompt_str: | You are an customer services chatbot. - Your name is Discovery and your duty is to assist the user with the PagoPA DevPortal documentation! + Your name is Discovery and your duty is to assist the user with the PagoPA DevPortal documentation, homepage: https://dev.developer.pagopa.it! -------------------- Context information: {context_str} diff --git a/apps/chatbot/docker/app.Dockerfile b/apps/chatbot/docker/app.Dockerfile index 4e2aae36b5..323a8bff29 100644 --- a/apps/chatbot/docker/app.Dockerfile +++ b/apps/chatbot/docker/app.Dockerfile @@ -14,4 +14,6 @@ RUN poetry install COPY . ${LAMBDA_TASK_ROOT} RUN python ./scripts/nltk_download.py +RUN python ./scripts/spacy_download.py + CMD ["src.app.main.handler"] diff --git a/apps/chatbot/poetry.lock b/apps/chatbot/poetry.lock index a67da263ef..eb26c01c9e 100644 --- a/apps/chatbot/poetry.lock +++ b/apps/chatbot/poetry.lock @@ -407,6 +407,25 @@ docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphi tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] +[[package]] +name = "azure-core" +version = "1.31.0" +description = "Microsoft Azure Core Library for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "azure_core-1.31.0-py3-none-any.whl", hash = "sha256:22954de3777e0250029360ef31d80448ef1be13b80a459bff80ba7073379e2cd"}, + {file = "azure_core-1.31.0.tar.gz", hash = "sha256:656a0dd61e1869b1506b7c6a3b31d62f15984b1a573d6326f6aa2f3e4123284b"}, +] + +[package.dependencies] +requests = ">=2.21.0" +six = ">=1.11.0" +typing-extensions = ">=4.6.0" + +[package.extras] +aio = ["aiohttp (>=3.0)"] + [[package]] name = "babel" version = "2.16.0" @@ -460,6 +479,52 @@ webencodings = "*" [package.extras] css = ["tinycss2 (>=1.1.0,<1.3)"] +[[package]] +name = "blis" +version = "0.7.11" +description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." +optional = false +python-versions = "*" +files = [ + {file = "blis-0.7.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cd5fba34c5775e4c440d80e4dea8acb40e2d3855b546e07c4e21fad8f972404c"}, + {file = "blis-0.7.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:31273d9086cab9c56986d478e3ed6da6752fa4cdd0f7b5e8e5db30827912d90d"}, + {file = "blis-0.7.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d06883f83d4c8de8264154f7c4a420b4af323050ed07398c1ff201c34c25c0d2"}, + {file = "blis-0.7.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee493683e3043650d4413d531e79e580d28a3c7bdd184f1b9cfa565497bda1e7"}, + {file = "blis-0.7.11-cp310-cp310-win_amd64.whl", hash = "sha256:a73945a9d635eea528bccfdfcaa59dd35bd5f82a4a40d5ca31f08f507f3a6f81"}, + {file = "blis-0.7.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1b68df4d01d62f9adaef3dad6f96418787265a6878891fc4e0fabafd6d02afba"}, + {file = "blis-0.7.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:162e60d941a8151418d558a94ee5547cb1bbeed9f26b3b6f89ec9243f111a201"}, + {file = "blis-0.7.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:686a7d0111d5ba727cd62f374748952fd6eb74701b18177f525b16209a253c01"}, + {file = "blis-0.7.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0421d6e44cda202b113a34761f9a062b53f8c2ae8e4ec8325a76e709fca93b6e"}, + {file = "blis-0.7.11-cp311-cp311-win_amd64.whl", hash = "sha256:0dc9dcb3843045b6b8b00432409fd5ee96b8344a324e031bfec7303838c41a1a"}, + {file = "blis-0.7.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dadf8713ea51d91444d14ad4104a5493fa7ecc401bbb5f4a203ff6448fadb113"}, + {file = "blis-0.7.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5bcdaf370f03adaf4171d6405a89fa66cb3c09399d75fc02e1230a78cd2759e4"}, + {file = "blis-0.7.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7de19264b1d49a178bf8035406d0ae77831f3bfaa3ce02942964a81a202abb03"}, + {file = "blis-0.7.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ea55c6a4a60fcbf6a0fdce40df6e254451ce636988323a34b9c94b583fc11e5"}, + {file = "blis-0.7.11-cp312-cp312-win_amd64.whl", hash = "sha256:5a305dbfc96d202a20d0edd6edf74a406b7e1404f4fa4397d24c68454e60b1b4"}, + {file = "blis-0.7.11-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:68544a1cbc3564db7ba54d2bf8988356b8c7acd025966e8e9313561b19f0fe2e"}, + {file = "blis-0.7.11-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:075431b13b9dd7b411894d4afbd4212acf4d0f56c5a20628f4b34902e90225f1"}, + {file = "blis-0.7.11-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:324fdf62af9075831aa62b51481960e8465674b7723f977684e32af708bb7448"}, + {file = "blis-0.7.11-cp36-cp36m-win_amd64.whl", hash = "sha256:afebdb02d2dcf9059f23ce1244585d3ce7e95c02a77fd45a500e4a55b7b23583"}, + {file = "blis-0.7.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2e62cd14b20e960f21547fee01f3a0b2ac201034d819842865a667c969c355d1"}, + {file = "blis-0.7.11-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89b01c05a5754edc0b9a3b69be52cbee03f645b2ec69651d12216ea83b8122f0"}, + {file = "blis-0.7.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfee5ec52ba1e9002311d9191f7129d7b0ecdff211e88536fb24c865d102b50d"}, + {file = "blis-0.7.11-cp37-cp37m-win_amd64.whl", hash = "sha256:844b6377e3e7f3a2e92e7333cc644095386548ad5a027fdc150122703c009956"}, + {file = "blis-0.7.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6df00c24128e323174cde5d80ebe3657df39615322098ce06613845433057614"}, + {file = "blis-0.7.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:809d1da1331108935bf06e22f3cf07ef73a41a572ecd81575bdedb67defe3465"}, + {file = "blis-0.7.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bfabd5272bbbe504702b8dfe30093653d278057656126716ff500d9c184b35a6"}, + {file = "blis-0.7.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca684f5c2f05269f17aefe7812360286e9a1cee3afb96d416485efd825dbcf19"}, + {file = "blis-0.7.11-cp38-cp38-win_amd64.whl", hash = "sha256:688a8b21d2521c2124ee8dfcbaf2c385981ccc27e313e052113d5db113e27d3b"}, + {file = "blis-0.7.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2ff7abd784033836b284ff9f4d0d7cb0737b7684daebb01a4c9fe145ffa5a31e"}, + {file = "blis-0.7.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f9caffcd14795bfe52add95a0dd8426d44e737b55fcb69e2b797816f4da0b1d2"}, + {file = "blis-0.7.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fb36989ed61233cfd48915896802ee6d3d87882190000f8cfe0cf4a3819f9a8"}, + {file = "blis-0.7.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ea09f961871f880d5dc622dce6c370e4859559f0ead897ae9b20ddafd6b07a2"}, + {file = "blis-0.7.11-cp39-cp39-win_amd64.whl", hash = "sha256:5bb38adabbb22f69f22c74bad025a010ae3b14de711bf5c715353980869d491d"}, + {file = "blis-0.7.11.tar.gz", hash = "sha256:cec6d48f75f7ac328ae1b6fbb372dde8c8a57c89559172277f66e01ff08d4d42"}, +] + +[package.dependencies] +numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} + [[package]] name = "boto3" version = "1.34.131" @@ -509,6 +574,17 @@ files = [ {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, ] +[[package]] +name = "catalogue" +version = "2.0.10" +description = "Super lightweight function registries for your library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "catalogue-2.0.10-py3-none-any.whl", hash = "sha256:58c2de0020aa90f4a2da7dfad161bf7b3b054c86a5f09fcedc0b2b740c109a9f"}, + {file = "catalogue-2.0.10.tar.gz", hash = "sha256:4f56daa940913d3f09d589c191c74e5a6d51762b3a9e37dd53b7437afd6cda15"}, +] + [[package]] name = "certifi" version = "2024.8.30" @@ -726,6 +802,23 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "cloudpathlib" +version = "0.19.0" +description = "pathlib-style classes for cloud storage services." +optional = false +python-versions = ">=3.8" +files = [ + {file = "cloudpathlib-0.19.0-py3-none-any.whl", hash = "sha256:eb7758648812d5906af44f14cf9a6a64f687342a6f547a1c20deb7241d769dcb"}, + {file = "cloudpathlib-0.19.0.tar.gz", hash = "sha256:919edbfd9a4e935d2423da210b143df89cb0ec6d378366a0dffa2e9fd0664fe8"}, +] + +[package.extras] +all = ["cloudpathlib[azure]", "cloudpathlib[gs]", "cloudpathlib[s3]"] +azure = ["azure-storage-blob (>=12)", "azure-storage-file-datalake (>=12)"] +gs = ["google-cloud-storage"] +s3 = ["boto3 (>=1.34.0)"] + [[package]] name = "colorama" version = "0.4.6" @@ -771,6 +864,21 @@ traitlets = ">=4" [package.extras] test = ["pytest"] +[[package]] +name = "confection" +version = "0.1.5" +description = "The sweetest config system for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "confection-0.1.5-py3-none-any.whl", hash = "sha256:e29d3c3f8eac06b3f77eb9dfb4bf2fc6bcc9622a98ca00a698e3d019c6430b14"}, + {file = "confection-0.1.5.tar.gz", hash = "sha256:8e72dd3ca6bd4f48913cd220f10b8275978e740411654b6e8ca6d7008c590f0e"}, +] + +[package.dependencies] +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +srsly = ">=2.4.0,<3.0.0" + [[package]] name = "contourpy" version = "1.3.0" @@ -930,6 +1038,48 @@ files = [ docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] tests = ["pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "cymem" +version = "2.0.8" +description = "Manage calls to calloc/free through Cython" +optional = false +python-versions = "*" +files = [ + {file = "cymem-2.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77b5d3a73c41a394efd5913ab7e48512054cd2dabb9582d489535456641c7666"}, + {file = "cymem-2.0.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bd33da892fb560ba85ea14b1528c381ff474048e861accc3366c8b491035a378"}, + {file = "cymem-2.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29a551eda23eebd6d076b855f77a5ed14a1d1cae5946f7b3cb5de502e21b39b0"}, + {file = "cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8260445652ae5ab19fff6851f32969a7b774f309162e83367dd0f69aac5dbf7"}, + {file = "cymem-2.0.8-cp310-cp310-win_amd64.whl", hash = "sha256:a63a2bef4c7e0aec7c9908bca0a503bf91ac7ec18d41dd50dc7dff5d994e4387"}, + {file = "cymem-2.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b84b780d52cb2db53d4494fe0083c4c5ee1f7b5380ceaea5b824569009ee5bd"}, + {file = "cymem-2.0.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0d5f83dc3cb5a39f0e32653cceb7c8ce0183d82f1162ca418356f4a8ed9e203e"}, + {file = "cymem-2.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ac218cf8a43a761dc6b2f14ae8d183aca2bbb85b60fe316fd6613693b2a7914"}, + {file = "cymem-2.0.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42c993589d1811ec665d37437d5677b8757f53afadd927bf8516ac8ce2d3a50c"}, + {file = "cymem-2.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:ab3cf20e0eabee9b6025ceb0245dadd534a96710d43fb7a91a35e0b9e672ee44"}, + {file = "cymem-2.0.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cb51fddf1b920abb1f2742d1d385469bc7b4b8083e1cfa60255e19bc0900ccb5"}, + {file = "cymem-2.0.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9235957f8c6bc2574a6a506a1687164ad629d0b4451ded89d49ebfc61b52660c"}, + {file = "cymem-2.0.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2cc38930ff5409f8d61f69a01e39ecb185c175785a1c9bec13bcd3ac8a614ba"}, + {file = "cymem-2.0.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bf49e3ea2c441f7b7848d5c61b50803e8cbd49541a70bb41ad22fce76d87603"}, + {file = "cymem-2.0.8-cp312-cp312-win_amd64.whl", hash = "sha256:ecd12e3bacf3eed5486e4cd8ede3c12da66ee0e0a9d0ae046962bc2bb503acef"}, + {file = "cymem-2.0.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:167d8019db3b40308aabf8183fd3fbbc256323b645e0cbf2035301058c439cd0"}, + {file = "cymem-2.0.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17cd2c2791c8f6b52f269a756ba7463f75bf7265785388a2592623b84bb02bf8"}, + {file = "cymem-2.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:6204f0a3307bf45d109bf698ba37997ce765f21e359284328e4306c7500fcde8"}, + {file = "cymem-2.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b9c05db55ea338648f8e5f51dd596568c7f62c5ae32bf3fa5b1460117910ebae"}, + {file = "cymem-2.0.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ce641f7ba0489bd1b42a4335a36f38c8507daffc29a512681afaba94a0257d2"}, + {file = "cymem-2.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6b83a5972a64f62796118da79dfeed71f4e1e770b2b7455e889c909504c2358"}, + {file = "cymem-2.0.8-cp37-cp37m-win_amd64.whl", hash = "sha256:ada6eb022e4a0f4f11e6356a5d804ceaa917174e6cf33c0b3e371dbea4dd2601"}, + {file = "cymem-2.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1e593cd57e2e19eb50c7ddaf7e230b73c890227834425b9dadcd4a86834ef2ab"}, + {file = "cymem-2.0.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d513f0d5c6d76facdc605e42aa42c8d50bb7dedca3144ec2b47526381764deb0"}, + {file = "cymem-2.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e370dd54359101b125bfb191aca0542718077b4edb90ccccba1a28116640fed"}, + {file = "cymem-2.0.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84f8c58cde71b8fc7024883031a4eec66c0a9a4d36b7850c3065493652695156"}, + {file = "cymem-2.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:6a6edddb30dd000a27987fcbc6f3c23b7fe1d74f539656952cb086288c0e4e29"}, + {file = "cymem-2.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b896c83c08dadafe8102a521f83b7369a9c5cc3e7768eca35875764f56703f4c"}, + {file = "cymem-2.0.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a4f8f2bfee34f6f38b206997727d29976666c89843c071a968add7d61a1e8024"}, + {file = "cymem-2.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7372e2820fa66fd47d3b135f3eb574ab015f90780c3a21cfd4809b54f23a4723"}, + {file = "cymem-2.0.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4e57bee56d35b90fc2cba93e75b2ce76feaca05251936e28a96cf812a1f5dda"}, + {file = "cymem-2.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:ceeab3ce2a92c7f3b2d90854efb32cb203e78cb24c836a5a9a2cac221930303b"}, + {file = "cymem-2.0.8.tar.gz", hash = "sha256:8fb09d222e21dcf1c7e907dc85cf74501d4cea6c4ed4ac6c9e016f98fb59cbbf"}, +] + [[package]] name = "dataclasses-json" version = "0.6.7" @@ -2650,6 +2800,24 @@ files = [ {file = "kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60"}, ] +[[package]] +name = "langcodes" +version = "3.4.1" +description = "Tools for labeling human languages with IETF language tags" +optional = false +python-versions = ">=3.8" +files = [ + {file = "langcodes-3.4.1-py3-none-any.whl", hash = "sha256:68f686fc3d358f222674ecf697ddcee3ace3c2fe325083ecad2543fd28a20e77"}, + {file = "langcodes-3.4.1.tar.gz", hash = "sha256:a24879fed238013ac3af2424b9d1124e38b4a38b2044fd297c8ff38e5912e718"}, +] + +[package.dependencies] +language-data = ">=1.2" + +[package.extras] +build = ["build", "twine"] +test = ["pytest", "pytest-cov"] + [[package]] name = "langdetect" version = "1.0.9" @@ -2664,6 +2832,24 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "language-data" +version = "1.2.0" +description = "Supplementary data about languages used by the langcodes module" +optional = false +python-versions = "*" +files = [ + {file = "language_data-1.2.0-py3-none-any.whl", hash = "sha256:77d5cab917f91ee0b2f1aa7018443e911cf8985ef734ca2ba3940770f6a3816b"}, + {file = "language_data-1.2.0.tar.gz", hash = "sha256:82a86050bbd677bfde87d97885b17566cfe75dad3ac4f5ce44b52c28f752e773"}, +] + +[package.dependencies] +marisa-trie = ">=0.7.7" + +[package.extras] +build = ["build", "twine"] +test = ["pytest", "pytest-cov"] + [[package]] name = "llama-cloud" version = "0.1.0" @@ -2963,6 +3149,22 @@ files = [ llama-index-core = ">=0.11.0,<0.12.0" llama-index-llms-openai = ">=0.2.0,<0.3.0" +[[package]] +name = "llama-index-postprocessor-presidio" +version = "0.2.0" +description = "llama-index postprocessor presidio integration" +optional = false +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "llama_index_postprocessor_presidio-0.2.0-py3-none-any.whl", hash = "sha256:9899f6b444ff37e6425b43e17e873a22dc175ce849dc85aed14c71c3f0be22e6"}, + {file = "llama_index_postprocessor_presidio-0.2.0.tar.gz", hash = "sha256:e8a9777d09676e89eb3cc68c05cad288fd6c22ff3905dda6b0f9b2a19769edf7"}, +] + +[package.dependencies] +llama-index-core = ">=0.11.0,<0.12.0" +presidio-analyzer = ">=2.2.353,<3.0.0" +presidio-anonymizer = ">=2.2.353,<3.0.0" + [[package]] name = "llama-index-program-openai" version = "0.2.0" @@ -3297,6 +3499,109 @@ files = [ [package.dependencies] typing-extensions = "*" +[[package]] +name = "marisa-trie" +version = "1.2.0" +description = "Static memory-efficient and fast Trie-like structures for Python." +optional = false +python-versions = ">=3.7" +files = [ + {file = "marisa_trie-1.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:61fab91fef677f0af0e818e61595f2334f7e0b3e122b24ec65889aae69ba468d"}, + {file = "marisa_trie-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5f5b3080316de735bd2b07265de5eea3ae176fa2fc60f9871aeaa9cdcddfc8f7"}, + {file = "marisa_trie-1.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:77bfde3287314e91e28d3a882c7b87519ef0ee104c921df72c7819987d5e4863"}, + {file = "marisa_trie-1.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4fbb1ec1d9e891060a0aee9f9c243acec63de1e197097a14850ba38ec8a4013"}, + {file = "marisa_trie-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e04e9c86fe8908b61c2aebb8444217cacaed15b93d2dccaac3849e36a6dc660"}, + {file = "marisa_trie-1.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a7c75a508f44e40f7af8448d466045a97534adcbb026e63989407cefb9ebfa6"}, + {file = "marisa_trie-1.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5321211647609869907e81b0230ad2dfdfa7e19fe1ee469b46304a622391e6a1"}, + {file = "marisa_trie-1.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:88660e6ee0f821872aaf63ba4b9a7513428b9cab20c69cc013c368bd72c3a4fe"}, + {file = "marisa_trie-1.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:4e4535fc5458de2b59789e574cdd55923d63de5612dc159d33941af79cd62786"}, + {file = "marisa_trie-1.2.0-cp310-cp310-win32.whl", hash = "sha256:bdd1d4d430e33abbe558971d1bd57da2d44ca129fa8a86924c51437dba5cb345"}, + {file = "marisa_trie-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:c729e2b8f9699874b1372b5a01515b340eda1292f5e08a3fe4633b745f80ad7a"}, + {file = "marisa_trie-1.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d62985a0e6f2cfeb36cd6afa0460063bbe83ef4bfd9afe189a99103487547210"}, + {file = "marisa_trie-1.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1890cc993149db4aa8242973526589e8133c3f92949b0ac74c2c9a6596707ae3"}, + {file = "marisa_trie-1.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:26177cd0dadb7b44f47c17c40e16ac157c4d22ac7ed83b5a47f44713239e10d1"}, + {file = "marisa_trie-1.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3425dc81d49a374be49e3a063cb6ccdf57973201b0a30127082acea50562a85e"}, + {file = "marisa_trie-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:525b8df41a1a7337ed7f982eb63b704d7d75f047e30970fcfbe9cf6fc22c5991"}, + {file = "marisa_trie-1.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c643c66bbde6a115e4ec8713c087a9fe9cb7b7c684e6af4cf448c120fa427ea4"}, + {file = "marisa_trie-1.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5a83fe83e0eab9154a2dc7c556898c86584b7779ddf4214c606fce4ceff07c13"}, + {file = "marisa_trie-1.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:49701db6bb8f1ec0133abd95f0a4891cfd6f84f3bd019e343037e31a5a5b0210"}, + {file = "marisa_trie-1.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a3f0562863deaad58c5dc3a51f706da92582bc9084189148a45f7a12fe261a51"}, + {file = "marisa_trie-1.2.0-cp311-cp311-win32.whl", hash = "sha256:b08968ccad00f54f31e38516e4452fae59dd15a3fcee56aea3101ba2304680b3"}, + {file = "marisa_trie-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:d3ef375491e7dd71a0a7e7bf288c88750942bd1ee0c379dcd6ad43e31af67d00"}, + {file = "marisa_trie-1.2.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:39b88f126988ea83e8458259297d2b2f9391bfba8f4dc5d7a246813aae1c1def"}, + {file = "marisa_trie-1.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ec167b006884a90d130ee30518a9aa44cb40211f702bf07031b2d7d4d1db569b"}, + {file = "marisa_trie-1.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1b855e6286faef5411386bf9d676dfb545c09f7d109f197f347c9366aeb12f07"}, + {file = "marisa_trie-1.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8cd287ff323224d87c2b739cba39614aac3737c95a254e0ff70e77d9b8df226d"}, + {file = "marisa_trie-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d8a1c0361165231f4fb915237470afc8cc4803c535f535f4fc42ca72855b124"}, + {file = "marisa_trie-1.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3267f438d258d7d85ee3dde363c4f96c3196ca9cd9e63fe429a59543cc544b15"}, + {file = "marisa_trie-1.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7c87a0c2cccce12b07bfcb70708637c0816970282d966a1531ecda1a24bd1cc8"}, + {file = "marisa_trie-1.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d3c0e38f0501951e2322f7274a39b8e2344bbd91ceaa9da439f46022570ddc9d"}, + {file = "marisa_trie-1.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cd88a338c87e6dc130b6cea7b697580c21f0c83a8a8b46671cfecbb713d3fe24"}, + {file = "marisa_trie-1.2.0-cp312-cp312-win32.whl", hash = "sha256:5cea60975184f03fbcff51339df0eb44d2abe106a1693983cc64415eb87b897b"}, + {file = "marisa_trie-1.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:b04a07b99b62b9bdf3eaf1d44571a3293ce249ce8971944e780c9c709593462f"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c11af35d9304de420b359741e12b885d04f11403697efcbbe8cb50f834261ebc"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2db8e74493c3bffb480c54afaa88890a39bf90063ff5b322acf64bf076e4b36e"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bcc6613bc873136dc62609b66aaa27363e2bd46c03fdab62d638f7cf69d5f82"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5cb731581effb3e05258f3ddc2a155475de74bb00f61eb280f991e13b48f783"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:eba1061bbeaeec4149282beab2ae163631606f119f549a10246b014e13f9047b"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:015594427360c6ad0fa94d51ee3d50fb83b0f7278996497fd2d69f877c3de9bd"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:36d65bcbf22a70cdd0202bd8608c2feecc58bdb9e5dd9a2f5a723b651fcab287"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-win32.whl", hash = "sha256:bc138625b383998f5cd0cbf6cd38d66d414f3786ae6d7b4e4a6fc970140ef4e9"}, + {file = "marisa_trie-1.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:27d270a64eb655754dfb4e352c60a084b16ab999b3a97a0cdc7dbecbca3c0e35"}, + {file = "marisa_trie-1.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fa1fa7f67d317a921315a65e266b9e156ce5a956076ec2b6dbf72d67c7df8216"}, + {file = "marisa_trie-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9dccef41d4af11a03558c1d101de58bd723b3039a5bc4e064250008c118037ec"}, + {file = "marisa_trie-1.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:873efd212dfef2b736ff2ff43e10b348c428d5dbac7b8cb8aa777004bc8c7b0e"}, + {file = "marisa_trie-1.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8af7a21ac2ba6dc23e4257fc3a40b3070e776275d3d0b5b2ef44473ad92caf3a"}, + {file = "marisa_trie-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7202ba0ca1db5245feaebbeb3d0c776b2da1fffb0abc3500dd505f679686aa1"}, + {file = "marisa_trie-1.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83d90be28c083323909d23ff8e9b4a2764b9e75520d1bae1a277e9fa7ca20d15"}, + {file = "marisa_trie-1.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40e2a374026492ac84232897f1f1d8f92a4a1f8bcf3f0ded1f2b8b708d1acfff"}, + {file = "marisa_trie-1.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7c6e6506bd24a5799b9b4b9cf1e8d6fa281f136396ba018a95d95d4d74715227"}, + {file = "marisa_trie-1.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:437bf6c0d7ba4cf17656a0e3bdd0b3c2c92c01fedfa670904177eef3116a4f45"}, + {file = "marisa_trie-1.2.0-cp38-cp38-win32.whl", hash = "sha256:6aeef7b364fb3b34dbba1cc57b79f1668fad0c3f039738d65a5b0d5ddce15f47"}, + {file = "marisa_trie-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:02f773e85cc566a24c0e0e28c744052db7691c4f13d02e4257bc657a49b9ab14"}, + {file = "marisa_trie-1.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6ff705cb3b907bdeacb8c4b3bf0541691f52b101014d189a707ca41ebfacad59"}, + {file = "marisa_trie-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:006419c59866979188906babc42ae0918081c18cabc2bdabca027f68c081c127"}, + {file = "marisa_trie-1.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a7196691681ecb8a12629fb6277c33bafdb27cf2b6c18c28bc48fa42a15eab8f"}, + {file = "marisa_trie-1.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eaf052c0a1f4531ee12fd4c637212e77ad2af8c3b38a0d3096622abd01a22212"}, + {file = "marisa_trie-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fb95f3ab95ba933f6a2fa2629185e9deb9da45ff2aa4ba8cc8f722528c038ef"}, + {file = "marisa_trie-1.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7459b1e1937e33daed65a6d55f8b95f9a8601f4f8749d01641cf548ecac03840"}, + {file = "marisa_trie-1.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:902ea948677421093651ca98df62d255383f865f7c353f956ef666e92500e79f"}, + {file = "marisa_trie-1.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fdf7a2d066907816726f3bf241b8cb05b698d6ffaa3c5ea2658d4ba69e87ec57"}, + {file = "marisa_trie-1.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3540bb85b38dfc17060263e061c95a0a435681b04543d1ae7e8d7441a9790593"}, + {file = "marisa_trie-1.2.0-cp39-cp39-win32.whl", hash = "sha256:fe1394e1f262e5b45d22d30bd1ef75174d1f2772e86716b5f93f9c29dfc1a779"}, + {file = "marisa_trie-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:84c44cb13803723f0f76aa2ba1a657f762a0bb9d8a9b80dfff249bb1c3218dd6"}, + {file = "marisa_trie-1.2.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:035c4c8f3b313b4d7b7451ddd539da811a11077a9e359c6a0345f816b1bdccb3"}, + {file = "marisa_trie-1.2.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d4f05c2ee218a5ab09d269b640d06f9708b0cf37c842344cbdffb0661c74c472"}, + {file = "marisa_trie-1.2.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92ac63e1519598de946c7d9346df3bb52ed96968eb3021b4e89b51d79bc72a86"}, + {file = "marisa_trie-1.2.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:045f32eaeb5dcdb5beadb571ba616d7a34141764b616eebb4decce71b366f5fa"}, + {file = "marisa_trie-1.2.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cb60c2f9897ce2bfc31a69ac25a040de4f8643ab2a339bb0ff1185e1a9dedaf8"}, + {file = "marisa_trie-1.2.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f19c5fcf23c02f1303deb69c67603ee37ed8f01de2d8b19f1716a6cf5afd5455"}, + {file = "marisa_trie-1.2.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a06a77075240eb83a47b780902322e66c968a06a2b6318cab06757c65ea64190"}, + {file = "marisa_trie-1.2.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:125016400449e46ec0e5fabd14c8314959c4dfa02ffc2861195c99efa2b5b011"}, + {file = "marisa_trie-1.2.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c57647dd9f9ba16fc5bb4679c915d7d48d5c0b25134fb10f095ccd839686a027"}, + {file = "marisa_trie-1.2.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6601e74338fb31e1b20674257706150113463182a01d3a1310df6b8840720b17"}, + {file = "marisa_trie-1.2.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:ce2f68e1000c4c72820c5b2c9d037f326fcf75f036453a5e629f225f99b92cfc"}, + {file = "marisa_trie-1.2.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:069ac10a133d96b3f3ed1cc071b973a3f28490345e7941c778a1d81cf176f04a"}, + {file = "marisa_trie-1.2.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:de9911480ce2a0513582cb84ee4484e5ee8791e692276c7f5cd7378e114d1988"}, + {file = "marisa_trie-1.2.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cfec001cf233e8853a29e1c2bb74031c217aa61e7bd19389007e04861855731"}, + {file = "marisa_trie-1.2.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd1f3ef8de89684fbdd6aaead09d53b82e718bad4375d2beb938cbd24b48c51a"}, + {file = "marisa_trie-1.2.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65f5d8c1ecc85283b5b03a1475a5da723b94b3beda752c895b2f748477d8f1b1"}, + {file = "marisa_trie-1.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e7540f844c1de493a90ad7d0f5bffc6a2cba19fe312d6db7b97aceff11d97f8"}, + {file = "marisa_trie-1.2.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2fb9243f66563285677079c9dccc697d35985287bacb36c8e685305687b0e025"}, + {file = "marisa_trie-1.2.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:58e2b84cbb6394f9c567f1f4351fc2995a094e1b684da9b577d4139b145401d6"}, + {file = "marisa_trie-1.2.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b4a8d3ed1f1b8f551b52e11a1265eaf0718f06bb206654b2c529cecda0913dd"}, + {file = "marisa_trie-1.2.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a97652c5fbc92f52100afe1c4583625015611000fa81606ad17f1b3bbb9f3bfa"}, + {file = "marisa_trie-1.2.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7183d84da20c89b2a366bf581f0d79d1e248909678f164e8536f291120432e8"}, + {file = "marisa_trie-1.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c7f4df4163202b0aa5dad3eeddf088ecb61e9101986c8b31f1e052ebd6df9292"}, + {file = "marisa_trie-1.2.0.tar.gz", hash = "sha256:fedfc67497f8aa2757756b5cf493759f245d321fb78914ce125b6d75daa89b5f"}, +] + +[package.dependencies] +setuptools = "*" + +[package.extras] +test = ["hypothesis", "pytest", "readme-renderer"] + [[package]] name = "markdown-it-py" version = "3.0.0" @@ -3609,6 +3914,48 @@ files = [ {file = "multidict-6.1.0.tar.gz", hash = "sha256:22ae2ebf9b0c69d206c003e2f6a914ea33f0a932d4aa16f236afc049d9958f4a"}, ] +[[package]] +name = "murmurhash" +version = "1.0.10" +description = "Cython bindings for MurmurHash" +optional = false +python-versions = ">=3.6" +files = [ + {file = "murmurhash-1.0.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3e90eef568adca5e17a91f96975e9a782ace3a617bbb3f8c8c2d917096e9bfeb"}, + {file = "murmurhash-1.0.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f8ecb00cc1ab57e4b065f9fb3ea923b55160c402d959c69a0b6dbbe8bc73efc3"}, + {file = "murmurhash-1.0.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3310101004d9e2e0530c2fed30174448d998ffd1b50dcbfb7677e95db101aa4b"}, + {file = "murmurhash-1.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c65401a6f1778676253cbf89c1f45a8a7feb7d73038e483925df7d5943c08ed9"}, + {file = "murmurhash-1.0.10-cp310-cp310-win_amd64.whl", hash = "sha256:f23f2dfc7174de2cdc5007c0771ab8376a2a3f48247f32cac4a5563e40c6adcc"}, + {file = "murmurhash-1.0.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:90ed37ee2cace9381b83d56068334f77e3e30bc521169a1f886a2a2800e965d6"}, + {file = "murmurhash-1.0.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:22e9926fdbec9d24ced9b0a42f0fee68c730438be3cfb00c2499fd495caec226"}, + {file = "murmurhash-1.0.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54bfbfd68baa99717239b8844600db627f336a08b1caf4df89762999f681cdd1"}, + {file = "murmurhash-1.0.10-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18b9d200a09d48ef67f6840b77c14f151f2b6c48fd69661eb75c7276ebdb146c"}, + {file = "murmurhash-1.0.10-cp311-cp311-win_amd64.whl", hash = "sha256:e5d7cfe392c0a28129226271008e61e77bf307afc24abf34f386771daa7b28b0"}, + {file = "murmurhash-1.0.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:96f0a070344d4802ea76a160e0d4c88b7dc10454d2426f48814482ba60b38b9e"}, + {file = "murmurhash-1.0.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9f61862060d677c84556610ac0300a0776cb13cb3155f5075ed97e80f86e55d9"}, + {file = "murmurhash-1.0.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b3b6d2d877d8881a08be66d906856d05944be0faf22b9a0390338bcf45299989"}, + {file = "murmurhash-1.0.10-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8f54b0031d8696fed17ed6e9628f339cdea0ba2367ca051e18ff59193f52687"}, + {file = "murmurhash-1.0.10-cp312-cp312-win_amd64.whl", hash = "sha256:97e09d675de2359e586f09de1d0de1ab39f9911edffc65c9255fb5e04f7c1f85"}, + {file = "murmurhash-1.0.10-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b64e5332932993fef598e78d633b1ba664789ab73032ed511f3dc615a631a1a"}, + {file = "murmurhash-1.0.10-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e2a38437a8497e082408aa015c6d90554b9e00c2c221fdfa79728a2d99a739e"}, + {file = "murmurhash-1.0.10-cp36-cp36m-win_amd64.whl", hash = "sha256:55f4e4f9291a53c36070330950b472d72ba7d331e4ce3ce1ab349a4f458f7bc4"}, + {file = "murmurhash-1.0.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:16ef9f0855952493fe08929d23865425906a8c0c40607ac8a949a378652ba6a9"}, + {file = "murmurhash-1.0.10-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cc3351ae92b89c2fcdc6e41ac6f17176dbd9b3554c96109fd0713695d8663e7"}, + {file = "murmurhash-1.0.10-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6559fef7c2e7349a42a63549067709b656d6d1580752bd76be1541d8b2d65718"}, + {file = "murmurhash-1.0.10-cp37-cp37m-win_amd64.whl", hash = "sha256:8bf49e3bb33febb7057ae3a5d284ef81243a1e55eaa62bdcd79007cddbdc0461"}, + {file = "murmurhash-1.0.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f1605fde07030516eb63d77a598dd164fb9bf217fd937dbac588fe7e47a28c40"}, + {file = "murmurhash-1.0.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4904f7e68674a64eb2b08823c72015a5e14653e0b4b109ea00c652a005a59bad"}, + {file = "murmurhash-1.0.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0438f0cb44cf1cd26251f72c1428213c4197d40a4e3f48b1efc3aea12ce18517"}, + {file = "murmurhash-1.0.10-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db1171a3f9a10571931764cdbfaa5371f4cf5c23c680639762125cb075b833a5"}, + {file = "murmurhash-1.0.10-cp38-cp38-win_amd64.whl", hash = "sha256:1c9fbcd7646ad8ba67b895f71d361d232c6765754370ecea473dd97d77afe99f"}, + {file = "murmurhash-1.0.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7024ab3498434f22f8e642ae31448322ad8228c65c8d9e5dc2d563d57c14c9b8"}, + {file = "murmurhash-1.0.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a99dedfb7f0cc5a4cd76eb409ee98d3d50eba024f934e705914f6f4d765aef2c"}, + {file = "murmurhash-1.0.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b580b8503647de5dd7972746b7613ea586270f17ac92a44872a9b1b52c36d68"}, + {file = "murmurhash-1.0.10-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d75840212bf75eb1352c946c3cf1622dacddd6d6bdda34368237d1eb3568f23a"}, + {file = "murmurhash-1.0.10-cp39-cp39-win_amd64.whl", hash = "sha256:a4209962b9f85de397c3203ea4b3a554da01ae9fd220fdab38757d4e9eba8d1a"}, + {file = "murmurhash-1.0.10.tar.gz", hash = "sha256:5282aab1317804c6ebd6dd7f69f15ba9075aee671c44a34be2bde0f1b11ef88a"}, +] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -4113,6 +4460,17 @@ files = [ [package.dependencies] ptyprocess = ">=0.5" +[[package]] +name = "phonenumbers" +version = "8.13.46" +description = "Python version of Google's common library for parsing, formatting, storing and validating international phone numbers." +optional = false +python-versions = "*" +files = [ + {file = "phonenumbers-8.13.46-py2.py3-none-any.whl", hash = "sha256:519422d407af066fdbf98e179ea2e214487060f26526d67871f817eefbbb2134"}, + {file = "phonenumbers-8.13.46.tar.gz", hash = "sha256:94bf18ba9725bb6868d29473b13f78ef01e2585c5cb561ec0200be7676e77452"}, +] + [[package]] name = "pillow" version = "10.4.0" @@ -4246,6 +4604,92 @@ files = [ greenlet = "3.0.3" pyee = "12.0.0" +[[package]] +name = "preshed" +version = "3.0.9" +description = "Cython hash table that trusts the keys are pre-hashed" +optional = false +python-versions = ">=3.6" +files = [ + {file = "preshed-3.0.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f96ef4caf9847b2bb9868574dcbe2496f974e41c2b83d6621c24fb4c3fc57e3"}, + {file = "preshed-3.0.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a61302cf8bd30568631adcdaf9e6b21d40491bd89ba8ebf67324f98b6c2a2c05"}, + {file = "preshed-3.0.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99499e8a58f58949d3f591295a97bca4e197066049c96f5d34944dd21a497193"}, + {file = "preshed-3.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea6b6566997dc3acd8c6ee11a89539ac85c77275b4dcefb2dc746d11053a5af8"}, + {file = "preshed-3.0.9-cp310-cp310-win_amd64.whl", hash = "sha256:bfd523085a84b1338ff18f61538e1cfcdedc4b9e76002589a301c364d19a2e36"}, + {file = "preshed-3.0.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7c2364da27f2875524ce1ca754dc071515a9ad26eb5def4c7e69129a13c9a59"}, + {file = "preshed-3.0.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:182138033c0730c683a6d97e567ceb8a3e83f3bff5704f300d582238dbd384b3"}, + {file = "preshed-3.0.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:345a10be3b86bcc6c0591d343a6dc2bfd86aa6838c30ced4256dfcfa836c3a64"}, + {file = "preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51d0192274aa061699b284f9fd08416065348edbafd64840c3889617ee1609de"}, + {file = "preshed-3.0.9-cp311-cp311-win_amd64.whl", hash = "sha256:96b857d7a62cbccc3845ac8c41fd23addf052821be4eb987f2eb0da3d8745aa1"}, + {file = "preshed-3.0.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b4fe6720012c62e6d550d6a5c1c7ad88cacef8388d186dad4bafea4140d9d198"}, + {file = "preshed-3.0.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e04f05758875be9751e483bd3c519c22b00d3b07f5a64441ec328bb9e3c03700"}, + {file = "preshed-3.0.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a55091d0e395f1fdb62ab43401bb9f8b46c7d7794d5b071813c29dc1ab22fd0"}, + {file = "preshed-3.0.9-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7de8f5138bcac7870424e09684dc3dd33c8e30e81b269f6c9ede3d8c7bb8e257"}, + {file = "preshed-3.0.9-cp312-cp312-win_amd64.whl", hash = "sha256:24229c77364628743bc29c5620c5d6607ed104f0e02ae31f8a030f99a78a5ceb"}, + {file = "preshed-3.0.9-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73b0f7ecc58095ebbc6ca26ec806008ef780190fe685ce471b550e7eef58dc2"}, + {file = "preshed-3.0.9-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5cb90ecd5bec71c21d95962db1a7922364d6db2abe284a8c4b196df8bbcc871e"}, + {file = "preshed-3.0.9-cp36-cp36m-win_amd64.whl", hash = "sha256:e304a0a8c9d625b70ba850c59d4e67082a6be9c16c4517b97850a17a282ebee6"}, + {file = "preshed-3.0.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1fa6d3d5529b08296ff9b7b4da1485c080311fd8744bbf3a86019ff88007b382"}, + {file = "preshed-3.0.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef1e5173809d85edd420fc79563b286b88b4049746b797845ba672cf9435c0e7"}, + {file = "preshed-3.0.9-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7fe81eb21c7d99e8b9a802cc313b998c5f791bda592903c732b607f78a6b7dc4"}, + {file = "preshed-3.0.9-cp37-cp37m-win_amd64.whl", hash = "sha256:78590a4a952747c3766e605ce8b747741005bdb1a5aa691a18aae67b09ece0e6"}, + {file = "preshed-3.0.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3452b64d97ce630e200c415073040aa494ceec6b7038f7a2a3400cbd7858e952"}, + {file = "preshed-3.0.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ac970d97b905e9e817ec13d31befd5b07c9cfec046de73b551d11a6375834b79"}, + {file = "preshed-3.0.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eebaa96ece6641cd981491cba995b68c249e0b6877c84af74971eacf8990aa19"}, + {file = "preshed-3.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d473c5f6856e07a88d41fe00bb6c206ecf7b34c381d30de0b818ba2ebaf9406"}, + {file = "preshed-3.0.9-cp38-cp38-win_amd64.whl", hash = "sha256:0de63a560f10107a3f0a9e252cc3183b8fdedcb5f81a86938fd9f1dcf8a64adf"}, + {file = "preshed-3.0.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3a9ad9f738084e048a7c94c90f40f727217387115b2c9a95c77f0ce943879fcd"}, + {file = "preshed-3.0.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a671dfa30b67baa09391faf90408b69c8a9a7f81cb9d83d16c39a182355fbfce"}, + {file = "preshed-3.0.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23906d114fc97c17c5f8433342495d7562e96ecfd871289c2bb2ed9a9df57c3f"}, + {file = "preshed-3.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:778cf71f82cedd2719b256f3980d556d6fb56ec552334ba79b49d16e26e854a0"}, + {file = "preshed-3.0.9-cp39-cp39-win_amd64.whl", hash = "sha256:a6e579439b329eb93f32219ff27cb358b55fbb52a4862c31a915a098c8a22ac2"}, + {file = "preshed-3.0.9.tar.gz", hash = "sha256:721863c5244ffcd2651ad0928951a2c7c77b102f4e11a251ad85d37ee7621660"}, +] + +[package.dependencies] +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=0.28.0,<1.1.0" + +[[package]] +name = "presidio-analyzer" +version = "2.2.355" +description = "Presidio Analyzer package" +optional = false +python-versions = "<4.0,>=3.8" +files = [ + {file = "presidio_analyzer-2.2.355-py3-none-any.whl", hash = "sha256:c4c5bc6d82e4f94059fd554c31365fc5d29afe51763391b6ecc33b628bdb5858"}, +] + +[package.dependencies] +phonenumbers = ">=8.12,<9.0.0" +pyyaml = "*" +regex = "*" +spacy = ">=3.4.4,<4.0.0" +tldextract = "*" + +[package.extras] +azure-ai-language = ["azure-ai-textanalytics", "azure-core"] +server = ["flask (>=1.1)"] +stanza = ["spacy_stanza", "stanza"] +transformers = ["spacy_huggingface_pipelines"] + +[[package]] +name = "presidio-anonymizer" +version = "2.2.355" +description = "Presidio Anonymizer package - replaces analyzed text with desired values." +optional = false +python-versions = "<4.0,>=3.8" +files = [ + {file = "presidio_anonymizer-2.2.355-py3-none-any.whl", hash = "sha256:c85f5f155fcb66aff8e962fcf3984552a5512ab34bb1a433b1a52193e635c23f"}, +] + +[package.dependencies] +azure-core = "*" +pycryptodome = ">=3.10.1" + +[package.extras] +server = ["flask (>=1.1)"] + [[package]] name = "prometheus-client" version = "0.21.0" @@ -4401,6 +4845,47 @@ files = [ {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +[[package]] +name = "pycryptodome" +version = "3.21.0" +description = "Cryptographic library for Python" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "pycryptodome-3.21.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:dad9bf36eda068e89059d1f07408e397856be9511d7113ea4b586642a429a4fd"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:a1752eca64c60852f38bb29e2c86fca30d7672c024128ef5d70cc15868fa10f4"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ba4cc304eac4d4d458f508d4955a88ba25026890e8abff9b60404f76a62c55e"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cb087b8612c8a1a14cf37dd754685be9a8d9869bed2ffaaceb04850a8aeef7e"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:26412b21df30b2861424a6c6d5b1d8ca8107612a4cfa4d0183e71c5d200fb34a"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-win32.whl", hash = "sha256:cc2269ab4bce40b027b49663d61d816903a4bd90ad88cb99ed561aadb3888dd3"}, + {file = "pycryptodome-3.21.0-cp27-cp27m-win_amd64.whl", hash = "sha256:0fa0a05a6a697ccbf2a12cec3d6d2650b50881899b845fac6e87416f8cb7e87d"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6cce52e196a5f1d6797ff7946cdff2038d3b5f0aba4a43cb6bf46b575fd1b5bb"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:a915597ffccabe902e7090e199a7bf7a381c5506a747d5e9d27ba55197a2c568"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e74c522d630766b03a836c15bff77cb657c5fdf098abf8b1ada2aebc7d0819"}, + {file = "pycryptodome-3.21.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:a3804675283f4764a02db05f5191eb8fec2bb6ca34d466167fc78a5f05bbe6b3"}, + {file = "pycryptodome-3.21.0-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:2480ec2c72438430da9f601ebc12c518c093c13111a5c1644c82cdfc2e50b1e4"}, + {file = "pycryptodome-3.21.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:de18954104667f565e2fbb4783b56667f30fb49c4d79b346f52a29cb198d5b6b"}, + {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de4b7263a33947ff440412339cb72b28a5a4c769b5c1ca19e33dd6cd1dcec6e"}, + {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0714206d467fc911042d01ea3a1847c847bc10884cf674c82e12915cfe1649f8"}, + {file = "pycryptodome-3.21.0-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d85c1b613121ed3dbaa5a97369b3b757909531a959d229406a75b912dd51dd1"}, + {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:8898a66425a57bcf15e25fc19c12490b87bd939800f39a03ea2de2aea5e3611a"}, + {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:932c905b71a56474bff8a9c014030bc3c882cee696b448af920399f730a650c2"}, + {file = "pycryptodome-3.21.0-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:18caa8cfbc676eaaf28613637a89980ad2fd96e00c564135bf90bc3f0b34dd93"}, + {file = "pycryptodome-3.21.0-cp36-abi3-win32.whl", hash = "sha256:280b67d20e33bb63171d55b1067f61fbd932e0b1ad976b3a184303a3dad22764"}, + {file = "pycryptodome-3.21.0-cp36-abi3-win_amd64.whl", hash = "sha256:b7aa25fc0baa5b1d95b7633af4f5f1838467f1815442b22487426f94e0d66c53"}, + {file = "pycryptodome-3.21.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:2cb635b67011bc147c257e61ce864879ffe6d03342dc74b6045059dfbdedafca"}, + {file = "pycryptodome-3.21.0-pp27-pypy_73-win32.whl", hash = "sha256:4c26a2f0dc15f81ea3afa3b0c87b87e501f235d332b7f27e2225ecb80c0b1cdd"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:d5ebe0763c982f069d3877832254f64974139f4f9655058452603ff559c482e8"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ee86cbde706be13f2dec5a42b52b1c1d1cbb90c8e405c68d0755134735c8dc6"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0fd54003ec3ce4e0f16c484a10bc5d8b9bd77fa662a12b85779a2d2d85d67ee0"}, + {file = "pycryptodome-3.21.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5dfafca172933506773482b0e18f0cd766fd3920bd03ec85a283df90d8a17bc6"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:590ef0898a4b0a15485b05210b4a1c9de8806d3ad3d47f74ab1dc07c67a6827f"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f35e442630bc4bc2e1878482d6f59ea22e280d7121d7adeaedba58c23ab6386b"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff99f952db3db2fbe98a0b355175f93ec334ba3d01bbde25ad3a5a33abc02b58"}, + {file = "pycryptodome-3.21.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:8acd7d34af70ee63f9a849f957558e49a98f8f1634f86a59d2be62bb8e93f71c"}, + {file = "pycryptodome-3.21.0.tar.gz", hash = "sha256:f7787e0d469bdae763b876174cf2e6c0f7be79808af26b1da96f1a64bcf47297"}, +] + [[package]] name = "pydantic" version = "2.9.2" @@ -5454,6 +5939,31 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "smart-open" +version = "7.0.4" +description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" +optional = false +python-versions = "<4.0,>=3.7" +files = [ + {file = "smart_open-7.0.4-py3-none-any.whl", hash = "sha256:4e98489932b3372595cddc075e6033194775165702887216b65eba760dfd8d47"}, + {file = "smart_open-7.0.4.tar.gz", hash = "sha256:62b65852bdd1d1d516839fcb1f6bc50cd0f16e05b4ec44b52f43d38bcb838524"}, +] + +[package.dependencies] +wrapt = "*" + +[package.extras] +all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "paramiko", "requests", "zstandard"] +azure = ["azure-common", "azure-core", "azure-storage-blob"] +gcs = ["google-cloud-storage (>=2.6.0)"] +http = ["requests"] +s3 = ["boto3"] +ssh = ["paramiko"] +test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "moto[server]", "paramiko", "pytest", "pytest-rerunfailures", "requests", "responses", "zstandard"] +webhdfs = ["requests"] +zst = ["zstandard"] + [[package]] name = "sniffio" version = "1.3.1" @@ -5487,6 +5997,115 @@ files = [ {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, ] +[[package]] +name = "spacy" +version = "3.7.5" +description = "Industrial-strength Natural Language Processing (NLP) in Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "spacy-3.7.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8002897701429ee2ab5ff6921ae43560f4cd17184cb1e10dad761901c12dcb85"}, + {file = "spacy-3.7.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:43acd19efc845e9126b61a05ed7508a0aff509e96e15563f30f810c19e636b7c"}, + {file = "spacy-3.7.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f044522b1271ea54718dc43b6f593b5dad349cd31b3827764c501529b599e09a"}, + {file = "spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a7dbfbca42c1c128fefa6832631fe49e11c850e963af99229f14e2d0ae94f34"}, + {file = "spacy-3.7.5-cp310-cp310-win_amd64.whl", hash = "sha256:2a21b2a1e1e5d10d15c6f75990b7341d0fc9b454083dfd4222fdd75b9164831c"}, + {file = "spacy-3.7.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cd93c34bf2a02bbed7df73d42aed8df5e3eb9688c4ea84ec576f740ba939cce5"}, + {file = "spacy-3.7.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:190ba0032a5efdb138487c587c0ebb7a98f86adb917f464b252ee8766b8eec4a"}, + {file = "spacy-3.7.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38de1c9bbb73b8cdfea2dd6e57450f093c1a1af47515870c1c8640b85b35ab16"}, + {file = "spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dad4853950a2fe6c7a0bdfd791a762d1f8cedd2915c4ae41b2e0ca3a850eefc"}, + {file = "spacy-3.7.5-cp311-cp311-win_amd64.whl", hash = "sha256:4e00d076871af784c2e43185a71ee676b58893853a05c5b81717b8af2b666c07"}, + {file = "spacy-3.7.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:bf54c3c2425428b328b53a65913d47eb4cb27a1429aa4e8ed979ffc97d4663e0"}, + {file = "spacy-3.7.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4145cea7f9814fa7d86b2028c2dd83e02f13f80d5ac604a400b2f7d7b26a0e8c"}, + {file = "spacy-3.7.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:262f8ebb71f7ed5ffe8e4f384b2594b7a296be50241ce9fbd9277b5da2f46f38"}, + {file = "spacy-3.7.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:faa1e2b6234ae33c0b1f8dfa5a8dcb66fb891f19231725dfcff4b2666125c250"}, + {file = "spacy-3.7.5-cp312-cp312-win_amd64.whl", hash = "sha256:07677e270a6d729453cc04b5e2247a96a86320b8845e6428d9f90f217eff0f56"}, + {file = "spacy-3.7.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3e207dda0639818e2ef8f12e3df82a526de118cc09082b0eee3053ebcd9f8332"}, + {file = "spacy-3.7.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5694dd3b2f6414c18e2a3f31111cd41ffd597e1d614b51c5779f85ff07f08f6c"}, + {file = "spacy-3.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d211920ff73d68b8febb1d293f10accbd54f2b2228ecd3530548227b750252b1"}, + {file = "spacy-3.7.5-cp37-cp37m-win_amd64.whl", hash = "sha256:1171bf4d8541c18a83441be01feb6c735ffc02e9308810cd691c8900a6678cd5"}, + {file = "spacy-3.7.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d9108f67675fb2078ed77cda61fd4cfc197f9256c28d35cfd946dcb080190ddc"}, + {file = "spacy-3.7.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:12fdc01a4391299a47f16915505cc515fd059e71c7239904e216523354eeb9d9"}, + {file = "spacy-3.7.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f8fbe9f6b9de1bf05d163a9dd88108b8f20b138986e6ed36f960832e3fcab33"}, + {file = "spacy-3.7.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d244d524ab5a33530ac5c50fc92c9a41da6c3980f452048b9fc29e1ff1bdd03e"}, + {file = "spacy-3.7.5-cp38-cp38-win_amd64.whl", hash = "sha256:8b493a8b79a7f3754102fa5ef7e2615568a390fec7ea20db49af55e5f0841fcf"}, + {file = "spacy-3.7.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fdbb667792d6ca93899645774d1db3fccc327088a92072029be1e4bc25d7cf15"}, + {file = "spacy-3.7.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4cfb85309e11a39681c9d4941aebb95c1f5e2e3b77a61a5451e2c3849da4b92e"}, + {file = "spacy-3.7.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b0bf1788ca397eef8e67e9c07cfd9287adac438512dd191e6e6ca0f36357201"}, + {file = "spacy-3.7.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:591d90d8504e9bd5be5b482be7c6d6a974afbaeb62c3181e966f4e407e0ab300"}, + {file = "spacy-3.7.5-cp39-cp39-win_amd64.whl", hash = "sha256:713b56fe008c79df01617f3602a0b7e523292211337eb999bdffb910ea1f4825"}, + {file = "spacy-3.7.5.tar.gz", hash = "sha256:a648c6cbf2acc7a55a69ee9e7fa4f22bdf69aa828a587a1bc5cfff08cf3c2dd3"}, +] + +[package.dependencies] +catalogue = ">=2.0.6,<2.1.0" +cymem = ">=2.0.2,<2.1.0" +jinja2 = "*" +langcodes = ">=3.2.0,<4.0.0" +murmurhash = ">=0.28.0,<1.1.0" +numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} +packaging = ">=20.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +requests = ">=2.13.0,<3.0.0" +setuptools = "*" +spacy-legacy = ">=3.0.11,<3.1.0" +spacy-loggers = ">=1.0.0,<2.0.0" +srsly = ">=2.4.3,<3.0.0" +thinc = ">=8.2.2,<8.3.0" +tqdm = ">=4.38.0,<5.0.0" +typer = ">=0.3.0,<1.0.0" +wasabi = ">=0.9.1,<1.2.0" +weasel = ">=0.1.0,<0.5.0" + +[package.extras] +apple = ["thinc-apple-ops (>=0.1.0.dev0,<1.0.0)"] +cuda = ["cupy (>=5.0.0b4,<13.0.0)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0,<13.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4,<13.0.0)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4,<13.0.0)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4,<13.0.0)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4,<13.0.0)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4,<13.0.0)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4,<13.0.0)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4,<13.0.0)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4,<13.0.0)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4,<13.0.0)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4,<13.0.0)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4,<13.0.0)"] +cuda11x = ["cupy-cuda11x (>=11.0.0,<13.0.0)"] +cuda12x = ["cupy-cuda12x (>=11.5.0,<13.0.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4,<13.0.0)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4,<13.0.0)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4,<13.0.0)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4,<13.0.0)"] +ja = ["sudachidict-core (>=20211220)", "sudachipy (>=0.5.2,!=0.6.1)"] +ko = ["natto-py (>=0.9.0)"] +lookups = ["spacy-lookups-data (>=1.0.3,<1.1.0)"] +th = ["pythainlp (>=2.0)"] +transformers = ["spacy-transformers (>=1.1.2,<1.4.0)"] + +[[package]] +name = "spacy-legacy" +version = "3.0.12" +description = "Legacy registered functions for spaCy backwards compatibility" +optional = false +python-versions = ">=3.6" +files = [ + {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, + {file = "spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f"}, +] + +[[package]] +name = "spacy-loggers" +version = "1.0.5" +description = "Logging utilities for SpaCy" +optional = false +python-versions = ">=3.6" +files = [ + {file = "spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24"}, + {file = "spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645"}, +] + [[package]] name = "spider-client" version = "0.0.27" @@ -5587,6 +6206,52 @@ postgresql-psycopgbinary = ["psycopg[binary] (>=3.0.7)"] pymysql = ["pymysql"] sqlcipher = ["sqlcipher3_binary"] +[[package]] +name = "srsly" +version = "2.4.8" +description = "Modern high-performance serialization utilities for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "srsly-2.4.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:17f3bcb418bb4cf443ed3d4dcb210e491bd9c1b7b0185e6ab10b6af3271e63b2"}, + {file = "srsly-2.4.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0b070a58e21ab0e878fd949f932385abb4c53dd0acb6d3a7ee75d95d447bc609"}, + {file = "srsly-2.4.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98286d20014ed2067ad02b0be1e17c7e522255b188346e79ff266af51a54eb33"}, + {file = "srsly-2.4.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18685084e2e0cc47c25158cbbf3e44690e494ef77d6418c2aae0598c893f35b0"}, + {file = "srsly-2.4.8-cp310-cp310-win_amd64.whl", hash = "sha256:980a179cbf4eb5bc56f7507e53f76720d031bcf0cef52cd53c815720eb2fc30c"}, + {file = "srsly-2.4.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5472ed9f581e10c32e79424c996cf54c46c42237759f4224806a0cd4bb770993"}, + {file = "srsly-2.4.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:50f10afe9230072c5aad9f6636115ea99b32c102f4c61e8236d8642c73ec7a13"}, + {file = "srsly-2.4.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c994a89ba247a4d4f63ef9fdefb93aa3e1f98740e4800d5351ebd56992ac75e3"}, + {file = "srsly-2.4.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ace7ed4a0c20fa54d90032be32f9c656b6d75445168da78d14fe9080a0c208ad"}, + {file = "srsly-2.4.8-cp311-cp311-win_amd64.whl", hash = "sha256:7a919236a090fb93081fbd1cec030f675910f3863825b34a9afbcae71f643127"}, + {file = "srsly-2.4.8-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7583c03d114b4478b7a357a1915305163e9eac2dfe080da900555c975cca2a11"}, + {file = "srsly-2.4.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:94ccdd2f6db824c31266aaf93e0f31c1c43b8bc531cd2b3a1d924e3c26a4f294"}, + {file = "srsly-2.4.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db72d2974f91aee652d606c7def98744ca6b899bd7dd3009fd75ebe0b5a51034"}, + {file = "srsly-2.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a60c905fd2c15e848ce1fc315fd34d8a9cc72c1dee022a0d8f4c62991131307"}, + {file = "srsly-2.4.8-cp312-cp312-win_amd64.whl", hash = "sha256:e0b8d5722057000694edf105b8f492e7eb2f3aa6247a5f0c9170d1e0d074151c"}, + {file = "srsly-2.4.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:196b4261f9d6372d1d3d16d1216b90c7e370b4141471322777b7b3c39afd1210"}, + {file = "srsly-2.4.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4750017e6d78590b02b12653e97edd25aefa4734281386cc27501d59b7481e4e"}, + {file = "srsly-2.4.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa034cd582ba9e4a120c8f19efa263fcad0f10fc481e73fb8c0d603085f941c4"}, + {file = "srsly-2.4.8-cp36-cp36m-win_amd64.whl", hash = "sha256:5a78ab9e9d177ee8731e950feb48c57380036d462b49e3fb61a67ce529ff5f60"}, + {file = "srsly-2.4.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:087e36439af517e259843df93eb34bb9e2d2881c34fa0f541589bcfbc757be97"}, + {file = "srsly-2.4.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad141d8a130cb085a0ed3a6638b643e2b591cb98a4591996780597a632acfe20"}, + {file = "srsly-2.4.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24d05367b2571c0d08d00459636b951e3ca2a1e9216318c157331f09c33489d3"}, + {file = "srsly-2.4.8-cp37-cp37m-win_amd64.whl", hash = "sha256:3fd661a1c4848deea2849b78f432a70c75d10968e902ca83c07c89c9b7050ab8"}, + {file = "srsly-2.4.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ec37233fe39af97b00bf20dc2ceda04d39b9ea19ce0ee605e16ece9785e11f65"}, + {file = "srsly-2.4.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d2fd4bc081f1d6a6063396b6d97b00d98e86d9d3a3ac2949dba574a84e148080"}, + {file = "srsly-2.4.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7347cff1eb4ef3fc335d9d4acc89588051b2df43799e5d944696ef43da79c873"}, + {file = "srsly-2.4.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9dc1da5cc94d77056b91ba38365c72ae08556b6345bef06257c7e9eccabafe"}, + {file = "srsly-2.4.8-cp38-cp38-win_amd64.whl", hash = "sha256:dc0bf7b6f23c9ecb49ec0924dc645620276b41e160e9b283ed44ca004c060d79"}, + {file = "srsly-2.4.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ff8df21d00d73c371bead542cefef365ee87ca3a5660de292444021ff84e3b8c"}, + {file = "srsly-2.4.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0ac3e340e65a9fe265105705586aa56054dc3902789fcb9a8f860a218d6c0a00"}, + {file = "srsly-2.4.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06d1733f4275eff4448e96521cc7dcd8fdabd68ba9b54ca012dcfa2690db2644"}, + {file = "srsly-2.4.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be5b751ad88fdb58fb73871d456248c88204f213aaa3c9aab49b6a1802b3fa8d"}, + {file = "srsly-2.4.8-cp39-cp39-win_amd64.whl", hash = "sha256:822a38b8cf112348f3accbc73274a94b7bf82515cb14a85ba586d126a5a72851"}, + {file = "srsly-2.4.8.tar.gz", hash = "sha256:b24d95a65009c2447e0b49cda043ac53fecf4f09e358d87a57446458f91b8a91"}, +] + +[package.dependencies] +catalogue = ">=2.0.3,<2.1.0" + [[package]] name = "stack-data" version = "0.6.3" @@ -5684,6 +6349,75 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["pre-commit", "pytest (>=7.0)", "pytest-timeout"] typing = ["mypy (>=1.6,<2.0)", "traitlets (>=5.11.1)"] +[[package]] +name = "thinc" +version = "8.2.5" +description = "A refreshing functional take on deep learning, compatible with your favorite libraries" +optional = false +python-versions = ">=3.6" +files = [ + {file = "thinc-8.2.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dc267f6aad80a681a85f50383afe91da9e2bec56fefdda86bfa2e4f529bef191"}, + {file = "thinc-8.2.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d80f1e497971c9fa0938f5cc8fe607bbe87356b405fb7bbc3ff9f32fb4eed3bb"}, + {file = "thinc-8.2.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0933adbd3e65e30d3bef903e77a368bc8a41bed34b0d18df6d4fc0536908e21f"}, + {file = "thinc-8.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54bac2ba23b208fdaf267cd6113d26a5ecbb3b0e0c6015dff784ae6a9c5e78ca"}, + {file = "thinc-8.2.5-cp310-cp310-win_amd64.whl", hash = "sha256:399260197ef3f8d9600315fc5b5a1d5940400fceb0361de642e9fe3506d82385"}, + {file = "thinc-8.2.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a75c0de3340afed594beda293661de145f3842873df56d9989bc338148f13fab"}, + {file = "thinc-8.2.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6b166d1a22003ee03bc236370fff2884744c1fb758a6209a2512d305773d07d7"}, + {file = "thinc-8.2.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34db8a023b9f70645fdf06c510584ba6d8b97ec53c1e094f42d95652bf8c875f"}, + {file = "thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8901b30db1071ea8d5e4437429c8632535bf5ed87938ce3bb5057bed9f15aed8"}, + {file = "thinc-8.2.5-cp311-cp311-win_amd64.whl", hash = "sha256:8ef5d46d62e31f2450224ab22391a606cf427b13e20cfc570f70422e2f333872"}, + {file = "thinc-8.2.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9fc26697e2358c71a5fe243d52e98ae67ee1a3b314eead5031845b6d1c0d121c"}, + {file = "thinc-8.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8e299d4dc41107385d6d14d8604a060825798a031cabe2b894b22f9d75d9eaad"}, + {file = "thinc-8.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8a8f2f249f2be9a5ce2a81a6efe7503b68be7b57e47ad54ab28204e1f0c723b"}, + {file = "thinc-8.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87e729f33c76ec6df9b375989743252ab880d79f3a2b4175169b21dece90f102"}, + {file = "thinc-8.2.5-cp312-cp312-win_amd64.whl", hash = "sha256:c5f750ea2dd32ca6d46947025dacfc0f6037340c4e5f7adb9af84c75f65aa7d8"}, + {file = "thinc-8.2.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bb97e2f699a3df16112ef5460cbfb0c9189a5fbc0e76bcf170ed7d995bdce367"}, + {file = "thinc-8.2.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5c78fb218273894168d1ca2dd3a20f28dba5a7fa698c4f2a2fc425eda2086cfc"}, + {file = "thinc-8.2.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc27da534807a2addd1c3d2a3d19f99e3eb67fdbce81c21f4e4c8bfa94ac15b"}, + {file = "thinc-8.2.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b884e56eaeb9e5c7bfeb1c8810a3cbad19a599b33b9f3152b90b67f468471ac"}, + {file = "thinc-8.2.5-cp39-cp39-win_amd64.whl", hash = "sha256:df2138cf379061017ecb8bf609a8857e7904709ef0a9a2252783c16f67a2b749"}, + {file = "thinc-8.2.5.tar.gz", hash = "sha256:c2963791c934cc7fbd8f9b942d571cac79892ad11630bfca690a868c32752b75"}, +] + +[package.dependencies] +blis = ">=0.7.8,<0.8.0" +catalogue = ">=2.0.4,<2.1.0" +confection = ">=0.0.1,<1.0.0" +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=1.0.2,<1.1.0" +numpy = {version = ">=1.19.0,<2.0.0", markers = "python_version >= \"3.9\""} +packaging = ">=20.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +setuptools = "*" +srsly = ">=2.4.0,<3.0.0" +wasabi = ">=0.8.1,<1.2.0" + +[package.extras] +cuda = ["cupy (>=5.0.0b4)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4)"] +cuda11x = ["cupy-cuda11x (>=11.0.0)"] +cuda12x = ["cupy-cuda12x (>=11.5.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] +datasets = ["ml-datasets (>=0.2.0,<0.3.0)"] +mxnet = ["mxnet (>=1.5.1,<1.6.0)"] +tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] +torch = ["torch (>=1.6.0)"] + [[package]] name = "tiktoken" version = "0.7.0" @@ -6192,6 +6926,20 @@ files = [ docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] +[[package]] +name = "wasabi" +version = "1.1.3" +description = "A lightweight console printing and formatting toolkit" +optional = false +python-versions = ">=3.6" +files = [ + {file = "wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c"}, + {file = "wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878"}, +] + +[package.dependencies] +colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\" and python_version >= \"3.7\""} + [[package]] name = "watchfiles" version = "0.24.0" @@ -6298,6 +7046,28 @@ files = [ {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +[[package]] +name = "weasel" +version = "0.4.1" +description = "Weasel: A small and easy workflow system" +optional = false +python-versions = ">=3.7" +files = [ + {file = "weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c"}, + {file = "weasel-0.4.1.tar.gz", hash = "sha256:aabc210f072e13f6744e5c3a28037f93702433405cd35673f7c6279147085aa9"}, +] + +[package.dependencies] +cloudpathlib = ">=0.7.0,<1.0.0" +confection = ">=0.0.4,<0.2.0" +packaging = ">=20.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +requests = ">=2.13.0,<3.0.0" +smart-open = ">=5.2.1,<8.0.0" +srsly = ">=2.4.3,<3.0.0" +typer = ">=0.3.0,<1.0.0" +wasabi = ">=0.9.1,<1.2.0" + [[package]] name = "webcolors" version = "24.8.0" @@ -6633,4 +7403,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "fdee95ccf4e92495e158ffedad4995b97152c0b5724cad47fde6f109df6e2026" +content-hash = "823a10f4d45764f1907fe80fe0a2c53bc14b2b52f46b2e1ca873decc83558e1c" diff --git a/apps/chatbot/pyproject.toml b/apps/chatbot/pyproject.toml index 7585ce7c56..af49f36694 100644 --- a/apps/chatbot/pyproject.toml +++ b/apps/chatbot/pyproject.toml @@ -37,8 +37,9 @@ llama-index-llms-gemini = "^0.3.4" google-generativeai = "^0.5.2" llama-index-embeddings-gemini = "^0.2.0" llama-index-llms-bedrock-converse = "^0.3.0" +llama-index-postprocessor-presidio = "^0.2.0" [build-system] requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" +build-backend = "poetry.core.masonry.api" \ No newline at end of file diff --git a/apps/chatbot/scripts/redis-tunnel.sh b/apps/chatbot/scripts/redis-tunnel.sh index 16251cfa54..c57aec3855 100755 --- a/apps/chatbot/scripts/redis-tunnel.sh +++ b/apps/chatbot/scripts/redis-tunnel.sh @@ -6,11 +6,14 @@ SERVICE_NAME="cms-ecs" LOAD_BALANCER_NAME="chatbot-load-balancer" REMOTE_PORT="6379" LOCAL_PORT="16379" -AWS_REGION="eu-south-1" +# Load Redis ECS Region +if [ -f .env ]; then + export $(grep CHB_AWS_DEFAULT_REGION .env | xargs); +fi # Get the ECS Task ARN -TASK_ARN=$(aws ecs list-tasks --cluster $CLUSTER_NAME --service-name $SERVICE_NAME --query 'taskArns[0]' --output text --region $AWS_REGION) +TASK_ARN=$(aws ecs list-tasks --cluster $CLUSTER_NAME --service-name $SERVICE_NAME --query 'taskArns[0]' --output text --region $CHB_AWS_DEFAULT_REGION) if [ -z "$TASK_ARN" ]; then echo "No ECS task found for the service $SERVICE_NAME in cluster $CLUSTER_NAME." @@ -26,7 +29,7 @@ if [ -z "$TASK_ID" ]; then fi # Get the container name (task definition details) -CONTAINER_ID=$(aws ecs describe-tasks --cluster $CLUSTER_NAME --tasks $TASK_ARN --query 'tasks[0].containers[0].runtimeId' --output text --region $AWS_REGION) +CONTAINER_ID=$(aws ecs describe-tasks --cluster $CLUSTER_NAME --tasks $TASK_ARN --query 'tasks[0].containers[0].runtimeId' --output text --region $CHB_AWS_DEFAULT_REGION) if [ -z "$CONTAINER_ID" ]; then echo "No container found for task $TASK_ARN." @@ -34,7 +37,7 @@ if [ -z "$CONTAINER_ID" ]; then fi # Get the Network Load Balancer DNS name -TARGET_HOST=$(aws elbv2 describe-load-balancers --names $LOAD_BALANCER_NAME --query 'LoadBalancers[0].DNSName' --output text --region $AWS_REGION) +TARGET_HOST=$(aws elbv2 describe-load-balancers --names $LOAD_BALANCER_NAME --query 'LoadBalancers[0].DNSName' --output text --region $CHB_AWS_DEFAULT_REGION) if [ -z "$TARGET_HOST" ]; then echo "No Load Balancer found with the name $LOAD_BALANCER_NAME." @@ -49,4 +52,4 @@ aws ssm start-session \ --target $TARGET \ --document-name AWS-StartPortForwardingSessionToRemoteHost \ --parameters "{\"host\":[\"$TARGET_HOST\"],\"portNumber\":[\"$REMOTE_PORT\"],\"localPortNumber\":[\"$LOCAL_PORT\"]}" \ - --region $AWS_REGION \ No newline at end of file + --region $CHB_AWS_DEFAULT_REGION \ No newline at end of file diff --git a/apps/chatbot/scripts/spacy_download.py b/apps/chatbot/scripts/spacy_download.py new file mode 100644 index 0000000000..265fdbacdb --- /dev/null +++ b/apps/chatbot/scripts/spacy_download.py @@ -0,0 +1,21 @@ +import spacy +import yaml +import os + +if __name__ == "__main__": + task_root = os.getenv("LAMBDA_TASK_ROOT") + + # Define the path to the YAML file + yaml_file_path = 'config/params.yaml' + + # Read models + with open(yaml_file_path, 'r') as file: + config = yaml.safe_load(file) + + # Access the "models" array from the "config_presidio" section + models = config.get('config_presidio', {}).get('models', []) + + # Print the models + for model in models: + print(f"Downloading model {model['model_name']}") + spacy.cli.download(model['model_name']) \ No newline at end of file diff --git a/apps/chatbot/src/modules/chatbot.py b/apps/chatbot/src/modules/chatbot.py index a53f70d78b..a187968503 100644 --- a/apps/chatbot/src/modules/chatbot.py +++ b/apps/chatbot/src/modules/chatbot.py @@ -11,11 +11,9 @@ ) from src.modules.models import get_llm, get_embed_model -from src.modules.vector_database import ( - load_automerging_index_s3, load_url_hash_table, - load_automerging_index_redis, REDIS_KVSTORE -) +from src.modules.vector_database import load_automerging_index_redis, REDIS_KVSTORE, INDEX_ID from src.modules.engine import get_automerging_query_engine +from src.modules.presidio import PresidioPII AWS_S3_BUCKET = os.getenv("CHB_AWS_S3_BUCKET") @@ -28,54 +26,25 @@ logging.getLogger().setLevel(os.getenv("LOG_LEVEL", "INFO")) + class Chatbot(): def __init__( self, params, - prompts, + prompts ): self.params = params self.prompts = prompts - self.use_redis = params["vector_index"]["use_redis"] - self.use_s3 = params["vector_index"]["use_s3"] - - if self.use_s3 and self.use_redis: - raise Exception("Vector Store Error: use s3 or Redis or none of them.") - + self.pii = PresidioPII(config=params["config_presidio"]) self.model = get_llm() self.embed_model = get_embed_model() - - if self.use_redis: - self.index = load_automerging_index_redis( - self.model, - self.embed_model, - chunk_sizes=params["vector_index"]["chunk_sizes"], - chunk_overlap=params["vector_index"]["chunk_overlap"] - ) - - elif self.use_s3: - self.hash_table = load_url_hash_table( - s3_bucket_name=AWS_S3_BUCKET, - ) - self.index = load_automerging_index_s3( - self.model, - self.embed_model, - save_dir=params["vector_index"]["path"], - s3_bucket_name=AWS_S3_BUCKET, - chunk_sizes=params["vector_index"]["chunk_sizes"], - chunk_overlap=params["vector_index"]["chunk_overlap"], - ) - else: - self.hash_table = load_url_hash_table() - self.index = load_automerging_index_s3( - self.model, - self.embed_model, - save_dir=params["vector_index"]["path"], - chunk_sizes=params["vector_index"]["chunk_sizes"], - chunk_overlap=params["vector_index"]["chunk_overlap"], - ) - + self.index = load_automerging_index_redis( + self.model, + self.embed_model, + chunk_sizes=params["vector_index"]["chunk_sizes"], + chunk_overlap=params["vector_index"]["chunk_overlap"] + ) self.history = [ ChatMessage( role=MessageRole.ASSISTANT, @@ -155,18 +124,13 @@ def _unmask_reference(self, response_str: str, nodes) -> str: logging.info(f"Generated answer has {len(hashed_urls)} references taken from {len(nodes)} nodes. First node has score: {nodes[0].score:.4f}.") for hashed_url in hashed_urls: - if self.use_redis: - url = REDIS_KVSTORE.get( - collection="hash_table", - key=hashed_url - ) - if url is None: - url = "{URL}" - else: - if hashed_url in self.hash_table.keys(): - url = self.hash_table[hashed_url] - else: - url = "{URL}" + url = REDIS_KVSTORE.get( + collection=f"hash_table_{INDEX_ID}", + key=hashed_url + ) + if url is None: + url = "{URL}" + response_str = response_str.replace(hashed_url, url) # remove sentences with generated masked url: {URL} @@ -175,6 +139,10 @@ def _unmask_reference(self, response_str: str, nodes) -> str: response_str = "".join(filtered_parts) # join the filtered parts back into a single string return response_str + + + def mask_pii(self, message: str) -> str: + return self.pii.mask_pii(message) def generate(self, query_str: str) -> str: diff --git a/apps/chatbot/src/modules/create_vector_index.py b/apps/chatbot/src/modules/create_vector_index.py index eb8aca8860..95f4a54954 100644 --- a/apps/chatbot/src/modules/create_vector_index.py +++ b/apps/chatbot/src/modules/create_vector_index.py @@ -4,18 +4,14 @@ import logging from src.modules.models import get_llm, get_embed_model -from src.modules.vector_database import ( - build_automerging_index, - build_automerging_index_redis, - build_automerging_index_s3 -) +from src.modules.vector_database import build_automerging_index_redis from dotenv import load_dotenv load_dotenv() logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO")) -CHB_AWS_DEFAULT_REGION = os.getenv("CHB_AWS_DEFAULT_REGION", os.getenv("AWS_DEFAULT_REGION")) +DOCUMENTATION_DIR = os.getenv("CHB_DOCUMENTATION_DIR") AWS_S3_BUCKET = os.getenv("CHB_AWS_S3_BUCKET", os.getenv("AWS_S3_BUCKET")) @@ -27,37 +23,15 @@ # load parameters params = yaml.safe_load(open(args.params, "r")) - if params["vector_index"]["use_redis"] and params["vector_index"]["use_s3"]: - raise Exception("Vector Store Error: use s3 or Redis or none of them.") - model = get_llm() embed_model = get_embed_model() # create vector index - if params["vector_index"]["use_redis"]: - index = build_automerging_index_redis( - model, - embed_model, - documentation_dir=params["documentation"]["path"], - chunk_sizes=params["vector_index"]["chunk_sizes"], - chunk_overlap=params["vector_index"]["chunk_overlap"] - ) - if params["vector_index"]["use_s3"]: - index = build_automerging_index_s3( - model, - embed_model, - documentation_dir=params["documentation"]["path"], - save_dir=params["vector_index"]["path"], - s3_bucket_name=AWS_S3_BUCKET, - chunk_sizes=params["vector_index"]["chunk_sizes"], - chunk_overlap=params["vector_index"]["chunk_overlap"] - ) - if not params["vector_index"]["use_redis"] and not params["vector_index"]["use_s3"]: - index = build_automerging_index( - model, - embed_model, - documentation_dir=params["documentation"]["path"], - save_dir=params["vector_index"]["path"], - chunk_sizes=params["vector_index"]["chunk_sizes"], - chunk_overlap=params["vector_index"]["chunk_overlap"] - ) + index = build_automerging_index_redis( + model, + embed_model, + documentation_dir=DOCUMENTATION_DIR, + chunk_sizes=params["vector_index"]["chunk_sizes"], + chunk_overlap=params["vector_index"]["chunk_overlap"] + ) + diff --git a/apps/chatbot/src/modules/engine.py b/apps/chatbot/src/modules/engine.py index 38c00d7235..de7acabf92 100644 --- a/apps/chatbot/src/modules/engine.py +++ b/apps/chatbot/src/modules/engine.py @@ -49,4 +49,4 @@ def get_automerging_query_engine( streaming=True if ENGINE_USE_STREAMING == "True" else False ) - return automerging_engine + return automerging_engine \ No newline at end of file diff --git a/apps/chatbot/src/modules/evaluation.py b/apps/chatbot/src/modules/evaluation.py index 44ee089d83..9072ebc1d7 100644 --- a/apps/chatbot/src/modules/evaluation.py +++ b/apps/chatbot/src/modules/evaluation.py @@ -224,4 +224,4 @@ def table_results(responses, eval_results): ) with open(os.path.join(results_dir, "avg_scores.json"), "w") as f: json.dump(avg_scores, f, indent=4) - logging.info(f"Results stored in {results_dir}") + logging.info(f"Results stored in {results_dir}") \ No newline at end of file diff --git a/apps/chatbot/src/modules/models.py b/apps/chatbot/src/modules/models.py index 3d89345967..43b7d124a6 100644 --- a/apps/chatbot/src/modules/models.py +++ b/apps/chatbot/src/modules/models.py @@ -22,8 +22,7 @@ assert PROVIDER in ["aws", "google"] -GOOGLE_PARAM_NAME = os.getenv("CHB_GOOGLE_API_KEY") -GOOGLE_API_KEY = get_ssm_parameter(name=GOOGLE_PARAM_NAME) +GOOGLE_API_KEY = get_ssm_parameter(name=os.getenv("CHB_GOOGLE_API_KEY")) AWS_ACCESS_KEY_ID = os.getenv("CHB_AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("CHB_AWS_SECRET_ACCESS_KEY") CHB_AWS_DEFAULT_REGION = os.getenv("CHB_AWS_DEFAULT_REGION") diff --git a/apps/chatbot/src/modules/presidio.py b/apps/chatbot/src/modules/presidio.py new file mode 100644 index 0000000000..7de5560873 --- /dev/null +++ b/apps/chatbot/src/modules/presidio.py @@ -0,0 +1,214 @@ +import logging +from pathlib import Path +from typing import Any, Dict, List, Union +from langdetect import detect_langs + +from presidio_anonymizer.operators import Operator, OperatorType + +from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult +from presidio_analyzer.nlp_engine import NlpEngineProvider +from presidio_anonymizer import AnonymizerEngine +from presidio_anonymizer.entities import OperatorConfig + + +# see supported entities by Presidio with their description at: https://microsoft.github.io/presidio/supported_entities/ +ENTITIES = [ + "CREDIT_CARD", + "CRYPTO", + "DATE_TIME", + "EMAIL_ADDRESS", + "IBAN_CODE", + "IP_ADDRESS", + "NRP", + "LOCATION", + "PERSON", + "PHONE_NUMBER", + "MEDICAL_LICENSE", + "IT_FISCAL_CODE", + "IT_DRIVER_LICENSE", + "IT_VAT_CODE", + "IT_PASSPORT", + "IT_IDENTITY_CARD", + "IT_PHYSICAL_ADDRESS", # this is a custom entity added to the analyzer registry + # "ES_NIF", + # "ES_NIE", + # "US_BANK_NUMBER", + # "US_DRIVER_LICENSE", + # "US_ITIN", + # "US_PASSPORT", + # "US_SSN", + # "UK_NHS" +] + +ALLOW_LIST = [ + "Discovery", "discovery", "pagoPA", "PagoPA", "pagopa" +] + + +class EntityTypeCountAnonymizer(Operator): + """ + Anonymizer which replaces the entity value + with an type counter per entity. + """ + + REPLACING_FORMAT = "<{entity_type}_{index}>" + + def operate(self, text: str, params: Dict[str, Any]) -> str: + """Anonymize the input text.""" + entity_type: str = params["entity_type"] + entity_mapping: Dict[str, Dict] = params["entity_mapping"] + deanonymize_mapping: Dict[str, str] = params["deanonymize_mapping"] + + entity_mapping_for_type = entity_mapping.get(entity_type) + if not entity_mapping_for_type: + entity_mapping_for_type = entity_mapping[entity_type] = {} + + if text in entity_mapping_for_type: + return entity_mapping_for_type[text] + + new_text = self.REPLACING_FORMAT.format( + entity_type=entity_type, index=len(entity_mapping_for_type) + 1 + ) + entity_mapping[entity_type][text] = new_text + deanonymize_mapping[new_text] = text + return new_text + + def validate(self, params: Dict[str, Any]) -> None: + """Validate operator parameters.""" + if "entity_mapping" not in params: + raise ValueError("An input Dict called `entity_mapping` is required.") + if "entity_type" not in params: + raise ValueError("An entity_type param is required.") + if "deanonymize_mapping" not in params: + raise ValueError("A deanonymize_mapping param is required.") + + def operator_name(self) -> str: + return self.__class__.__name__ + + def operator_type(self) -> OperatorType: + return OperatorType.Anonymize + + +class PresidioPII(): + """Uses a presidio to analyse PIIs. + """ + + def __init__( + self, + config: Union[Path, str] | dict, + entity_mapping: Dict[str, Dict] = {}, + mapping: Dict[str, str] = {}, + entities: List[str] | None = None, + analyzer_threshold: float = 0.4 + ): + self.config = config + self.entity_mapping = entity_mapping + self.mapping = mapping + self.entities = entities if entities else ENTITIES + self.analyzer_threshold = analyzer_threshold + + if isinstance(self.config, (Path, str)): + self.provider = NlpEngineProvider(conf_file=self.config) + elif isinstance(self.config, dict): + self.provider = NlpEngineProvider(nlp_configuration=self.config) + else: + raise ValueError("Error! config should be a path or a dictionary.") + nlp_engine = self.provider.create_engine() + self.nlp_engine = nlp_engine + self.analyzer = AnalyzerEngine( + nlp_engine = self.nlp_engine, + supported_languages = ["it", "en"], # "es", "fr", "de" + default_score_threshold = analyzer_threshold + ) + self._add_italian_physical_address_entity() + self.engine = AnonymizerEngine() + self.engine.add_anonymizer(EntityTypeCountAnonymizer) + + + @classmethod + def class_name(cls) -> str: + return "PresidioPII" + + + def detect_language(self, text: str) -> str: + + try: + detected_languages = detect_langs(text) + lang_list = [] + for detected_lang in detected_languages: + if detected_lang.lang in ["it", "en", "es", "fr", "de"]: + lang_list.append(detected_lang.lang) + + if not lang_list: + logging.warning("No detected language.") + lang = "it" + elif "it" in lang_list: + lang = "it" + else: + lang = "en" # lang_list[0].lang + except: + logging.warning("No detected language.") + lang = "it" + + logging.info(f"Set presidio to detect PII in {lang} language.") + return lang + + + def detect_pii(self, text: str) -> List[RecognizerResult]: + + lang = self.detect_language(text) + results = self.analyzer.analyze( + text=text, + language=lang, + entities=self.entities, + allow_list=ALLOW_LIST + ) + + return results + + + def mask_pii(self, text: str, results: List[RecognizerResult] | None = None): + + if results is None: + results = self.detect_pii(text) + + new_text = self.engine.anonymize( + text=text, + analyzer_results=results, + operators={ + "DEFAULT": OperatorConfig( + "EntityTypeCountAnonymizer", + { + "entity_mapping": self.entity_mapping, + "deanonymize_mapping": self.mapping, + }, + ) + }, + ) + + return new_text.text + + + def _add_italian_physical_address_entity(self) -> None: + + italian_address_pattern = Pattern( + name="italian_address_pattern", + regex=r"\b(via|viale|piazza|corso|vicolo)\s+[A-Za-z\s]+\s*\d{1,4}.*\b", + score=0.8 + ) + + address_recognizer = PatternRecognizer( + supported_entity="IT_PHYSICAL_ADDRESS", + patterns=[italian_address_pattern], + supported_language="it" + ) + + self.analyzer.registry.add_recognizer(address_recognizer) + + + def get_entities(self) -> List[str]: + return self.entities + + + def get_supported_entities(self) -> List[str]: + return self.analyzer.get_supported_entities() diff --git a/apps/chatbot/src/modules/utils.py b/apps/chatbot/src/modules/utils.py index 4e511f829d..77db8fee85 100644 --- a/apps/chatbot/src/modules/utils.py +++ b/apps/chatbot/src/modules/utils.py @@ -8,7 +8,7 @@ AWS_ACCESS_KEY_ID = os.getenv("CHB_AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("CHB_AWS_SECRET_ACCESS_KEY") -AWS_DEFAUL_REGION = os.getenv("GOOGLE_AND_REDIS_AWS_DEFAULT_REGION") +AWS_DEFAULT_REGION = os.getenv("CHB_AWS_DEFAULT_REGION") def get_ssm_parameter(name: str, default: str | None = None) -> str | None: @@ -23,7 +23,7 @@ def get_ssm_parameter(name: str, default: str | None = None) -> str | None: "ssm", aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, - region_name=AWS_DEFAUL_REGION + region_name=AWS_DEFAULT_REGION ) logging.debug(f"Getting parameter {name} from SSM") try: diff --git a/apps/chatbot/src/modules/vector_database.py b/apps/chatbot/src/modules/vector_database.py index de868a9c15..97c03e3239 100644 --- a/apps/chatbot/src/modules/vector_database.py +++ b/apps/chatbot/src/modules/vector_database.py @@ -34,6 +34,8 @@ import redis.asyncio as aredis from redisvl.schema import IndexSchema +from src.modules.utils import get_ssm_parameter + from dotenv import load_dotenv load_dotenv() @@ -42,9 +44,6 @@ PROVIDER = os.getenv("CHB_PROVIDER") assert PROVIDER in ["google", "aws"] -AWS_ACCESS_KEY_ID = os.getenv("CHB_AWS_ACCESS_KEY_ID") -AWS_SECRET_ACCESS_KEY = os.getenv("CHB_AWS_SECRET_ACCESS_KEY") -CHB_AWS_DEFAULT_REGION = os.getenv("CHB_AWS_DEFAULT_REGION", os.getenv("AWS_DEFAULT_REGION")) REDIS_URL = os.getenv("CHB_REDIS_URL") WEBSITE_URL = os.getenv("CHB_WEBSITE_URL") REDIS_CLIENT = Redis.from_url(REDIS_URL, socket_timeout=10) @@ -52,6 +51,7 @@ aredis.ConnectionPool.from_url(REDIS_URL) ) REDIS_INDEX_NAME = os.getenv("CHB_REDIS_INDEX_NAME") +INDEX_ID = get_ssm_parameter(os.getenv("CHB_LLAMAINDEX_INDEX_ID")) REDIS_SCHEMA = IndexSchema.from_dict({ "index": {"name": REDIS_INDEX_NAME, "prefix": "index/vector"}, "fields": [ @@ -65,17 +65,16 @@ }} ] }) -REDIS_KVSTORE = RedisKVStore(redis_client=REDIS_CLIENT, async_redis_client=REDIS_ASYNC_CLIENT) -REDIS_DOCSTORE = RedisDocumentStore(redis_kvstore=REDIS_KVSTORE) -REDIS_INDEX_STORE = RedisIndexStore(redis_kvstore=REDIS_KVSTORE) - -if not REDIS_URL: - FS = s3fs.S3FileSystem( - key=AWS_ACCESS_KEY_ID, - secret=AWS_SECRET_ACCESS_KEY, - endpoint_url=f"https://s3.{CHB_AWS_DEFAULT_REGION}.amazonaws.com" if CHB_AWS_DEFAULT_REGION else None - ) - +REDIS_KVSTORE = RedisKVStore( + redis_client=REDIS_CLIENT, + async_redis_client=REDIS_ASYNC_CLIENT +) +REDIS_DOCSTORE = RedisDocumentStore( + redis_kvstore=REDIS_KVSTORE +) +REDIS_INDEX_STORE = RedisIndexStore( + redis_kvstore=REDIS_KVSTORE +) DYNAMIC_HTMLS = [ "app-io/api/app-io-main.html", "case-histories/tari-cagliari.html", @@ -206,12 +205,6 @@ def create_documentation( } )) - # full_text += text - # full_text += "\n\n---------------------------\n\n" - - # with open("full_text.txt", "w") as f: - # f.write(full_text) - logging.info(f"Number of documents with content: {len(documents)}") logging.info(f"Number of empty pages in the documentation: {len(empty_pages)}. These are left out.") with open("empty_htmls.json", "w") as f: @@ -220,110 +213,6 @@ def create_documentation( return documents, hash_table -def build_automerging_index( - llm: BaseLLM, - embed_model: BaseEmbedding, - documentation_dir: str, - save_dir: str, - chunk_sizes: List[int], - chunk_overlap: int - ) -> VectorStoreIndex: - - logging.info("Storing vector index and hash table on AWS bucket S3..") - - Settings.llm = llm - Settings.embed_model = embed_model - Settings.node_parser = HierarchicalNodeParser.from_defaults( - chunk_sizes=chunk_sizes, - chunk_overlap=chunk_overlap - ) - - assert documentation_dir is not None - print(documentation_dir) - documents, hash_table = create_documentation(WEBSITE_URL, documentation_dir) - - logging.info("Creating index...") - nodes = Settings.node_parser.get_nodes_from_documents(documents) - leaf_nodes = get_leaf_nodes(nodes) - - storage_context = StorageContext.from_defaults() - storage_context.docstore.add_documents(nodes) - - automerging_index = VectorStoreIndex( - leaf_nodes, - storage_context=storage_context - ) - logging.info(f"Created index successfully.") - - automerging_index.storage_context.persist( - persist_dir=save_dir - ) - with open("hash_table.json", "w") as f: - json.dump(hash_table, f, indent=4) - - logging.info(f"Saved index successfully to {save_dir}.") - - return automerging_index - - -def build_automerging_index_s3( - llm: BaseLLM, - embed_model: BaseEmbedding, - documentation_dir: str, - save_dir: str, - s3_bucket_name: str | None, - chunk_sizes: List[int], - chunk_overlap: int - ) -> VectorStoreIndex: - - logging.info("Storing vector index and hash table on AWS bucket S3..") - - Settings.llm = llm - Settings.embed_model = embed_model - Settings.node_parser = HierarchicalNodeParser.from_defaults( - chunk_sizes=chunk_sizes, - chunk_overlap=chunk_overlap - ) - - assert documentation_dir is not None - documents, hash_table = create_documentation(WEBSITE_URL, documentation_dir) - - logging.info("Creating index...") - nodes = Settings.node_parser.get_nodes_from_documents(documents) - leaf_nodes = get_leaf_nodes(nodes) - - storage_context = StorageContext.from_defaults() - storage_context.docstore.add_documents(nodes) - - automerging_index = VectorStoreIndex( - leaf_nodes, - storage_context=storage_context - ) - logging.info(f"Created index successfully.") - if s3_bucket_name: - # store hash table - with FS.open(f"{s3_bucket_name}/hash_table.json", "w") as f: - json.dump(hash_table, f, indent=4) - logging.info(f"Uploaded URLs hash table successfully to S3 bucket {s3_bucket_name}/hash_table.json") - - # store vector index - automerging_index.storage_context.persist( - persist_dir=f"{s3_bucket_name}/{save_dir}", - fs = FS - ) - logging.info(f"Uploaded vector index successfully to S3 bucket at {s3_bucket_name}/{save_dir}.") - else: - automerging_index.storage_context.persist( - persist_dir=save_dir - ) - with open("hash_table.json", "w") as f: - json.dump(hash_table, f, indent=4) - - logging.info(f"Saved index successfully to {save_dir}.") - - return automerging_index - - def build_automerging_index_redis( llm: BaseLLM, embed_model: BaseEmbedding, @@ -344,7 +233,7 @@ def build_automerging_index_redis( documents, hash_table = create_documentation(WEBSITE_URL, documentation_dir) for key, value in hash_table.items(): REDIS_KVSTORE.put( - collection="hash_table", + collection=f"hash_table_{INDEX_ID}", key=key, val=value ) @@ -371,6 +260,7 @@ def build_automerging_index_redis( leaf_nodes, storage_context=storage_context ) + automerging_index.set_index_id(INDEX_ID) logging.info("Created vector index successfully and stored on Redis.") return automerging_index @@ -480,6 +370,7 @@ def load_automerging_index_redis( automerging_index = load_index_from_storage( storage_context=storage_context, + index_id=INDEX_ID ) return automerging_index diff --git a/apps/infrastructure/src/modules/chatbot/efs.tf b/apps/infrastructure/src/modules/chatbot/efs.tf index edd169d57b..53e7dd4cda 100644 --- a/apps/infrastructure/src/modules/chatbot/efs.tf +++ b/apps/infrastructure/src/modules/chatbot/efs.tf @@ -4,7 +4,7 @@ resource "aws_efs_file_system" "this" { performance_mode = "generalPurpose" throughput_mode = "bursting" - tags = merge(var.tags, { Name = "${local.prefix}-efs" }) + tags = merge(var.tags, { Name = "${local.prefix}-redis-efs" }) } resource "aws_efs_access_point" "this" { diff --git a/apps/infrastructure/src/modules/chatbot/lambda_chatbot.tf b/apps/infrastructure/src/modules/chatbot/lambda_chatbot.tf index 3338130467..0e3e15e68e 100644 --- a/apps/infrastructure/src/modules/chatbot/lambda_chatbot.tf +++ b/apps/infrastructure/src/modules/chatbot/lambda_chatbot.tf @@ -3,7 +3,7 @@ locals { CHB_AWS_S3_BUCKET = module.s3_bucket_llamaindex.s3_bucket_id CHB_AWS_GUARDRAIL_ID = awscc_bedrock_guardrail.guardrail.guardrail_id CHB_AWS_GUARDRAIL_VERSION = awscc_bedrock_guardrail_version.guardrail.version - CHB_AWS_DEFAULT_REGION = var.aws_chatbot_region + CHB_AWS_BEDROCK_REGION = var.aws_chatbot_region CHB_REDIS_URL = "redis://${module.nlb.dns_name}:${var.ecs_redis.port}" CHB_WEBSITE_URL = "https://${var.dns_domain_name}" CORS_DOMAINS = var.environment == "dev" ? jsonencode(["https://www.${var.dns_domain_name}", "https://${var.dns_domain_name}", "http://localhost:3000"]) : jsonencode(["https://www.${var.dns_domain_name}", "https://${var.dns_domain_name}"]) @@ -15,12 +15,13 @@ locals { # Be extremely careful when changing the provider # both the generation and the embedding models would be changed # embeddings size change would break the application and requires reindexing - CHB_PROVIDER = "google" - CHB_MODEL_ID = "models/gemini-1.5-flash" - CHB_EMBED_MODEL_ID = "models/text-embedding-004" - CHB_REDIS_INDEX_NAME = "gemini-index" - CHB_GOOGLE_API_KEY = module.google_api_key_ssm_parameter.ssm_parameter_name - CHB_QUERY_TABLE_PREFIX = local.prefix + CHB_PROVIDER = "google" + CHB_MODEL_ID = "models/gemini-1.5-flash" + CHB_EMBED_MODEL_ID = "models/text-embedding-004" + CHB_REDIS_INDEX_NAME = "gemini-index" + CHB_GOOGLE_API_KEY = module.google_api_key_ssm_parameter.ssm_parameter_name + CHB_QUERY_TABLE_PREFIX = local.prefix + CHB_LLAMAINDEX_INDEX_ID = module.index_id_ssm_parameter.ssm_parameter_name } } @@ -93,3 +94,13 @@ module "google_api_key_ssm_parameter" { secure_type = true ignore_value_changes = true } + +module "index_id_ssm_parameter" { + source = "git::https://github.com/terraform-aws-modules/terraform-aws-ssm-parameter.git?ref=77d2c139784197febbc8f8e18a33d23eb4736879" # v1.1.0 + + name = "/chatbot/index_id" + value = "49c13f0d-d164-49f1-b5d4-8bdc0632d0dc" + type = "String" + secure_type = true + ignore_value_changes = true +}