diff --git a/poetry.lock b/poetry.lock index 7c75e5c0..ced8523c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -694,13 +694,13 @@ files = [ [[package]] name = "dask" -version = "2024.6.0" +version = "2024.6.2" description = "Parallel PyData with Task Scheduling" optional = false python-versions = ">=3.9" files = [ - {file = "dask-2024.6.0-py3-none-any.whl", hash = "sha256:de0ced6cd46dbc6c01120c8870457af46d667940805a4be063a74dd467466804"}, - {file = "dask-2024.6.0.tar.gz", hash = "sha256:6882ce7e485336d707e540080ed48e01f9c09485d52a2928ea05f9a9e44bb433"}, + {file = "dask-2024.6.2-py3-none-any.whl", hash = "sha256:81b80ee015b2e057b93bb2d1bf13a866136e762e2b24bf54b6b621e8b86b7708"}, + {file = "dask-2024.6.2.tar.gz", hash = "sha256:d429d6b19e85fd1306ac37c188aaf99d03bbe69a6fe59d2b42882b2ac188686f"}, ] [package.dependencies] @@ -721,25 +721,28 @@ array = ["numpy (>=1.21)"] complete = ["dask[array,dataframe,diagnostics,distributed]", "lz4 (>=4.3.2)", "pyarrow (>=7.0)", "pyarrow-hotfix"] dataframe = ["dask-expr (>=1.1,<1.2)", "dask[array]", "pandas (>=1.3)"] diagnostics = ["bokeh (>=2.4.2)", "jinja2 (>=2.10.3)"] -distributed = ["distributed (==2024.6.0)"] +distributed = ["distributed (==2024.6.2)"] test = ["pandas[test]", "pre-commit", "pytest", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist"] [[package]] name = "dask-expr" -version = "1.1.3" +version = "1.1.6" description = "High Level Expressions for Dask" optional = false python-versions = ">=3.9" files = [ - {file = "dask_expr-1.1.3-py3-none-any.whl", hash = "sha256:e6ad2fab9ffe7dbe0fc52451b5a0dc5588f36cd5677168cfb0b73c70f05e465f"}, - {file = "dask_expr-1.1.3.tar.gz", hash = "sha256:ce8e44dfed30b4d9e6a549d0ed8cb5798273645fb9a16733d0687dc84615a94b"}, + {file = "dask_expr-1.1.6-py3-none-any.whl", hash = "sha256:04e31cb941b7cbdce7b1384f2bcf17fd17e828e45e9c74491e28473095ee6891"}, + {file = "dask_expr-1.1.6.tar.gz", hash = "sha256:ace366c6d9c248a7fa7b48f7a34140afae3b6a0ea14ee57743babe5a9d1ef43f"}, ] [package.dependencies] -dask = "2024.6.0" +dask = "2024.6.2" pandas = ">=2" pyarrow = ">=7.0.0" +[package.extras] +analyze = ["crick", "distributed"] + [[package]] name = "defusedxml" version = "0.7.1" @@ -865,18 +868,18 @@ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benc [[package]] name = "filelock" -version = "3.14.0" +version = "3.15.3" description = "A platform independent file lock." optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.14.0-py3-none-any.whl", hash = "sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f"}, - {file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"}, + {file = "filelock-3.15.3-py3-none-any.whl", hash = "sha256:0151273e5b5d6cf753a61ec83b3a9b7d8821c39ae9af9d7ecf2f9e2f17404103"}, + {file = "filelock-3.15.3.tar.gz", hash = "sha256:e1199bf5194a2277273dacd50269f0d87d0682088a3c561c15674ea9005d8635"}, ] [package.extras] docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] typing = ["typing-extensions (>=4.8)"] [[package]] @@ -1132,22 +1135,22 @@ files = [ [[package]] name = "importlib-metadata" -version = "7.1.0" +version = "7.2.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_metadata-7.1.0-py3-none-any.whl", hash = "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570"}, - {file = "importlib_metadata-7.1.0.tar.gz", hash = "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2"}, + {file = "importlib_metadata-7.2.0-py3-none-any.whl", hash = "sha256:04e4aad329b8b948a5711d394fa8759cb80f009225441b4f2a02bd4d8e5f426c"}, + {file = "importlib_metadata-7.2.0.tar.gz", hash = "sha256:3ff4519071ed42740522d494d04819b666541b9752c43012f85afb2cc220fcc6"}, ] [package.dependencies] zipp = ">=0.5" [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] +test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-perf (>=0.9.2)", "pytest-ruff (>=0.2.1)"] [[package]] name = "incremental" @@ -2870,20 +2873,26 @@ full = ["numpy"] [[package]] name = "ray" -version = "2.24.0" +version = "2.30.0" description = "Ray provides a simple, universal API for building distributed applications." optional = false python-versions = ">=3.8" files = [ - {file = "ray-2.24.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:485bfa4a35f2f294e6ea0d065fd78b03eb33e11a6a76a0d3d2ee4a1fc674c6c1"}, - {file = "ray-2.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d0f3ba813fd7c7b8ac7fd89aecc4bab0e093c165fb04fd7252b77f024ca5621"}, - {file = "ray-2.24.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:7b4077b86739629e010bc6d001a89009aa2e1fea94c408c0ee30767fc0c0ea6d"}, - {file = "ray-2.24.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:478aaf3b1f624063c4a0b7bc3caef2822b9e095a84931077228fd54f1e5dba31"}, - {file = "ray-2.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c7952d45a0d369642e588842dc27d473ee77867e4aaf8d3689e12eba52af7752"}, - {file = "ray-2.24.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:7195f4ae30a07f21aa00cf7d945648ed6968b63ca4b02a5bdc7fc30853f49410"}, - {file = "ray-2.24.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:da6a751e0d773847994018199679aedc88a26fc3a9b5e0f1b048f78df7c810e1"}, - {file = "ray-2.24.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2c9bc11dcc6acdc42557cb93e597812007ed04a7bbdc89212553eacc2bb08536"}, - {file = "ray-2.24.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:862295287ab313ece28d4f8ff8fee5bb87f5c1c5900b6c5fc03e4ba9f58662d0"}, + {file = "ray-2.30.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:abc74a591b33e63e4fa3558318e7377eaf97c57a9ecd1b8cc9bbe150f89792e7"}, + {file = "ray-2.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:840645be546a632067bdffdd412d3541dfaf374acbfc1e2e14229818f18e1fe5"}, + {file = "ray-2.30.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:06713b069203fc57aa286ecf6aa354a368994c2b79645161567cbe88721c88cd"}, + {file = "ray-2.30.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:9bc481c1b5d7294d6872fac144e00ea1394b7e130fb41249d1670db494156d81"}, + {file = "ray-2.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:12294b2d72a8ac312ff894dfeb6a08c3b51fbc9b56dbf76059cac69a19637682"}, + {file = "ray-2.30.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:aa182e2ebf0e9c3d9410b57e01a527541ad252e2a1ad5ccdf7bbbc3af3893714"}, + {file = "ray-2.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3e5c40b138eb24290844048d4db3414a22840fd27cc137bf139c9e647866d11e"}, + {file = "ray-2.30.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:5d9f884ae61d3ff14883796e6c9ec6f837fff4c26eb23fbe65d5a93aa92a6c00"}, + {file = "ray-2.30.0-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:e203b57565a008f2ac9f9e1e929216e0d7d776f534144ccc78f66c105234f701"}, + {file = "ray-2.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:57d6931c0d324d3c31ad78f1d13d14a23705c8c617097339b09097f7c0da732e"}, + {file = "ray-2.30.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:8fba074db33474d99daca49188ff3780176d99cea3e5a9a73d3221ec0961b40f"}, + {file = "ray-2.30.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:04bd8db261bf41eaf722652e687e356c13c46fdd274e7586113ab553a7752f6a"}, + {file = "ray-2.30.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c3041cfc00657eb3782772aa7237b636d26c5def751ba8f58f81a7d8668af99e"}, + {file = "ray-2.30.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:2d2865a4432dba707bdfbecc9441dc9f979a479ea1d06b860d73bb52d5e83af4"}, + {file = "ray-2.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:d6fb77b831cc5fc6c7de33fda32a5412165bd011e8ec52b0b3a3453c048f0475"}, ] [package.dependencies] @@ -2900,9 +2909,9 @@ requests = "*" [package.extras] air = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "fastapi", "fsspec", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "memray", "numpy (>=1.20)", "opencensus", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "starlette", "tensorboardX (>=1.9)", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] -all = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "dm-tree", "fastapi", "fsspec", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==0.28.1)", "lz4", "memray", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "ray-cpp (==2.24.0)", "requests", "rich", "scikit-image", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "typer", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] +all = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "dm-tree", "fastapi", "fsspec", "grpcio (!=1.56.0)", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "gymnasium (==0.28.1)", "lz4", "memray", "numpy (>=1.20)", "opencensus", "opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk", "pandas", "pandas (>=1.3)", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pyarrow (>=6.0.1)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "pyyaml", "ray-cpp (==2.30.0)", "requests", "rich", "scikit-image", "scipy", "smart-open", "starlette", "tensorboardX (>=1.9)", "typer", "uvicorn[standard]", "virtualenv (>=20.0.24,!=20.21.1)", "watchfiles"] client = ["grpcio (!=1.56.0)"] -cpp = ["ray-cpp (==2.24.0)"] +cpp = ["ray-cpp (==2.30.0)"] data = ["fsspec", "numpy (>=1.20)", "pandas (>=1.3)", "pyarrow (>=6.0.1)"] default = ["aiohttp (>=3.7)", "aiohttp-cors", "colorful", "grpcio (>=1.32.0)", "grpcio (>=1.42.0)", "memray", "opencensus", "prometheus-client (>=0.7.1)", "py-spy (>=0.2.0)", "pydantic (<2.0.dev0 || >=2.5.dev0,<3)", "requests", "smart-open", "virtualenv (>=20.0.24,!=20.21.1)"] observability = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] @@ -3882,4 +3891,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11.0,<3.13.0" -content-hash = "f09e47495fc42990c384332d6d768edad2f54fcbfcbeab01ab6aa15dfaf9b5db" +content-hash = "729b44c4b57bf7b10e7b718ee737f7c9bc4fb75221c3480b98fee8dc113953ab" diff --git a/pyproject.toml b/pyproject.toml index bd59b434..78cbe546 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,6 @@ tenacity = "^8.2.3" sentry-sdk = "^1.44.1" coverage = "^7.4.2" modin = {extras = ["ray"], version = "^0.30.1"} -filelock = "<=3.14" [build-system] requires = ["poetry-core>=1.1"] build-backend = "poetry.core.masonry.api" diff --git a/quotaclimat/data_processing/mediatree/api_import.py b/quotaclimat/data_processing/mediatree/api_import.py index 5dbd2e32..3d591b67 100644 --- a/quotaclimat/data_processing/mediatree/api_import.py +++ b/quotaclimat/data_processing/mediatree/api_import.py @@ -54,9 +54,9 @@ async def update_pg_data(exit_event): number_of_batch = int(os.environ.get("NUMBER_OF_BATCH", 6)) program_only = os.environ.get("UPDATE_PROGRAM_ONLY", "false") == "true" if(program_only): - logging.warning("Update : Program only mode activated") - - #TODO get program here + logging.warning("Update : Program only mode activated - UPDATE_PROGRAM_ONLY") + else: + logging.warning("Update : programs will not be updated for performance issue - use UPDATE_PROGRAM_ONLY to true for this") logging.warning(f"Updating already saved data from Postgresql from offset {start_offset} - env variable START_OFFSET until {start_offset + number_of_batch * batch_size}") try: diff --git a/quotaclimat/data_processing/mediatree/update_pg_keywords.py b/quotaclimat/data_processing/mediatree/update_pg_keywords.py index 15d9e962..f0019f9c 100644 --- a/quotaclimat/data_processing/mediatree/update_pg_keywords.py +++ b/quotaclimat/data_processing/mediatree/update_pg_keywords.py @@ -24,11 +24,10 @@ def update_keywords(session: Session, batch_size: int = 50000, start_offset : in current_batch_saved_keywords = get_keywords_columns(session, i, batch_size) logging.info(f"Updating {len(current_batch_saved_keywords)} elements from {i} offsets - batch size {batch_size} - until offset {until_offset}") for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords, start, srt, theme, channel_name, channel_title in current_batch_saved_keywords: - program_name, program_name_type = get_a_program_with_start_timestamp(df_programs, pd.Timestamp(start).tz_convert('Europe/Paris'), channel_name) if channel_title is None: - logging.debug("channel_title none, set it using channel_name") - channel_title = get_channel_title_for_name(channel_name) - + logging.debug("channel_title none, set it using channel_name") + channel_title = get_channel_title_for_name(channel_name) + if(not program_only): try: matching_themes, \ @@ -80,14 +79,13 @@ def update_keywords(session: Session, batch_size: int = 50000, start_offset : in ,number_of_biodiversite_causes_directes ,number_of_biodiversite_consequences ,number_of_biodiversite_solutions_directes - ,channel_program=program_name - ,channel_program_type=program_name_type ,channel_title=channel_title ,number_of_keywords_20=new_number_of_keywords_20 ,number_of_keywords_30=new_number_of_keywords_30 ,number_of_keywords_40=new_number_of_keywords_40 ) else: + program_name, program_name_type = get_a_program_with_start_timestamp(df_programs, pd.Timestamp(start).tz_convert('Europe/Paris'), channel_name) update_keyword_row_program(session ,keyword_id ,channel_program=program_name @@ -138,8 +136,6 @@ def update_keyword_row(session: Session, number_of_biodiversite_causes_directes: int, number_of_biodiversite_consequences: int, number_of_biodiversite_solutions_directes: int, - channel_program: str, - channel_program_type: str, channel_title: str ,number_of_keywords_20: int ,number_of_keywords_30: int @@ -162,8 +158,6 @@ def update_keyword_row(session: Session, Keywords.number_of_biodiversite_causes_directes:number_of_biodiversite_causes_directes , Keywords.number_of_biodiversite_consequences:number_of_biodiversite_consequences , Keywords.number_of_biodiversite_solutions_directes:number_of_biodiversite_solutions_directes, - Keywords.channel_program: channel_program, - Keywords.channel_program_type: channel_program_type, Keywords.channel_title: channel_title ,Keywords.number_of_keywords_20: number_of_keywords_20 ,Keywords.number_of_keywords_30: number_of_keywords_30 @@ -174,7 +168,6 @@ def update_keyword_row(session: Session, else: logging.warning(f"Matching themes is empty - deleting row {keyword_id}") session.query(Keywords).filter(Keywords.id == keyword_id).delete() - session.commit() def update_keyword_row_program(session: Session, keyword_id: int, diff --git a/test/sitemap/test_update_pg_keywords.py b/test/sitemap/test_update_pg_keywords.py index a9e6f521..23fa9485 100644 --- a/test/sitemap/test_update_pg_keywords.py +++ b/test/sitemap/test_update_pg_keywords.py @@ -67,13 +67,12 @@ def test_delete_keywords(): ,0 ,0 ,0 - ,"télématin" - ,"Information - Magazine" ,"M6" ,0 ,0 ,0 ) + session.commit() assert get_keyword(primary_key) == None def test_first_update_keywords(): @@ -253,9 +252,9 @@ def test_first_update_keywords(): assert number_of_biodiversite_consequences == 0 assert number_of_biodiversite_solutions_directes == 0 - # program - assert result_after_update.channel_program == "1245 le mag" - assert result_after_update.channel_program_type == "Information - Magazine" + # program - only when UPDATE_PROGRAM_ONLY for speed issues + # assert result_after_update.channel_program == "1245 le mag" + # assert result_after_update.channel_program_type == "Information - Magazine" #channel_title assert result_after_update.channel_title == "M6"