From c081dc6656b607bfa72ddd86d556ce74e66b6502 Mon Sep 17 00:00:00 2001 From: Paul Leclercq Date: Fri, 5 Apr 2024 14:25:18 +0200 Subject: [PATCH] Feat/count high risk false positive next to positive (#157) * wip * wip * wip test * test: high false positive risk transform * docker compose * chores: update poetry lock * chores: bump some libs * feat: save channel_metadata and update them when needed * doc: channel_metadata --- .github/workflows/deploy-main.yml | 2 +- .github/workflows/test.yml | 2 +- Dockerfile | 2 +- Dockerfile_api_import | 2 +- Dockerfile_ingest | 2 +- Dockerfile_streamlit | 2 +- README.md | 6 +- docker-compose.yml | 2 +- poetry.lock | 169 +++---- postgres/channel_metadata.json | 149 ++++++ postgres/schemas/models.py | 50 +- pyproject.toml | 10 +- .../mediatree/detect_keywords.py | 144 ++++-- .../mediatree/keyword/keyword.py | 13 +- quotaclimat/utils/sentry.py | 2 +- test/sitemap/test_detect_keywords.py | 439 ++++++++++++------ test/sitemap/test_main_import_api.py | 19 +- test/sitemap/test_update_pg_keywords.py | 30 +- 18 files changed, 731 insertions(+), 314 deletions(-) create mode 100755 postgres/channel_metadata.json diff --git a/.github/workflows/deploy-main.yml b/.github/workflows/deploy-main.yml index f1d04769..9088f5ca 100644 --- a/.github/workflows/deploy-main.yml +++ b/.github/workflows/deploy-main.yml @@ -12,7 +12,7 @@ on: env: PYTHON_VERSION: '3.11' - POETRY_VERSION: '1.8.1' + POETRY_VERSION: '1.8.2' jobs: build: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index daab9489..02978d9e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ on: env: PYTHON_VERSION: '3.11' - POETRY_VERSION: '1.8.1' + POETRY_VERSION: '1.8.2' jobs: # Label of the runner job diff --git a/Dockerfile b/Dockerfile index a45ee228..b3120025 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ WORKDIR /app COPY pyproject.toml poetry.lock ./ -RUN pip install poetry==1.8.1 +RUN pip install poetry==1.8.2 RUN poetry install diff --git a/Dockerfile_api_import b/Dockerfile_api_import index 04dde54b..74aac08a 100644 --- a/Dockerfile_api_import +++ b/Dockerfile_api_import @@ -12,7 +12,7 @@ WORKDIR /app COPY pyproject.toml poetry.lock ./ -RUN pip install poetry==1.8.1 +RUN pip install poetry==1.8.2 RUN poetry install diff --git a/Dockerfile_ingest b/Dockerfile_ingest index 4b707d4c..0be296b7 100644 --- a/Dockerfile_ingest +++ b/Dockerfile_ingest @@ -12,7 +12,7 @@ WORKDIR /app COPY pyproject.toml poetry.lock ./ -RUN pip install poetry==1.8.1 +RUN pip install poetry==1.8.2 RUN poetry install diff --git a/Dockerfile_streamlit b/Dockerfile_streamlit index ceae2c03..029fe7e1 100644 --- a/Dockerfile_streamlit +++ b/Dockerfile_streamlit @@ -12,7 +12,7 @@ WORKDIR /app COPY pyproject.toml poetry.lock ./ -RUN pip install poetry==1.8.1 +RUN pip install poetry==1.8.2 RUN poetry install diff --git a/README.md b/README.md index 6cde9e3f..1a470654 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,7 @@ With Sentry, with env variable `SENTRY_DSN`. Learn more here : https://docs.sentry.io/platforms/python/configuration/options/ +## Batch import ### Batch import based on time Use env variable `START_DATE` like in docker compose (epoch second format : 1705409797). 
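For reference, a minimal sketch (not part of the patch, assuming UTC is the intended timezone) of how to produce a `START_DATE` value in the epoch-second format mentioned above:

```python
# Hypothetical helper: compute an epoch-second START_DATE value for docker-compose.
from datetime import datetime, timezone

start_date = datetime(2024, 1, 16, 12, 56, 37, tzinfo=timezone.utc)
print(int(start_date.timestamp()))  # 1705409797, the example value above
```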
@@ -306,13 +307,16 @@ After having updated `UPDATE` env variable to true inside docker-compose.yml and
 update_pg_keywords.py:20 | Difference old 1000 - new_number_of_keywords 0
 ```
-#### Batch update from an offset
+### Batch update from an offset
 With over 1 million rows, we can update from an offset to fix a custom logic by using `START_OFFSET` to batch update PG from a given offset.
 ~55 minutes to update 50K rows on a mVCPU 2240 - 4Gb RAM on Scaleway.
 Example inside the docker-compose.yml mediatree service -> START_OFFSET: 100
+### Channel metadata
+To keep the channel perimeter (weekday, hours) up to date, we save the current version inside `postgres/channel_metadata.json`. If we modify this file, the next deploy will update every row of the PostgreSQL table `channel_metadata`.
+
 ### Fix linting
 Before committing, make sure that the lines of code you wrote conform to the PEP8 standard by running:
 ```bash
diff --git a/docker-compose.yml b/docker-compose.yml
index 46dbff68..0a0580aa 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,7 +6,7 @@ services:
       context: ./
       dockerfile: Dockerfile
     entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
-    # entrypoint: ["sleep", "12000"] # use to debug the container if needed
+    # entrypoint: ["sleep", "12000"] # use to debug the container if needed
     environment:
       ENV: docker
       # CHANNEL: "fr3-idf"
diff --git a/poetry.lock b/poetry.lock
index 408bc8c0..8d00a392 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.

 [[package]]
 name = "advertools"
@@ -1168,20 +1168,21 @@ files = [

 [[package]]
 name = "dask"
-version = "2024.2.0"
+version = "2024.4.0"
 description = "Parallel PyData with Task Scheduling"
 optional = false
 python-versions = ">=3.9"
 files = [
-    {file = "dask-2024.2.0-py3-none-any.whl", hash = "sha256:439efe5479a102d4d2712d69a52458e6c1e78b96c7020976399ce249097caf48"},
-    {file = "dask-2024.2.0.tar.gz", hash = "sha256:7ab6e8a2c1e256a4c930f2996c652f057239dee49e1c5c4742f351efe6deecd1"},
+    {file = "dask-2024.4.0-py3-none-any.whl", hash = "sha256:f8332781ffde3d3e49df31fe4066e1eab571a87b94a11661a8ecf06e2892ee6d"},
+    {file = "dask-2024.4.0.tar.gz", hash = "sha256:d5be22660b332865e7e868df2f1322a75f6cacaf8dd9ec08057e6fa8a96a19ac"},
 ]

 [package.dependencies]
 click = ">=8.1"
 cloudpickle = ">=1.5.0"
+dask-expr = {version = ">=1.0,<1.1", optional = true, markers = "extra == \"dataframe\""}
 fsspec = ">=2021.09.0"
-importlib-metadata = ">=4.13.0"
+importlib-metadata = {version = ">=4.13.0", markers = "python_version < \"3.12\""}
 numpy = {version = ">=1.21", optional = true, markers = "extra == \"array\""}
 packaging = ">=20.0"
 pandas = {version = ">=1.3", optional = true, markers = "extra == \"dataframe\""}
@@ -1192,26 +1193,26 @@ toolz = ">=0.10.0"

 [package.extras]
 array = ["numpy (>=1.21)"]
 complete = ["dask[array,dataframe,diagnostics,distributed]", "lz4 (>=4.3.2)", "pyarrow (>=7.0)", "pyarrow-hotfix"]
-dataframe = ["dask[array]", "pandas (>=1.3)"]
+dataframe = ["dask-expr (>=1.0,<1.1)", "dask[array]", "pandas (>=1.3)"]
 diagnostics = ["bokeh (>=2.4.2)", "jinja2 (>=2.10.3)"]
-distributed = ["distributed (==2024.2.0)"]
+distributed = ["distributed (==2024.4.0)"]
 test = ["pandas[test]", "pre-commit", "pytest", "pytest-cov", "pytest-rerunfailures", "pytest-timeout",
"pytest-xdist"] [[package]] name = "dask-expr" -version = "0.4.2" +version = "1.0.9" description = "High Level Expressions for Dask" optional = false python-versions = ">=3.9" files = [ - {file = "dask-expr-0.4.2.tar.gz", hash = "sha256:af657c45c4e3b15bd66dac099b9347d2984d2850d2c6c5b8fb961a52b4c35d4b"}, - {file = "dask_expr-0.4.2-py3-none-any.whl", hash = "sha256:f1cfcf8255ecd491a619ee9c35eaaee5c1a812834d11ef1443ebf1f2669003e6"}, + {file = "dask-expr-1.0.9.tar.gz", hash = "sha256:da9f0b25d67e1d7e41958181dd20f418830c47123c52916acb42885b56760ecc"}, + {file = "dask_expr-1.0.9-py3-none-any.whl", hash = "sha256:74ebe50577310a10aeec5ae7e8b1ca03e5d7875211c3b2d8915eac01590e185d"}, ] [package.dependencies] -dask = "2024.2.0" +dask = "2024.4.0" pandas = ">=2" -pyarrow = "*" +pyarrow = ">=7.0.0" [[package]] name = "debugpy" @@ -1279,19 +1280,19 @@ files = [ [[package]] name = "distributed" -version = "2024.2.0" +version = "2024.4.0" description = "Distributed scheduler for Dask" optional = false python-versions = ">=3.9" files = [ - {file = "distributed-2024.2.0-py3-none-any.whl", hash = "sha256:9545a176a7684b155cdfc56c1bf9b1b588e08e107f9f937166d4912b1ee809f7"}, - {file = "distributed-2024.2.0.tar.gz", hash = "sha256:884df87d784ace510173da1f12ffc3ed3b9858288cf225327e27a01e047d098a"}, + {file = "distributed-2024.4.0-py3-none-any.whl", hash = "sha256:fea01a71877b1013fe36b6e5ed759d360ef7fdcd668c5d5ad11adcca2d386d7a"}, + {file = "distributed-2024.4.0.tar.gz", hash = "sha256:bf53bb2aac89f4525f0d372fa755262fe710c5a069b0bf0ef7b81fc3b6c7841f"}, ] [package.dependencies] click = ">=8.0" cloudpickle = ">=1.5.0" -dask = "2024.2.0" +dask = "2024.4.0" jinja2 = ">=2.10.3" locket = ">=1.0.0" msgpack = ">=1.0.0" @@ -3200,7 +3201,7 @@ files = [ {file = "msgpack-1.0.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fbb160554e319f7b22ecf530a80a3ff496d38e8e07ae763b9e82fadfe96f273"}, {file = "msgpack-1.0.8-cp39-cp39-win32.whl", hash = "sha256:f9af38a89b6a5c04b7d18c492c8ccf2aee7048aff1ce8437c4683bb5a1df893d"}, {file = "msgpack-1.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:ed59dd52075f8fc91da6053b12e8c89e37aa043f8986efd89e61fae69dc1b011"}, - {file = "msgpack-1.0.8.tar.gz", hash = "sha256:95c02b0e27e706e48d0e5426d1710ca78e0f0628d6e89d5b5a5b91a5f12274f3"}, + {file = "msgpack-1.0.8-py3-none-any.whl", hash = "sha256:24f727df1e20b9876fa6e95f840a2a2651e34c0ad147676356f4bf5fbb0206ca"}, ] [[package]] @@ -4664,23 +4665,23 @@ files = [ [[package]] name = "pytest" -version = "7.4.4" +version = "8.1.1" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, - {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, + {file = "pytest-8.1.1-py3-none-any.whl", hash = "sha256:2a8386cfc11fa9d2c50ee7b2a57e7d898ef90470a7a34c4b949ff59662bb78b7"}, + {file = "pytest-8.1.1.tar.gz", hash = "sha256:ac978141a75948948817d360297b7aae0fcb9d6ff6bc9ec6d514b85d5a65c044"}, ] [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} iniconfig = "*" packaging = "*" -pluggy = ">=0.12,<2.0" +pluggy = ">=1.4,<2.0" [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments 
(>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-asyncio" @@ -4702,13 +4703,13 @@ testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] [[package]] name = "pytest-cov" -version = "3.0.0" +version = "5.0.0" description = "Pytest plugin for measuring coverage." optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "pytest-cov-3.0.0.tar.gz", hash = "sha256:e7f0f5b1617d2210a2cabc266dfe2f4c75a8d32fb89eafb7ad9d06f6d076d470"}, - {file = "pytest_cov-3.0.0-py3-none-any.whl", hash = "sha256:578d5d15ac4a25e5f961c938b85a05b09fdaae9deef3bb6de9a6e766622ca7a6"}, + {file = "pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857"}, + {file = "pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652"}, ] [package.dependencies] @@ -4716,7 +4717,7 @@ coverage = {version = ">=5.2.1", extras = ["toml"]} pytest = ">=4.6" [package.extras] -testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] [[package]] name = "python-dateutil" @@ -5707,13 +5708,13 @@ win32 = ["pywin32"] [[package]] name = "sentry-sdk" -version = "1.40.6" +version = "1.44.1" description = "Python client for Sentry (https://sentry.io)" optional = false python-versions = "*" files = [ - {file = "sentry-sdk-1.40.6.tar.gz", hash = "sha256:f143f3fb4bb57c90abef6e2ad06b5f6f02b2ca13e4060ec5c0549c7a9ccce3fa"}, - {file = "sentry_sdk-1.40.6-py2.py3-none-any.whl", hash = "sha256:becda09660df63e55f307570e9817c664392655a7328bbc414b507e9cb874c67"}, + {file = "sentry-sdk-1.44.1.tar.gz", hash = "sha256:24e6a53eeabffd2f95d952aa35ca52f0f4201d17f820ac9d3ff7244c665aaf68"}, + {file = "sentry_sdk-1.44.1-py2.py3-none-any.whl", hash = "sha256:5f75eb91d8ab6037c754a87b8501cc581b2827e923682f593bed3539ce5b3999"}, ] [package.dependencies] @@ -5727,6 +5728,7 @@ asyncpg = ["asyncpg (>=0.23)"] beam = ["apache-beam (>=2.12)"] bottle = ["bottle (>=0.12.13)"] celery = ["celery (>=3)"] +celery-redbeat = ["celery-redbeat (>=2)"] chalice = ["chalice (>=1.16.0)"] clickhouse-driver = ["clickhouse-driver (>=0.2.0)"] django = ["django (>=1.8)"] @@ -5737,9 +5739,10 @@ grpcio = ["grpcio (>=1.21.1)"] httpx = ["httpx (>=0.16.0)"] huey = ["huey (>=2)"] loguru = ["loguru (>=0.5)"] +openai = ["openai (>=1.0.0)", "tiktoken (>=0.3.0)"] opentelemetry = ["opentelemetry-distro (>=0.35b0)"] opentelemetry-experimental = ["opentelemetry-distro (>=0.40b0,<1.0)", "opentelemetry-instrumentation-aiohttp-client (>=0.40b0,<1.0)", "opentelemetry-instrumentation-django (>=0.40b0,<1.0)", "opentelemetry-instrumentation-fastapi (>=0.40b0,<1.0)", "opentelemetry-instrumentation-flask (>=0.40b0,<1.0)", "opentelemetry-instrumentation-requests (>=0.40b0,<1.0)", "opentelemetry-instrumentation-sqlite3 (>=0.40b0,<1.0)", "opentelemetry-instrumentation-urllib (>=0.40b0,<1.0)"] -pure-eval = ["asttokens", "executing", "pure_eval"] +pure-eval = ["asttokens", "executing", "pure-eval"] pymongo = ["pymongo (>=3.1)"] pyspark = ["pyspark (>=2.4.4)"] quart = ["blinker (>=1.1)", "quart (>=0.16.1)"] @@ -5989,60 +5992,60 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.28" +version = "2.0.29" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0b148ab0438f72ad21cb004ce3bdaafd28465c4276af66df3b9ecd2037bf252"}, - 
{file = "SQLAlchemy-2.0.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bbda76961eb8f27e6ad3c84d1dc56d5bc61ba8f02bd20fcf3450bd421c2fcc9c"}, - {file = "SQLAlchemy-2.0.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feea693c452d85ea0015ebe3bb9cd15b6f49acc1a31c28b3c50f4db0f8fb1e71"}, - {file = "SQLAlchemy-2.0.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5da98815f82dce0cb31fd1e873a0cb30934971d15b74e0d78cf21f9e1b05953f"}, - {file = "SQLAlchemy-2.0.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a5adf383c73f2d49ad15ff363a8748319ff84c371eed59ffd0127355d6ea1da"}, - {file = "SQLAlchemy-2.0.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56856b871146bfead25fbcaed098269d90b744eea5cb32a952df00d542cdd368"}, - {file = "SQLAlchemy-2.0.28-cp310-cp310-win32.whl", hash = "sha256:943aa74a11f5806ab68278284a4ddd282d3fb348a0e96db9b42cb81bf731acdc"}, - {file = "SQLAlchemy-2.0.28-cp310-cp310-win_amd64.whl", hash = "sha256:c6c4da4843e0dabde41b8f2e8147438330924114f541949e6318358a56d1875a"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46a3d4e7a472bfff2d28db838669fc437964e8af8df8ee1e4548e92710929adc"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0d3dd67b5d69794cfe82862c002512683b3db038b99002171f624712fa71aeaa"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61e2e41656a673b777e2f0cbbe545323dbe0d32312f590b1bc09da1de6c2a02"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0315d9125a38026227f559488fe7f7cee1bd2fbc19f9fd637739dc50bb6380b2"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:af8ce2d31679006e7b747d30a89cd3ac1ec304c3d4c20973f0f4ad58e2d1c4c9"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:81ba314a08c7ab701e621b7ad079c0c933c58cdef88593c59b90b996e8b58fa5"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-win32.whl", hash = "sha256:1ee8bd6d68578e517943f5ebff3afbd93fc65f7ef8f23becab9fa8fb315afb1d"}, - {file = "SQLAlchemy-2.0.28-cp311-cp311-win_amd64.whl", hash = "sha256:ad7acbe95bac70e4e687a4dc9ae3f7a2f467aa6597049eeb6d4a662ecd990bb6"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d3499008ddec83127ab286c6f6ec82a34f39c9817f020f75eca96155f9765097"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b66fcd38659cab5d29e8de5409cdf91e9986817703e1078b2fdaad731ea66f5"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bea30da1e76cb1acc5b72e204a920a3a7678d9d52f688f087dc08e54e2754c67"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:124202b4e0edea7f08a4db8c81cc7859012f90a0d14ba2bf07c099aff6e96462"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e23b88c69497a6322b5796c0781400692eca1ae5532821b39ce81a48c395aae9"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b6303bfd78fb3221847723104d152e5972c22367ff66edf09120fcde5ddc2e2"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-win32.whl", hash = "sha256:a921002be69ac3ab2cf0c3017c4e6a3377f800f1fca7f254c13b5f1a2f10022c"}, - {file = "SQLAlchemy-2.0.28-cp312-cp312-win_amd64.whl", hash = "sha256:b4a2cf92995635b64876dc141af0ef089c6eea7e05898d8d8865e71a326c0385"}, - {file = 
"SQLAlchemy-2.0.28-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e91b5e341f8c7f1e5020db8e5602f3ed045a29f8e27f7f565e0bdee3338f2c7"}, - {file = "SQLAlchemy-2.0.28-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c7b78dfc7278329f27be02c44abc0d69fe235495bb8e16ec7ef1b1a17952db"}, - {file = "SQLAlchemy-2.0.28-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3eba73ef2c30695cb7eabcdb33bb3d0b878595737479e152468f3ba97a9c22a4"}, - {file = "SQLAlchemy-2.0.28-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5df5d1dafb8eee89384fb7a1f79128118bc0ba50ce0db27a40750f6f91aa99d5"}, - {file = "SQLAlchemy-2.0.28-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2858bbab1681ee5406650202950dc8f00e83b06a198741b7c656e63818633526"}, - {file = "SQLAlchemy-2.0.28-cp37-cp37m-win32.whl", hash = "sha256:9461802f2e965de5cff80c5a13bc945abea7edaa1d29360b485c3d2b56cdb075"}, - {file = "SQLAlchemy-2.0.28-cp37-cp37m-win_amd64.whl", hash = "sha256:a6bec1c010a6d65b3ed88c863d56b9ea5eeefdf62b5e39cafd08c65f5ce5198b"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:843a882cadebecc655a68bd9a5b8aa39b3c52f4a9a5572a3036fb1bb2ccdc197"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dbb990612c36163c6072723523d2be7c3eb1517bbdd63fe50449f56afafd1133"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7e4baf9161d076b9a7e432fce06217b9bd90cfb8f1d543d6e8c4595627edb9"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0a5354cb4de9b64bccb6ea33162cb83e03dbefa0d892db88a672f5aad638a75"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:fffcc8edc508801ed2e6a4e7b0d150a62196fd28b4e16ab9f65192e8186102b6"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aca7b6d99a4541b2ebab4494f6c8c2f947e0df4ac859ced575238e1d6ca5716b"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-win32.whl", hash = "sha256:8c7f10720fc34d14abad5b647bc8202202f4948498927d9f1b4df0fb1cf391b7"}, - {file = "SQLAlchemy-2.0.28-cp38-cp38-win_amd64.whl", hash = "sha256:243feb6882b06a2af68ecf4bec8813d99452a1b62ba2be917ce6283852cf701b"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fc4974d3684f28b61b9a90fcb4c41fb340fd4b6a50c04365704a4da5a9603b05"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:87724e7ed2a936fdda2c05dbd99d395c91ea3c96f029a033a4a20e008dd876bf"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68722e6a550f5de2e3cfe9da6afb9a7dd15ef7032afa5651b0f0c6b3adb8815d"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:328529f7c7f90adcd65aed06a161851f83f475c2f664a898af574893f55d9e53"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:df40c16a7e8be7413b885c9bf900d402918cc848be08a59b022478804ea076b8"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:426f2fa71331a64f5132369ede5171c52fd1df1bd9727ce621f38b5b24f48750"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-win32.whl", hash = "sha256:33157920b233bc542ce497a81a2e1452e685a11834c5763933b440fedd1d8e2d"}, - {file = "SQLAlchemy-2.0.28-cp39-cp39-win_amd64.whl", hash = "sha256:2f60843068e432311c886c5f03c4664acaef507cf716f6c60d5fde7265be9d7b"}, - {file = "SQLAlchemy-2.0.28-py3-none-any.whl", hash = 
"sha256:78bb7e8da0183a8301352d569900d9d3594c48ac21dc1c2ec6b3121ed8b6c986"}, - {file = "SQLAlchemy-2.0.28.tar.gz", hash = "sha256:dd53b6c4e6d960600fd6532b79ee28e2da489322fcf6648738134587faf767b6"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4c142852ae192e9fe5aad5c350ea6befe9db14370b34047e1f0f7cf99e63c63b"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:99a1e69d4e26f71e750e9ad6fdc8614fbddb67cfe2173a3628a2566034e223c7"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ef3fbccb4058355053c51b82fd3501a6e13dd808c8d8cd2561e610c5456013c"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d6753305936eddc8ed190e006b7bb33a8f50b9854823485eed3a886857ab8d1"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0f3ca96af060a5250a8ad5a63699180bc780c2edf8abf96c58af175921df847a"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c4520047006b1d3f0d89e0532978c0688219857eb2fee7c48052560ae76aca1e"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-win32.whl", hash = "sha256:b2a0e3cf0caac2085ff172c3faacd1e00c376e6884b5bc4dd5b6b84623e29e4f"}, + {file = "SQLAlchemy-2.0.29-cp310-cp310-win_amd64.whl", hash = "sha256:01d10638a37460616708062a40c7b55f73e4d35eaa146781c683e0fa7f6c43fb"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:308ef9cb41d099099fffc9d35781638986870b29f744382904bf9c7dadd08513"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:296195df68326a48385e7a96e877bc19aa210e485fa381c5246bc0234c36c78e"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a13b917b4ffe5a0a31b83d051d60477819ddf18276852ea68037a144a506efb9"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f6d971255d9ddbd3189e2e79d743ff4845c07f0633adfd1de3f63d930dbe673"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:61405ea2d563407d316c63a7b5271ae5d274a2a9fbcd01b0aa5503635699fa1e"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:de7202ffe4d4a8c1e3cde1c03e01c1a3772c92858837e8f3879b497158e4cb44"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-win32.whl", hash = "sha256:b5d7ed79df55a731749ce65ec20d666d82b185fa4898430b17cb90c892741520"}, + {file = "SQLAlchemy-2.0.29-cp311-cp311-win_amd64.whl", hash = "sha256:205f5a2b39d7c380cbc3b5dcc8f2762fb5bcb716838e2d26ccbc54330775b003"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d96710d834a6fb31e21381c6d7b76ec729bd08c75a25a5184b1089141356171f"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:52de4736404e53c5c6a91ef2698c01e52333988ebdc218f14c833237a0804f1b"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c7b02525ede2a164c5fa5014915ba3591730f2cc831f5be9ff3b7fd3e30958e"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dfefdb3e54cd15f5d56fd5ae32f1da2d95d78319c1f6dfb9bcd0eb15d603d5d"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a88913000da9205b13f6f195f0813b6ffd8a0c0c2bd58d499e00a30eb508870c"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:fecd5089c4be1bcc37c35e9aa678938d2888845a134dd016de457b942cf5a758"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-win32.whl", hash = "sha256:8197d6f7a3d2b468861ebb4c9f998b9df9e358d6e1cf9c2a01061cb9b6cf4e41"}, + {file = "SQLAlchemy-2.0.29-cp312-cp312-win_amd64.whl", hash = "sha256:9b19836ccca0d321e237560e475fd99c3d8655d03da80c845c4da20dda31b6e1"}, + {file = "SQLAlchemy-2.0.29-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:87a1d53a5382cdbbf4b7619f107cc862c1b0a4feb29000922db72e5a66a5ffc0"}, + {file = "SQLAlchemy-2.0.29-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a0732dffe32333211801b28339d2a0babc1971bc90a983e3035e7b0d6f06b93"}, + {file = "SQLAlchemy-2.0.29-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90453597a753322d6aa770c5935887ab1fc49cc4c4fdd436901308383d698b4b"}, + {file = "SQLAlchemy-2.0.29-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ea311d4ee9a8fa67f139c088ae9f905fcf0277d6cd75c310a21a88bf85e130f5"}, + {file = "SQLAlchemy-2.0.29-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5f20cb0a63a3e0ec4e169aa8890e32b949c8145983afa13a708bc4b0a1f30e03"}, + {file = "SQLAlchemy-2.0.29-cp37-cp37m-win32.whl", hash = "sha256:e5bbe55e8552019c6463709b39634a5fc55e080d0827e2a3a11e18eb73f5cdbd"}, + {file = "SQLAlchemy-2.0.29-cp37-cp37m-win_amd64.whl", hash = "sha256:c2f9c762a2735600654c654bf48dad388b888f8ce387b095806480e6e4ff6907"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7e614d7a25a43a9f54fcce4675c12761b248547f3d41b195e8010ca7297c369c"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:471fcb39c6adf37f820350c28aac4a7df9d3940c6548b624a642852e727ea586"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:988569c8732f54ad3234cf9c561364221a9e943b78dc7a4aaf35ccc2265f1930"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dddaae9b81c88083e6437de95c41e86823d150f4ee94bf24e158a4526cbead01"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:334184d1ab8f4c87f9652b048af3f7abea1c809dfe526fb0435348a6fef3d380"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:38b624e5cf02a69b113c8047cf7f66b5dfe4a2ca07ff8b8716da4f1b3ae81567"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-win32.whl", hash = "sha256:bab41acf151cd68bc2b466deae5deeb9e8ae9c50ad113444151ad965d5bf685b"}, + {file = "SQLAlchemy-2.0.29-cp38-cp38-win_amd64.whl", hash = "sha256:52c8011088305476691b8750c60e03b87910a123cfd9ad48576d6414b6ec2a1d"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3071ad498896907a5ef756206b9dc750f8e57352113c19272bdfdc429c7bd7de"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dba622396a3170974f81bad49aacebd243455ec3cc70615aeaef9e9613b5bca5"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b184e3de58009cc0bf32e20f137f1ec75a32470f5fede06c58f6c355ed42a72"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c37f1050feb91f3d6c32f864d8e114ff5545a4a7afe56778d76a9aec62638ba"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bda7ce59b06d0f09afe22c56714c65c957b1068dee3d5e74d743edec7daba552"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:25664e18bef6dc45015b08f99c63952a53a0a61f61f2e48a9e70cec27e55f699"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-win32.whl", hash = "sha256:77d29cb6c34b14af8a484e831ab530c0f7188f8efed1c6a833a2c674bf3c26ec"}, + {file = "SQLAlchemy-2.0.29-cp39-cp39-win_amd64.whl", hash = "sha256:04c487305ab035a9548f573763915189fc0fe0824d9ba28433196f8436f1449c"}, + {file = "SQLAlchemy-2.0.29-py3-none-any.whl", hash = "sha256:dc4ee2d4ee43251905f88637d5281a8d52e916a021384ec10758826f5cbae305"}, + {file = "SQLAlchemy-2.0.29.tar.gz", hash = "sha256:bd9566b8e58cabd700bc367b60e90d9349cd16f0984973f98a9a09f9c64e86f0"}, ] [package.dependencies] @@ -7333,4 +7336,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.11.0,<3.13.0" -content-hash = "eea5bc58f780081762bf75b7a39e92bd0f2c381bb6c56b183df867fa5e36fd65" +content-hash = "a4e7ec86dc485151482cf7dc683b4f68c4aeec1ed1bf3ecb893840ee673fefd4" diff --git a/postgres/channel_metadata.json b/postgres/channel_metadata.json new file mode 100755 index 00000000..e6719870 --- /dev/null +++ b/postgres/channel_metadata.json @@ -0,0 +1,149 @@ +[ +{"ID":"58","Channel Name":"bfmtv","Channel Title":"BFM TV","Duration Minutes":"960","Weekday":"3"}, +{"ID":"59","Channel Name":"bfmtv","Channel Title":"BFM TV","Duration Minutes":"960","Weekday":"4"}, +{"ID":"60","Channel Name":"bfmtv","Channel Title":"BFM TV","Duration Minutes":"960","Weekday":"5"}, +{"ID":"61","Channel Name":"bfmtv","Channel Title":"BFM TV","Duration Minutes":"960","Weekday":"6"}, +{"ID":"62","Channel Name":"bfmtv","Channel Title":"BFM TV","Duration Minutes":"960","Weekday":"7"}, +{"ID":"49","Channel Name":"d8","Channel Title":"c8","Duration Minutes":"120","Weekday":"1"}, +{"ID":"57","Channel Name":"bfmtv","Channel Title":"BFM TV","Duration Minutes":"960","Weekday":"2"}, +{"ID":"2","Channel Name":"tf1","Channel Title":"TF1","Duration Minutes":"235","Weekday":"3"}, +{"ID":"3","Channel Name":"tf1","Channel Title":"TF1","Duration Minutes":"235","Weekday":"4"}, +{"ID":"4","Channel Name":"tf1","Channel Title":"TF1","Duration Minutes":"235","Weekday":"5"}, +{"ID":"5","Channel Name":"tf1","Channel Title":"TF1","Duration Minutes":"235","Weekday":"6"}, +{"ID":"6","Channel Name":"tf1","Channel Title":"TF1","Duration Minutes":"140","Weekday":"7"}, +{"ID":"56","Channel Name":"tmc","Channel Title":"TMC","Duration Minutes":"0","Weekday":"1"}, +{"ID":"1","Channel Name":"tf1","Channel Title":"TF1","Duration Minutes":"235","Weekday":"2"}, +{"ID":"16","Channel Name":"fr3-idf","Channel Title":"France 3-idf","Duration Minutes":"220","Weekday":"3"}, +{"ID":"17","Channel Name":"fr3-idf","Channel Title":"France 3-idf","Duration Minutes":"220","Weekday":"4"}, +{"ID":"18","Channel Name":"fr3-idf","Channel Title":"France 3-idf","Duration Minutes":"220","Weekday":"5"}, +{"ID":"19","Channel Name":"fr3-idf","Channel Title":"France 3-idf","Duration Minutes":"220","Weekday":"6"}, +{"ID":"20","Channel Name":"fr3-idf","Channel Title":"France 3-idf","Duration Minutes":"100","Weekday":"7"}, +{"ID":"14","Channel Name":"france2","Channel Title":"France 2","Duration Minutes":"225","Weekday":"1"}, +{"ID":"15","Channel Name":"fr3-idf","Channel Title":"France 3-idf","Duration Minutes":"220","Weekday":"2"}, +{"ID":"9","Channel Name":"france2","Channel Title":"France 2","Duration Minutes":"285","Weekday":"3"}, +{"ID":"10","Channel Name":"france2","Channel Title":"France 2","Duration Minutes":"285","Weekday":"4"}, +{"ID":"11","Channel Name":"france2","Channel Title":"France 2","Duration 
Minutes":"285","Weekday":"5"}, +{"ID":"12","Channel Name":"france2","Channel Title":"France 2","Duration Minutes":"285","Weekday":"6"}, +{"ID":"13","Channel Name":"france2","Channel Title":"France 2","Duration Minutes":"285","Weekday":"7"}, +{"ID":"22","Channel Name":"france5","Channel Title":"France 5","Duration Minutes":"120","Weekday":"2"}, +{"ID":"8","Channel Name":"france2","Channel Title":"France 2","Duration Minutes":"285","Weekday":"2"}, +{"ID":"30","Channel Name":"m6","Channel Title":"M6","Duration Minutes":"100","Weekday":"3"}, +{"ID":"31","Channel Name":"m6","Channel Title":"M6","Duration Minutes":"100","Weekday":"4"}, +{"ID":"32","Channel Name":"m6","Channel Title":"M6","Duration Minutes":"100","Weekday":"5"}, +{"ID":"33","Channel Name":"m6","Channel Title":"M6","Duration Minutes":"100","Weekday":"6"}, +{"ID":"34","Channel Name":"m6","Channel Title":"M6","Duration Minutes":"100","Weekday":"7"}, +{"ID":"106","Channel Name":"rmc","Channel Title":"RMC","Duration Minutes":"510","Weekday":"2"}, +{"ID":"29","Channel Name":"m6","Channel Title":"M6","Duration Minutes":"100","Weekday":"2"}, +{"ID":"37","Channel Name":"arte","Channel Title":"Arte","Duration Minutes":"65","Weekday":"3"}, +{"ID":"38","Channel Name":"arte","Channel Title":"Arte","Duration Minutes":"65","Weekday":"4"}, +{"ID":"39","Channel Name":"arte","Channel Title":"Arte","Duration Minutes":"65","Weekday":"5"}, +{"ID":"40","Channel Name":"arte","Channel Title":"Arte","Duration Minutes":"65","Weekday":"6"}, +{"ID":"41","Channel Name":"arte","Channel Title":"Arte","Duration Minutes":"65","Weekday":"7"}, +{"ID":"63","Channel Name":"bfmtv","Channel Title":"BFM TV","Duration Minutes":"960","Weekday":"1"}, +{"ID":"36","Channel Name":"arte","Channel Title":"Arte","Duration Minutes":"65","Weekday":"2"}, +{"ID":"51","Channel Name":"tmc","Channel Title":"TMC","Duration Minutes":"125","Weekday":"3"}, +{"ID":"52","Channel Name":"tmc","Channel Title":"TMC","Duration Minutes":"125","Weekday":"4"}, +{"ID":"53","Channel Name":"tmc","Channel Title":"TMC","Duration Minutes":"125","Weekday":"5"}, +{"ID":"54","Channel Name":"tmc","Channel Title":"TMC","Duration Minutes":"125","Weekday":"6"}, +{"ID":"55","Channel Name":"tmc","Channel Title":"TMC","Duration Minutes":"0","Weekday":"7"}, +{"ID":"50","Channel Name":"tmc","Channel Title":"TMC","Duration Minutes":"125","Weekday":"2"}, +{"ID":"79","Channel Name":"itele","Channel Title":"CNews","Duration Minutes":"960","Weekday":"3"}, +{"ID":"80","Channel Name":"itele","Channel Title":"CNews","Duration Minutes":"960","Weekday":"4"}, +{"ID":"81","Channel Name":"itele","Channel Title":"CNews","Duration Minutes":"960","Weekday":"5"}, +{"ID":"82","Channel Name":"itele","Channel Title":"CNews","Duration Minutes":"960","Weekday":"6"}, +{"ID":"83","Channel Name":"itele","Channel Title":"CNews","Duration Minutes":"960","Weekday":"7"}, +{"ID":"70","Channel Name":"lci","Channel Title":"LCI","Duration Minutes":"960","Weekday":"1"}, +{"ID":"78","Channel Name":"itele","Channel Title":"CNews","Duration Minutes":"960","Weekday":"2"}, +{"ID":"86","Channel Name":"france-inter","Channel Title":"France Inter","Duration Minutes":"420","Weekday":"3"}, +{"ID":"87","Channel Name":"france-inter","Channel Title":"France Inter","Duration Minutes":"420","Weekday":"4"}, +{"ID":"88","Channel Name":"france-inter","Channel Title":"France Inter","Duration Minutes":"420","Weekday":"5"}, +{"ID":"89","Channel Name":"france-inter","Channel Title":"France Inter","Duration Minutes":"300","Weekday":"6"}, +{"ID":"90","Channel 
Name":"france-inter","Channel Title":"France Inter","Duration Minutes":"270","Weekday":"7"}, +{"ID":"84","Channel Name":"itele","Channel Title":"CNews","Duration Minutes":"960","Weekday":"1"}, +{"ID":"85","Channel Name":"france-inter","Channel Title":"France Inter","Duration Minutes":"420","Weekday":"2"}, +{"ID":"135","Channel Name":"france-culture","Channel Title":"France Culture","Duration Minutes":"190","Weekday":"3"}, +{"ID":"136","Channel Name":"france-culture","Channel Title":"France Culture","Duration Minutes":"190","Weekday":"4"}, +{"ID":"137","Channel Name":"france-culture","Channel Title":"France Culture","Duration Minutes":"190","Weekday":"5"}, +{"ID":"138","Channel Name":"france-culture","Channel Title":"France Culture","Duration Minutes":"190","Weekday":"6"}, +{"ID":"139","Channel Name":"france-culture","Channel Title":"France Culture","Duration Minutes":"165","Weekday":"7"}, +{"ID":"71","Channel Name":"franceinfotv","Channel Title":"France Info","Duration Minutes":"960","Weekday":"2"}, +{"ID":"134","Channel Name":"france-culture","Channel Title":"France Culture","Duration Minutes":"190","Weekday":"2"}, +{"ID":"44","Channel Name":"d8","Channel Title":"c8","Duration Minutes":"165","Weekday":"3"}, +{"ID":"45","Channel Name":"d8","Channel Title":"c8","Duration Minutes":"165","Weekday":"4"}, +{"ID":"46","Channel Name":"d8","Channel Title":"c8","Duration Minutes":"165","Weekday":"5"}, +{"ID":"47","Channel Name":"d8","Channel Title":"c8","Duration Minutes":"165","Weekday":"6"}, +{"ID":"48","Channel Name":"d8","Channel Title":"c8","Duration Minutes":"120","Weekday":"7"}, +{"ID":"127","Channel Name":"europe1","Channel Title":"Europe 1","Duration Minutes":"570","Weekday":"2"}, +{"ID":"43","Channel Name":"d8","Channel Title":"c8","Duration Minutes":"165","Weekday":"2"}, +{"ID":"107","Channel Name":"rmc","Channel Title":"RMC","Duration Minutes":"510","Weekday":"3"}, +{"ID":"108","Channel Name":"rmc","Channel Title":"RMC","Duration Minutes":"510","Weekday":"4"}, +{"ID":"109","Channel Name":"rmc","Channel Title":"RMC","Duration Minutes":"510","Weekday":"5"}, +{"ID":"110","Channel Name":"rmc","Channel Title":"RMC","Duration Minutes":"510","Weekday":"6"}, +{"ID":"111","Channel Name":"rmc","Channel Title":"RMC","Duration Minutes":"180","Weekday":"7"}, +{"ID":"92","Channel Name":"rtl","Channel Title":"RTL","Duration Minutes":"450","Weekday":"2"}, +{"ID":"93","Channel Name":"rtl","Channel Title":"RTL","Duration Minutes":"450","Weekday":"3"}, +{"ID":"94","Channel Name":"rtl","Channel Title":"RTL","Duration Minutes":"450","Weekday":"4"}, +{"ID":"95","Channel Name":"rtl","Channel Title":"RTL","Duration Minutes":"450","Weekday":"5"}, +{"ID":"96","Channel Name":"rtl","Channel Title":"RTL","Duration Minutes":"450","Weekday":"6"}, +{"ID":"7","Channel Name":"tf1","Channel Title":"TF1","Duration Minutes":"140","Weekday":"1"}, +{"ID":"128","Channel Name":"europe1","Channel Title":"Europe 1","Duration Minutes":"570","Weekday":"3"}, +{"ID":"129","Channel Name":"europe1","Channel Title":"Europe 1","Duration Minutes":"570","Weekday":"4"}, +{"ID":"130","Channel Name":"europe1","Channel Title":"Europe 1","Duration Minutes":"570","Weekday":"5"}, +{"ID":"131","Channel Name":"europe1","Channel Title":"Europe 1","Duration Minutes":"570","Weekday":"6"}, +{"ID":"21","Channel Name":"fr3-idf","Channel Title":"France 3-idf","Duration Minutes":"100","Weekday":"1"}, +{"ID":"72","Channel Name":"franceinfotv","Channel Title":"France Info","Duration Minutes":"960","Weekday":"3"}, +{"ID":"73","Channel 
Name":"franceinfotv","Channel Title":"France Info","Duration Minutes":"960","Weekday":"4"}, +{"ID":"74","Channel Name":"franceinfotv","Channel Title":"France Info","Duration Minutes":"960","Weekday":"5"}, +{"ID":"75","Channel Name":"franceinfotv","Channel Title":"France Info","Duration Minutes":"960","Weekday":"6"}, +{"ID":"91","Channel Name":"france-inter","Channel Title":"France Inter","Duration Minutes":"210","Weekday":"1"}, +{"ID":"65","Channel Name":"lci","Channel Title":"LCI","Duration Minutes":"960","Weekday":"3"}, +{"ID":"66","Channel Name":"lci","Channel Title":"LCI","Duration Minutes":"960","Weekday":"4"}, +{"ID":"67","Channel Name":"lci","Channel Title":"LCI","Duration Minutes":"960","Weekday":"5"}, +{"ID":"68","Channel Name":"lci","Channel Title":"LCI","Duration Minutes":"960","Weekday":"6"}, +{"ID":"69","Channel Name":"lci","Channel Title":"LCI","Duration Minutes":"960","Weekday":"7"}, +{"ID":"35","Channel Name":"m6","Channel Title":"M6","Duration Minutes":"100","Weekday":"1"}, +{"ID":"64","Channel Name":"lci","Channel Title":"LCI","Duration Minutes":"960","Weekday":"2"}, +{"ID":"23","Channel Name":"france5","Channel Title":"France 5","Duration Minutes":"120","Weekday":"3"}, +{"ID":"24","Channel Name":"france5","Channel Title":"France 5","Duration Minutes":"120","Weekday":"4"}, +{"ID":"25","Channel Name":"france5","Channel Title":"France 5","Duration Minutes":"120","Weekday":"5"}, +{"ID":"26","Channel Name":"france5","Channel Title":"France 5","Duration Minutes":"120","Weekday":"6"}, +{"ID":"140","Channel Name":"france-culture","Channel Title":"France Culture","Duration Minutes":"15","Weekday":"1"}, +{"ID":"42","Channel Name":"arte","Channel Title":"Arte","Duration Minutes":"20","Weekday":"1"}, +{"ID":"141","Channel Name":"rfi","Channel Title":"RFI","Duration Minutes":"160","Weekday":"1"}, +{"ID":"142","Channel Name":"rfi","Channel Title":"RFI","Duration Minutes":"160","Weekday":"2"}, +{"ID":"143","Channel Name":"rfi","Channel Title":"RFI","Duration Minutes":"160","Weekday":"3"}, +{"ID":"144","Channel Name":"rfi","Channel Title":"RFI","Duration Minutes":"160","Weekday":"4"}, +{"ID":"145","Channel Name":"rfi","Channel Title":"RFI","Duration Minutes":"160","Weekday":"5"}, +{"ID":"146","Channel Name":"rfi","Channel Title":"RFI","Duration Minutes":"160","Weekday":"6"}, +{"ID":"147","Channel Name":"rfi","Channel Title":"RFI","Duration Minutes":"160","Weekday":"7"}, +{"ID":"148","Channel Name":"france24","Channel Title":"France 24","Duration Minutes":"960","Weekday":"1"}, +{"ID":"149","Channel Name":"france24","Channel Title":"France 24","Duration Minutes":"960","Weekday":"2"}, +{"ID":"150","Channel Name":"france24","Channel Title":"France 24","Duration Minutes":"960","Weekday":"3"}, +{"ID":"151","Channel Name":"france24","Channel Title":"France 24","Duration Minutes":"960","Weekday":"4"}, +{"ID":"152","Channel Name":"france24","Channel Title":"France 24","Duration Minutes":"960","Weekday":"5"}, +{"ID":"153","Channel Name":"france24","Channel Title":"France 24","Duration Minutes":"960","Weekday":"6"}, +{"ID":"154","Channel Name":"france24","Channel Title":"France 24","Duration Minutes":"960","Weekday":"7"}, +{"ID":"155","Channel Name":"sud-radio","Channel Title":"Sud Radio","Duration Minutes":"140","Weekday":"1"}, +{"ID":"156","Channel Name":"sud-radio","Channel Title":"Sud Radio","Duration Minutes":"390","Weekday":"2"}, +{"ID":"157","Channel Name":"sud-radio","Channel Title":"Sud Radio","Duration Minutes":"390","Weekday":"3"}, +{"ID":"158","Channel Name":"sud-radio","Channel 
Title":"Sud Radio","Duration Minutes":"390","Weekday":"4"}, +{"ID":"159","Channel Name":"sud-radio","Channel Title":"Sud Radio","Duration Minutes":"390","Weekday":"5"}, +{"ID":"160","Channel Name":"sud-radio","Channel Title":"Sud Radio","Duration Minutes":"390","Weekday":"6"}, +{"ID":"161","Channel Name":"sud-radio","Channel Title":"Sud Radio","Duration Minutes":"140","Weekday":"7"}, +{"ID":"162","Channel Name":"france-info","Channel Title":"FranceinfoRadio","Duration Minutes":"960","Weekday":"1"}, +{"ID":"163","Channel Name":"france-info","Channel Title":"FranceinfoRadio","Duration Minutes":"960","Weekday":"2"}, +{"ID":"164","Channel Name":"france-info","Channel Title":"FranceinfoRadio","Duration Minutes":"960","Weekday":"3"}, +{"ID":"165","Channel Name":"france-info","Channel Title":"FranceinfoRadio","Duration Minutes":"960","Weekday":"4"}, +{"ID":"166","Channel Name":"france-info","Channel Title":"FranceinfoRadio","Duration Minutes":"960","Weekday":"5"}, +{"ID":"167","Channel Name":"france-info","Channel Title":"FranceinfoRadio","Duration Minutes":"960","Weekday":"6"}, +{"ID":"168","Channel Name":"france-info","Channel Title":"FranceinfoRadio","Duration Minutes":"960","Weekday":"7"}, +{"ID":"133","Channel Name":"europe1","Channel Title":"Europe 1","Duration Minutes":"270","Weekday":"1"}, +{"ID":"132","Channel Name":"europe1","Channel Title":"Europe 1","Duration Minutes":"270","Weekday":"7"}, +{"ID":"28","Channel Name":"france5","Channel Title":"France 5","Duration Minutes":"75","Weekday":"1"}, +{"ID":"27","Channel Name":"france5","Channel Title":"France 5","Duration Minutes":"165","Weekday":"7"}, +{"ID":"77","Channel Name":"franceinfotv","Channel Title":"France Info","Duration Minutes":"960","Weekday":"1"}, +{"ID":"76","Channel Name":"franceinfotv","Channel Title":"France Info","Duration Minutes":"960","Weekday":"7"}, +{"ID":"112","Channel Name":"rmc","Channel Title":"RMC","Duration Minutes":"180","Weekday":"1"}, +{"ID":"98","Channel Name":"rtl","Channel Title":"RTL","Duration Minutes":"330","Weekday":"1"}, +{"ID":"97","Channel Name":"rtl","Channel Title":"RTL","Duration Minutes":"210","Weekday":"7"} +] \ No newline at end of file diff --git a/postgres/schemas/models.py b/postgres/schemas/models.py index bc060fb9..0b42ae0b 100644 --- a/postgres/schemas/models.py +++ b/postgres/schemas/models.py @@ -1,12 +1,13 @@ import logging from datetime import datetime -from sqlalchemy import Column, DateTime, String, Text, Boolean, ARRAY, JSON, Integer -from sqlalchemy.orm import declarative_base +from sqlalchemy import Column, DateTime, String, Text, Boolean, ARRAY, JSON, Integer, Table, MetaData +from sqlalchemy.orm import declarative_base, sessionmaker import pandas as pd from sqlalchemy import text from postgres.database_connection import connect_to_db, get_db_session import os +import json Base = declarative_base() @@ -33,6 +34,7 @@ def get_sitemap_cols(): # ALTER TABLE keywords_new_list # RENAME TO keywords; keywords_table = "keywords" +channel_metadata_table = "channel_metadata" class Sitemap(Base): __tablename__ = sitemap_table @@ -76,6 +78,14 @@ class Keywords(Base): number_of_biodiversite_consequences= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_consequences integer; number_of_biodiversite_solutions_directes= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_solutions_directes integer; +class Channel_Metadata(Base): + __tablename__ = channel_metadata_table + id = Column(Text, primary_key=True) + channel_name = Column(String, nullable=False) + 
channel_title = Column(String, nullable=False)
+    duration_minutes = Column(Integer)
+    weekday = Column(Integer)
+
 def get_sitemap(id: str):
     session = get_db_session()
     return session.get(Sitemap, id)
@@ -96,11 +106,12 @@ def get_last_month_sitemap_id(engine):

 def create_tables():
     """Create tables in the PostgreSQL database"""
-    logging.info("create sitemap, keywords tables")
+    logging.info("create sitemap, keywords tables - update channel_metadata")
     try:
         engine = connect_to_db()

         Base.metadata.create_all(engine, checkfirst=True)
+        update_channel_metadata(engine)
         logging.info("Table creation done, if not already done.")
     except (Exception) as error:
         logging.error(error)
@@ -108,16 +119,43 @@
         if engine is not None:
             engine.dispose()

+def update_channel_metadata(engine):
+    logging.info("Update channel metadata")
+    Session = sessionmaker(bind=engine)
+    session = Session()
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    json_file_path = os.path.join(current_dir, '..', 'channel_metadata.json')
+    with open(json_file_path, 'r') as f:
+        data = json.load(f)
+
+    for item in data:
+        metadata = {
+            'id': item['ID'],
+            'channel_name': item['Channel Name'],
+            'channel_title': item['Channel Title'],
+            'duration_minutes': int(item['Duration Minutes']),
+            'weekday': int(item['Weekday'])
+        }
+        session.merge(Channel_Metadata(**metadata))
+
+    # Commit all changes at once after processing all items
+    session.commit()
+    logging.info("Updated channel metadata")
+
 def drop_tables():
-    """Drop tables in the PostgreSQL database"""
+    """Drop the keywords table in the PostgreSQL database"""
     if(os.environ.get("ENV") == "docker"):
         logging.warning("drop tables")
         try:
+            engine = connect_to_db()
+            metadata = MetaData()
+            metadata.reflect(bind=engine)
+            table_to_drop = Table(keywords_table, metadata)
+            table_to_drop.drop(bind=engine, checkfirst=True)
-            Base.metadata.drop_all(engine, checkfirst=True)
-            logging.info("Table deletion done")
+            logging.info(f"Table {keywords_table} deletion done")
         except (Exception) as error:
             logging.error(error)
         finally:
diff --git a/pyproject.toml b/pyproject.toml
index 50920676..b64f9c92 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ python = ">=3.11.0,<3.13.0"
 pandas = "^2.2.0"
 advertools = "^0.14.1"
 xmltodict = "^0.13.0"
-sqlalchemy = "^2.0.21"
+sqlalchemy = "^2.0.29"
 psycopg2-binary = "^2.9.5"
 streamlit = "^1.27.0"
@@ -35,18 +35,18 @@ aiohttp = "^3.8.6"
 pytest-asyncio = "^0.23.5"
 swifter = "^1.4.0"
 tenacity = "^8.2.3"
-sentry-sdk = "^1.40.5"
+sentry-sdk = "^1.44.1"
 coverage = "^7.4.2"
 modin = {extras = ["all"], version = "^0.28.0"}
-dask-expr = "^0.4.2"
+dask-expr = "^1.0.9"

 [build-system]
 requires = ["poetry-core>=1.1"]
 build-backend = "poetry.core.masonry.api"

 [tool.poetry.group.dev.dependencies]
-pytest = "^7.4.2"
-pytest-cov = "^3.0.0"
+pytest = "^8.1.1"
+pytest-cov = "^5.0.0"
 poetry-bumpversion = "^0.3.1"
 pre-commit = "^2.18.1"
 black = "^22.3.0"
diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py
index 9e864244..90508148 100644
--- a/quotaclimat/data_processing/mediatree/detect_keywords.py
+++ b/quotaclimat/data_processing/mediatree/detect_keywords.py
@@ -13,11 +13,14 @@
 import sentry_sdk
 import modin.pandas as pd
 import dask
+import copy
 from quotaclimat.utils.logger import getLogger
 logging.getLogger('modin.logger.default').setLevel(logging.ERROR)
 logging.getLogger('distributed.scheduler').setLevel(logging.ERROR)
 dask.config.set({'dataframe.query-planning': True})
+indirectes = 'indirectes'
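An aside on the `update_channel_metadata` change above: `session.merge()` gives upsert-like behaviour keyed on the primary key, which is why redeploying with a modified `channel_metadata.json` updates existing rows instead of duplicating them. A minimal sketch of the pattern (assuming `Base` and `Channel_Metadata` are importable from `postgres.schemas.models`; SQLite is used purely for illustration):

```python
# Sketch of the merge-as-upsert pattern used by update_channel_metadata.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from postgres.schemas.models import Base, Channel_Metadata  # assumed import path

engine = create_engine("sqlite://")  # illustration only; production targets PostgreSQL
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

row = {"id": "58", "channel_name": "bfmtv", "channel_title": "BFM TV",
       "duration_minutes": 960, "weekday": 3}
session.merge(Channel_Metadata(**row))   # first deploy: INSERT
row["duration_minutes"] = 900
session.merge(Channel_Metadata(**row))   # same primary key: UPDATE, no duplicate
session.commit()
```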
+
 def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]:
     result = []
@@ -134,7 +137,6 @@ def remove_stopwords(plaintext: str) -> str:

 @sentry_sdk.trace
 def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
-    matching_themes = []
     keywords_with_timestamp = []
     plaintext_without_stopwords = remove_stopwords(plaintext)
@@ -146,36 +148,54 @@
         matching_words = [word for word in keywords if is_word_in_sentence(word, plaintext_without_stopwords)]
         if matching_words:
             logging.debug(f"theme found : {theme} with word {matching_words}")
-            matching_themes.append(theme)
+            # look for cts_in_ms for each matching word (e.g. ['economie circulaire', 'panneaux solaires', 'solaires']) inside subtitle_duration
             keywords_to_add = get_cts_in_ms_for_keywords(subtitle_duration, matching_words, theme)
             if(len(keywords_to_add) == 0):
                 logging.warning(f"Check regex - Empty keywords but theme {theme} is there - matching_words {matching_words} - {subtitle_duration}")
             keywords_with_timestamp.extend(keywords_to_add)
-    if len(matching_themes) > 0:
+    if len(keywords_with_timestamp) > 0:
         keywords_with_timestamp = filter_keyword_with_same_timestamp(keywords_with_timestamp)
+        # count false positives within 15" of positive keywords
+        keywords_with_timestamp = tag_fifteen_second_window_number(keywords_with_timestamp, start)
+        keywords_with_timestamp = transform_false_positive_keywords_to_positive(keywords_with_timestamp, start)
+        filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp)
+
         return [
-            matching_themes,
-            keywords_with_timestamp,
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"changement_climatique_constat"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"changement_climatique_causes_directes"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"changement_climatique_consequences"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"attenuation_climatique_solutions_directes"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"adaptation_climatique_solutions_directes"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"ressources_naturelles_concepts_generaux"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"ressources_naturelles_causes"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"ressources_naturelles_solutions"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"biodiversite_concepts_generaux"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"biodiversite_causes_directes"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"biodiversite_consequences"),
-            count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start,"biodiversite_solutions_directes")
+            get_themes(keywords_with_timestamp),
+            clean_metadata(keywords_with_timestamp),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_constat"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_causes"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_consequences"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"attenuation_climatique_solutions"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"adaptation_climatique_solutions"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"ressources_naturelles_concepts_generaux"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"ressources_naturelles_causes"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"ressources_naturelles_solutions"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_concepts_generaux"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_causes"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_consequences"),
+            count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_solutions")
         ]
     else:
         return [None,None,None,None,None,None,None,None,None,None,None,None,None,None,None]

+def get_themes(keywords_with_timestamp: List[dict]) -> List[str]:
+    return list(set([kw['theme'] for kw in keywords_with_timestamp]))
+
+def clean_metadata(keywords_with_timestamp):
+    keywords_with_timestamp_copy = copy.deepcopy(keywords_with_timestamp) # keep the input immutable
+    for item in keywords_with_timestamp_copy:
+        item.pop('window_number', None)
+
+    return keywords_with_timestamp_copy
+
 def log_min_max_date(df):
     max_date = max(df['start'])
     min_date = min(df['start'])
@@ -232,30 +252,9 @@ def add_primary_key(df):
         return get_consistent_hash("empty") # TODO improve - should be a None ?
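To make the new counting rule concrete before the functions that follow: `tag_fifteen_second_window_number` stamps each detected keyword with the 15-second window it falls into, and `count_different_window_number` then counts distinct windows, so several keywords inside the same window count only once. A tiny worked example with hypothetical values:

```python
# Three tagged keywords, but only two distinct 15-second windows -> count is 2.
keywords_with_timestamp = [
    {"keyword": "record de température", "theme": "changement_climatique_consequences", "window_number": 0},
    {"keyword": "température", "theme": "changement_climatique_consequences", "window_number": 0},
    {"keyword": "adaptation climatique", "theme": "adaptation_climatique_solutions", "window_number": 3},
]
window_numbers = [kw["window_number"] for kw in keywords_with_timestamp if "window_number" in kw]
assert len(set(window_numbers)) == 2
```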
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
-    return list(filter(lambda kw: 'indirectes' not in kw['theme'], keywords_with_timestamp))
-
-def get_keyword_by_fifteen_second_window(filtered_themes: List[dict], start: datetime) -> List[int]:
-    window_size_seconds = get_keyword_time_separation_ms()
-    total_seconds_in_window = get_chunk_duration_api()
-    number_of_windows = int(total_seconds_in_window // window_size_seconds)
-    fifteen_second_window = [0] * number_of_windows
+    return list(filter(lambda kw: indirectes not in kw['theme'], keywords_with_timestamp))

-    for keyword_info in filtered_themes:
-        window_number = int( (keyword_info['timestamp'] - start.timestamp() * 1000) // (window_size_seconds) )
-        logging.debug(f"Window number {window_number} - kwtimestamp {keyword_info['timestamp']} - start {start.timestamp() * 1000}")
-        if window_number >= number_of_windows and window_number >= 0:
-            if(window_number == number_of_windows): # give some slack to mediatree subtitle edge case
-                logging.warning(f"Edge cases around 2 minutes - still counting for one - kwtimestamp {keyword_info['timestamp']} - start {start.timestamp() * 1000}")
-                window_number = number_of_windows - 1
-                fifteen_second_window[window_number] = 1
-            else:
-                logging.error(f"Window number {window_number} is out of range - kwtimestamp {keyword_info['timestamp']} - start {start.timestamp() * 1000}")
-        else:
-            fifteen_second_window[window_number] = 1
-
-    return fifteen_second_window
-
-def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: List[dict], start: datetime, theme: str = None) -> int:
+def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: str = None) -> int:
     total_keywords = len(keywords_with_timestamp)
     if total_keywords == 0:
         return 0
@@ -267,15 +266,62 @@ def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: Li
             logging.debug("filtering ressources_ theme")
             keywords_with_timestamp = list(filter(lambda kw: "ressources_" not in kw['theme'], keywords_with_timestamp))

-    filtered_themes = filter_indirect_words(keywords_with_timestamp)
-    length_filtered_items = len(filtered_themes)
-    logging.debug(f"Before filtering {total_keywords} - After filtering indirect kw {length_filtered_items}")
+    length_filtered_items = len(keywords_with_timestamp)
+
     if length_filtered_items > 0:
-        fifteen_second_window = get_keyword_by_fifteen_second_window(filtered_themes, start)
-        final_count = sum(fifteen_second_window)
-        logging.debug(f"Count with 15 second logic: {final_count} keywords")
-        return final_count
+        return count_different_window_number(keywords_with_timestamp, start)
     else:
         return 0
-
+def count_different_window_number(keywords_with_timestamp: List[dict], start: datetime) -> int:
+    window_numbers = [item['window_number'] for item in keywords_with_timestamp if 'window_number' in item]
+    final_count = len(set(window_numbers))
+    logging.debug(f"Count with 15 second logic: {final_count} keywords")
+
+    return final_count
+
+def contains_direct_keywords(keywords_with_timestamp: List[dict]) -> bool:
+    return any(indirectes not in kw['theme'] for kw in keywords_with_timestamp)
+
+# we want to count false positives within 15" of positive keywords
+def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[dict], start) -> List[dict]:
+    for keyword_info in keywords_with_timestamp:
+        # get 15-second neighbouring keywords
+        neighbour_keywords = list(
+            filter(
+                lambda kw:
+                    1 == 
abs(keyword_info['window_number'] - kw['window_number']) or + 0 == abs(keyword_info['window_number'] - kw['window_number']) + , keywords_with_timestamp) + ) + + if( contains_direct_keywords(neighbour_keywords) ) : + keyword_info['theme'] = remove_indirect(keyword_info['theme']) + + return keywords_with_timestamp + +def tag_fifteen_second_window_number(keywords_with_timestamp: List[dict], start) -> List[dict]: + window_size_seconds = get_keyword_time_separation_ms() + total_seconds_in_window = get_chunk_duration_api() + number_of_windows = int(total_seconds_in_window // window_size_seconds) + + for keyword_info in keywords_with_timestamp: + window_number = int( (keyword_info['timestamp'] - start.timestamp() * 1000) // (window_size_seconds)) + logging.debug(f"Window number {window_number} out of {number_of_windows} - kwtimestamp {keyword_info['timestamp']} - start {start.timestamp() * 1000}") + if window_number >= number_of_windows and window_number >= 0: + if(window_number == number_of_windows): # give some slack to mediatree subtitle edge case + logging.warning(f"Edge cases around 2 minutes - still counting for one - kwtimestamp {keyword_info['timestamp']} - start {start.timestamp() * 1000}") + window_number = number_of_windows - 1 + keyword_info['window_number'] = window_number + else: + logging.error(f"Window number {window_number} is out of range - kwtimestamp {keyword_info['timestamp']} - start {start.timestamp() * 1000}") + else: + keyword_info['window_number'] = window_number + + return keywords_with_timestamp + +def remove_indirect(theme: str) -> str: + if indirectes in theme: + return theme.replace(f'_{indirectes}', '') + else: + return theme diff --git a/quotaclimat/data_processing/mediatree/keyword/keyword.py b/quotaclimat/data_processing/mediatree/keyword/keyword.py index f07cdf5f..a028a16e 100644 --- a/quotaclimat/data_processing/mediatree/keyword/keyword.py +++ b/quotaclimat/data_processing/mediatree/keyword/keyword.py @@ -54,7 +54,7 @@ "transition énergétique", ] , -"changement_climatique_causes_directes" : [ # 1.1.2.1 +"changement_climatique_causes" : [ # 1.1.2.1 "inaction climatique", "insuffisance climatique", "gaz à effet de serre", @@ -220,7 +220,7 @@ "espèce invasive", "retrait gonflement des argiles", ], -"attenuation_climatique_solutions_directes" : [ # 1.1.4.1 +"attenuation_climatique_solutions" : [ # 1.1.4.1 "limiter la hausse des températures", "réduction des émissions de gaz à effet de serre", "baisse des émissions de gaz à effet de serre", @@ -345,7 +345,7 @@ ,"forêt" ,"tri à la source" ,"biodégradable" - ,"Recyclage" + ,"recyclage" ,"compostage" ,"espace vert" ,"prairie" @@ -383,12 +383,11 @@ ,"Voitures électriques" ] , -"adaptation_climatique_solutions_directes": # 1.1.5.1 +"adaptation_climatique_solutions": # 1.1.5.1 [ "adaptation au changement climatique", "adaptation au dérèglement climatique", "adaptation au réchauffement climatique", - "adaptation au réchauffement climatique", "adaptation climatique", "renforcer les digues", "renforcer les dunes", @@ -567,7 +566,7 @@ "Union internationale pour la conservation de la nature", "UICN", ], -"biodiversite_causes_directes": # 1.3.2 +"biodiversite_causes": # 1.3.2 [ "polluant", "Dégradation de l'habitat", @@ -688,7 +687,7 @@ "Mortalité forestière", "disparition des forêts", ], -"biodiversite_solutions_directes": # 1.3.4.1 +"biodiversite_solutions": # 1.3.4.1 [ "moins polluant", "agroécologie", diff --git a/quotaclimat/utils/sentry.py b/quotaclimat/utils/sentry.py index f599c339..5fbbc787 100644 --- 
a/quotaclimat/utils/sentry.py +++ b/quotaclimat/utils/sentry.py @@ -9,7 +9,7 @@ {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_cts_in_ms_for_keywords"}, {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_keyword_with_same_timestamp"}, {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.get_themes_keywords_duration"}, - {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.count_keywords_duration_overlap_without_indirect"}, + {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.count_keywords_duration_overlap"}, {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.filter_and_tag_by_theme"}, {"qualified_name": "quotaclimat.data_processing.mediatree.detect_keywords.add_primary_key"}, {"qualified_name": "quotaclimat.data_processing.mediatree.api_import.extract_api_sub"}, diff --git a/test/sitemap/test_detect_keywords.py b/test/sitemap/test_detect_keywords.py index 3ee7e48b..30f90b8d 100644 --- a/test/sitemap/test_detect_keywords.py +++ b/test/sitemap/test_detect_keywords.py @@ -41,6 +41,41 @@ "duration_ms": 34, "cts_in_ms": original_timestamp + 76, "text": "abusive" + }, + { + "duration_ms": 34, + "cts_in_ms": original_timestamp + 98, + "text": "barrage" + }, + { + "duration_ms": 34, + "cts_in_ms": original_timestamp + 1000, + "text": "record" + }, + { + "duration_ms": 34, + "cts_in_ms": original_timestamp + 1100, + "text": "de" + }, + { + "duration_ms": 34, + "cts_in_ms": original_timestamp + 1200, + "text": "température" + }, + { + "duration_ms": 34, + "cts_in_ms": original_timestamp + 1000, + "text": "adaptation" + }, + { + "duration_ms": 34, + "cts_in_ms": original_timestamp + 1212, + "text": "réchauffement" + }, + { + "duration_ms": 34, + "cts_in_ms": original_timestamp + 1300, + "text": "planétaire" } ] def test_default_get_themes_keywords_duration(): @@ -48,13 +83,13 @@ def test_default_get_themes_keywords_duration(): assert get_themes_keywords_duration(plaintext_nothing, subtitles, start) == [None,None,None,None,None,None,None,None,None,None,None,None,None,None,None] def test_one_theme_get_themes_keywords_duration(): - plaintext_climat = "climatique test" - assert get_themes_keywords_duration(plaintext_climat, subtitles, start) == [["changement_climatique_constat"],[], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + plaintext_climat = "réchauffement planétaire test" + assert get_themes_keywords_duration(plaintext_climat, subtitles, start) == [ + ["changement_climatique_constat"], + [{'keyword': 'réchauffement planétaire', +'theme': 'changement_climatique_constat', +'timestamp': 1706437080216}], 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] -def test_multiple_get_themes_keywords_duration(): - plaintext_multiple_themes = "climatique test bovin migrations climatiques" - assert get_themes_keywords_duration(plaintext_multiple_themes, subtitles, start) == [["changement_climatique_constat", "changement_climatique_consequences"],[], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - def test_nothing_get_themes_keywords_duration(): # should not accept theme 'bus' for keyword "abusive" plaintext_regression_incomplete_word = "abusive" @@ -68,14 +103,16 @@ def test_regression_included_get_themes_keywords_duration(): def test_three_get_themes_keywords_duration(): assert get_themes_keywords_duration("record de température pizza adaptation au dérèglement climatique", subtitles, start) == [[ - "changement_climatique_constat" - ,"changement_climatique_consequences" - 
,"adaptation_climatique_solutions_directes" - ],[], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + "adaptation_climatique_solutions" + ],[{'keyword': 'adaptation au dérèglement climatique', +'theme': 'adaptation_climatique_solutions', +'timestamp': 1706437080004}], 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] def test_long_get_themes_keywords_duration(): assert get_themes_keywords_duration("il rencontre aussi une crise majeure de la pénurie de l' offre laetitia jaoude des barrages sauvages", subtitles, start) == [ - ["adaptation_climatique_solutions_indirectes"],[], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ["adaptation_climatique_solutions_indirectes"],[{'keyword': 'barrage', +'theme': 'adaptation_climatique_solutions_indirectes', +'timestamp': 1706437079102}], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] def test_stop_word_get_themes_keywords_duration(): plaintext = "haute isolation thermique fabriqué en france pizza" @@ -163,102 +200,6 @@ def test_none_theme_filter_and_tag_by_theme(): debug_df(df) assert len(df) == 0 -def test_filter_and_tag_by_theme(): - srt = [{ - "duration_ms": 34, - "cts_in_ms": original_timestamp + 79004, - "text": "adaptation" - } - ] - df1 = pd.DataFrame([{ - "start": start, - "plaintext": "cheese pizza", - "channel_name": "m6", - "channel_radio": False, - "srt": srt, - },{ - "start": start, - "plaintext": "tomato screen", - "channel_name": "m6", - "channel_radio": False, - "srt": srt, - },{ - "start": start, - "plaintext": "méthane bovin anthropocène", - "channel_name": "m6", - "channel_radio": False, - "srt": srt, - }, - { - "start": start, - "plaintext": "cheese pizza", - "channel_name": "m6", - "channel_radio": False, - "srt": srt, - },{ - "start": start, - "plaintext": "pizza année la plus chaude", - "channel_name": "m6", - "channel_radio": False, - "srt": srt, - }]) - - expected_result = pd.DataFrame([{ - "start": start, - "plaintext": "méthane bovin anthropocène", - "channel_name": "m6", - "channel_radio": False, - "srt": srt, - "theme": [ - "changement_climatique_constat", - "changement_climatique_causes_directes", - "ressources_naturelles_concepts_generaux" - ], - "keywords_with_timestamp": [], - "number_of_keywords": 0.0, - "number_of_changement_climatique_constat": 0.0, - "number_of_changement_climatique_causes_directes": 0.0, - "number_of_changement_climatique_consequences": 0.0, - "number_of_attenuation_climatique_solutions_directes": 0.0, - "number_of_adaptation_climatique_solutions_directes": 0.0, - "number_of_ressources_naturelles_concepts_generaux": 0.0, - "number_of_ressources_naturelles_causes": 0.0, - "number_of_ressources_naturelles_solutions": 0.0, - "number_of_biodiversite_concepts_generaux": 0.0, - "number_of_biodiversite_causes_directes": 0.0, - "number_of_biodiversite_consequences": 0.0, - "number_of_biodiversite_solutions_directes" :0.0 - }, - { - "start": start, - "plaintext": "pizza année la plus chaude", - "channel_name": "m6", - "channel_radio": False, - "srt": srt, - "theme": ["changement_climatique_consequences"], - "keywords_with_timestamp": [], - "number_of_keywords": 0.0, - "number_of_changement_climatique_constat": 0.0, - "number_of_changement_climatique_causes_directes": 0.0, - "number_of_changement_climatique_consequences": 0.0, - "number_of_attenuation_climatique_solutions_directes": 0.0, - "number_of_adaptation_climatique_solutions_directes": 0.0, - "number_of_ressources_naturelles_concepts_generaux": 0.0, - "number_of_ressources_naturelles_causes": 0.0, - "number_of_ressources_naturelles_solutions": 0.0, - 
"number_of_biodiversite_concepts_generaux": 0.0, - "number_of_biodiversite_causes_directes": 0.0, - "number_of_biodiversite_consequences": 0.0, - "number_of_biodiversite_solutions_directes" :0.0 - }]) - - # List of words to filter on - df = filter_and_tag_by_theme(df1) - logging.info(df.dtypes) - debug_df(df) - pd.testing.assert_frame_equal(df.reset_index(drop=True), expected_result.reset_index(drop=True)) - - def test_lower_case_filter_and_tag_by_theme(): srt = [{ "duration_ms": 34, @@ -281,15 +222,13 @@ def test_lower_case_filter_and_tag_by_theme(): "channel_radio": False, "srt": srt, "theme": [ - "changement_climatique_constat", - "changement_climatique_causes_directes", - "ressources_naturelles_concepts_generaux" + "changement_climatique_causes", ], "keywords_with_timestamp": [ { "keyword" :"méthane", "timestamp": original_timestamp, - "theme": "changement_climatique_causes_directes", + "theme": "changement_climatique_causes", }], "number_of_keywords": 1, "number_of_changement_climatique_constat": 0, @@ -333,15 +272,13 @@ def test_singular_plural_case_filter_and_tag_by_theme(): "channel_radio": False, "srt": srt, "theme": [ - "changement_climatique_constat", - "changement_climatique_causes_directes", - "ressources_naturelles_concepts_generaux" + "changement_climatique_causes", ], "keywords_with_timestamp": [ { "keyword" :"méthane", "timestamp": original_timestamp, - "theme": "changement_climatique_causes_directes", + "theme": "changement_climatique_causes", }], "number_of_keywords": 1, "number_of_changement_climatique_constat": 0, @@ -479,7 +416,7 @@ def test_format_word_regex(): assert format_word_regex("d'eau") == "d' ?eaus?" assert format_word_regex("réseaux") == "réseaux?" -def test_overlap_count_keywords_duration_overlap_without_indirect(): +def test_overlap_count_keywords_duration_overlap(): keywords_with_timestamp = [{ "keyword" : 'habitabilité de la planète', "timestamp": original_timestamp + 1, @@ -502,9 +439,9 @@ def test_overlap_count_keywords_duration_overlap_without_indirect(): } ] - assert count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start) == 1 + assert count_keywords_duration_overlap(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 1 -def test_no_overlap_count_keywords_duration_overlap_without_indirect(): +def test_no_overlap_count_keywords_duration_overlap(): keywords_with_timestamp = [{ "keyword" : 'habitabilité de la planète', "timestamp": original_timestamp, @@ -537,9 +474,9 @@ def test_no_overlap_count_keywords_duration_overlap_without_indirect(): }, ] - assert count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start) == 4 + assert count_keywords_duration_overlap(tag_fifteen_second_window_number(keywords_with_timestamp, start),start) == 4 -def test_with_a_mix_of_overlap_count_keywords_duration_overlap_without_indirect(): +def test_with_a_mix_of_overlap_count_keywords_duration_overlap(): keywords_with_timestamp = [{ "keyword" : 'habitabilité de la planète', "timestamp": original_timestamp, # count for one @@ -577,9 +514,9 @@ def test_with_a_mix_of_overlap_count_keywords_duration_overlap_without_indirect( }, ] - assert count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start) == 2 + assert count_keywords_duration_overlap(tag_fifteen_second_window_number(keywords_with_timestamp, start),start) == 2 -def test_with_15second_window_count_keywords_duration_overlap_without_indirect(): +def test_with_15second_window_count_keywords_duration_overlap(): keywords_with_timestamp 
= [{ "keyword" : 'habitabilité de la planète', "timestamp": original_timestamp, # count for one @@ -617,9 +554,9 @@ def test_with_15second_window_count_keywords_duration_overlap_without_indirect() } ] - assert count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start) == 1 + assert count_keywords_duration_overlap(tag_fifteen_second_window_number(keywords_with_timestamp, start),start) == 1 -def test_only_one_count_keywords_duration_overlap_without_indirect(): +def test_only_one_count_keywords_duration_overlap(): keywords_with_timestamp = [{ "keyword" : 'habitabilité de la planète', "timestamp": original_timestamp, # count for one @@ -627,9 +564,9 @@ def test_only_one_count_keywords_duration_overlap_without_indirect(): } ] - assert count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start) == 1 + assert count_keywords_duration_overlap(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 1 -def test_indirect_count_keywords_duration_overlap_without_indirect(): +def test_indirect_count_keywords_duration_overlap(): keywords_with_timestamp = [{ "keyword" : 'digue', "timestamp": original_timestamp, @@ -637,9 +574,9 @@ def test_indirect_count_keywords_duration_overlap_without_indirect(): } ] - assert count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start) == 0 + assert count_keywords_duration_overlap(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 1 -def test_resources_count_keywords_duration_overlap_without_indirect(): +def test_resources_count_keywords_duration_overlap(): keywords_with_timestamp = [{ "keyword" : 'lithium', "timestamp": original_timestamp, @@ -647,7 +584,7 @@ def test_resources_count_keywords_duration_overlap_without_indirect(): } ] - assert count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start) == 0 + assert count_keywords_duration_overlap(tag_fifteen_second_window_number(keywords_with_timestamp, start),start) == 0 def test_filter_indirect_words(): keywords_with_timestamp = [{ @@ -760,19 +697,19 @@ def test_keyword_second_word_a_bit_later_inside_keyword_filter_keyword_with_same keywords_with_timestamp = [{ "keyword" : 'carbone', "timestamp": later_timestamp, - "theme":"changement_climatique_causes_directes", + "theme":"changement_climatique_causes", }, { "keyword" : 'béton bas carbone', "timestamp": original_timestamp, # same timestamp, so we take longest keyword - "theme":"attenuation_climatique_solutions_directes", # different theme, keep this one + "theme":"attenuation_climatique_solutions", # different theme, keep this one } ] expected = [{ "keyword" : 'béton bas carbone', "timestamp": original_timestamp, # same timestamp, so we take longest keyword - "theme":"attenuation_climatique_solutions_directes", # different theme, keep this one + "theme":"attenuation_climatique_solutions", # different theme, keep this one } ] @@ -786,7 +723,7 @@ def test_keyword_second_word_to_keep_inside_keyword_filter_keyword_with_same_tim "keyword": "pénurie" }, { - "theme":"attenuation_climatique_solutions_directes", # different theme, keep this one + "theme":"attenuation_climatique_solutions", # different theme, keep this one "timestamp": 1707627708051, "keyword": "barrages" }, @@ -801,7 +738,7 @@ def test_keyword_second_word_to_keep_inside_keyword_filter_keyword_with_same_tim { "keyword" : 'barrages', "timestamp": 1707627708051, # same timestamp, so we take longest keyword - "theme":"attenuation_climatique_solutions_directes", # different theme, keep 
this one + "theme":"attenuation_climatique_solutions", # different theme, keep this one } ] @@ -860,7 +797,7 @@ def test_get_keyword_by_fifteen_second_window(): } ] - assert get_keyword_by_fifteen_second_window(keywords_with_timestamp, start) == [1, 1, 1, 1, 0, 0, 0, 0] + assert count_different_window_number(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 4 def test_full_house_get_keyword_by_fifteen_second_window(): keywords_with_timestamp = [{ @@ -925,7 +862,7 @@ def test_full_house_get_keyword_by_fifteen_second_window(): } ] - assert get_keyword_by_fifteen_second_window(keywords_with_timestamp, start) == [1, 1, 1, 1, 1, 1, 1, 1] + assert count_different_window_number(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 8 def test_simple_get_keyword_by_fifteen_second_window(): @@ -941,7 +878,7 @@ def test_simple_get_keyword_by_fifteen_second_window(): } ] - assert get_keyword_by_fifteen_second_window(keywords_with_timestamp, start) == [1, 0, 0, 0, 0, 0, 0, 0] + assert count_different_window_number(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 1 def test_edge_out_of_bound_get_keyword_by_fifteen_second_window(): keywords_with_timestamp = [ @@ -952,7 +889,7 @@ def test_edge_out_of_bound_get_keyword_by_fifteen_second_window(): } ] - assert get_keyword_by_fifteen_second_window(keywords_with_timestamp, start) == [0, 0, 0, 0, 0, 0, 0, 1] + assert count_different_window_number(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 1 def test_really_out_of_bound_get_keyword_by_fifteen_second_window(): keywords_with_timestamp = [ @@ -963,7 +900,7 @@ def test_really_out_of_bound_get_keyword_by_fifteen_second_window(): } ] - assert get_keyword_by_fifteen_second_window(keywords_with_timestamp, start) == [0, 0, 0, 0, 0, 0, 0, 0] + assert count_different_window_number(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 0 def test_almost_out_of_bound_get_keyword_by_fifteen_second_window(): keywords_with_timestamp = [ @@ -974,4 +911,226 @@ def test_almost_out_of_bound_get_keyword_by_fifteen_second_window(): } ] - assert get_keyword_by_fifteen_second_window(keywords_with_timestamp, start) == [0, 0, 0, 0, 0, 0, 0, 1] \ No newline at end of file + assert count_different_window_number(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == 1 + +def test_tag_fifteen_second_window_number(): + keywords_with_timestamp = [ + {'keyword': 'recyclage', + 'timestamp': original_timestamp, + 'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct + }, + {'keyword': 'climatique', + 'timestamp': original_timestamp + 150, + 'theme': 'changement_climatique_constat' + }, + {'keyword': 'covoiturage', + 'timestamp': original_timestamp + get_keyword_time_separation_ms() + 10000, # should be transformed to direct + 'theme': 'attenuation_climatique_solutions_indirectes' + }, + {'keyword': 'industrie verte', + 'timestamp': original_timestamp + get_keyword_time_separation_ms() * 2 , + 'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct + }, + {'keyword': 'industrie verte', + 'timestamp': original_timestamp + get_keyword_time_separation_ms() * 6 , + 'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct + } + ] + + expected = [ + {'keyword': 'recyclage', + 'timestamp': original_timestamp, + 'window_number': 0, + 'theme': 'attenuation_climatique_solutions_indirectes' + }, + {'keyword': 
'climatique',
+         'timestamp': original_timestamp + 150,
+         'window_number': 0,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': 'covoiturage',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() + 10000,
+         'window_number': 1,
+         'theme': 'attenuation_climatique_solutions_indirectes'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 2 ,
+         'window_number': 2,
+         'theme': 'attenuation_climatique_solutions_indirectes'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 6 ,
+         'window_number': 6,
+         'theme': 'attenuation_climatique_solutions_indirectes'
+        }
+    ]
+    assert tag_fifteen_second_window_number(keywords_with_timestamp, start) == expected
+
+def test_transform_false_positive_keywords_to_positive():
+    keywords_with_timestamp = [
+        {'keyword': 'recyclage',
+         'timestamp': original_timestamp,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct
+        },
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': 'covoiturage',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() + 10000, # should be transformed to direct
+         'theme': 'attenuation_climatique_solutions_indirectes'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 2 ,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 3 ,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed too: window 2 becomes direct first (cascade)
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 5 ,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 7,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
+        }
+    ]
+
+    expected_output = [
+        {'keyword': 'recyclage',
+         'timestamp': original_timestamp,
+         'theme': 'attenuation_climatique_solutions' # was indirect
+         ,'window_number': 0
+        },
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'theme': 'changement_climatique_constat' # the direct keyword that promotes its false-positive neighbours
+         ,'window_number': 0
+        },
+        {'keyword': 'covoiturage',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() + 10000, # transformed to direct
+         'theme': 'attenuation_climatique_solutions'
+         ,'window_number': 1
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 2 ,
+         'theme': 'attenuation_climatique_solutions' # transformed to direct
+         ,'window_number': 2
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 3 ,
+         'theme': 'attenuation_climatique_solutions' # transformed to direct via the cascade
+         ,'window_number': 3
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 5 ,
+         'theme': 'attenuation_climatique_solutions_indirectes' # stays indirect
+         ,'window_number': 5
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 7,
+         'theme': 'attenuation_climatique_solutions_indirectes' # stays indirect
+         ,'window_number': 7
+        }
+    ]
+
+    assert transform_false_positive_keywords_to_positive(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == expected_output
+
+def test_different_steps_transform_false_positive_keywords_to_positive():
+    keywords_with_timestamp = [
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 1 + 150,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct
+        },
+        {'keyword': 'agroforesterie',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 2 + 150,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct (cascade)
+        },
+        {'keyword': 'alternative durable',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 3 + 150,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct (cascade)
+        },
+        {'keyword': 'planification écologique',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 4 + 150,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct (cascade)
+        },
+        {'keyword': 'nucléaire',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 6 + 150,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect: no direct keyword within one window
+        }
+    ]
+
+    expected_output = [
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'window_number': 0,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 1 + 150,
+         'window_number': 1,
+         'theme': 'attenuation_climatique_solutions' # transformed to direct
+        },
+        {'keyword': 'agroforesterie',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 2 + 150,
+         'window_number': 2,
+         'theme': 'attenuation_climatique_solutions' # transformed to direct (cascade)
+        },
+        {'keyword': 'alternative durable',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 3 + 150,
+         'window_number': 3,
+         'theme': 'attenuation_climatique_solutions' # transformed to direct (cascade)
+        },
+        {'keyword': 'planification écologique',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 4 + 150,
+         'window_number': 4,
+         'theme': 'attenuation_climatique_solutions' # transformed to direct (cascade)
+        },
+        {'keyword': 'nucléaire',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 6 + 150,
+         'window_number': 6,
+         'theme': 'attenuation_climatique_solutions_indirectes' # stays indirect
+        }
+    ]
+
+    assert transform_false_positive_keywords_to_positive(tag_fifteen_second_window_number(keywords_with_timestamp, start), start) == expected_output
+
+
+def test_count_different_window_number():
+    keywords_with_timestamp = [
+        {'keyword': 'recyclage',
+         'timestamp': original_timestamp, # count (window 0)
+         'theme': 'attenuation_climatique_solutions_indirectes'
+        },
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150, # same window 0 as recyclage, not counted twice
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': 'covoiturage',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() + 10000, # count
+         'theme': 'attenuation_climatique_solutions_indirectes'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms() * 2 , # count
+ 'theme': 'attenuation_climatique_solutions_indirectes' + }, + {'keyword': 'industrie verte', + 'timestamp': original_timestamp + get_keyword_time_separation_ms() * 3 , # count + 'theme': 'attenuation_climatique_solutions_indirectes' + }, + {'keyword': 'industrie verte', + 'timestamp': original_timestamp + get_keyword_time_separation_ms() * 5 , # count + 'theme': 'attenuation_climatique_solutions_indirectes' + }, + {'keyword': 'industrie verte', + 'timestamp': original_timestamp + get_keyword_time_separation_ms() * 7, # count + 'theme': 'attenuation_climatique_solutions_indirectes' + } + ] + assert count_different_window_number(tag_fifteen_second_window_number(keywords_with_timestamp, start),start) == 6 \ No newline at end of file diff --git a/test/sitemap/test_main_import_api.py b/test/sitemap/test_main_import_api.py index 63613684..7d376373 100644 --- a/test/sitemap/test_main_import_api.py +++ b/test/sitemap/test_main_import_api.py @@ -48,14 +48,15 @@ def test_first_row_api_import(): assert specific_keyword.number_of_keywords == 0 def test_second_row_api_import(): + primary_key = "67b9cc593516b40f55d6a3e89b377fccc8ab76d263c5fd6d4bfe379626190641" specific_keyword = get_keyword(primary_key) - assert specific_keyword.theme == [ - "changement_climatique_constat", - "changement_climatique_causes_indirectes", + assert set(specific_keyword.theme) == set([ "changement_climatique_consequences", - "adaptation_climatique_solutions_indirectes" - ] + "adaptation_climatique_solutions_indirectes", + "changement_climatique_constat", + "changement_climatique_causes", + ]) assert specific_keyword.keywords_with_timestamp == [ # from metabase to speedup check { @@ -71,7 +72,7 @@ def test_second_row_api_import(): { "keyword": "puit de pétrole", "timestamp": 1707627628054, - "theme": "changement_climatique_causes_indirectes" + "theme": "changement_climatique_causes" # was indirectes before }, { "keyword": "submersion", @@ -90,10 +91,10 @@ def test_second_row_api_import(): def test_third_row_api_import(): primary_key = "975b41e76d298711cf55113a282e7f11c28157d761233838bb700253d47be262" specific_keyword = get_keyword(primary_key) - assert specific_keyword.theme == [ + assert set(specific_keyword.theme) == set([ + "changement_climatique_consequences", "changement_climatique_constat", - "changement_climatique_consequences" - ] + ]) assert specific_keyword.keywords_with_timestamp == [ { "keyword": "écologiste", diff --git a/test/sitemap/test_update_pg_keywords.py b/test/sitemap/test_update_pg_keywords.py index 8f0ee4ba..2aeea65b 100644 --- a/test/sitemap/test_update_pg_keywords.py +++ b/test/sitemap/test_update_pg_keywords.py @@ -197,24 +197,42 @@ def test_first_update_keywords(): expected_keywords_with_timestamp = [ {'keyword': 'conditions de vie sur terre', 'timestamp': 1706437094004, 'theme': 'changement_climatique_constat'}, {'keyword': 'habitabilité de la planète', 'timestamp': 1706437079010, 'theme': 'changement_climatique_constat'}, - {'keyword': 'digue', 'timestamp': 1706437111004, 'theme': 'adaptation_climatique_solutions_indirectes'}, + {'keyword': 'digue', 'timestamp': 1706437111004, 'theme': 'adaptation_climatique_solutions'}, {'keyword': 'planète', 'timestamp': 1706437079016, 'theme': 'ressources_naturelles_concepts_generaux'} ] assert result_after_update.id == result_before_update.id # number_of_keywords - assert new_value == 2 + assert new_value == number_of_changement_climatique_constat + number_of_adaptation_climatique_solutions_directes assert result_after_update.number_of_keywords == new_value 
assert result_before_update.number_of_keywords == wrong_value # number_of_changement_climatique_constat - assert number_of_changement_climatique_constat == new_value - assert result_after_update.number_of_changement_climatique_constat == new_value + assert number_of_changement_climatique_constat == 2 + assert result_after_update.number_of_changement_climatique_constat == 2 + # number_of_adaptation_climatique_solutions_directes + assert number_of_adaptation_climatique_solutions_directes == 1 + assert result_after_update.number_of_adaptation_climatique_solutions_directes == 1 + + + assert number_of_ressources_naturelles_concepts_generaux == 1 + # keywords_with_timestamp assert result_after_update.keywords_with_timestamp == new_keywords_with_timestamp assert expected_keywords_with_timestamp == new_keywords_with_timestamp + assert number_of_changement_climatique_causes_directes == 0 + assert number_of_changement_climatique_consequences == 0 + assert number_of_attenuation_climatique_solutions_directes == 0 + + assert number_of_ressources_naturelles_causes == 0 + assert number_of_ressources_naturelles_solutions == 0 + assert number_of_biodiversite_concepts_generaux == 0 + assert number_of_biodiversite_causes_directes == 0 + assert number_of_biodiversite_consequences == 0 + assert number_of_biodiversite_solutions_directes == 0 + # theme - assert result_after_update.theme == ["changement_climatique_constat", "adaptation_climatique_solutions_indirectes", "ressources_naturelles_concepts_generaux"] - assert new_theme == ["changement_climatique_constat", "adaptation_climatique_solutions_indirectes", "ressources_naturelles_concepts_generaux"] \ No newline at end of file + assert set(result_after_update.theme) == set(["adaptation_climatique_solutions", "ressources_naturelles_concepts_generaux","changement_climatique_constat"]) + assert set(new_theme) == set(["adaptation_climatique_solutions", "ressources_naturelles_concepts_generaux","changement_climatique_constat"]) \ No newline at end of file
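To summarise the detection flow this patch introduces in detect_keywords.py — tag each keyword with its 15-second window, promote indirect keywords that sit within one window of a direct keyword (promotions can cascade, as the tests above exercise), then count distinct windows — here is a minimal, self-contained sketch. It approximates the patched functions rather than importing them: the 15 000 ms window size is a hard-coded stand-in for `get_keyword_time_separation_ms()`, and the start time and keyword values are invented for the example.

```python
from datetime import datetime, timezone

WINDOW_MS = 15_000   # assumed stand-in for get_keyword_time_separation_ms()
INDIRECT = "indirectes"

def tag_windows(keywords, start):
    # mirrors tag_fifteen_second_window_number, minus the out-of-range handling
    start_ms = start.timestamp() * 1000
    for kw in keywords:
        kw["window_number"] = int((kw["timestamp"] - start_ms) // WINDOW_MS)
    return keywords

def promote_false_positives(keywords):
    # an indirect keyword in the same or an adjacent window as a direct one
    # becomes direct; mutating in place while iterating is what lets a
    # promotion cascade into the next window
    for kw in keywords:
        neighbours = [k for k in keywords
                      if abs(k["window_number"] - kw["window_number"]) <= 1]
        if any(INDIRECT not in k["theme"] for k in neighbours):
            kw["theme"] = kw["theme"].replace(f"_{INDIRECT}", "")
    return keywords

def count_windows(keywords):
    # each distinct 15-second window counts once, as in count_different_window_number
    return len({kw["window_number"] for kw in keywords})

start = datetime(2024, 1, 28, 10, 18, tzinfo=timezone.utc)  # invented start time
start_ms = start.timestamp() * 1000
keywords = [
    {"keyword": "climatique", "timestamp": start_ms + 150,
     "theme": "changement_climatique_constat"},               # direct, window 0
    {"keyword": "recyclage", "timestamp": start_ms + 16_000,
     "theme": "attenuation_climatique_solutions_indirectes"}, # indirect, window 1
]
promoted = promote_false_positives(tag_windows(keywords, start))
print(promoted[1]["theme"])     # attenuation_climatique_solutions (promoted)
print(count_windows(promoted))  # 2
```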