From 449d562d64c70f665f6647e209d6e7f1b28de822 Mon Sep 17 00:00:00 2001 From: Paul Leclercq Date: Thu, 15 Feb 2024 19:57:22 +0100 Subject: [PATCH] Feat/number of keywords (#104) * feat: overlap number_of_keywords * includes distance time keyword into parsing * wip: update keywords saved in pg based on new logic * autoreview * docker compose --- README.md | 5 + docker-compose.yml | 1 + .../data_processing/mediatree/api_import.py | 136 +++-------------- .../data_processing/mediatree/config.py | 21 +++ .../mediatree/detect_keywords.py | 131 +++++++++++++++++ .../mediatree/update_pg_keywords.py | 37 +++++ .../data_processing/mediatree/utils.py | 7 + test/sitemap/test_mediatree.py | 138 +++++++++++++++--- test/sitemap/test_update_pg_keywords.py | 67 +++++++++ 9 files changed, 413 insertions(+), 130 deletions(-) create mode 100644 quotaclimat/data_processing/mediatree/config.py create mode 100644 quotaclimat/data_processing/mediatree/detect_keywords.py create mode 100644 quotaclimat/data_processing/mediatree/update_pg_keywords.py create mode 100644 test/sitemap/test_update_pg_keywords.py diff --git a/README.md b/README.md index 1976bac5..46b987ba 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,11 @@ docker compose up mediatree Use env variable `START_DATE` like in docker compose (epoch second format : 1705409797). Otherwise, default is yesterday midnight date. + +### Batch update +In case we have a new word detection logic, we must re apply it to all saved keywords inside our database. + +We should use env variable `UPDATE` like in docker compose (should be set to "true") ### Fix linting Before committing, make sure that the line of codes you wrote are conform to PEP8 standard by running: ```bash diff --git a/docker-compose.yml b/docker-compose.yml index 1a8a896d..84353127 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -134,6 +134,7 @@ services: PORT: 5050 # healthcheck HEALTHCHECK_SERVER: "0.0.0.0" # START_DATE: 1704576615 # to test batch import + # UPDATE: "true" # to batch update PG MEDIATREE_USER : /run/secrets/username_api MEDIATREE_PASSWORD: /run/secrets/pwd_api MEDIATREE_AUTH_URL: https://keywords.mediatree.fr/api/auth/token/ diff --git a/quotaclimat/data_processing/mediatree/api_import.py b/quotaclimat/data_processing/mediatree/api_import.py index 0496f60c..8f1de148 100644 --- a/quotaclimat/data_processing/mediatree/api_import.py +++ b/quotaclimat/data_processing/mediatree/api_import.py @@ -1,37 +1,32 @@ ### Library imports import requests import pandas as pd -import datetime import json import logging import asyncio -from utils import * import time import sys import os from quotaclimat.utils.healthcheck_config import run_health_check_server from quotaclimat.utils.logger import CustomFormatter +from quotaclimat.data_processing.mediatree.utils import * +from quotaclimat.data_processing.mediatree.config import * +from quotaclimat.data_processing.mediatree.update_pg_keywords import * +from quotaclimat.data_processing.mediatree.detect_keywords import * from postgres.insert_data import save_to_pg -from postgres.schemas.models import create_tables, connect_to_db +from postgres.schemas.models import create_tables, connect_to_db, get_db_session from postgres.schemas.models import keywords_table from pandas import json_normalize from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS from typing import List, Optional -from quotaclimat.data_ingestion.scrap_sitemap import get_consistent_hash -import re -import swifter from tenacity import * #read whole file to a string -password = os.environ.get("MEDIATREE_PASSWORD") -if(password == '/run/secrets/pwd_api'): - password= open("/run/secrets/pwd_api", "r").read() -AUTH_URL = os.environ.get("MEDIATREE_AUTH_URL") # -USER = os.environ.get("MEDIATREE_USER") -if(USER == '/run/secrets/username_api'): - USER=open("/run/secrets/username_api", "r").read() -KEYWORDS_URL = os.environ.get("KEYWORDS_URL") #https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList +password = get_password() +AUTH_URL = get_auth_url() +USER = get_user() +KEYWORDS_URL = get_keywords_url() def refresh_token(token, date): if is_it_tuesday(date): # refresh token every weekday for batch import @@ -40,6 +35,14 @@ def refresh_token(token, date): else: return token +# reapply word detector logic to all saved keywords +# use when word detection is changed +async def update_pg_data(exit_event): + logging.info("Updating already saved data from Postgresql") + session = get_db_session() + update_keywords(session) + exit_event.set() + async def get_and_save_api_data(exit_event): conn = connect_to_db() token=get_auth_token(password=password, user_name=USER) @@ -120,102 +123,6 @@ def get_theme_query_includes(theme_dict): def transform_theme_query_includes(themes_with_keywords = THEME_KEYWORDS): return list(map(get_theme_query_includes, themes_with_keywords)) -def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]: - result = [] - - logging.debug(f"Looking for timecode for {keywords}") - for multiple_keyword in keywords: - all_keywords = multiple_keyword.split() # case with multiple words such as 'économie circulaire' - match = next((item for item in subtitle_duration if is_word_in_sentence(all_keywords[0], item.get('text'))), None) - logging.debug(f"match found {match} with {all_keywords[0].lower()}") - if match is not None: - logging.debug(f'Result added due to this match {match} based on {all_keywords[0]}') - result.append( - { - "keyword" :multiple_keyword.lower(), - "timestamp" : match['cts_in_ms'], - "theme" : theme - }) - - logging.debug(f"Timecode found {result}") - return result - -# be able to detect singular or plural for a word -def format_word_regex(word: str) -> str: - word = word.replace('\'', '\' ?') # case for d'eau -> d' eau - if not word.endswith('s') and not word.endswith('x') and not word.endswith('à'): - return word + "s?" - elif word.endswith('s'): - return word + '?' - else: - return word - -def is_word_in_sentence(words: str, sentence: str) -> bool : - # words can contain plurals and several words - words = ' '.join(list(map(( lambda x: format_word_regex(x)), words.split(" ")))) - logging.debug(f"testing {words}") - # test https://regex101.com/r/ilvs9G/1/ - if re.search(rf"\b{words}(?![\w-])", sentence, re.IGNORECASE): - logging.debug(f"words {words} found in {sentence}") - return True - else: - return False - -def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str]) -> List[Optional[List[str]]]: - matching_themes = [] - keywords_with_timestamp = [] - - for theme, keywords in THEME_KEYWORDS.items(): - logging.debug(f"searching {theme} for {keywords}") - - matching_words = [word for word in keywords if is_word_in_sentence(word, plaintext)] - if matching_words: - logging.debug(f"theme found : {theme} with word {matching_words}") - matching_themes.append(theme) - # look for cts_in_ms inside matching_words (['économie circulaire', 'panneaux solaires', 'solaires'] from subtitle_duration - keywords_to_add = get_cts_in_ms_for_keywords(subtitle_duration, matching_words, theme) - if(len(keywords_to_add) == 0): - logging.warning(f"Check regex - Empty keywords but themes is there {theme} - matching_words {matching_words} - {subtitle_duration}") - keywords_with_timestamp.extend(keywords_to_add) - - if len(matching_themes) > 0: - return [matching_themes, keywords_with_timestamp, int(len(keywords_with_timestamp))] - else: - return [None, None, None] - -def log_min_max_date(df): - max_date = max(df['start']) - min_date = min(df['start']) - logging.info(f"Date min : {min_date}, max : {max_date}") - -def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame : - count_before_filtering = len(df) - logging.info(f"{count_before_filtering} subtitles to filter by keywords and tag with themes") - log_min_max_date(df) - - logging.info(f'tagging plaintext subtitle with keywords and theme : regexp - search taking time...') - # using swifter to speed up apply https://github.com/jmcarpenter2/swifter - df[['theme', u'keywords_with_timestamp', 'number_of_keywords']] = df[['plaintext','srt']].swifter.apply(lambda row: get_themes_keywords_duration(*row), axis=1, result_type='expand') - - # remove all rows that does not have themes - df = df.dropna(subset=['theme']) - - df.drop('srt', axis=1, inplace=True) - - logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us") - - return df - -def add_primary_key(df): - logging.info("Adding primary key to save to PG and have idempotent result") - try: - return ( - df["start"].astype(str) + df["channel_name"] - ).apply(get_consistent_hash) - except (Exception) as error: - logging.error(error) - return get_consistent_hash("empty") # TODO improve - should be a None ? - # "Randomly wait up to 2^x * 1 seconds between each retry until the range reaches 60 seconds, then randomly up to 60 seconds afterwards" # @see https://github.com/jd/tenacity/tree/main @retry(wait=wait_random_exponential(multiplier=1, max=60),stop=stop_after_attempt(7)) @@ -314,7 +221,7 @@ def log_dataframe_size(df, channel): logging.warning(f"High Dataframe size : {bytes_size / (1000 * 1000)}") if(len(df) == 1000): logging.error("We might lose data - df size is 1000 out of 1000 - we should divide this querry") - + async def main(): logger.info("Start api mediatree import") create_tables() @@ -324,7 +231,10 @@ async def main(): health_check_task = asyncio.create_task(run_health_check_server()) # Start batch job - asyncio.create_task(get_and_save_api_data(event_finish)) + if(os.environ.get("UPDATE") == "true"): + asyncio.create_task(update_pg_data(event_finish)) + else: + asyncio.create_task(get_and_save_api_data(event_finish)) # Wait for both tasks to complete await event_finish.wait() @@ -354,3 +264,5 @@ async def main(): asyncio.run(main()) sys.exit(0) + + diff --git a/quotaclimat/data_processing/mediatree/config.py b/quotaclimat/data_processing/mediatree/config.py new file mode 100644 index 00000000..84dd55ad --- /dev/null +++ b/quotaclimat/data_processing/mediatree/config.py @@ -0,0 +1,21 @@ +import os + +#read whole file to a string +def get_password(): + password = os.environ.get("MEDIATREE_PASSWORD") + if(password == '/run/secrets/pwd_api'): + password= open("/run/secrets/pwd_api", "r").read() + return password + +def get_auth_url(): + return os.environ.get("MEDIATREE_AUTH_URL") # + +def get_user(): + USER = os.environ.get("MEDIATREE_USER") + if(USER == '/run/secrets/username_api'): + USER=open("/run/secrets/username_api", "r").read() + return USER + +#https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList +def get_keywords_url(): + return os.environ.get("KEYWORDS_URL") \ No newline at end of file diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py new file mode 100644 index 00000000..ee28e203 --- /dev/null +++ b/quotaclimat/data_processing/mediatree/detect_keywords.py @@ -0,0 +1,131 @@ +import pandas as pd + +import logging + +from quotaclimat.data_processing.mediatree.utils import * +from quotaclimat.data_processing.mediatree.config import * +from postgres.schemas.models import keywords_table +from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS +from typing import List, Optional +from quotaclimat.data_ingestion.scrap_sitemap import get_consistent_hash +import re +import swifter + +def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]: + result = [] + + logging.debug(f"Looking for timecode for {keywords}") + for multiple_keyword in keywords: + all_keywords = multiple_keyword.split() # case with multiple words such as 'économie circulaire' + match = next((item for item in subtitle_duration if is_word_in_sentence(all_keywords[0], item.get('text'))), None) + logging.debug(f"match found {match} with {all_keywords[0].lower()}") + if match is not None: + logging.debug(f'Result added due to this match {match} based on {all_keywords[0]}') + result.append( + { + "keyword" :multiple_keyword.lower(), + "timestamp" : match['cts_in_ms'], + "theme" : theme + }) + + logging.debug(f"Timecode found {result}") + return result + +# be able to detect singular or plural for a word +def format_word_regex(word: str) -> str: + word = word.replace('\'', '\' ?') # case for d'eau -> d' eau + if not word.endswith('s') and not word.endswith('x') and not word.endswith('à'): + return word + "s?" + elif word.endswith('s'): + return word + '?' + else: + return word + +def is_word_in_sentence(words: str, sentence: str) -> bool : + # words can contain plurals and several words + words = ' '.join(list(map(( lambda x: format_word_regex(x)), words.split(" ")))) + logging.debug(f"testing {words}") + # test https://regex101.com/r/ilvs9G/1/ + if re.search(rf"\b{words}(?![\w-])", sentence, re.IGNORECASE): + logging.debug(f"words {words} found in {sentence}") + return True + else: + return False + +def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str]) -> List[Optional[List[str]]]: + matching_themes = [] + keywords_with_timestamp = [] + + for theme, keywords in THEME_KEYWORDS.items(): + logging.debug(f"searching {theme} for {keywords}") + + matching_words = [word for word in keywords if is_word_in_sentence(word, plaintext)] + if matching_words: + logging.debug(f"theme found : {theme} with word {matching_words}") + matching_themes.append(theme) + # look for cts_in_ms inside matching_words (['économie circulaire', 'panneaux solaires', 'solaires'] from subtitle_duration + keywords_to_add = get_cts_in_ms_for_keywords(subtitle_duration, matching_words, theme) + if(len(keywords_to_add) == 0): + logging.warning(f"Check regex - Empty keywords but themes is there {theme} - matching_words {matching_words} - {subtitle_duration}") + keywords_with_timestamp.extend(keywords_to_add) + + if len(matching_themes) > 0: + return [matching_themes, keywords_with_timestamp, count_keywords_duration_overlap(keywords_with_timestamp)] + else: + return [None, None, None] + +def log_min_max_date(df): + max_date = max(df['start']) + min_date = min(df['start']) + logging.info(f"Date min : {min_date}, max : {max_date}") + +def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame : + count_before_filtering = len(df) + logging.info(f"{count_before_filtering} subtitles to filter by keywords and tag with themes") + log_min_max_date(df) + + logging.info(f'tagging plaintext subtitle with keywords and theme : regexp - search taking time...') + # using swifter to speed up apply https://github.com/jmcarpenter2/swifter + df[['theme', u'keywords_with_timestamp', 'number_of_keywords']] = df[['plaintext','srt']].swifter.apply(lambda row: get_themes_keywords_duration(*row), axis=1, result_type='expand') + + # remove all rows that does not have themes + df = df.dropna(subset=['theme']) + + df.drop('srt', axis=1, inplace=True) + + logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us") + + return df + +def add_primary_key(df): + logging.info("Adding primary key to save to PG and have idempotent result") + try: + return ( + df["start"].astype(str) + df["channel_name"] + ).apply(get_consistent_hash) + except (Exception) as error: + logging.error(error) + return get_consistent_hash("empty") # TODO improve - should be a None ? + +def count_keywords_duration_overlap(keywords_with_timestamp: List[dict]) -> int: + if(len(keywords_with_timestamp)) <= 1: + return len(keywords_with_timestamp) + else: + # in case keywords are not in the right order + sorted_keywords = iter(sorted(keywords_with_timestamp, key=lambda x: x['timestamp'])) + + count = 1 + previous_timestamp = next(sorted_keywords)['timestamp'] + + for keyword_info in sorted_keywords: + current_timestamp = keyword_info['timestamp'] + overlap_time = current_timestamp - previous_timestamp + + if is_time_distance_between_keyword_enough(overlap_time): + logging.debug(f"No overlapping keyword {count} + 1 : {overlap_time}") + count += 1 + previous_timestamp = current_timestamp + else: + logging.debug(f"Keyword timestamp overlap : {overlap_time} - current count is {count}") + + return count \ No newline at end of file diff --git a/quotaclimat/data_processing/mediatree/update_pg_keywords.py b/quotaclimat/data_processing/mediatree/update_pg_keywords.py new file mode 100644 index 00000000..af2fa6ca --- /dev/null +++ b/quotaclimat/data_processing/mediatree/update_pg_keywords.py @@ -0,0 +1,37 @@ +### Library imports +import requests +import pandas as pd +import json + +import logging +from sqlalchemy.orm import Session +from postgres.schemas.models import Keywords +from quotaclimat.data_processing.mediatree.detect_keywords import * + +def update_keywords(session: Session) -> list: + saved_keywords = get_keywords_columns(session) + logging.info(f"Updating {len(saved_keywords)} saved keywords") + for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords in saved_keywords: + new_number_of_keywords = count_keywords_duration_overlap(keywords_with_timestamp) + logging.debug(f"{keyword_id} new value {new_number_of_keywords}") + update_number_of_keywords(session, keyword_id, new_number_of_keywords) + logging.info("updated all keywords") + + +def get_keywords_columns(session: Session) -> list: + return ( + session.query( + Keywords.id, + Keywords.plaintext, + Keywords.keywords_with_timestamp, + Keywords.number_of_keywords + ) + .all() + ) + +def update_number_of_keywords(session: Session, keyword_id: int, new_number_of_keywords: int): + session.query(Keywords).filter(Keywords.id == keyword_id).update( + {Keywords.number_of_keywords: new_number_of_keywords}, + synchronize_session=False + ) + session.commit() \ No newline at end of file diff --git a/quotaclimat/data_processing/mediatree/utils.py b/quotaclimat/data_processing/mediatree/utils.py index ec39cd71..d5102228 100644 --- a/quotaclimat/data_processing/mediatree/utils.py +++ b/quotaclimat/data_processing/mediatree/utils.py @@ -8,6 +8,13 @@ timezone='Europe/Paris' + +def get_keyword_time_separation_ms(): + return 15000 + +def is_time_distance_between_keyword_enough(overlap): + return overlap >= get_keyword_time_separation_ms() + def get_exact_days_from_week_day_name( start_date , end_date diff --git a/test/sitemap/test_mediatree.py b/test/sitemap/test_mediatree.py index b031cb44..085acca9 100644 --- a/test/sitemap/test_mediatree.py +++ b/test/sitemap/test_mediatree.py @@ -3,8 +3,9 @@ from bs4 import BeautifulSoup from utils import get_localhost, debug_df -from quotaclimat.data_processing.mediatree.api_import import format_word_regex, is_word_in_sentence, get_themes_keywords_duration, get_cts_in_ms_for_keywords, filter_and_tag_by_theme, parse_reponse_subtitle, get_includes_or_query, transform_theme_query_includes -import json +from quotaclimat.data_processing.mediatree.api_import import * +from quotaclimat.data_processing.mediatree.utils import * +from quotaclimat.data_processing.mediatree.detect_keywords import * from postgres.insert_data import save_to_pg from postgres.schemas.models import keywords_table, connect_to_db, get_keyword, drop_tables from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS @@ -413,6 +414,8 @@ def test_singular_plural_case_filter_and_tag_by_theme(): pd.testing.assert_frame_equal(df.reset_index(drop=True), expected_result.reset_index(drop=True)) def test_complexe_filter_and_tag_by_theme(): + original_timestamp = 1706437079004 + original_timestamp_first_keyword = original_timestamp + 6 df1 = pd.DataFrame([{ "start": 1704798000, "plaintext": "cheese pizza habitabilité de la planète conditions de vie sur terre animal", @@ -420,51 +423,51 @@ def test_complexe_filter_and_tag_by_theme(): "channel_radio": False, "srt": [{ "duration_ms": 34, - "cts_in_ms": 1706437079004, + "cts_in_ms": original_timestamp, "text": "cheese" },{ "duration_ms": 34, - "cts_in_ms": 1706437079005, + "cts_in_ms":original_timestamp + 5, "text": "pizza" },{ "duration_ms": 34, - "cts_in_ms": 1706437079006, + "cts_in_ms": original_timestamp_first_keyword, "text": "habitabilité" },{ "duration_ms": 34, - "cts_in_ms": 1706437079007, + "cts_in_ms":original_timestamp + 7, "text": "de" },{ "duration_ms": 34, - "cts_in_ms": 1706437079008, + "cts_in_ms":original_timestamp + 8, "text": "la" },{ "duration_ms": 34, - "cts_in_ms": 1706437079009, + "cts_in_ms":original_timestamp + 9, "text": "planète" },{ "duration_ms": 34, - "cts_in_ms": 1706437079010, + "cts_in_ms":original_timestamp + 10, "text": "conditions" },{ "duration_ms": 34, - "cts_in_ms": 1706437079011, + "cts_in_ms":original_timestamp + 11, "text": "de" },{ "duration_ms": 34, - "cts_in_ms": 1706437079011, + "cts_in_ms":original_timestamp + 11, "text": "vie" },{ "duration_ms": 34, - "cts_in_ms": 1706437079011, + "cts_in_ms":original_timestamp + 11, "text": "sur" },{ "duration_ms": 34, - "cts_in_ms": 1706437079011, + "cts_in_ms": original_timestamp_first_keyword + get_keyword_time_separation_ms(), "text": "terre" },{ "duration_ms": 34, - "cts_in_ms": 1706437079012, + "cts_in_ms": original_timestamp + 12, "text": "animal" }, ], @@ -500,7 +503,7 @@ def test_complexe_filter_and_tag_by_theme(): "theme":"ressources_naturelles_concepts_generaux", } ] - ,"number_of_keywords": 4 + ,"number_of_keywords": 2 }]) # List of words to filter on @@ -545,7 +548,7 @@ def test_save_to_pg_keyword(): "channel_radio": False, "theme": themes, "keywords_with_timestamp": keywords_with_timestamp - ,"number_of_keywords": 4 + ,"number_of_keywords": 1 }]) df['start'] = pd.to_datetime(df['start'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Europe/Paris') @@ -560,7 +563,7 @@ def test_save_to_pg_keyword(): assert result.channel_radio == False assert result.theme == themes assert result.keywords_with_timestamp == keywords_with_timestamp - assert result.number_of_keywords == 4 + assert result.number_of_keywords == 1 assert result.start == datetime.datetime(2024, 1, 28, 10, 17, 59, 6000) def test_is_word_in_sentence(): @@ -584,4 +587,103 @@ def test_format_word_regex(): assert format_word_regex("voitures") == "voitures?" assert format_word_regex("voiture") == "voitures?" assert format_word_regex("coraux") == "coraux" - assert format_word_regex("d'eau") == "d' ?eaus?" \ No newline at end of file + assert format_word_regex("d'eau") == "d' ?eaus?" + +def test_overlap_count_keywords_duration_overlap(): + original_timestamp = 1708010919000 + keywords_with_timestamp = [{ + "keyword" : 'habitabilité de la planète', + "timestamp": original_timestamp + 1, + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'conditions de vie sur terre', + "timestamp": original_timestamp + 2, + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'planète', + "timestamp": original_timestamp + 3, + "theme":"ressources_naturelles_concepts_generaux", + }, + { + "keyword" : 'terre', + "timestamp": original_timestamp + 4, + "theme":"ressources_naturelles_concepts_generaux", + } + ] + + assert count_keywords_duration_overlap(keywords_with_timestamp) == 1 + +def test_no_overlap_count_keywords_duration_overlap(): + original_timestamp = 1708010900000 + keywords_with_timestamp = [{ + "keyword" : 'habitabilité de la planète', + "timestamp": original_timestamp + get_keyword_time_separation_ms(), + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'conditions de vie sur terre', + "timestamp": original_timestamp + 2 * get_keyword_time_separation_ms(), + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'planète', + "timestamp": original_timestamp + 3* get_keyword_time_separation_ms(), + "theme":"ressources_naturelles_concepts_generaux", + }, + { + "keyword" : 'terre', + "timestamp": original_timestamp + 4 * get_keyword_time_separation_ms(), + "theme":"ressources_naturelles_concepts_generaux", + } + ] + + assert count_keywords_duration_overlap(keywords_with_timestamp) == 4 + +def test_with_a_mix_of_overlap_count_keywords_duration_overlap(): + original_timestamp = 1708010900000 + keywords_with_timestamp = [{ + "keyword" : 'habitabilité de la planète', + "timestamp": original_timestamp, # count for one + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'conditions de vie sur terre', + "timestamp": original_timestamp + get_keyword_time_separation_ms() / 2, + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'planète', + "timestamp": original_timestamp + get_keyword_time_separation_ms(), # count for one + "theme":"ressources_naturelles_concepts_generaux", + }, + { + "keyword" : 'terre', + "timestamp": original_timestamp + get_keyword_time_separation_ms() + 2000, + "theme":"ressources_naturelles_concepts_generaux", + }, + { + "keyword" : 'terre', + "timestamp": original_timestamp + get_keyword_time_separation_ms() + 10000, + "theme":"ressources_naturelles_concepts_generaux", + }, + { + "keyword" : 'terre', + "timestamp": original_timestamp + get_keyword_time_separation_ms() * 2, # count for one + "theme":"ressources_naturelles_concepts_generaux", + } + ] + + assert count_keywords_duration_overlap(keywords_with_timestamp) == 3 + +def test_only_one_count_keywords_duration_overlap(): + original_timestamp = 1708010900000 + keywords_with_timestamp = [{ + "keyword" : 'habitabilité de la planète', + "timestamp": original_timestamp, # count for one + "theme":"changement_climatique_constat", + } + ] + + assert count_keywords_duration_overlap(keywords_with_timestamp) == 1 \ No newline at end of file diff --git a/test/sitemap/test_update_pg_keywords.py b/test/sitemap/test_update_pg_keywords.py new file mode 100644 index 00000000..734b1f7a --- /dev/null +++ b/test/sitemap/test_update_pg_keywords.py @@ -0,0 +1,67 @@ +import logging + +from quotaclimat.data_processing.mediatree.update_pg_keywords import * + +from postgres.insert_data import (clean_data, + insert_data_in_sitemap_table) +from quotaclimat.data_ingestion.scrap_sitemap import (add_primary_key, get_consistent_hash) + +from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db +from postgres.insert_data import save_to_pg +from quotaclimat.data_processing.mediatree.detect_keywords import * +def test_insert_data_in_sitemap_table(): + create_tables() + session = get_db_session() + conn = connect_to_db() + wrong_value = 0 + # insezrt data + primary_key = "test_save_to_pg_keyword" + keywords_with_timestamp = [{ + "keyword" : 'habitabilité de la planète', + "timestamp": 1706437079006, + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'conditions de vie sur terre', + "timestamp": 1706437079010, + "theme":"changement_climatique_constat", + }, + { + "keyword" : 'planète', + "timestamp": 1706437079009, + "theme":"ressources_naturelles_concepts_generaux", + }, + { + "keyword" : 'terre', + "timestamp": 1706437079011, + "theme":"ressources_naturelles_concepts_generaux", + } + ] + themes = [ + "changement_climatique_constat", + "ressources_naturelles_concepts_generaux", + ] + channel_name = "m6" + df = pd.DataFrame([{ + "id" : primary_key, + "start": 1706437079006, + "plaintext": "cheese pizza habitabilité de la planète conditions de vie sur terre animal", + "channel_name": channel_name, + "channel_radio": False, + "theme": themes, + "keywords_with_timestamp": keywords_with_timestamp + ,"number_of_keywords": wrong_value # wrong data to reapply our custom logic for "new_value" + }]) + df['start'] = pd.to_datetime(df['start'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Europe/Paris') + + assert save_to_pg(df, keywords_table, conn) == 1 + + # check the value is well existing + result_before_update = get_keyword(primary_key) + update_keywords(session) + result_after_update = get_keyword(primary_key) + + new_value = count_keywords_duration_overlap(keywords_with_timestamp) + assert result_after_update.id == result_before_update.id + assert result_after_update.number_of_keywords == new_value + assert result_before_update.number_of_keywords == wrong_value