diff --git a/README.md b/README.md index 7d438e832..3a1133ae8 100644 --- a/README.md +++ b/README.md @@ -247,9 +247,11 @@ Thanks to the nginx container, we can have a local server for sitemap : ``` docker compose up -d nginx # used to scrap sitemap locally - a figaro like website with only 3 news -pytest test # "test" is the folder containing tests +# docker compose up test with entrypoint modified to sleep +# docker exec test bash +pytest -vv --log-level DEBUG test # "test" is the folder containing tests # Only one test -pytest -k 'mediatree' +pytest -vv --log-level DEBUG -k detect # OR docker compose up test # test is the container name running pytest test ``` diff --git a/docker-compose.yml b/docker-compose.yml index 5c3f6ddfe..23d2a1c5b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,8 +5,8 @@ services: build: context: ./ dockerfile: Dockerfile - entrypoint: ["poetry", "run", "pytest","-vv", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"] - # entrypoint: ["sleep", "12000"] # use to debug the container if needed + entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"] + #entrypoint: ["sleep", "12000"] # use to debug the container if needed environment: ENV: docker # CHANNEL: "fr3-idf" @@ -24,6 +24,7 @@ services: - ./postgres/:/app/postgres/ - ./test/:/app/test/ - ./app.py:/app/app.py + - ./pyproject.toml:/app/pyproject.toml depends_on: nginxtest: condition: service_healthy diff --git a/postgres/insert_data.py b/postgres/insert_data.py index cf5385438..fe85f1740 100644 --- a/postgres/insert_data.py +++ b/postgres/insert_data.py @@ -16,7 +16,7 @@ def clean_data(df: pd.DataFrame): # from https://stackoverflow.com/a/69421596/3535853 def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter): data = [dict(zip(keys, row)) for row in data_iter] - logging.debug("data_iter %s", data) + insert_statement = insert(table.table).values(data) on_duplicate_key_stmt = insert_statement.on_conflict_do_update( @@ -24,7 +24,6 @@ def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter): set_={c.key: c for c in insert_statement.excluded}, ) - logging.debug("insert_statement %s", on_duplicate_key_stmt) return conn.execute(on_duplicate_key_stmt) diff --git a/pyproject.toml b/pyproject.toml index 83ed54010..d8e4771fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,9 @@ name = "pypi-public" url = "https://pypi.org/simple/" priority = "primary" +[tool.pytest.ini_options] +log_cli = 1 +log_cli_level = "DEBUG" [[tool.poetry.source]] name = "PyPI" diff --git a/quotaclimat/data_processing/mediatree/api_import.py b/quotaclimat/data_processing/mediatree/api_import.py index b31b72fe5..25784401a 100644 --- a/quotaclimat/data_processing/mediatree/api_import.py +++ b/quotaclimat/data_processing/mediatree/api_import.py @@ -8,7 +8,7 @@ import sys import os from quotaclimat.utils.healthcheck_config import run_health_check_server -from quotaclimat.utils.logger import CustomFormatter +from quotaclimat.utils.logger import getLogger from quotaclimat.data_processing.mediatree.utils import * from quotaclimat.data_processing.mediatree.config import * from quotaclimat.data_processing.mediatree.update_pg_keywords import * @@ -214,9 +214,8 @@ def parse_number_pages(response_sub) -> int : def parse_reponse_subtitle(response_sub, channel = None) -> Optional[pd.DataFrame]: with sentry_sdk.start_transaction(op="task", name="parse_reponse_subtitle"): - logging.debug(f"Parsing json response:\n {response_sub}") - total_results = parse_total_results(response_sub) + logging.getLogger("modin.logger.default").setLevel(logging.WARNING) if(total_results > 0): logging.info(f"{total_results} 'total_results' field") @@ -230,8 +229,7 @@ def parse_reponse_subtitle(response_sub, channel = None) -> Optional[pd.DataFram new_df.rename(columns={'channel.name':'channel_name', 'channel.radio': 'channel_radio', 'timestamp':'start'}, inplace=True) log_dataframe_size(new_df, channel) - - logging.debug("Parsed %s" % (new_df.head(1).to_string())) + logging.debug("Parsed Schema\n%s", new_df.dtypes) return new_df @@ -267,17 +265,7 @@ async def main(): sys.exit(0) if __name__ == "__main__": - # create logger with 'spam_application' - logger = logging.getLogger() - logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper()) - - # create console handler with a higher log level - if (logger.hasHandlers()): - logger.handlers.clear() - ch = logging.StreamHandler() - ch.setFormatter(CustomFormatter()) - logger.addHandler(ch) - + getLogger() asyncio.run(main()) sys.exit(0) diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py index 0fa36ca75..2029ed758 100644 --- a/quotaclimat/data_processing/mediatree/detect_keywords.py +++ b/quotaclimat/data_processing/mediatree/detect_keywords.py @@ -12,7 +12,7 @@ import sentry_sdk import modin.pandas as pd import dask - +from quotaclimat.utils.logger import getLogger dask.config.set({'dataframe.query-planning': True}) def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]: @@ -49,7 +49,7 @@ def format_word_regex(word: str) -> str: def is_word_in_sentence(words: str, sentence: str) -> bool : # words can contain plurals and several words words = ' '.join(list(map(( lambda x: format_word_regex(x)), words.split(" ")))) - logging.debug(f"testing {words}") + # test https://regex101.com/r/ilvs9G/1/ if re.search(rf"\b{words}(?![\w-])", sentence, re.IGNORECASE): logging.debug(f"words {words} found in {sentence}") @@ -57,10 +57,42 @@ def is_word_in_sentence(words: str, sentence: str) -> bool : else: return False -# some keywords are contained inside other keywords, we need to filter them -def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict]) -> List[dict]: - # Group keywords by timestamp +def set_timestamp_with_margin(keywords_with_timestamp: List[dict]) -> List[dict]: + number_of_keywords = len(keywords_with_timestamp) + if number_of_keywords > 1: + for i in range(len(keywords_with_timestamp) - 1): + current_timestamp = keywords_with_timestamp[i].get("timestamp") + next_timestamp = keywords_with_timestamp[i + 1].get("timestamp") + current_keyword = keywords_with_timestamp[i].get("keyword") + next_keyword = keywords_with_timestamp[i + 1].get("keyword") + + if current_timestamp is not None and next_timestamp is not None: + if next_timestamp - current_timestamp < 1000: + current_keyword = keywords_with_timestamp[i].get("keyword") + next_keyword = keywords_with_timestamp[i + 1].get("keyword") + if len(current_keyword) > len(next_keyword): + shortest_word = next_keyword + longest_word = current_keyword + timestamp_to_change = current_timestamp + else: + shortest_word = current_keyword + longest_word = next_keyword + timestamp_to_change = next_timestamp + + if shortest_word in longest_word: + logging.info(f"Close keywords - we group them {shortest_word} - {longest_word}") + keywords_with_timestamp[i]["timestamp"] = timestamp_to_change + keywords_with_timestamp[i+1]["timestamp"] = timestamp_to_change + + return keywords_with_timestamp + +# some keywords are contained inside other keywords, we need to filter them +def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> List[dict]: + logging.debug(f"Filtering keywords with same timestamp with a margin of one second") + number_of_keywords = len(keywords_with_timestamp) + keywords_with_timestamp = set_timestamp_with_margin(keywords_with_timestamp) + # Group keywords by timestamp - with a margin of 1 second grouped_keywords = {timestamp: list(group) for timestamp, group in groupby(keywords_with_timestamp, key=lambda x: x['timestamp'])} # Filter out keywords with the same timestamp and keep the longest keyword @@ -68,6 +100,10 @@ def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict]) -> L max(group, key=lambda x: len(x['keyword'])) for group in grouped_keywords.values() ] + final_result = len(result) + + if final_result < number_of_keywords: + logging.info(f"Filtering keywords {final_result} out of {number_of_keywords} | {keywords_with_timestamp} with final result {result}") return result diff --git a/quotaclimat/utils/logger.py b/quotaclimat/utils/logger.py index 4763307f7..ed7848f8f 100644 --- a/quotaclimat/utils/logger.py +++ b/quotaclimat/utils/logger.py @@ -1,5 +1,5 @@ import logging - +import os class CustomFormatter(logging.Formatter): grey = "\x1b[38;20m" @@ -21,4 +21,17 @@ class CustomFormatter(logging.Formatter): def format(self, record): log_fmt = self.FORMATS.get(record.levelno) formatter = logging.Formatter(log_fmt) - return formatter.format(record) \ No newline at end of file + return formatter.format(record) + +def getLogger(): + # create logger with 'spam_application' + logger = logging.getLogger() + logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper()) + # create console handler with a higher log level + if (logger.hasHandlers()): + logger.handlers.clear() + ch = logging.StreamHandler() + ch.setFormatter(CustomFormatter()) + logger.addHandler(ch) + + return logger \ No newline at end of file diff --git a/test/sitemap/test_detect_keywords.py b/test/sitemap/test_detect_keywords.py index d4166f3b8..b3c6f47d3 100644 --- a/test/sitemap/test_detect_keywords.py +++ b/test/sitemap/test_detect_keywords.py @@ -67,6 +67,12 @@ def test_get_themes_keywords_duration(): ,"adaptation_climatique_solutions_directes" ],[], 0] + + assert get_themes_keywords_duration("il rencontre aussi une crise majeure de la pénurie de l' offre laetitia jaoude des barrages sauvages", subtitles, start) == [[ + "changement_climatique_consequences" + ,"atténuation_climatique_solutions_directes" + ],[], 0] + def test_get_cts_in_ms_for_keywords(): str = [{ "duration_ms": 34, @@ -310,7 +316,7 @@ def test_complexe_filter_and_tag_by_theme(): "text": "planète" },{ "duration_ms": 34, - "cts_in_ms":original_timestamp + 10, + "cts_in_ms": original_timestamp + get_keyword_time_separation_ms(), "text": "conditions" },{ "duration_ms": 34, @@ -354,16 +360,16 @@ def test_complexe_filter_and_tag_by_theme(): ], "keywords_with_timestamp": [{ "keyword" : 'habitabilité de la planète', - "timestamp": 1706437079006, # count for one + "timestamp": original_timestamp_first_keyword, # count for one "theme":"changement_climatique_constat", }, { "keyword" : 'conditions de vie sur terre', - "timestamp": 1706437079010, # timestamp too close + "timestamp": original_timestamp + get_keyword_time_separation_ms(), # timestamp too close "theme":"changement_climatique_constat", } ] - ,"number_of_keywords": 1 + ,"number_of_keywords": 2 }]) # List of words to filter on @@ -614,7 +620,7 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp(): assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected -def test_keyword_inside_keyword_filter_keyword_with_same_timestamp(): +def test_keyword_2words_inside_keyword_filter_keyword_with_same_timestamp(): keywords_with_timestamp = [{ "keyword" : 'agriculture', "timestamp": original_timestamp, @@ -636,6 +642,59 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp(): assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected +# we should keep the longest keyword, even it's come before the first one +def test_keyword_second_word_a_bit_later_inside_keyword_filter_keyword_with_same_timestamp(): + later_timestamp = original_timestamp + 960 # from real data + keywords_with_timestamp = [{ + "keyword" : 'carbone', + "timestamp": later_timestamp, + "theme":"changement_climatique_causes_directes", + }, + { + "keyword" : 'béton bas carbone', + "timestamp": original_timestamp, # same timestamp, so we take longest keyword + "theme":"atténuation_climatique_solutions_directes", # different theme, keep this one + } + ] + + expected = [{ + "keyword" : 'béton bas carbone', + "timestamp": original_timestamp, # same timestamp, so we take longest keyword + "theme":"atténuation_climatique_solutions_directes", # different theme, keep this one + } + ] + + assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected + +# we should keep the longest keyword, even it's come before the first one +def test_keyword_second_word_to_keep_inside_keyword_filter_keyword_with_same_timestamp(): + keywords_with_timestamp = [{ + "theme": "changement_climatique_consequences", + "timestamp": 1707627703040, + "keyword": "pénurie" + }, + { + "theme":"atténuation_climatique_solutions_directes", # different theme, keep this one + "timestamp": 1707627708051, + "keyword": "barrages" + }, + ] + + expected = [ + { + "keyword": "pénurie", + "timestamp": 1707627703040, + "theme": "changement_climatique_consequences", + }, + { + "keyword" : 'barrages', + "timestamp": 1707627708051, # same timestamp, so we take longest keyword + "theme":"atténuation_climatique_solutions_directes", # different theme, keep this one + } + ] + + assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected + def test_filter_keyword_with_same_timestamp(): keywords_with_timestamp = [{ #nothing to filter "keyword" : "période la plus chaude", @@ -644,7 +703,7 @@ def test_filter_keyword_with_same_timestamp(): }, { "keyword" : "élévation du niveau de la mer", - "timestamp": original_timestamp + 1, + "timestamp": original_timestamp + 1200, # margin superior to 1000ms "theme":"changement_climatique_consequences", } ] diff --git a/test/sitemap/test_main_import_api.py b/test/sitemap/test_main_import_api.py index 5f0949811..9174017d7 100644 --- a/test/sitemap/test_main_import_api.py +++ b/test/sitemap/test_main_import_api.py @@ -51,11 +51,12 @@ def test_second_row_api_import(): primary_key = "67b9cc593516b40f55d6a3e89b377fccc8ab76d263c5fd6d4bfe379626190641" specific_keyword = get_keyword(primary_key) assert specific_keyword.theme == [ - "changement_climatique_constat", - "changement_climatique_causes_indirectes", - "changement_climatique_consequences", - "atténuation_climatique_solutions_directes" + "changement_climatique_constat", + "changement_climatique_causes_indirectes", + "changement_climatique_consequences", + "atténuation_climatique_solutions_directes" ] + assert specific_keyword.keywords_with_timestamp == [ # from metabase to speedup check { "keyword": "écologique", @@ -67,11 +68,6 @@ def test_second_row_api_import(): "timestamp": 1707627631076, "theme": "changement_climatique_constat" }, - { - "keyword": "pétrole", - "timestamp": 1707627629004, - "theme": "changement_climatique_causes_indirectes" - }, { "keyword": "puits de pétrole", "timestamp": 1707627628054, @@ -92,7 +88,7 @@ def test_second_row_api_import(): "timestamp": 1707627686004, "theme": "atténuation_climatique_solutions_directes" } - ] + ] assert specific_keyword.number_of_keywords == 4 def test_third_row_api_import(): diff --git a/test/sitemap/test_mediatree_utils.py b/test/sitemap/test_mediatree_utils.py index 3d659b295..7b6677649 100644 --- a/test/sitemap/test_mediatree_utils.py +++ b/test/sitemap/test_mediatree_utils.py @@ -2,7 +2,7 @@ import pandas as pd from utils import get_localhost -from quotaclimat.data_processing.mediatree.utils import get_yesterday, get_date_range, get_start_end_date_env_variable_with_default, is_it_tuesday +from quotaclimat.data_processing.mediatree.utils import * import logging from time import strftime,localtime