From 50b2f2c3f8c35ac79d70cbd08e277322a0b2402e Mon Sep 17 00:00:00 2001 From: Paul Leclercq Date: Wed, 19 Jun 2024 15:49:28 +0200 Subject: [PATCH] fix: window duration (#186) --- docker-compose.yml | 2 +- .../mediatree/channel_program.py | 1 + .../mediatree/detect_keywords.py | 34 +++++++++--------- test/sitemap/test_detect_keywords.py | 36 ++++++++++--------- 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 6f0848729..21692cd08 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -147,7 +147,7 @@ services: # SENTRY_DSN: prod_only # UPDATE: "true" # to batch update PG #UPDATE_PROGRAM_ONLY: "true" # to batch update PG but only channel with program - START_OFFSET: 1 # to batch update PG from a offset + # START_OFFSET: 1 # to batch update PG from a offset #BATCH_SIZE: 50000 # number of records to update in one batch #NUMBER_OF_BATCH: 4 # number of batch size to process # START_DATE: 1717227223 # to test batch import diff --git a/quotaclimat/data_processing/mediatree/channel_program.py b/quotaclimat/data_processing/mediatree/channel_program.py index fbd637c2f..c15175b0f 100644 --- a/quotaclimat/data_processing/mediatree/channel_program.py +++ b/quotaclimat/data_processing/mediatree/channel_program.py @@ -22,6 +22,7 @@ def get_programs(): "program_name":pd.StringDtype, "program_type":pd.StringDtype } + logging.debug(f"Reading {json_file_path}") df_programs = pd.read_json(json_file_path, lines=True, dtype=data_dtype) df_programs['start'] = format_hour_minute(df_programs['start']) diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py index 4cca79c23..24844abc9 100644 --- a/quotaclimat/data_processing/mediatree/detect_keywords.py +++ b/quotaclimat/data_processing/mediatree/detect_keywords.py @@ -116,7 +116,7 @@ def remove_stopwords(plaintext: str) -> str: @sentry_sdk.trace def get_themes_keywords_duration(plaintext: str, 
subtitle_duration: List[str], start: datetime): keywords_with_timestamp = [] - + number_of_elements_in_array = 17 plaitext_without_stopwords = remove_stopwords(plaintext) logging.debug(f"display datetime start {start}") @@ -144,29 +144,29 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s keywords_with_timestamp_40 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=40) filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_15) + # TODO refactor this return array and else return [ get_themes(keywords_with_timestamp_15), # theme clean_metadata(keywords_with_timestamp_15), # keywords count_keywords_duration_overlap(filtered_keywords_with_timestamp, start), # number_of_keywords - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_constat"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_causes"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_consequences"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"attenuation_climatique_solutions"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"adaptation_climatique_solutions"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"ressources"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"ressources_solutions"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_concepts_generaux"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_causes"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_consequences"), - count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_solutions") + 
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_constat"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_causes"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_consequences"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="attenuation_climatique_solutions"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="adaptation_climatique_solutions"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources_solutions"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_concepts_generaux"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_causes"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_consequences"), + count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_solutions") # number_of_keywords with special duration to compare duration - ,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_20), start,f"keywords_20") - ,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_30), start,f"keywords_30") - ,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_40), start,f"keywords_40") + ,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_20), start) # number_of_keywords_20 + ,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_30), start) # number_of_keywords_30 + ,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_40), start) # number_of_keywords_40 ] - else: - return 
[None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None] # TODO refacto me + return [None] * number_of_elements_in_array def get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds: int = 15): logging.debug(f"using duration_seconds {duration_seconds}") diff --git a/test/sitemap/test_detect_keywords.py b/test/sitemap/test_detect_keywords.py index 253dd1e3b..4a18f33fa 100644 --- a/test/sitemap/test_detect_keywords.py +++ b/test/sitemap/test_detect_keywords.py @@ -113,6 +113,9 @@ def test_one_theme_get_themes_keywords_duration(): assert compare_unordered_lists_of_dicts(keywords_output, keywords) assert number_of_keywords == 1 + assert number_of_keywords_20 == 1 + assert number_of_keywords_30 == 1 + assert number_of_keywords_40 == 1 assert number_of_changement_climatique_constat == 1 assert number_of_changement_climatique_causes_directes == 0 assert number_of_changement_climatique_consequences == 0 @@ -490,9 +493,9 @@ def test_lower_case_filter_and_tag_by_theme(): "number_of_biodiversite_causes_directes": 0, "number_of_biodiversite_consequences": 0, "number_of_biodiversite_solutions_directes" :0 - ,'number_of_keywords_20':0, - 'number_of_keywords_30':0, - 'number_of_keywords_40':0 + ,'number_of_keywords_20':1, + 'number_of_keywords_30':1, + 'number_of_keywords_40':1 }]) # List of words to filter on @@ -543,9 +546,9 @@ def test_singular_plural_case_filter_and_tag_by_theme(): "number_of_biodiversite_causes_directes": 0, "number_of_biodiversite_consequences": 0, "number_of_biodiversite_solutions_directes" :0 - ,'number_of_keywords_20':0, - 'number_of_keywords_30':0, - 'number_of_keywords_40':0 + ,'number_of_keywords_20':1, + 'number_of_keywords_30':1, + 'number_of_keywords_40':1 }]) # List of words to filter on @@ -638,9 +641,9 @@ def test_complexe_filter_and_tag_by_theme(): "number_of_biodiversite_causes_directes": 0, "number_of_biodiversite_consequences": 0, 
"number_of_biodiversite_solutions_directes" :0 - ,'number_of_keywords_20':0, - 'number_of_keywords_30':0, - 'number_of_keywords_40':0 + ,'number_of_keywords_20':2, + 'number_of_keywords_30':2, + 'number_of_keywords_40':2 }]) # List of words to filter on @@ -711,7 +714,8 @@ def test_overlap_count_keywords_duration_overlap(): assert count_keywords_duration_overlap(tag_wanted_duration_second_window_number(keywords_with_timestamp, start), start) == 1 -def test_no_overlap_count_keywords_duration_overlap(): +def test_20_seconds_no_overlap_count_keywords_duration_overlap(): + duration = 20 keywords_with_timestamp = [{ "keyword" : 'habitabilité de la planète', "timestamp": original_timestamp, @@ -719,32 +723,32 @@ def test_no_overlap_count_keywords_duration_overlap(): }, { "keyword" : 'conditions de vie sur terre', - "timestamp": original_timestamp + 1 * get_keyword_time_separation_ms(), + "timestamp": original_timestamp + 1 * get_keyword_time_separation_ms(duration), "theme":"changement_climatique_constat", }, { "keyword" : 'planète', - "timestamp": original_timestamp + 2 * get_keyword_time_separation_ms(), + "timestamp": original_timestamp + 2 * get_keyword_time_separation_ms(duration), "theme":"ressources", # resources does count now }, { "keyword" : 'terre', - "timestamp": original_timestamp + 3 * get_keyword_time_separation_ms(), + "timestamp": original_timestamp + 3 * get_keyword_time_separation_ms(duration), "theme":"ressources", # resources does count now }, { "keyword" : 'habitabilité de la planète', - "timestamp": original_timestamp + 4 * get_keyword_time_separation_ms(), + "timestamp": original_timestamp + 4 * get_keyword_time_separation_ms(duration), "theme":"changement_climatique_constat", }, { "keyword" : 'conditions de vie sur terre', - "timestamp": original_timestamp + 5 * get_keyword_time_separation_ms(), + "timestamp": original_timestamp + 5 * get_keyword_time_separation_ms(duration), "theme":"changement_climatique_constat", }, ] - assert 
count_keywords_duration_overlap(tag_wanted_duration_second_window_number(keywords_with_timestamp, start),start) == 6 + assert count_keywords_duration_overlap(tag_wanted_duration_second_window_number(keywords_with_timestamp, start, duration),start) == 6 def test_with_a_mix_of_overlap_count_keywords_duration_overlap(): keywords_with_timestamp = [{