diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py
index d5c82694..7df795a4 100644
--- a/quotaclimat/data_processing/mediatree/detect_keywords.py
+++ b/quotaclimat/data_processing/mediatree/detect_keywords.py
@@ -307,11 +307,40 @@ def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start:
 def count_different_window_number(keywords_with_timestamp: List[dict], start: datetime) -> int:
     window_numbers = [item['window_number'] for item in keywords_with_timestamp if 'window_number' in item]
     final_count = len(set(window_numbers))
-    logging.debug(f"Count with 15 second logic: {final_count} keywords")
+    logging.debug(f"Count with {DEFAULT_WINDOW_DURATION} second logic: {final_count} keywords")
 
     return final_count
 
-def contains_direct_keywords(keywords_with_timestamp: List[dict]) -> bool:
+def get_subject_from_theme(theme: str) -> str:
+    """Return the subject a theme belongs to.
+
+    The theme is searched for the substrings 'climatique', 'biodiversite' and
+    'ressources', in that order, giving 'climat', 'biodiversite' or
+    'ressources'; a theme containing none of them yields 'unknown'.
+    """
+    if 'climatique' in theme:
+        return 'climat'
+    if 'biodiversite' in theme:
+        return 'biodiversite'
+    if 'ressources' in theme:
+        return 'ressources'
+    return 'unknown'
+
+def contains_direct_keywords_same_suject(keywords_with_timestamp: List[dict], theme: str) -> bool:
+    """Tell whether a direct keyword of the same subject as `theme` is present.
+
+    Only keywords whose theme maps to the same subject (climate, biodiversity
+    or resources) as `theme` are considered; a keyword is direct when
+    `indirectes` does not occur in its theme. Themes matching no known subject
+    all map to 'unknown' and are therefore treated as sharing one subject.
+    """
+    subject = get_subject_from_theme(theme)
+    logging.debug(f"subject {subject}")
+    # keep only keywords with timestamp from the same subject
+    keywords_with_timestamp = [
+        kw for kw in keywords_with_timestamp
+        if get_subject_from_theme(kw['theme']) == subject
+    ]
     return any(indirectes not in kw['theme'] for kw in keywords_with_timestamp)
 
 # we want to count false positive near of 15" of positive keywords
@@ -326,7 +355,8 @@ def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[
                     , keywords_with_timestamp)
             )
 
-            if( contains_direct_keywords(neighbour_keywords) ) :
+            if( contains_direct_keywords_same_suject(neighbour_keywords, keyword_info['theme']) ) :
+                logging.debug(f"Transforming false positive to positive { keyword_info['keyword']} { keyword_info['theme']}")
                 keyword_info['theme'] = remove_indirect(keyword_info['theme'])
 
     return keywords_with_timestamp
diff --git a/test/sitemap/test_detect_keywords.py b/test/sitemap/test_detect_keywords.py
index 30703210..ba895be8 100644
--- a/test/sitemap/test_detect_keywords.py
+++ b/test/sitemap/test_detect_keywords.py
@@ -1328,19 +1328,19 @@ def test_different_steps_transform_false_positive_keywords_to_positive():
         },
         {'keyword': 'agroforesterie',
          'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 + 150,
-         'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
+         'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
         },
         {'keyword': 'alternative durable',
          'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 + 150,
-         'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
+         'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
         },
         {'keyword': 'planification écologique',
          'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 4 + 150,
-         'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
+         'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
         },
         {'keyword': 'nucléaire',
          'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 6 + 150,
-         'theme': 'attenuation_climatique_solutions_indirectes' # should be stayed to indirect
+         'theme': 'attenuation_climatique_solutions_indirectes' # should stay indirect
         }
     ]
 
@@ -1379,6 +1379,145 @@ def test_different_steps_transform_false_positive_keywords_to_positive():
 
     assert transform_false_positive_keywords_to_positive(tag_wanted_duration_second_window_number(keywords_with_timestamp,start, duration_seconds=15), start) == expected_output
 
+def test_transform_false_positive_keywords_to_positive_different_and_same_subject():
+    """An indirect climate keyword near a direct climate keyword becomes direct; indirect biodiversity keywords stay indirect."""
+    keywords_with_timestamp = [
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': "activisme climatique",
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 151,
+         'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'agroforesterie',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'alternative durable',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'planification écologique',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 4 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'nucléaire',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 6 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        }
+    ]
+
+    expected_output = [
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'window_number': 0,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': "activisme climatique",
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 151,
+         'window_number': 1,
+         'theme': 'attenuation_climatique_solutions' # should be transformed to direct
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 150,
+         'window_number': 1,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'agroforesterie',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 + 150,
+         'window_number': 2,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'alternative durable',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 + 150,
+         'window_number': 3,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'planification écologique',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 4 + 150,
+         'window_number': 4,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'nucléaire',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 6 + 150,
+         'window_number': 6,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        }
+    ]
+
+    assert transform_false_positive_keywords_to_positive(tag_wanted_duration_second_window_number(keywords_with_timestamp,start, duration_seconds=15), start) == expected_output
+
+
+
+def test_transform_false_positive_keywords_to_positive_different_subject():
+    """Indirect biodiversity keywords are not promoted by a neighbouring direct climate keyword."""
+    keywords_with_timestamp = [
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'agroforesterie',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'alternative durable',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'planification écologique',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 4 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'nucléaire',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 6 + 150,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        }
+    ]
+
+    expected_output = [
+        {'keyword': 'climatique',
+         'timestamp': original_timestamp + 150,
+         'window_number': 0,
+         'theme': 'changement_climatique_constat'
+        },
+        {'keyword': 'industrie verte',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 150,
+         'window_number': 1,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'agroforesterie',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 + 150,
+         'window_number': 2,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'alternative durable',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 + 150,
+         'window_number': 3,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'planification écologique',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 4 + 150,
+         'window_number': 4,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        },
+        {'keyword': 'nucléaire',
+         'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 6 + 150,
+         'window_number': 6,
+         'theme': 'biodiversite_concepts_generaux_indirectes' # should stay indirect
+        }
+    ]
+
+    assert transform_false_positive_keywords_to_positive(tag_wanted_duration_second_window_number(keywords_with_timestamp,start, duration_seconds=15), start) == expected_output
+
 
 def test_count_different_window_number():
     keywords_with_timestamp = [