Skip to content

Commit

Permalink
wip: add high risk false positive column
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus committed Nov 25, 2024
1 parent e383cf0 commit bb1f7f7
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 44 deletions.
48 changes: 26 additions & 22 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def remove_stopwords(plaintext: str) -> str:
@sentry_sdk.trace
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
keywords_with_timestamp = []
number_of_elements_in_array = 17
number_of_elements_in_array = 28
default_window_in_seconds = DEFAULT_WINDOW_DURATION
plaitext_without_stopwords = remove_stopwords(plaintext)
logging.debug(f"display datetime start {start}")
Expand Down Expand Up @@ -192,20 +192,21 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])

number_of_changement_climatique_constat_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["changement_climatique_constat"])
number_of_changement_climatique_causes_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["changement_climatique_causes"])
number_of_changement_climatique_consequences_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["changement_climatique_consequences"])
number_of_attenuation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"])
number_of_adaptation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"])
number_of_ressources_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["ressources"])
number_of_ressources_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["ressources_solutions"])
number_of_biodiversite_concepts_generaux_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"])
number_of_biodiversite_causes_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_causes"])
number_of_biodiversite_consequences_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_consequences"])
number_of_biodiversite_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_solutions"])

return [

# No high risk of false positive counters
number_of_changement_climatique_constat_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["changement_climatique_constat"], count_high_risk_false_positive=False)
number_of_changement_climatique_causes_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["changement_climatique_causes"], count_high_risk_false_positive=False)
number_of_changement_climatique_consequences_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["changement_climatique_consequences"], count_high_risk_false_positive=False)
number_of_attenuation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"], count_high_risk_false_positive=False)
number_of_adaptation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"], count_high_risk_false_positive=False)
number_of_ressources_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["ressources"], count_high_risk_false_positive=False)
number_of_ressources_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["ressources_solutions"], count_high_risk_false_positive=False)
number_of_biodiversite_concepts_generaux_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"], count_high_risk_false_positive=False)
number_of_biodiversite_causes_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_causes"], count_high_risk_false_positive=False)
number_of_biodiversite_consequences_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_consequences"], count_high_risk_false_positive=False)
number_of_biodiversite_solutions_no_hrfp = count_keywords_duration_overlap(keywords_with_timestamp, start,theme=["biodiversite_solutions"], count_high_risk_false_positive=False)

return [ # Change number_of_elements_in_array if a new element is added here
theme
,keywords_with_timestamp
,number_of_keywords
Expand Down Expand Up @@ -236,6 +237,7 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
,number_of_biodiversite_solutions_no_hrfp
]
else:
logging.info("Empty keywords")
return [None] * number_of_elements_in_array

def get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds: int = 20):
Expand Down Expand Up @@ -316,9 +318,10 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
result_type='expand'
)

logging.info("Dropping")
# remove all rows that do not have themes
df = df.dropna(subset=['theme'], how='any') # any is for None values

logging.info("Droped")
logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")

return df
Expand All @@ -336,14 +339,16 @@ def add_primary_key(row):
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
return list(filter(lambda kw: indirectes not in kw['theme'], keywords_with_timestamp))

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None, count_high_risk_false_positive: bool = True) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
if theme is not None:
logging.debug(f"filter theme {theme}")
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))
if count_high_risk_false_positive is False:
keywords_with_timestamp = list(filter(lambda kw: 'hrfp' in kw, keywords_with_timestamp))

length_filtered_items = len(keywords_with_timestamp)

Expand Down Expand Up @@ -391,7 +396,9 @@ def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[

if( contains_direct_keywords_same_suject(neighbour_keywords, keyword_info['theme']) ) :
logging.debug(f"Transforming false positive to positive { keyword_info['keyword']} { keyword_info['theme']}")
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
if indirectes in keyword_info['theme']:
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
keyword_info['hrfp'] = True # records that this keyword was transformed from an indirect into a direct keyword

return keywords_with_timestamp

Expand All @@ -417,7 +424,4 @@ def tag_wanted_duration_second_window_number(keywords_with_timestamp: List[dict]
return keywords_with_timestamp

def remove_indirect(theme: str) -> str:
if indirectes in theme:
return theme.replace(f'_{indirectes}', '')
else:
return theme
return theme.replace(f'_{indirectes}', '')
98 changes: 76 additions & 22 deletions test/sitemap/test_detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@

from quotaclimat.data_processing.mediatree.utils import *
from quotaclimat.data_processing.mediatree.detect_keywords import *

from datetime import datetime, timezone

import pandas as pd
localhost = get_localhost()
original_timestamp = 1706437079004
start = datetime.utcfromtimestamp(original_timestamp / 1000)
start = datetime.fromtimestamp(original_timestamp / 1000, timezone.utc)

array_of_none = [None] * 28

subtitles = [{
"duration_ms": 34,
Expand Down Expand Up @@ -80,7 +81,7 @@
]
def test_default_get_themes_keywords_duration():
plaintext_nothing = "cheese pizza"
assert get_themes_keywords_duration(plaintext_nothing, subtitles, start) == [None] * 17
assert get_themes_keywords_duration(plaintext_nothing, subtitles, start) == array_of_none

def test_one_theme_get_themes_keywords_duration():
plaintext_climat = "réchauffement planétaire test"
Expand Down Expand Up @@ -265,12 +266,12 @@ def test_long_sentence_theme_get_themes_keywords_duration():
def test_nothing_get_themes_keywords_duration():
# should not accept theme 'bus' for keyword "abusive"
plaintext_regression_incomplete_word = "abusive"
assert get_themes_keywords_duration(plaintext_regression_incomplete_word, subtitles, start) == [None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None]
assert get_themes_keywords_duration(plaintext_regression_incomplete_word, subtitles, start) == array_of_none

def test_regression_included_get_themes_keywords_duration():
# should not accept theme 'ngt' for keyword "vingt"
plaintext_regression_incomplete_word_ngt = "vingt"
assert get_themes_keywords_duration(plaintext_regression_incomplete_word_ngt, subtitles, start) == [None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None]
assert get_themes_keywords_duration(plaintext_regression_incomplete_word_ngt, subtitles, start) == array_of_none


def test_three_get_themes_keywords_duration():
Expand Down Expand Up @@ -405,11 +406,11 @@ def test_long_get_themes_keywords_duration():

def test_stop_word_get_themes_keywords_duration():
plaintext = "haute isolation thermique fabriqué en france pizza"
assert get_themes_keywords_duration(plaintext, subtitles, start) == [None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None]
assert get_themes_keywords_duration(plaintext, subtitles, start) == array_of_none

def test_train_stop_word_get_themes_keywords_duration():
plaintext = "en train de fabrique en france pizza"
assert get_themes_keywords_duration(plaintext, subtitles, start) == [None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None]
assert get_themes_keywords_duration(plaintext, subtitles, start) == array_of_none


def test_get_cts_in_ms_for_keywords():
Expand Down Expand Up @@ -546,6 +547,17 @@ def test_lower_case_filter_and_tag_by_theme():
,'number_of_keywords_climat':1,
'number_of_keywords_biodiversite':0,
'number_of_keywords_ressources':0
,"number_of_changement_climatique_constat_no_hrfp": 0
,"number_of_changement_climatique_causes_no_hrfp": 0
,"number_of_changement_climatique_consequences_no_hrfp": 0
,"number_of_attenuation_climatique_solutions_no_hrfp": 0
,"number_of_adaptation_climatique_solutions_no_hrfp": 0
,"number_of_ressources_no_hrfp": 0
,"number_of_ressources_solutions_no_hrfp": 0
,"number_of_biodiversite_concepts_generaux_no_hrfp": 0
,"number_of_biodiversite_causes_no_hrfp": 0
,"number_of_biodiversite_consequences_no_hrfp": 0
,"number_of_biodiversite_solutions_no_hrfp":0
}])

# List of words to filter on
Expand Down Expand Up @@ -599,6 +611,17 @@ def test_singular_plural_case_filter_and_tag_by_theme():
,'number_of_keywords_climat':1,
'number_of_keywords_biodiversite':0,
'number_of_keywords_ressources':0
,"number_of_changement_climatique_constat_no_hrfp": 0
,"number_of_changement_climatique_causes_no_hrfp": 0
,"number_of_changement_climatique_consequences_no_hrfp": 0
,"number_of_attenuation_climatique_solutions_no_hrfp": 0
,"number_of_adaptation_climatique_solutions_no_hrfp": 0
,"number_of_ressources_no_hrfp": 0
,"number_of_ressources_solutions_no_hrfp": 0
,"number_of_biodiversite_concepts_generaux_no_hrfp": 0
,"number_of_biodiversite_causes_no_hrfp": 0
,"number_of_biodiversite_consequences_no_hrfp": 0
,"number_of_biodiversite_solutions_no_hrfp":0
}])

# List of words to filter on
Expand Down Expand Up @@ -695,6 +718,17 @@ def test_complexe_filter_and_tag_by_theme():
,'number_of_keywords_climat':1,
'number_of_keywords_biodiversite':0,
'number_of_keywords_ressources':0
,"number_of_changement_climatique_constat_no_hrfp": 0
,"number_of_changement_climatique_causes_no_hrfp": 0
,"number_of_changement_climatique_consequences_no_hrfp": 0
,"number_of_attenuation_climatique_solutions_no_hrfp": 0
,"number_of_adaptation_climatique_solutions_no_hrfp": 0
,"number_of_ressources_no_hrfp": 0
,"number_of_ressources_solutions_no_hrfp": 0
,"number_of_biodiversite_concepts_generaux_no_hrfp": 0
,"number_of_biodiversite_causes_no_hrfp": 0
,"number_of_biodiversite_consequences_no_hrfp": 0
,"number_of_biodiversite_solutions_no_hrfp":0
}])

# List of words to filter on
Expand Down Expand Up @@ -1294,19 +1328,22 @@ def test_tag_wanted_duration_second_window_number():

def test_transform_false_positive_keywords_to_positive():
keywords_with_timestamp = [
{'keyword': 'recyclage',
{
'keyword': 'recyclage',
'timestamp': original_timestamp,
'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct
},
{'keyword': 'climatique',
'timestamp': original_timestamp + 150,
'theme': 'changement_climatique_constat'
},
{'keyword': 'covoiturage',
{
'keyword': 'covoiturage',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) + 10000, # should be transformed to direct
'theme': 'attenuation_climatique_solutions_indirectes'
},
{'keyword': 'industrie verte',
{
'keyword': 'industrie verte',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 ,
'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct
},
Expand All @@ -1325,32 +1362,42 @@ def test_transform_false_positive_keywords_to_positive():
]

expected_output = [
{'keyword': 'recyclage',
{
'hrfp': True,
'keyword': 'recyclage',
'timestamp': original_timestamp,
'theme': 'attenuation_climatique_solutions' # was indirect
,'window_number': 0
},
{'keyword': 'climatique',
{
'keyword': 'climatique',
'timestamp': original_timestamp + 150,
'theme': 'changement_climatique_constat' # our positive keyword that transform false positive
,'window_number': 0
},
{'keyword': 'covoiturage',
{
'hrfp': True,
'keyword': 'covoiturage',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) + 10000, # should be transformed to direct
'theme': 'attenuation_climatique_solutions'
,'window_number': 1
},
{'keyword': 'industrie verte',
{
'hrfp': True,
'keyword': 'industrie verte',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 ,
'theme': 'attenuation_climatique_solutions' # should be transformed to direct
,'window_number': 2
},
{'keyword': 'industrie verte',
{
'hrfp': True,
'keyword': 'industrie verte',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 ,
'theme': 'attenuation_climatique_solutions'# should be transformed to direct
,'window_number': 3
},
{'keyword': 'industrie verte',
{
'keyword': 'industrie verte',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 5 ,
'theme': 'attenuation_climatique_solutions_indirectes' # should stay to indirect
,'window_number': 5
Expand Down Expand Up @@ -1398,22 +1445,26 @@ def test_different_steps_transform_false_positive_keywords_to_positive():
'window_number': 0,
'theme': 'changement_climatique_constat'
},
{'keyword': 'industrie verte',
{'hrfp': True,
'keyword': 'industrie verte',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 150,
'window_number': 1,
'theme': 'attenuation_climatique_solutions' # should be transformed to direct
},
{'keyword': 'agroforesterie',
{'hrfp': True,
'keyword': 'agroforesterie',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 2 + 150,
'window_number': 2,
'theme': 'attenuation_climatique_solutions' # should be transformed to direct
},
{'keyword': 'alternative durable',
{'hrfp': True,
'keyword': 'alternative durable',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 3 + 150,
'window_number': 3,
'theme': 'attenuation_climatique_solutions' # should be transformed to direct
},
{'keyword': 'planification écologique',
{'hrfp': True,
'keyword': 'planification écologique',
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 4 + 150,
'window_number': 4,
'theme': 'attenuation_climatique_solutions' # should be transformed to direct
Expand All @@ -1433,7 +1484,8 @@ def test_transform_false_positive_keywords_to_positive_different_and_same_subjec
'timestamp': original_timestamp + 150,
'theme': 'changement_climatique_constat'
},
{'keyword': "activisme climatique",
{
'keyword': "activisme climatique",
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 151,
'theme': 'attenuation_climatique_solutions_indirectes' # should be transformed to direct
},
Expand Down Expand Up @@ -1465,7 +1517,9 @@ def test_transform_false_positive_keywords_to_positive_different_and_same_subjec
'window_number': 0,
'theme': 'changement_climatique_constat'
},
{'keyword': "activisme climatique",
{
'hrfp': True,
'keyword': "activisme climatique",
'timestamp': original_timestamp + get_keyword_time_separation_ms(15) * 1 + 151,
'window_number': 1,
'theme': 'attenuation_climatique_solutions' # should be transformed to direct
Expand Down
2 changes: 2 additions & 0 deletions test/sitemap/test_main_import_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ def test_main_api_import():

def test_first_row_api_import():
primary_key = "29d2b1f8267b206cb62e475b960de3247e835273f396af012f5ce21bf3056472"

specific_keyword = get_keyword(primary_key)
logging.info(f"Getting {primary_key} :\n {specific_keyword}")
assert set(specific_keyword.theme) == set([
'biodiversite_concepts_generaux_indirectes',
'changement_climatique_consequences_indirectes',
Expand Down

0 comments on commit bb1f7f7

Please sign in to comment.