Skip to content

Commit

Permalink
fix: window duration (#186)
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus authored Jun 19, 2024
1 parent beee4b2 commit 50b2f2c
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ services:
# SENTRY_DSN: prod_only
# UPDATE: "true" # to batch update PG
#UPDATE_PROGRAM_ONLY: "true" # to batch update PG but only channel with program
START_OFFSET: 1 # to batch update PG from a offset
# START_OFFSET: 1 # to batch update PG from a offset
#BATCH_SIZE: 50000 # number of records to update in one batch
#NUMBER_OF_BATCH: 4 # number of batch size to process
# START_DATE: 1717227223 # to test batch import
Expand Down
1 change: 1 addition & 0 deletions quotaclimat/data_processing/mediatree/channel_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def get_programs():
"program_name":pd.StringDtype,
"program_type":pd.StringDtype
}
logging.debug(f"Reading {json_file_path}")
df_programs = pd.read_json(json_file_path, lines=True, dtype=data_dtype)

df_programs['start'] = format_hour_minute(df_programs['start'])
Expand Down
34 changes: 17 additions & 17 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def remove_stopwords(plaintext: str) -> str:
@sentry_sdk.trace
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
keywords_with_timestamp = []

number_of_elements_in_array = 17
plaitext_without_stopwords = remove_stopwords(plaintext)
logging.debug(f"display datetime start {start}")

Expand Down Expand Up @@ -144,29 +144,29 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
keywords_with_timestamp_40 = get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds=40)
filtered_keywords_with_timestamp = filter_indirect_words(keywords_with_timestamp_15)

# TODO refacto this return array and else
return [
get_themes(keywords_with_timestamp_15), # theme
clean_metadata(keywords_with_timestamp_15), # keywords
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start), # number_of_keywords
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_constat"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_causes"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"changement_climatique_consequences"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"attenuation_climatique_solutions"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"adaptation_climatique_solutions"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"ressources"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"ressources_solutions"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_concepts_generaux"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_causes"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_consequences"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,"biodiversite_solutions")
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_constat"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_causes"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="changement_climatique_consequences"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="attenuation_climatique_solutions"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="adaptation_climatique_solutions"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="ressources_solutions"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_concepts_generaux"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_causes"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_consequences"),
count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme="biodiversite_solutions")
# number_of_keywords with special duration to compare duration
,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_20), start,f"keywords_20")
,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_30), start,f"keywords_30")
,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_40), start,f"keywords_40")
,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_20), start) # number_of_keywords_20
,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_30), start) # number_of_keywords_30
,count_keywords_duration_overlap(filter_indirect_words(keywords_with_timestamp_40), start) # number_of_keywords_40
]

else:
return [None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None,None] # TODO refacto me
return [None] * number_of_elements_in_array

def get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds: int = 15):
logging.debug(f"using duration_seconds {duration_seconds}")
Expand Down
36 changes: 20 additions & 16 deletions test/sitemap/test_detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ def test_one_theme_get_themes_keywords_duration():
assert compare_unordered_lists_of_dicts(keywords_output, keywords)

assert number_of_keywords == 1
assert number_of_keywords_20 == 1
assert number_of_keywords_30 == 1
assert number_of_keywords_40 == 1
assert number_of_changement_climatique_constat == 1
assert number_of_changement_climatique_causes_directes == 0
assert number_of_changement_climatique_consequences == 0
Expand Down Expand Up @@ -490,9 +493,9 @@ def test_lower_case_filter_and_tag_by_theme():
"number_of_biodiversite_causes_directes": 0,
"number_of_biodiversite_consequences": 0,
"number_of_biodiversite_solutions_directes" :0
,'number_of_keywords_20':0,
'number_of_keywords_30':0,
'number_of_keywords_40':0
,'number_of_keywords_20':1,
'number_of_keywords_30':1,
'number_of_keywords_40':1
}])

# List of words to filter on
Expand Down Expand Up @@ -543,9 +546,9 @@ def test_singular_plural_case_filter_and_tag_by_theme():
"number_of_biodiversite_causes_directes": 0,
"number_of_biodiversite_consequences": 0,
"number_of_biodiversite_solutions_directes" :0
,'number_of_keywords_20':0,
'number_of_keywords_30':0,
'number_of_keywords_40':0
,'number_of_keywords_20':1,
'number_of_keywords_30':1,
'number_of_keywords_40':1
}])

# List of words to filter on
Expand Down Expand Up @@ -638,9 +641,9 @@ def test_complexe_filter_and_tag_by_theme():
"number_of_biodiversite_causes_directes": 0,
"number_of_biodiversite_consequences": 0,
"number_of_biodiversite_solutions_directes" :0
,'number_of_keywords_20':0,
'number_of_keywords_30':0,
'number_of_keywords_40':0
,'number_of_keywords_20':2,
'number_of_keywords_30':2,
'number_of_keywords_40':2
}])

# List of words to filter on
Expand Down Expand Up @@ -711,40 +714,41 @@ def test_overlap_count_keywords_duration_overlap():

assert count_keywords_duration_overlap(tag_wanted_duration_second_window_number(keywords_with_timestamp, start), start) == 1

def test_no_overlap_count_keywords_duration_overlap():
def test_20_seconds_no_overlap_count_keywords_duration_overlap():
duration = 20
keywords_with_timestamp = [{
"keyword" : 'habitabilité de la planète',
"timestamp": original_timestamp,
"theme":"changement_climatique_constat",
},
{
"keyword" : 'conditions de vie sur terre',
"timestamp": original_timestamp + 1 * get_keyword_time_separation_ms(),
"timestamp": original_timestamp + 1 * get_keyword_time_separation_ms(duration),
"theme":"changement_climatique_constat",
},
{
"keyword" : 'planète',
"timestamp": original_timestamp + 2 * get_keyword_time_separation_ms(),
"timestamp": original_timestamp + 2 * get_keyword_time_separation_ms(duration),
"theme":"ressources", # resources does count now
},
{
"keyword" : 'terre',
"timestamp": original_timestamp + 3 * get_keyword_time_separation_ms(),
"timestamp": original_timestamp + 3 * get_keyword_time_separation_ms(duration),
"theme":"ressources", # resources does count now
},
{
"keyword" : 'habitabilité de la planète',
"timestamp": original_timestamp + 4 * get_keyword_time_separation_ms(),
"timestamp": original_timestamp + 4 * get_keyword_time_separation_ms(duration),
"theme":"changement_climatique_constat",
},
{
"keyword" : 'conditions de vie sur terre',
"timestamp": original_timestamp + 5 * get_keyword_time_separation_ms(),
"timestamp": original_timestamp + 5 * get_keyword_time_separation_ms(duration),
"theme":"changement_climatique_constat",
},
]

assert count_keywords_duration_overlap(tag_wanted_duration_second_window_number(keywords_with_timestamp, start),start) == 6
assert count_keywords_duration_overlap(tag_wanted_duration_second_window_number(keywords_with_timestamp, start, duration),start) == 6

def test_with_a_mix_of_overlap_count_keywords_duration_overlap():
keywords_with_timestamp = [{
Expand Down

1 comment on commit 50b2f2c

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---|---|---|---|
| postgres | | | | |
|    insert_data.py | 44 | 7 | 84% | 36–38, 57–59, 64 |
|    insert_existing_data_example.py | 19 | 3 | 84% | 25–27 |
| postgres/schemas | | | | |
|    models.py | 146 | 10 | 93% | 120–127, 139–140, 198–199, 213–214 |
| quotaclimat/data_ingestion | | | | |
|    scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
| quotaclimat/data_ingestion/ingest_db | | | | |
|    ingest_sitemap_in_db.py | 55 | 37 | 33% | 21–42, 45–58, 62–73 |
| quotaclimat/data_ingestion/scrap_html | | | | |
|    scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| quotaclimat/data_processing/mediatree | | | | |
|    api_import.py | 200 | 123 | 38% | 43–47, 52–67, 71–74, 80, 83–122, 128–143, 147–148, 161–173, 177–183, 196–207, 210–214, 220, 255–256, 260, 264–293, 296–298 |
|    channel_program.py | 136 | 51 | 62% | 30–32, 43–45, 59, 95, 104, 142–183 |
|    config.py | 15 | 2 | 87% | 7, 16 |
|    detect_keywords.py | 190 | 4 | 98% | 190, 245–247 |
|    update_pg_keywords.py | 51 | 37 | 27% | 14–99, 121–122, 147–176, 182 |
|    utils.py | 64 | 22 | 66% | 26–50, 53, 62, 78–79 |
| quotaclimat/utils | | | | |
|    healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
|    logger.py | 24 | 11 | 54% | 22–24, 28–37 |
|    sentry.py | 10 | 2 | 80% | 21–22 |
| TOTAL | 1179 | 343 | 71% | |

| Tests | Skipped | Failures | Errors | Time |
|---|---|---|---|---|
| 81 | 0 💤 | 0 ❌ | 0 🔥 | 56.590s ⏱️ |

Please sign in to comment.