diff --git a/.github/workflows/deploy-main.yml b/.github/workflows/deploy-main.yml index 80a206c0..11c3f31a 100644 --- a/.github/workflows/deploy-main.yml +++ b/.github/workflows/deploy-main.yml @@ -40,7 +40,7 @@ jobs: git config user.name barometre-github-actions git config user.email barometre-github-actions@github.com git add pyproject.toml - git commit -m "[no ci]: ${{ env.PROJECT_VERSION }} bumping version" + git commit -m "[no ci]: $PROJECT_VERSION bumping version" git push origin main - name: Login to Scaleway Container Registry uses: docker/login-action@v3 diff --git a/postgres/schemas/models.py b/postgres/schemas/models.py index fd5aa71e..2ade3d6c 100644 --- a/postgres/schemas/models.py +++ b/postgres/schemas/models.py @@ -30,6 +30,8 @@ def get_sitemap_cols(): sitemap_table = "sitemap_table" +# ALTER TABLE keywords_new_list +# RENAME TO keywords; keywords_table = "keywords_new_list" class Sitemap(Base): diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py index f9815461..5927bd1a 100644 --- a/quotaclimat/data_processing/mediatree/detect_keywords.py +++ b/quotaclimat/data_processing/mediatree/detect_keywords.py @@ -69,8 +69,10 @@ def set_timestamp_with_margin(keywords_with_timestamp: List[dict]) -> List[dict] current_keyword = keywords_with_timestamp[i].get("keyword") next_keyword = keywords_with_timestamp[i + 1].get("keyword") - if current_timestamp is not None and next_timestamp is not None: - if next_timestamp - current_timestamp < 1000: + if current_timestamp is not None and next_timestamp is not None: + difference = next_timestamp - current_timestamp + if difference < 1000 and difference != 0: + logging.debug("margin of 1 second detected") current_keyword = keywords_with_timestamp[i].get("keyword") next_keyword = keywords_with_timestamp[i + 1].get("keyword") if len(current_keyword) > len(next_keyword): @@ -90,10 +92,18 @@ def set_timestamp_with_margin(keywords_with_timestamp: 
List[dict]) -> List[dict] return keywords_with_timestamp # some keywords are contained inside other keywords, we need to filter them +# some keywords are tagged with the same timestamp but a different theme def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> List[dict]: logging.debug(f"Filtering keywords with same timestamp with a margin of one second") - number_of_keywords = len(keywords_with_timestamp) - keywords_with_timestamp = set_timestamp_with_margin(keywords_with_timestamp) + number_of_keywords = len(keywords_with_timestamp) + + # we want to keep them + same_keyword_different_theme = [item for item in keywords_with_timestamp if len(list(filter(lambda x: x.get('keyword') == item.get('keyword') and x.get('theme') != item.get('theme'), keywords_with_timestamp))) > 0] + logging.debug(f"Same keyword different theme {same_keyword_different_theme}") + # keep the longest keyword among those with the same (or almost the same) timestamp + unique_keywords = [item for item in keywords_with_timestamp if len(list(filter(lambda x: x.get('keyword') == item.get('keyword') and x.get('theme') != item.get('theme'), keywords_with_timestamp))) == 0] + logging.debug(f"Unique keywords {unique_keywords}") + keywords_with_timestamp = set_timestamp_with_margin(unique_keywords) # Group keywords by timestamp - with a margin of 1 second grouped_keywords = {timestamp: list(group) for timestamp, group in groupby(keywords_with_timestamp, key=lambda x: x['timestamp'])} @@ -102,6 +112,9 @@ def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> Li max(group, key=lambda x: len(x['keyword'])) for group in grouped_keywords.values() ] + logging.debug(f"result keywords {result}") + result = result + same_keyword_different_theme + final_result = len(result) if final_result < number_of_keywords: @@ -205,4 +218,5 @@ def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: Li return sum(fifteen_second_window) else: - return 0 \ No newline at end of file 
+ return 0 + \ No newline at end of file diff --git a/test/sitemap/test_detect_keywords.py b/test/sitemap/test_detect_keywords.py index f1a8de25..beb66a0e 100644 --- a/test/sitemap/test_detect_keywords.py +++ b/test/sitemap/test_detect_keywords.py @@ -625,6 +625,15 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp(): assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected +def test_keyword_different_theme_keyword_filter_keyword_with_same_timestamp(): + keywords_with_timestamp = [ + {'keyword': 'climatique', 'timestamp': 1693757470012, 'theme': 'changement_climatique_constat'}, + {'keyword': 'sécheresse', 'timestamp': 1693757450073, 'theme': 'changement_climatique_consequences'}, + {'keyword': 'sécheresse', 'timestamp': 1693757450073, 'theme': 'ressources_naturelles_concepts_generaux'} + ] + + assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == keywords_with_timestamp + def test_keyword_2words_inside_keyword_filter_keyword_with_same_timestamp(): keywords_with_timestamp = [{ "keyword" : 'agriculture',