Skip to content

Commit

Permalink
fix: keep the same keyword but for a different theme (#131)
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus authored Mar 1, 2024
1 parent c208d58 commit 5641d83
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
git config user.name barometre-github-actions
git config user.email [email protected]
git add pyproject.toml
git commit -m "[no ci]: ${{ env.PROJECT_VERSION }} bumping version"
git commit -m "[no ci]: $PROJECT_VERSION bumping version"
git push origin main
- name: Login to Scaleway Container Registry
uses: docker/login-action@v3
Expand Down
2 changes: 2 additions & 0 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def get_sitemap_cols():


sitemap_table = "sitemap_table"
# ALTER TABLE keywords_new_list
# RENAME TO keywords;
keywords_table = "keywords_new_list"

class Sitemap(Base):
Expand Down
24 changes: 19 additions & 5 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,10 @@ def set_timestamp_with_margin(keywords_with_timestamp: List[dict]) -> List[dict]
current_keyword = keywords_with_timestamp[i].get("keyword")
next_keyword = keywords_with_timestamp[i + 1].get("keyword")

if current_timestamp is not None and next_timestamp is not None:
if next_timestamp - current_timestamp < 1000:
if current_timestamp is not None and next_timestamp is not None:
difference = next_timestamp - current_timestamp
if difference < 1000 and difference != 0:
logging.debug("margin of 1 second detected")
current_keyword = keywords_with_timestamp[i].get("keyword")
next_keyword = keywords_with_timestamp[i + 1].get("keyword")
if len(current_keyword) > len(next_keyword):
Expand All @@ -90,10 +92,18 @@ def set_timestamp_with_margin(keywords_with_timestamp: List[dict]) -> List[dict]
return keywords_with_timestamp

# some keywords are contained inside other keywords, we need to filter them
# some keywords are tagged with the same timestamp but a different theme
def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> List[dict]:
logging.debug(f"Filtering keywords with same timestamp with a margin of one second")
number_of_keywords = len(keywords_with_timestamp)
keywords_with_timestamp = set_timestamp_with_margin(keywords_with_timestamp)
number_of_keywords = len(keywords_with_timestamp)

# we want to keep them
same_keyword_different_theme = [item for item in keywords_with_timestamp if len(list(filter(lambda x: x.get('keyword') == item.get('keyword') and x.get('theme') != item.get('theme'), keywords_with_timestamp))) > 0]
logging.debug(f"Same keyword different theme {same_keyword_different_theme}")
# keep the longest keyword among those sharing the same (or almost the same) timestamp
unique_keywords = [item for item in keywords_with_timestamp if len(list(filter(lambda x: x.get('keyword') == item.get('keyword') and x.get('theme') != item.get('theme'), keywords_with_timestamp))) == 0]
logging.debug(f"Unique keywords {unique_keywords}")
keywords_with_timestamp = set_timestamp_with_margin(unique_keywords)
# Group keywords by timestamp - with a margin of 1 second
grouped_keywords = {timestamp: list(group) for timestamp, group in groupby(keywords_with_timestamp, key=lambda x: x['timestamp'])}

Expand All @@ -102,6 +112,9 @@ def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> Li
max(group, key=lambda x: len(x['keyword']))
for group in grouped_keywords.values()
]
logging.debug(f"result keywords {result}")
result = result + same_keyword_different_theme

final_result = len(result)

if final_result < number_of_keywords:
Expand Down Expand Up @@ -205,4 +218,5 @@ def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: Li

return sum(fifteen_second_window)
else:
return 0
return 0

9 changes: 9 additions & 0 deletions test/sitemap/test_detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,15 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp():

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

def test_keyword_different_theme_keyword_filter_keyword_with_same_timestamp():
    """Entries sharing a keyword and timestamp but tagged with different themes are all kept."""
    climate_entry = {
        'keyword': 'climatique',
        'timestamp': 1693757470012,
        'theme': 'changement_climatique_constat',
    }
    # The same keyword at the same timestamp, tagged under two distinct themes.
    drought_as_consequence = {
        'keyword': 'sécheresse',
        'timestamp': 1693757450073,
        'theme': 'changement_climatique_consequences',
    }
    drought_as_resource = {
        'keyword': 'sécheresse',
        'timestamp': 1693757450073,
        'theme': 'ressources_naturelles_concepts_generaux',
    }
    keywords_with_timestamp = [climate_entry, drought_as_consequence, drought_as_resource]

    filtered = filter_keyword_with_same_timestamp(keywords_with_timestamp)

    # Nothing should be dropped: the output must equal the input list.
    assert filtered == keywords_with_timestamp

def test_keyword_2words_inside_keyword_filter_keyword_with_same_timestamp():
keywords_with_timestamp = [{
"keyword" : 'agriculture',
Expand Down

1 comment on commit 5641d83

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
File | Stmts | Miss | Cover | Missing
postgres
   insert_data.py44784%37–39, 58–60, 65
   insert_existing_data_example.py20385%25–27
postgres/schemas
   models.py721579%76–83, 93–94, 103–113
quotaclimat/data_analytics
   analytics_signataire_charte.py29290%1–67
   bilan.py1081080%2–372
   data_coverage.py34340%1–94
   exploration.py1251250%1–440
   sitemap_analytics.py1181180%1–343
quotaclimat/data_ingestion
   categorization_program_type.py110%1
   config_youtube.py110%1
   scaleway_db_backups.py34340%1–74
   scrap_chartejournalismeecologie_signataires.py50500%1–169
   scrap_sitemap.py1341787%27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
   scrap_tv_program.py62620%1–149
   scrap_youtube.py1141140%1–238
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py594131%21–42, 45–65, 69–80
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py36392%19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py17710342%38–42, 47–53, 57–60, 66, 69–96, 102–117, 122–124, 149–161, 165–168, 172–178, 189–200, 203–207, 213, 237–238, 242, 246–265, 268–270
   config.py15287%7, 16
   detect_keywords.py153696%79–81, 180–182
   utils.py662267%19, 30–54, 57, 76–77
quotaclimat/data_processing/sitemap
   sitemap_processing.py412734%15–19, 23–25, 29–47, 51–58, 66–96, 101–103
quotaclimat/utils
   channels.py660%1–95
   climate_keywords.py220%3–35
   healthcheck_config.py291452%22–24, 27–38
   logger.py241154%22–24, 28–37
   plotly_theme.py17170%1–56
   sentry.py10280%21–22
TOTAL163397440% 

Tests Skipped Failures Errors Time
64 0 💤 0 ❌ 0 🔥 52.806s ⏱️

Please sign in to comment.