Skip to content

Commit

Permalink
feat: remove indirect from number_of_keywords (#108)
Browse files Browse the repository at this point in the history
* feat: remove indirect from number_of_keywords

* log: to debug
  • Loading branch information
polomarcus authored Feb 23, 2024
1 parent 30155d0 commit a7ca5fd
Show file tree
Hide file tree
Showing 7 changed files with 643 additions and 540 deletions.
6 changes: 3 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

52 changes: 31 additions & 21 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str]) -
keywords_with_timestamp.extend(keywords_to_add)

if len(matching_themes) > 0:
return [matching_themes, keywords_with_timestamp, count_keywords_duration_overlap(keywords_with_timestamp)]
return [matching_themes, keywords_with_timestamp, count_keywords_duration_overlap_without_indirect(keywords_with_timestamp)]
else:
return [None, None, None]

Expand Down Expand Up @@ -107,25 +107,35 @@ def add_primary_key(df):
logging.error(error)
return get_consistent_hash("empty") # TODO improve - should be a None ?

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict]) -> int:
if(len(keywords_with_timestamp)) <= 1:
return len(keywords_with_timestamp)
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the keyword entries whose theme is not flagged as indirect.

    An entry is dropped when ``'indirectes' in entry['theme']`` is true;
    every other entry is kept, in its original order.

    Args:
        keywords_with_timestamp: Keyword dicts, each carrying a ``'theme'`` key.

    Returns:
        A new list with only the kept entries; the input list is not modified.

    Raises:
        KeyError: If an entry has no ``'theme'`` key.
    """
    # A comprehension replaces list(filter(lambda ...)) with identical results.
    return [kw for kw in keywords_with_timestamp if 'indirectes' not in kw['theme']]

def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: List[dict]) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
# in case keywords are not in the right order
sorted_keywords = iter(sorted(keywords_with_timestamp, key=lambda x: x['timestamp']))

count = 1
previous_timestamp = next(sorted_keywords)['timestamp']

for keyword_info in sorted_keywords:
current_timestamp = keyword_info['timestamp']
overlap_time = current_timestamp - previous_timestamp

if is_time_distance_between_keyword_enough(overlap_time):
logging.debug(f"No overlapping keyword {count} + 1 : {overlap_time}")
count += 1
previous_timestamp = current_timestamp
else:
logging.debug(f"Keyword timestamp overlap : {overlap_time} - current count is {count}")

return count
sorted_keywords = sorted(keywords_with_timestamp, key=lambda x: x['timestamp'])
filtered_themes = filter_indirect_words(sorted_keywords)
length_filtered_items = len(filtered_themes)
logging.debug(f"Before filtering {total_keywords} - After filtering indirect kw {length_filtered_items}")
if length_filtered_items > 0:
iter_filtered_themes = iter(filtered_themes)
count = 1
previous_timestamp = next(iter_filtered_themes)['timestamp']

for keyword_info in filtered_themes:
current_timestamp = keyword_info['timestamp']
overlap_time = current_timestamp - previous_timestamp

if is_time_distance_between_keyword_enough(overlap_time):
logging.debug(f"No overlapping keyword {count} + 1 : {overlap_time}")
count += 1
previous_timestamp = current_timestamp
else:
logging.debug(f"Keyword timestamp overlap : {overlap_time} - current count is {count}")

return count
else:
return 0
1 change: 0 additions & 1 deletion quotaclimat/data_processing/mediatree/keyword/keyword.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# Based on experts' document

THEME_KEYWORDS = {
"changement_climatique_constat" : [ # 1.1.2
"climatique"
Expand Down
23 changes: 15 additions & 8 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,21 @@
from postgres.schemas.models import Keywords
from quotaclimat.data_processing.mediatree.detect_keywords import *

def update_keywords(session: Session) -> list:
def update_keywords(session: Session, batch_size: int = 50000) -> list:
saved_keywords = get_keywords_columns(session)
logging.info(f"Updating {len(saved_keywords)} saved keywords")
for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords in saved_keywords:
new_number_of_keywords = count_keywords_duration_overlap(keywords_with_timestamp)
logging.debug(f"{keyword_id} new value {new_number_of_keywords}")
update_number_of_keywords(session, keyword_id, new_number_of_keywords)
total_updates = len(saved_keywords)
logging.info(f"Updating {total_updates} saved keywords")
for i in range(0, total_updates, batch_size):
batch_updates = saved_keywords[i:i+batch_size]
for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords in batch_updates:
logging
new_number_of_keywords = count_keywords_duration_overlap_without_indirect(keywords_with_timestamp)
logging.debug(f"{keyword_id} new value {new_number_of_keywords}")
update_number_of_keywords(session, keyword_id, new_number_of_keywords)

logging.info(f"bulk update done {i} out of {total_updates}")
session.commit()

logging.info("updated all keywords")


Expand All @@ -33,5 +41,4 @@ def update_number_of_keywords(session: Session, keyword_id: int, new_number_of_k
session.query(Keywords).filter(Keywords.id == keyword_id).update(
{Keywords.number_of_keywords: new_number_of_keywords},
synchronize_session=False
)
session.commit()
)
Loading

1 comment on commit a7ca5fd

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
File | Stmts | Miss | Cover | Missing
postgres
   insert_data.py | 46 | 7 | 85% | 38–40, 59–61, 66
   insert_existing_data_example.py | 20 | 3 | 85% | 25–27
postgres/schemas
   models.py | 71 | 15 | 79% | 74–81, 91–92, 101–111
quotaclimat/data_analytics
   analytics_signataire_charte.py | 29 | 29 | 0% | 1–67
   bilan.py | 108 | 108 | 0% | 2–372
   data_coverage.py | 34 | 34 | 0% | 1–94
   exploration.py | 125 | 125 | 0% | 1–440
   sitemap_analytics.py | 118 | 118 | 0% | 1–343
quotaclimat/data_ingestion
   categorization_program_type.py | 1 | 1 | 0% | 1
   config_youtube.py | 1 | 1 | 0% | 1
   scaleway_db_backups.py | 34 | 34 | 0% | 1–74
   scrap_chartejournalismeecologie_signataires.py | 50 | 50 | 0% | 1–169
   scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
   scrap_tv_program.py | 62 | 62 | 0% | 1–149
   scrap_youtube.py | 114 | 114 | 0% | 1–238
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py | 58 | 40 | 31% | 30–51, 56–75, 79–90
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py | 36 | 3 | 92% | 19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py | 183 | 106 | 42% | 45–49, 54–57, 61–64, 70, 73–98, 104–119, 124–126, 151–158, 162–165, 169–175, 186–197, 200–204, 210, 235–236, 242, 244, 249–275, 279–290
   config.py | 15 | 2 | 87% | 7, 16
   detect_keywords.py | 97 | 6 | 94% | 101–108
   utils.py | 64 | 21 | 67% | 27–51, 54, 73–74
quotaclimat/data_processing/sitemap
   sitemap_processing.py | 41 | 27 | 34% | 15–19, 23–25, 29–47, 51–58, 66–96, 101–103
quotaclimat/utils
   channels.py | 6 | 6 | 0% | 1–95
   climate_keywords.py | 2 | 2 | 0% | 3–35
   healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38
   logger.py | 14 | 3 | 79% | 22–24
   plotly_theme.py | 17 | 17 | 0% | 1–56
TOTAL | 1558 | 965 | 38% |

Tests Skipped Failures Errors Time
41 0 💤 0 ❌ 0 🔥 11.512s ⏱️

Please sign in to comment.