Skip to content

Commit

Permalink
feat: number of keywords based on 15 sec window
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus committed Feb 26, 2024
1 parent 7e97e48 commit c0c25cf
Show file tree
Hide file tree
Showing 9 changed files with 649,683 additions and 98 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ services:
context: ./
dockerfile: Dockerfile
entrypoint: ["poetry", "run", "pytest", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
#entrypoint: ["sleep", "1200"] # use to debug the container if needed
# entrypoint: ["sleep", "12000"] # use to debug the container if needed
environment:
ENV: docker
# CHANNEL: "fr3-idf"
Expand Down
48 changes: 25 additions & 23 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,12 @@ def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict]) -> L
return result

@sentry_sdk.trace
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str]) -> List[Optional[List[str]]]:
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime) -> List[Optional[List[str]]]:
matching_themes = []
keywords_with_timestamp = []

logging.debug(f"display datetime start {start}")

for theme, keywords in THEME_KEYWORDS.items():
logging.debug(f"searching {theme} for {keywords}")

Expand All @@ -91,7 +93,7 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str]) -

if len(matching_themes) > 0:
keywords_with_timestamp = filter_keyword_with_same_timestamp(keywords_with_timestamp)
return [matching_themes, keywords_with_timestamp, count_keywords_duration_overlap_without_indirect(keywords_with_timestamp)]
return [matching_themes, keywords_with_timestamp, count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start)]
else:
return [None, None, None]

Expand All @@ -109,7 +111,7 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :

logging.info(f'tagging plaintext subtitle with keywords and theme : regexp - search taking time...')
# using swifter to speed up apply https://github.com/jmcarpenter2/swifter
df[['theme', u'keywords_with_timestamp', 'number_of_keywords']] = df[['plaintext','srt']].swifter.apply(lambda row: get_themes_keywords_duration(*row), axis=1, result_type='expand')
df[['theme', u'keywords_with_timestamp', 'number_of_keywords']] = df[['plaintext','srt', 'start']].swifter.apply(lambda row: get_themes_keywords_duration(*row), axis=1, result_type='expand')

# remove all rows that do not have themes
df = df.dropna(subset=['theme'])
Expand All @@ -133,33 +135,33 @@ def add_primary_key(df):
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the keyword entries whose theme is not an "indirectes" theme.

    An entry is dropped when the substring ``'indirectes'`` appears in its
    ``'theme'`` value. The order of the remaining entries is preserved and a
    new list is returned; the input list is left untouched.
    """
    return [
        keyword
        for keyword in keywords_with_timestamp
        if 'indirectes' not in keyword['theme']
    ]

def get_keyword_by_fifteen_second_window(filtered_themes: List[dict], start: datetime) -> List[int]:
    """Flag which fixed-size time windows of a chunk contain a keyword.

    The chunk (``get_chunk_duration_api()``) is split into consecutive windows
    of ``get_keyword_time_separation_ms()`` each. Every keyword is assigned to
    a window from the offset between its ``'timestamp'`` and
    ``start.timestamp() * 1000``.

    Args:
        filtered_themes: keyword dicts; each must have a numeric ``'timestamp'``
            comparable with ``start.timestamp() * 1000``.
        start: datetime marking the beginning of the chunk.

    Returns:
        One flag per window: 1 if at least one keyword falls in that window,
        0 otherwise. Keywords outside the chunk are logged as errors and
        ignored.
    """
    # Both helpers are used in the same unit as the keyword offsets computed
    # below (epoch milliseconds), hence the ``_ms`` names.
    window_size_ms = get_keyword_time_separation_ms()
    chunk_duration_ms = get_chunk_duration_api()
    number_of_windows = int(chunk_duration_ms // window_size_ms)
    fifteen_second_window = [0] * number_of_windows
    start_ms = start.timestamp() * 1000

    for keyword_info in filtered_themes:
        window_number = int((keyword_info['timestamp'] - start_ms) // window_size_ms)
        logging.debug(f"Window number {window_number} - kwtimestamp {keyword_info['timestamp']} - start {start_ms}")
        # Valid indexes are 0 .. number_of_windows - 1. A negative index must be
        # rejected explicitly: Python would otherwise wrap it around and mark a
        # window at the end of the chunk.
        if window_number < 0 or window_number >= number_of_windows:
            logging.error(f"Window number {window_number} is out of range : kwtimestamp {keyword_info['timestamp']} - start {start_ms}")
        else:
            fifteen_second_window[window_number] = 1

    return fifteen_second_window

def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: List[dict]) -> int:
def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: List[dict], start: datetime) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
# in case keywords are not in the right order
sorted_keywords = sorted(keywords_with_timestamp, key=lambda x: x['timestamp'])
filtered_themes = filter_indirect_words(sorted_keywords)
filtered_themes = filter_indirect_words(keywords_with_timestamp)
length_filtered_items = len(filtered_themes)
logging.debug(f"Before filtering {total_keywords} - After filtering indirect kw {length_filtered_items}")
if length_filtered_items > 0:
iter_filtered_themes = iter(filtered_themes)
count = 1
previous_timestamp = next(iter_filtered_themes)['timestamp']

for keyword_info in filtered_themes:
current_timestamp = keyword_info['timestamp']
overlap_time = current_timestamp - previous_timestamp

if is_time_distance_between_keyword_enough(overlap_time):
logging.debug(f"No overlapping keyword {count} + 1 : {overlap_time}")
count += 1
previous_timestamp = current_timestamp
else:
logging.debug(f"Keyword timestamp overlap : {overlap_time} - current count is {count}")

return count
fifteen_second_window = get_keyword_by_fifteen_second_window(filtered_themes, start)

return sum(fifteen_second_window)
else:
return 0
7 changes: 4 additions & 3 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ def update_keywords(session: Session, batch_size: int = 50000) -> list:
logging.info(f"Updating {total_updates} saved keywords")
for i in range(0, total_updates, batch_size):
batch_updates = saved_keywords[i:i+batch_size]
for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords in batch_updates:
for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords, start in batch_updates:
logging
new_number_of_keywords = count_keywords_duration_overlap_without_indirect(keywords_with_timestamp)
new_number_of_keywords = count_keywords_duration_overlap_without_indirect(keywords_with_timestamp, start)
logging.debug(f"{keyword_id} new value {new_number_of_keywords}")
update_number_of_keywords(session, keyword_id, new_number_of_keywords)

Expand All @@ -32,7 +32,8 @@ def get_keywords_columns(session: Session) -> list:
Keywords.id,
Keywords.plaintext,
Keywords.keywords_with_timestamp,
Keywords.number_of_keywords
Keywords.number_of_keywords,
Keywords.start
)
.all()
)
Expand Down
3 changes: 3 additions & 0 deletions quotaclimat/data_processing/mediatree/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
def get_keyword_time_separation_ms():
    """Return the keyword time separation in milliseconds (15 seconds)."""
    separation_seconds = 15
    return separation_seconds * 1000

def get_chunk_duration_api():
    """Return the duration of one API chunk: 2 * 60 * 1000 (= 120000).

    NOTE(review): the name carries no unit; the value reads as two minutes
    expressed in milliseconds - confirm against the API.
    """
    chunk_minutes = 2
    return chunk_minutes * 60 * 1000

def is_time_distance_between_keyword_enough(overlap):
    """Tell whether ``overlap`` reaches the minimum keyword time separation.

    Returns the result of comparing ``overlap`` with
    ``get_keyword_time_separation_ms()``; equality counts as enough.
    """
    minimum_separation = get_keyword_time_separation_ms()
    return overlap >= minimum_separation

Expand Down
649,330 changes: 649,329 additions & 1 deletion test/sitemap/mediatree.json

Large diffs are not rendered by default.

Loading

1 comment on commit c0c25cf

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---|---|---|---|
| **postgres** | | | | |
|    insert_data.py | 46 | 7 | 85% | 38–40, 59–61, 66 |
|    insert_existing_data_example.py | 20 | 3 | 85% | 25–27 |
| **postgres/schemas** | | | | |
|    models.py | 71 | 15 | 79% | 74–81, 91–92, 101–111 |
| **quotaclimat/data_analytics** | | | | |
|    analytics_signataire_charte.py | 29 | 29 | 0% | 1–67 |
|    bilan.py | 108 | 108 | 0% | 2–372 |
|    data_coverage.py | 34 | 34 | 0% | 1–94 |
|    exploration.py | 125 | 125 | 0% | 1–440 |
|    sitemap_analytics.py | 118 | 118 | 0% | 1–343 |
| **quotaclimat/data_ingestion** | | | | |
|    categorization_program_type.py | 1 | 1 | 0% | 1 |
|    config_youtube.py | 1 | 1 | 0% | 1 |
|    scaleway_db_backups.py | 34 | 34 | 0% | 1–74 |
|    scrap_chartejournalismeecologie_signataires.py | 50 | 50 | 0% | 1–169 |
|    scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
|    scrap_tv_program.py | 62 | 62 | 0% | 1–149 |
|    scrap_youtube.py | 114 | 114 | 0% | 1–238 |
| **quotaclimat/data_ingestion/ingest_db** | | | | |
|    ingest_sitemap_in_db.py | 59 | 41 | 31% | 21–42, 45–65, 69–80 |
| **quotaclimat/data_ingestion/scrap_html** | | | | |
|    scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| **quotaclimat/data_processing/mediatree** | | | | |
|    api_import.py | 177 | 102 | 42% | 38–42, 47–50, 54–57, 63, 66–93, 99–114, 119–121, 146–153, 157–160, 164–170, 181–192, 195–199, 205, 231–232, 236, 240–259, 263–274 |
|    config.py | 15 | 2 | 87% | 7, 16 |
|    detect_keywords.py | 110 | 4 | 96% | 131–133, 148 |
|    utils.py | 66 | 22 | 67% | 19, 30–54, 57, 76–77 |
| **quotaclimat/data_processing/sitemap** | | | | |
|    sitemap_processing.py | 41 | 27 | 34% | 15–19, 23–25, 29–47, 51–58, 66–96, 101–103 |
| **quotaclimat/utils** | | | | |
|    channels.py | 6 | 6 | 0% | 1–95 |
|    climate_keywords.py | 2 | 2 | 0% | 3–35 |
|    healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
|    logger.py | 14 | 3 | 79% | 22–24 |
|    plotly_theme.py | 17 | 17 | 0% | 1–56 |
|    sentry.py | 10 | 2 | 80% | 21–22 |
| **TOTAL** | 1579 | 963 | 39% | |

Tests Skipped Failures Errors Time
51 0 💤 0 ❌ 0 🔥 49.533s ⏱️

Please sign in to comment.