From 449d562d64c70f665f6647e209d6e7f1b28de822 Mon Sep 17 00:00:00 2001
From: Paul Leclercq <paleclercq@gmail.com>
Date: Thu, 15 Feb 2024 19:57:22 +0100
Subject: [PATCH] Feat/number of keywords (#104)

* feat: overlap number_of_keywords

* includes distance time keyword into parsing

* wip: update keywords saved in pg based on new logic

* autoreview

* docker compose
---
 README.md                                     |   5 +
 docker-compose.yml                            |   1 +
 .../data_processing/mediatree/api_import.py   | 136 +++--------------
 .../data_processing/mediatree/config.py       |  21 +++
 .../mediatree/detect_keywords.py              | 131 +++++++++++++++++
 .../mediatree/update_pg_keywords.py           |  37 +++++
 .../data_processing/mediatree/utils.py        |   7 +
 test/sitemap/test_mediatree.py                | 138 +++++++++++++++---
 test/sitemap/test_update_pg_keywords.py       |  67 +++++++++
 9 files changed, 413 insertions(+), 130 deletions(-)
 create mode 100644 quotaclimat/data_processing/mediatree/config.py
 create mode 100644 quotaclimat/data_processing/mediatree/detect_keywords.py
 create mode 100644 quotaclimat/data_processing/mediatree/update_pg_keywords.py
 create mode 100644 test/sitemap/test_update_pg_keywords.py

diff --git a/README.md b/README.md
index 1976bac5..46b987ba 100644
--- a/README.md
+++ b/README.md
@@ -268,6 +268,11 @@ docker compose up mediatree
 Use env variable `START_DATE` like in docker compose (epoch second format : 1705409797).
 
 Otherwise, default is yesterday midnight date.
+
+### Batch update
+When the word detection logic changes, it must be re-applied to all the keywords already saved in our database.
+
+To do so, set the env variable `UPDATE` to "true", like in docker compose.
 ### Fix linting
 Before committing, make sure that the line of codes you wrote are conform to PEP8 standard by running:
 ```bash
diff --git a/docker-compose.yml b/docker-compose.yml
index 1a8a896d..84353127 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -134,6 +134,7 @@ services:
       PORT: 5050 # healthcheck
       HEALTHCHECK_SERVER: "0.0.0.0"
      # START_DATE: 1704576615 # to test batch import 
+      # UPDATE: "true" # to batch update PG 
       MEDIATREE_USER : /run/secrets/username_api
       MEDIATREE_PASSWORD:  /run/secrets/pwd_api
       MEDIATREE_AUTH_URL: https://keywords.mediatree.fr/api/auth/token/
diff --git a/quotaclimat/data_processing/mediatree/api_import.py b/quotaclimat/data_processing/mediatree/api_import.py
index 0496f60c..8f1de148 100644
--- a/quotaclimat/data_processing/mediatree/api_import.py
+++ b/quotaclimat/data_processing/mediatree/api_import.py
@@ -1,37 +1,32 @@
 ### Library imports
 import requests
 import pandas as pd
-import datetime
 import json
 
 import logging
 import asyncio
-from utils import *
 import time
 import sys
 import os
 from quotaclimat.utils.healthcheck_config import run_health_check_server
 from quotaclimat.utils.logger import CustomFormatter
+from quotaclimat.data_processing.mediatree.utils import *
+from quotaclimat.data_processing.mediatree.config import *
+from quotaclimat.data_processing.mediatree.update_pg_keywords import *
+from quotaclimat.data_processing.mediatree.detect_keywords import *
 from postgres.insert_data import save_to_pg
-from postgres.schemas.models import create_tables, connect_to_db
+from postgres.schemas.models import create_tables, connect_to_db, get_db_session
 from postgres.schemas.models import keywords_table
 from pandas import json_normalize
 from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS
 from typing import List, Optional
-from quotaclimat.data_ingestion.scrap_sitemap import get_consistent_hash
-import re
-import swifter
 from tenacity import *
 
 #read whole file to a string
-password = os.environ.get("MEDIATREE_PASSWORD")
-if(password == '/run/secrets/pwd_api'):
-    password= open("/run/secrets/pwd_api", "r").read()
-AUTH_URL = os.environ.get("MEDIATREE_AUTH_URL") # 
-USER = os.environ.get("MEDIATREE_USER")
-if(USER == '/run/secrets/username_api'):
-    USER=open("/run/secrets/username_api", "r").read()
-KEYWORDS_URL = os.environ.get("KEYWORDS_URL") #https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList
+password = get_password()
+AUTH_URL = get_auth_url()
+USER = get_user()
+KEYWORDS_URL = get_keywords_url()
 
 def refresh_token(token, date):
     if is_it_tuesday(date): # refresh token every weekday for batch import
@@ -40,6 +35,14 @@ def refresh_token(token, date):
     else:
         return token
 
+# reapply word detector logic to all saved keywords
+# use when word detection is changed
+async def update_pg_data(exit_event):
+    logging.info("Updating already saved data from Postgresql")
+    session = get_db_session()
+    update_keywords(session)
+    exit_event.set()
+
 async def get_and_save_api_data(exit_event):
     conn = connect_to_db()
     token=get_auth_token(password=password, user_name=USER)
@@ -120,102 +123,6 @@ def get_theme_query_includes(theme_dict):
 def transform_theme_query_includes(themes_with_keywords = THEME_KEYWORDS):
     return list(map(get_theme_query_includes, themes_with_keywords))
 
-def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]:
-    result = []
-
-    logging.debug(f"Looking for timecode for {keywords}")
-    for multiple_keyword in keywords:
-        all_keywords = multiple_keyword.split() # case with multiple words such as 'économie circulaire'
-        match = next((item for item in subtitle_duration if is_word_in_sentence(all_keywords[0], item.get('text'))), None)  
-        logging.debug(f"match found {match} with {all_keywords[0].lower()}")     
-        if match is not None:
-            logging.debug(f'Result added due to this match {match} based on {all_keywords[0]}')
-            result.append(
-                {
-                    "keyword" :multiple_keyword.lower(),
-                    "timestamp" : match['cts_in_ms'],
-                    "theme" : theme
-                })
-
-    logging.debug(f"Timecode found {result}")
-    return result
-
-# be able to detect singular or plural for a word
-def format_word_regex(word: str) -> str:
-    word = word.replace('\'', '\' ?') # case for d'eau -> d' eau
-    if not word.endswith('s') and not word.endswith('x') and not word.endswith('à'):
-        return word + "s?"
-    elif word.endswith('s'):
-        return word + '?'
-    else:
-        return word
-
-def is_word_in_sentence(words: str, sentence: str) -> bool :
-    # words can contain plurals and several words
-    words = ' '.join(list(map(( lambda x: format_word_regex(x)), words.split(" "))))
-    logging.debug(f"testing {words}")
-    #  test https://regex101.com/r/ilvs9G/1/
-    if re.search(rf"\b{words}(?![\w-])", sentence, re.IGNORECASE):
-        logging.debug(f"words {words} found in {sentence}")
-        return True
-    else:
-        return False
-
-def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str]) -> List[Optional[List[str]]]:
-    matching_themes = []
-    keywords_with_timestamp = []
-
-    for theme, keywords in THEME_KEYWORDS.items():
-        logging.debug(f"searching {theme} for {keywords}")
-
-        matching_words = [word for word in keywords if is_word_in_sentence(word, plaintext)]  
-        if matching_words:
-            logging.debug(f"theme found : {theme} with word {matching_words}")
-            matching_themes.append(theme)
-            # look for cts_in_ms inside matching_words (['économie circulaire', 'panneaux solaires', 'solaires'] from subtitle_duration 
-            keywords_to_add = get_cts_in_ms_for_keywords(subtitle_duration, matching_words, theme)
-            if(len(keywords_to_add) == 0):
-                logging.warning(f"Check regex - Empty keywords but themes is there {theme} - matching_words {matching_words} - {subtitle_duration}")
-            keywords_with_timestamp.extend(keywords_to_add)
-    
-    if len(matching_themes) > 0:
-        return [matching_themes, keywords_with_timestamp, int(len(keywords_with_timestamp))]
-    else:
-        return [None, None, None]
-
-def log_min_max_date(df):
-    max_date = max(df['start'])
-    min_date = min(df['start'])
-    logging.info(f"Date min : {min_date}, max : {max_date}")
-
-def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
-    count_before_filtering = len(df)
-    logging.info(f"{count_before_filtering} subtitles to filter by keywords and tag with themes")
-    log_min_max_date(df)
-
-    logging.info(f'tagging plaintext subtitle with keywords and theme : regexp - search taking time...')
-    # using swifter to speed up apply https://github.com/jmcarpenter2/swifter
-    df[['theme', u'keywords_with_timestamp', 'number_of_keywords']] = df[['plaintext','srt']].swifter.apply(lambda row: get_themes_keywords_duration(*row), axis=1, result_type='expand')
-
-    # remove all rows that does not have themes
-    df = df.dropna(subset=['theme'])
-
-    df.drop('srt', axis=1, inplace=True)
-
-    logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")
-
-    return df
-
-def add_primary_key(df):
-    logging.info("Adding primary key to save to PG and have idempotent result")
-    try:
-        return (
-            df["start"].astype(str) + df["channel_name"]
-        ).apply(get_consistent_hash)
-    except (Exception) as error:
-        logging.error(error)
-        return get_consistent_hash("empty") #  TODO improve - should be a None ?
-
 # "Randomly wait up to 2^x * 1 seconds between each retry until the range reaches 60 seconds, then randomly up to 60 seconds afterwards"
 # @see https://github.com/jd/tenacity/tree/main
 @retry(wait=wait_random_exponential(multiplier=1, max=60),stop=stop_after_attempt(7))
@@ -314,7 +221,7 @@ def log_dataframe_size(df, channel):
         logging.warning(f"High Dataframe size : {bytes_size / (1000 * 1000)}")
     if(len(df) == 1000):
         logging.error("We might lose data - df size is 1000 out of 1000 - we should divide this querry")
-
+    
 async def main():    
     logger.info("Start api mediatree import")
     create_tables()
@@ -324,7 +231,10 @@ async def main():
     health_check_task = asyncio.create_task(run_health_check_server())
 
     # Start batch job
-    asyncio.create_task(get_and_save_api_data(event_finish))
+    if(os.environ.get("UPDATE") == "true"):
+        asyncio.create_task(update_pg_data(event_finish))
+    else:
+        asyncio.create_task(get_and_save_api_data(event_finish))
 
     # Wait for both tasks to complete
     await event_finish.wait()
@@ -354,3 +264,5 @@ async def main():
 
     asyncio.run(main())
     sys.exit(0)
+
+
diff --git a/quotaclimat/data_processing/mediatree/config.py b/quotaclimat/data_processing/mediatree/config.py
new file mode 100644
index 00000000..84dd55ad
--- /dev/null
+++ b/quotaclimat/data_processing/mediatree/config.py
@@ -0,0 +1,21 @@
+import os
+
+# when an env variable holds a docker secret path, the whole secret file is read into a string
+def get_password():
+    password = os.environ.get("MEDIATREE_PASSWORD")
+    if(password == '/run/secrets/pwd_api'):
+        password= open("/run/secrets/pwd_api", "r").read()
+    return password
+
+def get_auth_url():
+    return os.environ.get("MEDIATREE_AUTH_URL") # 
+
+def get_user():
+    USER = os.environ.get("MEDIATREE_USER")
+    if(USER == '/run/secrets/username_api'):
+        USER=open("/run/secrets/username_api", "r").read()
+    return USER
+
+#https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList
+def get_keywords_url():
+    return os.environ.get("KEYWORDS_URL") 
\ No newline at end of file
diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py
new file mode 100644
index 00000000..ee28e203
--- /dev/null
+++ b/quotaclimat/data_processing/mediatree/detect_keywords.py
@@ -0,0 +1,131 @@
+import pandas as pd
+
+import logging
+
+from quotaclimat.data_processing.mediatree.utils import *
+from quotaclimat.data_processing.mediatree.config import *
+from postgres.schemas.models import keywords_table
+from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS
+from typing import List, Optional
+from quotaclimat.data_ingestion.scrap_sitemap import get_consistent_hash
+import re
+import swifter
+
+def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]:
+    result = []
+
+    logging.debug(f"Looking for timecode for {keywords}")
+    for multiple_keyword in keywords:
+        all_keywords = multiple_keyword.split() # case with multiple words such as 'économie circulaire'
+        match = next((item for item in subtitle_duration if is_word_in_sentence(all_keywords[0], item.get('text'))), None)  
+        logging.debug(f"match found {match} with {all_keywords[0].lower()}")     
+        if match is not None:
+            logging.debug(f'Result added due to this match {match} based on {all_keywords[0]}')
+            result.append(
+                {
+                    "keyword" :multiple_keyword.lower(),
+                    "timestamp" : match['cts_in_ms'],
+                    "theme" : theme
+                })
+
+    logging.debug(f"Timecode found {result}")
+    return result
+
+# be able to detect singular or plural for a word
+def format_word_regex(word: str) -> str:
+    word = word.replace('\'', '\' ?') # case for d'eau -> d' eau
+    if not word.endswith('s') and not word.endswith('x') and not word.endswith('à'):
+        return word + "s?"
+    elif word.endswith('s'):
+        return word + '?'
+    else:
+        return word
+
+def is_word_in_sentence(words: str, sentence: str) -> bool :
+    # words can contain plurals and several words
+    words = ' '.join(list(map(( lambda x: format_word_regex(x)), words.split(" "))))
+    logging.debug(f"testing {words}")
+    #  test https://regex101.com/r/ilvs9G/1/
+    if re.search(rf"\b{words}(?![\w-])", sentence, re.IGNORECASE):
+        logging.debug(f"words {words} found in {sentence}")
+        return True
+    else:
+        return False
+
+def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str]) -> List[Optional[List[str]]]:
+    matching_themes = []
+    keywords_with_timestamp = []
+
+    for theme, keywords in THEME_KEYWORDS.items():
+        logging.debug(f"searching {theme} for {keywords}")
+
+        matching_words = [word for word in keywords if is_word_in_sentence(word, plaintext)]  
+        if matching_words:
+            logging.debug(f"theme found : {theme} with word {matching_words}")
+            matching_themes.append(theme)
+            # look for the cts_in_ms of matching_words (e.g. ['économie circulaire', 'panneaux solaires', 'solaires']) inside subtitle_duration
+            keywords_to_add = get_cts_in_ms_for_keywords(subtitle_duration, matching_words, theme)
+            if(len(keywords_to_add) == 0):
+                logging.warning(f"Check regex - Empty keywords but themes is there {theme} - matching_words {matching_words} - {subtitle_duration}")
+            keywords_with_timestamp.extend(keywords_to_add)
+    
+    if len(matching_themes) > 0:
+        return [matching_themes, keywords_with_timestamp, count_keywords_duration_overlap(keywords_with_timestamp)]
+    else:
+        return [None, None, None]
+
+def log_min_max_date(df):
+    max_date = max(df['start'])
+    min_date = min(df['start'])
+    logging.info(f"Date min : {min_date}, max : {max_date}")
+
+def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
+    count_before_filtering = len(df)
+    logging.info(f"{count_before_filtering} subtitles to filter by keywords and tag with themes")
+    log_min_max_date(df)
+
+    logging.info(f'tagging plaintext subtitle with keywords and theme : regexp - search taking time...')
+    # using swifter to speed up apply https://github.com/jmcarpenter2/swifter
+    df[['theme', u'keywords_with_timestamp', 'number_of_keywords']] = df[['plaintext','srt']].swifter.apply(lambda row: get_themes_keywords_duration(*row), axis=1, result_type='expand')
+
+    # remove all rows that does not have themes
+    df = df.dropna(subset=['theme'])
+
+    df.drop('srt', axis=1, inplace=True)
+
+    logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")
+
+    return df
+
+def add_primary_key(df):
+    logging.info("Adding primary key to save to PG and have idempotent result")
+    try:
+        return (
+            df["start"].astype(str) + df["channel_name"]
+        ).apply(get_consistent_hash)
+    except (Exception) as error:
+        logging.error(error)
+        return get_consistent_hash("empty") #  TODO improve - should be a None ?
+
+def count_keywords_duration_overlap(keywords_with_timestamp: List[dict]) -> int:
+    if(len(keywords_with_timestamp)) <= 1:
+        return len(keywords_with_timestamp)
+    else:
+        # in case keywords are not in the right order
+        sorted_keywords = iter(sorted(keywords_with_timestamp, key=lambda x: x['timestamp']))
+
+        count = 1
+        previous_timestamp = next(sorted_keywords)['timestamp']
+
+        for keyword_info in sorted_keywords:
+            current_timestamp = keyword_info['timestamp']
+            overlap_time = current_timestamp - previous_timestamp
+            
+            if is_time_distance_between_keyword_enough(overlap_time):
+                logging.debug(f"No overlapping keyword {count} + 1 : {overlap_time}")
+                count += 1
+                previous_timestamp = current_timestamp
+            else:
+                logging.debug(f"Keyword timestamp overlap : {overlap_time} - current count is {count}")
+
+        return count
\ No newline at end of file
diff --git a/quotaclimat/data_processing/mediatree/update_pg_keywords.py b/quotaclimat/data_processing/mediatree/update_pg_keywords.py
new file mode 100644
index 00000000..af2fa6ca
--- /dev/null
+++ b/quotaclimat/data_processing/mediatree/update_pg_keywords.py
@@ -0,0 +1,37 @@
+### Library imports
+import requests
+import pandas as pd
+import json
+
+import logging
+from sqlalchemy.orm import Session
+from postgres.schemas.models import Keywords
+from quotaclimat.data_processing.mediatree.detect_keywords import *
+
+def update_keywords(session: Session) -> list:
+    saved_keywords = get_keywords_columns(session)
+    logging.info(f"Updating {len(saved_keywords)} saved keywords")
+    for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords in saved_keywords:
+        new_number_of_keywords = count_keywords_duration_overlap(keywords_with_timestamp)
+        logging.debug(f"{keyword_id} new value {new_number_of_keywords}")
+        update_number_of_keywords(session, keyword_id, new_number_of_keywords)
+    logging.info("updated all keywords")
+
+
+def get_keywords_columns(session: Session) -> list:
+    return (
+        session.query(
+            Keywords.id,
+            Keywords.plaintext,
+            Keywords.keywords_with_timestamp,
+            Keywords.number_of_keywords
+        )
+        .all()
+    )
+
+def update_number_of_keywords(session: Session, keyword_id: int, new_number_of_keywords: int):
+    session.query(Keywords).filter(Keywords.id == keyword_id).update(
+        {Keywords.number_of_keywords: new_number_of_keywords},
+        synchronize_session=False
+    )
+    session.commit()
\ No newline at end of file
diff --git a/quotaclimat/data_processing/mediatree/utils.py b/quotaclimat/data_processing/mediatree/utils.py
index ec39cd71..d5102228 100644
--- a/quotaclimat/data_processing/mediatree/utils.py
+++ b/quotaclimat/data_processing/mediatree/utils.py
@@ -8,6 +8,13 @@
 
 timezone='Europe/Paris'
 
+
+def get_keyword_time_separation_ms():
+    return 15000
+
+def is_time_distance_between_keyword_enough(overlap):
+    return overlap >= get_keyword_time_separation_ms()
+
 def get_exact_days_from_week_day_name(
         start_date
         , end_date
diff --git a/test/sitemap/test_mediatree.py b/test/sitemap/test_mediatree.py
index b031cb44..085acca9 100644
--- a/test/sitemap/test_mediatree.py
+++ b/test/sitemap/test_mediatree.py
@@ -3,8 +3,9 @@
 
 from bs4 import BeautifulSoup
 from utils import get_localhost, debug_df
-from quotaclimat.data_processing.mediatree.api_import import format_word_regex, is_word_in_sentence, get_themes_keywords_duration, get_cts_in_ms_for_keywords, filter_and_tag_by_theme, parse_reponse_subtitle, get_includes_or_query, transform_theme_query_includes
-import json 
+from quotaclimat.data_processing.mediatree.api_import import *
+from quotaclimat.data_processing.mediatree.utils import *
+from quotaclimat.data_processing.mediatree.detect_keywords import *
 from postgres.insert_data import save_to_pg
 from postgres.schemas.models import keywords_table, connect_to_db, get_keyword, drop_tables
 from quotaclimat.data_processing.mediatree.keyword.keyword import THEME_KEYWORDS
@@ -413,6 +414,8 @@ def test_singular_plural_case_filter_and_tag_by_theme():
     pd.testing.assert_frame_equal(df.reset_index(drop=True), expected_result.reset_index(drop=True))
 
 def test_complexe_filter_and_tag_by_theme():
+    original_timestamp = 1706437079004
+    original_timestamp_first_keyword = original_timestamp + 6
     df1 = pd.DataFrame([{
         "start": 1704798000,
         "plaintext": "cheese pizza habitabilité de la planète conditions de vie sur terre animal",
@@ -420,51 +423,51 @@ def test_complexe_filter_and_tag_by_theme():
         "channel_radio": False,
         "srt": [{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079004,
+            "cts_in_ms": original_timestamp,
             "text": "cheese"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079005,
+            "cts_in_ms":original_timestamp + 5,
             "text": "pizza"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079006,
+            "cts_in_ms": original_timestamp_first_keyword,
             "text": "habitabilité"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079007,
+            "cts_in_ms":original_timestamp + 7,
             "text": "de"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079008,
+            "cts_in_ms":original_timestamp + 8,
             "text": "la"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079009,
+            "cts_in_ms":original_timestamp + 9,
             "text": "planète"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079010,
+            "cts_in_ms":original_timestamp + 10,
             "text": "conditions"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079011,
+            "cts_in_ms":original_timestamp + 11,
             "text": "de"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079011,
+            "cts_in_ms":original_timestamp + 11,
             "text": "vie"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079011,
+            "cts_in_ms":original_timestamp + 11,
             "text": "sur"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079011,
+            "cts_in_ms": original_timestamp_first_keyword + get_keyword_time_separation_ms(),
             "text": "terre"
             },{
             "duration_ms": 34,
-            "cts_in_ms": 1706437079012,
+            "cts_in_ms": original_timestamp + 12,
             "text": "animal"
             },
         ],
@@ -500,7 +503,7 @@ def test_complexe_filter_and_tag_by_theme():
                 "theme":"ressources_naturelles_concepts_generaux",
             }
         ]
-        ,"number_of_keywords": 4
+        ,"number_of_keywords": 2
     }])
 
     # List of words to filter on
@@ -545,7 +548,7 @@ def test_save_to_pg_keyword():
         "channel_radio": False,
         "theme": themes,
         "keywords_with_timestamp": keywords_with_timestamp
-        ,"number_of_keywords": 4
+        ,"number_of_keywords": 1
     }])
 
     df['start'] = pd.to_datetime(df['start'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Europe/Paris')
@@ -560,7 +563,7 @@ def test_save_to_pg_keyword():
     assert result.channel_radio == False
     assert result.theme == themes 
     assert result.keywords_with_timestamp == keywords_with_timestamp
-    assert result.number_of_keywords == 4
+    assert result.number_of_keywords == 1
     assert result.start == datetime.datetime(2024, 1, 28, 10, 17, 59, 6000)
 
 def test_is_word_in_sentence():
@@ -584,4 +587,103 @@ def test_format_word_regex():
     assert format_word_regex("voitures") == "voitures?"
     assert format_word_regex("voiture") == "voitures?"
     assert format_word_regex("coraux") == "coraux"
-    assert format_word_regex("d'eau") == "d' ?eaus?"
\ No newline at end of file
+    assert format_word_regex("d'eau") == "d' ?eaus?"
+
+def test_overlap_count_keywords_duration_overlap():
+    original_timestamp = 1708010919000
+    keywords_with_timestamp = [{
+                "keyword" : 'habitabilité de la planète',
+                "timestamp": original_timestamp + 1,
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'conditions de vie sur terre',
+                "timestamp": original_timestamp + 2,
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'planète',
+                "timestamp": original_timestamp + 3,
+                "theme":"ressources_naturelles_concepts_generaux",
+            },
+            {
+                "keyword" : 'terre',
+                "timestamp": original_timestamp + 4,
+                "theme":"ressources_naturelles_concepts_generaux",
+            }
+    ]
+    
+    assert count_keywords_duration_overlap(keywords_with_timestamp) == 1
+  
+def test_no_overlap_count_keywords_duration_overlap():
+    original_timestamp = 1708010900000
+    keywords_with_timestamp = [{
+                "keyword" : 'habitabilité de la planète',
+                "timestamp": original_timestamp + get_keyword_time_separation_ms(), 
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'conditions de vie sur terre',
+                "timestamp": original_timestamp + 2 * get_keyword_time_separation_ms(),
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'planète',
+                "timestamp": original_timestamp + 3* get_keyword_time_separation_ms(),
+                "theme":"ressources_naturelles_concepts_generaux",
+            },
+            {
+                "keyword" : 'terre',
+                "timestamp": original_timestamp + 4 * get_keyword_time_separation_ms(),
+                "theme":"ressources_naturelles_concepts_generaux",
+            }
+    ]
+    
+    assert count_keywords_duration_overlap(keywords_with_timestamp) == 4
+
+def test_with_a_mix_of_overlap_count_keywords_duration_overlap():
+    original_timestamp = 1708010900000
+    keywords_with_timestamp = [{
+                "keyword" : 'habitabilité de la planète',
+                "timestamp": original_timestamp, # count for one
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'conditions de vie sur terre',
+                "timestamp": original_timestamp + get_keyword_time_separation_ms() / 2,
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'planète',
+                "timestamp": original_timestamp + get_keyword_time_separation_ms(), # count for one
+                "theme":"ressources_naturelles_concepts_generaux",
+            },
+            {
+                "keyword" : 'terre',
+                "timestamp": original_timestamp + get_keyword_time_separation_ms() + 2000,
+                "theme":"ressources_naturelles_concepts_generaux",
+            },
+            {
+                "keyword" : 'terre',
+                "timestamp": original_timestamp + get_keyword_time_separation_ms() + 10000,
+                "theme":"ressources_naturelles_concepts_generaux",
+            },
+            {
+                "keyword" : 'terre',
+                "timestamp": original_timestamp + get_keyword_time_separation_ms() * 2,  # count for one
+                "theme":"ressources_naturelles_concepts_generaux",
+            }
+    ]
+    
+    assert count_keywords_duration_overlap(keywords_with_timestamp) == 3
+
+def test_only_one_count_keywords_duration_overlap():
+    original_timestamp = 1708010900000
+    keywords_with_timestamp = [{
+                "keyword" : 'habitabilité de la planète',
+                "timestamp": original_timestamp, # count for one
+                "theme":"changement_climatique_constat",
+            }
+    ]
+    
+    assert count_keywords_duration_overlap(keywords_with_timestamp) == 1
\ No newline at end of file
diff --git a/test/sitemap/test_update_pg_keywords.py b/test/sitemap/test_update_pg_keywords.py
new file mode 100644
index 00000000..734b1f7a
--- /dev/null
+++ b/test/sitemap/test_update_pg_keywords.py
@@ -0,0 +1,67 @@
+import logging
+
+from quotaclimat.data_processing.mediatree.update_pg_keywords import *
+
+from postgres.insert_data import (clean_data,
+                                  insert_data_in_sitemap_table)
+from quotaclimat.data_ingestion.scrap_sitemap import (add_primary_key, get_consistent_hash)
+
+from postgres.schemas.models import create_tables, get_db_session, get_keyword, connect_to_db
+from postgres.insert_data import save_to_pg
+from quotaclimat.data_processing.mediatree.detect_keywords import *
+def test_insert_data_in_sitemap_table():
+    create_tables()
+    session = get_db_session()
+    conn = connect_to_db()
+    wrong_value = 0
+    # insert data
+    primary_key = "test_save_to_pg_keyword"
+    keywords_with_timestamp = [{
+                "keyword" : 'habitabilité de la planète',
+                "timestamp": 1706437079006, 
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'conditions de vie sur terre',
+                "timestamp": 1706437079010,
+                "theme":"changement_climatique_constat",
+            },
+            {
+                "keyword" : 'planète',
+                "timestamp": 1706437079009,
+                "theme":"ressources_naturelles_concepts_generaux",
+            },
+            {
+                "keyword" : 'terre',
+                "timestamp": 1706437079011,
+                "theme":"ressources_naturelles_concepts_generaux",
+            }
+        ]
+    themes = [
+            "changement_climatique_constat",
+            "ressources_naturelles_concepts_generaux",
+        ]
+    channel_name = "m6"
+    df = pd.DataFrame([{
+        "id" : primary_key,
+        "start": 1706437079006,
+        "plaintext": "cheese pizza habitabilité de la planète conditions de vie sur terre animal",
+        "channel_name": channel_name,
+        "channel_radio": False,
+        "theme": themes,
+        "keywords_with_timestamp": keywords_with_timestamp
+        ,"number_of_keywords": wrong_value # wrong data to reapply our custom logic for "new_value"
+    }]) 
+    df['start'] = pd.to_datetime(df['start'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Europe/Paris')
+   
+    assert save_to_pg(df, keywords_table, conn) == 1
+
+    # read the saved row before and after the update so both values can be compared
+    result_before_update = get_keyword(primary_key)
+    update_keywords(session)
+    result_after_update = get_keyword(primary_key)
+
+    new_value = count_keywords_duration_overlap(keywords_with_timestamp)
+    assert result_after_update.id == result_before_update.id
+    assert result_after_update.number_of_keywords == new_value
+    assert result_before_update.number_of_keywords == wrong_value