Fix/calculation keyword inside first word (#128)
* wip: close timestamp

* wip
polomarcus authored Mar 1, 2024
1 parent dcaa07f commit 48e9742
Showing 10 changed files with 143 additions and 46 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -247,9 +247,11 @@ Thanks to the nginx container, we can have a local server for sitemap :

```
docker compose up -d nginx # used to scrap sitemap locally - a figaro like website with only 3 news
pytest test # "test" is the folder containing tests
# docker compose up test with entrypoint modified to sleep
# docker exec test bash
pytest -vv --log-level DEBUG test # "test" is the folder containing tests
# Only one test
pytest -k 'mediatree'
pytest -vv --log-level DEBUG -k detect
# OR
docker compose up test # test is the container name running pytest test
```
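In the commands above, `-vv` raises pytest's verbosity, `--log-level DEBUG` surfaces the pipeline's debug logging, and `-k` runs only the tests whose names match the given expression.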
5 changes: 3 additions & 2 deletions docker-compose.yml
@@ -5,8 +5,8 @@ services:
build:
context: ./
dockerfile: Dockerfile
entrypoint: ["poetry", "run", "pytest","-vv", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
# entrypoint: ["sleep", "12000"] # use to debug the container if needed
entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
#entrypoint: ["sleep", "12000"] # use to debug the container if needed
environment:
ENV: docker
# CHANNEL: "fr3-idf"
@@ -24,6 +24,7 @@ services:
- ./postgres/:/app/postgres/
- ./test/:/app/test/
- ./app.py:/app/app.py
- ./pyproject.toml:/app/pyproject.toml
depends_on:
nginxtest:
condition: service_healthy
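Two things change here: the test entrypoint now passes `-o log_cli=true` so pytest prints log records live, and `pyproject.toml` is mounted into the container so the pytest configuration further down in this diff stays in sync with the repo.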
3 changes: 1 addition & 2 deletions postgres/insert_data.py
@@ -16,15 +16,14 @@ def clean_data(df: pd.DataFrame):
# from https://stackoverflow.com/a/69421596/3535853
def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter):
data = [dict(zip(keys, row)) for row in data_iter]
logging.debug("data_iter %s", data)

insert_statement = insert(table.table).values(data)

on_duplicate_key_stmt = insert_statement.on_conflict_do_update(
constraint=f"{table.table.name}_pkey",
set_={c.key: c for c in insert_statement.excluded},
)

logging.debug("insert_statement %s", on_duplicate_key_stmt)
return conn.execute(on_duplicate_key_stmt)


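For context, this handler matches the callable signature pandas expects for custom SQL insertion. A minimal wiring sketch follows; the connection URL, table name, and columns are placeholders, not taken from this repo:

```python
# Hypothetical wiring sketch: DataFrame.to_sql accepts a callable through its
# `method` argument with the (table, conn, keys, data_iter) signature
# implemented by insert_or_do_nothing_on_conflict above.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://user:password@localhost:5432/db")  # placeholder URL

df = pd.DataFrame([{"id": "abc123", "number_of_keywords": 4}])  # placeholder columns
df.to_sql(
    "keywords",            # placeholder table name
    engine,
    if_exists="append",
    index=False,
    # despite its name, the handler updates the row on primary-key conflict
    # (SQLAlchemy's on_conflict_do_update)
    method=insert_or_do_nothing_on_conflict,
)
```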
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -10,6 +10,9 @@ name = "pypi-public"
url = "https://pypi.org/simple/"
priority = "primary"

[tool.pytest.ini_options]
log_cli = 1
log_cli_level = "DEBUG"

[[tool.poetry.source]]
name = "PyPI"
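The `[tool.pytest.ini_options]` block added above makes pytest stream log records to the terminal at DEBUG level on every run; it is the permanent counterpart of the `-o log_cli=true` flag added to the docker-compose entrypoint earlier in this diff.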
20 changes: 4 additions & 16 deletions quotaclimat/data_processing/mediatree/api_import.py
@@ -8,7 +8,7 @@
import sys
import os
from quotaclimat.utils.healthcheck_config import run_health_check_server
from quotaclimat.utils.logger import CustomFormatter
from quotaclimat.utils.logger import getLogger
from quotaclimat.data_processing.mediatree.utils import *
from quotaclimat.data_processing.mediatree.config import *
from quotaclimat.data_processing.mediatree.update_pg_keywords import *
@@ -214,9 +214,8 @@ def parse_number_pages(response_sub) -> int :

def parse_reponse_subtitle(response_sub, channel = None) -> Optional[pd.DataFrame]:
with sentry_sdk.start_transaction(op="task", name="parse_reponse_subtitle"):
logging.debug(f"Parsing json response:\n {response_sub}")

total_results = parse_total_results(response_sub)
logging.getLogger("modin.logger.default").setLevel(logging.WARNING)
if(total_results > 0):
logging.info(f"{total_results} 'total_results' field")

@@ -230,8 +229,7 @@ def parse_reponse_subtitle(response_sub, channel = None) -> Optional[pd.DataFram
new_df.rename(columns={'channel.name':'channel_name', 'channel.radio': 'channel_radio', 'timestamp':'start'}, inplace=True)

log_dataframe_size(new_df, channel)

logging.debug("Parsed %s" % (new_df.head(1).to_string()))

logging.debug("Parsed Schema\n%s", new_df.dtypes)

return new_df
@@ -267,17 +265,7 @@ async def main():
sys.exit(0)

if __name__ == "__main__":
# create logger with 'spam_application'
logger = logging.getLogger()
logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper())

# create console handler with a higher log level
if (logger.hasHandlers()):
logger.handlers.clear()
ch = logging.StreamHandler()
ch.setFormatter(CustomFormatter())
logger.addHandler(ch)

getLogger()
asyncio.run(main())
sys.exit(0)

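The hand-rolled logger setup that used to live in this `__main__` block has moved to the shared `getLogger()` helper in `quotaclimat/utils/logger.py`, shown later in this diff; the new `logging.getLogger("modin.logger.default").setLevel(logging.WARNING)` call also quiets modin's chatty default logger during parsing.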
46 changes: 41 additions & 5 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
@@ -12,7 +12,7 @@
import sentry_sdk
import modin.pandas as pd
import dask

from quotaclimat.utils.logger import getLogger
dask.config.set({'dataframe.query-planning': True})

def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]:
@@ -49,25 +49,61 @@ def format_word_regex(word: str) -> str:
def is_word_in_sentence(words: str, sentence: str) -> bool :
# words can contain plurals and several words
words = ' '.join(list(map(( lambda x: format_word_regex(x)), words.split(" "))))
logging.debug(f"testing {words}")

# test https://regex101.com/r/ilvs9G/1/
if re.search(rf"\b{words}(?![\w-])", sentence, re.IGNORECASE):
logging.debug(f"words {words} found in {sentence}")
return True
else:
return False

# some keywords are contained inside other keywords, we need to filter them

def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict]) -> List[dict]:
# Group keywords by timestamp
def set_timestamp_with_margin(keywords_with_timestamp: List[dict]) -> List[dict]:
number_of_keywords = len(keywords_with_timestamp)
if number_of_keywords > 1:
for i in range(len(keywords_with_timestamp) - 1):
current_timestamp = keywords_with_timestamp[i].get("timestamp")
next_timestamp = keywords_with_timestamp[i + 1].get("timestamp")
current_keyword = keywords_with_timestamp[i].get("keyword")
next_keyword = keywords_with_timestamp[i + 1].get("keyword")

if current_timestamp is not None and next_timestamp is not None:
if next_timestamp - current_timestamp < 1000:
current_keyword = keywords_with_timestamp[i].get("keyword")
next_keyword = keywords_with_timestamp[i + 1].get("keyword")
if len(current_keyword) > len(next_keyword):
shortest_word = next_keyword
longest_word = current_keyword
timestamp_to_change = current_timestamp
else:
shortest_word = current_keyword
longest_word = next_keyword
timestamp_to_change = next_timestamp

if shortest_word in longest_word:
logging.info(f"Close keywords - we group them {shortest_word} - {longest_word}")
keywords_with_timestamp[i]["timestamp"] = timestamp_to_change
keywords_with_timestamp[i+1]["timestamp"] = timestamp_to_change

return keywords_with_timestamp

# some keywords are contained inside other keywords, we need to filter them
def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> List[dict]:
logging.debug(f"Filtering keywords with same timestamp with a margin of one second")
number_of_keywords = len(keywords_with_timestamp)
keywords_with_timestamp = set_timestamp_with_margin(keywords_with_timestamp)
# Group keywords by timestamp - with a margin of 1 second
grouped_keywords = {timestamp: list(group) for timestamp, group in groupby(keywords_with_timestamp, key=lambda x: x['timestamp'])}

# Filter out keywords with the same timestamp and keep the longest keyword
result = [
max(group, key=lambda x: len(x['keyword']))
for group in grouped_keywords.values()
]
final_result = len(result)

if final_result < number_of_keywords:
logging.info(f"Filtering keywords {final_result} out of {number_of_keywords} | {keywords_with_timestamp} with final result {result}")

return result

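To make the new grouping behaviour concrete, here is a small usage sketch; the sample data is invented for illustration (it mirrors the `béton bas carbone` test below) and is not part of the diff:

```python
# Illustrative data: "carbone" is detected 960 ms after the longer keyword
# "béton bas carbone" that contains it.
keywords_with_timestamp = [
    {"keyword": "carbone", "timestamp": 1707627703960,
     "theme": "changement_climatique_causes_directes"},
    {"keyword": "béton bas carbone", "timestamp": 1707627703000,
     "theme": "atténuation_climatique_solutions_directes"},
]

# set_timestamp_with_margin snaps both entries to the longer keyword's
# timestamp (they are less than 1000 ms apart and one keyword contains the
# other), then the groupby in filter_keyword_with_same_timestamp keeps only
# the longest keyword of the group.
print(filter_keyword_with_same_timestamp(keywords_with_timestamp))
# [{'keyword': 'béton bas carbone', 'timestamp': 1707627703000,
#   'theme': 'atténuation_climatique_solutions_directes'}]
```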
17 changes: 15 additions & 2 deletions quotaclimat/utils/logger.py
@@ -1,5 +1,5 @@
import logging

import os
class CustomFormatter(logging.Formatter):

grey = "\x1b[38;20m"
@@ -21,4 +21,17 @@ class CustomFormatter(logging.Formatter):
def format(self, record):
log_fmt = self.FORMATS.get(record.levelno)
formatter = logging.Formatter(log_fmt)
return formatter.format(record)
return formatter.format(record)

def getLogger():
# configure the root logger from the LOGLEVEL environment variable
logger = logging.getLogger()
logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper())
# replace any existing handlers with a console handler using CustomFormatter
if (logger.hasHandlers()):
logger.handlers.clear()
ch = logging.StreamHandler()
ch.setFormatter(CustomFormatter())
logger.addHandler(ch)

return logger
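A short usage sketch of the new helper, mirroring the call added in `api_import.py` above:

```python
import logging

from quotaclimat.utils.logger import getLogger

getLogger()  # installs CustomFormatter on the root logger; level comes from LOGLEVEL
logging.info("this record is rendered by CustomFormatter")
```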
71 changes: 65 additions & 6 deletions test/sitemap/test_detect_keywords.py
@@ -67,6 +67,12 @@ def test_get_themes_keywords_duration():
,"adaptation_climatique_solutions_directes"
],[], 0]


assert get_themes_keywords_duration("il rencontre aussi une crise majeure de la pénurie de l' offre laetitia jaoude des barrages sauvages", subtitles, start) == [[
"changement_climatique_consequences"
,"atténuation_climatique_solutions_directes"
],[], 0]

def test_get_cts_in_ms_for_keywords():
str = [{
"duration_ms": 34,
@@ -310,7 +316,7 @@ def test_complexe_filter_and_tag_by_theme():
"text": "planète"
},{
"duration_ms": 34,
"cts_in_ms":original_timestamp + 10,
"cts_in_ms": original_timestamp + get_keyword_time_separation_ms(),
"text": "conditions"
},{
"duration_ms": 34,
@@ -354,16 +360,16 @@ def test_complexe_filter_and_tag_by_theme():
],
"keywords_with_timestamp": [{
"keyword" : 'habitabilité de la planète',
"timestamp": 1706437079006, # count for one
"timestamp": original_timestamp_first_keyword, # count for one
"theme":"changement_climatique_constat",
},
{
"keyword" : 'conditions de vie sur terre',
"timestamp": 1706437079010, # timestamp too close
"timestamp": original_timestamp + get_keyword_time_separation_ms(), # timestamp too close
"theme":"changement_climatique_constat",
}
]
,"number_of_keywords": 1
,"number_of_keywords": 2
}])

# List of words to filter on
@@ -614,7 +620,7 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp():

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

def test_keyword_inside_keyword_filter_keyword_with_same_timestamp():
def test_keyword_2words_inside_keyword_filter_keyword_with_same_timestamp():
keywords_with_timestamp = [{
"keyword" : 'agriculture',
"timestamp": original_timestamp,
@@ -636,6 +642,59 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp():

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

# we should keep the longest keyword, even if it comes before the first one
def test_keyword_second_word_a_bit_later_inside_keyword_filter_keyword_with_same_timestamp():
later_timestamp = original_timestamp + 960 # from real data
keywords_with_timestamp = [{
"keyword" : 'carbone',
"timestamp": later_timestamp,
"theme":"changement_climatique_causes_directes",
},
{
"keyword" : 'béton bas carbone',
"timestamp": original_timestamp, # same timestamp, so we take longest keyword
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
}
]

expected = [{
"keyword" : 'béton bas carbone',
"timestamp": original_timestamp, # same timestamp, so we take longest keyword
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
}
]

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

# keywords whose timestamps are far enough apart should both be kept
def test_keyword_second_word_to_keep_inside_keyword_filter_keyword_with_same_timestamp():
keywords_with_timestamp = [{
"theme": "changement_climatique_consequences",
"timestamp": 1707627703040,
"keyword": "pénurie"
},
{
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
"timestamp": 1707627708051,
"keyword": "barrages"
},
]

expected = [
{
"keyword": "pénurie",
"timestamp": 1707627703040,
"theme": "changement_climatique_consequences",
},
{
"keyword" : 'barrages',
"timestamp": 1707627708051, # same timestamp, so we take longest keyword
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
}
]

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

def test_filter_keyword_with_same_timestamp():
keywords_with_timestamp = [{ #nothing to filter
"keyword" : "période la plus chaude",
@@ -644,7 +703,7 @@ def test_filter_keyword_with_same_timestamp():
},
{
"keyword" : "élévation du niveau de la mer",
"timestamp": original_timestamp + 1,
"timestamp": original_timestamp + 1200, # margin superior to 1000ms
"theme":"changement_climatique_consequences",
}
]
16 changes: 6 additions & 10 deletions test/sitemap/test_main_import_api.py
@@ -51,11 +51,12 @@ def test_second_row_api_import():
primary_key = "67b9cc593516b40f55d6a3e89b377fccc8ab76d263c5fd6d4bfe379626190641"
specific_keyword = get_keyword(primary_key)
assert specific_keyword.theme == [
"changement_climatique_constat",
"changement_climatique_causes_indirectes",
"changement_climatique_consequences",
"atténuation_climatique_solutions_directes"
"changement_climatique_constat",
"changement_climatique_causes_indirectes",
"changement_climatique_consequences",
"atténuation_climatique_solutions_directes"
]

assert specific_keyword.keywords_with_timestamp == [ # from metabase to speedup check
{
"keyword": "écologique",
@@ -67,11 +68,6 @@ def test_second_row_api_import():
"timestamp": 1707627631076,
"theme": "changement_climatique_constat"
},
{
"keyword": "pétrole",
"timestamp": 1707627629004,
"theme": "changement_climatique_causes_indirectes"
},
{
"keyword": "puits de pétrole",
"timestamp": 1707627628054,
@@ -92,7 +88,7 @@
"timestamp": 1707627686004,
"theme": "atténuation_climatique_solutions_directes"
}
]
]
assert specific_keyword.number_of_keywords == 4

def test_third_row_api_import():
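The dropped expectation above follows from the new margin logic: "pétrole" (timestamp 1707627629004) sits within 1000 ms of the containing keyword "puits de pétrole" (1707627628054), so only the longer keyword is kept now.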
2 changes: 1 addition & 1 deletion test/sitemap/test_mediatree_utils.py
@@ -2,7 +2,7 @@
import pandas as pd

from utils import get_localhost
from quotaclimat.data_processing.mediatree.utils import get_yesterday, get_date_range, get_start_end_date_env_variable_with_default, is_it_tuesday
from quotaclimat.data_processing.mediatree.utils import *

import logging
from time import strftime,localtime

1 comment on commit 48e9742

@github-actions

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---:|---:|---:|---|
| **postgres** | | | | |
| insert_data.py | 44 | 7 | 84% | 37–39, 58–60, 65 |
| insert_existing_data_example.py | 20 | 3 | 85% | 25–27 |
| **postgres/schemas** | | | | |
| models.py | 72 | 15 | 79% | 74–81, 91–92, 101–111 |
| **quotaclimat/data_analytics** | | | | |
| analytics_signataire_charte.py | 29 | 29 | 0% | 1–67 |
| bilan.py | 108 | 108 | 0% | 2–372 |
| data_coverage.py | 34 | 34 | 0% | 1–94 |
| exploration.py | 125 | 125 | 0% | 1–440 |
| sitemap_analytics.py | 118 | 118 | 0% | 1–343 |
| **quotaclimat/data_ingestion** | | | | |
| categorization_program_type.py | 1 | 1 | 0% | 1 |
| config_youtube.py | 1 | 1 | 0% | 1 |
| scaleway_db_backups.py | 34 | 34 | 0% | 1–74 |
| scrap_chartejournalismeecologie_signataires.py | 50 | 50 | 0% | 1–169 |
| scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
| scrap_tv_program.py | 62 | 62 | 0% | 1–149 |
| scrap_youtube.py | 114 | 114 | 0% | 1–238 |
| **quotaclimat/data_ingestion/ingest_db** | | | | |
| ingest_sitemap_in_db.py | 59 | 41 | 31% | 21–42, 45–65, 69–80 |
| **quotaclimat/data_ingestion/scrap_html** | | | | |
| scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| **quotaclimat/data_processing/mediatree** | | | | |
| api_import.py | 177 | 103 | 42% | 38–42, 47–53, 57–60, 66, 69–96, 102–117, 122–124, 149–161, 165–168, 172–178, 189–200, 203–207, 213, 237–238, 242, 246–265, 268–270 |
| config.py | 15 | 2 | 87% | 7, 16 |
| detect_keywords.py | 143 | 6 | 96% | 75–77, 165–167 |
| utils.py | 66 | 22 | 67% | 19, 30–54, 57, 76–77 |
| **quotaclimat/data_processing/sitemap** | | | | |
| sitemap_processing.py | 41 | 27 | 34% | 15–19, 23–25, 29–47, 51–58, 66–96, 101–103 |
| **quotaclimat/utils** | | | | |
| channels.py | 6 | 6 | 0% | 1–95 |
| climate_keywords.py | 2 | 2 | 0% | 3–35 |
| healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
| logger.py | 24 | 11 | 54% | 22–24, 28–37 |
| plotly_theme.py | 17 | 17 | 0% | 1–56 |
| sentry.py | 10 | 2 | 80% | 21–22 |
| **TOTAL** | 1623 | 974 | 40% | |

| Tests | Skipped | Failures | Errors | Time |
|---:|---:|---:|---:|---:|
| 57 | 0 💤 | 0 ❌ | 0 🔥 | 50.627s ⏱️ |
