Fix/calculation keyword inside first word (#128)
* wip: close timestamp

* wip
polomarcus authored Mar 1, 2024
1 parent dcaa07f commit 48e9742
Showing 10 changed files with 143 additions and 46 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -247,9 +247,11 @@ Thanks to the nginx container, we can have a local server for sitemap :

```
docker compose up -d nginx # used to scrap sitemap locally - a figaro like website with only 3 news
pytest test # "test" is the folder containing tests
# docker compose up test with entrypoint modified to sleep
# docker exec test bash
pytest -vv --log-level DEBUG test # "test" is the folder containing tests
# Only one test
pytest -k 'mediatree'
pytest -vv --log-level DEBUG -k detect
# OR
docker compose up test # test is the container name running pytest test
```
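In the commands above, `-vv` raises pytest's verbosity, `--log-level DEBUG` surfaces the pipeline's debug logging, and `-k` runs only the tests whose names match the given expression.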
5 changes: 3 additions & 2 deletions docker-compose.yml
@@ -5,8 +5,8 @@ services:
build:
context: ./
dockerfile: Dockerfile
entrypoint: ["poetry", "run", "pytest","-vv", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
# entrypoint: ["sleep", "12000"] # use to debug the container if needed
entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
#entrypoint: ["sleep", "12000"] # use to debug the container if needed
environment:
ENV: docker
# CHANNEL: "fr3-idf"
@@ -24,6 +24,7 @@ services:
- ./postgres/:/app/postgres/
- ./test/:/app/test/
- ./app.py:/app/app.py
- ./pyproject.toml:/app/pyproject.toml
depends_on:
nginxtest:
condition: service_healthy
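Two things change here: the test entrypoint now passes `-o log_cli=true` so pytest prints log records live, and `pyproject.toml` is mounted into the container so the pytest configuration further down in this diff stays in sync with the repo.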
3 changes: 1 addition & 2 deletions postgres/insert_data.py
@@ -16,15 +16,14 @@ def clean_data(df: pd.DataFrame):
# from https://stackoverflow.com/a/69421596/3535853
def insert_or_do_nothing_on_conflict(table, conn, keys, data_iter):
data = [dict(zip(keys, row)) for row in data_iter]
logging.debug("data_iter %s", data)

insert_statement = insert(table.table).values(data)

on_duplicate_key_stmt = insert_statement.on_conflict_do_update(
constraint=f"{table.table.name}_pkey",
set_={c.key: c for c in insert_statement.excluded},
)

logging.debug("insert_statement %s", on_duplicate_key_stmt)
return conn.execute(on_duplicate_key_stmt)


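For context, this handler matches the callable signature pandas expects for custom SQL insertion. A minimal wiring sketch follows; the connection URL, table name, and columns are placeholders, not taken from this repo:

```python
# Hypothetical wiring sketch: DataFrame.to_sql accepts a callable through its
# `method` argument with the (table, conn, keys, data_iter) signature
# implemented by insert_or_do_nothing_on_conflict above.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://user:password@localhost:5432/db")  # placeholder URL

df = pd.DataFrame([{"id": "abc123", "number_of_keywords": 4}])  # placeholder columns
df.to_sql(
    "keywords",            # placeholder table name
    engine,
    if_exists="append",
    index=False,
    # despite its name, the handler updates the row on primary-key conflict
    # (SQLAlchemy's on_conflict_do_update)
    method=insert_or_do_nothing_on_conflict,
)
```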
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -10,6 +10,9 @@ name = "pypi-public"
url = "https://pypi.org/simple/"
priority = "primary"

[tool.pytest.ini_options]
log_cli = 1
log_cli_level = "DEBUG"

[[tool.poetry.source]]
name = "PyPI"
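The `[tool.pytest.ini_options]` block added above makes pytest stream log records to the terminal at DEBUG level on every run; it is the permanent counterpart of the `-o log_cli=true` flag added to the docker-compose entrypoint earlier in this diff.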
20 changes: 4 additions & 16 deletions quotaclimat/data_processing/mediatree/api_import.py
@@ -8,7 +8,7 @@
import sys
import os
from quotaclimat.utils.healthcheck_config import run_health_check_server
from quotaclimat.utils.logger import CustomFormatter
from quotaclimat.utils.logger import getLogger
from quotaclimat.data_processing.mediatree.utils import *
from quotaclimat.data_processing.mediatree.config import *
from quotaclimat.data_processing.mediatree.update_pg_keywords import *
@@ -214,9 +214,8 @@ def parse_number_pages(response_sub) -> int :

def parse_reponse_subtitle(response_sub, channel = None) -> Optional[pd.DataFrame]:
with sentry_sdk.start_transaction(op="task", name="parse_reponse_subtitle"):
logging.debug(f"Parsing json response:\n {response_sub}")

total_results = parse_total_results(response_sub)
logging.getLogger("modin.logger.default").setLevel(logging.WARNING)
if(total_results > 0):
logging.info(f"{total_results} 'total_results' field")

@@ -230,8 +229,7 @@ def parse_reponse_subtitle(response_sub, channel = None) -> Optional[pd.DataFram
new_df.rename(columns={'channel.name':'channel_name', 'channel.radio': 'channel_radio', 'timestamp':'start'}, inplace=True)

log_dataframe_size(new_df, channel)

logging.debug("Parsed %s" % (new_df.head(1).to_string()))

logging.debug("Parsed Schema\n%s", new_df.dtypes)

return new_df
@@ -267,17 +265,7 @@ async def main():
sys.exit(0)

if __name__ == "__main__":
# create logger with 'spam_application'
logger = logging.getLogger()
logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper())

# create console handler with a higher log level
if (logger.hasHandlers()):
logger.handlers.clear()
ch = logging.StreamHandler()
ch.setFormatter(CustomFormatter())
logger.addHandler(ch)

getLogger()
asyncio.run(main())
sys.exit(0)

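The hand-rolled logger setup that used to live in this `__main__` block has moved to the shared `getLogger()` helper in `quotaclimat/utils/logger.py`, shown later in this diff; the new `logging.getLogger("modin.logger.default").setLevel(logging.WARNING)` call also quiets modin's chatty default logger during parsing.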
46 changes: 41 additions & 5 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
@@ -12,7 +12,7 @@
import sentry_sdk
import modin.pandas as pd
import dask

from quotaclimat.utils.logger import getLogger
dask.config.set({'dataframe.query-planning': True})

def get_cts_in_ms_for_keywords(subtitle_duration: List[dict], keywords: List[str], theme: str) -> List[dict]:
@@ -49,25 +49,61 @@ def format_word_regex(word: str) -> str:
def is_word_in_sentence(words: str, sentence: str) -> bool :
# words can contain plurals and several words
words = ' '.join(list(map(( lambda x: format_word_regex(x)), words.split(" "))))
logging.debug(f"testing {words}")

# test https://regex101.com/r/ilvs9G/1/
if re.search(rf"\b{words}(?![\w-])", sentence, re.IGNORECASE):
logging.debug(f"words {words} found in {sentence}")
return True
else:
return False

# some keywords are contained inside other keywords, we need to filter them

def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict]) -> List[dict]:
# Group keywords by timestamp
def set_timestamp_with_margin(keywords_with_timestamp: List[dict]) -> List[dict]:
number_of_keywords = len(keywords_with_timestamp)
if number_of_keywords > 1:
for i in range(len(keywords_with_timestamp) - 1):
current_timestamp = keywords_with_timestamp[i].get("timestamp")
next_timestamp = keywords_with_timestamp[i + 1].get("timestamp")
current_keyword = keywords_with_timestamp[i].get("keyword")
next_keyword = keywords_with_timestamp[i + 1].get("keyword")

if current_timestamp is not None and next_timestamp is not None:
if next_timestamp - current_timestamp < 1000:
current_keyword = keywords_with_timestamp[i].get("keyword")
next_keyword = keywords_with_timestamp[i + 1].get("keyword")
if len(current_keyword) > len(next_keyword):
shortest_word = next_keyword
longest_word = current_keyword
timestamp_to_change = current_timestamp
else:
shortest_word = current_keyword
longest_word = next_keyword
timestamp_to_change = next_timestamp

if shortest_word in longest_word:
logging.info(f"Close keywords - we group them {shortest_word} - {longest_word}")
keywords_with_timestamp[i]["timestamp"] = timestamp_to_change
keywords_with_timestamp[i+1]["timestamp"] = timestamp_to_change

return keywords_with_timestamp

# some keywords are contained inside other keywords, we need to filter them
def filter_keyword_with_same_timestamp(keywords_with_timestamp: List[dict])-> List[dict]:
logging.debug(f"Filtering keywords with same timestamp with a margin of one second")
number_of_keywords = len(keywords_with_timestamp)
keywords_with_timestamp = set_timestamp_with_margin(keywords_with_timestamp)
# Group keywords by timestamp - with a margin of 1 second
grouped_keywords = {timestamp: list(group) for timestamp, group in groupby(keywords_with_timestamp, key=lambda x: x['timestamp'])}

# Filter out keywords with the same timestamp and keep the longest keyword
result = [
max(group, key=lambda x: len(x['keyword']))
for group in grouped_keywords.values()
]
final_result = len(result)

if final_result < number_of_keywords:
logging.info(f"Filtering keywords {final_result} out of {number_of_keywords} | {keywords_with_timestamp} with final result {result}")

return result

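To make the new grouping behaviour concrete, here is a small usage sketch; the sample data is invented for illustration (it mirrors the `béton bas carbone` test below) and is not part of the diff:

```python
# Illustrative data: "carbone" is detected 960 ms after the longer keyword
# "béton bas carbone" that contains it.
keywords_with_timestamp = [
    {"keyword": "carbone", "timestamp": 1707627703960,
     "theme": "changement_climatique_causes_directes"},
    {"keyword": "béton bas carbone", "timestamp": 1707627703000,
     "theme": "atténuation_climatique_solutions_directes"},
]

# set_timestamp_with_margin snaps both entries to the longer keyword's
# timestamp (they are less than 1000 ms apart and one keyword contains the
# other), then the groupby in filter_keyword_with_same_timestamp keeps only
# the longest keyword of the group.
print(filter_keyword_with_same_timestamp(keywords_with_timestamp))
# [{'keyword': 'béton bas carbone', 'timestamp': 1707627703000,
#   'theme': 'atténuation_climatique_solutions_directes'}]
```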
17 changes: 15 additions & 2 deletions quotaclimat/utils/logger.py
@@ -1,5 +1,5 @@
import logging

import os
class CustomFormatter(logging.Formatter):

grey = "\x1b[38;20m"
@@ -21,4 +21,17 @@ class CustomFormatter(logging.Formatter):
def format(self, record):
log_fmt = self.FORMATS.get(record.levelno)
formatter = logging.Formatter(log_fmt)
return formatter.format(record)
return formatter.format(record)

def getLogger():
# configure the root logger from the LOGLEVEL environment variable
logger = logging.getLogger()
logger.setLevel(level=os.getenv('LOGLEVEL', 'INFO').upper())
# replace any existing handlers with a console handler using CustomFormatter
if (logger.hasHandlers()):
logger.handlers.clear()
ch = logging.StreamHandler()
ch.setFormatter(CustomFormatter())
logger.addHandler(ch)

return logger
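A short usage sketch of the new helper, mirroring the call added in `api_import.py` above:

```python
import logging

from quotaclimat.utils.logger import getLogger

getLogger()  # installs CustomFormatter on the root logger; level comes from LOGLEVEL
logging.info("this record is rendered by CustomFormatter")
```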
71 changes: 65 additions & 6 deletions test/sitemap/test_detect_keywords.py
@@ -67,6 +67,12 @@ def test_get_themes_keywords_duration():
,"adaptation_climatique_solutions_directes"
],[], 0]


assert get_themes_keywords_duration("il rencontre aussi une crise majeure de la pénurie de l' offre laetitia jaoude des barrages sauvages", subtitles, start) == [[
"changement_climatique_consequences"
,"atténuation_climatique_solutions_directes"
],[], 0]

def test_get_cts_in_ms_for_keywords():
str = [{
"duration_ms": 34,
@@ -310,7 +316,7 @@ def test_complexe_filter_and_tag_by_theme():
"text": "planète"
},{
"duration_ms": 34,
"cts_in_ms":original_timestamp + 10,
"cts_in_ms": original_timestamp + get_keyword_time_separation_ms(),
"text": "conditions"
},{
"duration_ms": 34,
@@ -354,16 +360,16 @@ def test_complexe_filter_and_tag_by_theme():
],
"keywords_with_timestamp": [{
"keyword" : 'habitabilité de la planète',
"timestamp": 1706437079006, # count for one
"timestamp": original_timestamp_first_keyword, # count for one
"theme":"changement_climatique_constat",
},
{
"keyword" : 'conditions de vie sur terre',
"timestamp": 1706437079010, # timestamp too close
"timestamp": original_timestamp + get_keyword_time_separation_ms(), # timestamp too close
"theme":"changement_climatique_constat",
}
]
,"number_of_keywords": 1
,"number_of_keywords": 2
}])

# List of words to filter on
@@ -614,7 +620,7 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp():

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

def test_keyword_inside_keyword_filter_keyword_with_same_timestamp():
def test_keyword_2words_inside_keyword_filter_keyword_with_same_timestamp():
keywords_with_timestamp = [{
"keyword" : 'agriculture',
"timestamp": original_timestamp,
@@ -636,6 +642,59 @@ def test_keyword_inside_keyword_filter_keyword_with_same_timestamp():

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

# we should keep the longest keyword, even if it comes before the first one
def test_keyword_second_word_a_bit_later_inside_keyword_filter_keyword_with_same_timestamp():
later_timestamp = original_timestamp + 960 # from real data
keywords_with_timestamp = [{
"keyword" : 'carbone',
"timestamp": later_timestamp,
"theme":"changement_climatique_causes_directes",
},
{
"keyword" : 'béton bas carbone',
"timestamp": original_timestamp, # same timestamp, so we take longest keyword
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
}
]

expected = [{
"keyword" : 'béton bas carbone',
"timestamp": original_timestamp, # same timestamp, so we take longest keyword
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
}
]

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

# keywords whose timestamps are far enough apart should both be kept
def test_keyword_second_word_to_keep_inside_keyword_filter_keyword_with_same_timestamp():
keywords_with_timestamp = [{
"theme": "changement_climatique_consequences",
"timestamp": 1707627703040,
"keyword": "pénurie"
},
{
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
"timestamp": 1707627708051,
"keyword": "barrages"
},
]

expected = [
{
"keyword": "pénurie",
"timestamp": 1707627703040,
"theme": "changement_climatique_consequences",
},
{
"keyword" : 'barrages',
"timestamp": 1707627708051, # same timestamp, so we take longest keyword
"theme":"atténuation_climatique_solutions_directes", # different theme, keep this one
}
]

assert filter_keyword_with_same_timestamp(keywords_with_timestamp) == expected

def test_filter_keyword_with_same_timestamp():
keywords_with_timestamp = [{ #nothing to filter
"keyword" : "période la plus chaude",
@@ -644,7 +703,7 @@ def test_filter_keyword_with_same_timestamp():
},
{
"keyword" : "élévation du niveau de la mer",
"timestamp": original_timestamp + 1,
"timestamp": original_timestamp + 1200, # margin superior to 1000ms
"theme":"changement_climatique_consequences",
}
]
16 changes: 6 additions & 10 deletions test/sitemap/test_main_import_api.py
@@ -51,11 +51,12 @@ def test_second_row_api_import():
primary_key = "67b9cc593516b40f55d6a3e89b377fccc8ab76d263c5fd6d4bfe379626190641"
specific_keyword = get_keyword(primary_key)
assert specific_keyword.theme == [
"changement_climatique_constat",
"changement_climatique_causes_indirectes",
"changement_climatique_consequences",
"atténuation_climatique_solutions_directes"
"changement_climatique_constat",
"changement_climatique_causes_indirectes",
"changement_climatique_consequences",
"atténuation_climatique_solutions_directes"
]

assert specific_keyword.keywords_with_timestamp == [ # from metabase to speedup check
{
"keyword": "écologique",
@@ -67,11 +68,6 @@ def test_second_row_api_import():
"timestamp": 1707627631076,
"theme": "changement_climatique_constat"
},
{
"keyword": "pétrole",
"timestamp": 1707627629004,
"theme": "changement_climatique_causes_indirectes"
},
{
"keyword": "puits de pétrole",
"timestamp": 1707627628054,
@@ -92,7 +88,7 @@
"timestamp": 1707627686004,
"theme": "atténuation_climatique_solutions_directes"
}
]
]
assert specific_keyword.number_of_keywords == 4

def test_third_row_api_import():
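The dropped expectation above follows from the new margin logic: "pétrole" (timestamp 1707627629004) sits within 1000 ms of the containing keyword "puits de pétrole" (1707627628054), so only the longer keyword is kept now.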
2 changes: 1 addition & 1 deletion test/sitemap/test_mediatree_utils.py
@@ -2,7 +2,7 @@
import pandas as pd

from utils import get_localhost
from quotaclimat.data_processing.mediatree.utils import get_yesterday, get_date_range, get_start_end_date_env_variable_with_default, is_it_tuesday
from quotaclimat.data_processing.mediatree.utils import *

import logging
from time import strftime,localtime

1 comment on commit 48e9742

@github-actions

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---:|---:|---:|---|
| **postgres** | | | | |
| insert_data.py | 44 | 7 | 84% | 37–39, 58–60, 65 |
| insert_existing_data_example.py | 20 | 3 | 85% | 25–27 |
| **postgres/schemas** | | | | |
| models.py | 72 | 15 | 79% | 74–81, 91–92, 101–111 |
| **quotaclimat/data_analytics** | | | | |
| analytics_signataire_charte.py | 29 | 29 | 0% | 1–67 |
| bilan.py | 108 | 108 | 0% | 2–372 |
| data_coverage.py | 34 | 34 | 0% | 1–94 |
| exploration.py | 125 | 125 | 0% | 1–440 |
| sitemap_analytics.py | 118 | 118 | 0% | 1–343 |
| **quotaclimat/data_ingestion** | | | | |
| categorization_program_type.py | 1 | 1 | 0% | 1 |
| config_youtube.py | 1 | 1 | 0% | 1 |
| scaleway_db_backups.py | 34 | 34 | 0% | 1–74 |
| scrap_chartejournalismeecologie_signataires.py | 50 | 50 | 0% | 1–169 |
| scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
| scrap_tv_program.py | 62 | 62 | 0% | 1–149 |
| scrap_youtube.py | 114 | 114 | 0% | 1–238 |
| **quotaclimat/data_ingestion/ingest_db** | | | | |
| ingest_sitemap_in_db.py | 59 | 41 | 31% | 21–42, 45–65, 69–80 |
| **quotaclimat/data_ingestion/scrap_html** | | | | |
| scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| **quotaclimat/data_processing/mediatree** | | | | |
| api_import.py | 177 | 103 | 42% | 38–42, 47–53, 57–60, 66, 69–96, 102–117, 122–124, 149–161, 165–168, 172–178, 189–200, 203–207, 213, 237–238, 242, 246–265, 268–270 |
| config.py | 15 | 2 | 87% | 7, 16 |
| detect_keywords.py | 143 | 6 | 96% | 75–77, 165–167 |
| utils.py | 66 | 22 | 67% | 19, 30–54, 57, 76–77 |
| **quotaclimat/data_processing/sitemap** | | | | |
| sitemap_processing.py | 41 | 27 | 34% | 15–19, 23–25, 29–47, 51–58, 66–96, 101–103 |
| **quotaclimat/utils** | | | | |
| channels.py | 6 | 6 | 0% | 1–95 |
| climate_keywords.py | 2 | 2 | 0% | 3–35 |
| healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
| logger.py | 24 | 11 | 54% | 22–24, 28–37 |
| plotly_theme.py | 17 | 17 | 0% | 1–56 |
| sentry.py | 10 | 2 | 80% | 21–22 |
| **TOTAL** | 1623 | 974 | 40% | |

| Tests | Skipped | Failures | Errors | Time |
|---:|---:|---:|---:|---:|
| 57 | 0 💤 | 0 ❌ | 0 🔥 | 50.627s ⏱️ |
