Skip to content

Commit

Permalink
fix: delete row where there is no more theme (deleted keywords case) (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus authored Mar 9, 2024
1 parent 81d4ad1 commit 3eb0775
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 24 deletions.
5 changes: 3 additions & 2 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
)

# remove all rows that do not have any theme
df = df.dropna(subset=['theme'])
df = df.dropna(subset=['theme'], how='any') # any is for None values

logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")

Expand Down Expand Up @@ -277,4 +277,5 @@ def count_keywords_duration_overlap_without_indirect(keywords_with_timestamp: Li
return final_count
else:
return 0



47 changes: 26 additions & 21 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sqlalchemy.orm import Session
from postgres.schemas.models import Keywords
from quotaclimat.data_processing.mediatree.detect_keywords import *
from sqlalchemy import func, select
from sqlalchemy import func, select, delete

def update_keywords(session: Session, batch_size: int = 50000, start_offset : int = 0) -> list:
total_updates = get_total_count_saved_keywords(session)
Expand Down Expand Up @@ -110,23 +110,28 @@ def update_keyword_row(session: Session,
number_of_biodiversite_consequences: int,
number_of_biodiversite_solutions_directes: int,
):
session.query(Keywords).filter(Keywords.id == keyword_id).update(
{
Keywords.number_of_keywords: new_number_of_keywords,
Keywords.keywords_with_timestamp: new_keywords_with_timestamp,
Keywords.theme: matching_themes,
Keywords.number_of_changement_climatique_constat:number_of_changement_climatique_constat ,
Keywords.number_of_changement_climatique_causes_directes:number_of_changement_climatique_causes_directes ,
Keywords.number_of_changement_climatique_consequences:number_of_changement_climatique_consequences ,
Keywords.number_of_attenuation_climatique_solutions_directes:number_of_attenuation_climatique_solutions_directes ,
Keywords.number_of_adaptation_climatique_solutions_directes:number_of_adaptation_climatique_solutions_directes ,
Keywords.number_of_ressources_naturelles_concepts_generaux:number_of_ressources_naturelles_concepts_generaux ,
Keywords.number_of_ressources_naturelles_causes:number_of_ressources_naturelles_causes ,
Keywords.number_of_ressources_naturelles_solutions:number_of_ressources_naturelles_solutions ,
Keywords.number_of_biodiversite_concepts_generaux:number_of_biodiversite_concepts_generaux ,
Keywords.number_of_biodiversite_causes_directes:number_of_biodiversite_causes_directes ,
Keywords.number_of_biodiversite_consequences:number_of_biodiversite_consequences ,
Keywords.number_of_biodiversite_solutions_directes:number_of_biodiversite_solutions_directes
},
synchronize_session=False
)
if matching_themes is not None:
session.query(Keywords).filter(Keywords.id == keyword_id).update(
{
Keywords.number_of_keywords: new_number_of_keywords,
Keywords.keywords_with_timestamp: new_keywords_with_timestamp,
Keywords.theme: matching_themes,
Keywords.number_of_changement_climatique_constat:number_of_changement_climatique_constat ,
Keywords.number_of_changement_climatique_causes_directes:number_of_changement_climatique_causes_directes ,
Keywords.number_of_changement_climatique_consequences:number_of_changement_climatique_consequences ,
Keywords.number_of_attenuation_climatique_solutions_directes:number_of_attenuation_climatique_solutions_directes ,
Keywords.number_of_adaptation_climatique_solutions_directes:number_of_adaptation_climatique_solutions_directes ,
Keywords.number_of_ressources_naturelles_concepts_generaux:number_of_ressources_naturelles_concepts_generaux ,
Keywords.number_of_ressources_naturelles_causes:number_of_ressources_naturelles_causes ,
Keywords.number_of_ressources_naturelles_solutions:number_of_ressources_naturelles_solutions ,
Keywords.number_of_biodiversite_concepts_generaux:number_of_biodiversite_concepts_generaux ,
Keywords.number_of_biodiversite_causes_directes:number_of_biodiversite_causes_directes ,
Keywords.number_of_biodiversite_consequences:number_of_biodiversite_consequences ,
Keywords.number_of_biodiversite_solutions_directes:number_of_biodiversite_solutions_directes
},
synchronize_session=False
)
else:
logging.warning(f"Matching themes is empty - deleting row {keyword_id}")
session.query(Keywords).filter(Keywords.id == keyword_id).delete()
session.commit()
13 changes: 13 additions & 0 deletions test/sitemap/test_detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,19 @@ def test_complex_hyphen_get_cts_in_ms_for_keywords():
assert get_cts_in_ms_for_keywords(str, keywords, theme) == expected


def test_none_theme_filter_and_tag_by_theme():
    """Expect an empty result when the only input row ends up without a theme."""
    # Single subtitle row with an empty srt; the assertion below expects
    # filter_and_tag_by_theme to drop it.
    subtitle_row = {
        "start": start,
        "plaintext": "cheese pizza",
        "channel_name": "m6",
        "channel_radio": False,
        "srt": [],
    }
    input_df = pd.DataFrame([subtitle_row])

    filtered_df = filter_and_tag_by_theme(input_df)
    debug_df(filtered_df)
    assert len(filtered_df) == 0

def test_filter_and_tag_by_theme():
srt = [{
Expand Down
52 changes: 51 additions & 1 deletion test/sitemap/test_update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,59 @@

original_timestamp = 1706437079004
start = datetime.utcfromtimestamp(original_timestamp / 1000)
create_tables()

def test_delete_keywords():
    """Check that update_keyword_row removes a row when no theme matches.

    A row is first saved and confirmed to exist, then update_keyword_row is
    called with ``None`` for both the keywords-with-timestamp and the matching
    themes; the row must no longer be retrievable afterwards.
    """
    conn = connect_to_db()
    primary_key = "delete_me"
    wrong_value = 0
    # Per-theme counter columns, listed once so the inserted row and the
    # positional arguments passed to update_keyword_row stay in sync.
    theme_counters = [
        "number_of_changement_climatique_constat",
        "number_of_changement_climatique_causes_directes",
        "number_of_changement_climatique_consequences",
        "number_of_attenuation_climatique_solutions_directes",
        "number_of_adaptation_climatique_solutions_directes",
        "number_of_ressources_naturelles_concepts_generaux",
        "number_of_ressources_naturelles_causes",
        "number_of_ressources_naturelles_solutions",
        "number_of_biodiversite_concepts_generaux",
        "number_of_biodiversite_causes_directes",
        "number_of_biodiversite_consequences",
        "number_of_biodiversite_solutions_directes",
    ]
    df = pd.DataFrame([{
        "id": primary_key,
        "start": start,
        "plaintext": "test",
        "channel_name": "test",
        "channel_radio": False,
        "theme": [],
        "keywords_with_timestamp": [],
        "srt": [],
        # wrong data to reapply our custom logic for "new_value"
        "number_of_keywords": wrong_value,
        **dict.fromkeys(theme_counters, wrong_value),
    }])
    df['start'] = pd.to_datetime(df['start'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('Europe/Paris')
    assert save_to_pg(df._to_pandas(), keywords_table, conn) == 1
    session = get_db_session(conn)

    # The row must exist before the update is attempted.
    assert get_keyword(primary_key) is not None

    update_keyword_row(
        session,
        primary_key,
        0,     # new number of keywords
        None,  # new keywords with timestamp
        None,  # matching themes: None triggers the delete branch
        *([0] * len(theme_counters)),  # one zero per theme counter
    )

    # The row must be gone once the update has run.
    assert get_keyword(primary_key) is None

def test_first_update_keywords():
create_tables()
conn = connect_to_db()

wrong_value = 0
Expand Down

1 comment on commit 3eb0775

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
postgres
   insert_data.py44784%36–38, 57–59, 64
   insert_existing_data_example.py20385%25–27
postgres/schemas
   models.py841582%88–95, 105–106, 115–125
quotaclimat/data_analytics
   analytics_signataire_charte.py29290%1–67
   bilan.py1081080%2–372
   data_coverage.py34340%1–94
   exploration.py1251250%1–440
   sitemap_analytics.py1181180%1–343
quotaclimat/data_ingestion
   categorization_program_type.py110%1
   config_youtube.py110%1
   scaleway_db_backups.py34340%1–74
   scrap_chartejournalismeecologie_signataires.py50500%1–169
   scrap_sitemap.py1341787%27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
   scrap_tv_program.py62620%1–149
   scrap_youtube.py1141140%1–238
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py594131%21–42, 45–65, 69–80
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py36392%19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py18310543%41–45, 50–58, 62–65, 71, 74–101, 107–122, 127–129, 154–166, 170–173, 177–183, 194–205, 208–212, 218, 242–243, 247, 251–270, 273–275
   config.py15287%7, 16
   detect_keywords.py169398%230–232
   utils.py662267%19, 30–54, 57, 76–77
quotaclimat/data_processing/sitemap
   sitemap_processing.py412734%15–19, 23–25, 29–47, 51–58, 66–96, 101–103
quotaclimat/utils
   channels.py660%1–95
   climate_keywords.py220%3–35
   healthcheck_config.py291452%22–24, 27–38
   logger.py241154%22–24, 28–37
   plotly_theme.py17170%1–56
   sentry.py10280%21–22
TOTAL167497342% 

Tests Skipped Failures Errors Time
69 0 💤 0 ❌ 0 🔥 1m 8s ⏱️

Please sign in to comment.