Skip to content

Commit

Permalink
refacto: faster update by enabling UPDATE_PROGRAM_ONLY when needed only (#189)
Browse files Browse the repository at this point in the history

* refacto: faster update by enabling UPDATE_PROGRAM_ONLY when needed only

* chores: dep
  • Loading branch information
polomarcus authored Jun 21, 2024
1 parent 91f20c5 commit b74c5e2
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 50 deletions.
69 changes: 39 additions & 30 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ tenacity = "^8.2.3"
sentry-sdk = "^1.44.1"
coverage = "^7.4.2"
modin = {extras = ["ray"], version = "^0.30.1"}
filelock = "<=3.14"
[build-system]
requires = ["poetry-core>=1.1"]
build-backend = "poetry.core.masonry.api"
Expand Down
6 changes: 3 additions & 3 deletions quotaclimat/data_processing/mediatree/api_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ async def update_pg_data(exit_event):
number_of_batch = int(os.environ.get("NUMBER_OF_BATCH", 6))
program_only = os.environ.get("UPDATE_PROGRAM_ONLY", "false") == "true"
if(program_only):
logging.warning("Update : Program only mode activated")

#TODO get program here
logging.warning("Update : Program only mode activated - UPDATE_PROGRAM_ONLY")
else:
logging.warning("Update : programs will not be updated for performance issue - use UPDATE_PROGRAM_ONLY to true for this")

logging.warning(f"Updating already saved data from Postgresql from offset {start_offset} - env variable START_OFFSET until {start_offset + number_of_batch * batch_size}")
try:
Expand Down
15 changes: 4 additions & 11 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,10 @@ def update_keywords(session: Session, batch_size: int = 50000, start_offset : in
current_batch_saved_keywords = get_keywords_columns(session, i, batch_size)
logging.info(f"Updating {len(current_batch_saved_keywords)} elements from {i} offsets - batch size {batch_size} - until offset {until_offset}")
for keyword_id, plaintext, keywords_with_timestamp, number_of_keywords, start, srt, theme, channel_name, channel_title in current_batch_saved_keywords:
program_name, program_name_type = get_a_program_with_start_timestamp(df_programs, pd.Timestamp(start).tz_convert('Europe/Paris'), channel_name)
if channel_title is None:
logging.debug("channel_title none, set it using channel_name")
channel_title = get_channel_title_for_name(channel_name)
logging.debug("channel_title none, set it using channel_name")
channel_title = get_channel_title_for_name(channel_name)

if(not program_only):
try:
matching_themes, \
Expand Down Expand Up @@ -80,14 +79,13 @@ def update_keywords(session: Session, batch_size: int = 50000, start_offset : in
,number_of_biodiversite_causes_directes
,number_of_biodiversite_consequences
,number_of_biodiversite_solutions_directes
,channel_program=program_name
,channel_program_type=program_name_type
,channel_title=channel_title
,number_of_keywords_20=new_number_of_keywords_20
,number_of_keywords_30=new_number_of_keywords_30
,number_of_keywords_40=new_number_of_keywords_40
)
else:
program_name, program_name_type = get_a_program_with_start_timestamp(df_programs, pd.Timestamp(start).tz_convert('Europe/Paris'), channel_name)
update_keyword_row_program(session
,keyword_id
,channel_program=program_name
Expand Down Expand Up @@ -138,8 +136,6 @@ def update_keyword_row(session: Session,
number_of_biodiversite_causes_directes: int,
number_of_biodiversite_consequences: int,
number_of_biodiversite_solutions_directes: int,
channel_program: str,
channel_program_type: str,
channel_title: str
,number_of_keywords_20: int
,number_of_keywords_30: int
Expand All @@ -162,8 +158,6 @@ def update_keyword_row(session: Session,
Keywords.number_of_biodiversite_causes_directes:number_of_biodiversite_causes_directes ,
Keywords.number_of_biodiversite_consequences:number_of_biodiversite_consequences ,
Keywords.number_of_biodiversite_solutions_directes:number_of_biodiversite_solutions_directes,
Keywords.channel_program: channel_program,
Keywords.channel_program_type: channel_program_type,
Keywords.channel_title: channel_title
,Keywords.number_of_keywords_20: number_of_keywords_20
,Keywords.number_of_keywords_30: number_of_keywords_30
Expand All @@ -174,7 +168,6 @@ def update_keyword_row(session: Session,
else:
logging.warning(f"Matching themes is empty - deleting row {keyword_id}")
session.query(Keywords).filter(Keywords.id == keyword_id).delete()
session.commit()

def update_keyword_row_program(session: Session,
keyword_id: int,
Expand Down
9 changes: 4 additions & 5 deletions test/sitemap/test_update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,12 @@ def test_delete_keywords():
,0
,0
,0
,"télématin"
,"Information - Magazine"
,"M6"
,0
,0
,0
)
session.commit()
assert get_keyword(primary_key) == None

def test_first_update_keywords():
Expand Down Expand Up @@ -253,9 +252,9 @@ def test_first_update_keywords():
assert number_of_biodiversite_consequences == 0
assert number_of_biodiversite_solutions_directes == 0

# program
assert result_after_update.channel_program == "1245 le mag"
assert result_after_update.channel_program_type == "Information - Magazine"
# program - only when UPDATE_PROGRAM_ONLY for speed issues
# assert result_after_update.channel_program == "1245 le mag"
# assert result_after_update.channel_program_type == "Information - Magazine"

#channel_title
assert result_after_update.channel_title == "M6"
Expand Down

1 comment on commit b74c5e2

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---|---|---|---|
| **postgres** | | | | |
| insert_data.py | 44 | 7 | 84% | 36–38, 57–59, 64 |
| insert_existing_data_example.py | 19 | 3 | 84% | 25–27 |
| **postgres/schemas** | | | | |
| models.py | 146 | 10 | 93% | 120–127, 139–140, 198–199, 213–214 |
| **quotaclimat/data_ingestion** | | | | |
| scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
| **quotaclimat/data_ingestion/ingest_db** | | | | |
| ingest_sitemap_in_db.py | 55 | 37 | 33% | 21–42, 45–58, 62–73 |
| **quotaclimat/data_ingestion/scrap_html** | | | | |
| scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| **quotaclimat/data_processing/mediatree** | | | | |
| api_import.py | 204 | 127 | 38% | 43–47, 52–67, 71–74, 80, 83–122, 128–143, 147–148, 161–173, 177–183, 196–207, 210–214, 220, 255–256, 260, 264–298, 301–303 |
| channel_program.py | 136 | 51 | 62% | 30–32, 43–45, 59, 95, 104, 142–183 |
| config.py | 15 | 2 | 87% | 7, 16 |
| detect_keywords.py | 213 | 8 | 96% | 169–172, 216, 271–273 |
| update_pg_keywords.py | 52 | 37 | 29% | 14–97, 120–121, 144–170, 176 |
| utils.py | 64 | 22 | 66% | 26–50, 53, 62, 78–79 |
| **quotaclimat/utils** | | | | |
| healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
| logger.py | 24 | 11 | 54% | 22–24, 28–37 |
| sentry.py | 10 | 2 | 80% | 21–22 |
| **TOTAL** | 1207 | 351 | 71% | |

Tests Skipped Failures Errors Time
81 0 💤 0 ❌ 0 🔥 57.358s ⏱️

Please sign in to comment.