Skip to content

Commit

Permalink
Fix/program name margin with close programs (#250)
Browse files Browse the repository at this point in the history
* fix: when 2 programs are super close but with a margin, pick the first one

* fix: edge case
  • Loading branch information
polomarcus authored Sep 26, 2024
1 parent cce6801 commit 7e736fd
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 10 deletions.
30 changes: 20 additions & 10 deletions quotaclimat/data_processing/mediatree/channel_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get_programs():
with open(json_file_path, 'r') as file:
json_data = json.load(file)
df_programs = pd.DataFrame(json_data)
logging.info(df_programs.dtypes)

df_programs[['start', 'end']] = df_programs.apply(lambda x: pd.Series({
'start': format_hour_minute(x['start']),
'end': format_hour_minute(x['end'])
Expand Down Expand Up @@ -79,16 +79,31 @@ def get_day_of_week(time: pd.Timestamp) -> int:
return start_weekday

def get_matching_program_hour(df_program: pd.DataFrame, start_time: pd.Timestamp):
    """Filter programs down to the one(s) airing around ``start_time``.

    A row matches when its ``start``/``end`` window, widened on both sides by
    ``EPOCH__5MIN_MARGIN + EPOCH__1MIN_MARGIN`` seconds, contains the value
    returned by ``get_hour_minute(start_time)``.

    Args:
        df_program: programs to filter; must expose ``start`` and ``end``
            columns comparable with ``get_hour_minute(start_time)``.
        start_time: timestamp to look up.

    Returns:
        A DataFrame:
        * the rows matching with the margin when there is at most one;
        * when several rows match with the margin, the rows matching
          *without* margin, or the first margin match if none does;
        * an empty DataFrame when nothing matches (a warning is logged if
          the input was not empty).
    """
    number_of_rows_to_filter = len(df_program)
    margin = pd.Timedelta(seconds=EPOCH__5MIN_MARGIN + EPOCH__1MIN_MARGIN)

    logging.debug(f"df_program {df_program['start']}")
    logging.debug(f"{start_time + margin}")
    logging.debug(f"df_program {df_program['end']}")
    logging.debug(f"number_of_rows_to_filter {number_of_rows_to_filter}")

    start_time = get_hour_minute(start_time)
    matching_rows = df_program[
        (df_program['start'] <= (start_time + margin)) &
        (df_program['end'] > (start_time - margin))
    ]

    number_of_result = len(matching_rows)
    logging.info(f"matching_rows {matching_rows}")

    if number_of_result > 1:
        # Several programs fall inside the margin: they are next to each
        # other, so retry without any margin to pick the one really on air.
        closest_result = df_program[
            (df_program['start'] <= start_time) &
            (df_program['end'] > start_time)  # strictly > to avoid overlapping programs
        ]
        if len(closest_result) == 0:
            # Edge case: start_time sits in a gap between close programs.
            return matching_rows.head(1)
        return closest_result

    # `and`, not `&`: `0 & n > 0` binds as `(0 & n) > 0`, so the previous
    # condition was always False and this warning could never be emitted.
    if number_of_result == 0 and number_of_rows_to_filter > 0:
        logging.warning("No results from hour filter")

    # The empty DataFrame is what callers received before this fix (the old
    # `return None` was unreachable); keep it so they are not broken.
    return matching_rows

Expand All @@ -99,12 +114,12 @@ def get_matching_program_weekday(df_program: pd.DataFrame, start_time: pd.Timest
if "weekday_mask" in df_program.columns:
df_program.drop(columns=["weekday_mask"], inplace=True)
df_program["weekday_mask"] = df_program['weekday'].apply(lambda x: compare_weekday(x, start_weekday), axis=1)
logging.debug("weekday_mask done")

matching_rows = df_program[
(df_program['channel_name'] == channel_name) &
(df_program["weekday_mask"] == True)
]
logging.debug("matching_rows done")

matching_rows.drop(columns=['weekday_mask'], inplace=True)
matching_rows.drop(columns=['weekday'], inplace=True)

Expand All @@ -113,11 +128,6 @@ def get_matching_program_weekday(df_program: pd.DataFrame, start_time: pd.Timest

return matching_rows

def get_closest_program_between_2_with_margin(channel_name: str,start_time: pd.Timestamp, matching_rows):
    """Return the name and type of the first program in ``matching_rows``.

    Args:
        channel_name: channel being resolved; only used in the log message.
        start_time: timestamp being resolved; only used in the log message.
        matching_rows: DataFrame with at least one row and the columns
            ``program_name`` and ``program_type``.

    Returns:
        Tuple ``(program_name, program_type)`` taken from the first row.

    Raises:
        IndexError: if ``matching_rows`` is empty.
    """
    logging.info(f"Several programs name for the same channel and time {channel_name} and {start_time} - {matching_rows} - returning the first match")
    # The former bare `matching_rows["start"]` expression had no effect other
    # than failing when that column was absent; it has been removed.
    first_match = matching_rows.iloc[0]
    return first_match['program_name'], first_match['program_type']

def get_a_program_with_start_timestamp(df_program: pd.DataFrame, start_time: pd.Timestamp, channel_name: str):
matching_rows = get_matching_program_weekday(df_program, start_time, channel_name)
matching_rows = get_matching_program_hour(matching_rows, start_time)
Expand Down
9 changes: 9 additions & 0 deletions test/sitemap/test_program_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,15 @@ def test_get_13h_monday_rfi_with_margin_program_with_start_timestamp():
assert program_type == "Information - Journal"


def test_get_6h26_friday_fr2_with_margin_program_with_start_timestamp():
    """A 06:26 (Europe/Paris) start on france2 must resolve to "Le 6h Info"."""
    programs = get_programs()

    # NOTE(review): 1726719981 is 2024-09-19 04:26:21 UTC, which appears to be
    # a Thursday rather than a Friday - confirm against the test name.
    epoch_6h26 = 1726719981
    paris_start = pd.to_datetime(epoch_6h26, unit='s', utc=True).tz_convert('Europe/Paris')

    program_name, program_type = get_a_program_with_start_timestamp(
        programs,
        paris_start,
        "france2",
    )

    assert program_name == "Le 6h Info"
    assert program_type == "Information - Journal"

def test_compare_weekday_string():
assert compare_weekday('*', 0) == True
assert compare_weekday('*', 3) == True
Expand Down

1 comment on commit 7e736fd

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---|---|---|---|
| **postgres** | | | | |
|    insert_data.py | 43 | 7 | 84% | 36–38, 56–58, 63 |
|    insert_existing_data_example.py | 19 | 3 | 84% | 25–27 |
| **postgres/schemas** | | | | |
|    models.py | 150 | 10 | 93% | 124–131, 143–144, 202–203, 217–218 |
| **quotaclimat/data_ingestion** | | | | |
|    scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
| **quotaclimat/data_ingestion/ingest_db** | | | | |
|    ingest_sitemap_in_db.py | 55 | 37 | 33% | 21–42, 45–58, 62–73 |
| **quotaclimat/data_ingestion/scrap_html** | | | | |
|    scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| **quotaclimat/data_processing/mediatree** | | | | |
|    api_import.py | 215 | 136 | 37% | 44–48, 53–74, 78–81, 87, 90–132, 138–153, 158, 171–183, 187–193, 206–218, 221–225, 231, 267–268, 271–307, 310–312 |
|    channel_program.py | 168 | 57 | 66% | 28–30, 41–43, 60–61, 64–66, 105–106, 115, 127, 174–215 |
|    config.py | 15 | 2 | 87% | 7, 16 |
|    detect_keywords.py | 209 | 8 | 96% | 220, 278–285 |
|    update_pg_keywords.py | 67 | 49 | 27% | 15–108, 132, 135, 142–157, 180–206, 213 |
|    utils.py | 71 | 22 | 69% | 29–53, 56, 65, 86–87 |
| **quotaclimat/utils** | | | | |
|    healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
|    logger.py | 24 | 11 | 54% | 22–24, 28–37 |
|    sentry.py | 11 | 2 | 82% | 22–23 |
| **TOTAL** | 1272 | 378 | 70% | |

Tests Skipped Failures Errors Time
90 0 💤 0 ❌ 0 🔥 1m 37s ⏱️

Please sign in to comment.