Fix/program name margin with close programs (#250)

* fix: when 2 programs are super close but with a margin, pick the first one
* fix: edge case
dataforgoodfr · Sep 26, 2024 · 7e736fd · 7e736fd · github-actions · Sep 26, 2024
1 parent cce6801
commit 7e736fd
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 10 deletions.
diff --git a/quotaclimat/data_processing/mediatree/channel_program.py b/quotaclimat/data_processing/mediatree/channel_program.py
@@ -19,7 +19,7 @@ def get_programs():
         with open(json_file_path, 'r') as file:
             json_data = json.load(file)
             df_programs = pd.DataFrame(json_data)
-            logging.info(df_programs.dtypes)
+
             df_programs[['start', 'end']] = df_programs.apply(lambda x: pd.Series({
                 'start': format_hour_minute(x['start']),
                 'end': format_hour_minute(x['end'])
@@ -79,16 +79,31 @@ def get_day_of_week(time: pd.Timestamp) -> int:
     return start_weekday
 
 def get_matching_program_hour(df_program: pd.DataFrame, start_time: pd.Timestamp):
+    """Return the rows of ``df_program`` whose time slot matches ``start_time``.
+
+    Rows are first selected with a tolerance of
+    ``EPOCH__5MIN_MARGIN + EPOCH__1MIN_MARGIN`` seconds on each side of
+    ``start_time``. When that tolerant window selects several programs, the
+    strict match (``start <= start_time < end``) is preferred; if no row
+    matches strictly, only the first tolerant match is returned.
+
+    Args:
+        df_program: program rows with comparable ``start`` and ``end`` columns.
+        start_time: timestamp to locate; it is normalised with
+            ``get_hour_minute`` before being compared.
+
+    Returns:
+        A DataFrame of matching rows, possibly empty. A warning is logged when
+        a non-empty ``df_program`` yields no match.
+    """
+    number_of_rows_to_filter = len(df_program)
+    logging.debug(f"df_program {df_program['start']}")
+    logging.debug(f"{start_time + pd.Timedelta(seconds=EPOCH__5MIN_MARGIN + EPOCH__1MIN_MARGIN)}")
+    logging.debug(f"df_program {df_program['end']}")
+    logging.debug(f"number_of_rows_to_filter {number_of_rows_to_filter}")
     start_time = get_hour_minute(start_time)
     matching_rows = df_program[
                          (df_program['start'] <= (start_time + pd.Timedelta(seconds=EPOCH__5MIN_MARGIN + EPOCH__1MIN_MARGIN))) &
                          (df_program['end'] > (start_time - pd.Timedelta(seconds=EPOCH__5MIN_MARGIN + EPOCH__1MIN_MARGIN)))
                     ]
-    if(len(matching_rows) > 1): # no margin necessary because programs are next to each others
-        return df_program[
+
+    number_of_result = len(matching_rows)
+    logging.info(f"matching_rows {matching_rows}")
+    if(number_of_result > 1): # no margin necessary because programs are next to each others
+        closest_result = df_program[
                             (df_program['start'] <= (start_time)) &
                             (df_program['end'] > (start_time)) # stricly > to avoid overlapping programs
         ]
+        if(len(closest_result) == 0):
+            # Nothing contains start_time strictly: fall back to the first tolerant match.
+            return matching_rows.head(1)
+        else:
+            return closest_result
+    elif number_of_result == 0 and number_of_rows_to_filter > 0:
+        # Boolean `and` is required here: with the bitwise `&` the test parsed as
+        # `number_of_result == (0 & number_of_rows_to_filter) > 0`, which is always
+        # False, so this warning could never be emitted.
+        logging.warning("No results from hour filter")
+        # Return the empty DataFrame rather than None: the branch was unreachable
+        # before, so callers have only ever received a DataFrame from this function.
+        return matching_rows
     else:
         return matching_rows
 
@@ -99,12 +114,12 @@ def get_matching_program_weekday(df_program: pd.DataFrame, start_time: pd.Timest
     if "weekday_mask" in df_program.columns:
         df_program.drop(columns=["weekday_mask"], inplace=True)
     df_program["weekday_mask"] = df_program['weekday'].apply(lambda x: compare_weekday(x, start_weekday), axis=1)
-    logging.debug("weekday_mask done")
+
     matching_rows = df_program[
                         (df_program['channel_name'] == channel_name) &
                         (df_program["weekday_mask"] == True)
                     ]
-    logging.debug("matching_rows done")
+
     matching_rows.drop(columns=['weekday_mask'], inplace=True)
     matching_rows.drop(columns=['weekday'], inplace=True)
 
@@ -113,11 +128,6 @@ def get_matching_program_weekday(df_program: pd.DataFrame, start_time: pd.Timest
 
     return matching_rows
 
-def get_closest_program_between_2_with_margin(channel_name: str,start_time: pd.Timestamp, matching_rows):
-    logging.info(f"Several programs name for the same channel and time {channel_name} and {start_time} - {matching_rows} - returning the first match")
-    matching_rows["start"]
-    return matching_rows.iloc[0]['program_name'], matching_rows.iloc[0]['program_type']
-
 def get_a_program_with_start_timestamp(df_program: pd.DataFrame, start_time: pd.Timestamp, channel_name: str):
     matching_rows = get_matching_program_weekday(df_program, start_time, channel_name)
     matching_rows = get_matching_program_hour(matching_rows, start_time)

diff --git a/test/sitemap/test_program_metadata.py b/test/sitemap/test_program_metadata.py
@@ -228,6 +228,15 @@ def test_get_13h_monday_rfi_with_margin_program_with_start_timestamp():
     assert program_type == "Information - Journal"
 
 
+def test_get_6h26_friday_fr2_with_margin_program_with_start_timestamp():
+    """france2 at 06:26 (Europe/Paris) must resolve to "Le 6h Info".
+
+    Presumably this start time is within the margin of a neighbouring program,
+    which is the case the hour filter has to disambiguate - TODO confirm
+    against the programs JSON.
+    """
+    df_programs = get_programs()
+    # 1726719981 == 2024-09-19 04:26:21 UTC == 06:26:21 Europe/Paris.
+    # NOTE(review): 2024-09-19 is a Thursday although the test name says
+    # "friday"; confirm which weekday the scenario is meant to cover.
+    start_epoch_6h26 = 1726719981
+    start_time = pd.to_datetime(start_epoch_6h26, unit='s', utc=True).tz_convert('Europe/Paris')
+    program_name, program_type = get_a_program_with_start_timestamp(df_programs, start_time, "france2")
+    assert program_name == "Le 6h Info"
+    assert program_type == "Information - Journal"
+
 def test_compare_weekday_string():
     assert compare_weekday('*', 0) == True
     assert compare_weekday('*', 3) == True
File	Stmts	Miss	Cover	Missing
postgres
insert_data.py	43	7	84%	36–38, 56–58, 63
insert_existing_data_example.py	19	3	84%	25–27
postgres/schemas
models.py	150	10	93%	124–131, 143–144, 202–203, 217–218
quotaclimat/data_ingestion
scrap_sitemap.py	134	17	87%	27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
ingest_sitemap_in_db.py	55	37	33%	21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
scrap_description_article.py	36	3	92%	19–20, 32
quotaclimat/data_processing/mediatree
api_import.py	215	136	37%	44–48, 53–74, 78–81, 87, 90–132, 138–153, 158, 171–183, 187–193, 206–218, 221–225, 231, 267–268, 271–307, 310–312
channel_program.py	168	57	66%	28–30, 41–43, 60–61, 64–66, 105–106, 115, 127, 174–215
config.py	15	2	87%	7, 16
detect_keywords.py	209	8	96%	220, 278–285
update_pg_keywords.py	67	49	27%	15–108, 132, 135, 142–157, 180–206, 213
utils.py	71	22	69%	29–53, 56, 65, 86–87
quotaclimat/utils
healthcheck_config.py	29	14	52%	22–24, 27–38
logger.py	24	11	54%	22–24, 28–37
sentry.py	11	2	82%	22–23
TOTAL	1272	378	70%