Skip to content

Commit

Permalink
refacto: modin has problems with multiple types in the same column (#187)
Browse files Browse the repository at this point in the history
* refacto: modin has problems with multiple types in the same column

* chores: docker compose

* fix: filelock bug

* fix: test
  • Loading branch information
polomarcus authored Jun 19, 2024
1 parent 6cdc51d commit a7072c3
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 120 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ services:
#entrypoint: ["python", "quotaclimat/data_processing/mediatree/api_import.py"]
environment:
ENV: docker # change me to prod for real cases
LOGLEVEL: INFO # Change me to info (debug, info, warning, error) to have less log
LOGLEVEL: DEBUG # Set to info for less logging (options: debug, info, warning, error)
PYTHONPATH: /app
POSTGRES_USER: user
POSTGRES_DB: barometre
Expand Down
159 changes: 80 additions & 79 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ tenacity = "^8.2.3"
sentry-sdk = "^1.44.1"
coverage = "^7.4.2"
modin = {extras = ["ray"], version = "^0.30.1"}

filelock = "<=3.14"
[build-system]
requires = ["poetry-core>=1.1"]
build-backend = "poetry.core.masonry.api"
Expand Down
50 changes: 25 additions & 25 deletions quotaclimat/data_processing/mediatree/channel_program.json
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
{"channel_name":"tf1","start":"6:55","end":"9:30","weekday":"weekday","program_name":"Bonjour ! La Matinale","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"13:00","end":"13:40","weekday":"*","program_name":"JT 13h","program_type":"Information - Journal"}
{"channel_name":"tf1","start":"19:55","end":"20:40","weekday":"*","program_name":"JT 20h + météo","program_type":"Information - Journal"}
{"channel_name":"tf1","start":"13:40","end":"14:50","weekday":6,"program_name":"Reportage Découverte","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"14:50","end":"16:00","weekday":6,"program_name":"Grands reportages","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"17:15","end":"18:20","weekday":6,"program_name":"Sept à huit Life","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"18:20","end":"19:45","weekday":6,"program_name":"Sept à huit","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"13:40","end":"14:50","weekday":"6","program_name":"Reportage Découverte","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"14:50","end":"16:00","weekday":"6","program_name":"Grands reportages","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"17:15","end":"18:20","weekday":"6","program_name":"Sept à huit Life","program_type":"Information - Magazine"}
{"channel_name":"tf1","start":"18:20","end":"19:45","weekday":"6","program_name":"Sept à huit","program_type":"Information - Magazine"}
{"channel_name":"france2","start":"6:00","end":"6:25","weekday":"*","program_name":"Le 6h Info","program_type":"Information - Journal"}
{"channel_name":"france2","start":"6:30","end":"9:30","weekday":"*","program_name":"Télématin","program_type":"Information - Autres émissions"}
{"channel_name":"france2","start":"13:00","end":"13:40","weekday":"weekday","program_name":"JT 13h","program_type":"Information - Journal"}
{"channel_name":"france2","start":"13:00","end":"13:15","weekday":"weekend","program_name":"JT 13h","program_type":"Information - Journal"}
{"channel_name":"france2","start":"13:15","end":"14:05","weekday":5,"program_name":"13h15 le samedi","program_type":"Information - Journal"}
{"channel_name":"france2","start":"13:15","end":"14:05","weekday":6,"program_name":"13h15 le dimanche","program_type":"Information - Journal"}
{"channel_name":"france2","start":"10:35","end":"11:00","weekday":6,"program_name":"Nous les européens","program_type":"Information - Journal"}
{"channel_name":"france2","start":"13:15","end":"14:05","weekday":"5","program_name":"13h15 le samedi","program_type":"Information - Journal"}
{"channel_name":"france2","start":"13:15","end":"14:05","weekday":"6","program_name":"13h15 le dimanche","program_type":"Information - Journal"}
{"channel_name":"france2","start":"10:35","end":"11:00","weekday":"6","program_name":"Nous les européens","program_type":"Information - Journal"}
{"channel_name":"france2","start":"19:55","end":"20:40","weekday":"weekday","program_name":"JT 20h + météo","program_type":"Information - Journal"}
{"channel_name":"france2","start":"19:55","end":"20:30","weekday":"weekend","program_name":"JT 20h + météo","program_type":"Information - Journal"}
{"channel_name":"france2","start":"21:10","end":"23:00","weekday":3,"program_name":"Envoyé spécial","program_type":"Information - Magazine"}
{"channel_name":"france2","start":"20:30","end":"21:00","weekday":5,"program_name":"20h30 le samedi","program_type":"Information - Journal"}
{"channel_name":"france2","start":"20:30","end":"21:00","weekday":6,"program_name":"20h30 le dimanche","program_type":"Information - Journal"}
{"channel_name":"france2","start":"21:10","end":"23:00","weekday":"3","program_name":"Envoyé spécial","program_type":"Information - Magazine"}
{"channel_name":"france2","start":"20:30","end":"21:00","weekday":"5","program_name":"20h30 le samedi","program_type":"Information - Journal"}
{"channel_name":"france2","start":"20:30","end":"21:00","weekday":"6","program_name":"20h30 le dimanche","program_type":"Information - Journal"}
{"channel_name":"fr3-idf","start":"7:00","end":"9:00","weekday":"weekday","program_name":"Ici Matin","program_type":"Information - Journal"}
{"channel_name":"fr3-idf","start":"12:00","end":"12:50","weekday":"*","program_name":"JT 12h","program_type":"Information - Journal"}
{"channel_name":"fr3-idf","start":"19:00","end":"19:55","weekday":"*","program_name":"JT 19h + météo","program_type":"Information - Journal"}
{"channel_name":"m6","start":"12:45","end":"13:15","weekday":"*","program_name":"JT 1245","program_type":"Information - Journal"}
{"channel_name":"m6","start":"13:50","end":"17:20","weekday":"weekday","program_name":"1 jour un doc","program_type":"Information - Magazine"}
{"channel_name":"m6","start":"13:15","end":"13:50","weekday":"weekday","program_name":"1245 le mag","program_type":"Information - Magazine"}
{"channel_name":"m6","start":"19:40","end":"20:10","weekday":"*","program_name":"JT 1945 + météo","program_type":"Information - Journal"}
{"channel_name":"m6","start":"11:00","end":"12:30","weekday":5,"program_name":"66 minutes samedi","program_type":"Information - Magazine"}
{"channel_name":"m6","start":"16:35","end":"19:30","weekday":6,"program_name":"66 minutes","program_type":"Information - Magazine"}
{"channel_name":"m6","start":"21:10","end":"23:15","weekday":6,"program_name":"Capital / Zone interdite","program_type":"Information - Magazine"}
{"channel_name":"m6","start":"11:00","end":"12:30","weekday":"5","program_name":"66 minutes samedi","program_type":"Information - Magazine"}
{"channel_name":"m6","start":"16:35","end":"19:30","weekday":"6","program_name":"66 minutes","program_type":"Information - Magazine"}
{"channel_name":"m6","start":"21:10","end":"23:15","weekday":"6","program_name":"Capital / Zone interdite","program_type":"Information - Magazine"}
{"channel_name":"arte","start":"19:45","end":"20:05","weekday":"*","program_name":"JT","program_type":"Information - Journal"}
{"channel_name":"arte","start":"20:05","end":"20:50","weekday":"*","program_name":"28 minutes","program_type":"Information - Magazine"}
{"channel_name":"d8","start":"12:30","end":"13:00","weekday":"weekday","program_name":"Le journal du jour + météo","program_type":"Information - Journal"}
Expand All @@ -35,15 +35,15 @@
{"channel_name":"lci","start":"6:00","end":"23:00","weekday":"*","program_name":"Information en continu","program_type":"Information en continu"}
{"channel_name":"france24","start":"6:00","end":"23:00","weekday":"*","program_name":"Information en continu","program_type":"Information en continu"}
{"channel_name":"franceinfotv","start":"6:00","end":"23:00","weekday":"*","program_name":"Information en continu","program_type":"Information en continu"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":0,"program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":0,"program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":1,"program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":1,"program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":2,"program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":2,"program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":3,"program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":3,"program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"9:00","weekday":4,"program_name":"Le 6/9","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":"0","program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":"0","program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":"1","program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":"1","program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":"2","program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":"2","program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"7:00","weekday":"3","program_name":"Le 5/7","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"7:00","end":"10:00","weekday":"3","program_name":"Le 7/10","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"9:00","weekday":"4","program_name":"Le 6/9","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"6:00","end":"9:00","weekday":"weekend","program_name":"Le 6/9","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"13:00","end":"14:00","weekday":"weekday","program_name":"Le 13/14","program_type":"Information - Magazine"}
{"channel_name":"france-inter","start":"13:00","end":"13:30","weekday":"weekend","program_name":"Le journal de 13h du WE","program_type":"Information - Journal"}
Expand All @@ -55,10 +55,10 @@
{"channel_name":"rtl","start":"7:00","end":"9:00","weekday":"weekday","program_name":"RTL Matin","program_type":"Information - Magazine"}
{"channel_name":"rtl","start":"12:00","end":"13:00","weekday":"weekday","program_name":"RTL Midi","program_type":"Information - Magazine"}
{"channel_name":"rtl","start":"18:00","end":"20:00","weekday":"weekday","program_name":"RTL Bonsoir","program_type":"Information - Magazine"}
{"channel_name":"rtl","start":"18:00","end":"18:15","weekday":5,"program_name":"Journal","program_type":"Information - Journal"}
{"channel_name":"rtl","start":"18:00","end":"18:15","weekday":"5","program_name":"Journal","program_type":"Information - Journal"}
{"channel_name":"rtl","start":"6:00","end":"9:15","weekday":"weekend","program_name":"RTL Matin","program_type":"Information - Magazine"}
{"channel_name":"rtl","start":"18:00","end":"19:15","weekday":6,"program_name":"RTL Dimanche soir","program_type":"Information - Magazine"}
{"channel_name":"rtl","start":"13:00","end":"14:00","weekday":6,"program_name":"Focus Dimanche","program_type":"Information - Magazine"}
{"channel_name":"rtl","start":"18:00","end":"19:15","weekday":"6","program_name":"RTL Dimanche soir","program_type":"Information - Magazine"}
{"channel_name":"rtl","start":"13:00","end":"14:00","weekday":"6","program_name":"Focus Dimanche","program_type":"Information - Magazine"}
{"channel_name":"rmc","start":"6:30","end":"9:00","weekday":"weekday","program_name":"Apolline Matin","program_type":"Information - Magazine"}
{"channel_name":"rmc","start":"9:00","end":"12:00","weekday":"weekday","program_name":"Les grandes gueules","program_type":"Information - Magazine"}
{"channel_name":"rmc","start":"12:00","end":"15:00","weekday":"weekday","program_name":"Estelle Midi","program_type":"Information - Magazine"}
Expand Down
18 changes: 9 additions & 9 deletions quotaclimat/data_processing/mediatree/channel_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ def get_programs():
current_dir = os.path.dirname(os.path.abspath(__file__))
json_file_path = os.path.join(current_dir, 'channel_program.json')
data_dtype = { # UserWarning: `read_*` implementation has mismatches with pandas:
"channel_name":pd.StringDtype,
"start":pd.StringDtype,
"end":pd.StringDtype,
"weekday":pd.StringDtype,
"program_name":pd.StringDtype,
"program_type":pd.StringDtype
"channel_name":str,
"start":str,
"end":str,
"weekday":str,
"program_name":str,
"program_type":str
}
logging.debug(f"Reading {json_file_path}")
df_programs = pd.read_json(json_file_path, lines=True, dtype=data_dtype)
Expand All @@ -44,11 +44,11 @@ def add_channel_program(df: pd.DataFrame):
logging.error("Could not merge program and subtitle df", error)
raise Exception

def compare_weekday(df_program_weekday, start_weekday: int) -> bool:
def compare_weekday(df_program_weekday: str, start_weekday: int) -> bool:
logging.debug(f"Comparing weekday {start_weekday} with df_program_weekday value : {df_program_weekday}")
match isinstance(df_program_weekday, str):
match not df_program_weekday.isdigit():
case False: #int case
return start_weekday == df_program_weekday
return start_weekday == int(df_program_weekday)
case True: # string case
match df_program_weekday:
case '*': return True
Expand Down
3 changes: 2 additions & 1 deletion quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def update_keywords(session: Session, batch_size: int = 50000, start_offset : in

logging.info(f"Updating {total_updates} saved keywords from {start_offset} offsets - batch size {batch_size} - until offset {until_offset}")
df_programs = get_programs()

logging.debug("Got channel programs")
for i in range(start_offset, until_offset, batch_size):
current_batch_saved_keywords = get_keywords_columns(session, i, batch_size)
logging.info(f"Updating {len(current_batch_saved_keywords)} elements from {i} offsets - batch size {batch_size} - until offset {until_offset}")
Expand Down Expand Up @@ -100,6 +100,7 @@ def update_keywords(session: Session, batch_size: int = 50000, start_offset : in


def get_keywords_columns(session: Session, page: int = 0, batch_size: int = 50000) -> list:
logging.debug(f"Getting {batch_size} elements from offset {page}")
return (
session.query(
Keywords.id,
Expand Down
6 changes: 3 additions & 3 deletions test/sitemap/test_program_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,6 @@ def test_compare_weekday_string():
assert compare_weekday('weekend', 3) == False

def test_compare_weekday_int():
assert compare_weekday(1, 5) == False
assert compare_weekday(1, 1) == True
assert compare_weekday(4, 4) == True
assert compare_weekday("1", 5) == False
assert compare_weekday("1", 1) == True
assert compare_weekday("4", 4) == True
2 changes: 1 addition & 1 deletion transform_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def generate_program_id(channel_name, weekday, program_name):
else:
# from 1 to 7 to simplify SQL queries
new_program_data = program_data.copy()
new_program_data['weekday'] = new_program_data['weekday'] + 1
new_program_data['weekday'] = int(new_program_data['weekday']) + 1
programs.append(new_program_data)

for program in programs:
Expand Down

1 comment on commit a7072c3

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
| --- | --- | --- | --- | --- |
| postgres | | | | |
| insert_data.py | 44 | 7 | 84% | 36–38, 57–59, 64 |
| insert_existing_data_example.py | 19 | 3 | 84% | 25–27 |
| postgres/schemas | | | | |
| models.py | 146 | 10 | 93% | 120–127, 139–140, 198–199, 213–214 |
| quotaclimat/data_ingestion | | | | |
| scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
| quotaclimat/data_ingestion/ingest_db | | | | |
| ingest_sitemap_in_db.py | 55 | 37 | 33% | 21–42, 45–58, 62–73 |
| quotaclimat/data_ingestion/scrap_html | | | | |
| scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| quotaclimat/data_processing/mediatree | | | | |
| api_import.py | 200 | 123 | 38% | 43–47, 52–67, 71–74, 80, 83–122, 128–143, 147–148, 161–173, 177–183, 196–207, 210–214, 220, 255–256, 260, 264–293, 296–298 |
| channel_program.py | 136 | 51 | 62% | 30–32, 43–45, 59, 95, 104, 142–183 |
| config.py | 15 | 2 | 87% | 7, 16 |
| detect_keywords.py | 190 | 4 | 98% | 190, 245–247 |
| update_pg_keywords.py | 53 | 38 | 28% | 14–99, 122–123, 148–177, 183 |
| utils.py | 64 | 22 | 66% | 26–50, 53, 62, 78–79 |
| quotaclimat/utils | | | | |
| healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
| logger.py | 24 | 11 | 54% | 22–24, 28–37 |
| sentry.py | 10 | 2 | 80% | 21–22 |
| TOTAL | 1181 | 344 | 71% | |

Tests Skipped Failures Errors Time
81 0 💤 0 ❌ 0 🔥 58.333s ⏱️

Please sign in to comment.