Skip to content

Commit

Permalink
wip: update dep / init ray (#181)
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus authored May 30, 2024
1 parent 7884992 commit 0fe932f
Show file tree
Hide file tree
Showing 8 changed files with 900 additions and 895 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ on:

env:
PYTHON_VERSION: '3.11'
POETRY_VERSION: '1.8.2'
POETRY_VERSION: '1.8.3'

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:

env:
PYTHON_VERSION: '3.11'
POETRY_VERSION: '1.8.2'
POETRY_VERSION: '1.8.3'

jobs:
# Label of the runner job
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WORKDIR /app

COPY pyproject.toml poetry.lock ./

RUN pip install poetry==1.8.2
RUN pip install poetry==1.8.3

RUN poetry install

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile_api_import
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WORKDIR /app

COPY pyproject.toml poetry.lock ./

RUN pip install poetry==1.8.2
RUN pip install poetry==1.8.3

RUN poetry install

Expand Down
2 changes: 1 addition & 1 deletion Dockerfile_ingest
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WORKDIR /app

COPY pyproject.toml poetry.lock ./

RUN pip install poetry==1.8.2
RUN pip install poetry==1.8.3

RUN poetry install

Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ services:
build:
context: ./
dockerfile: Dockerfile
#entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
entrypoint: ["sleep", "12000"] # use to debug the container if needed
entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
#entrypoint: ["sleep", "12000"] # use to debug the container if needed
environment:
ENV: docker
# CHANNEL: "fr3-idf"
Expand Down
1,705 changes: 849 additions & 856 deletions poetry.lock

Large diffs are not rendered by default.

76 changes: 44 additions & 32 deletions quotaclimat/data_processing/mediatree/api_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from sentry_sdk.crons import monitor
import modin.pandas as pd
from modin.pandas import json_normalize
import ray
from quotaclimat.utils.sentry import sentry_init
logging.getLogger('modin.logger.default').setLevel(logging.ERROR)
logging.getLogger('distributed.scheduler').setLevel(logging.ERROR)
Expand Down Expand Up @@ -77,39 +78,50 @@ def get_channels():

async def get_and_save_api_data(exit_event):
with sentry_sdk.start_transaction(op="task", name="get_and_save_api_data"):
conn = connect_to_db()
token=get_auth_token(password=password, user_name=USER)
type_sub = 's2t'

(start_date_to_query, end_date) = get_start_end_date_env_variable_with_default()
df_programs = get_programs()
channels = get_channels()

day_range = get_date_range(start_date_to_query, end_date)
logging.info(f"Number of days to query : {len(day_range)} - day_range : {day_range}")
for day in day_range:
token = refresh_token(token, day)
try:
ray.init(
_system_config={
"object_spilling_config": json.dumps(
{"type": "filesystem", "params": {"directory_path": "/tmp/spill"}},
)
},
)
conn = connect_to_db()
token=get_auth_token(password=password, user_name=USER)
type_sub = 's2t'

(start_date_to_query, end_date) = get_start_end_date_env_variable_with_default()
df_programs = get_programs()
channels = get_channels()

for channel in channels:
try:
programs_for_this_day = get_programs_for_this_day(day, channel, df_programs)

for index, program in programs_for_this_day.iterrows():
start_epoch = program['start']
end_epoch = program['end']
channel_program = program['program_name']
channel_program_type = program['program_type']
logging.info(f"Querying API for {channel} - {channel_program} - {channel_program_type} - {start_epoch} - {end_epoch}")
df = extract_api_sub(token, channel, type_sub, start_epoch,end_epoch, channel_program,channel_program_type)
if(df is not None):
# must ._to_pandas() because modin to_sql is not working
save_to_pg(df._to_pandas(), keywords_table, conn)
else:
logging.info("Nothing to save to Postgresql")
except Exception as err:
logging.error(f"continuing loop but met error : {err}")
continue
exit_event.set()
day_range = get_date_range(start_date_to_query, end_date)
logging.info(f"Number of days to query : {len(day_range)} - day_range : {day_range}")
for day in day_range:
token = refresh_token(token, day)

for channel in channels:
try:
programs_for_this_day = get_programs_for_this_day(day, channel, df_programs)

for index, program in programs_for_this_day.iterrows():
start_epoch = program['start']
end_epoch = program['end']
channel_program = program['program_name']
channel_program_type = program['program_type']
logging.info(f"Querying API for {channel} - {channel_program} - {channel_program_type} - {start_epoch} - {end_epoch}")
df = extract_api_sub(token, channel, type_sub, start_epoch,end_epoch, channel_program,channel_program_type)
if(df is not None):
# must ._to_pandas() because modin to_sql is not working
save_to_pg(df._to_pandas(), keywords_table, conn)
else:
logging.info("Nothing to save to Postgresql")
except Exception as err:
logging.error(f"continuing loop but met error : {err}")
continue
exit_event.set()
except Exception as err:
logging.fatal("get_and_save_api_data (%s) %s" % (type(err).__name__, err))
sys.exit(1)

# "Randomly wait up to 2^x * 1 seconds between each retry until the range reaches 60 seconds, then randomly up to 60 seconds afterwards"
# @see https://github.com/jd/tenacity/tree/main
Expand Down

1 comment on commit 0fe932f

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|---|---|---|---|---|
| **postgres** | | | | |
|    insert_data.py | 44 | 7 | 84% | 36–38, 57–59, 64 |
|    insert_existing_data_example.py | 19 | 3 | 84% | 25–27 |
| **postgres/schemas** | | | | |
|    models.py | 142 | 10 | 93% | 116–123, 135–136, 194–195, 209–210 |
| **quotaclimat/data_ingestion** | | | | |
|    scrap_sitemap.py | 134 | 17 | 87% | 27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228 |
| **quotaclimat/data_ingestion/ingest_db** | | | | |
|    ingest_sitemap_in_db.py | 55 | 37 | 33% | 21–42, 45–58, 62–73 |
| **quotaclimat/data_ingestion/scrap_html** | | | | |
|    scrap_description_article.py | 36 | 3 | 92% | 19–20, 32 |
| **quotaclimat/data_processing/mediatree** | | | | |
|    api_import.py | 194 | 119 | 39% | 43–47, 52–64, 68–71, 77, 80–124, 130–145, 149–150, 163–175, 179–185, 198–209, 212–216, 222, 249–250, 254, 258–281, 284–286 |
|    channel_program.py | 91 | 9 | 90% | 21–23, 34–36, 50, 86, 95 |
|    config.py | 15 | 2 | 87% | 7, 16 |
|    detect_keywords.py | 180 | 4 | 98% | 178, 230–232 |
|    update_pg_keywords.py | 44 | 30 | 32% | 14–84, 105–106, 127–152, 158 |
|    utils.py | 66 | 23 | 65% | 18, 29–53, 56, 65, 81–82 |
| **quotaclimat/utils** | | | | |
|    healthcheck_config.py | 29 | 14 | 52% | 22–24, 27–38 |
|    logger.py | 24 | 11 | 54% | 22–24, 28–37 |
|    sentry.py | 10 | 2 | 80% | 21–22 |
| **TOTAL** | 1109 | 291 | 74% | |

Tests Skipped Failures Errors Time
79 0 💤 0 ❌ 0 🔥 58.101s ⏱️

Please sign in to comment.