Skip to content

Commit

Permalink
Chores: update libs - add channel_title to keywords (#182)
Browse files Browse the repository at this point in the history
* chores: update lib - add channel_title to keywords

* remove log
  • Loading branch information
polomarcus authored Jun 5, 2024
1 parent a0f08c1 commit 7df4389
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 593 deletions.
31 changes: 31 additions & 0 deletions alembic/versions/055173743036_keywords_add_channel_title.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""keywords: add channel_title
Revision ID: 055173743036
Revises: a0a707673259
Create Date: 2024-06-05 11:43:22.071610
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '055173743036'
down_revision: Union[str, None] = 'a0a707673259'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Apply the migration: add a nullable ``channel_title`` column to ``keywords``.

    Auto-generated by Alembic and reviewed; the column is nullable so existing
    rows need no backfill at migration time.
    """
    op.add_column(
        'keywords',
        sa.Column('channel_title', sa.String(), nullable=True),
    )


def downgrade() -> None:
    """Revert the migration: drop the ``channel_title`` column from ``keywords``."""
    op.drop_column('keywords', 'channel_title')
7 changes: 5 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ services:
#entrypoint: ["python", "quotaclimat/data_processing/mediatree/api_import.py"]
environment:
ENV: docker # change me to prod for real cases
LOGLEVEL: INFO # Change me to info (debug, info, warning, error) to have less log
LOGLEVEL: DEBUG # Change me to info (debug, info, warning, error) to have less log
PYTHONPATH: /app
POSTGRES_USER: user
POSTGRES_DB: barometre
Expand All @@ -117,14 +117,17 @@ services:
#UPDATE: "true" # to batch update PG
#UPDATE_PROGRAM_ONLY: "true" # to batch update PG but only channel with program
# START_OFFSET: 100 # to batch update PG from a offset
START_DATE: 1714026680 # to test batch import
# START_DATE: 1717227223 # to test batch import
CHANNEL : france2 # to reimport only one channel
MEDIATREE_USER : /run/secrets/username_api
MEDIATREE_PASSWORD: /run/secrets/pwd_api
MEDIATREE_AUTH_URL: https://keywords.mediatree.fr/api/auth/token/
KEYWORDS_URL: https://keywords.mediatree.fr/api/subtitle/ # https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList
MODIN_ENGINE: ray
MODIN_CPUS: 4 # "https://modin.readthedocs.io/en/0.11.0/using_modin.html#reducing-or-limiting-the-resources-modin-can-use"
MODIN_MEMORY: 1000000000 # 1Gb
RAY_memory_usage_threshold: 1
mem_limit: "1G"
volumes:
- ./quotaclimat/:/app/quotaclimat/
- ./postgres/:/app/postgres/
Expand Down
578 changes: 9 additions & 569 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class Keywords(Base):

id = Column(Text, primary_key=True)
channel_name = Column(String, nullable=False)
channel_title = Column(String, nullable=True)
channel_program = Column(String, nullable=True) # arcom - alembic handles this
channel_program_type = Column(String, nullable=True) # arcom - (magazine, journal etc) alembic handles this
channel_radio = Column(Boolean, nullable=True)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ swifter = "^1.4.0"
tenacity = "^8.2.3"
sentry-sdk = "^1.44.1"
coverage = "^7.4.2"
modin = {extras = ["ray"], version = "^0.29.0"}
modin = {extras = ["ray"], version = "^0.30.0"}

[build-system]
requires = ["poetry-core>=1.1"]
Expand Down
33 changes: 18 additions & 15 deletions quotaclimat/data_processing/mediatree/api_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,9 @@ def get_channels():
async def get_and_save_api_data(exit_event):
with sentry_sdk.start_transaction(op="task", name="get_and_save_api_data"):
try:
logging.warning(f"Available CPUS {os.cpu_count()} - MODIN_CPUS config : {os.environ.get('MODIN_CPUS', 0)}")
context = ray.init(
dashboard_host="0.0.0.0", # for docker dashboard
_system_config={

"object_spilling_config": json.dumps(
{"type": "filesystem", "params": {"directory_path": "/tmp/spill"}},
)
},
)
logging.info(f"ray context dahsboard : {context.dashboard_url}")
conn = connect_to_db()
Expand All @@ -109,8 +104,8 @@ async def get_and_save_api_data(exit_event):
for index, program in programs_for_this_day.iterrows():
start_epoch = program['start']
end_epoch = program['end']
channel_program = program['program_name']
channel_program_type = program['program_type']
channel_program = str(program['program_name'])
channel_program_type = str(program['program_type'])
logging.info(f"Querying API for {channel} - {channel_program} - {channel_program_type} - {start_epoch} - {end_epoch}")
df = extract_api_sub(token, channel, type_sub, start_epoch,end_epoch, channel_program,channel_program_type)
if(df is not None):
Expand Down Expand Up @@ -233,16 +228,24 @@ def parse_reponse_subtitle(response_sub, channel = None, channel_program = "", c

new_df : pd.DataFrame = json_normalize(response_sub.get('data'))
logging.debug("Schema from API before formatting :\n%s", new_df.dtypes)
new_df.drop('channel.title', axis=1, inplace=True) # keep only channel.name

new_df['timestamp'] = pd.to_datetime(new_df['start'], unit='s', utc=True)
new_df.drop('start', axis=1, inplace=True) # keep only channel.name

new_df.rename(columns={'channel.name':'channel_name', 'channel.radio': 'channel_radio', 'timestamp':'start'}, inplace=True)

new_df['channel_program'] = channel_program
new_df['channel_program_type'] = channel_program_type

logging.debug("renaming columns")
new_df.rename(columns={'channel.name':'channel_name',
'channel.title':'channel_title',
'channel.radio': 'channel_radio',
'timestamp':'start'
},
inplace=True
)
logging.debug(f"setting program {channel_program} type { type(channel_program)}")

# weird error if not using this way: (ValueError) format number 1 of "20h30 le samedi" is not recognized
new_df['channel_program'] = new_df.apply(lambda x: channel_program, axis=1)
new_df['channel_program_type'] = new_df.apply(lambda x: channel_program_type, axis=1)

logging.debug("programs were set")
log_dataframe_size(new_df, channel)

logging.debug("Parsed Schema\n%s", new_df.dtypes)
Expand Down
4 changes: 2 additions & 2 deletions quotaclimat/data_processing/mediatree/channel_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def get_a_program_with_start_timestamp(df_program: pd.DataFrame, start_time: pd.

def process_subtitle(row, df_program):
    """Annotate one subtitle row with its channel program metadata.

    Looks up the program airing on ``row['channel_name']`` at ``row['start']``
    in ``df_program`` and writes the result into ``row['channel_program']`` and
    ``row['channel_program_type']``. Both values are coerced to ``str``
    (presumably to keep dataframe dtypes uniform downstream — see the similar
    casts in api_import; confirm with caller).

    Returns the mutated row so it can be used with ``DataFrame.apply``.
    """
    program_name, program_type = get_a_program_with_start_timestamp(
        df_program, row['start'], row['channel_name']
    )
    row['channel_program'] = str(program_name)
    row['channel_program_type'] = str(program_type)
    return row

def merge_program_subtitle(df_subtitle: pd.DataFrame, df_program: pd.DataFrame):
Expand Down
14 changes: 10 additions & 4 deletions test/sitemap/test_mediatree.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,14 @@
"text": "adaptation"
}
],
"channel":{"name":"tf1","title":"M6","radio":false},"start":1704798120,
"channel":{"name":"tf1","title":"TF1","radio":false},"start":1704798120,
"plaintext":"test2"}
],
"elapsed_time_ms":335}
""")

def test_parse_reponse_subtitle():
channel_program = "13h15 le samedi"
expected_result = pd.DataFrame([{
"srt": [{
"duration_ms": 34,
Expand All @@ -82,9 +83,10 @@ def test_parse_reponse_subtitle():
],
"plaintext" : plaintext1,
"channel_name" : "m6",
"channel_title" : "M6",
"channel_radio" : False,
"start" : 1704798000,
"channel_program" : "",
"channel_program" : channel_program,
"channel_program_type" : "",
},
{
Expand All @@ -97,13 +99,14 @@ def test_parse_reponse_subtitle():
"plaintext" : plaintext2,
"channel_name" : "tf1",
"channel_radio" : False,
"channel_title" : "TF1",
"start" : 1704798120,
"channel_program" : "",
"channel_program" : channel_program,
"channel_program_type" : "",
}])

expected_result['start'] = pd.to_datetime(expected_result['start'], unit='s').dt.tz_localize('UTC')
df = parse_reponse_subtitle(json_response)
df = parse_reponse_subtitle(json_response, channel = None, channel_program = channel_program, channel_program_type = "")
debug_df(df)

pd.testing.assert_frame_equal(df._to_pandas().reset_index(drop=True), expected_result.reset_index(drop=True))
Expand Down Expand Up @@ -144,11 +147,13 @@ def test_save_to_pg_keyword():
"ressources_concepts_generaux",
]
channel_name = "m6"
channel_title = "M6"
df = pd.DataFrame([{
"id" : primary_key,
"start": 1706437079006,
"plaintext": "cheese pizza habitabilité de la planète conditions de vie sur terre animal",
"channel_name": channel_name,
"channel_title": channel_title,
"channel_radio": False,
"theme": themes,
"keywords_with_timestamp": keywords_with_timestamp
Expand All @@ -164,6 +169,7 @@ def test_save_to_pg_keyword():

assert result.id == primary_key
assert result.channel_name == channel_name
assert result.channel_title == channel_title
assert result.channel_radio == False
assert result.theme == themes
assert result.keywords_with_timestamp == keywords_with_timestamp
Expand Down
7 changes: 7 additions & 0 deletions test/sitemap/test_program_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ def test_get_a_program_with_start_timestamp():
assert program_name == "JT 13h"
assert program_type == "Information - Journal"

def test_get_13h_program_with_start_timestamp():
    """A Saturday 13:18 (Europe/Paris) timestamp resolves to '13h15 le samedi'."""
    df_programs = get_programs()
    # 1717240693 == 2024-06-01 11:18:13 UTC == 13:18:13 Europe/Paris (a Saturday)
    epoch_saturday_13h18 = 1717240693
    when = pd.to_datetime(epoch_saturday_13h18, unit='s', utc=True).tz_convert('Europe/Paris')
    program_name, program_type = get_a_program_with_start_timestamp(df_programs, when, channel_name)
    assert program_name == "13h15 le samedi"
    assert program_type == "Information - Journal"

def test_compare_weekday_string():
assert compare_weekday('*', 0) == True
assert compare_weekday('*', 3) == True
Expand Down

1 comment on commit 7df4389

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
postgres
   insert_data.py44784%36–38, 57–59, 64
   insert_existing_data_example.py19384%25–27
postgres/schemas
   models.py1431093%117–124, 136–137, 195–196, 210–211
quotaclimat/data_ingestion
   scrap_sitemap.py1341787%27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py553733%21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py36392%19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py19812139%43–47, 52–64, 68–71, 77, 80–122, 128–143, 147–148, 161–173, 177–183, 196–207, 210–214, 220, 255–256, 260, 264–287, 290–292
   channel_program.py91990%21–23, 34–36, 50, 86, 95
   config.py15287%7, 16
   detect_keywords.py180498%178, 230–232
   update_pg_keywords.py443032%14–84, 105–106, 127–152, 158
   utils.py662365%18, 29–53, 56, 65, 81–82
quotaclimat/utils
   healthcheck_config.py291452%22–24, 27–38
   logger.py241154%22–24, 28–37
   sentry.py10280%21–22
TOTAL111429374% 

Tests Skipped Failures Errors Time
80 0 💤 0 ❌ 0 🔥 59.672s ⏱️

Please sign in to comment.