Skip to content

Commit

Permalink
Chores: update libs - add channel_title to keywords (#182)
Browse files Browse the repository at this point in the history
* chores: update lib - add channel_title to keywords

* remove log
  • Loading branch information
polomarcus authored Jun 5, 2024
1 parent a0f08c1 commit 7df4389
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 593 deletions.
31 changes: 31 additions & 0 deletions alembic/versions/055173743036_keywords_add_channel_title.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""keywords: add channel_title
Revision ID: 055173743036
Revises: a0a707673259
Create Date: 2024-06-05 11:43:22.071610
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '055173743036'
down_revision: Union[str, None] = 'a0a707673259'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Apply the migration: add a nullable ``channel_title`` column to ``keywords``.

    Auto-generated by Alembic and reviewed; the column is nullable so existing
    rows need no backfill at migration time.
    """
    op.add_column(
        'keywords',
        sa.Column('channel_title', sa.String(), nullable=True),
    )


def downgrade() -> None:
    """Revert the migration: drop the ``channel_title`` column from ``keywords``."""
    op.drop_column('keywords', 'channel_title')
7 changes: 5 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ services:
#entrypoint: ["python", "quotaclimat/data_processing/mediatree/api_import.py"]
environment:
ENV: docker # change me to prod for real cases
LOGLEVEL: INFO # Change me to info (debug, info, warning, error) to have less log
LOGLEVEL: DEBUG # Change me to info (debug, info, warning, error) to have less log
PYTHONPATH: /app
POSTGRES_USER: user
POSTGRES_DB: barometre
Expand All @@ -117,14 +117,17 @@ services:
#UPDATE: "true" # to batch update PG
#UPDATE_PROGRAM_ONLY: "true" # to batch update PG but only channel with program
# START_OFFSET: 100 # to batch update PG from a offset
START_DATE: 1714026680 # to test batch import
# START_DATE: 1717227223 # to test batch import
CHANNEL : france2 # to reimport only one channel
MEDIATREE_USER : /run/secrets/username_api
MEDIATREE_PASSWORD: /run/secrets/pwd_api
MEDIATREE_AUTH_URL: https://keywords.mediatree.fr/api/auth/token/
KEYWORDS_URL: https://keywords.mediatree.fr/api/subtitle/ # https://keywords.mediatree.fr/docs/#api-Subtitle-SubtitleList
MODIN_ENGINE: ray
MODIN_CPUS: 4 # "https://modin.readthedocs.io/en/0.11.0/using_modin.html#reducing-or-limiting-the-resources-modin-can-use"
MODIN_MEMORY: 1000000000 # 1Gb
RAY_memory_usage_threshold: 1
mem_limit: "1G"
volumes:
- ./quotaclimat/:/app/quotaclimat/
- ./postgres/:/app/postgres/
Expand Down
578 changes: 9 additions & 569 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class Keywords(Base):

id = Column(Text, primary_key=True)
channel_name = Column(String, nullable=False)
channel_title = Column(String, nullable=True)
channel_program = Column(String, nullable=True) # arcom - alembic handles this
channel_program_type = Column(String, nullable=True) # arcom - (magazine, journal etc) alembic handles this
channel_radio = Column(Boolean, nullable=True)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ swifter = "^1.4.0"
tenacity = "^8.2.3"
sentry-sdk = "^1.44.1"
coverage = "^7.4.2"
modin = {extras = ["ray"], version = "^0.29.0"}
modin = {extras = ["ray"], version = "^0.30.0"}

[build-system]
requires = ["poetry-core>=1.1"]
Expand Down
33 changes: 18 additions & 15 deletions quotaclimat/data_processing/mediatree/api_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,9 @@ def get_channels():
async def get_and_save_api_data(exit_event):
with sentry_sdk.start_transaction(op="task", name="get_and_save_api_data"):
try:
logging.warning(f"Available CPUS {os.cpu_count()} - MODIN_CPUS config : {os.environ.get('MODIN_CPUS', 0)}")
context = ray.init(
dashboard_host="0.0.0.0", # for docker dashboard
_system_config={

"object_spilling_config": json.dumps(
{"type": "filesystem", "params": {"directory_path": "/tmp/spill"}},
)
},
)
logging.info(f"ray context dahsboard : {context.dashboard_url}")
conn = connect_to_db()
Expand All @@ -109,8 +104,8 @@ async def get_and_save_api_data(exit_event):
for index, program in programs_for_this_day.iterrows():
start_epoch = program['start']
end_epoch = program['end']
channel_program = program['program_name']
channel_program_type = program['program_type']
channel_program = str(program['program_name'])
channel_program_type = str(program['program_type'])
logging.info(f"Querying API for {channel} - {channel_program} - {channel_program_type} - {start_epoch} - {end_epoch}")
df = extract_api_sub(token, channel, type_sub, start_epoch,end_epoch, channel_program,channel_program_type)
if(df is not None):
Expand Down Expand Up @@ -233,16 +228,24 @@ def parse_reponse_subtitle(response_sub, channel = None, channel_program = "", c

new_df : pd.DataFrame = json_normalize(response_sub.get('data'))
logging.debug("Schema from API before formatting :\n%s", new_df.dtypes)
new_df.drop('channel.title', axis=1, inplace=True) # keep only channel.name

new_df['timestamp'] = pd.to_datetime(new_df['start'], unit='s', utc=True)
new_df.drop('start', axis=1, inplace=True) # keep only channel.name

new_df.rename(columns={'channel.name':'channel_name', 'channel.radio': 'channel_radio', 'timestamp':'start'}, inplace=True)

new_df['channel_program'] = channel_program
new_df['channel_program_type'] = channel_program_type

logging.debug("renaming columns")
new_df.rename(columns={'channel.name':'channel_name',
'channel.title':'channel_title',
'channel.radio': 'channel_radio',
'timestamp':'start'
},
inplace=True
)
logging.debug(f"setting program {channel_program} type { type(channel_program)}")

# weird error if not using this way: (ValueError) format number 1 of "20h30 le samedi" is not recognized
new_df['channel_program'] = new_df.apply(lambda x: channel_program, axis=1)
new_df['channel_program_type'] = new_df.apply(lambda x: channel_program_type, axis=1)

logging.debug("programs were set")
log_dataframe_size(new_df, channel)

logging.debug("Parsed Schema\n%s", new_df.dtypes)
Expand Down
4 changes: 2 additions & 2 deletions quotaclimat/data_processing/mediatree/channel_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ def get_a_program_with_start_timestamp(df_program: pd.DataFrame, start_time: pd.

def process_subtitle(row, df_program):
    """Annotate one subtitle row with its channel program metadata.

    Looks up the program airing on ``row['channel_name']`` at ``row['start']``
    in ``df_program`` and writes the result into ``row['channel_program']`` and
    ``row['channel_program_type']``. Both values are coerced to ``str``
    (presumably to keep dataframe dtypes uniform downstream — see the similar
    casts in api_import; confirm with caller).

    Returns the mutated row so it can be used with ``DataFrame.apply``.
    """
    program_name, program_type = get_a_program_with_start_timestamp(
        df_program, row['start'], row['channel_name']
    )
    row['channel_program'] = str(program_name)
    row['channel_program_type'] = str(program_type)
    return row

def merge_program_subtitle(df_subtitle: pd.DataFrame, df_program: pd.DataFrame):
Expand Down
14 changes: 10 additions & 4 deletions test/sitemap/test_mediatree.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,14 @@
"text": "adaptation"
}
],
"channel":{"name":"tf1","title":"M6","radio":false},"start":1704798120,
"channel":{"name":"tf1","title":"TF1","radio":false},"start":1704798120,
"plaintext":"test2"}
],
"elapsed_time_ms":335}
""")

def test_parse_reponse_subtitle():
channel_program = "13h15 le samedi"
expected_result = pd.DataFrame([{
"srt": [{
"duration_ms": 34,
Expand All @@ -82,9 +83,10 @@ def test_parse_reponse_subtitle():
],
"plaintext" : plaintext1,
"channel_name" : "m6",
"channel_title" : "M6",
"channel_radio" : False,
"start" : 1704798000,
"channel_program" : "",
"channel_program" : channel_program,
"channel_program_type" : "",
},
{
Expand All @@ -97,13 +99,14 @@ def test_parse_reponse_subtitle():
"plaintext" : plaintext2,
"channel_name" : "tf1",
"channel_radio" : False,
"channel_title" : "TF1",
"start" : 1704798120,
"channel_program" : "",
"channel_program" : channel_program,
"channel_program_type" : "",
}])

expected_result['start'] = pd.to_datetime(expected_result['start'], unit='s').dt.tz_localize('UTC')
df = parse_reponse_subtitle(json_response)
df = parse_reponse_subtitle(json_response, channel = None, channel_program = channel_program, channel_program_type = "")
debug_df(df)

pd.testing.assert_frame_equal(df._to_pandas().reset_index(drop=True), expected_result.reset_index(drop=True))
Expand Down Expand Up @@ -144,11 +147,13 @@ def test_save_to_pg_keyword():
"ressources_concepts_generaux",
]
channel_name = "m6"
channel_title = "M6"
df = pd.DataFrame([{
"id" : primary_key,
"start": 1706437079006,
"plaintext": "cheese pizza habitabilité de la planète conditions de vie sur terre animal",
"channel_name": channel_name,
"channel_title": channel_title,
"channel_radio": False,
"theme": themes,
"keywords_with_timestamp": keywords_with_timestamp
Expand All @@ -164,6 +169,7 @@ def test_save_to_pg_keyword():

assert result.id == primary_key
assert result.channel_name == channel_name
assert result.channel_title == channel_title
assert result.channel_radio == False
assert result.theme == themes
assert result.keywords_with_timestamp == keywords_with_timestamp
Expand Down
7 changes: 7 additions & 0 deletions test/sitemap/test_program_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ def test_get_a_program_with_start_timestamp():
assert program_name == "JT 13h"
assert program_type == "Information - Journal"

def test_get_13h_program_with_start_timestamp():
    """A Saturday 13:18 (Europe/Paris) timestamp resolves to '13h15 le samedi'."""
    df_programs = get_programs()
    # 1717240693 == 2024-06-01 11:18:13 UTC == 13:18:13 Europe/Paris (a Saturday)
    epoch_saturday_13h18 = 1717240693
    when = pd.to_datetime(epoch_saturday_13h18, unit='s', utc=True).tz_convert('Europe/Paris')
    program_name, program_type = get_a_program_with_start_timestamp(df_programs, when, channel_name)
    assert program_name == "13h15 le samedi"
    assert program_type == "Information - Journal"

def test_compare_weekday_string():
assert compare_weekday('*', 0) == True
assert compare_weekday('*', 3) == True
Expand Down

1 comment on commit 7df4389

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
postgres
   insert_data.py44784%36–38, 57–59, 64
   insert_existing_data_example.py19384%25–27
postgres/schemas
   models.py1431093%117–124, 136–137, 195–196, 210–211
quotaclimat/data_ingestion
   scrap_sitemap.py1341787%27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py553733%21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py36392%19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py19812139%43–47, 52–64, 68–71, 77, 80–122, 128–143, 147–148, 161–173, 177–183, 196–207, 210–214, 220, 255–256, 260, 264–287, 290–292
   channel_program.py91990%21–23, 34–36, 50, 86, 95
   config.py15287%7, 16
   detect_keywords.py180498%178, 230–232
   update_pg_keywords.py443032%14–84, 105–106, 127–152, 158
   utils.py662365%18, 29–53, 56, 65, 81–82
quotaclimat/utils
   healthcheck_config.py291452%22–24, 27–38
   logger.py241154%22–24, 28–37
   sentry.py10280%21–22
TOTAL111429374% 

Tests Skipped Failures Errors Time
80 0 💤 0 ❌ 0 🔥 59.672s ⏱️

Please sign in to comment.