Merge branch 'main' into feat/automate-stop-words-list-pg

dataforgoodfr · Dec 2, 2024 · b427f77 · b427f77
2 parents 9e93c89 + 6df6152
commit b427f77
Show file tree

Hide file tree

Showing 13 changed files with 629 additions and 75 deletions.
diff --git a/.github/workflows/scaleway-start-import-job-update.yml b/.github/workflows/scaleway-start-import-job-update.yml
@@ -28,6 +28,7 @@ jobs:
           ,{start_date: "2024-08-01", end_date: "2024-09-01"}
           ,{start_date: "2024-09-01", end_date: "2024-10-01"}
           ,{start_date: "2024-10-01", end_date: "2024-11-01"}
+          ,{start_date: "2024-11-01", end_date: "2024-12-01"}
         ]
     runs-on: ubuntu-latest
     steps:
@@ -39,4 +40,4 @@ jobs:
         SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
         SCW_ZONE: ${{ secrets.SCW_ZONE }}
       with:
-        args: jobs definition start ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} environment-variables.UPDATE=true environment-variables.START_DATE_UPDATE=${{ matrix.dates.start_date }} environment-variables.END_DATE=${{ matrix.dates.end_date }}
+        args: jobs definition start ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} environment-variables.UPDATE=true environment-variables.START_DATE_UPDATE=${{ matrix.dates.start_date }} environment-variables.END_DATE=${{ matrix.dates.end_date }}
diff --git a/alembic/versions/ac96222af6fe_hrfp_counters.py b/alembic/versions/ac96222af6fe_hrfp_counters.py
@@ -0,0 +1,50 @@
+"""hrfp counters
+
+Revision ID: ac96222af6fe
+Revises: 30abfd828007
+Create Date: 2024-12-02 14:36:21.970968
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'ac96222af6fe'
+down_revision: Union[str, None] = '30abfd828007'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('keywords', sa.Column('number_of_changement_climatique_constat_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_changement_climatique_causes_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_changement_climatique_consequences_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_attenuation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_adaptation_climatique_solutions_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_ressources_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_ressources_solutions_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_biodiversite_concepts_generaux_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_biodiversite_causes_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_biodiversite_consequences_no_hrfp', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_biodiversite_solutions_no_hrfp', sa.Integer(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('keywords', 'number_of_biodiversite_solutions_no_hrfp')
+    op.drop_column('keywords', 'number_of_biodiversite_consequences_no_hrfp')
+    op.drop_column('keywords', 'number_of_biodiversite_causes_no_hrfp')
+    op.drop_column('keywords', 'number_of_biodiversite_concepts_generaux_no_hrfp')
+    op.drop_column('keywords', 'number_of_ressources_solutions_no_hrfp')
+    op.drop_column('keywords', 'number_of_ressources_no_hrfp')
+    op.drop_column('keywords', 'number_of_adaptation_climatique_solutions_no_hrfp')
+    op.drop_column('keywords', 'number_of_attenuation_climatique_solutions_no_hrfp')
+    op.drop_column('keywords', 'number_of_changement_climatique_consequences_no_hrfp')
+    op.drop_column('keywords', 'number_of_changement_climatique_causes_no_hrfp')
+    op.drop_column('keywords', 'number_of_changement_climatique_constat_no_hrfp')
+    # ### end Alembic commands ###
diff --git a/postgres/program_metadata.json b/postgres/program_metadata.json
@@ -553,7 +553,7 @@
         "program_name": "Information en continu",
         "program_type": "Information en continu",
         "duration": 1020,
-        "channel_title": "France Info",
+        "channel_title": "France Info TV",
         "public": true,
         "infocontinue": true,
         "radio": false,
@@ -1721,7 +1721,7 @@
         "program_name": "Information en continu",
         "program_type": "Information en continu",
         "duration": 1020,
-        "channel_title": "France Info",
+        "channel_title": "France Info TV",
         "public": true,
         "infocontinue": true,
         "radio": false,
@@ -2889,7 +2889,7 @@
         "program_name": "Information en continu",
         "program_type": "Information en continu",
         "duration": 1020,
-        "channel_title": "France Info",
+        "channel_title": "France Info TV",
         "public": true,
         "infocontinue": true,
         "radio": false,
@@ -4089,7 +4089,7 @@
         "program_name": "Information en continu",
         "program_type": "Information en continu",
         "duration": 1020,
-        "channel_title": "France Info",
+        "channel_title": "France Info TV",
         "public": true,
         "infocontinue": true,
         "radio": false,
@@ -5241,7 +5241,7 @@
         "program_name": "Information en continu",
         "program_type": "Information en continu",
         "duration": 1020,
-        "channel_title": "France Info",
+        "channel_title": "France Info TV",
         "public": true,
         "infocontinue": true,
         "radio": false,
@@ -6297,7 +6297,7 @@
         "program_name": "Information en continu",
         "program_type": "Information en continu",
         "duration": 1020,
-        "channel_title": "France Info",
+        "channel_title": "France Info TV",
         "public": true,
         "infocontinue": true,
         "radio": false,
@@ -7321,7 +7321,7 @@
         "program_name": "Information en continu",
         "program_type": "Information en continu",
         "duration": 1020,
-        "channel_title": "France Info",
+        "channel_title": "France Info TV",
         "public": true,
         "infocontinue": true,
         "radio": false,

diff --git a/postgres/schemas/models.py b/postgres/schemas/models.py
@@ -88,6 +88,17 @@ class Keywords(Base):
     number_of_keywords_climat = Column(Integer) # sum of all climatique counters without duplicate (like number_of_keywords)
     number_of_keywords_biodiversite = Column(Integer) # sum of all biodiversite counters without duplicate
     number_of_keywords_ressources = Column(Integer) # sum of all ressources counters without duplicate
+    number_of_changement_climatique_constat_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_changement_climatique_constat_no_hrfp integer;
+    number_of_changement_climatique_causes_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_changement_climatique_causes_no_hrfp integer;
+    number_of_changement_climatique_consequences_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_changement_climatique_consequences_no_hrfp integer;
+    number_of_attenuation_climatique_solutions_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_attenuation_climatique_solutions_no_hrfp integer;
+    number_of_adaptation_climatique_solutions_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_adaptation_climatique_solutions_no_hrfp integer;
+    number_of_ressources_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_ressources_no_hrfp integer;
+    number_of_ressources_solutions_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_ressources_solutions_no_hrfp integer;
+    number_of_biodiversite_concepts_generaux_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_concepts_generaux_no_hrfp integer;
+    number_of_biodiversite_causes_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_causes_no_hrfp integer;
+    number_of_biodiversite_consequences_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_consequences_no_hrfp integer;
+    number_of_biodiversite_solutions_no_hrfp= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_solutions_no_hrfp integer;
 
 class Channel_Metadata(Base):
     __tablename__ = channel_metadata_table

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "quotaclimat"
-version = "1.0.44"
+version = "1.0.48"
 description = ""
 authors = [
     "Rambier Estelle <[email protected]>",

diff --git a/quotaclimat/data_processing/mediatree/channel_program.py b/quotaclimat/data_processing/mediatree/channel_program.py
@@ -190,7 +190,7 @@ def get_channel_title_for_name(channel_name: str) -> str:
         case "lci":
             return "LCI"
         case "franceinfotv":
-            return "France Info"
+            return "France Info TV"
         case "itele":
             return "CNews"
         case "europe1":

diff --git a/quotaclimat/data_processing/mediatree/detect_keywords.py b/quotaclimat/data_processing/mediatree/detect_keywords.py
@@ -131,7 +131,7 @@ def remove_stopwords(plaintext: str) -> str:
 @sentry_sdk.trace
 def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
     keywords_with_timestamp = []
-    number_of_elements_in_array = 17
+    number_of_elements_in_array = 28
     default_window_in_seconds = DEFAULT_WINDOW_DURATION
     plaitext_without_stopwords = remove_stopwords(plaintext)
     logging.debug(f"display datetime start {start}")
@@ -192,8 +192,32 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
         number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
         number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
         number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])
-
-        return [
+
+        # Counters that exclude keywords flagged as high risk of false positive (hrfp)
+        number_of_changement_climatique_constat_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_constat"], \
+            count_high_risk_false_positive=False)
+        number_of_changement_climatique_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_causes"], \
+            count_high_risk_false_positive=False)
+        number_of_changement_climatique_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_consequences"], \
+            count_high_risk_false_positive=False)
+        number_of_attenuation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"], \
+            count_high_risk_false_positive=False)
+        number_of_adaptation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"], \
+            count_high_risk_false_positive=False)
+        number_of_ressources_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources"], \
+            count_high_risk_false_positive=False)
+        number_of_ressources_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources_solutions"], \
+            count_high_risk_false_positive=False)
+        number_of_biodiversite_concepts_generaux_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"], \
+            count_high_risk_false_positive=False)
+        number_of_biodiversite_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"], \
+            count_high_risk_false_positive=False)
+        number_of_biodiversite_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"], \
+            count_high_risk_false_positive=False)
+        number_of_biodiversite_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"], \
+            count_high_risk_false_positive=False)
+
+        return [ # Change number_of_elements_in_array if a new element is added here
             theme
             ,keywords_with_timestamp 
             ,number_of_keywords
@@ -211,8 +235,20 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
             ,number_of_keywords_climat
             ,number_of_keywords_biodiversite
             ,number_of_keywords_ressources
+            ,number_of_changement_climatique_constat_no_hrfp
+            ,number_of_changement_climatique_causes_no_hrfp
+            ,number_of_changement_climatique_consequences_no_hrfp
+            ,number_of_attenuation_climatique_solutions_no_hrfp
+            ,number_of_adaptation_climatique_solutions_no_hrfp
+            ,number_of_ressources_no_hrfp
+            ,number_of_ressources_solutions_no_hrfp
+            ,number_of_biodiversite_concepts_generaux_no_hrfp
+            ,number_of_biodiversite_causes_no_hrfp
+            ,number_of_biodiversite_consequences_no_hrfp
+            ,number_of_biodiversite_solutions_no_hrfp
         ]
     else:
+        logging.info("Empty keywords")
         return [None] * number_of_elements_in_array
 
 def get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds: int = 20):
@@ -274,6 +310,17 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
                  ,"number_of_keywords_climat"
                  ,"number_of_keywords_biodiversite"
                  ,"number_of_keywords_ressources"
+                 ,"number_of_changement_climatique_constat_no_hrfp"
+                 ,"number_of_changement_climatique_causes_no_hrfp"
+                 ,"number_of_changement_climatique_consequences_no_hrfp"
+                 ,"number_of_attenuation_climatique_solutions_no_hrfp"
+                 ,"number_of_adaptation_climatique_solutions_no_hrfp"
+                 ,"number_of_ressources_no_hrfp"
+                 ,"number_of_ressources_solutions_no_hrfp"
+                 ,"number_of_biodiversite_concepts_generaux_no_hrfp"
+                 ,"number_of_biodiversite_causes_no_hrfp"
+                 ,"number_of_biodiversite_consequences_no_hrfp"
+                 ,"number_of_biodiversite_solutions_no_hrfp"
                 ]
             ] = df[['plaintext','srt', 'start']]\
                 .swifter.apply(\
@@ -282,9 +329,10 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
                         result_type='expand'
                 )
 
+            logging.info("Dropping")
             # remove all rows that does not have themes
             df = df.dropna(subset=['theme'], how='any') # any is for None values
-
+            logging.info("Droped")
             logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")
 
             return df
@@ -302,15 +350,21 @@ def add_primary_key(row):
 def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
     return list(filter(lambda kw: indirectes not in kw['theme'], keywords_with_timestamp))
 
-def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
+def filter_high_risk_false_positive(keywords_with_timestamp: List[dict]) -> List[dict]:
+    return list(filter(lambda kw: 'hrfp' not in kw, keywords_with_timestamp))
+
+def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None, count_high_risk_false_positive: bool = True) -> int:
     total_keywords = len(keywords_with_timestamp)
     if(total_keywords) == 0:
         return 0
     else:
+        logging.debug(f"keywords_with_timestamp is {keywords_with_timestamp}")
         if theme is not None:
             logging.debug(f"filter theme {theme}")
             keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))
-
+        if count_high_risk_false_positive is False:
+            keywords_with_timestamp = filter_high_risk_false_positive(keywords_with_timestamp)
+        logging.debug(f"keywords_with_timestamp is after filtering {keywords_with_timestamp}")
         length_filtered_items = len(keywords_with_timestamp)
 
         if length_filtered_items > 0:
@@ -357,7 +411,9 @@ def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[
 
         if( contains_direct_keywords_same_suject(neighbour_keywords, keyword_info['theme']) ) :
             logging.debug(f"Transforming false positive to positive { keyword_info['keyword']} { keyword_info['theme']}")
-            keyword_info['theme'] = remove_indirect(keyword_info['theme'])
+            if indirectes in keyword_info['theme']:
+                keyword_info['theme'] = remove_indirect(keyword_info['theme'])
+                keyword_info['hrfp'] = True # records that this keyword was transformed from an indirect to a direct keyword
 
     return keywords_with_timestamp
 
@@ -383,7 +439,4 @@ def tag_wanted_duration_second_window_number(keywords_with_timestamp: List[dict]
     return keywords_with_timestamp
 
 def remove_indirect(theme: str) -> str:
-    if indirectes in theme:
-        return theme.replace(f'_{indirectes}', '')
-    else:
-        return theme
+    return theme.replace(f'_{indirectes}', '')