Skip to content

Commit

Permalink
Merge branch 'main' into feat/automate-stop-words-list-pg
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus committed Dec 2, 2024
2 parents 9e93c89 + 6df6152 commit b427f77
Show file tree
Hide file tree
Showing 13 changed files with 629 additions and 75 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/scaleway-start-import-job-update.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ jobs:
,{start_date: "2024-08-01", end_date: "2024-09-01"}
,{start_date: "2024-09-01", end_date: "2024-10-01"}
,{start_date: "2024-10-01", end_date: "2024-11-01"}
,{start_date: "2024-11-01", end_date: "2024-12-01"}
]
runs-on: ubuntu-latest
steps:
Expand All @@ -39,4 +40,4 @@ jobs:
SCW_ORGANIZATION_ID: ${{ secrets.SCW_ORGANIZATION_ID }}
SCW_ZONE: ${{ secrets.SCW_ZONE }}
with:
args: jobs definition start ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} environment-variables.UPDATE=true environment-variables.START_DATE_UPDATE=${{ matrix.dates.start_date }} environment-variables.END_DATE=${{ matrix.dates.end_date }}
args: jobs definition start ${{ secrets.SCALEWAY_JOB_IMPORT_ID }} environment-variables.UPDATE=true environment-variables.START_DATE_UPDATE=${{ matrix.dates.start_date }} environment-variables.END_DATE=${{ matrix.dates.end_date }}
50 changes: 50 additions & 0 deletions alembic/versions/ac96222af6fe_hrfp_counters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""hrfp counters
Revision ID: ac96222af6fe
Revises: 30abfd828007
Create Date: 2024-12-02 14:36:21.970968
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'ac96222af6fe'
down_revision: Union[str, None] = '30abfd828007'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the ``*_no_hrfp`` counter columns to the ``keywords`` table.

    Every column is a nullable ``Integer``; the columns are added in the
    order listed below.
    """
    no_hrfp_columns = (
        'number_of_changement_climatique_constat_no_hrfp',
        'number_of_changement_climatique_causes_no_hrfp',
        'number_of_changement_climatique_consequences_no_hrfp',
        'number_of_attenuation_climatique_solutions_no_hrfp',
        'number_of_adaptation_climatique_solutions_no_hrfp',
        'number_of_ressources_no_hrfp',
        'number_of_ressources_solutions_no_hrfp',
        'number_of_biodiversite_concepts_generaux_no_hrfp',
        'number_of_biodiversite_causes_no_hrfp',
        'number_of_biodiversite_consequences_no_hrfp',
        'number_of_biodiversite_solutions_no_hrfp',
    )
    for column_name in no_hrfp_columns:
        op.add_column('keywords', sa.Column(column_name, sa.Integer(), nullable=True))


def downgrade() -> None:
    """Drop the ``*_no_hrfp`` counter columns from the ``keywords`` table.

    The columns are dropped in the order listed below.
    """
    columns_to_drop = (
        'number_of_biodiversite_solutions_no_hrfp',
        'number_of_biodiversite_consequences_no_hrfp',
        'number_of_biodiversite_causes_no_hrfp',
        'number_of_biodiversite_concepts_generaux_no_hrfp',
        'number_of_ressources_solutions_no_hrfp',
        'number_of_ressources_no_hrfp',
        'number_of_adaptation_climatique_solutions_no_hrfp',
        'number_of_attenuation_climatique_solutions_no_hrfp',
        'number_of_changement_climatique_consequences_no_hrfp',
        'number_of_changement_climatique_causes_no_hrfp',
        'number_of_changement_climatique_constat_no_hrfp',
    )
    for column_name in columns_to_drop:
        op.drop_column('keywords', column_name)
14 changes: 7 additions & 7 deletions postgres/program_metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@
"program_name": "Information en continu",
"program_type": "Information en continu",
"duration": 1020,
"channel_title": "France Info",
"channel_title": "France Info TV",
"public": true,
"infocontinue": true,
"radio": false,
Expand Down Expand Up @@ -1721,7 +1721,7 @@
"program_name": "Information en continu",
"program_type": "Information en continu",
"duration": 1020,
"channel_title": "France Info",
"channel_title": "France Info TV",
"public": true,
"infocontinue": true,
"radio": false,
Expand Down Expand Up @@ -2889,7 +2889,7 @@
"program_name": "Information en continu",
"program_type": "Information en continu",
"duration": 1020,
"channel_title": "France Info",
"channel_title": "France Info TV",
"public": true,
"infocontinue": true,
"radio": false,
Expand Down Expand Up @@ -4089,7 +4089,7 @@
"program_name": "Information en continu",
"program_type": "Information en continu",
"duration": 1020,
"channel_title": "France Info",
"channel_title": "France Info TV",
"public": true,
"infocontinue": true,
"radio": false,
Expand Down Expand Up @@ -5241,7 +5241,7 @@
"program_name": "Information en continu",
"program_type": "Information en continu",
"duration": 1020,
"channel_title": "France Info",
"channel_title": "France Info TV",
"public": true,
"infocontinue": true,
"radio": false,
Expand Down Expand Up @@ -6297,7 +6297,7 @@
"program_name": "Information en continu",
"program_type": "Information en continu",
"duration": 1020,
"channel_title": "France Info",
"channel_title": "France Info TV",
"public": true,
"infocontinue": true,
"radio": false,
Expand Down Expand Up @@ -7321,7 +7321,7 @@
"program_name": "Information en continu",
"program_type": "Information en continu",
"duration": 1020,
"channel_title": "France Info",
"channel_title": "France Info TV",
"public": true,
"infocontinue": true,
"radio": false,
Expand Down
11 changes: 11 additions & 0 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,17 @@ class Keywords(Base):
number_of_keywords_climat = Column(Integer) # sum of all climatique counters without duplicate (like number_of_keywords)
number_of_keywords_biodiversite = Column(Integer) # sum of all biodiversite counters without duplicate
number_of_keywords_ressources = Column(Integer) # sum of all ressources counters without duplicate
number_of_changement_climatique_constat_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_changement_climatique_constat integer;
number_of_changement_climatique_causes_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_changement_climatique_causes_directes integer;
number_of_changement_climatique_consequences_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_changement_climatique_consequences integer;
number_of_attenuation_climatique_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_attenuation_climatique_solutions_directes integer;
number_of_adaptation_climatique_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_adaptation_climatique_solutions_directes integer;
number_of_ressources_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_ressources_naturelles_concepts_generaux integer;
number_of_ressources_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_ressources_solutions integer;
number_of_biodiversite_concepts_generaux_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_concepts_generaux integer;
number_of_biodiversite_causes_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_causes_directes integer;
number_of_biodiversite_consequences_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_consequences integer;
number_of_biodiversite_solutions_no_hrfp= Column(Integer) # ALTER TABLE keywords ADD number_of_biodiversite_solutions_directes integer;

class Channel_Metadata(Base):
__tablename__ = channel_metadata_table
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "quotaclimat"
version = "1.0.44"
version = "1.0.48"
description = ""
authors = [
"Rambier Estelle <[email protected]>",
Expand Down
2 changes: 1 addition & 1 deletion quotaclimat/data_processing/mediatree/channel_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def get_channel_title_for_name(channel_name: str) -> str:
case "lci":
return "LCI"
case "franceinfotv":
return "France Info"
return "France Info TV"
case "itele":
return "CNews"
case "europe1":
Expand Down
75 changes: 64 additions & 11 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def remove_stopwords(plaintext: str) -> str:
@sentry_sdk.trace
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
keywords_with_timestamp = []
number_of_elements_in_array = 17
number_of_elements_in_array = 28
default_window_in_seconds = DEFAULT_WINDOW_DURATION
plaitext_without_stopwords = remove_stopwords(plaintext)
logging.debug(f"display datetime start {start}")
Expand Down Expand Up @@ -192,8 +192,32 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])

return [

# No high risk of false positive counters
number_of_changement_climatique_constat_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_constat"], \
count_high_risk_false_positive=False)
number_of_changement_climatique_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_causes"], \
count_high_risk_false_positive=False)
number_of_changement_climatique_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_consequences"], \
count_high_risk_false_positive=False)
number_of_attenuation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"], \
count_high_risk_false_positive=False)
number_of_adaptation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"], \
count_high_risk_false_positive=False)
number_of_ressources_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources"], \
count_high_risk_false_positive=False)
number_of_ressources_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources_solutions"], \
count_high_risk_false_positive=False)
number_of_biodiversite_concepts_generaux_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"], \
count_high_risk_false_positive=False)
number_of_biodiversite_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"], \
count_high_risk_false_positive=False)
number_of_biodiversite_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"], \
count_high_risk_false_positive=False)
number_of_biodiversite_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"], \
count_high_risk_false_positive=False)

return [ # Change number_of_elements_in_array if a new element is added here
theme
,keywords_with_timestamp
,number_of_keywords
Expand All @@ -211,8 +235,20 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
,number_of_keywords_climat
,number_of_keywords_biodiversite
,number_of_keywords_ressources
,number_of_changement_climatique_constat_no_hrfp
,number_of_changement_climatique_causes_no_hrfp
,number_of_changement_climatique_consequences_no_hrfp
,number_of_attenuation_climatique_solutions_no_hrfp
,number_of_adaptation_climatique_solutions_no_hrfp
,number_of_ressources_no_hrfp
,number_of_ressources_solutions_no_hrfp
,number_of_biodiversite_concepts_generaux_no_hrfp
,number_of_biodiversite_causes_no_hrfp
,number_of_biodiversite_consequences_no_hrfp
,number_of_biodiversite_solutions_no_hrfp
]
else:
logging.info("Empty keywords")
return [None] * number_of_elements_in_array

def get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds: int = 20):
Expand Down Expand Up @@ -274,6 +310,17 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
,"number_of_keywords_climat"
,"number_of_keywords_biodiversite"
,"number_of_keywords_ressources"
,"number_of_changement_climatique_constat_no_hrfp"
,"number_of_changement_climatique_causes_no_hrfp"
,"number_of_changement_climatique_consequences_no_hrfp"
,"number_of_attenuation_climatique_solutions_no_hrfp"
,"number_of_adaptation_climatique_solutions_no_hrfp"
,"number_of_ressources_no_hrfp"
,"number_of_ressources_solutions_no_hrfp"
,"number_of_biodiversite_concepts_generaux_no_hrfp"
,"number_of_biodiversite_causes_no_hrfp"
,"number_of_biodiversite_consequences_no_hrfp"
,"number_of_biodiversite_solutions_no_hrfp"
]
] = df[['plaintext','srt', 'start']]\
.swifter.apply(\
Expand All @@ -282,9 +329,10 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
result_type='expand'
)

logging.info("Dropping")
# remove all rows that do not have themes
df = df.dropna(subset=['theme'], how='any') # any is for None values

logging.info("Droped")
logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")

return df
Expand All @@ -302,15 +350,21 @@ def add_primary_key(row):
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
return list(filter(lambda kw: indirectes not in kw['theme'], keywords_with_timestamp))

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
def filter_high_risk_false_positive(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the keyword entries that do not carry an ``'hrfp'`` key.

    Only the presence of the key is tested, not its value. The input list is
    left untouched; a new list in the same order is returned.
    """
    return [keyword for keyword in keywords_with_timestamp if 'hrfp' not in keyword]

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None, count_high_risk_false_positive: bool = True) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
logging.debug(f"keywords_with_timestamp is {keywords_with_timestamp}")
if theme is not None:
logging.debug(f"filter theme {theme}")
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))

if count_high_risk_false_positive is False:
keywords_with_timestamp = filter_high_risk_false_positive(keywords_with_timestamp)
logging.debug(f"keywords_with_timestamp is after filtering {keywords_with_timestamp}")
length_filtered_items = len(keywords_with_timestamp)

if length_filtered_items > 0:
Expand Down Expand Up @@ -357,7 +411,9 @@ def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[

if( contains_direct_keywords_same_suject(neighbour_keywords, keyword_info['theme']) ) :
logging.debug(f"Transforming false positive to positive { keyword_info['keyword']} { keyword_info['theme']}")
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
if indirectes in keyword_info['theme']:
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
keyword_info['hrfp'] = True # to store if a keyword was a transformed to a direct keyword

return keywords_with_timestamp

Expand All @@ -383,7 +439,4 @@ def tag_wanted_duration_second_window_number(keywords_with_timestamp: List[dict]
return keywords_with_timestamp

def remove_indirect(theme: str) -> str:
    """Return ``theme`` with every ``_<indirectes>`` occurrence removed.

    ``indirectes`` is the module-level marker string. ``str.replace`` returns
    the string unchanged when the substring is absent, so no membership check
    is needed beforehand.
    """
    return theme.replace(f'_{indirectes}', '')
Loading

0 comments on commit b427f77

Please sign in to comment.