Skip to content

Commit

Permalink
Feat/no hrfp counter #285 (#288)
Browse files Browse the repository at this point in the history
* added columns for counters without hrfp

* fixed some issues of df formats with new columns

* modified test for first update keywords

* fixed bugs

* fixed small issue

* fixed issue

* fixed unpack issue

* updated test function

* fixed test function

* wip: add high risk false positive column

* refacto: remove directes from column names

* added tests for HRFP

* test: filtering hrfp

* db: alembic hrfp counters

---------

Co-authored-by: RDiPiazza <[email protected]>
  • Loading branch information
polomarcus and RDiPiazza authored Dec 2, 2024
1 parent 2725c11 commit 10caded
Show file tree
Hide file tree
Showing 8 changed files with 617 additions and 64 deletions.
50 changes: 50 additions & 0 deletions alembic/versions/ac96222af6fe_hrfp_counters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""hrfp counters
Revision ID: ac96222af6fe
Revises: 30abfd828007
Create Date: 2024-12-02 14:36:21.970968
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'ac96222af6fe'
down_revision: Union[str, None] = '30abfd828007'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add the ``*_no_hrfp`` counter columns to the ``keywords`` table.

    Each column is a nullable integer. "hrfp" stands for "high risk of
    false positive"; these columns hold the per-theme counters computed
    without such keywords.
    """
    no_hrfp_columns = (
        'number_of_changement_climatique_constat_no_hrfp',
        'number_of_changement_climatique_causes_no_hrfp',
        'number_of_changement_climatique_consequences_no_hrfp',
        'number_of_attenuation_climatique_solutions_no_hrfp',
        'number_of_adaptation_climatique_solutions_no_hrfp',
        'number_of_ressources_no_hrfp',
        'number_of_ressources_solutions_no_hrfp',
        'number_of_biodiversite_concepts_generaux_no_hrfp',
        'number_of_biodiversite_causes_no_hrfp',
        'number_of_biodiversite_consequences_no_hrfp',
        'number_of_biodiversite_solutions_no_hrfp',
    )
    # Columns are added one by one, in the order listed above.
    for column_name in no_hrfp_columns:
        op.add_column('keywords', sa.Column(column_name, sa.Integer(), nullable=True))


def downgrade() -> None:
    """Drop the ``*_no_hrfp`` counter columns from the ``keywords`` table.

    The columns are removed in the reverse of the order in which
    ``upgrade`` creates them.
    """
    columns_in_drop_order = (
        'number_of_biodiversite_solutions_no_hrfp',
        'number_of_biodiversite_consequences_no_hrfp',
        'number_of_biodiversite_causes_no_hrfp',
        'number_of_biodiversite_concepts_generaux_no_hrfp',
        'number_of_ressources_solutions_no_hrfp',
        'number_of_ressources_no_hrfp',
        'number_of_adaptation_climatique_solutions_no_hrfp',
        'number_of_attenuation_climatique_solutions_no_hrfp',
        'number_of_changement_climatique_consequences_no_hrfp',
        'number_of_changement_climatique_causes_no_hrfp',
        'number_of_changement_climatique_constat_no_hrfp',
    )
    for column_name in columns_in_drop_order:
        op.drop_column('keywords', column_name)
11 changes: 11 additions & 0 deletions postgres/schemas/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ class Keywords(Base):
number_of_keywords_climat = Column(Integer) # sum of all climatique counters without duplicate (like number_of_keywords)
number_of_keywords_biodiversite = Column(Integer) # sum of all biodiversite counters without duplicate
number_of_keywords_ressources = Column(Integer) # sum of all ressources counters without duplicate
# Counters below mirror the per-theme counters but exclude keywords flagged as
# "high risk of false positive" (hrfp). Added by alembic revision ac96222af6fe.
number_of_changement_climatique_constat_no_hrfp= Column(Integer) # changement_climatique_constat counter, hrfp keywords excluded
number_of_changement_climatique_causes_no_hrfp= Column(Integer) # changement_climatique_causes counter, hrfp keywords excluded
number_of_changement_climatique_consequences_no_hrfp= Column(Integer) # changement_climatique_consequences counter, hrfp keywords excluded
number_of_attenuation_climatique_solutions_no_hrfp= Column(Integer) # attenuation_climatique_solutions counter, hrfp keywords excluded
number_of_adaptation_climatique_solutions_no_hrfp= Column(Integer) # adaptation_climatique_solutions counter, hrfp keywords excluded
number_of_ressources_no_hrfp= Column(Integer) # ressources counter, hrfp keywords excluded
number_of_ressources_solutions_no_hrfp= Column(Integer) # ressources_solutions counter, hrfp keywords excluded
number_of_biodiversite_concepts_generaux_no_hrfp= Column(Integer) # biodiversite_concepts_generaux counter, hrfp keywords excluded
number_of_biodiversite_causes_no_hrfp= Column(Integer) # biodiversite_causes counter, hrfp keywords excluded
number_of_biodiversite_consequences_no_hrfp= Column(Integer) # biodiversite_consequences counter, hrfp keywords excluded
number_of_biodiversite_solutions_no_hrfp= Column(Integer) # biodiversite_solutions counter, hrfp keywords excluded

class Channel_Metadata(Base):
__tablename__ = channel_metadata_table
Expand Down
75 changes: 64 additions & 11 deletions quotaclimat/data_processing/mediatree/detect_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def remove_stopwords(plaintext: str) -> str:
@sentry_sdk.trace
def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], start: datetime):
keywords_with_timestamp = []
number_of_elements_in_array = 17
number_of_elements_in_array = 28
default_window_in_seconds = DEFAULT_WINDOW_DURATION
plaitext_without_stopwords = remove_stopwords(plaintext)
logging.debug(f"display datetime start {start}")
Expand Down Expand Up @@ -192,8 +192,32 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
number_of_biodiversite_causes = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"])
number_of_biodiversite_consequences = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"])
number_of_biodiversite_solutions = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"])

return [

# No high risk of false positive counters
number_of_changement_climatique_constat_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_constat"], \
count_high_risk_false_positive=False)
number_of_changement_climatique_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_causes"], \
count_high_risk_false_positive=False)
number_of_changement_climatique_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["changement_climatique_consequences"], \
count_high_risk_false_positive=False)
number_of_attenuation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["attenuation_climatique_solutions"], \
count_high_risk_false_positive=False)
number_of_adaptation_climatique_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["adaptation_climatique_solutions"], \
count_high_risk_false_positive=False)
number_of_ressources_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources"], \
count_high_risk_false_positive=False)
number_of_ressources_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["ressources_solutions"], \
count_high_risk_false_positive=False)
number_of_biodiversite_concepts_generaux_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_concepts_generaux"], \
count_high_risk_false_positive=False)
number_of_biodiversite_causes_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_causes"], \
count_high_risk_false_positive=False)
number_of_biodiversite_consequences_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_consequences"], \
count_high_risk_false_positive=False)
number_of_biodiversite_solutions_no_hrfp = count_keywords_duration_overlap(filtered_keywords_with_timestamp, start,theme=["biodiversite_solutions"], \
count_high_risk_false_positive=False)

return [ # Change number_of_elements_in_array if a new element is added here
theme
,keywords_with_timestamp
,number_of_keywords
Expand All @@ -211,8 +235,20 @@ def get_themes_keywords_duration(plaintext: str, subtitle_duration: List[str], s
,number_of_keywords_climat
,number_of_keywords_biodiversite
,number_of_keywords_ressources
,number_of_changement_climatique_constat_no_hrfp
,number_of_changement_climatique_causes_no_hrfp
,number_of_changement_climatique_consequences_no_hrfp
,number_of_attenuation_climatique_solutions_no_hrfp
,number_of_adaptation_climatique_solutions_no_hrfp
,number_of_ressources_no_hrfp
,number_of_ressources_solutions_no_hrfp
,number_of_biodiversite_concepts_generaux_no_hrfp
,number_of_biodiversite_causes_no_hrfp
,number_of_biodiversite_consequences_no_hrfp
,number_of_biodiversite_solutions_no_hrfp
]
else:
logging.info("Empty keywords")
return [None] * number_of_elements_in_array

def get_keywords_with_timestamp_with_false_positive(keywords_with_timestamp, start, duration_seconds: int = 20):
Expand Down Expand Up @@ -274,6 +310,17 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
,"number_of_keywords_climat"
,"number_of_keywords_biodiversite"
,"number_of_keywords_ressources"
,"number_of_changement_climatique_constat_no_hrfp"
,"number_of_changement_climatique_causes_no_hrfp"
,"number_of_changement_climatique_consequences_no_hrfp"
,"number_of_attenuation_climatique_solutions_no_hrfp"
,"number_of_adaptation_climatique_solutions_no_hrfp"
,"number_of_ressources_no_hrfp"
,"number_of_ressources_solutions_no_hrfp"
,"number_of_biodiversite_concepts_generaux_no_hrfp"
,"number_of_biodiversite_causes_no_hrfp"
,"number_of_biodiversite_consequences_no_hrfp"
,"number_of_biodiversite_solutions_no_hrfp"
]
] = df[['plaintext','srt', 'start']]\
.swifter.apply(\
Expand All @@ -282,9 +329,10 @@ def filter_and_tag_by_theme(df: pd.DataFrame) -> pd.DataFrame :
result_type='expand'
)

logging.info("Dropping")
# remove all rows that do not have themes
df = df.dropna(subset=['theme'], how='any') # any is for None values

logging.info("Droped")
logging.info(f"After filtering with out keywords, we have {len(df)} out of {count_before_filtering} subtitles left that are insteresting for us")

return df
Expand All @@ -302,15 +350,21 @@ def add_primary_key(row):
def filter_indirect_words(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the keywords whose ``theme`` does not contain ``indirectes``.

    A new list is built; the input list and its dicts are not modified.
    """
    return [
        keyword_info
        for keyword_info in keywords_with_timestamp
        if indirectes not in keyword_info['theme']
    ]

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None) -> int:
def filter_high_risk_false_positive(keywords_with_timestamp: List[dict]) -> List[dict]:
    """Return the keywords that carry no ``'hrfp'`` key.

    Only the presence of the key is checked, not its value, so an entry
    with ``'hrfp': False`` is filtered out as well. A new list is built;
    the input is left untouched.
    """
    return [
        keyword_info
        for keyword_info in keywords_with_timestamp
        if 'hrfp' not in keyword_info
    ]

def count_keywords_duration_overlap(keywords_with_timestamp: List[dict], start: datetime, theme: List[str] = None, count_high_risk_false_positive: bool = True) -> int:
total_keywords = len(keywords_with_timestamp)
if(total_keywords) == 0:
return 0
else:
logging.debug(f"keywords_with_timestamp is {keywords_with_timestamp}")
if theme is not None:
logging.debug(f"filter theme {theme}")
keywords_with_timestamp = list(filter(lambda kw: kw['theme'] in theme, keywords_with_timestamp))

if count_high_risk_false_positive is False:
keywords_with_timestamp = filter_high_risk_false_positive(keywords_with_timestamp)
logging.debug(f"keywords_with_timestamp is after filtering {keywords_with_timestamp}")
length_filtered_items = len(keywords_with_timestamp)

if length_filtered_items > 0:
Expand Down Expand Up @@ -357,7 +411,9 @@ def transform_false_positive_keywords_to_positive(keywords_with_timestamp: List[

if( contains_direct_keywords_same_suject(neighbour_keywords, keyword_info['theme']) ) :
logging.debug(f"Transforming false positive to positive { keyword_info['keyword']} { keyword_info['theme']}")
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
if indirectes in keyword_info['theme']:
keyword_info['theme'] = remove_indirect(keyword_info['theme'])
keyword_info['hrfp'] = True # to store if a keyword was a transformed to a direct keyword

return keywords_with_timestamp

Expand All @@ -383,7 +439,4 @@ def tag_wanted_duration_second_window_number(keywords_with_timestamp: List[dict]
return keywords_with_timestamp

def remove_indirect(theme: str) -> str:
    """Return ``theme`` with every ``_<indirectes>`` marker removed.

    ``indirectes`` is a module-level name defined elsewhere in this file.
    Themes that do not contain the marker are returned unchanged.
    """
    # str.replace is already a no-op when the marker is absent, so no
    # membership guard (and no second, unreachable return) is needed.
    return theme.replace(f'_{indirectes}', '')
54 changes: 49 additions & 5 deletions quotaclimat/data_processing/mediatree/update_pg_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,18 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_solutions_directes \
,new_number_of_keywords_climat \
,new_number_of_keywords_biodiversite \
,new_number_of_keywords_ressources = get_themes_keywords_duration(plaintext, srt, start)
,new_number_of_keywords_ressources \
,number_of_changement_climatique_constat_no_hrfp \
,number_of_changement_climatique_causes_no_hrfp \
,number_of_changement_climatique_consequences_no_hrfp \
,number_of_attenuation_climatique_solutions_no_hrfp \
,number_of_adaptation_climatique_solutions_no_hrfp \
,number_of_ressources_no_hrfp \
,number_of_ressources_solutions_no_hrfp \
,number_of_biodiversite_concepts_generaux_no_hrfp \
,number_of_biodiversite_causes_no_hrfp \
,number_of_biodiversite_consequences_no_hrfp \
,number_of_biodiversite_solutions_no_hrfp = get_themes_keywords_duration(plaintext, srt, start)
except Exception as err:
logging.error(f"continuing loop but met error : {err}")
continue
Expand Down Expand Up @@ -83,10 +94,21 @@ def update_keywords(session: Session, batch_size: int = 50000, start_date : str
,number_of_biodiversite_causes_directes
,number_of_biodiversite_consequences
,number_of_biodiversite_solutions_directes
,channel_title=channel_title
,number_of_keywords_climat=new_number_of_keywords_climat
,number_of_keywords_biodiversite=new_number_of_keywords_biodiversite
,number_of_keywords_ressources=new_number_of_keywords_ressources
,channel_title
,new_number_of_keywords_climat
,new_number_of_keywords_biodiversite
,new_number_of_keywords_ressources
,number_of_changement_climatique_constat_no_hrfp
,number_of_changement_climatique_causes_no_hrfp
,number_of_changement_climatique_consequences_no_hrfp
,number_of_attenuation_climatique_solutions_no_hrfp
,number_of_adaptation_climatique_solutions_no_hrfp
,number_of_ressources_no_hrfp
,number_of_ressources_solutions_no_hrfp
,number_of_biodiversite_concepts_generaux_no_hrfp
,number_of_biodiversite_causes_no_hrfp
,number_of_biodiversite_consequences_no_hrfp
,number_of_biodiversite_solutions_no_hrfp
)
else: # Program only mode
logging.info(f"Updating program for keyword {keyword_id} - {channel_name} - original tz : {start}")
Expand Down Expand Up @@ -176,6 +198,17 @@ def update_keyword_row(session: Session,
,number_of_keywords_climat: int
,number_of_keywords_biodiversite: int
,number_of_keywords_ressources: int
,number_of_changement_climatique_constat_no_hrfp: int,
number_of_changement_climatique_causes_no_hrfp: int,
number_of_changement_climatique_consequences_no_hrfp: int,
number_of_attenuation_climatique_solutions_no_hrfp: int,
number_of_adaptation_climatique_solutions_no_hrfp: int,
number_of_ressources_no_hrfp: int,
number_of_ressources_solutions_no_hrfp: int,
number_of_biodiversite_concepts_generaux_no_hrfp: int,
number_of_biodiversite_causes_no_hrfp: int,
number_of_biodiversite_consequences_no_hrfp: int,
number_of_biodiversite_solutions_no_hrfp: int
):
if matching_themes is not None:
session.query(Keywords).filter(Keywords.id == keyword_id).update(
Expand All @@ -198,6 +231,17 @@ def update_keyword_row(session: Session,
,Keywords.number_of_keywords_climat: number_of_keywords_climat
,Keywords.number_of_keywords_biodiversite: number_of_keywords_biodiversite
,Keywords.number_of_keywords_ressources: number_of_keywords_ressources
,Keywords.number_of_changement_climatique_constat_no_hrfp:number_of_changement_climatique_constat_no_hrfp ,
Keywords.number_of_changement_climatique_causes_no_hrfp:number_of_changement_climatique_causes_no_hrfp ,
Keywords.number_of_changement_climatique_consequences_no_hrfp:number_of_changement_climatique_consequences_no_hrfp ,
Keywords.number_of_attenuation_climatique_solutions_no_hrfp:number_of_attenuation_climatique_solutions_no_hrfp ,
Keywords.number_of_adaptation_climatique_solutions_no_hrfp:number_of_adaptation_climatique_solutions_no_hrfp ,
Keywords.number_of_ressources_no_hrfp:number_of_ressources_no_hrfp,
Keywords.number_of_ressources_solutions_no_hrfp:number_of_ressources_solutions_no_hrfp ,
Keywords.number_of_biodiversite_concepts_generaux_no_hrfp:number_of_biodiversite_concepts_generaux_no_hrfp ,
Keywords.number_of_biodiversite_causes_no_hrfp:number_of_biodiversite_causes_no_hrfp ,
Keywords.number_of_biodiversite_consequences_no_hrfp:number_of_biodiversite_consequences_no_hrfp ,
Keywords.number_of_biodiversite_solutions_no_hrfp:number_of_biodiversite_solutions_no_hrfp,
},
synchronize_session=False
)
Expand Down
Empty file added secrets/.empty
Empty file.
Loading

1 comment on commit 10caded

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
postgres
   insert_data.py43784%36–38, 56–58, 63
   insert_existing_data_example.py19384%25–27
postgres/schemas
   models.py1681193%137–144, 157, 159–160, 225–226, 240–241
quotaclimat/data_ingestion
   scrap_sitemap.py1341787%27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py553733%21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py36392%19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py21313338%44–48, 53–74, 78–81, 87, 90–132, 138–153, 158, 171–183, 187–193, 206–218, 221–225, 231, 269–270, 273–304, 307–309
   channel_program.py1625765%21–23, 34–36, 53–54, 57–59, 98–99, 108, 124, 175–216
   config.py15287%7, 16
   detect_keywords.py2521694%111–118, 126–127, 271, 341–348, 390
   update_pg_keywords.py674927%15–130, 154, 157, 164–179, 213–250, 257
   utils.py792568%29–53, 56, 65, 86–87, 117–120
quotaclimat/utils
   healthcheck_config.py291452%22–24, 27–38
   logger.py241154%22–24, 28–37
   sentry.py11282%22–23
TOTAL133438771% 

Tests Skipped Failures Errors Time
102 0 💤 0 ❌ 0 🔥 8m 25s ⏱️

Please sign in to comment.