feat: add keywords category / update keywords list with script (#167)

* feat: add category / update keywords list with script * wip: test test_long_sentence_theme_get_themes_keywords_duration * test: test_long_sentence_theme_get_themes_keywords_duration * wip: before remove keywords in differnt themes * refacto: keyword inside another keyword * readme: doc alembic * sql: saved alembic version * chores poetry lock * logs: to info * test * ci: test * ci test * test: ci * ci:test * ci: test * ci: test drop table on ci
dataforgoodfr · Apr 29, 2024 · d699a8b · d699a8b · github-actions · Apr 29, 2024
1 parent fc4c98c
commit d699a8b
Show file tree

Hide file tree

Showing 17 changed files with 4,744 additions and 1,056 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -50,10 +50,21 @@ jobs:
         run: poetry install
       - name: docker compose up nginx
         run: docker compose up -d nginxtest
-      - name: pytest run
+      - name: pytest run test_first_update_keywords
         run: |
           set -o pipefail
-          poetry run pytest --junitxml=pytest.xml \
+          poetry run pytest -k 'test_update_pg_keywords'
+        env:
+          ENV: dev
+          POSTGRES_USER: user
+          POSTGRES_DB: postgres
+          POSTGRES_PASSWORD: postgres
+          POSTGRES_HOST: localhost
+          POSTGRES_PORT: 5432
+      - name: pytest run everything else
+        run: |
+          set -o pipefail
+          poetry run pytest -k 'not test_update_pg_keywords' --junitxml=pytest.xml \
            --cov-report=term-missing:skip-covered \
            --cov=quotaclimat --cov=postgres test/ | \
            tee pytest-coverage.txt
@@ -64,6 +75,7 @@ jobs:
           POSTGRES_PASSWORD: postgres
           POSTGRES_HOST: localhost
           POSTGRES_PORT: 5432
+
       # - name: Upload coverage reports to Codecov
       #   uses: codecov/codecov-action@v3
       #   env:

diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
 secrets/pwd_api.txt
 secrets/username_api.txt
+cc-bio.json
+*.xlsx
 coverage_re
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -329,7 +329,11 @@ We can use a Github actions to start multiple update operations with different o
 Using [Alembic](https://alembic.sqlalchemy.org/en/latest/autogenerate.html)'s auto-generated migrations, we can add a new column inside `models.py` and the schema evolution will be generated automatically:
 
 ```
-# connect to the test container : docker compose test exec bash
+# If changes have already been applied and you want to recreate your alembic file:
+# 1. switch to your main branch
+# 2. start test container and run "pytest -vv -k api" to rebuild the state of the DB
+# 3. switch back to your WIP branch
+# 4. connect to the test container : docker compose exec test bash
 poetry run alembic revision --autogenerate -m "Add new column test for table keywords"
 # this should generate a file to commit inside "alembic/versions"
 # to apply it we need to run, from our container
@@ -344,6 +348,14 @@ RUN alembic upgrade head
 ### Channel metadata
 In order to keep the channel perimeter (weekday, hours) up to date, we save the current version inside `postgres/channel_metadata.json`; if we modify this file, the next deploy will update every line of the PostgreSQL table `channel_metadata`.
 
+## Produce keywords list from Excel
+How to update `quotaclimat/data_processing/mediatree/keyword/keyword.py` from the shared Excel file?
+Download the file locally, then run:
+```
+poetry run python3 quotaclimat/transform_excel_to_json.py > cc-bio.json
+# then update quotaclimat/data_processing/mediatree/keyword/keyword.py list
+```
+
 ### Fix linting
 Before committing, make sure that the lines of code you wrote conform to the PEP8 standard by running:
 ```bash

diff --git a/alembic/versions/356882459cec_remove_category_keywords_change_columns_.py b/alembic/versions/356882459cec_remove_category_keywords_change_columns_.py
@@ -0,0 +1,40 @@
+"""Remove: category keywords / change columns names
+
+Revision ID: 356882459cec
+Revises: faec286c7f92
+Create Date: 2024-04-29 10:14:27.240887
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = '356882459cec'
+down_revision: Union[str, None] = 'faec286c7f92'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('keywords', sa.Column('number_of_ressources', sa.Integer(), nullable=True))
+    op.add_column('keywords', sa.Column('number_of_ressources_solutions', sa.Integer(), nullable=True))
+    op.drop_column('keywords', 'number_of_ressources_naturelles_causes')
+    op.drop_column('keywords', 'number_of_ressources_naturelles_concepts_generaux')
+    op.drop_column('keywords', 'category')
+    op.drop_column('keywords', 'number_of_ressources_naturelles_solutions')
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('keywords', sa.Column('number_of_ressources_naturelles_solutions', sa.INTEGER(), autoincrement=False, nullable=True))
+    op.add_column('keywords', sa.Column('category', postgresql.JSON(astext_type=sa.Text()), autoincrement=False, nullable=True))
+    op.add_column('keywords', sa.Column('number_of_ressources_naturelles_concepts_generaux', sa.INTEGER(), autoincrement=False, nullable=True))
+    op.add_column('keywords', sa.Column('number_of_ressources_naturelles_causes', sa.INTEGER(), autoincrement=False, nullable=True))
+    op.drop_column('keywords', 'number_of_ressources_solutions')
+    op.drop_column('keywords', 'number_of_ressources')
+    # ### end Alembic commands ###
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -5,8 +5,8 @@ services:
     build:
       context: ./
       dockerfile: Dockerfile
-    entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
-    #entrypoint: ["sleep", "12000"] # use to debug the container if needed
+    #entrypoint: ["poetry", "run", "pytest","-vv", "-o", "log_cli=true", "--cov-report", "term:skip-covered", "--cov=quotaclimat", "--cov=postgres", "test/"]
+    entrypoint: ["sleep", "12000"] # use to debug the container if needed
     environment:
       ENV: docker
       # CHANNEL: "fr3-idf"

diff --git a/postgres/schemas/models.py b/postgres/schemas/models.py
@@ -63,7 +63,6 @@ class Keywords(Base):
     start = Column(DateTime())
     plaintext= Column(Text)
     theme=Column(JSON) #keyword.py  # ALTER TABLE keywords ALTER theme TYPE json USING to_json(theme);
-    category=Column(JSON) #keyword.py  # new column - alembic handles this
     created_at = Column(DateTime(timezone=True), server_default=text("(now() at time zone 'utc')")) # ALTER TABLE ONLY keywords ALTER COLUMN created_at SET DEFAULT (now() at time zone 'utc');
     keywords_with_timestamp = Column(JSON) # ALTER TABLE keywords ADD keywords_with_timestamp json;
     number_of_keywords = Column(Integer) # ALTER TABLE keywords ADD number_of_keywords integer;
@@ -73,9 +72,8 @@ class Keywords(Base):
     number_of_changement_climatique_consequences= Column(Integer)  # ALTER TABLE keywords ADD number_of_changement_climatique_consequences integer;
     number_of_attenuation_climatique_solutions_directes= Column(Integer)  # ALTER TABLE keywords ADD number_of_attenuation_climatique_solutions_directes integer;
     number_of_adaptation_climatique_solutions_directes= Column(Integer)  # ALTER TABLE keywords ADD number_of_adaptation_climatique_solutions_directes integer;
-    number_of_ressources_naturelles_concepts_generaux= Column(Integer)  # ALTER TABLE keywords ADD number_of_ressources_naturelles_concepts_generaux integer;
-    number_of_ressources_naturelles_causes= Column(Integer)  # ALTER TABLE keywords ADD number_of_ressources_naturelles_causes integer;
-    number_of_ressources_naturelles_solutions= Column(Integer)  # ALTER TABLE keywords ADD number_of_ressources_naturelles_solutions integer;
+    number_of_ressources= Column(Integer)  # ALTER TABLE keywords ADD number_of_ressources integer;
+    number_of_ressources_solutions= Column(Integer)  # ALTER TABLE keywords ADD number_of_ressources_solutions integer;
     number_of_biodiversite_concepts_generaux= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_concepts_generaux integer;
     number_of_biodiversite_causes_directes= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_causes_directes integer;
     number_of_biodiversite_consequences= Column(Integer)  # ALTER TABLE keywords ADD number_of_biodiversite_consequences integer;
@@ -148,7 +146,7 @@ def update_channel_metadata(engine):
 def drop_tables():
     """Drop table keyword in the PostgreSQL database"""
 
-    if(os.environ.get("ENV") == "docker"):
+    if(os.environ.get("ENV") == "docker" or os.environ.get("ENV") == "dev"):
         logging.warning("drop tables")
         try:
             engine = connect_to_db()

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,10 @@
 name = "quotaclimat"
 version = "0.2.5"
 description = ""
-authors = ["Rambier Estelle <[email protected]>"]
+authors = [
+    "Rambier Estelle <[email protected]>",
+    "Paul Leclercq <[email protected]>"
+]
 readme = "README.md"
 
 [tool.pytest.ini_options]
File	Stmts	Miss	Cover	Missing
postgres
insert_data.py	44	7	84%	36–38, 57–59, 64
insert_existing_data_example.py	19	3	84%	25–27
postgres/schemas
models.py	108	8	93%	99–106, 117–118, 156–157
quotaclimat/data_ingestion
scrap_sitemap.py	134	17	87%	27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
ingest_sitemap_in_db.py	55	37	33%	21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
scrap_description_article.py	36	3	92%	19–20, 32
quotaclimat/data_processing/mediatree
api_import.py	184	110	40%	42–46, 51–63, 67–70, 76, 79–112, 118–133, 137–138, 151–163, 167–173, 186–197, 200–204, 210, 237–238, 242, 246–265, 268–270
channel_program.py	91	9	90%	21–23, 34–36, 50, 86, 95
config.py	15	2	87%	7, 16
detect_keywords.py	180	4	98%	178, 230–232
update_pg_keywords.py	44	30	32%	14–84, 105–106, 127–152, 158
utils.py	66	23	65%	18, 29–53, 56, 65, 81–82
quotaclimat/utils
healthcheck_config.py	29	14	52%	22–24, 27–38
logger.py	24	11	54%	22–24, 28–37
sentry.py	10	2	80%	21–22
TOTAL	1065	280	74%