Data: add program name (#158)

* bump minor v0.2.0
dataforgoodfr · Apr 22, 2024 · 874bacb · 874bacb · github-actions · Apr 22, 2024
1 parent e59e536
commit 874bacb
Show file tree

Hide file tree

Showing 116 changed files with 1,426 additions and 399,791 deletions.
diff --git a/.github/workflows/deploy-main.yml b/.github/workflows/deploy-main.yml
@@ -60,10 +60,10 @@ jobs:
     - name: Push ingest_to_db Image
       run: docker push ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/ingest_to_db
 
-    - name: Build streamlit image
-      run: docker build -f Dockerfile_streamlit . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/streamlit
-    - name: Push streamlit Image
-      run: docker push ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/streamlit
+    # - name: Build streamlit image
+    #   run: docker build -f Dockerfile_streamlit . -t ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/streamlit
+    # - name: Push streamlit Image
+    #   run: docker push ${{ secrets.CONTAINER_REGISTRY_ENDPOINT }}/streamlit
 
     - name: update scaleway job definition with version mediatree_import
       uses: jawher/[email protected]

diff --git a/.streamlit/config.toml b/.streamlit/config.toml
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,5 @@
 #from https://medium.com/@albertazzir/blazing-fast-python-docker-builds-with-poetry-a78a66f5aed0
-FROM python:3.11.8 as builder
+FROM python:3.11.9 as builder
 
 ENV VIRTUAL_ENV=/app/.venv
 
@@ -36,6 +36,7 @@ RUN pip install poetry
 COPY quotaclimat ./quotaclimat
 COPY postgres ./postgres
 COPY app.py ./app.py
+COPY alembic/ ./alembic
 
 # Docker compose overwrite this config to have only one Dockerfile
 CMD ["ls"]
diff --git a/Dockerfile_api_import b/Dockerfile_api_import
@@ -32,8 +32,14 @@ COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
 COPY quotaclimat ./quotaclimat
 COPY postgres ./postgres
 COPY pyproject.toml pyproject.toml
+COPY alembic/ ./alembic
+COPY alembic.ini ./alembic.ini
 
 # healthcheck
 EXPOSE 5050
 
-ENTRYPOINT ["python", "quotaclimat/data_processing/mediatree/api_import.py"]
+# Use a separate script to handle migrations and start the application
+COPY docker-entrypoint.sh ./docker-entrypoint.sh
+RUN chmod +x ./docker-entrypoint.sh
+
+ENTRYPOINT ["./docker-entrypoint.sh"]
diff --git a/Dockerfile_streamlit b/Dockerfile_streamlit
diff --git a/README.md b/README.md
@@ -2,9 +2,15 @@
 
 
 ![](quotaclimat/utils/coverquotaclimat.png)
-The aim of this work is to deliver a tool to [QuotaClimat](https://www.quotaclimat.org/ "Quotaclimat website"), allowing them to quantify the media coverage of the climate crisis. By the mean of sitemap scrapping (among others data sources), a Streamlit dashboard is developed to answer their needs. 
-- 2022-09-28, Introduction by Eva Morel (Quotaclimat): from 14:10 to 32:00 https://www.youtube.com/watch?v=GMrwDjq3rYs
+The aim of this work is to deliver a tool to a consortium around [QuotaClimat](https://www.quotaclimat.org/ "Quotaclimat website") and [Climat Medias](https://climatmedias.org/), allowing them to quantify the media coverage of the climate crisis.
+
+Radio and TV data are collected thanks to Mediatree.
+
+Webpress collection is currently a work in progress (as of 04/2024).
+
+- 2022-09-28, Introduction by Eva Morel (Quota Climat): from 14:10 to 32:00 https://www.youtube.com/watch?v=GMrwDjq3rYs
 - 2022-11-29 Project status and prospects by Estelle Rambier (Data): from 09:00 to 25:00 https://www.youtube.com/watch?v=cLGQxHJWwYA
+- 2024-03 Project tech presentation by Paul Leclercq (Data) : https://www.youtube.com/watch?v=zWk4WLVC5Hs
 
 ## Index
 - [I want to contribute! Where do I start?](#contrib)
@@ -279,9 +285,9 @@ Learn more here : https://docs.sentry.io/platforms/python/configuration/options/
 
 ## Batch import
 ### Batch import based on time
-Use env variable `START_DATE` like in docker compose (epoch second format : 1705409797).
+If our media perimeter evolves, we need to reimport it all using env variable `START_DATE` like in docker compose (epoch second format : 1705409797).
 
-Otherwise, default is yesterday midnight date.
+Otherwise, default is yesterday midnight date (default cron job)
 
 ### Batch import based on channel
 Use env variable `CHANNEL` like in docker compose (string: tf1)
@@ -314,6 +320,24 @@ With +1 millions rows, we can update from an offset to fix a custom logic by usi
 
 Example inside the docker-compose.yml mediatree service -> START_OFFSET: 100
 
+We can use a GitHub Actions workflow to start multiple update operations with different offsets.
+
+## SQL Tables evolution
+Using [Alembic's auto-generated migrations](https://alembic.sqlalchemy.org/en/latest/autogenerate.html), we can add a new column inside `models.py` and let Alembic generate the matching schema migration:
+
+```
+# connect to the test container : docker compose exec test bash
+poetry run alembic revision --autogenerate -m "Add new column test for table keywords"
+# this should generate a file to commit inside "alembic/versions"
+# to apply it we need to run, from our container
+poetry run alembic upgrade head
+```
+
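+Inside our Dockerfile_api_import, the entrypoint script `docker-entrypoint.sh` handles the migrations before starting the application, using a command like:
+```
+# to migrate SQL tables schema if needed
+alembic upgrade head
+```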
 ### Channel metadata
 In order to keep the channel perimeter (weekday, hours) up to date, we save the current version inside `postgres/channel_metadata.json`. If we modify this file, the next deploy will update every line inside the PostgreSQL table `channel_metadata`.
 

diff --git a/alembic.ini b/alembic.ini
@@ -0,0 +1,114 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python>=3.9 or backports.zoneinfo library.
+# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to ZoneInfo()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the
+# "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to alembic/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os  # Use os.pathsep. Default configuration used for new projects.
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+sqlalchemy.url = postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts.  See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
+# hooks = ruff
+# ruff.type = exec
+# ruff.executable = %(here)s/.venv/bin/ruff
+# ruff.options = --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/alembic/README b/alembic/README
@@ -0,0 +1 @@
+Generic single-database configuration.
diff --git a/alembic/env.py b/alembic/env.py
@@ -0,0 +1,100 @@
+from logging.config import fileConfig
+
+from sqlalchemy import create_engine
+from postgres.schemas.models import Base
+from alembic import context
+
+import re
+import os
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+# from myapp import mymodel
+# target_metadata = mymodel.Base.metadata
+target_metadata = Base.metadata
+
+# from https://stackoverflow.com/a/63672522/3535853
+# https://alembic.sqlalchemy.org/en/latest/cookbook.html#don-t-generate-any-drop-table-directives-with-autogenerate
+def include_object(object, name, type_, reflected, compare_to):
+    if type_ == "table" and reflected and compare_to is None:
+        return False
+    else:
+        return True
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well.  By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+
+    Calls to context.execute() here emit the given string to the
+    script output.
+
+    """
+    url = config.get_main_option("sqlalchemy.url")
+    context.configure(
+        url=url,
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+        include_object=include_object
+    )
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+
+    """
+    url_tokens = {
+      "POSTGRES_USER": os.getenv("POSTGRES_USER",""),
+      "POSTGRES_DB": os.getenv("POSTGRES_DB",""),
+      "POSTGRES_PASSWORD": os.getenv("POSTGRES_PASSWORD",""),
+      "POSTGRES_HOST": os.getenv("POSTGRES_HOST",""),
+      "POSTGRES_PORT": os.getenv("POSTGRES_PORT","")
+    }
+
+    url = config.get_main_option("sqlalchemy.url")
+
+    url = re.sub(r"\${(.+?)}", lambda m: url_tokens[m.group(1)], url)
+
+    connectable = create_engine(url)
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection,
+            target_metadata=target_metadata,
+            compare_type=True,
+            compare_server_default=True,
+            include_object=include_object
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+if context.is_offline_mode():  # offline: configure the context with a URL only, no Engine
+    run_migrations_offline()
+else:  # online: create an Engine and run against a live connection
+    run_migrations_online()
diff --git a/alembic/script.py.mako b/alembic/script.py.mako
@@ -0,0 +1,26 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    ${downgrades if downgrades else "pass"}
diff --git a/quotaclimat/data_analytics/__init__.py → alembic/versions/.keep b/quotaclimat/data_analytics/__init__.py → alembic/versions/.keep
File	Stmts	Miss	Cover	Missing
postgres
insert_data.py	44	7	84%	36–38, 57–59, 64
insert_existing_data_example.py	19	3	84%	25–27
postgres/schemas
models.py	110	15	86%	101–108, 119–120, 152–162
quotaclimat/data_ingestion
scrap_sitemap.py	134	17	87%	27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
ingest_sitemap_in_db.py	55	37	33%	21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
scrap_description_article.py	36	3	92%	19–20, 32
quotaclimat/data_processing/mediatree
api_import.py	179	106	41%	42–46, 51–59, 63–66, 72, 75–102, 108–123, 128–130, 145–157, 161–164, 168–174, 185–197, 200–204, 210, 234–235, 239, 243–262, 265–267
channel_program.py	68	10	85%	14–16, 27–29, 33–34, 48, 77
config.py	15	2	87%	7, 16
detect_keywords.py	192	4	98%	197, 250–252
utils.py	66	22	67%	19, 30–54, 57, 76–77
quotaclimat/utils
healthcheck_config.py	29	14	52%	22–24, 27–38
logger.py	24	11	54%	22–24, 28–37
sentry.py	10	2	80%	21–22
TOTAL	1043	253	76%