Skip to content

Commit

Permalink
Merge pull request #26 from dataforgoodfr/french-tmdb-movie-data
Browse files Browse the repository at this point in the history
Implement curating French regional movies dataset on HF
  • Loading branch information
kaaloo authored Apr 18, 2024
2 parents b3b34f8 + 27692c9 commit 40f9543
Show file tree
Hide file tree
Showing 16 changed files with 1,257 additions and 2 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/movies.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: Update Movies Dataset

on:
  # Run this workflow once per day, at 0:15 UTC
  schedule: [{ cron: "15 0 * * *" }]
  # Run this workflow when triggered manually in GitHub’s UI.
  workflow_dispatch: {}

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11.4"
      - name: Setup Poetry
        uses: Gr1N/setup-poetry@v8
        with:
          poetry-version: "1.7.1"
      # --no-root: install dependencies only, not the project package itself
      - run: poetry install --no-root
      # Runs the dataset update entry point; credentials come from repo secrets.
      - run: poetry run python -m observatoire.tmdb.movies
        env:
          # Hugging Face credentials
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # TMDB credentials
          TMDB_API_KEY: ${{ secrets.TMDB_API_KEY }}
          TMDB_MAX_RETRIES: ${{ secrets.TMDB_MAX_RETRIES }}
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,4 @@ cython_debug/
# Precommit hooks: ruff cache
.ruff_cache
.DS_Store
data/
data/
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,12 @@ This repo includes invoke for pythonic task execution. To see the
list of available tasks you can run:

invoke -l

# Updating the Movie Database

The [French regional TMDB Movies Dataset](https://huggingface.co/datasets/DataForGood/observatoire_des_imaginaires_movies)
on Hugging Face can be updated using the following command:

```bash
invoke update-movies-dataset
```
3 changes: 3 additions & 0 deletions observatoire/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Load variables from a local .env file into os.environ as soon as the
# package is imported, so modules reading os.getenv() (e.g. tmdb.config)
# see locally-defined credentials without extra setup.
from dotenv import load_dotenv

load_dotenv()
Empty file added observatoire/tmdb/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions observatoire/tmdb/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import os

# TMDB API key read from the environment; None when unset.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")

# Maximum number of retry attempts against the TMDB API (defaults to 500).
TMDB_MAX_RETRIES = int(os.environ.get("TMDB_MAX_RETRIES", "500"))
212 changes: 212 additions & 0 deletions observatoire/tmdb/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import json

import pandas as pd


def transform_movie_json(movie_json: list[str]) -> pd.DataFrame: # noqa: C901, PLR0915
"""
Transforms the JSON data into a DataFrame
"""

data = []
unique_ids = set()

# load the data
variations_to_ignore = [
None,
"",
"NA",
"N/A",
"None",
"na",
"n/a",
"NULL",
"Not Available",
]

for line in movie_json:
line_in_json = json.loads(line)

# TODO: format_list_to_str is too complex and should be refactored
def format_list_to_str(line_in_json: dict, formatted_data: dict) -> dict: # noqa: C901, PLR0912
# convert genres list to str

genres_str = None
genres_list = []
try:
for genre in line_in_json["genres"]:
if genre["name"] is not any(variations_to_ignore):
genres_list.append(genre["name"])
if len(genres_list) > 0:
genres_str = ", ".join(genres_list)
except Exception:
pass

formatted_data["genres"] = genres_str

# convert production_companies list to str
production_companies_str = None
production_companies_list = []
try:
for company in line_in_json["production_companies"]:
if company["name"] is not any(variations_to_ignore):
production_companies_list.append(company["name"])
if len(production_companies_list) > 0:
production_companies_str = ", ".join(production_companies_list)
except Exception:
pass

formatted_data["production_companies"] = production_companies_str

# convert production_countries list to str
production_countries_str = None
production_countries_list = []
try:
for country in line_in_json["production_countries"]:
if country["name"] is not any(variations_to_ignore):
production_countries_list.append(country["name"])
if len(production_countries_list) > 0:
production_countries_str = ", ".join(production_countries_list)
except Exception:
pass

formatted_data["production_countries"] = production_countries_str

# convert spoken_languages list to str
spoken_languages_str = None
spoken_languages_list = []
try:
for language in line_in_json["spoken_languages"]:
if language["english_name"] is not any(variations_to_ignore):
spoken_languages_list.append(language["english_name"])
if len(spoken_languages_list) > 0:
spoken_languages_str = ", ".join(spoken_languages_list)
except Exception:
pass

formatted_data["spoken_languages"] = spoken_languages_str

return formatted_data

# format the data

formatted_data = {}

formatted_data["id"] = int(line_in_json["id"]) if line_in_json["id"] is not None else 0

formatted_data["title"] = (
str(line_in_json["title"])
if line_in_json["title"] is not any(variations_to_ignore)
else None
)
formatted_data["vote_average"] = (
float(line_in_json["vote_average"])
if line_in_json["vote_average"] is not None
else 0.0
)

formatted_data["vote_count"] = (
int(line_in_json["vote_count"]) if line_in_json["vote_count"] is not None else 0
)

formatted_data["status"] = (
str(line_in_json["status"])
if line_in_json["status"] is not any(variations_to_ignore)
else None
)

formatted_data["release_date"] = (
str(line_in_json["release_date"])
if line_in_json["release_date"] is not None and line_in_json["release_date"] != ""
else "1500-01-01"
)

formatted_data["revenue"] = (
int(line_in_json["revenue"]) if line_in_json["revenue"] is not None else 0
)

formatted_data["runtime"] = (
int(line_in_json["runtime"]) if line_in_json["runtime"] is not None else 0
)

formatted_data["adult"] = (
bool(line_in_json["adult"]) if line_in_json["adult"] is not None else False
)

formatted_data["backdrop_path"] = (
str(line_in_json["backdrop_path"])
if line_in_json["backdrop_path"] is not any(variations_to_ignore)
else None
)

formatted_data["budget"] = (
int(line_in_json["budget"]) if line_in_json["budget"] is not None else 0
)

formatted_data["homepage"] = (
str(line_in_json["homepage"])
if line_in_json["homepage"] is not any(variations_to_ignore)
else None
)

formatted_data["imdb_id"] = (
str(line_in_json["imdb_id"])
if line_in_json["imdb_id"] is not any(variations_to_ignore)
else None
)

formatted_data["original_language"] = (
str(line_in_json["original_language"])
if line_in_json["original_language"] is not any(variations_to_ignore)
else None
)

formatted_data["original_title"] = (
str(line_in_json["original_title"])
if line_in_json["original_title"] is not any(variations_to_ignore)
else None
)

formatted_data["overview"] = (
str(line_in_json["overview"])
if line_in_json["overview"] is not any(variations_to_ignore)
else None
)

formatted_data["popularity"] = (
float(line_in_json["popularity"]) if line_in_json["popularity"] is not None else 0.0
)

formatted_data["poster_path"] = (
str(line_in_json["poster_path"])
if line_in_json["poster_path"] is not any(variations_to_ignore)
else None
)

formatted_data["tagline"] = (
str(line_in_json["tagline"])
if line_in_json["tagline"] is not any(variations_to_ignore)
else None
)

formatted_data["keywords"] = str(line_in_json["keywords"])
try:
final_formatted_data = format_list_to_str(line_in_json, formatted_data)
except Exception:
continue

# remove and newline chracters
updated_data = {}
for key, value in final_formatted_data.items():
if isinstance(value, str): # Check if the value is a string
updated_value = value.replace("\n", " ").replace("\r", " ")
updated_data[key] = updated_value
else:
updated_data[key] = value

# add the data to our local list
if updated_data["id"] not in unique_ids:
unique_ids.add(updated_data["id"])
data.append(updated_data)

return pd.DataFrame(data)
42 changes: 42 additions & 0 deletions observatoire/tmdb/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import json
import logging

import pandas as pd


def parse_keywords(keywords: str) -> str:
    """Flatten a TMDB keywords JSON document into a comma-separated string.

    Args:
        keywords: JSON text shaped like {"keywords": [{"name": ...}, ...]}.

    Returns:
        The keyword names joined by ", " (empty string when there are none).
    """
    parsed = json.loads(keywords)
    return ", ".join(entry["name"] for entry in parsed["keywords"])


def merge(df_current: pd.DataFrame | None, df_latest: pd.DataFrame, logger: logging) -> None:
"""
Merge the current dataset with the new dataset
"""
logger.info("starting to merge new data with current dataframe")

df_latest.replace({"Not Available": pd.NA, "1500-01-01": pd.NA, None: pd.NA}, inplace=True)

# Concatenate the current dataset_df with the new df to merge the data
merged_df = pd.concat([df_current, df_latest]) if df_current is not None else df_latest
merged_df.reset_index(drop=True, inplace=True)

# sort by id column
merged_df.sort_values(by="id", inplace=True)

# Drop unnecessary columns
merged_df = merged_df.drop(
columns=[col for col in merged_df.columns if col.startswith("__")],
)

return merged_df
24 changes: 24 additions & 0 deletions observatoire/tmdb/hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset


def load_movies_dataset() -> pd.DataFrame | None:
    """Fetch the movies dataset's train split from the Hugging Face Hub.

    Returns:
        The dataset as a pandas DataFrame, or None when it cannot be
        loaded (best-effort: e.g. the dataset does not exist yet).
    """
    try:
        train_split = load_dataset(
            "DataForGood/observatoire_des_imaginaires_movies",
            split="train",
        )
        train_split.cleanup_cache_files()
        return train_split.to_pandas()
    except Exception:
        # Deliberate broad catch: callers treat None as "no current dataset".
        return None


def save_movies_dataset(df: pd.DataFrame) -> None:
    """Push *df* to the Hugging Face Hub as the dataset's "train" split."""
    train_split = Dataset.from_pandas(df, preserve_index=False)
    DatasetDict({"train": train_split}).push_to_hub(
        "DataForGood/observatoire_des_imaginaires_movies",
    )
19 changes: 19 additions & 0 deletions observatoire/tmdb/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import logging


def setup_logger() -> logging.Logger:
    """Return the module logger configured for INFO-level console output.

    Returns:
        A logger with a single stream handler. Calling this function more
        than once returns the same logger without stacking duplicate
        handlers (the original added a new handler on every call, which
        duplicated every log line).
    """
    # Create a logger for the current module
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # Only attach a handler on first setup to avoid duplicate output.
    if not logger.handlers:
        log_formatter = logging.Formatter(
            "%(asctime)s | %(name)s | %(levelname)s | %(message)s",
        )
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(log_formatter)
        logger.addHandler(stream_handler)

    return logger
Loading

0 comments on commit 40f9543

Please sign in to comment.