Skip to content

Commit

Permalink
Merge pull request #26 from dataforgoodfr/french-tmdb-movie-data
Browse files Browse the repository at this point in the history
Implement curating French regional movies dataset on HF
  • Loading branch information
kaaloo authored Apr 18, 2024
2 parents b3b34f8 + 27692c9 commit 40f9543
Show file tree
Hide file tree
Showing 16 changed files with 1,257 additions and 2 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/movies.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: Update Movies Dataset

on:
  # Run this workflow once per day, at 0:15 UTC
  schedule: [{ cron: "15 0 * * *" }]
  # Run this workflow when triggered manually in GitHub’s UI.
  workflow_dispatch: {}

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11.4"
      - name: Setup Poetry
        uses: Gr1N/setup-poetry@v8
        with:
          poetry-version: "1.7.1"
      # --no-root: install dependencies only, not the project package itself
      - run: poetry install --no-root
      # Runs the dataset update entry point; credentials come from repo secrets.
      - run: poetry run python -m observatoire.tmdb.movies
        env:
          # Hugging Face credentials
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # TMDB credentials
          TMDB_API_KEY: ${{ secrets.TMDB_API_KEY }}
          TMDB_MAX_RETRIES: ${{ secrets.TMDB_MAX_RETRIES }}
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,4 @@ cython_debug/
# Precommit hooks: ruff cache
.ruff_cache
.DS_Store
data/
data/
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,12 @@ This repo includes invoke for pythonic task execution. To see the
list of available tasks you can run:

invoke -l

# Updating the Movie Database

The [French regional TMDB Movies Dataset](https://huggingface.co/datasets/DataForGood/observatoire_des_imaginaires_movies)
on Hugging Face can be updated using the following command:

```bash
invoke update-movies-dataset
```
3 changes: 3 additions & 0 deletions observatoire/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Load variables from a local .env file into os.environ as soon as the
# package is imported, so modules reading os.getenv() (e.g. tmdb.config)
# see locally-defined credentials without extra setup.
from dotenv import load_dotenv

load_dotenv()
Empty file added observatoire/tmdb/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions observatoire/tmdb/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import os

# TMDB API key read from the environment; None when unset.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")

# Maximum number of retry attempts against the TMDB API (defaults to 500).
TMDB_MAX_RETRIES = int(os.environ.get("TMDB_MAX_RETRIES", "500"))
212 changes: 212 additions & 0 deletions observatoire/tmdb/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import json

import pandas as pd


def transform_movie_json(movie_json: list[str]) -> pd.DataFrame: # noqa: C901, PLR0915
"""
Transforms the JSON data into a DataFrame
"""

data = []
unique_ids = set()

# load the data
variations_to_ignore = [
None,
"",
"NA",
"N/A",
"None",
"na",
"n/a",
"NULL",
"Not Available",
]

for line in movie_json:
line_in_json = json.loads(line)

# TODO: format_list_to_str is too complex and should be refactored
def format_list_to_str(line_in_json: dict, formatted_data: dict) -> dict: # noqa: C901, PLR0912
# convert genres list to str

genres_str = None
genres_list = []
try:
for genre in line_in_json["genres"]:
if genre["name"] is not any(variations_to_ignore):
genres_list.append(genre["name"])
if len(genres_list) > 0:
genres_str = ", ".join(genres_list)
except Exception:
pass

formatted_data["genres"] = genres_str

# convert production_companies list to str
production_companies_str = None
production_companies_list = []
try:
for company in line_in_json["production_companies"]:
if company["name"] is not any(variations_to_ignore):
production_companies_list.append(company["name"])
if len(production_companies_list) > 0:
production_companies_str = ", ".join(production_companies_list)
except Exception:
pass

formatted_data["production_companies"] = production_companies_str

# convert production_countries list to str
production_countries_str = None
production_countries_list = []
try:
for country in line_in_json["production_countries"]:
if country["name"] is not any(variations_to_ignore):
production_countries_list.append(country["name"])
if len(production_countries_list) > 0:
production_countries_str = ", ".join(production_countries_list)
except Exception:
pass

formatted_data["production_countries"] = production_countries_str

# convert spoken_languages list to str
spoken_languages_str = None
spoken_languages_list = []
try:
for language in line_in_json["spoken_languages"]:
if language["english_name"] is not any(variations_to_ignore):
spoken_languages_list.append(language["english_name"])
if len(spoken_languages_list) > 0:
spoken_languages_str = ", ".join(spoken_languages_list)
except Exception:
pass

formatted_data["spoken_languages"] = spoken_languages_str

return formatted_data

# format the data

formatted_data = {}

formatted_data["id"] = int(line_in_json["id"]) if line_in_json["id"] is not None else 0

formatted_data["title"] = (
str(line_in_json["title"])
if line_in_json["title"] is not any(variations_to_ignore)
else None
)
formatted_data["vote_average"] = (
float(line_in_json["vote_average"])
if line_in_json["vote_average"] is not None
else 0.0
)

formatted_data["vote_count"] = (
int(line_in_json["vote_count"]) if line_in_json["vote_count"] is not None else 0
)

formatted_data["status"] = (
str(line_in_json["status"])
if line_in_json["status"] is not any(variations_to_ignore)
else None
)

formatted_data["release_date"] = (
str(line_in_json["release_date"])
if line_in_json["release_date"] is not None and line_in_json["release_date"] != ""
else "1500-01-01"
)

formatted_data["revenue"] = (
int(line_in_json["revenue"]) if line_in_json["revenue"] is not None else 0
)

formatted_data["runtime"] = (
int(line_in_json["runtime"]) if line_in_json["runtime"] is not None else 0
)

formatted_data["adult"] = (
bool(line_in_json["adult"]) if line_in_json["adult"] is not None else False
)

formatted_data["backdrop_path"] = (
str(line_in_json["backdrop_path"])
if line_in_json["backdrop_path"] is not any(variations_to_ignore)
else None
)

formatted_data["budget"] = (
int(line_in_json["budget"]) if line_in_json["budget"] is not None else 0
)

formatted_data["homepage"] = (
str(line_in_json["homepage"])
if line_in_json["homepage"] is not any(variations_to_ignore)
else None
)

formatted_data["imdb_id"] = (
str(line_in_json["imdb_id"])
if line_in_json["imdb_id"] is not any(variations_to_ignore)
else None
)

formatted_data["original_language"] = (
str(line_in_json["original_language"])
if line_in_json["original_language"] is not any(variations_to_ignore)
else None
)

formatted_data["original_title"] = (
str(line_in_json["original_title"])
if line_in_json["original_title"] is not any(variations_to_ignore)
else None
)

formatted_data["overview"] = (
str(line_in_json["overview"])
if line_in_json["overview"] is not any(variations_to_ignore)
else None
)

formatted_data["popularity"] = (
float(line_in_json["popularity"]) if line_in_json["popularity"] is not None else 0.0
)

formatted_data["poster_path"] = (
str(line_in_json["poster_path"])
if line_in_json["poster_path"] is not any(variations_to_ignore)
else None
)

formatted_data["tagline"] = (
str(line_in_json["tagline"])
if line_in_json["tagline"] is not any(variations_to_ignore)
else None
)

formatted_data["keywords"] = str(line_in_json["keywords"])
try:
final_formatted_data = format_list_to_str(line_in_json, formatted_data)
except Exception:
continue

# remove and newline chracters
updated_data = {}
for key, value in final_formatted_data.items():
if isinstance(value, str): # Check if the value is a string
updated_value = value.replace("\n", " ").replace("\r", " ")
updated_data[key] = updated_value
else:
updated_data[key] = value

# add the data to our local list
if updated_data["id"] not in unique_ids:
unique_ids.add(updated_data["id"])
data.append(updated_data)

return pd.DataFrame(data)
42 changes: 42 additions & 0 deletions observatoire/tmdb/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import json
import logging

import pandas as pd


def parse_keywords(keywords: str) -> str:
    """Flatten a TMDB keywords JSON document into a comma-separated string.

    Args:
        keywords: JSON text shaped like {"keywords": [{"name": ...}, ...]}.

    Returns:
        The keyword names joined by ", " (empty string when there are none).
    """
    parsed = json.loads(keywords)
    return ", ".join(entry["name"] for entry in parsed["keywords"])


def merge(df_current: pd.DataFrame | None, df_latest: pd.DataFrame, logger: logging) -> None:
"""
Merge the current dataset with the new dataset
"""
logger.info("starting to merge new data with current dataframe")

df_latest.replace({"Not Available": pd.NA, "1500-01-01": pd.NA, None: pd.NA}, inplace=True)

# Concatenate the current dataset_df with the new df to merge the data
merged_df = pd.concat([df_current, df_latest]) if df_current is not None else df_latest
merged_df.reset_index(drop=True, inplace=True)

# sort by id column
merged_df.sort_values(by="id", inplace=True)

# Drop unnecessary columns
merged_df = merged_df.drop(
columns=[col for col in merged_df.columns if col.startswith("__")],
)

return merged_df
24 changes: 24 additions & 0 deletions observatoire/tmdb/hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset


def load_movies_dataset() -> pd.DataFrame | None:
    """Fetch the movies dataset's train split from the Hugging Face Hub.

    Returns:
        The dataset as a pandas DataFrame, or None when it cannot be
        loaded (best-effort: e.g. the dataset does not exist yet).
    """
    try:
        train_split = load_dataset(
            "DataForGood/observatoire_des_imaginaires_movies",
            split="train",
        )
        train_split.cleanup_cache_files()
        return train_split.to_pandas()
    except Exception:
        # Deliberate broad catch: callers treat None as "no current dataset".
        return None


def save_movies_dataset(df: pd.DataFrame) -> None:
    """Push *df* to the Hugging Face Hub as the dataset's "train" split."""
    train_split = Dataset.from_pandas(df, preserve_index=False)
    DatasetDict({"train": train_split}).push_to_hub(
        "DataForGood/observatoire_des_imaginaires_movies",
    )
19 changes: 19 additions & 0 deletions observatoire/tmdb/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import logging


def setup_logger() -> logging.Logger:
    """Return the module logger configured for INFO-level console output.

    Returns:
        A logger with a single stream handler. Calling this function more
        than once returns the same logger without stacking duplicate
        handlers (the original added a new handler on every call, which
        duplicated every log line).
    """
    # Create a logger for the current module
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # Only attach a handler on first setup to avoid duplicate output.
    if not logger.handlers:
        log_formatter = logging.Formatter(
            "%(asctime)s | %(name)s | %(levelname)s | %(message)s",
        )
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(log_formatter)
        logger.addHandler(stream_handler)

    return logger
Loading

0 comments on commit 40f9543

Please sign in to comment.