Skip to content

Commit

Permalink
Merge branch 'main' into sort-results-popularity
Browse files Browse the repository at this point in the history
  • Loading branch information
kaaloo committed May 14, 2024
2 parents 65e9850 + eb25da5 commit 99a99bd
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 28 deletions.
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

### Prerequisites:

1. Python (≥ `3.10`) installed on your system.
1. Python 3.12.3 installed on your system.
2. Ensure you have `poetry` installed. If not, you can install it using `pip`.

```bash
Expand Down Expand Up @@ -135,4 +135,15 @@ The [French regional TMDB Series Dataset](https://huggingface.co/datasets/DataFo

```bash
invoke update-series-dataset
```
```

## Python CLI

The Python CLI supports the following commands:

```
python -m observatoire.tmdb.movies --mode=[latest | missing]
python -m observatoire.tmdb.series --mode=[latest | missing]
```

In the `latest` mode, which is the default, these commands sync the latest records from TMDB to our datasets on Hugging Face. In the `missing` mode, they calculate which rows may be missing from the Hugging Face datasets and attempt to sync these records.
46 changes: 35 additions & 11 deletions observatoire/tmdb/movies/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import contextlib
import os

import fire
from tqdm import tqdm

from observatoire.tmdb.config import HF_MOVIES_DATASET, TMDB_BATCH_SIZE
Expand All @@ -10,32 +11,55 @@
from observatoire.tmdb.logger import setup_logger
from observatoire.tmdb.movies.data import make_movie_df
from observatoire.tmdb.movies.tmdb import get_latest_movie_id, get_movie_data
from observatoire.tmdb.types import Mode


def executor() -> None:
def executor(mode: Mode = "latest") -> None:
logger = setup_logger()
logger.info("Starting Executor")

# first, let's get the id of the latest movie
latest_id = get_latest_movie_id()
logger.info("Starting Movie Dataset Update")

# second, let's get the last movie from our last run
df_current = load_dataset(HF_MOVIES_DATASET)
current_id = df_current["id"].max() if df_current is not None else None

# Generate a list of movie IDs
movie_ids_list = list(range(current_id or 1, latest_id))
total_movies_to_process = len(movie_ids_list)
if mode == "missing":
logger.info("Mode: Update Missing Movies")

# Create a set of all possible ids
all_ids = set(range(1, current_id + 1))

# Create a set of ids present in df_current
current_ids = set(df_current["id"].dropna())

# Find the difference between all_ids and current_ids
missing_ids = all_ids - current_ids

# Convert the set to a list
movie_ids_list = list(missing_ids)

logger.info(f"Total Missing Movies: {len(movie_ids_list)}")
elif mode == "latest":
logger.info("Mode: Update Latest MOvies")

# first, let's get the id of the latest movie
latest_id = get_latest_movie_id()

# Generate a list of movie IDs
movie_ids_list = list(range(current_id or 1, latest_id))

logger.info(f"Total Latest Movies: {len(movie_ids_list)}")

logger.info(f"Total Movies to Process in this run: {total_movies_to_process}")
else:
logger.critical("Invalid mode selected")
return

# Split movie_ids_list into chunks of TMDB_BATCH_SIZE
batches = [
movie_ids_list[i : i + TMDB_BATCH_SIZE]
for i in range(0, len(movie_ids_list), TMDB_BATCH_SIZE)
]

with tqdm(total=total_movies_to_process, unit=" movies") as pbar:
with tqdm(total=len(movie_ids_list), unit=" movies") as pbar:
for batch in batches:
logger.info(f"Processing batch of {len(batch)} movies")

Expand Down Expand Up @@ -75,4 +99,4 @@ def executor() -> None:


if __name__ == "__main__":
executor()
fire.Fire(executor)
46 changes: 35 additions & 11 deletions observatoire/tmdb/series/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import contextlib
import os

import fire
from tqdm import tqdm

from observatoire.tmdb.config import HF_SERIES_DATASET, TMDB_BATCH_SIZE
Expand All @@ -10,32 +11,55 @@
from observatoire.tmdb.logger import setup_logger
from observatoire.tmdb.series.data import make_series_df
from observatoire.tmdb.series.tmdb import get_latest_series_id, get_series_data
from observatoire.tmdb.types import Mode


def executor() -> None:
def executor(mode: Mode = "latest") -> None:
logger = setup_logger()
logger.info("Starting Executor")

# first, let's get the id of the latest series
latest_id = get_latest_series_id()
logger.info("Starting Series Dataset Update")

# second, let's get the last series from our last run
df_current = load_dataset(HF_SERIES_DATASET)
current_id = df_current["id"].max() if df_current is not None else None

# Generate a list of series IDs
series_ids_list = list(range(current_id or 1, latest_id))
total_series_to_process = len(series_ids_list)
if mode == "missing":
logger.info("Mode: Update Missing Series")

# Create a set of all possible ids
all_ids = set(range(1, current_id + 1))

# Create a set of ids present in df_current
current_ids = set(df_current["id"].dropna())

# Find the difference between all_ids and current_ids
missing_ids = all_ids - current_ids

# Convert the set to a list
series_ids_list = list(missing_ids)

logger.info(f"Total Missing Series: {len(series_ids_list)}")
elif mode == "latest":
logger.info("Mode: Update Latest Series")

# first, let's get the id of the latest series
latest_id = get_latest_series_id()

# Generate a list of series IDs
series_ids_list = list(range(current_id or 1, latest_id))

logger.info(f"Total Latest Series: {len(series_ids_list)}")

logger.info(f"Total Series to Process in this run: {total_series_to_process}")
else:
logger.critical("Invalid mode selected")
return

# Split series_ids_list into chunks of TMDB_BATCH_SIZE
batches = [
series_ids_list[i : i + TMDB_BATCH_SIZE]
for i in range(0, len(series_ids_list), TMDB_BATCH_SIZE)
]

with tqdm(total=total_series_to_process, unit=" series") as pbar:
with tqdm(total=len(series_ids_list), unit=" series") as pbar:
for batch in batches:
logger.info(f"Processing batch of {len(batch)} series")

Expand Down Expand Up @@ -75,4 +99,4 @@ def executor() -> None:


if __name__ == "__main__":
executor()
fire.Fire(executor)
7 changes: 4 additions & 3 deletions observatoire/tmdb/series/data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
import sys

import pandas as pd

Expand All @@ -12,12 +11,14 @@
safe_list,
safe_str,
)
from observatoire.tmdb.logger import setup_logger


def make_series_df(series_json: list[str]) -> pd.DataFrame:
"""
Transforms the JSON data into a DataFrame
"""
logger = setup_logger()

data = []
unique_ids = set()
Expand Down Expand Up @@ -82,8 +83,8 @@ def make_series_df(series_json: list[str]) -> pd.DataFrame:
safe_data["vote_average"] = safe_float(line_in_json, "vote_average")
safe_data["vote_count"] = safe_int(line_in_json, "vote_count")

except Exception as e:
print(f"Error: {e}", file=sys.stderr)
except Exception:
logger.exception(f"Could not parse line: {line_in_json}")
continue

# remove any newline characters
Expand Down
3 changes: 3 additions & 0 deletions observatoire/tmdb/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from typing import Literal

# Sync mode accepted by the movies and series `executor` entry points:
# either "latest" or "missing".
Mode = Literal["latest", "missing"]
30 changes: 29 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ bs4 = "^0.0.2"
python-dotenv = "^1.0.1"
datasets = "^2.18.0"
pycountry-convert = "^0.7.2"
fire = "^0.6.0"

[tool.poetry.group.dev.dependencies]
pre-commit = "^2.20.0"
Expand Down

0 comments on commit 99a99bd

Please sign in to comment.