Skip to content

Commit

Permalink
Merge pull request #30 from dataforgoodfr:introduce-batch-size
Browse files (browse the repository at this point in the history)
Introduce TMDB_BATCH_SIZE env var. Default is 1000.
  • Loading branch information
kaaloo authored Apr 19, 2024
2 parents 5fbec30 + ef123e7 commit 850d496
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
1 change: 1 addition & 0 deletions observatoire/tmdb/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@

# TMDB API key; None when the TMDB_API_KEY environment variable is unset.
TMDB_API_KEY = os.getenv("TMDB_API_KEY")
# Integer read from TMDB_MAX_RETRIES (default 500).
TMDB_MAX_RETRIES = int(os.getenv("TMDB_MAX_RETRIES", "500"))
# Chunk size used when splitting the movie id list into batches (default 1000).
# Read from TMDB_BATCH_SIZE so the variable name matches the constant and the
# TMDB_ prefix of the other settings; the un-prefixed BATCH_SIZE is still
# honoured as a fallback so existing deployments keep working.
TMDB_BATCH_SIZE = int(
    os.getenv("TMDB_BATCH_SIZE", os.getenv("BATCH_SIZE", "1000")),
)
8 changes: 6 additions & 2 deletions observatoire/tmdb/movies.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from tqdm import tqdm

from observatoire.tmdb.config import TMDB_BATCH_SIZE
from observatoire.tmdb.data import transform_movie_json
from observatoire.tmdb.helpers import merge
from observatoire.tmdb.hf import load_movies_dataset, save_movies_dataset
Expand All @@ -28,8 +29,11 @@ def executor() -> None:

logger.info(f"Total Movies to Process in this run: {total_movies_to_process}")

# Split movie_ids_list into chunks of 100
batches = [movie_ids_list[i : i + 100] for i in range(0, len(movie_ids_list), 100)]
# Split movie_ids_list into chunks of TMDB_BATCH_SIZE
batches = [
movie_ids_list[i : i + TMDB_BATCH_SIZE]
for i in range(0, len(movie_ids_list), TMDB_BATCH_SIZE)
]

with tqdm(total=total_movies_to_process, unit=" movies") as pbar:
for batch in batches:
Expand Down

0 comments on commit 850d496

Please sign in to comment.