Merge pull request #30 from dataforgoodfr:introduce-batch-size

Introduce the TMDB_BATCH_SIZE config setting, read from the BATCH_SIZE environment variable. Default is 1000.
dataforgoodfr · Apr 19, 2024 · 850d496
2 parents 5fbec30 + ef123e7
commit 850d496
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 2 deletions.
diff --git a/observatoire/tmdb/config.py b/observatoire/tmdb/config.py
@@ -2,3 +2,4 @@
 
 TMDB_API_KEY = os.getenv("TMDB_API_KEY")
 TMDB_MAX_RETRIES = int(os.getenv("TMDB_MAX_RETRIES", "500"))
+TMDB_BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1000"))
diff --git a/observatoire/tmdb/movies.py b/observatoire/tmdb/movies.py
@@ -4,6 +4,7 @@
 
 from tqdm import tqdm
 
+from observatoire.tmdb.config import TMDB_BATCH_SIZE
 from observatoire.tmdb.data import transform_movie_json
 from observatoire.tmdb.helpers import merge
 from observatoire.tmdb.hf import load_movies_dataset, save_movies_dataset
@@ -28,8 +29,11 @@ def executor() -> None:
 
     logger.info(f"Total Movies to Process in this run: {total_movies_to_process}")
 
-    # Split movie_ids_list into chunks of 100
-    batches = [movie_ids_list[i : i + 100] for i in range(0, len(movie_ids_list), 100)]
+    # Split movie_ids_list into chunks of TMDB_BATCH_SIZE
+    batches = [
+        movie_ids_list[i : i + TMDB_BATCH_SIZE]
+        for i in range(0, len(movie_ids_list), TMDB_BATCH_SIZE)
+    ]
 
     with tqdm(total=total_movies_to_process, unit=" movies") as pbar:
         for batch in batches:
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,3 +2,4 @@

		TMDB_API_KEY = os.getenv("TMDB_API_KEY")
		TMDB_MAX_RETRIES = int(os.getenv("TMDB_MAX_RETRIES", "500"))
		TMDB_BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1000"))