From f6c481d79b5cc5325ee12a10c45c291966edf348 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Date: Fri, 6 Dec 2024 09:15:52 -0500
Subject: [PATCH 1/2] Flush after each print

Otherwise stdout is block-buffered when piped, so `ts` sees output late.
---
 src/letsrolld/cmd/fetch_directors.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/letsrolld/cmd/fetch_directors.py b/src/letsrolld/cmd/fetch_directors.py
index 26bb84a..7a10592 100644
--- a/src/letsrolld/cmd/fetch_directors.py
+++ b/src/letsrolld/cmd/fetch_directors.py
@@ -30,9 +30,11 @@ def get_directors_by_films(film_list):
             ) as e:  # TODO: deduplicate error handling with update script
                 traceback.print_exception(e)
                 print(f"Retrying in {_SEC_WAIT_ON_FAIL} seconds...")
+                sys.stdout.flush()
                 time.sleep(_SEC_WAIT_ON_FAIL)
                 continue
         print(f"Processed {i + 1}/{len(film_list)} films")
+        sys.stdout.flush()
 
 
 def main():

From 40dbd953d9cda1fd23ae057de74e9c5bd4d32e5b Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Date: Fri, 6 Dec 2024 09:39:59 -0500
Subject: [PATCH 2/2] Ignore films that are already in db when fetching
 directors

---
 Makefile                             |  3 +++
 src/letsrolld/cmd/fetch_directors.py | 32 +++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index a000efd..e0b518b 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,9 @@ test: lint
 
 # One can use e.g. https://letterboxd.com/hershwin/list/all-the-movies/ as the base list
 fetch-directors:
+	pdm run fetch-directors --new-only -i ./data/lists/everything.csv -o ${DIRECTORS_FILE} | $(RUN_LOG_CMD)
+
+fetch-directors-all:
 	pdm run fetch-directors -i ./data/lists/everything.csv -o ${DIRECTORS_FILE} | $(RUN_LOG_CMD)
 
 populate-directors:
diff --git a/src/letsrolld/cmd/fetch_directors.py b/src/letsrolld/cmd/fetch_directors.py
index 7a10592..33a207c 100644
--- a/src/letsrolld/cmd/fetch_directors.py
+++ b/src/letsrolld/cmd/fetch_directors.py
@@ -5,10 +5,15 @@
 import time
 import traceback
 
+from sqlalchemy.orm import sessionmaker
+
+from letsrolld import db
+from letsrolld.db import models
 from letsrolld import film
 from letsrolld import filmlist
 from letsrolld.directorlist import read_director_list
 
+# TODO: deduplicate error handling with update script
 _SEC_WAIT_ON_FAIL = 5
 
 
@@ -25,9 +30,7 @@ def get_directors_by_films(film_list):
                         directors[director.base_url] = director
                         yield director
                 break
-            except (
-                Exception
-            ) as e:  # TODO: deduplicate error handling with update script
+            except Exception as e:
                 traceback.print_exception(e)
                 print(f"Retrying in {_SEC_WAIT_ON_FAIL} seconds...")
                 sys.stdout.flush()
@@ -37,15 +40,38 @@ def get_directors_by_films(film_list):
         sys.stdout.flush()
 
 
+def is_known_film(film_):
+    """Return whether the db has this title/year; print a notice when it does."""
+    with sessionmaker(bind=db.create_engine())() as session:  # closed on exit
+        known = (
+            session.query(models.Film)
+            .filter(models.Film.title == film_.name)
+            .filter(models.Film.year == film_.year)
+            .first()
+        ) is not None
+    if known:
+        print(f"Skipping known film: {film_.name} ({film_.year})")
+        sys.stdout.flush()
+    return known
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("-i", "--input", help="input movie list file", required=True)
     parser.add_argument(
         "-o", "--output", help="output director list file", required=True
     )
+    parser.add_argument(
+        "-N",
+        "--new-only",
+        action="store_true",
+        help="whether to ignore (probably) known movies",
+    )
     args = parser.parse_args()
 
     film_list = list(filmlist.read_film_list(args.input))
+    if args.new_only:
+        film_list = [f for f in film_list if not is_known_film(f)]
 
     directors = set()
     if os.path.exists(args.output):