From f6c481d79b5cc5325ee12a10c45c291966edf348 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka <ihar.hrachyshka@gmail.com> Date: Fri, 6 Dec 2024 09:15:52 -0500 Subject: [PATCH 1/2] Flush after each print Otherwise, `ts` buffers output. --- src/letsrolld/cmd/fetch_directors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/letsrolld/cmd/fetch_directors.py b/src/letsrolld/cmd/fetch_directors.py index 26bb84a..7a10592 100644 --- a/src/letsrolld/cmd/fetch_directors.py +++ b/src/letsrolld/cmd/fetch_directors.py @@ -30,9 +30,11 @@ def get_directors_by_films(film_list): ) as e: # TODO: deduplicate error handling with update script traceback.print_exception(e) print(f"Retrying in {_SEC_WAIT_ON_FAIL} seconds...") + sys.stdout.flush() time.sleep(_SEC_WAIT_ON_FAIL) continue print(f"Processed {i + 1}/{len(film_list)} films") + sys.stdout.flush() def main(): From 40dbd953d9cda1fd23ae057de74e9c5bd4d32e5b Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka <ihar.hrachyshka@gmail.com> Date: Fri, 6 Dec 2024 09:39:59 -0500 Subject: [PATCH 2/2] Ignore films that are already in db when fetching directors --- Makefile | 3 +++ src/letsrolld/cmd/fetch_directors.py | 32 +++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index a000efd..e0b518b 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,9 @@ test: lint # One can use e.g. https://letterboxd.com/hershwin/list/all-the-movies/ as the base list fetch-directors: + pdm run fetch-directors --new-only -i ./data/lists/everything.csv -o ${DIRECTORS_FILE} | $(RUN_LOG_CMD) + +fetch-directors-all: pdm run fetch-directors -i ./data/lists/everything.csv -o ${DIRECTORS_FILE} | $(RUN_LOG_CMD) populate-directors: diff --git a/src/letsrolld/cmd/fetch_directors.py b/src/letsrolld/cmd/fetch_directors.py index 7a10592..33a207c 100644 --- a/src/letsrolld/cmd/fetch_directors.py +++ b/src/letsrolld/cmd/fetch_directors.py @@ -5,10 +5,15 @@ import time import traceback +from sqlalchemy.orm import sessionmaker + +from letsrolld import db +from letsrolld.db import models from letsrolld import film from letsrolld import filmlist from letsrolld.directorlist import read_director_list +# TODO: deduplicate error handling with update script _SEC_WAIT_ON_FAIL = 5 @@ -25,9 +30,7 @@ def get_directors_by_films(film_list): directors[director.base_url] = director yield director break - except ( - Exception - ) as e: # TODO: deduplicate error handling with update script + except Exception as e: traceback.print_exception(e) print(f"Retrying in {_SEC_WAIT_ON_FAIL} seconds...") sys.stdout.flush() @@ -37,15 +40,38 @@ def get_directors_by_films(film_list): sys.stdout.flush() +def is_known_film(film_): + session = sessionmaker(bind=db.create_engine())() + film = ( + session.query(models.Film) + .filter(models.Film.title == film_.name) + .filter(models.Film.year == film_.year) + .first() + ) + if film is not None: + print(f"Skipping known film: {film_.name} ({film_.year})") + sys.stdout.flush() + return True + return False + + def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", help="input movie list file", required=True) parser.add_argument( "-o", "--output", help="output director list file", required=True ) + parser.add_argument( + "-N", + "--new-only", + action="store_true", + help="whether to ignore (probably) known movies", + ) args = parser.parse_args() film_list = list(filmlist.read_film_list(args.input)) + if args.new_only: + film_list = [f for f in film_list if not is_known_film(f)] directors = set() if os.path.exists(args.output):