Skip to content

Commit

Permalink
Merge pull request #169 from booxter/new-directors
Browse files Browse the repository at this point in the history
Complete directors list fetch
  • Loading branch information
booxter authored Dec 25, 2024
2 parents 225521a + dc3f3af commit 2fe3b1e
Show file tree
Hide file tree
Showing 16 changed files with 242,052 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ test: lint

# One can use e.g. https://letterboxd.com/hershwin/list/all-the-movies/ as the base list
fetch-directors:
pdm run fetch-directors --new-only -i ./data/lists/everything.csv -o ${DIRECTORS_FILE} | $(RUN_LOG_CMD)
pdm run fetch-directors --new-only -i ./data/lists/adult2.csv -o ${DIRECTORS_FILE} | $(RUN_LOG_CMD)

fetch-directors-all:
pdm run fetch-directors -i ./data/lists/everything.csv -o ${DIRECTORS_FILE} | $(RUN_LOG_CMD)
Expand Down
23 changes: 23 additions & 0 deletions configs/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,28 @@
"min_rating": "3.75",
"min_length": 60,
"genre": "documentary"
},
"Obscure": {
"max_movies": 3,
"services": ["FREE", "ADS", "FLATRATE", "RENT", "BUY"],
"min_rating": "3.35",
"max_rating": "3.75",
"min_length": 30,
"exclude_genres": ["adult"]
},
"Adult": {
"max_movies": 3,
"services": ["DISC", "cultpix"],
"min_length": 30,
"genre": "adult"
},
"On Disc": {
"max_movies": 3,
"exclude_services": ["FREE", "ADS", "FLATRATE", "RENT", "BUY"],
"services": ["DISC"],
"min_rating": "4.0",
"min_length": 30,
"max_length": 300,
"exclude_genres": ["adult"]
}
}
241,702 changes: 241,701 additions & 1 deletion directors.csv

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ build-backend = "pdm.backend"
distribution = true

[project.scripts]
fetch-countries = "letsrolld.cmd.fetch_countries:main"
fetch-country-list = "letsrolld.cmd.fetch_country_list:main"
fetch-lb-list = "letsrolld.cmd.fetch_lb_list:main"
fetch-directors = "letsrolld.cmd.fetch_directors:main"
populate-directors = "letsrolld.cmd.populate_directors:main"
update-directors = "letsrolld.cmd.update:directors_main"
Expand Down
16 changes: 16 additions & 0 deletions scripts/apply-countries.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# For every country slug printed by `fetch-countries`: fetch that country's
# film list (new films only, -N), derive the directors for those films, load
# them with `populate-directors`, then run `make run-update-directors` before
# moving on to the next country.
#
# All intermediate CSVs live in a fresh temporary directory whose path is
# printed at start and at the end; the directory itself is left in place.

set -xe

tdir=$(mktemp -d)
echo "$tdir"

# Capture the slug list in a plain assignment first: a failing command
# substitution inside `for ... in $(...)` is NOT caught by `set -e`, whereas a
# failing assignment is. Slugs are split on whitespace on purpose below.
countries=$(fetch-countries)

for country in $countries; do
    echo "Applying country: $country"
    fetch-country-list -N -c "$country" -o "$tdir/list.csv"
    fetch-directors -i "$tdir/list.csv" -o "$tdir/directors.csv"
    populate-directors -d "$tdir/directors.csv"
    make run-update-directors # fetch all films after each list so that "film already known" heuristic considers just added directors/movies
    # The glob must stay OUTSIDE the quotes, otherwise rm looks for a file
    # literally named "*.csv", fails, and `set -e` aborts after the first
    # country. -f keeps the cleanup quiet if nothing matched.
    rm -f -- "$tdir"/*.csv
done
echo "Done applying countries; results are in $tdir"
# rm -rf $tdir
70 changes: 70 additions & 0 deletions scripts/apply-lb-lists.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env bash

# https://letterboxd.com/eternalalien/list/all-narrative-feature-films-on-letterboxd/
# https://letterboxd.com/narpjay/list/exploitation-films-b-movie-cheese-grindhouse/
# https://letterboxd.com/seanmcgann98/list/sleazemovies-erotiga/
# https://letterboxd.com/mishima24/list/the-most-comprehensive-list-of-japanese-movies/
# https://letterboxd.com/ashleylynch/list/rarelust-complete-list/
# https://letterboxd.com/giniyapp/list/the-celluloid-void-rare-films-written-and/
# https://letterboxd.com/aopisaac/list/video-galactica-a-virtual-video-store-11293/
# https://letterboxd.com/sadhomersimpson/list/criterion/
# https://letterboxd.com/natethecyborg/list/every-film-available-on-the-criterion-channel/
# https://letterboxd.com/smiskfisk/list/almost-every-title-from-worldscinemaorg/
# https://letterboxd.com/robbob01/list/public-domain/
# https://letterboxd.com/ru6yy/list/every-movie-of-every-actor-actress-ive-seen/
# https://letterboxd.com/xob/list/films-of-interest-that-are-available-in-full/
# https://letterboxd.com/000_leo/list/every-film-ever-eligible-for-the-oscars/
# https://letterboxd.com/kordian86/list/list-of-polish-movies-1902-2023/
# https://letterboxd.com/ryokohakubi/list/a-comprehensive-animation-list/
# https://letterboxd.com/astroturd/list/no-average-rating/
# https://letterboxd.com/adamrant/list/the-definitive-films-about-film-list/
# https://letterboxd.com/terje_jr/list/struggles-of-the-global-south-in-progress/
# https://letterboxd.com/nodadyoushutup/list/nodadyoushutup-plex/
# https://letterboxd.com/marconerix/list/balkan-movies/
# https://letterboxd.com/solidaritycine/list/solidarity-cinema-archive/
# https://letterboxd.com/dessaint/list/best-korean/
# https://letterboxd.com/dmytro_malyar/list/ukrainian-films/
# https://letterboxd.com/bratofthe1980s/list/the-european-boys-adventure-tale/
# https://letterboxd.com/butterbud/list/the-working-class/
# https://letterboxd.com/jim13/list/cold-war-films/
# https://letterboxd.com/adarksong/list/films-from-classic-films-channel-to-watch/
# https://letterboxd.com/magrosleau/list/psychosexual-dramas-nihilistic-fever-dreams/
# https://letterboxd.com/evilbjork/list/avant-garde-underground/
# https://letterboxd.com/knuffeltje/list/zero-watches/
# https://letterboxd.com/artyficial/list/the-oxford-history-of-world-cinema/
# https://letterboxd.com/loureviews/list/i-l3ve-musicals/
# https://letterboxd.com/mitramitra/list/the-iran-archive-everything-wip/
# https://letterboxd.com/juliec/list/neverending-christmas-movie-list/
# https://letterboxd.com/stottiain/list/great-big-list-of-classic-cartoons/
# https://letterboxd.com/kodifranzese/list/kodi-franzeses-non-usa-foreign-filmography/
# https://letterboxd.com/timonist/list/east-germany/
# https://letterboxd.com/rhettman417/list/movies-on-rarefilmmcom/
# https://letterboxd.com/pakejanek/list/okru/
# https://letterboxd.com/liveandrew/list/genre-exploitation-100-years-1896-1995/
# https://letterboxd.com/gabrmachado/list/brazilian-cinema/
# Letterboxd lists processed by this run, in order. Each entry is passed
# verbatim to `fetch-lb-list -u`.
urls=(
    "https://letterboxd.com/sr0man/list/erotic/"
    "https://letterboxd.com/dustin_b/list/films-i-own-the-complete-mega-load/"
    "https://letterboxd.com/smlibrary/list/okay/"
    "https://letterboxd.com/retinaburn/list/the-most-comprehensive-list-of-hong-kong/"
    "https://letterboxd.com/wolfman07/list/every-horror-film-ever-made-1/"
    "https://letterboxd.com/infinityinc/list/every-tv-movie-aired-on-american-television/"
    "https://letterboxd.com/kaijufan67/list/the-bad-taste-list-weird-wild-psychotronic/"
    "https://letterboxd.com/buttersgreer/list/ultimate-list-of-every-movie-musical/"
    "https://letterboxd.com/natethecyborg/list/every-film-available-on-the-criterion-channel/"
    "https://letterboxd.com/mishima24/list/the-most-comprehensive-list-of-japanese-movies/"
    "https://letterboxd.com/elmiko_/list/every-animated-film-made-from-1878-present/"
    "https://letterboxd.com/clowchan/list/every-horror-film-made-from-1895-present/"
)

# Stop at the first failing step (and trace commands), matching
# scripts/apply-countries.sh. Without errexit a failed fetch would still be
# followed by populate-directors / `make run-all` on stale or partial CSVs.
set -xe

tdir=$(mktemp -d)
echo "$tdir"
for url in "${urls[@]}"; do
    echo "Applying list from url: $url"
    fetch-lb-list -N -u "$url" -o "$tdir/list.csv"
    fetch-directors -N -i "$tdir/list.csv" -o "$tdir/directors.csv"
    populate-directors -d "$tdir/directors.csv"
    make run-all # fetch all films after each list so that "film already known" heuristic considers just added directors/movies
done
echo "Done applying lists; results are in $tdir"
# rm -rf $tdir
24 changes: 24 additions & 0 deletions src/letsrolld/cmd/fetch_countries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import sys

from bs4 import BeautifulSoup

from letsrolld import http


def main():
    """Print one country slug per line, scraped from letterboxd.com/countries/.

    The page is fetched with ``http.get_url`` and parsed with BeautifulSoup.
    Every ``<a>`` inside the ``div.browse-countries`` element whose ``href``
    starts with ``/films/country/`` contributes the last path component of
    that ``href`` to stdout.

    Exits with status 1 (message on stderr) when the countries section is
    missing from the page.
    """
    # fetch film list
    url = "https://letterboxd.com/countries/"
    content = http.get_url(url)

    soup = BeautifulSoup(content, "html.parser")

    countries_section = soup.find("div", class_="browse-countries")
    if countries_section is None:
        # stdout is the data channel (one slug per line, consumed by shell
        # callers via command substitution), so diagnostics go to stderr.
        print("No countries section found", file=sys.stderr)
        sys.exit(1)

    for a in countries_section.find_all("a"):
        href = a.get("href")
        # Anchors without an href yield None; skip them instead of crashing.
        if not href or not href.startswith("/films/country/"):
            continue
        # Last path component, with or without a trailing slash.
        print(href.rstrip("/").rsplit("/", 1)[-1])
55 changes: 55 additions & 0 deletions src/letsrolld/cmd/fetch_country_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import argparse
import csv
import sys

from sqlalchemy.orm import sessionmaker

from letsrolld import db
from letsrolld.db import models
from letsrolld import lb_list


# TODO: move to common module, reuse everywhere
def is_known_film(film_):
    """Tell whether a film is already stored in the database.

    Looks for a ``models.Film`` row whose ``lb_url`` equals ``film_.url``.
    When one exists, a "Skipping known film" line is printed (and stdout is
    flushed) before returning.

    Args:
        film_: object exposing a ``url`` attribute.

    Returns:
        True if a matching row exists, False otherwise.
    """
    # Use the session as a context manager so it is closed after the lookup;
    # this function runs once per film and previously left every session open.
    # NOTE(review): an engine is still requested on every call; whether
    # db.create_engine() reuses one is not visible from here.
    with sessionmaker(bind=db.create_engine())() as session:
        known = (
            session.query(models.Film)
            .filter(models.Film.lb_url == film_.url)
            .first()
            is not None
        )

    if known:
        print(f"Skipping known film: {film_.url}")
        sys.stdout.flush()
    return known


def main():
    """Write the films of one country to a CSV file.

    Command-line options:
        -c/--country   country slug (required)
        -o/--output    output film list file (required)
        -N/--new-only  drop films for which ``is_known_film`` returns True

    The film list comes from ``lb_list.MovieCountryList(country).films()``.
    The output uses the ``csv.unix_dialect`` with a ``Name, Year, URL`` header
    and is flushed after every film so partial results survive an interruption.

    Writing a row is retried a bounded number of times with a pause between
    attempts; a film that keeps failing is reported and skipped rather than
    retried forever.
    """
    # Local import: only the retry back-off below needs it.
    import time

    max_attempts = 5
    retry_delay_sec = 5

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--country", help="country slug", required=True)
    parser.add_argument("-o", "--output", help="output film list file", required=True)
    parser.add_argument(
        "-N",
        "--new-only",
        help="whether to filter out movies already present in db",
        action="store_true",
    )
    args = parser.parse_args()

    # fetch film list
    films = lb_list.MovieCountryList(args.country).films()

    if args.new_only:
        # Lazy filter: the database is consulted as each film is consumed.
        films = (film for film in films if not is_known_film(film))

    # write to file
    with open(args.output, "w", newline="") as csvfile:
        writer = csv.writer(csvfile, dialect=csv.unix_dialect)
        writer.writerow(["Name", "Year", "URL"])

        for film_ in films:
            # NOTE(review): presumably reading film_.name / film_.year can fail
            # transiently (hence the retry) -- confirm against lb_list.
            for attempt in range(1, max_attempts + 1):
                try:
                    writer.writerow([film_.name, film_.year, film_.url])
                    break
                except Exception as exc:
                    print(f"Error writing film: {film_.url} ({exc!r})")
                    sys.stdout.flush()
                    if attempt < max_attempts:
                        time.sleep(retry_delay_sec)
            else:
                # Every attempt failed: give up on this film and move on.
                print(
                    f"Giving up on film after {max_attempts} attempts: {film_.url}"
                )
                sys.stdout.flush()
            csvfile.flush()
19 changes: 14 additions & 5 deletions src/letsrolld/cmd/fetch_directors.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,20 @@ def main():
csvfile.seek(0, os.SEEK_END)

for i, director_ in enumerate(get_directors_by_films(film_list), start=1):
if director_.base_url in directors:
print(f"Skipping director #{i}: {director_.name}")
sys.stdout.flush()
continue
print(f"Adding director #{i}: {director_.name}")
while True:
try:
if director_.base_url in directors:
print(f"Skipping director #{i}: {director_.name}")
sys.stdout.flush()
continue
print(f"Adding director #{i}: {director_.name}")
break
except Exception as e:
traceback.print_exception(e)
print(f"Retrying in {_SEC_WAIT_ON_FAIL} seconds...")
sys.stdout.flush()
time.sleep(_SEC_WAIT_ON_FAIL)
continue
sys.stdout.flush()
writer.writerow([director_.name, director_.base_url])
csvfile.flush()
48 changes: 48 additions & 0 deletions src/letsrolld/cmd/fetch_lb_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import argparse
import csv
import sys

from sqlalchemy.orm import sessionmaker

from letsrolld import db
from letsrolld.db import models
from letsrolld import lb_list


# TODO: move to common module, reuse everywhere
def is_known_film(film_):
    """Report whether the database already holds this film.

    A film counts as known when a ``models.Film`` row has ``lb_url`` equal to
    ``film_.url``. For known films a "Skipping known film" line is printed and
    stdout is flushed.

    Args:
        film_: object exposing a ``url`` attribute.

    Returns:
        True when a matching row is found, otherwise False.
    """
    # The context manager closes the session once the query is done; the
    # function is invoked per film, so unclosed sessions would pile up.
    # NOTE(review): db.create_engine() is still called on each invocation;
    # whether it caches the engine cannot be seen from this file.
    with sessionmaker(bind=db.create_engine())() as session:
        known = (
            session.query(models.Film)
            .filter(models.Film.lb_url == film_.url)
            .first()
            is not None
        )

    if known:
        print(f"Skipping known film: {film_.url}")
        sys.stdout.flush()
    return known


def _parse_cli():
    """Parse the command line and return the resulting namespace."""
    cli = argparse.ArgumentParser()
    cli.add_argument("-u", "--url", required=True, help="list URL")
    cli.add_argument("-o", "--output", required=True, help="output film list file")
    cli.add_argument(
        "-N",
        "--new-only",
        action="store_true",
        help="whether to filter out movies already present in db",
    )
    return cli.parse_args()


def main():
    """Dump the films of a list URL into a CSV file.

    Films are taken from ``lb_list.MovieList(url).films()``; with
    ``-N/--new-only`` those for which ``is_known_film`` is true are left out.
    The CSV (unix dialect) starts with a ``Name, Year, URL`` header and the
    file is flushed after each film row.
    """
    opts = _parse_cli()

    # Source of film entries for the requested list.
    entries = lb_list.MovieList(opts.url).films()

    if opts.new_only:
        # Still lazy: each entry is checked only when it is about to be written.
        entries = filter(lambda entry: not is_known_film(entry), entries)

    with open(opts.output, "w", newline="") as out:
        sink = csv.writer(out, dialect=csv.unix_dialect)
        sink.writerow(["Name", "Year", "URL"])

        for entry in entries:
            row = [entry.name, entry.year, entry.url]
            sink.writerow(row)
            out.flush()
18 changes: 16 additions & 2 deletions src/letsrolld/cmd/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time
import traceback

from sqlalchemy import func, select, or_, and_
from sqlalchemy import func, select, or_, and_, nullsfirst
from sqlalchemy.orm import sessionmaker

from letsrolld import db
Expand Down Expand Up @@ -53,6 +53,11 @@ def _seen_obj_query(model, seen):
return model.id.notin_(seen)


_MODEL_TO_ORDER_BY = {
models.Film: [models.Film.rating.desc()],
}


def get_obj_to_update(session, model, threshold, last_checked_field, seen, match):
return (
session.execute(
Expand All @@ -61,6 +66,9 @@ def get_obj_to_update(session, model, threshold, last_checked_field, seen, match
_get_obj_to_update_query(model, threshold, last_checked_field, match)
)
.filter(_seen_obj_query(model, seen))
.order_by(
nullsfirst(last_checked_field), *(_MODEL_TO_ORDER_BY.get(model, []))
)
.limit(1)
)
.scalars()
Expand Down Expand Up @@ -198,7 +206,12 @@ def film_threshold(f):
return multiplier

multiplier = max(0, _NOW.year - year(f)) + 1
return min(100, multiplier)
if f.rating < 3.0:
multiplier *= 2
if int(f.rating) != 0:
multiplier *= 2
multiplier = min(100, multiplier)
return multiplier


def offer_threshold(f):
Expand Down Expand Up @@ -315,6 +328,7 @@ def run_update(
n_objs = get_number_of_objs_to_update(
session, model, threshold, last_checked_field, match
)
print(f"Updating {model_name}s: {n_objs} to update")

i = 1
seen = set()
Expand Down
4 changes: 2 additions & 2 deletions src/letsrolld/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


def lines_to_skip(file_name):
with open(file_name, "r") as file:
with open(file_name) as file:
line = next(file)
if line.strip() == "Letterboxd list export v7":
return _SKIP_V7
Expand All @@ -14,7 +14,7 @@ def lines_to_skip(file_name):

def read_lines(file_name):
skip = lines_to_skip(file_name)
with open(file_name, "r") as file:
with open(file_name) as file:
while skip > 0:
next(file)
skip -= 1
Expand Down
5 changes: 4 additions & 1 deletion src/letsrolld/film.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ def offers(self):

@functools.cached_property
def genres(self):
return [] if self.jw is None else self.jw.genres
genres = [] if self.jw is None else self.jw.genres
if self.soup.find("span", class_="badge -adult"):
genres.append("adult")
return genres

@functools.cached_property
def genre_names(self):
Expand Down
4 changes: 1 addition & 3 deletions src/letsrolld/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import requests


_CACHE_INSTALLED = False

# TODO: use a library to fill these in
_HEADERS = {
"Content-Type": "application/json",
Expand All @@ -27,7 +25,7 @@ def enable_debug():


def get_url(url):
return requests.get(url).text
return requests.get(url, timeout=120).text


def get_json(url, data):
Expand Down
Loading

0 comments on commit 2fe3b1e

Please sign in to comment.