From acd22018e952c1e93adfaf000e4b965e1a0508e0 Mon Sep 17 00:00:00 2001 From: Luis Arias Date: Mon, 29 Apr 2024 10:27:08 +0200 Subject: [PATCH] Fix date issues --- observable/src/data/films.sqlite.py | 131 +++++++++++++--------------- 1 file changed, 62 insertions(+), 69 deletions(-) diff --git a/observable/src/data/films.sqlite.py b/observable/src/data/films.sqlite.py index ea4c860..44a86ef 100755 --- a/observable/src/data/films.sqlite.py +++ b/observable/src/data/films.sqlite.py @@ -3,84 +3,77 @@ import tempfile from datetime import datetime -import pandas as pd +from observatoire.tmdb.movies.hf import load_movies_dataset -with tempfile.TemporaryDirectory() as temp_dir: - os.chdir(temp_dir) +# Load the dataset +df = load_movies_dataset() - os.system( - "kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies >&2", - ) - os.system("unzip tmdb-movies-dataset-2023-930k-movies.zip >&2") - - df = pd.read_csv("TMDB_movie_dataset_v11.csv", parse_dates=["release_date"]) - - # Remove adult movies - df = df[df["adult"] == False] # noqa: E712 +# Remove adult movies +df = df[df["adult"] == False] # noqa: E712 - # Remove documentaries - df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712 +# Remove documentaries +df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712 - # Remove movies with a future release date - now = datetime.now() - df = df[df["release_date"] < now] +# Remove movies with a future release date +now = datetime.now().strftime("%Y-%m-%d") +df = df[df["release_date"] < now] - # Remove movies with no known revenue - # and original_language other than EU languages - df = df[ - (df["revenue"] == 0) - & ( - df["original_language"].isin( - [ - "cs", - "da", - "de", - "en", - "es", - "et", - "fi", - "fr", - "hr", - "hu", - "is", - "it", - "lt", - "lv", - "nl", - "no", - "pl", - "pt", - "ro", - "sl", - "sv", - ], - ) +# Remove movies with no known revenue +# and original_language other than EU languages +df = df[ + (df["revenue"] == 0) + & ( + df["original_language"].isin( + [ + "cs", + "da", + "de", + "en", + "es", + "et", + "fi", + "fr", + "hr", + "hu", + "is", + "it", + "lt", + "lv", + "nl", + "no", + "pl", + "pt", + "ro", + "sl", + "sv", + ], ) - | (df["revenue"] > 0) - ] + ) + | (df["revenue"] > 0) +] - # Add a column with the production_year based on the release_date - df["production_year"] = df["release_date"].dt.year +# Add a column with the production_year based on the release_date +df["production_year"] = df["release_date"].str[:4] - # Select the columns we want - df = df[ - [ - "id", - "title", - "original_title", - "production_year", - "poster_path", - ] +# Select the columns we want +df = df[ + [ + "id", + "title", + "original_title", + "production_year", + "poster_path", ] +] - # Set original title to blank string if same as title - df["original_title"] = df["original_title"].where(df["title"] != df["original_title"], "") +# Set original title to blank string if same as title +df["original_title"] = df["original_title"].where(df["title"] != df["original_title"], "") - # Save the dataframe to a SQLite database - with tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) as temp_file: - temp_filename = temp_file.name - with sqlite3.connect(temp_filename) as conn: - df.to_sql("films", conn, index=False) +# Save the dataframe to a SQLite database +with tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) as temp_file: + temp_filename = temp_file.name + with sqlite3.connect(temp_filename) as conn: + df.to_sql("films", conn, index=False) - # Print db file to stdout - os.system(f"cat {temp_filename}") +# Print db file to stdout +os.system(f"cat {temp_filename}")