Skip to content

Commit

Permalink
Fix date issues
Browse files (browse the repository at this point in the history)
  • Loading branch information
kaaloo committed Apr 29, 2024
1 parent 7c67fc0 commit acd2201
Showing 1 changed file with 62 additions and 69 deletions.
131 changes: 62 additions & 69 deletions observable/src/data/films.sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,84 +3,77 @@
import tempfile
from datetime import datetime

import pandas as pd
from observatoire.tmdb.movies.hf import load_movies_dataset

with tempfile.TemporaryDirectory() as temp_dir:
os.chdir(temp_dir)
# Load the dataset
df = load_movies_dataset()

os.system(
"kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies >&2",
)
os.system("unzip tmdb-movies-dataset-2023-930k-movies.zip >&2")

df = pd.read_csv("TMDB_movie_dataset_v11.csv", parse_dates=["release_date"])

# Remove adult movies
df = df[df["adult"] == False] # noqa: E712
# Remove adult movies
df = df[df["adult"] == False] # noqa: E712

# Remove documentaries
df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712
# Remove documentaries
df = df[df["genres"].str.contains("Documentary") == False] # noqa: E712

# Remove movies with a future release date
now = datetime.now()
df = df[df["release_date"] < now]
# Remove movies with a future release date
now = datetime.now().strftime("%Y-%m-%d")
df = df[df["release_date"] < now]

# Keep movies with known revenue (revenue > 0); movies with no known revenue
# (revenue == 0) are kept only if original_language is in the list below
df = df[
(df["revenue"] == 0)
& (
df["original_language"].isin(
[
"cs",
"da",
"de",
"en",
"es",
"et",
"fi",
"fr",
"hr",
"hu",
"is",
"it",
"lt",
"lv",
"nl",
"no",
"pl",
"pt",
"ro",
"sl",
"sv",
],
)
# Keep movies with known revenue (revenue > 0); movies with no known revenue
# (revenue == 0) are kept only if original_language is in the list below
df = df[
(df["revenue"] == 0)
& (
df["original_language"].isin(
[
"cs",
"da",
"de",
"en",
"es",
"et",
"fi",
"fr",
"hr",
"hu",
"is",
"it",
"lt",
"lv",
"nl",
"no",
"pl",
"pt",
"ro",
"sl",
"sv",
],
)
| (df["revenue"] > 0)
]
)
| (df["revenue"] > 0)
]

# Add a column with the production_year based on the release_date
df["production_year"] = df["release_date"].dt.year
# Add a column with the production_year based on the release_date
df["production_year"] = df["release_date"].str[:4]

# Select the columns we want
df = df[
[
"id",
"title",
"original_title",
"production_year",
"poster_path",
]
# Select the columns we want
df = df[
[
"id",
"title",
"original_title",
"production_year",
"poster_path",
]
]

# Set original title to blank string if same as title
df["original_title"] = df["original_title"].where(df["title"] != df["original_title"], "")
# Set original title to blank string if same as title
df["original_title"] = df["original_title"].where(df["title"] != df["original_title"], "")

# Save the dataframe to a SQLite database
with tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) as temp_file:
temp_filename = temp_file.name
with sqlite3.connect(temp_filename) as conn:
df.to_sql("films", conn, index=False)
# Save the dataframe to a SQLite database
with tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) as temp_file:
temp_filename = temp_file.name
with sqlite3.connect(temp_filename) as conn:
df.to_sql("films", conn, index=False)

# Print db file to stdout
os.system(f"cat {temp_filename}")
# Print db file to stdout
os.system(f"cat {temp_filename}")

0 comments on commit acd2201

Please sign in to comment.