Commit abf0d1b

Merge branch 'main' into scorreia/poc_analysis_notebook

kaaloo committed Apr 16, 2024
2 parents 3467902 + 8f31055 commit abf0d1b

Showing 17 changed files with 634 additions and 435 deletions.
18 changes: 9 additions & 9 deletions .pre-commit-config.yaml
@@ -1,20 +1,20 @@
repos:
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    # Ruff version.
    rev: 'v0.0.254'
    rev: "v0.0.254"
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
        language_version: python3
  # - repo: https://github.com/psf/black
  #   rev: 22.3.0
  #   hooks:
  #     - id: black
  #       language_version: python3
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: check-merge-conflict
      - id: mixed-line-ending
  #- repo: https://github.com/pycqa/bandit
  #  rev: 1.7.4
  #  hooks:
@@ -23,4 +23,4 @@ repos:
  - repo: https://github.com/Lucas-C/pre-commit-hooks-safety
    rev: v1.3.1
    hooks:
      - id: python-safety-dependencies-check
25 changes: 15 additions & 10 deletions README.md
@@ -1,6 +1,4 @@
Observatoire des imaginaires
================

# Observatoire des imaginaires

## Installing with poetry

@@ -65,7 +63,7 @@ pip install poetry
jupyter notebook
```

## Download datasets from Kaggle

If you want to use Kaggle to download the datasets, make sure your API credentials are stored in ~/.kaggle/kaggle.json.
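
If you don't have that file yet, here is a minimal sketch for creating it (the username and key values are placeholders for your own Kaggle API token, not real credentials):

```python
# Sketch: create ~/.kaggle/kaggle.json with placeholder credentials.
# Replace the values with the token downloaded from your Kaggle account page.
import json
from pathlib import Path

kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(exist_ok=True)
(kaggle_dir / "kaggle.json").write_text(
    json.dumps({"username": "your-kaggle-username", "key": "your-api-key"})
)
(kaggle_dir / "kaggle.json").chmod(0o600)  # the kaggle CLI warns if the file is world-readable
```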

@@ -78,24 +76,26 @@ make download-tmdb-movies-dataset
make download-full-tmdb-tv-shows-dataset
```


Alternatively, you can download the datasets directly from the Kaggle website:

- [tmdb-movies-dataset](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies)
- [full-tmdb-tv-shows-dataset](https://www.kaggle.com/datasets/asaniczka/full-tmdb-tv-shows-dataset-2023-150k-shows)

## Website to select a specific movie or TV show

The [site-observable](https://github.com/dataforgoodfr/12_observatoire_des_imaginaires/tree/main/site-observable) directory contains
an Observable Framework site that collects film and TV show data from the Kaggle datasets above and filters it according
to the following rules, in order to reduce the size of the data shipped with the generated web site. The site provides a search UI
that lets a user select a specific movie or TV show; clicking the link for a selection kicks off the questionnaire on Tally.
The site is intended to be embedded in an iframe in the main Observatoire des Imaginaires web site.

Movies:

- filter out adult movies
- filter out movies released more than two years ago

TV Shows:

- filter out adult shows
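
A condensed sketch of these rules in pandas, assuming TMDB column names such as `adult`, `release_date` and `first_air_date` (the data loaders under `site-observable/docs/data/` in this commit are the authoritative implementation):

```python
# Condensed sketch of the filtering rules above; column names are assumed
# from the TMDB datasets, and the data loaders in this commit remain the
# source of truth.
import pandas as pd


def filter_movies(movies: pd.DataFrame) -> pd.DataFrame:
    cutoff = pd.Timestamp.now() - pd.DateOffset(years=2)
    movies = movies[movies["adult"] == False]  # noqa: E712
    return movies[movies["release_date"] >= cutoff]


def filter_shows(shows: pd.DataFrame) -> pd.DataFrame:
    return shows[shows["adult"] == False]  # noqa: E712
```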

The web site is currently hosted on the [Observable hosting platform](https://observablehq.com/) and is available at the following URL:
@@ -106,10 +106,15 @@ https://observatoire-des-imaginaires.observablehq.cloud/questionnaire

[Install precommits](https://pre-commit.com/)


pre-commit run --all-files

## Use Tox to test your code

tox -vv

## Tasks

This repo includes invoke for Pythonic task execution. To see the
list of available tasks you can run:

invoke -l
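
For reference, a hypothetical task definition showing the invoke API this relies on (a sketch, not the repo's actual tasks.py):

```python
# Hypothetical tasks.py sketch; the task name and the make targets it wraps
# are assumptions for illustration, not the repo's actual task definitions.
from invoke import task


@task
def download_datasets(c):
    """Download the TMDB datasets from Kaggle."""
    c.run("make download-tmdb-movies-dataset")
    c.run("make download-full-tmdb-tv-shows-dataset")
```

With a file like this in place, `invoke -l` lists the task (invoke converts underscores to dashes, so it appears as `download-datasets`) and `invoke download-datasets` runs it.
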
44 changes: 13 additions & 31 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -24,6 +24,7 @@ pre-commit = "^2.20.0"
pytest = "^7.2.0"
# ruff = "^0.0.254"
tox = "^4.4.8"
invoke = "^2.2.0"

[tool.ruff]
select = [
86 changes: 86 additions & 0 deletions site-observable/docs/data/films.sqlite.py
@@ -0,0 +1,86 @@
import os
import sqlite3
import tempfile
from datetime import datetime

import pandas as pd

with tempfile.TemporaryDirectory() as temp_dir:
    os.chdir(temp_dir)

    os.system(
        "kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies >&2",
    )
    os.system("unzip tmdb-movies-dataset-2023-930k-movies.zip >&2")

    df = pd.read_csv("TMDB_movie_dataset_v11.csv", parse_dates=["release_date"])

    # Remove adult movies
    df = df[df["adult"] == False]  # noqa: E712

    # Remove documentaries
    df = df[df["genres"].str.contains("Documentary") == False]  # noqa: E712

    # Remove movies with a future release date
    now = datetime.now()
    df = df[df["release_date"] < now]

    # Remove movies with no known revenue
    # and original_language other than EU languages
    df = df[
        (df["revenue"] == 0)
        & (
            df["original_language"].isin(
                [
                    "cs",
                    "da",
                    "de",
                    "en",
                    "es",
                    "et",
                    "fi",
                    "fr",
                    "hr",
                    "hu",
                    "is",
                    "it",
                    "lt",
                    "lv",
                    "nl",
                    "no",
                    "pl",
                    "pt",
                    "ro",
                    "sl",
                    "sv",
                ],
            )
        )
        | (df["revenue"] > 0)
    ]

    # Add a column with the production_year based on the release_date
    df["production_year"] = df["release_date"].dt.year

    # Select the columns we want
    df = df[
        [
            "id",
            "title",
            "original_title",
            "production_year",
            "poster_path",
        ]
    ]

    # Set original title to blank string if same as title
    df["original_title"] = df["original_title"].where(
        df["title"] != df["original_title"],
        "",
    )

    # Save the dataframe to a SQLite database
    with tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) as temp_file:
        temp_filename = temp_file.name
        with sqlite3.connect(temp_filename) as conn:
            df.to_sql("films", conn, index=False)

    # Print db file to stdout
    os.system(f"cat {temp_filename}")
50 changes: 0 additions & 50 deletions site-observable/docs/data/movies.sqlite.py

This file was deleted.

18 changes: 16 additions & 2 deletions site-observable/docs/data/shows.sqlite.py
@@ -1,6 +1,7 @@
import os
import sqlite3
import tempfile
from datetime import datetime

import pandas as pd

@@ -12,13 +13,26 @@
    )
    os.system("unzip full-tmdb-tv-shows-dataset-2023-150k-shows.zip >&2")

    df = pd.read_csv("TMDB_tv_dataset_v3.csv")
    df = pd.read_csv("TMDB_tv_dataset_v3.csv", parse_dates=["first_air_date"])

    # Remove adult shows
    df = df[df["adult"] == False]  # noqa: E712

    # Remove documentaries
    df = df[df["genres"].str.contains("Documentary") == False]  # noqa: E712

    # Remove shows with a future first air date or no first air date
    now = datetime.now()
    df = df[df["first_air_date"] < now]

    # Select the columns we want
    df = df[["id", "name", "original_name", "production_countries"]]
    df = df[["id", "name", "original_name", "poster_path"]]

    # Set original name to blank string if same as name
    df["original_name"] = df["original_name"].where(
        df["name"] != df["original_name"],
        "",
    )

    # Save the dataframe to a SQLite database
    with tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) as temp_file: