dataforgoodfr · Lokhia · May 29, 2022 · May 29, 2022 · May 29, 2022 · May 29, 2022
diff --git a/.gitignore b/.gitignore
@@ -129,4 +129,7 @@ dmypy.json
 .pyre/
 .vscode
 .DS_Store
-*.csv
+*.csv
+
+# IDE
+.idea/**
diff --git a/bechdelai/data/tmdb.py b/bechdelai/data/tmdb.py
@@ -3,6 +3,7 @@
 from os import environ
 
 import pandas as pd
+import numpy as np
 from dotenv import load_dotenv
 
 from bechdelai.data.scrap import get_json_from_url
@@ -37,6 +38,7 @@ class APIKeyNotSetInEnv(Exception):
 MOVIE_API_URL = f"{API_URL}/movie/{{movie_id}}?api_key={API_KEY}"
 CAST_API_URL = f"{API_URL}/movie/{{movie_id}}/credits?api_key={API_KEY}"
 PERSON_API_URL = f"{API_URL}/person/{{person_id}}?api_key={API_KEY}"
+PERSON_IMG_API_URL = f"{API_URL}/person/{{person_id}}/images?api_key={API_KEY}"
 SEARCH_IMDB_URL = (
     f"{API_URL}//find/tt{{imdb_id}}?api_key={API_KEY}&external_source=imdb_id"
 )
@@ -135,6 +137,18 @@ def get_person_details_from_id(person_id) -> dict:
 
     return get_json_from_url(url)
 
+def get_person_image_from_id(person_id) -> dict:
+    """Fetch the TMDB API images payload for one person
+
+    Parameters
+    ----------
+    person_id : str or int
+        Person id whose images are requested
+    """
+    images_url = PERSON_IMG_API_URL.format(person_id=str(person_id))
+    payload = get_json_from_url(images_url)
+    return payload
+
 def format_results_for_suggestion(search_res: dict) -> list:
     """Format search movie results for `show_movie_suggestions()`
 
@@ -224,3 +238,49 @@ def get_movies_from_ids(movie_ids: list) -> tuple:
     cast_df = pd.concat(cast_df)
 
     return movies_df, crew_df, cast_df
+
+def get_best_tmdb_id(title,release_year):
+    """
+    Get most probable TMDB id for movie title released in release year.
+    The release_date in TMDB may be different from the release_year given,
+    so the candidate with the closest release year is chosen.
+
+    Parameters
+    ----------
+    title : str
+        movie title
+    release_year : int or None
+        year the movie was released; when None, the first search result is
+        returned without any date comparison
+
+    Returns
+    -------
+    int or None
+        TMDB id, or None when the search returns nothing or none of the
+        inspected candidates has a usable release date
+
+    """
+    movie_candidates = search_movie_from_query(title)
+    if movie_candidates['total_results'] == 0 or not movie_candidates["results"]:
+        # Movie not found in TMDB with query
+        return None
+    results = movie_candidates["results"]
+
+    if release_year is None:
+        # No year to compare against: keep the first result of the search.
+        # (This branch used to read an undefined name and raise NameError.)
+        return results[0]['id']
+
+    best_id = None
+    smallest_gap = np.inf  # the `np.Inf` alias no longer exists in NumPy 2.0
+    # look at the 5 first matches to choose the one that was released closest to release_year
+    for candidate in results[:5]:
+        try:
+            candidate_year = int(candidate['release_date'][:4])
+        except (KeyError, TypeError, ValueError):
+            # release_date missing, null or empty: this candidate cannot be compared
+            continue
+        gap = abs(candidate_year - release_year)
+        if gap < smallest_gap:
+            best_id = candidate['id']
+            smallest_gap = gap
+        if gap == 0:
+            # exact year match: stop at the first one
+            break
+    return best_id
diff --git a/bechdelai/data/wikipedia.py b/bechdelai/data/wikipedia.py
@@ -5,8 +5,8 @@
 from bs4 import BeautifulSoup
 import re
 import outputformat as ouf
-import wikipediaapi
-from bechdelai.data.scrap import get_json_from_url
+# import wikipediaapi
+# from bechdelai.data.scrap import get_json_from_url
 
 def get_sections(query, lang="en"):
     """Return all sections and subsections in the page and their corresponding indexes

diff --git a/notebooks/age_gap/__init__.py b/notebooks/age_gap/__init__.py
diff --git a/notebooks/age_gap/age gap and relationships.ipynb b/notebooks/age_gap/age gap and relationships.ipynb
diff --git a/notebooks/age_gap/age_gap_automation.py b/notebooks/age_gap/age_gap_automation.py
@@ -0,0 +1,87 @@
+import sys
+sys.path.append("../..")
+import bechdelai.data.wikipedia as wiki
+import bechdelai.data.tmdb as tmdb
+import process_couples as pc
+import outputformat as ouf
+import pandas as pd
+from datetime import datetime
+import requests
+import io
+import spacy
+from spacy import displacy
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+from spacy.matcher import PhraseMatcher
+from pathlib import Path
+
+
+class Movie:
+    """A film together with its Wikipedia plot and its TMDB cast.
+
+    All attributes are fetched eagerly in the constructor, so creating a
+    Movie calls the `wiki`, `pc` and `tmdb` helpers.
+    """
+
+    def __init__(self, title, release_year=None):
+        self.title = title
+        self.release_year = release_year
+        self.plot = self.get_plot()
+        self.cast_wiki = self.get_cast_wiki()
+        # get_cast_tmdb() reads self.cast_wiki, so it must come last.
+        self.cast = self.get_cast_tmdb()
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __str__(self):
+        return "Film : {}".format(self.title)
+
+    def get_plot(self):
+        """Return the 'Plot' section text from Wikipedia, or None if no query succeeds."""
+        # Most specific page title first. The year suffix is only tried when a
+        # year is known; it used to produce the query "<title> (None film)".
+        query_suffixes = [' (film)', '']
+        if self.release_year is not None:
+            query_suffixes.insert(0, ' ('+str(self.release_year)+' film)')
+
+        for query_suffix in query_suffixes:
+            try:
+                return wiki.get_section_text(self.title+query_suffix, ['Plot'])['Plot']  # to improve
+            except ValueError:
+                continue
+        return None
+
+    def get_cast_wiki(self):
+        """Return the result of pc.get_cast_from_wikipedia for this title and year."""
+        return pc.get_cast_from_wikipedia(self.title,self.release_year)
+
+    def get_cast_tmdb(self):
+        """Return the TMDB cast as a DataFrame, corrected with the Wikipedia cast.
+
+        Double quotes are replaced by single quotes, accents are stripped from
+        the 'name' and 'character' columns, and an 'age_at_release' column is
+        added from pc.compute_cast_age.
+
+        Raises
+        ------
+        ValueError
+            If tmdb.get_best_tmdb_id finds no movie id for this title.
+        """
+        movie_id = tmdb.get_best_tmdb_id(self.title,self.release_year)
+        if movie_id is None or movie_id == '':
+            # Fail here with a clear message instead of querying TMDB with an empty id.
+            raise ValueError("No TMDB movie found for {!r}".format(self.title))
+
+        # get casting data
+        data = tmdb.get_movie_cast_from_id(movie_id)
+        tmdb_cast = pd.DataFrame(data["cast"])
+        cast_df = pc.correct_cast_with_wikipedia(tmdb_cast,self.cast_wiki)
+
+        # only use simple quotation marks
+        cast_df.replace(regex=r'\"',value="'",inplace=True)
+
+        # remove any accents
+        for column in ('name', 'character'):
+            cast_df[column] = cast_df[column].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
+
+        # get release date
+        release_date = tmdb.get_movie_details_from_id(movie_id)['release_date']
+        release_date = datetime.strptime(release_date, '%Y-%m-%d')
+        # complete with actors/actress ages
+        cast_df['age_at_release'] = pc.compute_cast_age(cast_df,release_date)
+
+        return cast_df
+
+def main():
+    """Run pc.compute_relationships_in_movie for three sample movies and save each result as CSV."""
+    verbs = ['kisses', 'sleeps with', 'goes on a date with', 'has sex with', 'marries', 'is in love with','is in couple with',
+            'is the father of', 'is the mother of','is a friend of', 'is in the family of', 'is the enemy of']
+    # (title, release year, output CSV), processed in this order
+    jobs = [
+        ("Harry Potter and the Goblet of Fire", 2005, 'hp4.csv'),
+        ("Call Me by Your Name", 2017, 'call_me.csv'),
+        ("The Big Lebowski", 1998, 'lebowski.csv'),
+    ]
+    for title, release_year, csv_name in jobs:
+        movie = Movie(title, release_year)
+        # Same (cast, plot, verbs) call for every movie; the first movie used to be passed as (movie, verbs).
+        ans = pc.compute_relationships_in_movie(movie.cast, movie.plot, verbs)
+        ans.to_csv(csv_name)
+    print(ans)  # only the last movie's result is printed
+
+
+if __name__ == "__main__":
+    main()
diff --git a/notebooks/age_gap/age_gap_visualisation.py b/notebooks/age_gap/age_gap_visualisation.py
@@ -0,0 +1,99 @@
+import sys
+sys.path.append("../..")
+from age_gap_automation import Movie
+import process_couples as pc
+import bechdelai.data.wikipedia as wiki
+import bechdelai.data.tmdb as tmdb
+
+import streamlit as st
+import pandas as pd
+import plotly.express as px
+
+# CSV file read by load_data_from_file() for each selectable movie title.
+MOVIE_FILES = {"Harry Potter and the Goblet of Fire": "hp4.csv", "Call me by your name": "call_me.csv",
+               "The Big Lebowski": "lebowski.csv", "Love Actually": "love_actually.csv"}
+# Release year passed to Movie() for each selectable movie title.
+MOVIE_YEARS = {"Harry Potter and the Goblet of Fire": 2005, "Call me by your name": 2017,
+               "The Big Lebowski": 1998, "Love Actually": 2003}
+
+# Statements that main() displays as romantic relationships.
+LOVE_VERBS = ['kisses', 'sleeps with', 'goes on a date with', 'has sex with', 'marries', 'is in love with','is in couple with']
+# Statements passed to pc.compute_relationships_in_movie by load_data(): the romantic ones plus parenthood.
+VERBS = LOVE_VERBS + ['is the father of', 'is the mother of']
+
+@st.cache
+def load_data_from_file(file):
+    """Read the CSV at `file` into a DataFrame (the call is wrapped in st.cache)."""
+    scores = pd.read_csv(file)
+    return scores
+def load_data(movie):
+    """Run pc.compute_relationships_in_movie on the movie's cast and plot with VERBS."""
+    cast, plot = movie.cast, movie.plot
+    return pc.compute_relationships_in_movie(cast, plot, VERBS)
+
+
+def _star_profile(cast, row, suffix):
+    """Collect the display data of one actor of a scores row; `suffix` is '1' or '2'."""
+    name = row['star' + suffix]
+    cast_rows = cast[cast.name == name]
+    # A person may have no picture on TMDB: keep None instead of failing on profiles[0].
+    profiles = tmdb.get_person_image_from_id(row['star_id' + suffix]).get("profiles") or []
+    return {'name': name,
+            'character': row['character' + suffix],
+            'age': cast_rows['age_at_release'].iloc[0],
+            'gender': cast_rows['gender'].iloc[0],
+            'image': profiles[0]["file_path"] if profiles else None}
+
+
+def main():
+    """Streamlit page: choose a movie and list its highest-scored romantic pairs with the actors' ages.
+
+    At most 10 pairs are shown, and listing stops at the first romantic row
+    whose score is below 0.7 (rows are sorted by decreasing score).
+    """
+    st.set_page_config(layout="wide")
+    title = st.selectbox("Choose a movie:",list(MOVIE_FILES.keys()))
+    st.title(title)
+    st.subheader('Romantic relationships')
+
+    movie = Movie(title,MOVIE_YEARS[title])
+    cast = movie.cast
+
+    try:
+        scores = load_data_from_file(MOVIE_FILES[title])
+    except FileNotFoundError:
+        # No precomputed CSV for this movie: compute the scores now.
+        with st.spinner('Wait for it...'):
+            scores = load_data(movie)
+
+    # Not in place: `scores` may be the object returned by the st.cache-wrapped
+    # loader, and that object should not be mutated.
+    scores = scores.sort_values('score',ascending=False)
+    scores = scores.drop_duplicates(['star1','star2'],keep='first') # TO DO: avoid duplicates when star1 and star2 are inversed
+
+    count=0
+    for i,row in scores.iterrows():
+
+        if row.question not in LOVE_VERBS:
+            continue
+
+        if count==10 or row.score<0.7:
+            break
+
+        star_younger = _star_profile(cast, row, '1')
+        star_older = _star_profile(cast, row, '2')
+        if star_younger['age'] > star_older['age']:
+            star_younger, star_older = star_older, star_younger
+
+        st.subheader('{} and {}'.format(star_younger['character'], star_older['character']))
+        st.write('They were played by {} and {} respectively. '.format(star_younger['name'], star_older['name']))
+        st.write('Age gap: ' ,row.age_gap)
+
+        col1, col2, col3,col4,col5 = st.columns([1.5,5,1.5,2,10])
+        if star_younger['image']:
+            col1.image('https://image.tmdb.org/t/p/original'+star_younger['image'],width=100)
+
+        # Read-only slider used as an age-range display.
+        # NOTE(review): the bounds are fixed at 10-50; confirm what happens for ages outside that range.
+        col2.slider(
+             '',
+             10, 50,
+             (star_younger['age'], star_older['age']),
+             disabled=True, key = "slider_"+str(i))
+
+        if star_older['image']:
+            col3.image('https://image.tmdb.org/t/p/original'+star_older['image'],width=100)
+
+        # relationship_true = col4.radio('Is this relationship true?', ['Yes', 'No'],key = "radio_"+str(i))
+        count+=1
+
+
+if __name__ == "__main__":
+    main()
-Original file line number
+Diff line change
@@ Expand Up / @@ -129,4 +129,7 @@ dmypy.json @@
     .pyre/
     .vscode
     .DS_Store
-    *.csv
+    *.csv
+    # IDE
+    .idea/**