diff --git a/.gitignore b/.gitignore
index 9f76b71..d379779 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,5 +161,5 @@ cython_debug/
# Precommit hooks: ruff cache
.ruff_cache
-
+.DS_Store
data/
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 80d7fab..510b0cc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
# Ruff version.
- rev: "v0.0.254"
+ rev: "v0.3.7"
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
diff --git a/app/README.md b/app/README.md
new file mode 100644
index 0000000..ae102df
--- /dev/null
+++ b/app/README.md
@@ -0,0 +1,5 @@
+Pour exécuter l'application Streamlit, lancer la commande suivante depuis ce répertoire :
+ `streamlit run odi_streamlit.py`
+
+Note:
+L'application streamlit `analyse_app_OLD.py` sera à supprimer.
diff --git a/app/analyse_app_OLD.py b/app/analyse_app_OLD.py
new file mode 100644
index 0000000..dce7f36
--- /dev/null
+++ b/app/analyse_app_OLD.py
@@ -0,0 +1,63 @@
+"""
+# My first app with Streamlit https://streamlit.io/
+Here's our first attempt at using data to create a table:
+"""
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import streamlit as st
+
+st.title("Observatoire des Imaginaires")
+st.divider()
+st.header("Fait par la dream team _Analyse de données_")
+st.write(
+ (
+ "Cette application analyse les données du PoC. On peut se faire plaisir"
+ " en y ajoutant tous les graphiques nécessaires. "
+ "Le code est à nettoyer pour une meilleure maintenance ;-) "
+ ),
+)
+
+
+st.container()
+st.header("Aperçu des données")
+# Load the data
+file_path = "../data/Analyse réponses.xlsx - Treated data.csv"
+
+# ne pas lire la première ligne
+data = pd.read_csv(file_path, skiprows=1)
+
+# Supprimer les lignes où la première colonne contient "Contenu XXX"
+# XXX est un nombre
+# Et Supprimer les lignes où toutes les valeurs sont NaN
+df = data[~data["TITRE"].str.contains(r"Contenu \d+", na=False)].dropna(how="all")
+
+# ne conserver qu'une ligne sur 4 (ce qui revient à supprimer les informations
+# des personnages 2, 3, 4 quand ils existent)
+df_truncated = df.iloc[::4]
+# Nettoyage du data set
+
+# mettre les titres en majuscule
+df_truncated["TITRE"] = df_truncated["TITRE"].str.upper()
+
+### Convertir les types de données correctement ici
+# Convertir les années en entier
+annee = "ANNEE"
+df_truncated[annee] = pd.to_numeric(df_truncated[annee], errors="coerce").fillna(0).astype(int)
+# Trouver les titres qui apparaissent plus de 4 fois dans la colonne "TITRE"
+# (car chaque titre a 4 lignes, une pour chaque personnage)
+titles_more_than_once = df_truncated["TITRE"].value_counts()
+titles_more_than_once = titles_more_than_once[titles_more_than_once > 1]
+
+# Afficher un bar chart des titres les plus fréquents
+# Affichage d'un bar chart horizontal
+
+
+st.header("Films les plus fréquents")
+# Création du graphique
+fig, ax = plt.subplots()
+t = titles_more_than_once.sort_values(ascending=True)
+t.plot(kind="barh", color="skyblue", ax=ax)
+ax.set_xlabel("Nb")
+ax.set_title("Fréquence des films/séries")
+st.pyplot(fig)
diff --git a/app/odi_streamlit.py b/app/odi_streamlit.py
new file mode 100644
index 0000000..6efc676
--- /dev/null
+++ b/app/odi_streamlit.py
@@ -0,0 +1,354 @@
+"""
+# My first app with Streamlit https://streamlit.io/
+Here's our first attempt at using data to create a table:
+"""
+
+
+# Export fichier
+
+# Datavisualisation
+
+import pandas as pd
+import plotly.express as px
+
+# O.Importation des librairies nécessaires pour le script
+# Core Pkgs - Web application
+import streamlit as st
+
+# Other Pkgs
+
+# 3.Setup de l'application Streamlit - Streamlit webpage properties / set up the app
+# with wide view preset and a title
+st.set_page_config(
+ page_title="Observatoire des Imaginaires",
+ page_icon="herb",
+ layout="wide",
+)
+
+
+@st.cache_data # 👈 Add the caching decorator
+def load_data(file: str) -> pd.DataFrame:
+ df = pd.read_csv(file, skiprows=1)
+ return df
+
+
+# Load the data
+file_path = "../data/AnalyseReponsesTreatedData.csv" #'../data/Analyse réponses.xlsx - Treated data.csv' # noqa: E501
+# load_data skips the first line of the file (skiprows=1)
+data = load_data(file_path)
+
+
+# if 'df' not in st.session_state:
+# if 'dico' not in st.session_state:
+# if 'liste_ean' not in st.session_state:
+# if 'select_editeur' not in st.session_state:
+# if 'liste_ouvrage' not in st.session_state:
+#
+## 1. Classe lancée si choix de "rentrée littéraire" dans le menu en sidebar
+# class InfoRentreeLitt():
+# def get_data(df,dico):
+
+
+### A. Sidebar
+with st.sidebar:
+ st.image(
+ "https://media.licdn.com/dms/image/D4E0BAQEZHVBxFn3OXQ/company-logo_200_200/0/1697116934909/cercle_thmatique_culture_the_shifters_logo?e=1718841600&v=beta&t=_2DWaEBrblIgXhgVASUipHTcJesOL6s1Sk2uH73Kx58",
+ use_column_width=True,
+ ) # width=50
+
+ st.title("Fait par la dream team _Analyse de données_")
+ st.write(
+ (
+ "Cette application analyse les données du PoC. On peut se faire plaisir"
+ " en y ajoutant tous les graphiques nécessaires. "
+ "Le code est à nettoyer pour une meilleure maintenance ;-) "
+ ),
+ )
+
+
+### B. Header container
+header = st.container()
+header.title("Observatoire des Imaginaires")
+header.write("""
""", unsafe_allow_html=True)
+
+### Custom CSS for the sticky header #74d1b4
+# st.markdown(
+# """
+#
+# """,
+
+### C. Metrics container
+# Created here and filled further down through ``with cont_metric:``.
+cont_metric = st.container(border=True) # border = True
+
+
+# Supprimer les lignes où la première colonne contient "Contenu XXX"
+# XXX est un nombre
+# Et Supprimer les lignes où toutes les valeurs sont NaN
+df = data[~data["TITRE"].str.contains(r"Contenu \d+", na=False)].dropna(how="all")
+
+# ne conserver qu'une ligne sur 4 (ce qui revient à supprimer
+# les informations des personnages 2, 3, 4 quand ils existent)
+df_truncated = df.iloc[::4]
+
+# Nettoyage du data set
+
+# mettre les titres en majuscule
+df_truncated["TITRE"] = df_truncated["TITRE"].str.upper()
+# mettre les pays en majuscule et supprimer les espaces au début et à la fin
+df_truncated["PAYS"] = df_truncated["PAYS"].str.strip().str.upper()
+df_truncated["PAYS"] = df_truncated["PAYS"].apply(lambda p: p.replace(" ET ", ";"))
+df_truncated.insert(
+ 3,
+ "pays_rework",
+ [
+ pays if len(pays.split(";")) == 1 else "INTERNATIONAL"
+ for pays in df_truncated["PAYS"]
+ ],
+)
+
+### Convertir les types de données correctement ici
+# Convertir les années en entier
+annee = "ANNEE"
+df_truncated[annee] = (
+ pd.to_numeric(df_truncated[annee], errors="coerce").fillna(0).astype(int)
+)
+
+with cont_metric:
+ with st.expander("Aperçu des donnéess"):
+ st.dataframe(df_truncated)
+
+ ### A. Affichage des métriques macro
+ col_nb_livre, col_nb_editeur, col_nb_prem_roman = st.columns([2, 2, 2])
+ with col_nb_livre:
+ # Metric nb Ouvrages
+ st.metric(label="Oeuvres analysées", value=len(set(df_truncated["TITRE"])))
+ with col_nb_editeur:
+ # Metric nb Editeurs
+ st.metric(
+ label="Films",
+ value=len(set(df_truncated[df_truncated.TYPE == "FILM"]["TITRE"])),
+ )
+ with col_nb_prem_roman:
+ # Metric Premier Roman
+ st.metric(
+ label="Séries",
+ value=len(set(df_truncated[df_truncated.TYPE == "SÉRIE"]["TITRE"])),
+ )
+
+ st.write()
+ st.write(
+ f":blue[{round(100*len(set(df_truncated[df_truncated.TYPE == 'FILM']['TITRE']))/len(set(df_truncated['TITRE'])),2)}%] des contenus renseignés sont des films vs :blue[{round(100*len(set(df_truncated[df_truncated.TYPE == 'SÉRIE']['TITRE']))/len(set(df_truncated['TITRE'])),2)}%] des séries.", # noqa: E501
+ )
+
+# Trouver les titres qui apparaissent plus de 4 fois dans la colonne "TITRE"
+# (car chaque titre a 4 lignes, une pour chaque personnage)
+
+
+titles_more_than_once = (
+ df_truncated.groupby(["TITRE", "TYPE"]).agg(compte=("TITRE", "count")).reset_index()
+)
+titles_more_than_once = titles_more_than_once[titles_more_than_once["compte"] > 1]
+
+
+# Afficher un bar chart des titres les plus fréquents
+# Affichage d'un bar chart horizontal
+
+with st.container(border=True):
+ st.header("Films les plus fréquents")
+ col_freq_film_select, col_freq_film_vide, col_freq_film_graph = st.columns(
+ [2, 0.5, 5],
+ )
+ with col_freq_film_select:
+ type_choice = st.selectbox(
+ "Choisir un type",
+ titles_more_than_once["TYPE"].unique(),
+ index=None,
+ )
+ with col_freq_film_graph:
+ if type_choice == "FILM":
+ t = titles_more_than_once.loc[
+ titles_more_than_once["TYPE"] == "FILM"
+ ].sort_values(
+ by="compte",
+ ascending=True,
+ )
+ elif type_choice == "SÉRIE":
+ t = titles_more_than_once.loc[
+ titles_more_than_once["TYPE"] == "SÉRIE"
+ ].sort_values(
+ by="compte",
+ ascending=True,
+ )
+ else:
+ t = titles_more_than_once.sort_values(by="compte", ascending=True)
+
+ st.bar_chart(t, x="TITRE", y="compte")
+
+# Types de contenus et pays d'origine
+with st.container(border=True):
+ st.header("Types de contenus")
+ col_contenu_date, col_contenu_vide, col_contenu_graph = st.columns([4, 0.5, 4])
+
+ with col_contenu_date:
+ date_group_df = (
+ df_truncated.groupby("ANNEE")
+ .count()
+ .reset_index()[["ANNEE", "TITRE"]]
+ .rename(columns={"TITRE": "nb_titre"})
+ )
+ date_group_df["periode_percent"] = 100 * (
+ 1 - (date_group_df.nb_titre.cumsum() / date_group_df.nb_titre.sum())
+ )
+
+ date_min = str(df_truncated.ANNEE.min())
+ date_max = str(df_truncated.ANNEE.max())
+ date_pareto = (
+ date_group_df[date_group_df["periode_percent"] <= 80]["ANNEE"].min() # noqa: PLR2004
+ )
+ date_value_pareto = int(
+ round(
+ date_group_df[date_group_df["periode_percent"] <= 80][ # noqa: PLR2004
+ "periode_percent"
+ ].max(),
+ 0,
+ ),
+ )
+
+ st.markdown(
+ (
+ f"Les contenus datent d`une période qui s`étend de {date_min}"
+ f" à {date_max}. {date_value_pareto}% des contenus sont postérieurs à"
+ f" {date_pareto}."
+ ),
+ )
+
+ st.bar_chart(date_group_df, x="ANNEE", y="nb_titre")
+
+ with col_contenu_graph:
+ country_group_df = df_truncated
+ country_group_df = (
+ country_group_df.groupby("pays_rework")
+ .count()
+ .reset_index()[["pays_rework", "TITRE"]]
+ .rename(columns={"TITRE": "nb_titre"})
+ .sort_values("nb_titre")
+ )
+ country_group_df["country_percent_cumul"] = round(
+ 100
+ * (
+ 1
+ - (country_group_df.nb_titre.cumsum() / country_group_df.nb_titre.sum())
+ ),
+ 0,
+ )
+ country_group_df["country_percent"] = round(
+ 100 * (country_group_df.nb_titre / country_group_df.nb_titre.sum()),
+ 2,
+ )
+
+ country_value_pareto = int(
+ round(
+ country_group_df[country_group_df["country_percent"] >= 10][ # noqa: PLR2004
+ "country_percent"
+ ].sum(),
+ 2,
+ ),
+ )
+ country_group_df_pareto = country_group_df[
+ country_group_df["country_percent"] >= 10 # noqa: PLR2004
+ ][["pays_rework", "country_percent"]].sort_values(
+ "country_percent",
+ ascending=False,
+ )
+
+ st.write(
+ f"A **:blue[{country_value_pareto}%]**, les 2 principaux pays dont les contenus sont les plus visionnés sont : {country_group_df.nlargest(2,'country_percent').reset_index(drop=True)['pays_rework'][0].capitalize()} ({country_group_df.nlargest(2,'country_percent').reset_index(drop=True)['country_percent'][0]}%) et {country_group_df.nlargest(2,'country_percent').reset_index(drop=True)['pays_rework'][1].capitalize()} ({country_group_df.nlargest(2,'country_percent').reset_index(drop=True)['country_percent'][1]}%).", # noqa: E501
+ )
+
+ fig_type = px.bar(
+ country_group_df,
+ y="pays_rework",
+ x="nb_titre",
+ orientation="h",
+ text_auto=True,
+ )
+ st.plotly_chart(fig_type, use_container_width=True)
+
+# LIEUX VISIONNAGE
+with st.container(border=True):
+ canal_group_df = (
+ df_truncated.groupby("CANAL")
+ .count()[["TITRE"]]
+ .rename(columns={"TITRE": "nb_titre"})
+ .sort_values("nb_titre", ascending=False)
+ )
+ canal_group_df["canal_percent"] = 100 * (
+ canal_group_df.nb_titre / canal_group_df.nb_titre.sum()
+ )
+ canal_country_group_df = (
+ df_truncated.groupby(["CANAL", "pays_rework"])
+ .count()[["TITRE"]]
+ .rename(columns={"TITRE": "nb_titre"})
+ .sort_values("nb_titre", ascending=False)
+ .reset_index()
+ )
+
+ col_text_canal, col_table_canal = st.columns([5, 3])
+ with col_text_canal:
+ canal_visionne1 = canal_group_df.nb_titre.nlargest(2).reset_index()["CANAL"][0]
+ percent_canal_visionne1 = round(canal_group_df.canal_percent.nlargest(2)[0], 2)
+ canal_visionne2 = canal_group_df.nb_titre.nlargest(2).reset_index()["CANAL"][1]
+ percent_canal_visionne2 = round(canal_group_df.canal_percent.nlargest(2)[1], 2)
+
+ st.markdown(
+ f"Les contenus sont visionnés principalement sur :blue[{canal_visionne1.capitalize()}] (:blue[{percent_canal_visionne1}%]) et :blue[{canal_visionne2.capitalize()}] (:blue[{percent_canal_visionne2}%]).\n\n La majorité des contenus visionnés sur :blue[{canal_visionne1.capitalize()}] ont pour pays d'origine :blue[{canal_country_group_df[canal_country_group_df['CANAL']==canal_visionne1].nlargest(1,'nb_titre').reset_index()['pays_rework'][0]}] (:blue[%]), alors que la majorité des contenus français sont visionnés xxx (xxx%).\n\n :blue[{round(canal_group_df.loc['Autre','canal_percent'],2)}%] des contenus sont visionnés sur un canal `Autre` que la liste proposée (cf ci-contre)", # noqa: E501
+ )
+
+ # Les contenus sont visionnés principalement sur Netflix (29.91%) ou dans
+ # une salle de cinéma (28.97%). La majorité des contenus américains sont
+ # visionnés sur Netflix (40.48% des contenus US), alors que la majorité
+ # des contenus français sont visionnés au cinéma (44.19%).
+ # 23.36% des contenus sont visionnés sur un canal `autre`
+ # que la liste proposée (cf ci-dessous)
+
+ with col_table_canal:
+ st.markdown(set(canal_group_df.reset_index().CANAL))
+
+st.subheader("EPOQUE DE RECITS")
+st.write(set(df_truncated["EPOQUE DU RECIT"]))
+
+st.subheader("TYPE DE MONDE")
+st.write(set(df_truncated["TYPE DE MONDE"]))
+st.dataframe(
+ df_truncated[["TITRE", "TRAITEMENT DU RECIT", "TYPE DE MONDE"]]
+ .groupby(["TRAITEMENT DU RECIT", "TYPE DE MONDE"])
+ .count(),
+)
+
+#
+# with st.container():
+
+#
+# fig.add_trace(go.Pie(labels=gender_viz_crew.gender_text, values=gender_viz_crew.nb_by_genre, name="Equipe tech"), # noqa: E501
+# 1, 2)
+#
+#
+#
+#
+# fig.update_layout(width = 400,
+# margin=dict(t=0, b=0, l=0, r=0),)
+# x=1),
+# Add annotations in the center of the donut pies.
+# annotations=[dict(text='Casting', x=0.15, y=0.5, font_size=20, showarrow=False),
+# dict(text='Crew', x=0.83, y=0.5, font_size=20, showarrow=False)])
+#
+#
+# data=[go.Bar(x=country_group_df.pays_rework ,
+# y=country_group_df.nb_titre)])
+#
+# with colinfofilm.expander("Table de données"):
diff --git a/notebooks/analyse-poc-treated-data.ipynb b/notebooks/analyse-poc-treated-data.ipynb
new file mode 100644
index 0000000..3af52f9
--- /dev/null
+++ b/notebooks/analyse-poc-treated-data.ipynb
@@ -0,0 +1,735 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "title: \"Analyse des résultats (PoC V1)\"\n",
+ "format: html\n",
+ "---\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/s4/6jbnvfsj1v10f2txtlkndj5w0000gp/T/ipykernel_38954/2759663841.py:8: DeprecationWarning: \n",
+ "Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
+ "(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
+ "but was not found to be installed on your system.\n",
+ "If this would cause problems for you,\n",
+ "please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
+ " \n",
+ " import pandas as pd\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
TITRE
\n",
+ "
ANNEE
\n",
+ "
PAYS
\n",
+ "
CANAL
\n",
+ "
TYPE
\n",
+ "
SAISON
\n",
+ "
LIEU(X) DE L'ACTION
\n",
+ "
EPOQUE DU RECIT
\n",
+ "
TRAITEMENT DU RECIT
\n",
+ "
TYPE DE MONDE
\n",
+ "
...
\n",
+ "
Epoque
\n",
+ "
Traitement
\n",
+ "
Monde
\n",
+ "
Genre
\n",
+ "
Conso
\n",
+ "
Vie
\n",
+ "
Vivant
\n",
+ "
Climat
\n",
+ "
Recul
\n",
+ "
Nécessité
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
Une nuit
\n",
+ "
2023.0
\n",
+ "
France
\n",
+ "
SALLE DE CINÉMA
\n",
+ "
FILM
\n",
+ "
NaN
\n",
+ "
France, Paris
\n",
+ "
2020-2030
\n",
+ "
Ni dystopique ni utopique
\n",
+ "
Réaliste (monde proche du nôtre dans son fonct...
\n",
+ "
...
\n",
+ "
2020-2030
\n",
+ "
Ni dystopique ni utopique
\n",
+ "
Réaliste (monde proche du nôtre dans son fonct...
\n",
+ "
Comédie-romantique
\n",
+ "
-1.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
Non applicable, les scores obtenus sont neutre...
\n",
+ "
Non applicable, les scores obtenus sont neutre...
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
...
\n",
+ "
2020-2030
\n",
+ "
Ni dystopique ni utopique
\n",
+ "
Réaliste (monde proche du nôtre dans son fonct...
\n",
+ "
Comédie-romantique
\n",
+ "
-1.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
Non applicable, les scores obtenus sont neutre...
\n",
+ "
Non applicable, les scores obtenus sont neutre...
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2 rows × 104 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " TITRE ANNEE PAYS CANAL TYPE SAISON LIEU(X) DE L'ACTION \\\n",
+ "0 Une nuit 2023.0 France SALLE DE CINÉMA FILM NaN France, Paris \n",
+ "1 NaN NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ " EPOQUE DU RECIT TRAITEMENT DU RECIT \\\n",
+ "0 2020-2030 Ni dystopique ni utopique \n",
+ "1 NaN NaN \n",
+ "\n",
+ " TYPE DE MONDE ... Epoque \\\n",
+ "0 Réaliste (monde proche du nôtre dans son fonct... ... 2020-2030 \n",
+ "1 NaN ... 2020-2030 \n",
+ "\n",
+ " Traitement \\\n",
+ "0 Ni dystopique ni utopique \n",
+ "1 Ni dystopique ni utopique \n",
+ "\n",
+ " Monde Genre \\\n",
+ "0 Réaliste (monde proche du nôtre dans son fonct... Comédie-romantique \n",
+ "1 Réaliste (monde proche du nôtre dans son fonct... Comédie-romantique \n",
+ "\n",
+ " Conso Vie Vivant Climat \\\n",
+ "0 -1.0 0.0 0.0 0.0 \n",
+ "1 -1.0 0.0 0.0 0.0 \n",
+ "\n",
+ " Recul \\\n",
+ "0 Non applicable, les scores obtenus sont neutre... \n",
+ "1 Non applicable, les scores obtenus sont neutre... \n",
+ "\n",
+ " Nécessité \n",
+ "0 Non applicable, les scores obtenus sont neutre... \n",
+ "1 Non applicable, les scores obtenus sont neutre... \n",
+ "\n",
+ "[2 rows x 104 columns]"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Getting started\n",
+ "\n",
+ "# A simple notebooks for testing some analyses on the first version of the questionnaire.\n",
+ "# Data are not pushed to github due to privacy reasons. \n",
+ "# The raw_data.csv should be copied to the data/ folder. \n",
+ "\n",
+ "# Let's first load the data to understand its structure and then compute general statistical information.\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Load the data\n",
+ "file_path = '../data/Analyse réponses.xlsx - Treated data.csv'\n",
+ "\n",
+ "# ne pas lire la première ligne\n",
+ "data = pd.read_csv(file_path, skiprows=1)\n",
+ "\n",
+ "# Display the first few rows of the dataframe to understand its structure\n",
+ "data.head(2)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['TITRE', 'ANNEE', 'PAYS', 'CANAL', 'TYPE', 'SAISON', \"LIEU(X) DE L'ACTION\", 'EPOQUE DU RECIT', 'TRAITEMENT DU RECIT', 'TYPE DE MONDE', 'GENRE', 'POSITION DANS LE RECIT', 'GENRE.1', 'AGE', 'ETHNICITÉ', 'BEAUTE', 'POIDS', 'PAYS DE RESIDENCE', \"TYPE D'HABITAT\", \"LIEU D'HABITAT\", 'Appartement', 'Colocation', 'Maison', 'Studio', 'Hôtel', 'Ferme', 'Alternatif / Communautaire', 'Ne sait pas / Autre', 'ACTIVITE', \"SECTEUR D'ACTIVITE\", 'CATEGORIE SOCIALE', 'ASPIRATIONS', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'VALEURS / TRAITS DE CARACTÈRE', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'EMPREINTE PERSO', 'CONSOMMATION & MATERIALISME', 'CONSOMMATION & MATERIALISME.1', 'MODES DE VIE', 'MODES DE VIE.1', 'RAPPORT AU VIVANT', 'RAPPORT AU VIVANT.1', 'CRISE ENVIRONNEMENTALE', 'CRISE ENVIRONNEMENTALE.1', 'Y A-T-IL UNE PRISE DE RECUL ?', 'EST-CE NECESSAIRE ?', 'QUELQUE CHOSE A AJOUTER ?', 'VOTRE EMAIL ?', 'Label / corpus', 'TYPOLOGIE', 'Corpus', 'Année', 'Pays', 'Canal', 'Type', 'Lieux action', 'Epoque', 'Traitement', 'Monde', 'Genre', 'Conso', 'Vie', 'Vivant', 'Climat', 'Recul', 'Nécessité']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# colonnes\n",
+ "print(list(data.columns))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "nb lignes et nb colonnes\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "(812, 104)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Supprimer les lignes où la première colonne contient \"Contenu XXX\"\n",
+ "# XXX est un nombre\n",
+ "# Et Supprimer les lignes où toutes les valeurs sont NaN\n",
+ "df = data[~data['TITRE'].str.contains(r'Contenu \\d+', na=False)].dropna(how='all')\n",
+ "print('nb lignes et nb colonnes')\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Nombre de réponses analysées : 203\n",
+ "Nombre de colonnes dans l'analyse : 104\n"
+ ]
+ }
+ ],
+ "source": [
+ "# ne conserver qu'une ligne sur 4 (ce qui revient à supprimer les informations des personnages 2, 3, 4 quand ils existent)\n",
+ "df_truncated = df.iloc[::4]\n",
+ "\n",
+ "# Pour afficher séparément le nombre de lignes et de colonnes\n",
+ "nb_lignes, nb_colonnes = df_truncated.shape\n",
+ "print(f\"Nombre de réponses analysées : {nb_lignes}\")\n",
+ "print(f\"Nombre de colonnes dans l'analyse : {nb_colonnes}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/s4/6jbnvfsj1v10f2txtlkndj5w0000gp/T/ipykernel_38954/3001236423.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df_truncated['TITRE'] = df_truncated['TITRE'].str.upper()\n",
+ "/var/folders/s4/6jbnvfsj1v10f2txtlkndj5w0000gp/T/ipykernel_38954/3001236423.py:9: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df_truncated[annee] = pd.to_numeric(df_truncated[annee], errors='coerce').fillna(0).astype(int)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Nettoyage du data set\n",
+ "\n",
+ "# mettre les titres en majuscule \n",
+ "df_truncated['TITRE'] = df_truncated['TITRE'].str.upper()\n",
+ "\n",
+ "### Convertir les types de données correctement ici \n",
+ "# Convertir les années en entier\n",
+ "annee = \"ANNEE\"\n",
+ "df_truncated[annee] = pd.to_numeric(df_truncated[annee], errors='coerce').fillna(0).astype(int)\n",
+ "\n",
+ "# print(df_truncated.dtypes) # Afficher les types de données des colonnes pour vérifier\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "TITRE\n",
+ "POLAR PARK 6\n",
+ "LE RÈGNE ANIMAL 4\n",
+ "UNE ANNÉE DIFFICILE 3\n",
+ "SAMBRE 3\n",
+ "VOLEUSES 2\n",
+ "SECOND TOUR 2\n",
+ "THE CROWN 2\n",
+ "PAST LIVES 2\n",
+ "BODIES 2\n",
+ "LE LIVRE DES SOLUTIONS 2\n",
+ "À PLEIN TEMPS 2\n",
+ "REVOIR PARIS 2\n",
+ "D'ARGENT ET DE SANG 2\n",
+ "YANNICK 2\n",
+ "SENSE8 2\n",
+ "ANATOMIE D'UNE CHUTE 2\n",
+ "LE THÉORÈME DE MARGUERITE 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Trouver les titres qui apparaissent plus de 4 fois dans la colonne \"TITRE\" (car chaque titre a 4 lignes, une pour chaque personnage)\n",
+ "titles_more_than_once = df_truncated['TITRE'].value_counts()\n",
+ "titles_more_than_once = titles_more_than_once[titles_more_than_once > 1]\n",
+ "\n",
+ "titles_more_than_once\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Afficher un bar chart des titres les plus fréquents\n",
+ "# Affichage d'un bar chart horizontal\n",
+ "plt.figure(figsize=(10, 6))\n",
+ "titles_more_than_once.sort_values(ascending=True).plot(kind='barh', color='skyblue',)\n",
+ "plt.xlabel('Nb')\n",
+ "plt.title('Fréquence des films/séries')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([2023, 2016, 2022, 1997, 2015, 2006, 2018, 2021, 1984, 1968, 2020,\n",
+ " 2019, 1971, 2017, 2001, 2012, 1987, 2005, 1975, 1982, 2013, 2014,\n",
+ " 2010, 1994, 2009, 1988, 2000, 2011, 1999, 1957, 2002])"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "# Extract unique values from the \"ANNEE\" column to understand its format\n",
+ "unique_years = df_truncated[annee].unique()\n",
+ "\n",
+ "# Display a few unique values for inspection\n",
+ "unique_years[:100]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "def visualize_year_distribution(years_data, chart_title):\n",
+    "    \"\"\"\n",
+    "    Plot a histogram of the years found in ``years_data`` and return them.\n",
+    "\n",
+    "    Each entry is converted to a string and expanded into individual years:\n",
+    "    comma-separated lists, ``start-end`` or ``start/end`` ranges, and\n",
+    "    ``start+/-n`` entries (expanded to ``start .. start + n``).\n",
+    "\n",
+    "    Parameters:\n",
+    "    - years_data (pd.Series): years or year ranges.\n",
+    "    - chart_title (str): title of the histogram.\n",
+    "\n",
+    "    Returns:\n",
+    "    - list of int: every year extracted from the entries. Entries that cannot\n",
+    "      be parsed are printed and left out.\n",
+    "    \"\"\"\n",
+    "    import pandas as pd\n",
+    "    import matplotlib.pyplot as plt\n",
+    "    import re  # For regular expression operations\n",
+    "\n",
+    "    # Work on strings so that ranges such as \"2010-2015\" are handled uniformly\n",
+    "    years_data = years_data.astype(str)\n",
+    "\n",
+    "    all_years = []\n",
+    "    # Entries that could not be turned into years\n",
+    "    non_numeric_entries = set()\n",
+    "\n",
+    "    for year in years_data:\n",
+    "        try:\n",
+    "            if re.search(r'[a-z]', year):  # Contains a lowercase letter: free text\n",
+    "                parsed_years = None\n",
+    "            elif \",\" in year:  # Comma-separated list of years\n",
+    "                parsed_years = [int(y.strip()) for y in year.split(\",\")]\n",
+    "            elif \"+/-\" in year:  # Year followed by an uncertainty\n",
+    "                start_year, uncertainty = year.split(\"+/-\")\n",
+    "                if start_year.strip() != '':\n",
+    "                    start = int(start_year)\n",
+    "                    # The uncertainty is still text after the split: convert it\n",
+    "                    # before adding it to the start year\n",
+    "                    parsed_years = range(start, start + int(uncertainty) + 1)\n",
+    "                    print(year)\n",
+    "                else:\n",
+    "                    parsed_years = None\n",
+    "            elif \"-\" in year:  # Range written start-end\n",
+    "                start_year, end_year = year.split(\"-\")\n",
+    "                parsed_years = range(int(start_year), int(end_year) + 1)\n",
+    "            elif \"/\" in year:  # Range written start/end\n",
+    "                start_year, end_year = year.split(\"/\")\n",
+    "                parsed_years = range(int(start_year), int(end_year) + 1)\n",
+    "            else:\n",
+    "                parsed_years = [int(year)]\n",
+    "        except ValueError:\n",
+    "            # Not an integer, or a range that does not have exactly two parts\n",
+    "            parsed_years = None\n",
+    "\n",
+    "        if parsed_years is None:\n",
+    "            non_numeric_entries.add(year)\n",
+    "        else:\n",
+    "            all_years.extend(parsed_years)\n",
+    "\n",
+    "    # Print non-numeric entries if any\n",
+    "    if non_numeric_entries:\n",
+    "        print(\"Non-numeric entries found:\", non_numeric_entries)\n",
+    "\n",
+    "    # Proceed with visualization only if there are numeric years\n",
+    "    if all_years:\n",
+    "        # A DataFrame makes the histogram call straightforward\n",
+    "        df_years = pd.DataFrame(all_years, columns=[\"Year\"])\n",
+    "\n",
+    "        # One bin per distinct year\n",
+    "        plt.figure(figsize=(10, 6))\n",
+    "        df_years[\"Year\"].hist(bins=len(df_years[\"Year\"].unique()), color=\"skyblue\", edgecolor=\"black\")\n",
+    "        plt.title(chart_title)\n",
+    "        plt.xlabel(\"Année\")\n",
+    "        plt.ylabel(\"Fréquence\")\n",
+    "        plt.xticks(rotation=45)\n",
+    "        plt.tight_layout()\n",
+    "        plt.show()\n",
+    "    else:\n",
+    "        print(\"No numeric years found for visualization.\")\n",
+    "    return all_years\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/s4/6jbnvfsj1v10f2txtlkndj5w0000gp/T/ipykernel_38954/2978788642.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df_truncated[annee] = df_truncated[annee].astype(str)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "#import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Convertir toutes les valeurs en chaînes pour gérer les intervalles potentiels comme des entrées uniques\n",
+ "df_truncated[annee] = df_truncated[annee].astype(str)\n",
+ "all_prod_years = visualize_year_distribution(df_truncated[annee], 'Distribution des années de production')\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Non-numeric entries found: {'Années 50', 'Actuelle', 'Futur proche', 'notre époque', 'Contemporain ', 'Milieu du XXI siècle', 'Contemporain', 'Futur très proche', \"Aujourd'hui \", 'Futur', 'Tous les 33 ans ', 'Sur plusieurs siecles', '1969-1995 (mais uchronie...)', 'Années 2010/2020', 'Actuel ', '+/- 1930', 'années 2000'}\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/s4/6jbnvfsj1v10f2txtlkndj5w0000gp/T/ipykernel_38954/2873658588.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df_truncated[epoque] = df_truncated[epoque].astype(str)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "epoque = \"EPOQUE DU RECIT\"\n",
+ "\n",
+ "# Convertir toutes les valeurs en chaînes pour gérer les intervalles potentiels comme des entrées uniques\n",
+ "df_truncated[epoque] = df_truncated[epoque].astype(str)\n",
+ "all_years = visualize_year_distribution(df_truncated[epoque], 'Distribution des époques du récit')\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Construction du camembert\n",
+ "plt.figure(figsize=(8, 8))\n",
+ "plt.pie(c, autopct='%1.1f%%', startangle=140)\n",
+ "plt.title(\"Empreinte Personnelle\")\n",
+ "\n",
+ "plt.axis('equal') # S'assure que le camembert est dessiné comme un cercle.\n",
+ "plt.title('Empreinte carbone')\n",
+ "plt.legend(c.index)\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Non-empty count': 54,\n",
+ " 'Unique non-empty entries': 54,\n",
+ " 'Sample entries': ['Très compliqué pour cette série. ',\n",
+ " 'Le film me semble représenter une vie bas carbone souhaitable',\n",
+ " 'Approche très intéressante, je laisse mon mail pour en savoir plus :) ',\n",
+ " \"La série met en marge des personnages plutôt en marge de la société de consommation, mais montre pourtant bien l'emprise du système du Tout voiture\",\n",
+ " \"C'est une série israélienne qui traite du problème palestinien... Il y a donc forcément un biais. Les méchants sont palestiniens. \"]}"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "col_qqch_ajouter = df_truncated[\"QUELQUE CHOSE A AJOUTER ?\"]\n",
+ "\n",
+ "# Analyzing the \"QUELQUE CHOSE A AJOUTER ?\" column to understand its content\n",
+ "# Check if the column exists\n",
+ "if \"QUELQUE CHOSE A AJOUTER ?\" in df_truncated.columns:\n",
+ " # Basic analysis of the column\n",
+ " # Count of non-empty entries\n",
+ " non_empty_count = col_qqch_ajouter.notna().sum()\n",
+ " # Unique non-empty entries\n",
+ " unique_non_empty_entries = col_qqch_ajouter.nunique()\n",
+ " # Sample of non-empty entries\n",
+ " sample_entries = col_qqch_ajouter.dropna().sample(min(5, non_empty_count)).tolist()\n",
+ " analysis_results = {\n",
+ " \"Non-empty count\": non_empty_count,\n",
+ " \"Unique non-empty entries\": unique_non_empty_entries,\n",
+ " \"Sample entries\": sample_entries\n",
+ " }\n",
+ "else:\n",
+ " analysis_results = \"Column 'QUELQUE CHOSE A AJOUTER ?' does not exist in the dataset.\"\n",
+ "\n",
+ "analysis_results\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/analyse-poc.ipynb b/notebooks/analyse-poc.ipynb
new file mode 100644
index 0000000..0b1b46a
--- /dev/null
+++ b/notebooks/analyse-poc.ipynb
@@ -0,0 +1,1019 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Horodateur
\n",
+ "
TITRE DU CONTENU
\n",
+ "
ANNÉE DE PRODUCTION
\n",
+ "
PAYS DE PRODUCTION
\n",
+ "
CANAL DE DIFFUSION
\n",
+ "
SÉRIE OU FILM ?
\n",
+ "
QUELLE SAISON ?
\n",
+ "
LIEU(X) DE L'ACTION
\n",
+ "
EPOQUE DU RECIT
\n",
+ "
GENRE
\n",
+ "
...
\n",
+ "
Unnamed: 116
\n",
+ "
Unnamed: 117
\n",
+ "
Unnamed: 118
\n",
+ "
Unnamed: 119
\n",
+ "
Unnamed: 120
\n",
+ "
Unnamed: 121
\n",
+ "
Unnamed: 122
\n",
+ "
Unnamed: 123
\n",
+ "
Unnamed: 124
\n",
+ "
Unnamed: 125
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
7/29/23 13:06
\n",
+ "
Une nuit
\n",
+ "
2023
\n",
+ "
France
\n",
+ "
SALLE DE CINÉMA
\n",
+ "
FILM
\n",
+ "
NaN
\n",
+ "
Paris
\n",
+ "
2023
\n",
+ "
Comédie-romantique, Réaliste
\n",
+ "
...
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
7/30/23 15:18
\n",
+ "
David Brent, Life on the road
\n",
+ "
2016
\n",
+ "
UK
\n",
+ "
NETFLIX
\n",
+ "
FILM
\n",
+ "
NaN
\n",
+ "
Angleterre
\n",
+ "
2016
\n",
+ "
Comédie, Réaliste
\n",
+ "
...
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
7/30/23 15:47
\n",
+ "
Survival of the thickest
\n",
+ "
2022
\n",
+ "
Etats-Unis
\n",
+ "
NETFLIX
\n",
+ "
SÉRIE
\n",
+ "
1
\n",
+ "
New York
\n",
+ "
2022
\n",
+ "
Comédie-romantique
\n",
+ "
...
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
7/30/23 16:18
\n",
+ "
Meurtres à
\n",
+ "
2016
\n",
+ "
France
\n",
+ "
FRANCE TV
\n",
+ "
SÉRIE
\n",
+ "
???? je ne sais pas
\n",
+ "
en France, dans différentes régions selon les ...
\n",
+ "
2016
\n",
+ "
Thriller / Policier
\n",
+ "
...
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
7/31/23 9:28
\n",
+ "
Dernière nuit Milan
\n",
+ "
2022
\n",
+ "
Italie
\n",
+ "
SALLE DE CINÉMA
\n",
+ "
FILM
\n",
+ "
NaN
\n",
+ "
Milan
\n",
+ "
2022
\n",
+ "
Thriller / Policier
\n",
+ "
...
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 126 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Horodateur TITRE DU CONTENU ANNÉE DE PRODUCTION \\\n",
+ "0 7/29/23 13:06 Une nuit 2023 \n",
+ "1 7/30/23 15:18 David Brent, Life on the road 2016 \n",
+ "2 7/30/23 15:47 Survival of the thickest 2022 \n",
+ "3 7/30/23 16:18 Meurtres à 2016 \n",
+ "4 7/31/23 9:28 Dernière nuit Milan 2022 \n",
+ "\n",
+ " PAYS DE PRODUCTION CANAL DE DIFFUSION SÉRIE OU FILM ? QUELLE SAISON ? \\\n",
+ "0 France SALLE DE CINÉMA FILM NaN \n",
+ "1 UK NETFLIX FILM NaN \n",
+ "2 Etats-Unis NETFLIX SÉRIE 1 \n",
+ "3 France FRANCE TV SÉRIE ???? je ne sais pas \n",
+ "4 Italie SALLE DE CINÉMA FILM NaN \n",
+ "\n",
+ " LIEU(X) DE L'ACTION EPOQUE DU RECIT \\\n",
+ "0 Paris 2023 \n",
+ "1 Angleterre 2016 \n",
+ "2 New York 2022 \n",
+ "3 en France, dans différentes régions selon les ... 2016 \n",
+ "4 Milan 2022 \n",
+ "\n",
+ " GENRE ... Unnamed: 116 Unnamed: 117 Unnamed: 118 \\\n",
+ "0 Comédie-romantique, Réaliste ... NaN NaN NaN \n",
+ "1 Comédie, Réaliste ... NaN NaN NaN \n",
+ "2 Comédie-romantique ... NaN NaN NaN \n",
+ "3 Thriller / Policier ... NaN NaN NaN \n",
+ "4 Thriller / Policier ... NaN NaN NaN \n",
+ "\n",
+ " Unnamed: 119 Unnamed: 120 Unnamed: 121 Unnamed: 122 Unnamed: 123 \\\n",
+ "0 NaN NaN NaN NaN NaN \n",
+ "1 NaN NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN NaN \n",
+ "\n",
+ " Unnamed: 124 Unnamed: 125 \n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN \n",
+ "\n",
+ "[5 rows x 126 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Getting started\n",
+ "\n",
+ "# A simple notebook for testing some analyses on the first version of the questionnaire.\n",
+ "# The data is not pushed to GitHub for privacy reasons.\n",
+ "# The raw_data.csv should be copied to the data/ folder. \n",
+ "\n",
+ "# Let's first load the data to understand its structure and then compute general statistical information.\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Load the data\n",
+ "# file_path = './data/Treated_data.csv'\n",
+ "file_path = '../data/raw_data.csv'\n",
+ "data = pd.read_csv(file_path)\n",
+ "\n",
+ "# Display the first few rows of the dataframe to understand its structure\n",
+ "data.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
ANNÉE DE PRODUCTION
\n",
+ "
MORPHOLOGIE 1/2 (OPTIONNEL)
\n",
+ "
MORPHOLOGIE 1/2 (OPTIONNEL).1
\n",
+ "
TYPE DE RESIDENCE [Studio].1
\n",
+ "
MORPHOLOGIE 1/2 (OPTIONNEL).2
\n",
+ "
TYPE DE RESIDENCE [Colocation].2
\n",
+ "
TYPE DE RESIDENCE [Studio].2
\n",
+ "
TYPE DE RESIDENCE [Hotel].2
\n",
+ "
VALEURS / TRAITS DE CARACTERE.2
\n",
+ "
MORPHOLOGIE 1/2 (OPTIONNEL).3
\n",
+ "
...
\n",
+ "
MORPHOLOGIE 2/2 (OPTIONNEL).1
\n",
+ "
MORPHOLOGIE 2/2 (OPTIONNEL).2
\n",
+ "
MORPHOLOGIE 2/2 (OPTIONNEL).3
\n",
+ "
Unnamed: 119
\n",
+ "
Unnamed: 120
\n",
+ "
Unnamed: 121
\n",
+ "
Unnamed: 122
\n",
+ "
Unnamed: 123
\n",
+ "
Unnamed: 124
\n",
+ "
Unnamed: 125
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
count
\n",
+ "
203.000000
\n",
+ "
199.000000
\n",
+ "
75.000000
\n",
+ "
0.0
\n",
+ "
20.00000
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
7.000000
\n",
+ "
...
\n",
+ "
73.000000
\n",
+ "
20.000000
\n",
+ "
7.000000
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
0.0
\n",
+ "
\n",
+ "
\n",
+ "
mean
\n",
+ "
2017.832512
\n",
+ "
3.733668
\n",
+ "
3.813333
\n",
+ "
NaN
\n",
+ "
3.35000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
2.714286
\n",
+ "
...
\n",
+ "
2.136986
\n",
+ "
2.300000
\n",
+ "
2.285714
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
std
\n",
+ "
10.463114
\n",
+ "
0.986997
\n",
+ "
1.086693
\n",
+ "
NaN
\n",
+ "
1.03999
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
1.496026
\n",
+ "
...
\n",
+ "
1.071280
\n",
+ "
1.128576
\n",
+ "
1.112697
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
min
\n",
+ "
1957.000000
\n",
+ "
1.000000
\n",
+ "
1.000000
\n",
+ "
NaN
\n",
+ "
1.00000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
1.000000
\n",
+ "
...
\n",
+ "
1.000000
\n",
+ "
1.000000
\n",
+ "
1.000000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
25%
\n",
+ "
2018.000000
\n",
+ "
3.000000
\n",
+ "
3.000000
\n",
+ "
NaN
\n",
+ "
3.00000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
1.500000
\n",
+ "
...
\n",
+ "
1.000000
\n",
+ "
1.000000
\n",
+ "
1.500000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
50%
\n",
+ "
2022.000000
\n",
+ "
4.000000
\n",
+ "
4.000000
\n",
+ "
NaN
\n",
+ "
3.00000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
3.000000
\n",
+ "
...
\n",
+ "
2.000000
\n",
+ "
2.500000
\n",
+ "
2.000000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
75%
\n",
+ "
2023.000000
\n",
+ "
4.500000
\n",
+ "
5.000000
\n",
+ "
NaN
\n",
+ "
4.00000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
3.500000
\n",
+ "
...
\n",
+ "
3.000000
\n",
+ "
3.000000
\n",
+ "
3.000000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
max
\n",
+ "
2023.000000
\n",
+ "
5.000000
\n",
+ "
5.000000
\n",
+ "
NaN
\n",
+ "
5.00000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
5.000000
\n",
+ "
...
\n",
+ "
5.000000
\n",
+ "
4.000000
\n",
+ "
4.000000
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 35 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ANNÉE DE PRODUCTION MORPHOLOGIE 1/2 (OPTIONNEL) \\\n",
+ "count 203.000000 199.000000 \n",
+ "mean 2017.832512 3.733668 \n",
+ "std 10.463114 0.986997 \n",
+ "min 1957.000000 1.000000 \n",
+ "25% 2018.000000 3.000000 \n",
+ "50% 2022.000000 4.000000 \n",
+ "75% 2023.000000 4.500000 \n",
+ "max 2023.000000 5.000000 \n",
+ "\n",
+ " MORPHOLOGIE 1/2 (OPTIONNEL).1 TYPE DE RESIDENCE [Studio].1 \\\n",
+ "count 75.000000 0.0 \n",
+ "mean 3.813333 NaN \n",
+ "std 1.086693 NaN \n",
+ "min 1.000000 NaN \n",
+ "25% 3.000000 NaN \n",
+ "50% 4.000000 NaN \n",
+ "75% 5.000000 NaN \n",
+ "max 5.000000 NaN \n",
+ "\n",
+ " MORPHOLOGIE 1/2 (OPTIONNEL).2 TYPE DE RESIDENCE [Colocation].2 \\\n",
+ "count 20.00000 0.0 \n",
+ "mean 3.35000 NaN \n",
+ "std 1.03999 NaN \n",
+ "min 1.00000 NaN \n",
+ "25% 3.00000 NaN \n",
+ "50% 3.00000 NaN \n",
+ "75% 4.00000 NaN \n",
+ "max 5.00000 NaN \n",
+ "\n",
+ " TYPE DE RESIDENCE [Studio].2 TYPE DE RESIDENCE [Hotel].2 \\\n",
+ "count 0.0 0.0 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " VALEURS / TRAITS DE CARACTERE.2 MORPHOLOGIE 1/2 (OPTIONNEL).3 ... \\\n",
+ "count 0.0 7.000000 ... \n",
+ "mean NaN 2.714286 ... \n",
+ "std NaN 1.496026 ... \n",
+ "min NaN 1.000000 ... \n",
+ "25% NaN 1.500000 ... \n",
+ "50% NaN 3.000000 ... \n",
+ "75% NaN 3.500000 ... \n",
+ "max NaN 5.000000 ... \n",
+ "\n",
+ " MORPHOLOGIE 2/2 (OPTIONNEL).1 MORPHOLOGIE 2/2 (OPTIONNEL).2 \\\n",
+ "count 73.000000 20.000000 \n",
+ "mean 2.136986 2.300000 \n",
+ "std 1.071280 1.128576 \n",
+ "min 1.000000 1.000000 \n",
+ "25% 1.000000 1.000000 \n",
+ "50% 2.000000 2.500000 \n",
+ "75% 3.000000 3.000000 \n",
+ "max 5.000000 4.000000 \n",
+ "\n",
+ " MORPHOLOGIE 2/2 (OPTIONNEL).3 Unnamed: 119 Unnamed: 120 \\\n",
+ "count 7.000000 0.0 0.0 \n",
+ "mean 2.285714 NaN NaN \n",
+ "std 1.112697 NaN NaN \n",
+ "min 1.000000 NaN NaN \n",
+ "25% 1.500000 NaN NaN \n",
+ "50% 2.000000 NaN NaN \n",
+ "75% 3.000000 NaN NaN \n",
+ "max 4.000000 NaN NaN \n",
+ "\n",
+ " Unnamed: 121 Unnamed: 122 Unnamed: 123 Unnamed: 124 Unnamed: 125 \n",
+ "count 0.0 0.0 0.0 0.0 0.0 \n",
+ "mean NaN NaN NaN NaN NaN \n",
+ "std NaN NaN NaN NaN NaN \n",
+ "min NaN NaN NaN NaN NaN \n",
+ "25% NaN NaN NaN NaN NaN \n",
+ "50% NaN NaN NaN NaN NaN \n",
+ "75% NaN NaN NaN NaN NaN \n",
+ "max NaN NaN NaN NaN NaN \n",
+ "\n",
+ "[8 rows x 35 columns]"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Compute general statistical information for the numerical columns in the data\n",
+ "statistical_info = data.describe()\n",
+ "\n",
+ "# Display the statistical information\n",
+ "statistical_info\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Horodateur', 'TITRE DU CONTENU', 'ANNÉE DE PRODUCTION', 'PAYS DE PRODUCTION', 'CANAL DE DIFFUSION', 'SÉRIE OU FILM ?', 'QUELLE SAISON ?', \"LIEU(X) DE L'ACTION\", 'EPOQUE DU RECIT', 'GENRE', 'POSITION DANS LE RECIT', 'GENRE.1', 'AGE', 'ETHNICITE PERCUE (OPTIONNEL)', 'MORPHOLOGIE 1/2 (OPTIONNEL)', 'PAYS DE RESIDENCE', 'TYPE DE RESIDENCE [Appartement]', 'TYPE DE RESIDENCE [Colocation]', 'TYPE DE RESIDENCE [Maison individuelle]', 'TYPE DE RESIDENCE [Studio]', 'TYPE DE RESIDENCE [Hotel]', 'TYPE DE RESIDENCE [Ferme]', 'TYPE DE RESIDENCE [Alternatif/communautaire]', 'TYPE DE RESIDENCE [On ne sait pas / Autre ]', 'METIER / OCCUPATION ?', 'CATEGORIE SOCIALE PERCUE ?', 'ASPIRATIONS', 'VALEURS / TRAITS DE CARACTERE', 'SOUHAITEZ-VOUS AJOUTER UN PERSONNAGE ?', 'POSITION DANS LE RECIT.1', 'GENRE.2', 'AGE.1', 'ETHNICITE PERCUE (OPTIONNEL).1', 'MORPHOLOGIE 1/2 (OPTIONNEL).1', 'PAYS DE RESIDENCE.1', 'TYPE DE RESIDENCE [Appartement].1', 'TYPE DE RESIDENCE [Colocation].1', 'TYPE DE RESIDENCE [Maison individuelle].1', 'TYPE DE RESIDENCE [Studio].1', 'TYPE DE RESIDENCE [Hotel].1', 'TYPE DE RESIDENCE [Ferme].1', 'TYPE DE RESIDENCE [Alternatif/communautaire].1', 'TYPE DE RESIDENCE [On ne sait pas / Autre ].1', 'METIER / OCCUPATION ?.1', 'CATEGORIE SOCIALE PERCUE ?.1', 'ASPIRATIONS.1', 'VALEURS / TRAITS DE CARACTERE.1', 'SOUHAITEZ-VOUS AJOUTER UN PERSONNAGE ?.1', 'POSITION DANS LE RECIT.2', 'GENRE.3', 'AGE.2', 'ETHNICITE PERCUE (OPTIONNEL).2', 'MORPHOLOGIE 1/2 (OPTIONNEL).2', 'PAYS DE RESIDENCE.2', 'TYPE DE RESIDENCE [Appartement].2', 'TYPE DE RESIDENCE [Colocation].2', 'TYPE DE RESIDENCE [Maison individuelle].2', 'TYPE DE RESIDENCE [Studio].2', 'TYPE DE RESIDENCE [Hotel].2', 'TYPE DE RESIDENCE [Ferme].2', 'TYPE DE RESIDENCE [Alternatif/communautaire].2', 'TYPE DE RESIDENCE [On ne sait pas / Autre ].2', 'METIER / OCCUPATION ?.2', 'CATEGORIE SOCIALE PERCUE ?.2', 'ASPIRATIONS.2', 'VALEURS / TRAITS DE CARACTERE.2', 'SOUHAITEZ-VOUS AJOUTER UN PERSONNAGE ?.2', 'POSITION DANS LE RECIT.3', 
'GENRE.4', 'AGE.3', 'ETHNICITE PERCUE (OPTIONNEL).3', 'MORPHOLOGIE 1/2 (OPTIONNEL).3', 'PAYS DE RESIDENCE.3', 'TYPE DE RESIDENCE [Appartement].3', 'TYPE DE RESIDENCE [Colocation].3', 'TYPE DE RESIDENCE [Maison individuelle].3', 'TYPE DE RESIDENCE [Studio].3', 'TYPE DE RESIDENCE [Hotel].3', 'TYPE DE RESIDENCE [Ferme].3', 'TYPE DE RESIDENCE [Alternatif/communautaire].3', 'TYPE DE RESIDENCE [On ne sait pas / Autre ].3', 'METIER / OCCUPATION ?.3', 'CATEGORIE SOCIALE PERCUE ?.3', 'ASPIRATIONS.3', 'VALEURS / TRAITS DE CARACTERE.3', \"QUELLE EST L'EMPREINTE CARBONE ANNUELLE DU PERSONNAGE ?\", \"Voulez-vous estimer l'empreinte carbone d'un autre personnage ?\", \"QUELLE EST L'EMPREINTE CARBONE ANNUELLE DU PERSONNAGE ?.1\", \"Voulez-vous estimer l'empreinte carbone d'un autre personnage ?.1\", \"QUELLE EST L'EMPREINTE CARBONE ANNUELLE DU PERSONNAGE ?.2\", \"Voulez-vous estimer l'empreinte carbone d'un autre personnage ?.2\", \"QUELLE EST L'EMPREINTE CARBONE ANNUELLE DU PERSONNAGE ?.3\", 'CONSOMMATION & MATERIALISME', 'CONSOMMATION & MATERIALISME.1', 'MODES DE VIE', 'MODES DE VIE.1', 'RAPPORT AU VIVANT', 'RAPPORT AU VIVANT.1', 'CRISE ENVIRONNEMENTALE', 'CRISE ENVIRONNEMENTALE.1', 'Y A-T-IL UNE PRISE DE RECUL ?', 'EST-CE NECESSAIRE ?', 'QUELQUE CHOSE A AJOUTER ?', 'VOTRE EMAIL ?', 'Traitement du récit', 'Type de monde du récit', 'VALEURS / TRAITS DE CARACTÈRE', 'MORPHOLOGIE 2/2 (OPTIONNEL)', 'VALEURS / TRAITS DE CARACTÈRE.1', 'VALEURS / TRAITS DE CARACTÈRE.2', 'MORPHOLOGIE 2/2 (OPTIONNEL).1', 'MORPHOLOGIE 2/2 (OPTIONNEL).2', 'MORPHOLOGIE 2/2 (OPTIONNEL).3', 'VALEURS / TRAITS DE CARACTERE.4', 'VALEURS / TRAITS DE CARACTERE.5', 'VALEURS / TRAITS DE CARACTERE.6', 'Unnamed: 116', 'Unnamed: 117', 'Unnamed: 118', 'Unnamed: 119', 'Unnamed: 120', 'Unnamed: 121', 'Unnamed: 122', 'Unnamed: 123', 'Unnamed: 124', 'Unnamed: 125']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# colonnes\n",
+ "print(list(data.columns))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Le nombre de réponses est de \n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "203"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "titre = \"TITRE DU CONTENU\"\n",
+ "c = data[titre].count()\n",
+ "print('Le nombre de réponses est de ')\n",
+ "c "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Non-empty count': 54,\n",
+ " 'Unique non-empty entries': 54,\n",
+ " 'Sample entries': [\"Ce genre de série ne s'intéresse pas au mode de vie des protagonistes principaux, on est concentré sur leur boulot et leur problèmes sentimentaux. Néanmoins dans le boulot utilisation de gros SUV ou 4x4 non indispensables\",\n",
+ " 'Un film assez ancien des années 80 mais finalement qui interroge sur le mode de vie \"moderne\" et considéré comme l\\'aspiration et la norme de l\\'époque. ',\n",
+ " 'Série plutôt écolo mais pas spécialement proactive sur le sujet. Prône l’égalité et le dialogue, ce qui est déjà beaucoup !',\n",
+ " 'Série qui correspond vraiment à la mentalité des années 2000 sur ces sujets.',\n",
+ " 'Voilà un exemple très intéressant : le film se passe dans un monde pour l\\'essentiel réaliste, dans une société contemporaine et occidental. Il obtient pourtant un score très neutre pour deux raisons : il y a des objets, des voitures, des smartphones, mais on ne les \"voit\" pas, ils ne sont jamais saillants, ils sont toujours intégré dans l\\'image d\\'une manière qui renforce son effet graphique sans les valoriser ou mettre en évidence leur fonction. Deux : la nature occupe une place importante (c\\'est peut-être réellement le cas à la Nouvelle Orléans ?) et, alors même que l\\'action se déroule en ville, on ne se sent pas oppressé']}"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "# Analyzing the \"QUELQUE CHOSE A AJOUTER ?\" column to understand its content\n",
+ "# Check if the column exists\n",
+ "if \"QUELQUE CHOSE A AJOUTER ?\" in data.columns:\n",
+ " # Basic analysis of the column\n",
+ " # Count of non-empty entries\n",
+ " non_empty_count = data[\"QUELQUE CHOSE A AJOUTER ?\"].notna().sum()\n",
+ " # Unique non-empty entries\n",
+ " unique_non_empty_entries = data[\"QUELQUE CHOSE A AJOUTER ?\"].nunique()\n",
+ " # Sample of non-empty entries\n",
+ " sample_entries = data[\"QUELQUE CHOSE A AJOUTER ?\"].dropna().sample(min(5, non_empty_count)).tolist()\n",
+ " analysis_results = {\n",
+ " \"Non-empty count\": non_empty_count,\n",
+ " \"Unique non-empty entries\": unique_non_empty_entries,\n",
+ " \"Sample entries\": sample_entries\n",
+ " }\n",
+ "else:\n",
+ " analysis_results = \"Column 'QUELQUE CHOSE A AJOUTER ?' does not exist in the dataset.\"\n",
+ "\n",
+ "analysis_results\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "TITRE DU CONTENU\n",
+ "Polar park 3\n",
+ "Une année difficile 3\n",
+ "Sambre 3\n",
+ "Le règne animal 3\n",
+ "Anatomie d'une chute 2\n",
+ "Voleuses 2\n",
+ "D'argent et de sang 2\n",
+ "The Crown 2\n",
+ "Polar Park 2\n",
+ "Bodies 2\n",
+ "À plein temps 2\n",
+ "Revoir Paris 2\n",
+ "Le livre des solutions 2\n",
+ "Yannick 2\n",
+ "Sense8 2\n",
+ "Second Tour 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Find the titles that appear more than once in the \"TITRE DU CONTENU\" column of the raw data\n",
+ "titles_more_than_once = data[titre].value_counts()\n",
+ "titles_more_than_once = titles_more_than_once[titles_more_than_once > 1]\n",
+ "\n",
+ "titles_more_than_once\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([2023, 2016, 2022, 1997, 2015, 2006, 2018, 2021, 1984, 1968, 2020,\n",
+ " 2019, 1971, 2017, 2001, 2012, 1987, 2005, 1975, 1982, 2013, 2014,\n",
+ " 2010, 1994, 2009, 1988, 2000, 2011, 1999, 1957, 2002])"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# annee = \"ANNEE\"\n",
+ "annee = \"ANNÉE DE PRODUCTION\"\n",
+ "\n",
+ "# Extract unique values from the \"ANNÉE DE PRODUCTION\" column to understand its format\n",
+ "unique_years = data[annee].unique()\n",
+ "\n",
+ "# Display a few unique values for inspection\n",
+ "unique_years[:100]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def visualize_year_distribution(years_data, chart_title):\n",
+ " \"\"\"\n",
+ " Visualise la distribution des années, y compris les intervalles, dans un ensemble de données d'années.\n",
+ " \n",
+ " Parameters:\n",
+ " - years_data (list or pd.Series): Un tableau ou une série Pandas contenant des années ou des intervalles d'années.\n",
+ " \n",
+ " Returns:\n",
+ " - Affiche un histogramme de la distribution des années.\n",
+ " \"\"\"\n",
+ " import pandas as pd\n",
+ " import matplotlib.pyplot as plt\n",
+ " import re # For regular expression operations\n",
+ " \n",
+ " \n",
+ " # Convertir toutes les valeurs en chaînes pour gérer les intervalles potentiels comme des entrées uniques\n",
+ " years_data = years_data.astype(str)\n",
+ " \n",
+ " # Séparer les intervalles en années individuelles et compter la fréquence de chaque année\n",
+ " all_years = []\n",
+ " # Utilise un set pour les valeurs non numériques\n",
+ " non_numeric_entries = set()\n",
+ " \n",
+ " for year in years_data:\n",
+ " if re.search(r'[a-z]', year): # Entry contains a lowercase letter: treat it as non-numeric\n",
+ " non_numeric_entries.add(year)\n",
+ " elif \",\" in year:\n",
+ " years_list = year.split(\",\")\n",
+ " for y in years_list:\n",
+ " all_years.append(int(y.strip())) # Ajouter chaque année en la convertissant en entier\n",
+ " elif \"+/-\" in year: # Year followed by an uncertainty margin (e.g. \"1990+/-5\")\n",
+ " start_year, uncertainty = year.split(\"+/-\")\n",
+ " if (start_year != ''):\n",
+ " all_years.extend(range(int(start_year), int(start_year) + uncertainty + 1))\n",
+ " print(year)\n",
+ " else:\n",
+ " non_numeric_entries.add(year)\n",
+ " elif \"-\" in year: # Vérifier si c'est un intervalle\n",
+ " start_year, end_year = year.split(\"-\")\n",
+ " all_years.extend(range(int(start_year), int(end_year) + 1))\n",
+ " elif \"/\" in year: # Vérifier si c'est un intervalle\n",
+ " start_year, end_year = year.split(\"/\")\n",
+ " all_years.extend(range(int(start_year), int(end_year) + 1))\n",
+ " else:\n",
+ " all_years.append(int(year))\n",
+ " \n",
+ " # Print non-numeric entries if any\n",
+ " if non_numeric_entries:\n",
+ " print(\"Non-numeric entries found:\", non_numeric_entries)\n",
+ " \n",
+ " # Proceed with visualization only if there are numeric years\n",
+ " if all_years:\n",
+ " # Créer un DataFrame à partir de la liste des années pour faciliter la visualisation\n",
+ " df_years = pd.DataFrame(all_years, columns=[\"Year\"])\n",
+ " \n",
+ " # Visualiser la distribution des années avec un histogramme\n",
+ " plt.figure(figsize=(10, 6))\n",
+ " df_years[\"Year\"].hist(bins=len(df_years[\"Year\"].unique()), color=\"skyblue\", edgecolor=\"black\")\n",
+ " plt.title(chart_title)\n",
+ " plt.xlabel(\"Année\")\n",
+ " plt.ylabel(\"Fréquence\")\n",
+ " plt.xticks(rotation=45)\n",
+ " plt.tight_layout()\n",
+ " plt.show()\n",
+ " else:\n",
+ " print(\"No numeric years found for visualization.\")\n",
+ " return all_years\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "