Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

maj fossil_reserves_OPEC #22

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Python file used to generate:
#__FOSSIL_RESERVES_bp_fossil_with_zones_prod.csv__#

import pandas as pd
import numpy as np
from utils.translation import CountryTranslatorFrenchToEnglish
from transformation.demographic.countries import StatisticsPerCountriesAndZonesJoiner
from utils.format import StatisticsDataframeFormatter

# Load the "country_groups.csv" data

# Load the OPEC data -> no API available
# Source site, updated annually: https://publications.opec.org/asb

# Cleaner for the OPEC proven-reserves table. The `clean_data` method chains
# the individual steps defined in this class (rename, row filtering, rescaling,
# country translation, wide-to-long reshaping, zone join, final formatting).
class OpecFossilProvenReservesCleaner:
# The cleaner holds no state: the constructor does nothing.
def __init__(self) :
pass

def rename_column(self, df):
    """
    Return a new dataframe in which the 'Unnamed: 0' column is named 'country'.

    The input dataframe is left untouched; every other column keeps its name.
    """
    new_names = {'Unnamed: 0': 'country'}
    return df.rename(columns=new_names)

def drop_unnecessary_lines(self, df) :
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

J'ajouterais aussi un commentaire pour dire où se trouvent ces lignes vides et les notes de bas de page. Et dans la docstring, j'expliquerais aussi pourquoi on exclut certains continents et certains groupements de pays, car sur le site on peut filtrer sur ce type de zones géographiques. Ça nous évitera aussi de nous reposer la question dans quelques mois :)

Ça donnerait quelque chose comme ça :

Suggested change
def drop_unnecessary_lines(self, df) :
def drop_unnecessary_lines(self, df) :
"""
Drop continents, country groups and footnotes that are not necessary for data processing
We exclude certain continents and country groups because ...
"""
df.set_index('country', inplace=True)
# After "Total World", we have empty lines or footnotes
df = df.loc[:"Total World", :]

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C'est plus clair tu as raison merci

"""
Drop footnotes not necessary for data processing
"""
df.set_index('country', inplace=True)
df = df.loc[:"Total World", :]
df.drop(['Africa', 'Latin America', 'Other Asia', 'Other Eurasia', 'Middle East', 'OECD Europe', 'OECD Asia Pacific', 'OECD Americas', 'Others', 'Other Europe', 'Total World'], axis = 0, inplace = True)
return df

def mb_to_gb_scale(self, df):
    """
    Divide every value by 1000 and round the result to 3 decimals.

    Each column is first converted to numeric; values that cannot be parsed
    become NaN. Presumably converts million barrels to billion barrels
    (the unit label is set elsewhere) - TODO confirm.
    """
    def _rescale(column):
        # Unparseable cells are coerced to NaN instead of raising.
        numeric = pd.to_numeric(column, errors='coerce')
        return round(numeric / 1000, 3)

    return df.apply(_rescale)

def translate_country(self, df):
    """
    Move the index back into a regular column and normalise country names.

    The 'country' column is replaced by the output of
    CountryTranslatorFrenchToEnglish().run(..., raise_errors=False).
    """
    flat = df.reset_index()
    translator = CountryTranslatorFrenchToEnglish()
    flat["country"] = translator.run(flat["country"], raise_errors=False)
    return flat

def column_to_line(self, df):
    """
    Reshape the table from wide to long format.

    Every column other than 'country' becomes a ('year', 'proven_reserves')
    pair; rows containing a missing value are removed. The index produced by
    the melt is kept as-is (it is not reset after the rows are dropped).
    """
    long_format = pd.melt(
        df,
        id_vars="country",
        var_name="year",
        value_name="proven_reserves",
    )
    return long_format.dropna()

def stat_per_countries_zones_join(self, df, country):
    """
    Delegate to StatisticsPerCountriesAndZonesJoiner to join the statistics
    with the `country` table.

    The joiner is called with the group-by columns
    ['group_type', 'group_name', 'year'] and a 'sum' aggregation on
    'proven_reserves'; its result is returned unchanged.
    """
    group_by_columns = ['group_type', 'group_name', 'year']
    aggregations = {"proven_reserves" : "sum"}
    joiner = StatisticsPerCountriesAndZonesJoiner()
    return joiner.run(df, country, group_by_columns, aggregations)

def stat_df_formatter(self, df):
    """
    Build the final output table.

    The dataframe goes through
    StatisticsDataframeFormatter().select_and_sort_values on
    'proven_reserves', gets the constant columns energy_source="Oil" and
    proven_reserves_unit="Gb", and is restricted to the output columns in
    their final order.
    """
    formatted = StatisticsDataframeFormatter().select_and_sort_values(df, "proven_reserves")
    formatted["energy_source"] = "Oil"
    formatted["proven_reserves_unit"] = "Gb"
    output_columns = [
        "group_type",
        "group_name",
        "energy_source",
        "year",
        "proven_reserves",
        "proven_reserves_unit",
    ]
    return formatted[output_columns]

def clean_data(self, df, country):
    """
    Run the full cleaning pipeline and return the formatted dataframe.

    `df` goes through the dataframe-only steps in order, is then joined with
    `country` by stat_per_countries_zones_join, and is finally formatted by
    stat_df_formatter.
    """
    # Steps that take only the dataframe, in execution order.
    dataframe_only_steps = (
        self.rename_column,
        self.drop_unnecessary_lines,
        self.mb_to_gb_scale,
        self.translate_country,
        self.column_to_line,
    )
    for step in dataframe_only_steps:
        df = step(df)
    joined = self.stat_per_countries_zones_join(df, country)
    return self.stat_df_formatter(joined)

# Apply the cleaning class.
# NOTE(review): `df` and `country` are not defined in the visible code -
# presumably the OPEC table and the "country_groups.csv" data referred to in
# the loading comments of this file; confirm they are loaded before this point.
cleaner = OpecFossilProvenReservesCleaner()
df_new = cleaner.clean_data(df, country)
# Bare expression: has no effect in a script; presumably kept to display the
# result in an interactive/notebook session.
df_new
5 changes: 5 additions & 0 deletions data-preparation/src/utils/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
"Cameroon": "Cameroon",
"Cameroun": "Cameroon",
"Canada": "Canada",
"Canada1": "Canada",
"Cabo Verde": "Cape Verde",
"Cape Verde": "Cape Verde",
"Central African Republic": "Central African Republic",
Expand Down Expand Up @@ -170,6 +171,7 @@
"Indonésie": "Indonesia", "Indonesia": "Indonesia", "Indonesie": "Indonesia",
"Iran (Islamic Republic of)": "Iran",
"Iran": "Iran",
"IR Iran" :"Iran",
"République Islamique d'Iran": "Iran",
"Iran, Islamic Republic of": "Iran",
"Iran, Islamic Rep.": "Iran",
Expand Down Expand Up @@ -271,6 +273,7 @@
"Porto Rico": "Puerto Rico", "Puerto Rico": "Puerto Rico", "Qatar": "Qatar", "REF": "REF SRES",
"REF SRES": "REF SRES", "Romania": "Romania", "Roumanie": "Romania",
"Russian Federation": "Russian Federation & USSR",
"Russia2": "Russian federation & USSR",
"USSR": "Russian Federation & USSR",
"Former USSR": "Russian Federation & USSR",
"Former U.S.S.R.": "Russian Federation & USSR",
Expand Down Expand Up @@ -311,6 +314,7 @@
"Rep. Coree": "South Korea", "Korea, Republic of": "South Korea",
"Korea (South)": "South Korea", "Sri Lanka": "Sri Lanka", "Espagne": "Spain", "Spain": "Spain",
"Soudan": "Sudan",
"Sudans" : "Sudan",
"Sudan (former)": "Sudan",
"Sudan": "Sudan",
"Sudan (Former)": "Sudan",
Expand Down Expand Up @@ -358,6 +362,7 @@
"US": "United States of America", "USA": "United States of America",
"U.S. Pacific Islands": "US Pacific Islands",
"Uzbekistan": "Uzbekistan",
"Uzbekistan`" : "Uzbekistan",
"Ouzbékistan": "Uzbekistan", "Vanuatu": "Vanuatu",
"Venezuela, Bolivarian Republic of": "Venezuela",
"Venezuela (Bolivarian Republic of)": "Venezuela",
Expand Down
Loading