Skip to content

Commit

Permalink
updated version of eurostat material; utils is working again
Browse files Browse the repository at this point in the history
  • Loading branch information
pmayd committed Nov 2, 2022
1 parent 712af07 commit 1aa2883
Show file tree
Hide file tree
Showing 8 changed files with 847 additions and 441 deletions.
1,153 changes: 761 additions & 392 deletions 01 Working with excel.ipynb

Large diffs are not rendered by default.

29 changes: 15 additions & 14 deletions 02 Dataset env_wasgen.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,26 +126,27 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\tools\\miniconda3\\lib\\site-packages\\openpyxl\\styles\\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n",
" warn(\"Workbook contains no default style, apply openpyxl's default\")\n"
]
}
],
"source": [
"codes = get_header_codes_from_excel(file)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/miay/miniconda/lib/python3.9/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n",
" warn(\"Workbook contains no default style, apply openpyxl's default\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down Expand Up @@ -4305,7 +4306,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12",
"display_name": "Python 3.9.7 ('base')",
"language": "python",
"name": "python3"
},
Expand All @@ -4319,11 +4320,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.9.7"
},
"vscode": {
"interpreter": {
"hash": "854afd56987f2f89f833600b1696b8dd8f924afccdbd759679e77e1a3b52b928"
"hash": "3247f7d4635bb288d9e06d3deacee818856115b2677ccfdf5a578edab993fe5f"
}
}
},
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Data-Science-in-the-Wild
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pmayd/Data-Science-in-the-Wild/HEAD)

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pmayd/Data-Science-in-the-Wild/HEAD)

This repository contains the material for our Data Science in the Wild workshop at [Spartakiade 2022](https://spartakiade.org/).

## Data sets

- [Eurostat](https://ec.europa.eu/eurostat/web/main/data/database)
- We need `Database by themes` -> `Environment and energy` -> `Environment (env)` -> `Waste (env_was)` -> `Waste generation and treatment (env_wasgt)` -> `env_wasgen`.
- You can directly jump into the data browser [here](https://ec.europa.eu/eurostat/databrowser/view/env_wasgen/default/table?lang=en).
- We need `Database by themes` -> `Environment and energy` -> `Environment (env)` -> `Waste (env_was)` -> `Waste generation and treatment (env_wasgt)` -> `env_wasgen`.
- You can directly jump into the data browser [here](https://ec.europa.eu/eurostat/databrowser/view/env_wasgen/default/table?lang=en).
- [GENESIS-Online](https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Gesundheit/Glossar/genesis.html)
- You can find the documentation of the API [here](https://www.destatis.de/DE/Service/OpenData/genesis-api-webservice-oberflaeche.html) (yes -- it's a PDF).
- You need a registered user to be able to use the GENESIS API, so register your user [here]().
- You can find the documentation of the API [here](https://www.destatis.de/DE/Service/OpenData/genesis-api-webservice-oberflaeche.html) (yes -- it's a PDF).
- You need a registered user to be able to use the GENESIS API, so register your user [here](https://www-genesis.destatis.de/genesis/online?Menu=Registrierung#abreadcrumb).
Binary file modified data/env_wasgen_combined.parquet
Binary file not shown.
Binary file modified data/env_wasgen_new.xlsx
Binary file not shown.
Binary file modified data/env_wasgen_old.xls
Binary file not shown.
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@ matplotlib
numpy
pandas
plotly
xlrd
statsmodels
openpyxl
scikit-learn
93 changes: 65 additions & 28 deletions utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from collections import defaultdict
from typing import Union, List
from typing import List, Union

import pandas as pd
import openpyxl
import pandas as pd

HEADER_ROWS = range(4,8)
HEADER_ROWS = range(6, 10)
DIMENSIONS_SHEET_NAME = "Structure"
DIMENSIONS_CELL_RANGE = "B4:E1000"
DIMENSIONS_FIRST_ROW = 3
Expand All @@ -13,13 +13,21 @@
DIMENSIONS_COL_LABEL = 3


def get_header_codes_from_excel(excel_file: str):
def get_header_codes_from_excel(excel_file: str) -> dict:
"""Return dimension categories, codes and labels from a Eurostat dataset.
Args:
excel_file (str): Path to a local Excel file.
Returns:
dict: A dictionary mapping each category to a dict of code-label pairs.
"""
book = openpyxl.load_workbook(excel_file, data_only=True)

codes = defaultdict(dict)
sheet = book[DIMENSIONS_SHEET_NAME]
cells = sheet[DIMENSIONS_CELL_RANGE]

for row in cells:
cat = row[DIMENSIONS_COL_CAT].value
code = row[DIMENSIONS_COL_CODE].value
Expand All @@ -33,38 +41,47 @@ def get_header_codes_from_excel(excel_file: str):


def print_codes(excel_file: str):
"""Print the dimension categories, codes and labels from a Eurostat dataset.
Args:
excel_file (str): Path to a local Excel file.
"""
codes = get_header_codes_from_excel(excel_file)
for k,v in codes.items():
for k, v in codes.items():
print("Category: ", k)
print("---------")
for k,v in v.items():
for k, v in v.items():
print(f"{k}: {v}")
print()


def get_data_from_excel(excel_file: str, headers: Union[tuple, List[tuple]] = None) -> pd.DataFrame:
book = xlrd.open_workbook(excel_file, on_demand=True)
def get_data_from_excel(
excel_file: str, headers: Union[tuple, List[tuple]] = None
) -> pd.DataFrame:
book = openpyxl.load_workbook(excel_file, data_only=True)
list_of_df = []

if isinstance(headers, tuple):
headers = [headers]

for sheet in book.sheet_names():
sh = book.sheet_by_name(sheet)
for sheetname in book.sheetnames[2:]:
sheet = book[sheetname]

header_names = []
header_values = []

for row in range(6,10):
name = sh.cell_value(row, 0)
value = sh.cell_value(row, 1)

if not name:
for row in HEADER_ROWS:
name = sheet.cell(row=row, column=1).value
value = sheet.cell(row=row, column=3).value
if name is None:
break


name = name.split("[")[1].strip("]")
value = value.split("[")[1].strip("]")
header_names.append(name)
header_values.append(value.split(" - ")[0])

header_values.append(value)
header_names = tuple(header_names)
header_values = tuple(header_values)

Expand All @@ -83,26 +100,46 @@ def get_data_from_excel(excel_file: str, headers: Union[tuple, List[tuple]] = No

# find header row
header_row = 0
for row in range(20):
if sh.cell_value(row, 0) == "GEO":
for row in range(1, 21):
value = sheet.cell(row=row, column=1).value
if value is not None and value.startswith("TIME"):
header_row = row
break

nrows = header_row
for row in range(header_row, 100):
if sh.cell_value(row, 0) == "":
for row in range(header_row, 101):
if sheet.cell(row=row, column=1).value is None:
nrows = row - header_row - 1
break

df_sheet = pd.read_excel(excel_file, sheet_name=sheet, header=header_row, nrows=nrows, na_values=":").drop(columns="GEO(L)/TIME").rename(columns={'GEO': 'geo'})

df_sheet = pd.read_excel(
excel_file,
sheet_name=sheetname,
header=header_row-1,
nrows=nrows,
na_values=":",
)
df_sheet = df_sheet.rename(
columns={
df_sheet.columns[0]: df_sheet.iloc[0, 0],
df_sheet.columns[1]: df_sheet.iloc[0, 1],
}
)
df_sheet = df_sheet.iloc[1:]
df_sheet = df_sheet.drop(columns="GEO (Labels)")
df_sheet = df_sheet.rename(columns={"GEO (Codes)": "geo"})
df_sheet = df_sheet.melt(id_vars="geo", var_name="year", value_name="value")
df_sheet = df_sheet.assign(**{x.lower():y for x,y in zip(header_names, header_values) if x})
df_sheet = df_sheet.assign(
**{x.lower(): y for x, y in zip(header_names, header_values) if x}
)

list_of_df.append(df_sheet)

df = pd.concat(list_of_df)
df.year = df.year.astype(int)
df[df.select_dtypes("object").columns] = df.select_dtypes("object").astype("category")
df[df.select_dtypes("object").columns] = df.select_dtypes("object").astype(
"category"
)
df = df.set_index("geo")

return df

0 comments on commit 1aa2883

Please sign in to comment.