diff --git a/01 Working with excel.ipynb b/01 Working with excel.ipynb
index 2c6e5db..7dee58b 100644
--- a/01 Working with excel.ipynb
+++ b/01 Working with excel.ipynb
@@ -2,17 +2,15 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"import pandas as pd # for data analysis\n",
- "import xlrd # read, format Excel xls files\n",
"import openpyxl # read, write Excel xlsx/xlsm files\n",
"\n",
"import matplotlib.pyplot as plt # data visualization\n",
@@ -22,12 +20,11 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
@@ -66,22 +63,30 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {
"collapsed": false,
"hideCode": false,
"hidePrompt": false
},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\tools\\miniconda3\\lib\\site-packages\\openpyxl\\styles\\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n",
+ " warn(\"Workbook contains no default style, apply openpyxl's default\")\n"
+ ]
+ }
],
"source": [
- "file = \"data/env_wasgen.xls\"\n",
- "book = xlrd.open_workbook(file, on_demand=True) # \"on_demand\" saves memory and time by loading only those sheets that the caller is interested in, and releasing sheets when no longer required."
+ "file = \"data/env_wasgen_new.xlsx\"\n",
+ "book = openpyxl.load_workbook(file, data_only=True)"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 10,
"metadata": {
"collapsed": false,
"hideCode": false,
@@ -92,42 +97,41 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "The number of worksheets is 660\n",
+ "The number of worksheets is 62\n",
"Worksheet name(s):\n"
]
},
{
"data": {
"text/plain": [
- "(660,\n",
- " ['Data',\n",
- " 'Data2',\n",
- " 'Data3',\n",
- " 'Data4',\n",
- " 'Data5',\n",
- " 'Data6',\n",
- " 'Data7',\n",
- " 'Data8',\n",
- " 'Data9',\n",
- " 'Data10'])"
+ "(62,\n",
+ " ['Summary',\n",
+ " 'Structure',\n",
+ " 'Sheet 1',\n",
+ " 'Sheet 2',\n",
+ " 'Sheet 3',\n",
+ " 'Sheet 4',\n",
+ " 'Sheet 5',\n",
+ " 'Sheet 6',\n",
+ " 'Sheet 7',\n",
+ " 'Sheet 8'])"
]
},
- "execution_count": 6,
- "metadata": {
- },
+ "execution_count": 10,
+ "metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "print(f\"The number of worksheets is {book.nsheets}\")\n",
+ "print(f\"The number of worksheets is {len(book.worksheets)}\")\n",
"\n",
"print(\"Worksheet name(s):\")\n",
- "len(book.sheet_names()), book.sheet_names()[:10]"
+ "len(book.sheetnames), book.sheetnames[:10]"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 25,
"metadata": {
"collapsed": false,
"hideCode": false,
@@ -138,27 +142,23 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Sheet name: Data, nrows: 55, ncols: 10\n",
- "Sheet name: Data2, nrows: 55, ncols: 10\n",
- "Cell A1: ('Generation of waste by waste category, hazardousness and NACE Rev. 2 activity [env_wasgen]', 1)\n"
+ "Sheet name: Summary, dimensions: A1:O75\n",
+ "Cell A1: ('General', None, 'n')\n"
]
}
],
"source": [
- "sh = book.sheet_by_index(0)\n",
- "\n",
- "print(f\"Sheet name: {sh.name}, nrows: {sh.nrows}, ncols: {sh.ncols}\")\n",
+ "sh = book[book.sheetnames[0]]\n",
"\n",
- "sh = book.sheet_by_name(\"Data2\")\n",
+ "print(f\"Sheet name: {sh.title}, dimensions: {sh.dimensions}\")\n",
"\n",
- "print(f\"Sheet name: {sh.name}, nrows: {sh.nrows}, ncols: {sh.ncols}\")\n",
- "\n",
- "print(f\"Cell A1: {sh.cell_value(rowx=0, colx=0), sh.cell_type(rowx=0, colx=0)}\")"
+ "cell_A1 = sh.cell(row=1, column=1)\n",
+ "print(f\"Cell A1: {cell_A1.number_format, cell_A1.value, cell_A1.data_type}\")"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 30,
"metadata": {
"collapsed": false,
"hideCode": false,
@@ -169,35 +169,84 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "['Generation of waste by waste category, hazardousness and NACE Rev. 2 activity [env_wasgen]', '', '', '', '', '', '', '', '', '']\n",
- "['', '', '', '', '', '', '', '', '', '']\n",
- "['Last update', 44316.50739583334, '', '', '', '', '', '', '', '']\n",
- "['Extracted on', 44347.90493962963, '', '', '', '', '', '', '', '']\n",
- "['Source of data', 'Eurostat', '', '', '', '', '', '', '', '']\n",
- "['', '', '', '', '', '', '', '', '', '']\n",
- "['UNIT', 'KG_HAB - Kilograms per capita', '', '', '', '', '', '', '', '']\n",
- "['HAZARD', 'HAZ_NHAZ - Hazardous and non-hazardous - Total', '', '', '', '', '', '', '', '']\n",
- "['WASTE', 'TOTAL - Total waste', '', '', '', '', '', '', '', '']\n",
- "['NACE_R2', 'A - Agriculture, forestry and fishing', '', '', '', '', '', '', '', '']\n",
- "['', '', '', '', '', '', '', '', '', '']\n",
- "['GEO', 'GEO(L)/TIME', '2004', '2006', '2008', '2010', '2012', '2014', '2016', '2018']\n",
- "['EU27_2020', 'European Union - 27 countries (from 2020)', 146.0, 131.0, 104.0, 47.0, 47.0, 41.0, 45.0, 45.0]\n",
- "['EU28', 'European Union - 28 countries (2013-2020)', 130.0, 116.0, 93.0, 42.0, 42.0, 37.0, 41.0, 41.0]\n",
- "['BE', 'Belgium', 114.0, 34.0, 27.0, 21.0, 15.0, 28.0, 24.0, 39.0]\n",
- "['BG', 'Bulgaria', 94.0, 83.0, 101.0, 84.0, 124.0, 116.0, 87.0, 44.0]\n",
- "['CZ', 'Czechia', 122.0, 31.0, 25.0, 11.0, 19.0, 13.0, 11.0, 39.0]\n",
- "['DK', 'Denmark', 4.0, 5.0, 7.0, 34.0, 13.0, 21.0, 35.0, 65.0]\n",
- "['DE', 'Germany (until 1990 former territory of the FRG)', 15.0, 18.0, 16.0, 3.0, 8.0, 5.0, 14.0, 12.0]\n",
- "['EE', 'Estonia', 135.0, 88.0, 179.0, 83.0, 59.0, 93.0, 87.0, 105.0]\n"
+ "('Generation of waste by waste category, hazardousness and NACE Rev. 2 activity [ENV_WASGEN__custom_3738744]', None, None, None, None, None, None, None, None, None, None, None, None, None, None)\n",
+ "('Open product page', 'Open in Data Browser', None, None, None, None, None, None, None, None, None, None, None, None, None)\n",
+ "('Description: ', '-', None, None, None, None, None, None, None, None, None, None, None, None, None)\n",
+ "('Last update of data: ', None, None, '13/09/2022 11:00', None, None, None, None, None, None, None, None, None, None, None)\n",
+ "('Last change of data structure: ', None, None, '13/09/2022 11:00', None, None, None, None, None, None, None, None, None, None, None)\n",
+ "(None, 'Institutional source(s)', None, None, None, None, None, None, None, None, None, None, None, None, None)\n",
+ "(None, None, 'Eurostat', None, None, None, None, None, None, None, None, None, None, None, None)\n",
+ "(None, 'Contents', 'Time frequency [FREQ]', 'Unit of measure [UNIT]', 'Hazard class [HAZARD]', 'Statistical classification of economic activities in the European Community (NACE Rev. 2) [NACE_R2]', 'Waste categories [WASTE]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 1', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Agriculture, forestry and fishing [A]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 2', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Mining and quarrying [B]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 3', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Manufacturing [C]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 4', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Electricity, gas, steam and air conditioning supply [D]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 5', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Water supply; sewerage, waste management and remediation activities [E]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 6', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Construction [F]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 7', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Services (except wholesale of waste and scrap) [G-U_X_G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 8', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Wholesale of waste and scrap [G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 9', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Households [EP_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 10', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'All NACE activities plus households [TOTAL_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 11', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Agriculture, forestry and fishing [A]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 12', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Mining and quarrying [B]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 13', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Manufacturing [C]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 14', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Electricity, gas, steam and air conditioning supply [D]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 15', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Water supply; sewerage, waste management and remediation activities [E]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 16', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Construction [F]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 17', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Services (except wholesale of waste and scrap) [G-U_X_G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 18', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Wholesale of waste and scrap [G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 19', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'Households [EP_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 20', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Hazardous [HAZ]', 'All NACE activities plus households [TOTAL_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 21', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Agriculture, forestry and fishing [A]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 22', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Mining and quarrying [B]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 23', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Manufacturing [C]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 24', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Electricity, gas, steam and air conditioning supply [D]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 25', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Water supply; sewerage, waste management and remediation activities [E]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 26', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Construction [F]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Sheet 27', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Services (except wholesale of waste and scrap) [G-U_X_G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 28', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Wholesale of waste and scrap [G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 29', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'Households [EP_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 30', 'Annual [A]', 'Kilograms per capita [KG_HAB]', 'Non-hazardous [NHAZ]', 'All NACE activities plus households [TOTAL_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 31', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Agriculture, forestry and fishing [A]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 32', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Mining and quarrying [B]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 33', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Manufacturing [C]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 34', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Electricity, gas, steam and air conditioning supply [D]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 35', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Water supply; sewerage, waste management and remediation activities [E]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 36', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Construction [F]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 37', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Services (except wholesale of waste and scrap) [G-U_X_G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 38', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Wholesale of waste and scrap [G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 39', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'Households [EP_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 40', 'Annual [A]', 'Tonne [T]', 'Hazardous and non-hazardous - Total [HAZ_NHAZ]', 'All NACE activities plus households [TOTAL_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 41', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Agriculture, forestry and fishing [A]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 42', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Mining and quarrying [B]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 43', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Manufacturing [C]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 44', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Electricity, gas, steam and air conditioning supply [D]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 45', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Water supply; sewerage, waste management and remediation activities [E]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 46', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Construction [F]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 47', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Services (except wholesale of waste and scrap) [G-U_X_G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 48', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Wholesale of waste and scrap [G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 49', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'Households [EP_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 50', 'Annual [A]', 'Tonne [T]', 'Hazardous [HAZ]', 'All NACE activities plus households [TOTAL_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 51', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Agriculture, forestry and fishing [A]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 52', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Mining and quarrying [B]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 53', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Manufacturing [C]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 54', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Electricity, gas, steam and air conditioning supply [D]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 55', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Water supply; sewerage, waste management and remediation activities [E]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 56', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Construction [F]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 57', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Services (except wholesale of waste and scrap) [G-U_X_G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 58', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Wholesale of waste and scrap [G4677]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 59', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'Households [EP_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n",
+ "(None, 'Feuille 60', 'Annual [A]', 'Tonne [T]', 'Non-hazardous [NHAZ]', 'All NACE activities plus households [TOTAL_HH]', 'Total waste [TOTAL]', None, None, None, None, None, None, None, None)\n"
]
}
],
"source": [
- "sh = book.sheet_by_index(0)\n",
+ "sh = book[book.sheetnames[0]]\n",
"\n",
"# get rows of the sheet\n",
- "for rx in range(20):\n",
- " print(sh.row_values(rx))"
+ "for row in sh.iter_rows(values_only=True):\n",
+ " if any(v is not None for v in row):\n",
+ " print(row)"
]
},
{
@@ -221,7 +270,7 @@
"\n",
"What is [NACE_R2](https://ec.europa.eu/eurostat/web/nace-rev2)?\n",
"\n",
- "> NACE ist das Akronym3 zur Bezeichnung der verschiedenen statistischen Systematiken der Wirtschaftszweige, die seit\n",
+ "> NACE ist das Akronym zur Bezeichnung der verschiedenen statistischen Systematiken der Wirtschaftszweige, die seit\n",
"1970 in der Europäischen Union entwickelt worden sind. Die NACE bildet den Rahmen für die Sammlung und Darstellung\n",
"einer breiten Palette statistischer, nach Wirtschaftszweigen untergliederter Daten aus dem Bereich Wirtschaft\n",
"(z. B. Produktion, Beschäftigung, Volkswirtschaftliche Gesamtrechnungen) und aus anderen Bereichen."
@@ -235,8 +284,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"from collections import defaultdict # why defaultdict? Cause if key is not found in the dictionary, then instead of KeyError, a new entry is created\n",
" # by declaration: list, set or int\n",
@@ -293,8 +341,7 @@
]
},
"execution_count": 10,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -310,8 +357,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"codes = {k: {s.split(\" - \")[0]: s.split(\" - \")[1] for s in v} for k,v in header.items()}"
]
@@ -356,8 +402,7 @@
]
},
"execution_count": 14,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -373,8 +418,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"def print_codes():\n",
" for k,v in codes.items():\n",
@@ -463,8 +507,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"file = \"data/env_wasgen.xls\"\n",
"df = pd.read_excel(file)"
@@ -1390,8 +1433,7 @@
]
},
"execution_count": 14,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -1407,8 +1449,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = pd.read_excel(file, header=11, nrows=40)"
]
@@ -2068,8 +2109,7 @@
]
},
"execution_count": 16,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -2146,8 +2186,7 @@
]
},
"execution_count": 17,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -2847,8 +2886,7 @@
]
},
"execution_count": 19,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -2900,8 +2938,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"def get_data_from_sheet(excel_file: str, header: tuple) -> pd.DataFrame:\n",
" book = xlrd.open_workbook(file, on_demand=True)\n",
@@ -2988,8 +3025,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = get_data_from_sheet(file, (\"KG_HAB\", \"HAZ_NHAZ\", \"TOTAL\", \"A\"))"
]
@@ -3536,8 +3572,7 @@
]
},
"execution_count": 24,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -3572,8 +3607,7 @@
]
},
"execution_count": 25,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
},
{
@@ -3614,8 +3648,7 @@
]
},
"execution_count": 26,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
},
{
@@ -3656,8 +3689,7 @@
]
},
"execution_count": 27,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
},
{
@@ -3922,10 +3954,8 @@
]
},
"execution_count": 27,
- "metadata": {
- },
- "output_type": "execute_result",
- "start": 0
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
@@ -3943,11 +3973,42 @@
"outputs": [
{
"data": {
- "text/html": "\n
\n\n \n\n"
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
},
"execution_count": 28,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -3964,8 +4025,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = get_data_from_sheet(file, (\"T\", \"HAZ_NHAZ\", \"TOTAL\", \"A\"))"
]
@@ -3981,11 +4041,42 @@
"outputs": [
{
"data": {
- "text/html": "\n\n\n \n\n"
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
},
"execution_count": 30,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -4591,8 +4682,7 @@
]
},
"execution_count": 31,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -4737,10 +4827,8 @@
]
},
"execution_count": 32,
- "metadata": {
- },
- "output_type": "execute_result",
- "start": 0
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
@@ -4866,8 +4954,7 @@
]
},
"execution_count": 32,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -4883,8 +4970,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = get_data_from_sheet(file, (\"KG_HAB\", \"HAZ_NHAZ\", \"TOTAL\", \"A\"))\n",
"df = df.reset_index().melt(id_vars=\"GEO\", var_name=\"year\", value_name=\"value\")\n",
@@ -5065,8 +5151,7 @@
]
},
"execution_count": 34,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -5120,11 +5205,42 @@
"outputs": [
{
"data": {
- "text/html": "\n\n\n \n\n"
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
},
"execution_count": 36,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -5186,8 +5302,7 @@
]
},
"execution_count": 55,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -5206,11 +5321,42 @@
"outputs": [
{
"data": {
- "text/html": "\n\n\n \n\n"
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
},
"execution_count": 58,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -5707,8 +5853,7 @@
]
},
"execution_count": 40,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -5733,8 +5878,7 @@
]
},
"execution_count": 41,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
},
{
@@ -5780,8 +5924,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df1 = get_data_from_sheet(file, (\"KG_HAB\", \"HAZ_NHAZ\", \"TOTAL\", \"A\"))\n",
"df1 = df1.reset_index().melt(id_vars=\"GEO\", var_name=\"year\", value_name=\"value\")\n",
@@ -5967,8 +6110,7 @@
]
},
"execution_count": 60,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -6149,8 +6291,7 @@
]
},
"execution_count": 61,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -6166,8 +6307,7 @@
"hideCode": false,
"hidePrompt": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = df1.append(df2)"
]
@@ -6345,8 +6485,7 @@
]
},
"execution_count": 63,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -6365,11 +6504,42 @@
"outputs": [
{
"data": {
- "text/html": "\n\n\n \n\n"
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
},
"execution_count": 64,
- "metadata": {
- },
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -6394,31 +6564,29 @@
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 4,
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"from utils import *"
]
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 5,
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
- "file = \"data/env_wasgen.xls\""
+ "file = \"data/env_wasgen_new.xlsx\""
]
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 6,
"metadata": {
"collapsed": false
},
@@ -6427,43 +6595,80 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Category: UNIT\n",
+ "Category: freq\n",
+ "---------\n",
+ "A: Annual\n",
+ "\n",
+ "Category: unit\n",
"---------\n",
"T: Tonne\n",
- "KG_HAB: Kilograms per capita\n",
"\n",
- "Category: HAZARD\n",
+ "Category: hazard\n",
"---------\n",
- "HAZ: Hazardous\n",
- "HAZ_NHAZ: Hazardous and non-hazardous\n",
- "NHAZ: Non-hazardous\n",
+ "HAZ_NHAZ: Hazardous and non-hazardous - Total\n",
"\n",
- "Category: WASTE\n",
+ "Category: nace_r2\n",
+ "---------\n",
+ "TOTAL_HH: All NACE activities plus households\n",
+ "\n",
+ "Category: waste\n",
"---------\n",
- "W09: Animal and vegetal wastes (subtotal, W091+W092+W093)\n",
- "SEC: Secondary waste (W033+W103+W128_13)\n",
- "TOT_X_MIN: Waste excluding major mineral wastes\n",
- "W06_07A: Recyclable wastes (subtotal, W06+W07 except W077)\n",
- "PRIM: Primary waste (TOTAL minus SEC)\n",
- "W11: Common sludges\n",
- "W10: Mixed ordinary wastes (subtotal, W101+W102+W103)\n",
"TOTAL: Total waste\n",
- "W01-05: Chemical and medical wastes (subtotal)\n",
- "W12-13: Mineral and solidified wastes (subtotal)\n",
- "W077_08: Equipment (subtotal, W077+W08A+W081+W0841)\n",
"\n",
- "Category: NACE_R2\n",
+ "Category: geo\n",
"---------\n",
- "EP_HH: Households\n",
- "B: Mining and quarrying\n",
- "A: Agriculture, forestry and fishing\n",
- "C: Manufacturing\n",
- "G4677: Wholesale of waste and scrap\n",
- "F: Construction\n",
- "TOTAL_HH: All NACE activities plus households\n",
- "D: Electricity, gas, steam and air conditioning supply\n",
- "G-U_X_G4677: Services (except wholesale of waste and scrap)\n",
- "E: Water supply; sewerage, waste management and remediation activities\n",
+ "EU27_2020: European Union - 27 countries (from 2020)\n",
+ "EU28: European Union - 28 countries (2013-2020)\n",
+ "BE: Belgium\n",
+ "BG: Bulgaria\n",
+ "CZ: Czechia\n",
+ "DK: Denmark\n",
+ "DE: Germany (until 1990 former territory of the FRG)\n",
+ "EE: Estonia\n",
+ "IE: Ireland\n",
+ "EL: Greece\n",
+ "ES: Spain\n",
+ "FR: France\n",
+ "HR: Croatia\n",
+ "IT: Italy\n",
+ "CY: Cyprus\n",
+ "LV: Latvia\n",
+ "LT: Lithuania\n",
+ "LU: Luxembourg\n",
+ "HU: Hungary\n",
+ "MT: Malta\n",
+ "NL: Netherlands\n",
+ "AT: Austria\n",
+ "PL: Poland\n",
+ "PT: Portugal\n",
+ "RO: Romania\n",
+ "SI: Slovenia\n",
+ "SK: Slovakia\n",
+ "FI: Finland\n",
+ "SE: Sweden\n",
+ "IS: Iceland\n",
+ "LI: Liechtenstein\n",
+ "NO: Norway\n",
+ "UK: United Kingdom\n",
+ "ME: Montenegro\n",
+ "MK: North Macedonia\n",
+ "AL: Albania\n",
+ "RS: Serbia\n",
+ "TR: Türkiye\n",
+ "BA: Bosnia and Herzegovina\n",
+ "XK: Kosovo (under United Nations Security Council Resolution 1244/99)\n",
+ "\n",
+ "Category: time\n",
+ "---------\n",
+ "2004: 2004\n",
+ "2006: 2006\n",
+ "2008: 2008\n",
+ "2010: 2010\n",
+ "2012: 2012\n",
+ "2014: 2014\n",
+ "2016: 2016\n",
+ "2018: 2018\n",
+ "2020: 2020\n",
"\n"
]
}
@@ -6474,19 +6679,18 @@
},
{
"cell_type": "code",
- "execution_count": 84,
+ "execution_count": 38,
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
- "df = get_data_from_excel(file, [(\"KG_HAB\", \"HAZ_NHAZ\", \"TOTAL\", \"A\")])"
+ "df = get_data_from_excel(file, [(\"KG_HAB\", \"HAZ_NHAZ\", \"A\", \"TOTAL\")])"
]
},
{
"cell_type": "code",
- "execution_count": 85,
+ "execution_count": 39,
"metadata": {
"collapsed": false
},
@@ -6516,8 +6720,8 @@
" value | \n",
" unit | \n",
" hazard | \n",
- " waste | \n",
" nace_r2 | \n",
+ " waste | \n",
" \n",
" \n",
" geo | \n",
@@ -6536,8 +6740,8 @@
" 146.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" EU28 | \n",
@@ -6545,8 +6749,8 @@
" 130.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BE | \n",
@@ -6554,8 +6758,8 @@
" 114.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BG | \n",
@@ -6563,8 +6767,8 @@
" 94.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" CZ | \n",
@@ -6572,8 +6776,8 @@
" 122.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" ... | \n",
@@ -6586,75 +6790,74 @@
"
\n",
" \n",
" AL | \n",
- " 2018 | \n",
+ " 2020 | \n",
" NaN | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" RS | \n",
- " 2018 | \n",
- " 12.0 | \n",
+ " 2020 | \n",
+ " 13.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" TR | \n",
- " 2018 | \n",
+ " 2020 | \n",
" 0.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BA | \n",
- " 2018 | \n",
- " 0.0 | \n",
+ " 2020 | \n",
+ " NaN | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" XK | \n",
- " 2018 | \n",
+ " 2020 | \n",
" NaN | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
"\n",
- "320 rows × 6 columns
\n",
+ "360 rows × 6 columns
\n",
""
],
"text/plain": [
- " year value unit hazard waste nace_r2\n",
+ " year value unit hazard nace_r2 waste\n",
"geo \n",
- "EU27_2020 2004 146.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "EU28 2004 130.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "BE 2004 114.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "BG 2004 94.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "CZ 2004 122.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "... ... ... ... ... ... ...\n",
- "AL 2018 NaN KG_HAB HAZ_NHAZ TOTAL A\n",
- "RS 2018 12.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "TR 2018 0.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "BA 2018 0.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "XK 2018 NaN KG_HAB HAZ_NHAZ TOTAL A\n",
+ "EU27_2020 2004 146.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "EU28 2004 130.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BE 2004 114.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BG 2004 94.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "CZ 2004 122.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "... ... ... ... ... ... ...\n",
+ "AL 2020 NaN KG_HAB HAZ_NHAZ A TOTAL\n",
+ "RS 2020 13.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "TR 2020 0.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BA 2020 NaN KG_HAB HAZ_NHAZ A TOTAL\n",
+ "XK 2020 NaN KG_HAB HAZ_NHAZ A TOTAL\n",
"\n",
- "[320 rows x 6 columns]"
+ "[360 rows x 6 columns]"
]
},
- "execution_count": 85,
- "metadata": {
- },
+ "execution_count": 39,
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -6664,7 +6867,7 @@
},
{
"cell_type": "code",
- "execution_count": 86,
+ "execution_count": 40,
"metadata": {
"collapsed": false
},
@@ -6674,18 +6877,18 @@
"output_type": "stream",
"text": [
"\n",
- "CategoricalIndex: 320 entries, EU27_2020 to XK\n",
+ "CategoricalIndex: 360 entries, EU27_2020 to XK\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
- " 0 year 320 non-null int64 \n",
- " 1 value 287 non-null float64 \n",
- " 2 unit 320 non-null category\n",
- " 3 hazard 320 non-null category\n",
- " 4 waste 320 non-null category\n",
- " 5 nace_r2 320 non-null category\n",
- "dtypes: category(4), float64(1), int64(1)\n",
- "memory usage: 8.4 KB\n"
+ " 0 year 360 non-null int32 \n",
+ " 1 value 321 non-null float64 \n",
+ " 2 unit 360 non-null category\n",
+ " 3 hazard 360 non-null category\n",
+ " 4 nace_r2 360 non-null category\n",
+ " 5 waste 360 non-null category\n",
+ "dtypes: category(4), float64(1), int32(1)\n",
+ "memory usage: 7.8 KB\n"
]
}
],
@@ -6695,11 +6898,19 @@
},
{
"cell_type": "code",
- "execution_count": 87,
+ "execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\tools\\miniconda3\\lib\\site-packages\\openpyxl\\styles\\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n",
+ " warn(\"Workbook contains no default style, apply openpyxl's default\")\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -6725,8 +6936,8 @@
" value | \n",
" unit | \n",
" hazard | \n",
- " waste | \n",
" nace_r2 | \n",
+ " waste | \n",
" \n",
" \n",
" geo | \n",
@@ -6745,8 +6956,8 @@
" 146.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" EU28 | \n",
@@ -6754,8 +6965,8 @@
" 130.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BE | \n",
@@ -6763,8 +6974,8 @@
" 114.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BG | \n",
@@ -6772,8 +6983,8 @@
" 94.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" CZ | \n",
@@ -6781,8 +6992,8 @@
" 122.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" ... | \n",
@@ -6795,90 +7006,97 @@
"
\n",
" \n",
" AL | \n",
- " 2018 | \n",
+ " 2020 | \n",
" NaN | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" B | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" RS | \n",
- " 2018 | \n",
- " 5532.0 | \n",
+ " 2020 | \n",
+ " 6626.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" B | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" TR | \n",
- " 2018 | \n",
- " 214.0 | \n",
+ " 2020 | \n",
+ " 331.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" B | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BA | \n",
- " 2018 | \n",
- " 158.0 | \n",
+ " 2020 | \n",
+ " NaN | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" B | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" XK | \n",
- " 2018 | \n",
+ " 2020 | \n",
" NaN | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" B | \n",
+ " TOTAL | \n",
"
\n",
" \n",
"\n",
- "640 rows × 6 columns
\n",
+ "720 rows × 6 columns
\n",
""
],
"text/plain": [
- " year value unit hazard waste nace_r2\n",
+ " year value unit hazard nace_r2 waste\n",
"geo \n",
- "EU27_2020 2004 146.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "EU28 2004 130.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "BE 2004 114.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "BG 2004 94.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "CZ 2004 122.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "... ... ... ... ... ... ...\n",
- "AL 2018 NaN KG_HAB HAZ_NHAZ TOTAL B\n",
- "RS 2018 5532.0 KG_HAB HAZ_NHAZ TOTAL B\n",
- "TR 2018 214.0 KG_HAB HAZ_NHAZ TOTAL B\n",
- "BA 2018 158.0 KG_HAB HAZ_NHAZ TOTAL B\n",
- "XK 2018 NaN KG_HAB HAZ_NHAZ TOTAL B\n",
+ "EU27_2020 2004 146.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "EU28 2004 130.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BE 2004 114.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BG 2004 94.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "CZ 2004 122.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "... ... ... ... ... ... ...\n",
+ "AL 2020 NaN KG_HAB HAZ_NHAZ B TOTAL\n",
+ "RS 2020 6626.0 KG_HAB HAZ_NHAZ B TOTAL\n",
+ "TR 2020 331.0 KG_HAB HAZ_NHAZ B TOTAL\n",
+ "BA 2020 NaN KG_HAB HAZ_NHAZ B TOTAL\n",
+ "XK 2020 NaN KG_HAB HAZ_NHAZ B TOTAL\n",
"\n",
- "[640 rows x 6 columns]"
+ "[720 rows x 6 columns]"
]
},
- "execution_count": 87,
- "metadata": {
- },
+ "execution_count": 41,
+ "metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df = get_data_from_excel(file, [(\"KG_HAB\", \"HAZ_NHAZ\", \"TOTAL\", \"A\"), (\"KG_HAB\", \"HAZ_NHAZ\", \"TOTAL\", \"B\")])\n",
+ "df = get_data_from_excel(file, [(\"KG_HAB\", \"HAZ_NHAZ\", \"A\", \"TOTAL\"), (\"KG_HAB\", \"HAZ_NHAZ\", \"B\", \"TOTAL\")])\n",
"df"
]
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\tools\\miniconda3\\lib\\site-packages\\openpyxl\\styles\\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n",
+ " warn(\"Workbook contains no default style, apply openpyxl's default\")\n"
+ ]
+ }
],
"source": [
"df = get_data_from_excel(file)"
@@ -6886,7 +7104,7 @@
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": 43,
"metadata": {
"collapsed": false
},
@@ -6896,18 +7114,18 @@
"output_type": "stream",
"text": [
"\n",
- "CategoricalIndex: 211200 entries, EU27_2020 to XK\n",
+ "CategoricalIndex: 21600 entries, EU27_2020 to XK\n",
"Data columns (total 6 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 year 211200 non-null int64 \n",
- " 1 value 95562 non-null float64 \n",
- " 2 unit 211200 non-null category\n",
- " 3 hazard 211200 non-null category\n",
- " 4 waste 211200 non-null category\n",
- " 5 nace_r2 211200 non-null category\n",
- "dtypes: category(4), float64(1), int64(1)\n",
- "memory usage: 4.2 MB\n"
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 year 21600 non-null int32 \n",
+ " 1 value 18626 non-null float64 \n",
+ " 2 unit 21600 non-null category\n",
+ " 3 hazard 21600 non-null category\n",
+ " 4 nace_r2 21600 non-null category\n",
+ " 5 waste 21600 non-null category\n",
+ "dtypes: category(4), float64(1), int32(1)\n",
+ "memory usage: 360.7 KB\n"
]
}
],
@@ -6917,7 +7135,182 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " year | \n",
+ " value | \n",
+ " unit | \n",
+ " hazard | \n",
+ " nace_r2 | \n",
+ " waste | \n",
+ "
\n",
+ " \n",
+ " geo | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " EU27_2020 | \n",
+ " 2004 | \n",
+ " 146.0 | \n",
+ " KG_HAB | \n",
+ " HAZ_NHAZ | \n",
+ " A | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " EU28 | \n",
+ " 2004 | \n",
+ " 130.0 | \n",
+ " KG_HAB | \n",
+ " HAZ_NHAZ | \n",
+ " A | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " BE | \n",
+ " 2004 | \n",
+ " 114.0 | \n",
+ " KG_HAB | \n",
+ " HAZ_NHAZ | \n",
+ " A | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " BG | \n",
+ " 2004 | \n",
+ " 94.0 | \n",
+ " KG_HAB | \n",
+ " HAZ_NHAZ | \n",
+ " A | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " CZ | \n",
+ " 2004 | \n",
+ " 122.0 | \n",
+ " KG_HAB | \n",
+ " HAZ_NHAZ | \n",
+ " A | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " AL | \n",
+ " 2020 | \n",
+ " NaN | \n",
+ " T | \n",
+ " NHAZ | \n",
+ " TOTAL_HH | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " RS | \n",
+ " 2020 | \n",
+ " 47307595.0 | \n",
+ " T | \n",
+ " NHAZ | \n",
+ " TOTAL_HH | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " TR | \n",
+ " 2020 | \n",
+ " 76949950.0 | \n",
+ " T | \n",
+ " NHAZ | \n",
+ " TOTAL_HH | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " BA | \n",
+ " 2020 | \n",
+ " 6743515.0 | \n",
+ " T | \n",
+ " NHAZ | \n",
+ " TOTAL_HH | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ " XK | \n",
+ " 2020 | \n",
+ " 2592826.0 | \n",
+ " T | \n",
+ " NHAZ | \n",
+ " TOTAL_HH | \n",
+ " TOTAL | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
21600 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " year value unit hazard nace_r2 waste\n",
+ "geo \n",
+ "EU27_2020 2004 146.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "EU28 2004 130.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BE 2004 114.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BG 2004 94.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "CZ 2004 122.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "... ... ... ... ... ... ...\n",
+ "AL 2020 NaN T NHAZ TOTAL_HH TOTAL\n",
+ "RS 2020 47307595.0 T NHAZ TOTAL_HH TOTAL\n",
+ "TR 2020 76949950.0 T NHAZ TOTAL_HH TOTAL\n",
+ "BA 2020 6743515.0 T NHAZ TOTAL_HH TOTAL\n",
+ "XK 2020 2592826.0 T NHAZ TOTAL_HH TOTAL\n",
+ "\n",
+ "[21600 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
"metadata": {
"collapsed": false,
"scrolled": true
@@ -6929,31 +7322,25 @@
""
]
},
- "execution_count": 34,
- "metadata": {
- },
+ "execution_count": 48,
+ "metadata": {},
"output_type": "execute_result"
},
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 34,
"metadata": {
- "image/png": {
- "height": 472,
- "width": 723
- },
"needs_background": "light"
},
- "output_type": "execute_result"
+ "output_type": "display_data"
}
],
"source": [
- "df[(df.year == \"2016\") & (df.unit == \"KG_HAB\") & (df.hazard == \"HAZ_NHAZ\") & (df.waste == \"TOTAL\") & (df.nace_r2 == \"TOTAL_HH\")].value.plot.bar()"
+ "df[(df.year == 2016) & (df.unit == \"KG_HAB\") & (df.hazard == \"HAZ_NHAZ\") & (df.waste == \"TOTAL\") & (df.nace_r2 == \"TOTAL_HH\")].value.plot.bar(figsize=(20,8))"
]
},
{
@@ -6971,8 +7358,7 @@
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df.to_csv(\"data/env_wasgen_combined.csv\")"
]
@@ -6983,8 +7369,7 @@
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = pd.read_csv(\"data/env_wasgen_combined.csv\")"
]
@@ -7027,8 +7412,7 @@
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df[df.select_dtypes(\"object\").columns] = df.select_dtypes(\"object\").apply(lambda x: pd.Series(x).astype(\"category\"))"
]
@@ -7071,8 +7455,7 @@
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = df.set_index(\"geo\")"
]
@@ -7110,31 +7493,29 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 49,
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df.to_parquet(\"data/env_wasgen_combined.parquet\")"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 50,
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
+ "outputs": [],
"source": [
"df = pd.read_parquet(\"data/env_wasgen_combined.parquet\")"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 51,
"metadata": {
"collapsed": false
},
@@ -7164,8 +7545,8 @@
" value | \n",
" unit | \n",
" hazard | \n",
- " waste | \n",
" nace_r2 | \n",
+ " waste | \n",
" \n",
" \n",
" geo | \n",
@@ -7184,8 +7565,8 @@
" 146.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" EU28 | \n",
@@ -7193,8 +7574,8 @@
" 130.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BE | \n",
@@ -7202,8 +7583,8 @@
" 114.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BG | \n",
@@ -7211,8 +7592,8 @@
" 94.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" CZ | \n",
@@ -7220,8 +7601,8 @@
" 122.0 | \n",
" KG_HAB | \n",
" HAZ_NHAZ | \n",
- " TOTAL | \n",
" A | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" ... | \n",
@@ -7234,75 +7615,74 @@
"
\n",
" \n",
" AL | \n",
- " 2018 | \n",
+ " 2020 | \n",
" NaN | \n",
" T | \n",
" NHAZ | \n",
- " TOT_X_MIN | \n",
" TOTAL_HH | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" RS | \n",
- " 2018 | \n",
- " 11780914.0 | \n",
+ " 2020 | \n",
+ " 47307595.0 | \n",
" T | \n",
" NHAZ | \n",
- " TOT_X_MIN | \n",
" TOTAL_HH | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" TR | \n",
- " 2018 | \n",
- " 74786520.0 | \n",
+ " 2020 | \n",
+ " 76949950.0 | \n",
" T | \n",
" NHAZ | \n",
- " TOT_X_MIN | \n",
" TOTAL_HH | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" BA | \n",
- " 2018 | \n",
- " 5610790.0 | \n",
+ " 2020 | \n",
+ " 6743515.0 | \n",
" T | \n",
" NHAZ | \n",
- " TOT_X_MIN | \n",
" TOTAL_HH | \n",
+ " TOTAL | \n",
"
\n",
" \n",
" XK | \n",
- " 2018 | \n",
- " NaN | \n",
+ " 2020 | \n",
+ " 2592826.0 | \n",
" T | \n",
" NHAZ | \n",
- " TOT_X_MIN | \n",
" TOTAL_HH | \n",
+ " TOTAL | \n",
"
\n",
" \n",
"\n",
- "211200 rows × 6 columns
\n",
+ "21600 rows × 6 columns
\n",
""
],
"text/plain": [
- " year value unit hazard waste nace_r2\n",
- "geo \n",
- "EU27_2020 2004 146.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "EU28 2004 130.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "BE 2004 114.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "BG 2004 94.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "CZ 2004 122.0 KG_HAB HAZ_NHAZ TOTAL A\n",
- "... ... ... ... ... ... ...\n",
- "AL 2018 NaN T NHAZ TOT_X_MIN TOTAL_HH\n",
- "RS 2018 11780914.0 T NHAZ TOT_X_MIN TOTAL_HH\n",
- "TR 2018 74786520.0 T NHAZ TOT_X_MIN TOTAL_HH\n",
- "BA 2018 5610790.0 T NHAZ TOT_X_MIN TOTAL_HH\n",
- "XK 2018 NaN T NHAZ TOT_X_MIN TOTAL_HH\n",
+ " year value unit hazard nace_r2 waste\n",
+ "geo \n",
+ "EU27_2020 2004 146.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "EU28 2004 130.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BE 2004 114.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "BG 2004 94.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "CZ 2004 122.0 KG_HAB HAZ_NHAZ A TOTAL\n",
+ "... ... ... ... ... ... ...\n",
+ "AL 2020 NaN T NHAZ TOTAL_HH TOTAL\n",
+ "RS 2020 47307595.0 T NHAZ TOTAL_HH TOTAL\n",
+ "TR 2020 76949950.0 T NHAZ TOTAL_HH TOTAL\n",
+ "BA 2020 6743515.0 T NHAZ TOTAL_HH TOTAL\n",
+ "XK 2020 2592826.0 T NHAZ TOTAL_HH TOTAL\n",
"\n",
- "[211200 rows x 6 columns]"
+ "[21600 rows x 6 columns]"
]
},
- "execution_count": 9,
- "metadata": {
- },
+ "execution_count": 51,
+ "metadata": {},
"output_type": "execute_result"
}
],
@@ -7312,7 +7692,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 52,
"metadata": {
"collapsed": false
},
@@ -7322,18 +7702,18 @@
"output_type": "stream",
"text": [
"\n",
- "CategoricalIndex: 211200 entries, EU27_2020 to XK\n",
+ "CategoricalIndex: 21600 entries, EU27_2020 to XK\n",
"Data columns (total 6 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 year 211200 non-null int64 \n",
- " 1 value 95562 non-null float64 \n",
- " 2 unit 211200 non-null category\n",
- " 3 hazard 211200 non-null category\n",
- " 4 waste 211200 non-null category\n",
- " 5 nace_r2 211200 non-null category\n",
- "dtypes: category(4), float64(1), int64(1)\n",
- "memory usage: 4.2 MB\n"
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 year 21600 non-null int32 \n",
+ " 1 value 18626 non-null float64 \n",
+ " 2 unit 21600 non-null category\n",
+ " 3 hazard 21600 non-null category\n",
+ " 4 nace_r2 21600 non-null category\n",
+ " 5 waste 21600 non-null category\n",
+ "dtypes: category(4), float64(1), int32(1)\n",
+ "memory usage: 360.7 KB\n"
]
}
],
@@ -7343,7 +7723,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 54,
"metadata": {
"collapsed": false
},
@@ -7354,59 +7734,43 @@
""
]
},
- "execution_count": 12,
- "metadata": {
- },
+ "execution_count": 54,
+ "metadata": {},
"output_type": "execute_result"
},
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 12,
"metadata": {
- "image/png": {
- "height": 482,
- "width": 707
- },
"needs_background": "light"
},
- "output_type": "execute_result"
+ "output_type": "display_data"
}
],
"source": [
- "df[(df.year == 2016) & (df.unit == \"T\") & (df.hazard == \"HAZ_NHAZ\") & (df.waste == \"TOTAL\") & (df.nace_r2 == \"TOTAL_HH\")].value.plot.bar()"
+ "df[(df.year == 2016) & (df.unit == \"T\") & (df.hazard == \"HAZ_NHAZ\") & (df.waste == \"TOTAL\") & (df.nace_r2 == \"TOTAL_HH\")].value.plot.bar(figsize=(20,8))"
]
},
{
"cell_type": "code",
- "execution_count": 0,
+ "execution_count": null,
"metadata": {
"collapsed": false
},
- "outputs": [
- ],
- "source": [
- ]
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"hide_code_all_hidden": false,
"kernelspec": {
- "display_name": "Python 3 (system-wide)",
+ "display_name": "Python 3.9.7 ('base')",
"language": "python",
- "metadata": {
- "cocalc": {
- "description": "Python 3 programming language",
- "priority": 100,
- "url": "https://www.python.org/"
- }
- },
- "name": "python3",
- "resource_dir": "/ext/jupyter/kernels/python3"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -7418,9 +7782,14 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.9.7"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "3247f7d4635bb288d9e06d3deacee818856115b2677ccfdf5a578edab993fe5f"
+ }
}
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/02 Dataset env_wasgen.ipynb b/02 Dataset env_wasgen.ipynb
index b88abd7..a618d46 100644
--- a/02 Dataset env_wasgen.ipynb
+++ b/02 Dataset env_wasgen.ipynb
@@ -126,26 +126,27 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\tools\\miniconda3\\lib\\site-packages\\openpyxl\\styles\\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n",
+ " warn(\"Workbook contains no default style, apply openpyxl's default\")\n"
+ ]
+ }
+ ],
"source": [
"codes = get_header_codes_from_excel(file)"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 5,
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/miay/miniconda/lib/python3.9/site-packages/openpyxl/styles/stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default\n",
- " warn(\"Workbook contains no default style, apply openpyxl's default\")\n"
- ]
- },
{
"name": "stdout",
"output_type": "stream",
@@ -4305,7 +4306,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3.9.12",
+ "display_name": "Python 3.9.7 ('base')",
"language": "python",
"name": "python3"
},
@@ -4319,11 +4320,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.12"
+ "version": "3.9.7"
},
"vscode": {
"interpreter": {
- "hash": "854afd56987f2f89f833600b1696b8dd8f924afccdbd759679e77e1a3b52b928"
+ "hash": "3247f7d4635bb288d9e06d3deacee818856115b2677ccfdf5a578edab993fe5f"
}
}
},
diff --git a/README.md b/README.md
index b291d3c..f8dd425 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
# Data-Science-in-the-Wild
-[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pmayd/Data-Science-in-the-Wild/HEAD)
+[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pmayd/Data-Science-in-the-Wild/HEAD)
This repository contains the material for our Data Science in the Wild workshop at [Spartakiade 2022](https://spartakiade.org/).
## Data sets
- [Eurostat](https://ec.europa.eu/eurostat/web/main/data/database)
- - We need `Database by themes` -> `Environment and energy` -> `Environment (env)` -> `Waste (env_was)` -> `Waste generation and treatment (env_wasgt)` -> `env_wasgen`.
- - You can directly jump into the data browser [here](https://ec.europa.eu/eurostat/databrowser/view/env_wasgen/default/table?lang=en).
+ - We need `Database by themes` -> `Environment and energy` -> `Environment (env)` -> `Waste (env_was)` -> `Waste generation and treatment (env_wasgt)` -> `env_wasgen`.
+ - You can directly jump into the data browser [here](https://ec.europa.eu/eurostat/databrowser/view/env_wasgen/default/table?lang=en).
- [GENESIS-Online](https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Gesundheit/Glossar/genesis.html)
- - You can find the documentation of the API [here](https://www.destatis.de/DE/Service/OpenData/genesis-api-webservice-oberflaeche.html) (yes -- it's a PDF).
- - You need a registered user to be able to use the GENESIS API, so register your user [here]().
\ No newline at end of file
+ - You can find the documentation of the API [here](https://www.destatis.de/DE/Service/OpenData/genesis-api-webservice-oberflaeche.html) (yes -- it's a PDF).
+ - You need a registered account to use the GENESIS API; you can register [here](https://www-genesis.destatis.de/genesis/online?Menu=Registrierung#abreadcrumb).
\ No newline at end of file
diff --git a/data/env_wasgen_combined.parquet b/data/env_wasgen_combined.parquet
index d332c44..a6a566a 100644
Binary files a/data/env_wasgen_combined.parquet and b/data/env_wasgen_combined.parquet differ
diff --git a/data/env_wasgen_new.xlsx b/data/env_wasgen_new.xlsx
index a07f32e..b46f469 100644
Binary files a/data/env_wasgen_new.xlsx and b/data/env_wasgen_new.xlsx differ
diff --git a/data/env_wasgen_old.xls b/data/env_wasgen_old.xls
index 87741a7..1e0b8a8 100644
Binary files a/data/env_wasgen_old.xls and b/data/env_wasgen_old.xls differ
diff --git a/requirements.txt b/requirements.txt
index 33d2cb0..ab64755 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,5 @@ matplotlib
numpy
pandas
plotly
-xlrd
-statsmodels
+openpyxl
scikit-learn
\ No newline at end of file
diff --git a/utils.py b/utils.py
index d8d1b73..f92e4f6 100644
--- a/utils.py
+++ b/utils.py
@@ -1,10 +1,10 @@
from collections import defaultdict
-from typing import Union, List
+from typing import List, Union
-import pandas as pd
import openpyxl
+import pandas as pd
-HEADER_ROWS = range(4,8)
+HEADER_ROWS = range(6, 10)
DIMENSIONS_SHEET_NAME = "Structure"
DIMENSIONS_CELL_RANGE = "B4:E1000"
DIMENSIONS_FIRST_ROW = 3
@@ -13,13 +13,21 @@
DIMENSIONS_COL_LABEL = 3
-def get_header_codes_from_excel(excel_file: str):
+def get_header_codes_from_excel(excel_file: str) -> dict:
+ """Return the dimension categories, codes and labels of a Eurostat dataset.
+
+ Args:
+ excel_file (str): Path to a local excel file.
+
+ Returns:
+ dict: A dictionary mapping each category to a dict of code -> label pairs.
+ """
book = openpyxl.load_workbook(excel_file, data_only=True)
-
+
codes = defaultdict(dict)
sheet = book[DIMENSIONS_SHEET_NAME]
cells = sheet[DIMENSIONS_CELL_RANGE]
-
+
for row in cells:
cat = row[DIMENSIONS_COL_CAT].value
code = row[DIMENSIONS_COL_CODE].value
@@ -33,38 +41,47 @@ def get_header_codes_from_excel(excel_file: str):
def print_codes(excel_file: str):
+ """Print the dimension categories, codes and labels of a Eurostat dataset.
+
+ Args:
+ excel_file (str): Path to a local excel file.
+ """
codes = get_header_codes_from_excel(excel_file)
- for k,v in codes.items():
+ for k, v in codes.items():
print("Category: ", k)
print("---------")
- for k,v in v.items():
+ for k, v in v.items():
print(f"{k}: {v}")
print()
-def get_data_from_excel(excel_file: str, headers: Union[tuple, List[tuple]] = None) -> pd.DataFrame:
- book = xlrd.open_workbook(excel_file, on_demand=True)
+def get_data_from_excel(
+ excel_file: str, headers: Union[tuple, List[tuple]] = None
+) -> pd.DataFrame:
+ book = openpyxl.load_workbook(excel_file, data_only=True)
list_of_df = []
if isinstance(headers, tuple):
headers = [headers]
- for sheet in book.sheet_names():
- sh = book.sheet_by_name(sheet)
+ for sheetname in book.sheetnames[2:]:
+ sheet = book[sheetname]
header_names = []
header_values = []
- for row in range(6,10):
- name = sh.cell_value(row, 0)
- value = sh.cell_value(row, 1)
-
- if not name:
+ for row in HEADER_ROWS:
+ name = sheet.cell(row=row, column=1).value
+ value = sheet.cell(row=row, column=3).value
+
+ if name is None:
break
-
+
+ name = name.split("[")[1].strip("]")
+ value = value.split("[")[1].strip("]")
header_names.append(name)
- header_values.append(value.split(" - ")[0])
-
+ header_values.append(value)
+
header_names = tuple(header_names)
header_values = tuple(header_values)
@@ -83,26 +100,46 @@ def get_data_from_excel(excel_file: str, headers: Union[tuple, List[tuple]] = No
# find header row
header_row = 0
- for row in range(20):
- if sh.cell_value(row, 0) == "GEO":
+ for row in range(1, 21):
+ value = sheet.cell(row=row, column=1).value
+ if value is not None and value.startswith("TIME"):
header_row = row
break
nrows = header_row
- for row in range(header_row, 100):
- if sh.cell_value(row, 0) == "":
+ for row in range(header_row, 101):
+ if sheet.cell(row=row, column=1).value is None:
nrows = row - header_row - 1
break
-
- df_sheet = pd.read_excel(excel_file, sheet_name=sheet, header=header_row, nrows=nrows, na_values=":").drop(columns="GEO(L)/TIME").rename(columns={'GEO': 'geo'})
+
+ df_sheet = pd.read_excel(
+ excel_file,
+ sheet_name=sheetname,
+ header=header_row-1,
+ nrows=nrows,
+ na_values=":",
+ )
+ df_sheet = df_sheet.rename(
+ columns={
+ df_sheet.columns[0]: df_sheet.iloc[0, 0],
+ df_sheet.columns[1]: df_sheet.iloc[0, 1],
+ }
+ )
+ df_sheet = df_sheet.iloc[1:]
+ df_sheet = df_sheet.drop(columns="GEO (Labels)")
+ df_sheet = df_sheet.rename(columns={"GEO (Codes)": "geo"})
df_sheet = df_sheet.melt(id_vars="geo", var_name="year", value_name="value")
- df_sheet = df_sheet.assign(**{x.lower():y for x,y in zip(header_names, header_values) if x})
+ df_sheet = df_sheet.assign(
+ **{x.lower(): y for x, y in zip(header_names, header_values) if x}
+ )
list_of_df.append(df_sheet)
df = pd.concat(list_of_df)
df.year = df.year.astype(int)
- df[df.select_dtypes("object").columns] = df.select_dtypes("object").astype("category")
+ df[df.select_dtypes("object").columns] = df.select_dtypes("object").astype(
+ "category"
+ )
df = df.set_index("geo")
return df