Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/19 improve readability of the table format #42

Merged
merged 6 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion src/pystatis/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ def __init__(self, name: str):
self.data = pd.DataFrame()
self.metadata: dict = {}

def get_data(self, area: str = "all", **kwargs):
def get_data(self, area: str = "all", prettify: bool = True, **kwargs):
"""Downloads raw data and metadata from GENESIS-Online.

Additional keyword arguments are passed on to the GENESIS-Online GET request for tablefile.

Args:
area (str, optional): Area to search for the object in GENESIS-Online. Defaults to "all".
prettify (bool, optional): Reformats the table into a readable format. Defaults to True.
"""
params = {"name": self.name, "area": area, "format": "ffcsv"}

Expand All @@ -38,12 +39,46 @@ def get_data(self, area: str = "all", **kwargs):
endpoint="data", method="tablefile", params=params, as_json=False
)
assert isinstance(raw_data, str) # nosec assert_used

self.raw_data = raw_data
data_str = StringIO(raw_data)
self.data = pd.read_csv(data_str, sep=";")

if prettify:
self.data = self.prettify_table(self.data)

metadata = load_data(
endpoint="metadata", method="table", params=params, as_json=True
)
assert isinstance(metadata, dict) # nosec assert_used

self.metadata = metadata

@staticmethod
def prettify_table(data: pd.DataFrame) -> pd.DataFrame:
    """Reformat the raw flat-file data into a more readable table.

    The readable column names are taken from the first data row
    (``Zeit_Label`` and the ``*Merkmal_Label`` columns), so at least one
    row is needed to derive them.

    Args:
        data (pd.DataFrame): A pandas dataframe created from raw_data.

    Returns:
        pd.DataFrame: Formatted dataframe that omits all unnecessary Code columns
        and includes informative column names. If ``data`` is empty, an
        unmodified copy of ``data`` is returned instead of raising.
    """
    # Without a first row there is nothing to read the labels from;
    # indexing with .iloc[0] below would raise IndexError.
    if data.empty:
        return data.copy()

    # Extracts time column with name from first element of Zeit_Label column
    time = pd.DataFrame({data["Zeit_Label"].iloc[0]: data["Zeit"]})

    # Extracts new column names from first values of the Merkmal_Label columns
    # and assigns these to the relevant attribute columns (Auspraegung_Label).
    # Work on an explicit copy so relabelling never touches the input frame.
    attributes = data.filter(like="Auspraegung_Label").copy()
    attributes.columns = data.filter(like="Merkmal_Label").iloc[0].tolist()

    # Selects all columns containing the values
    values = data.filter(like="__").copy()

    # Given a name like BEV036__Bevoelkerung_in_Hauptwohnsitzhaushalten__1000
    # extracts the readable label and omits both the code and the unit
    values.columns = [name.split("__")[1] for name in values.columns]

    return pd.concat([time, attributes, values], axis=1)
1 change: 0 additions & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import copy
import os
from configparser import ConfigParser
from pathlib import Path
Expand Down
2 changes: 0 additions & 2 deletions tests/test_db.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import logging
from configparser import ConfigParser

import pytest

from pystatis import config, db
from pystatis.exception import PystatisConfigError


@pytest.fixture()
Expand Down
57 changes: 57 additions & 0 deletions tests/test_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd
import pytest

import pystatis

# Semicolon-separated fixture returned by the mocked "tablefile" request:
# one header line followed by 17 data rows with 10 columns each.
# The last column holds the value and uses a decimal comma.
EASY_TABLE = """Statistik_Code;Statistik_Label;Zeit_Code;Zeit_Label;Zeit;1_Merkmal_Code;1_Merkmal_Label;1_Auspraegung_Code;1_Auspraegung_Label; FLC006__Gebietsflaeche__qkm
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;08;Baden-Württemberg;35747,85
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;09;Bayern;70541,58
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;11;Berlin;891,12
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;12;Brandenburg;29654,38
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;04;Bremen;419,61
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;02;Hamburg;755,09
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;06;Hessen;21115,62
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;13;Mecklenburg-Vorpommern;23294,90
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;03;Niedersachsen;47709,90
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;05;Nordrhein-Westfalen;34112,72
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;07;Rheinland-Pfalz;19857,97
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;10;Saarland;2571,52
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;14;Sachsen;18449,86
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;15;Sachsen-Anhalt;20467,20
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;01;Schleswig-Holstein;15804,30
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;16;Thüringen;16202,37
11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;;Insgesamt;357595,99"""


def mocked_load_data(endpoint, method, params, as_json):
    """Stand-in for ``load_data`` that serves canned responses.

    Returns the ``EASY_TABLE`` fixture for the data/tablefile request and a
    small dict for the metadata/table request. ``params`` and ``as_json`` are
    accepted for signature compatibility but not inspected.

    Raises:
        NotImplementedError: For any other endpoint/method combination.
    """
    request = (endpoint, method)
    if request == ("data", "tablefile"):
        return EASY_TABLE
    if request == ("metadata", "table"):
        return {"metadata": "table"}
    raise NotImplementedError


def test_get_data(monkeypatch):
    """Check that ``get_data(prettify=False)`` stores the unformatted table.

    ``pystatis.table.load_data`` is replaced by ``mocked_load_data`` so that
    the data/tablefile request yields ``EASY_TABLE`` without network access.
    """
    monkeypatch.setattr(pystatis.table, "load_data", mocked_load_data)
    raw_table = pystatis.Table(name="11111-0001")
    raw_table.get_data(prettify=False)

    # Parsed frame keeps every column of the flat file
    frame = raw_table.data
    assert frame.shape == (17, 10)
    assert isinstance(frame, pd.DataFrame)
    assert not frame.empty

    # The raw response text is kept alongside the parsed frame
    raw_text = raw_table.raw_data
    assert isinstance(raw_text, str)
    assert raw_text != ""


def test_prettify(monkeypatch):
    """Check that ``get_data(prettify=True)`` yields the reduced, relabelled table."""
    monkeypatch.setattr(pystatis.table, "load_data", mocked_load_data)
    pretty_table = pystatis.Table(name="11111-0001")
    pretty_table.get_data(prettify=True)

    expected_columns = ["Stichtag", "Bundesländer", "Gebietsflaeche"]
    frame = pretty_table.data
    assert frame.shape == (17, 3)
    assert frame.columns.to_list() == expected_columns
Loading