From f12830c2c57ad4bb09804d8e724f300d9255c0ea Mon Sep 17 00:00:00 2001 From: Zosia Borowska Date: Wed, 29 Nov 2023 18:16:22 +0000 Subject: [PATCH 1/6] Reformatting the raw data tables for readability --- src/pystatis/table.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/pystatis/table.py b/src/pystatis/table.py index 071fc93..ad0ea3b 100644 --- a/src/pystatis/table.py +++ b/src/pystatis/table.py @@ -41,9 +41,40 @@ def get_data(self, area: str = "all", **kwargs): self.raw_data = raw_data data_str = StringIO(raw_data) self.data = pd.read_csv(data_str, sep=";") + self.nice_data = format_table(self.data) metadata = load_data( endpoint="metadata", method="table", params=params, as_json=True ) assert isinstance(metadata, dict) # nosec assert_used self.metadata = metadata + +def format_table(data: pd.DataFrame, + ) -> pd.DataFrame: + """Format the raw data into a more readable table + + Args: + data (pd.DataFrame): A pandas dataframe created with get_data() + + Returns: + pd.DataFrame: Formatted dataframe that omits all CODE columns and gives + infromative columns names. + """ + time_name, = data["Zeit_Label"].unique() # Time label (usually Jahr) + time_values = data["Zeit"] + + merkmal_labels = data.filter(like="Merkmal_Label").columns + indep_names = [data[name].unique()[0] for name in merkmal_labels] # list of column names from Merkmal_Label + + auspraegung_labels = data.filter(like="Auspraegung_Label").columns + indep_values = [data[name] for name in auspraegung_labels] # list of data from Ausgepragung_Label + + dep_values = data.loc[:,auspraegung_labels[-1]:].iloc[:,1:] # get all columns after last Auspraegung column + dep_names = [" ".join(name.split('_')[1:]) + for name in dep_values.columns] # splits strings in column names for readability + + nice_dict = {time_name:time_values, + **dict(zip(indep_names, indep_values)), + **dict(zip(dep_names, dep_values.values.T))} + nice_data = pd.DataFrame(nice_dict) + return nice_data \ No newline at end of file From 3363b5e8202bbac9f2ad5aa276a495f4e0ee2ee6 Mon Sep 17 00:00:00 2001 From: Zosia Borowska Date: Wed, 29 Nov 2023 18:25:19 +0000 Subject: [PATCH 2/6] Adding comments --- src/pystatis/table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pystatis/table.py b/src/pystatis/table.py index ad0ea3b..7bbb1dc 100644 --- a/src/pystatis/table.py +++ b/src/pystatis/table.py @@ -58,9 +58,9 @@ def format_table(data: pd.DataFrame, Returns: pd.DataFrame: Formatted dataframe that omits all CODE columns and gives - infromative columns names. + informative columns names. """ - time_name, = data["Zeit_Label"].unique() # Time label (usually Jahr) + time_name, = data["Zeit_Label"].unique() time_values = data["Zeit"] merkmal_labels = data.filter(like="Merkmal_Label").columns From 59650f6ef13a24898df62a9f8e53d097b77ede50 Mon Sep 17 00:00:00 2001 From: Zosia Borowska Date: Mon, 15 Jan 2024 15:33:02 +0000 Subject: [PATCH 3/6] Applied suggested changes and run code formatting --- src/pystatis/table.py | 57 +++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/src/pystatis/table.py b/src/pystatis/table.py index 7bbb1dc..ed374e3 100644 --- a/src/pystatis/table.py +++ b/src/pystatis/table.py @@ -22,13 +22,14 @@ def __init__(self, name: str): self.data = pd.DataFrame() self.metadata: dict = {} - def get_data(self, area: str = "all", **kwargs): + def get_data(self, area: str = "all", prettify: bool = True, **kwargs): """Downloads raw data and metadata from GENESIS-Online. Additional keyword arguments are passed on to the GENESIS-Online GET request for tablefile. Args: area (str, optional): Area to search for the object in GENESIS-Online. Defaults to "all". + prettify (bool, optional): Reformats the table into a readable format. Defaults to True. """ params = {"name": self.name, "area": area, "format": "ffcsv"} @@ -41,7 +42,8 @@ def get_data(self, area: str = "all", **kwargs): self.raw_data = raw_data data_str = StringIO(raw_data) self.data = pd.read_csv(data_str, sep=";") - self.nice_data = format_table(self.data) + if prettify: + self.data = self.prettify_table(self.data) metadata = load_data( endpoint="metadata", method="table", params=params, as_json=True @@ -49,32 +51,33 @@ def get_data(self, area: str = "all", **kwargs): assert isinstance(metadata, dict) # nosec assert_used self.metadata = metadata -def format_table(data: pd.DataFrame, - ) -> pd.DataFrame: - """Format the raw data into a more readable table - - Args: - data (pd.DataFrame): A pandas dataframe created with get_data() - - Returns: - pd.DataFrame: Formatted dataframe that omits all CODE columns and gives - informative columns names. - """ - time_name, = data["Zeit_Label"].unique() - time_values = data["Zeit"] + @staticmethod + def prettify_table(data: pd.DataFrame) -> pd.DataFrame: + """Reformat the data into a more readable table + + Args: + data (pd.DataFrame): A pandas dataframe created from raw_data + + Returns: + pd.DataFrame: Formatted dataframe that omits all unnecessary Code columns + and includes informative columns names + """ + # Extracts time column with name from first element of Zeit_Label column + time = pd.DataFrame({data["Zeit_Label"].iloc[0]: data["Zeit"]}) - merkmal_labels = data.filter(like="Merkmal_Label").columns - indep_names = [data[name].unique()[0] for name in merkmal_labels] # list of column names from Merkmal_Label + # Extracts new column names from first values of the Merkmal_Label columns + # and assigns these to the relevant attribute columns (Auspraegung_Label) + attributes = data.filter(like="Auspraegung_Label") + attributes.columns = data.filter(like="Merkmal_Label").iloc[0].tolist() - auspraegung_labels = data.filter(like="Auspraegung_Label").columns - indep_values = [data[name] for name in auspraegung_labels] # list of data from Ausgepragung_Label + # Selects all columns containing the values + values = data.filter(like="__") - dep_values = data.loc[:,auspraegung_labels[-1]:].iloc[:,1:] # get all columns after last Auspraegung column - dep_names = [" ".join(name.split('_')[1:]) - for name in dep_values.columns] # splits strings in column names for readability + # Given a name like BEV036__Bevoelkerung_in_Hauptwohnsitzhaushalten__1000 + # extracts the readable label and omit both the code and the unit + values.columns = [ + " ".join(name.split("_")[1:-1]) for name in values.columns + ] - nice_dict = {time_name:time_values, - **dict(zip(indep_names, indep_values)), - **dict(zip(dep_names, dep_values.values.T))} - nice_data = pd.DataFrame(nice_dict) - return nice_data \ No newline at end of file + pretty_data = pd.concat([time, attributes, values], axis=1) + return pretty_data From 92f7b16ed79dd0e67655ed38ca2f184cf1b0c8bf Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Mon, 29 Jan 2024 15:41:59 +0100 Subject: [PATCH 4/6] add tests for Table --- src/pystatis/table.py | 7 ++++--- tests/test_config.py | 1 - tests/test_db.py | 2 -- tests/test_table.py | 46 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 6 deletions(-) create mode 100644 tests/test_table.py diff --git a/src/pystatis/table.py b/src/pystatis/table.py index ed374e3..6b644d3 100644 --- a/src/pystatis/table.py +++ b/src/pystatis/table.py @@ -39,9 +39,11 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs): endpoint="data", method="tablefile", params=params, as_json=False ) assert isinstance(raw_data, str) # nosec assert_used + self.raw_data = raw_data data_str = StringIO(raw_data) self.data = pd.read_csv(data_str, sep=";") + if prettify: self.data = self.prettify_table(self.data) @@ -49,6 +51,7 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs): endpoint="metadata", method="table", params=params, as_json=True ) assert isinstance(metadata, dict) # nosec assert_used + self.metadata = metadata @staticmethod @@ -75,9 +78,7 @@ def prettify_table(data: pd.DataFrame) -> pd.DataFrame: # Given a name like BEV036__Bevoelkerung_in_Hauptwohnsitzhaushalten__1000 # extracts the readable label and omit both the code and the unit - values.columns = [ - " ".join(name.split("_")[1:-1]) for name in values.columns - ] + values.columns = [name.split("__")[1] for name in values.columns] pretty_data = pd.concat([time, attributes, values], axis=1) return pretty_data diff --git a/tests/test_config.py b/tests/test_config.py index 138deb7..6c05b49 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,4 +1,3 @@ -import copy import os from configparser import ConfigParser from pathlib import Path diff --git a/tests/test_db.py b/tests/test_db.py index 6840704..1a0f96b 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -1,10 +1,8 @@ -import logging from configparser import ConfigParser import pytest from pystatis import config, db -from pystatis.exception import PystatisConfigError @pytest.fixture() diff --git a/tests/test_table.py b/tests/test_table.py new file mode 100644 index 0000000..2b5b467 --- /dev/null +++ b/tests/test_table.py @@ -0,0 +1,46 @@ +import pandas as pd +import pytest + +import pystatis + +EASY_TABLE = """Statistik_Code;Statistik_Label;Zeit_Code;Zeit_Label;Zeit;1_Merkmal_Code;1_Merkmal_Label;1_Auspraegung_Code;1_Auspraegung_Label; FLC006__Gebietsflaeche__qkm +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;08;Baden-Württemberg;35747,85 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;09;Bayern;70541,58 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;11;Berlin;891,12 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;12;Brandenburg;29654,38 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;04;Bremen;419,61 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;02;Hamburg;755,09 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;06;Hessen;21115,62 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;13;Mecklenburg-Vorpommern;23294,90 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;03;Niedersachsen;47709,90 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;05;Nordrhein-Westfalen;34112,72 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;07;Rheinland-Pfalz;19857,97 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;10;Saarland;2571,52 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;14;Sachsen;18449,86 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;15;Sachsen-Anhalt;20467,20 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;01;Schleswig-Holstein;15804,30 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;16;Thüringen;16202,37 +11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;;Insgesamt;357595,99""" + + +def test_get_data(mocker): + mocker.patch("pystatis.http_helper.load_data", return_value=EASY_TABLE) + table = pystatis.Table(name="11111-0001") + table.get_data(prettify=False) + assert table.data.shape == (17, 10) + assert isinstance(table.data, pd.DataFrame) + assert not table.data.empty + assert isinstance(table.raw_data, str) + assert table.raw_data != "" + + +def test_prettify(mocker): + mocker.patch("pystatis.http_helper.load_data", return_value=EASY_TABLE) + table = pystatis.Table(name="11111-0001") + table.get_data(prettify=True) + assert table.data.shape == (17, 3) + assert table.data.columns.to_list() == [ + "Stichtag", + "Bundesländer", + "Gebietsflaeche", + ] From 6a798db60ebc0b0ea9e8906294348048021ea88c Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Mon, 29 Jan 2024 15:50:32 +0100 Subject: [PATCH 5/6] fix mocker patches in test_table --- src/pystatis/table.py | 2 -- tests/test_table.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/pystatis/table.py b/src/pystatis/table.py index 6b644d3..2136d22 100644 --- a/src/pystatis/table.py +++ b/src/pystatis/table.py @@ -38,7 +38,6 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs): raw_data = load_data( endpoint="data", method="tablefile", params=params, as_json=False ) - assert isinstance(raw_data, str) # nosec assert_used self.raw_data = raw_data data_str = StringIO(raw_data) @@ -50,7 +49,6 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs): metadata = load_data( endpoint="metadata", method="table", params=params, as_json=True ) - assert isinstance(metadata, dict) # nosec assert_used self.metadata = metadata diff --git a/tests/test_table.py b/tests/test_table.py index 2b5b467..966c586 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -24,7 +24,7 @@ def test_get_data(mocker): - mocker.patch("pystatis.http_helper.load_data", return_value=EASY_TABLE) + mocker.patch("pystatis.table.load_data", return_value=EASY_TABLE) table = pystatis.Table(name="11111-0001") table.get_data(prettify=False) assert table.data.shape == (17, 10) @@ -35,7 +35,7 @@ def test_get_data(mocker): def test_prettify(mocker): - mocker.patch("pystatis.http_helper.load_data", return_value=EASY_TABLE) + mocker.patch("pystatis.table.load_data", return_value=EASY_TABLE) table = pystatis.Table(name="11111-0001") table.get_data(prettify=True) assert table.data.shape == (17, 3) From 8217717fd2a796736c2e790d5aa5f4b8001d78f3 Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Mon, 29 Jan 2024 16:24:28 +0100 Subject: [PATCH 6/6] implement monkeypatch to overwrite load_data for table tests --- src/pystatis/table.py | 2 ++ tests/test_table.py | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/pystatis/table.py b/src/pystatis/table.py index 2136d22..6b644d3 100644 --- a/src/pystatis/table.py +++ b/src/pystatis/table.py @@ -38,6 +38,7 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs): raw_data = load_data( endpoint="data", method="tablefile", params=params, as_json=False ) + assert isinstance(raw_data, str) # nosec assert_used self.raw_data = raw_data data_str = StringIO(raw_data) @@ -49,6 +50,7 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs): metadata = load_data( endpoint="metadata", method="table", params=params, as_json=True ) + assert isinstance(metadata, dict) # nosec assert_used self.metadata = metadata diff --git a/tests/test_table.py b/tests/test_table.py index 966c586..7c3f15f 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -23,8 +23,19 @@ 11111;Feststellung des Gebietsstands;STAG;Stichtag;31.12.2022;DLAND;Bundesländer;;Insgesamt;357595,99""" -def test_get_data(mocker): - mocker.patch("pystatis.table.load_data", return_value=EASY_TABLE) +def mocked_load_data(endpoint, method, params, as_json): + if endpoint == "data" and method == "tablefile": + return EASY_TABLE + elif endpoint == "metadata" and method == "table": + return {"metadata": "table"} + else: + raise NotImplementedError + + +def test_get_data(monkeypatch): + # patch pystatis.table.load_data with parameter endpoint="data" + # and method="tablefile" to return EASY_TABLE + monkeypatch.setattr(pystatis.table, "load_data", mocked_load_data) table = pystatis.Table(name="11111-0001") table.get_data(prettify=False) assert table.data.shape == (17, 10) @@ -34,8 +45,8 @@ def test_get_data(mocker): assert table.raw_data != "" -def test_prettify(mocker): - mocker.patch("pystatis.table.load_data", return_value=EASY_TABLE) +def test_prettify(monkeypatch): + monkeypatch.setattr(pystatis.table, "load_data", mocked_load_data) table = pystatis.Table(name="11111-0001") table.get_data(prettify=True) assert table.data.shape == (17, 3)