Skip to content

Commit

Permalink
Merge pull request #379 from GermanZero-de/select-ref-year
Browse files Browse the repository at this point in the history
New derived facts + code to select data frames by reference year (year_ref)
  • Loading branch information
bgrundmann authored Feb 2, 2024
2 parents 8f56369 + e0951ca commit b6684fc
Show file tree
Hide file tree
Showing 13 changed files with 12,999 additions and 12,379 deletions.
2 changes: 1 addition & 1 deletion data/production.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"public": "bec0551f1b84f8afff005733ab0186016000194a",
"public": "8f367e918857c3640752b950acd8541d726ecf71",
"proprietary": "d0445ba3e5501b8570c89cfd07677c91a7206e31"
}
Original file line number Diff line number Diff line change
Expand Up @@ -66,24 +66,24 @@ def gen_calculate_derived_facts(rows: ROWS):
row_num = 0
all_derived_facts: set[str] = set()
for data in rows:
if data["update 2022"] == "xF":
if data["Formula"] is None or data["Formula"] == "noch nicht existent":
if data["update 2022"] in ["xF"]:
if data["Updated?"] is None or data["Updated?"] == "noch nicht existent":
continue
label: str = data["label"] # type: ignore
all_derived_facts.add(label)

for data in rows:
row_num += 1
if data["update 2022"] == "xF":
if data["Formula"] is None or data["Formula"] == "noch nicht existent":
if data["update 2022"] in ["xF"]:
if data["Updated?"] is None or data["Updated?"] == "noch nicht existent":
continue
# raise Exception(f"Missing formula for {data['label']}")
label: str = data["label"] # type: ignore
formula: str = data["Formula"] # type: ignore
formula: str = data["Updated?"] # type: ignore
formula = FACT_REGEX.sub(
lambda m: replace_fact_name_by_fact_lookup(m.group(1)), formula
)
del data["Formula"]
del data["Updated?"]
del data["label"]
del data["update 2022"]
del data["value"]
Expand Down Expand Up @@ -127,7 +127,13 @@ def extract_new_facts(rows: ROWS):
with open("new_facts.csv", "w", encoding="utf-8") as fp:
writer = csv.writer(fp, lineterminator="\n")
for data in rows:
if data["update 2022"] in ["xNEW"] and data["value"] is not None:
if (
data["update 2022"]
in ["xF"] # ["x", "NEW", "xNEW", "ASS", "xF", "", None, "ggf"]
and data["value"] is not None
):
if data["Updated?"] != "done":
print(data["label"], data["Updated?"])
row = [data[c] for c in columns]
row = [d if type(d) != str else d.replace("\n", " ") for d in row]
writer.writerow(row)
Expand Down
553 changes: 525 additions & 28 deletions src/climatevision/generator/calculate_derived_facts.py

Large diffs are not rendered by default.

17 changes: 15 additions & 2 deletions src/climatevision/generator/diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ def __str__(self) -> str:
return f"at {self.path} expected {self.expected} got {self.actual}"


@dataclass(kw_only=True)
class FloatDiff(Diff):
    """A ``Diff`` between two numeric values.

    The string form additionally shows how far ``actual`` deviates from
    ``expected``, expressed as a percentage of ``expected``.
    """

    path: str

    def __str__(self) -> str:
        """Render the mismatch, appending the relative deviation in percent."""
        got: float = float(self.actual)  # type: ignore
        want: float = float(self.expected)  # type: ignore
        # One percent of the expected value; 0 acts as the "cannot divide"
        # marker when the expected value itself is 0.
        one_percent: float = want / 100.0 if want != 0 else 0
        if one_percent != 0:
            deviation: str = f"{(got - want) / one_percent:.2f}%"
        else:
            # No meaningful relative deviation can be computed against 0.
            deviation = "0%"
        return f"at {self.path} expected {self.expected} got {self.actual} ({deviation})"


def all_helper(path: str, actual: Any, expected: Any, *, rel: float) -> Iterator[Diff]:
if isinstance(actual, Mapping) and isinstance(expected, Mapping):
keys1: Any = frozenset(actual.keys()) # type: ignore
Expand All @@ -70,12 +83,12 @@ def all_helper(path: str, actual: Any, expected: Any, *, rel: float) -> Iterator
)
elif isinstance(actual, Number) and isinstance(expected, Number):
if not float_matches(actual=actual, expected=expected, rel=rel):
yield Diff(path=path, actual=actual, expected=expected)
yield FloatDiff(path=path, actual=actual, expected=expected)
elif hasattr(actual, "__float__") and hasattr(expected, "__float__"): # type: ignore
f = float(actual) # type: ignore
e = float(expected) # type: ignore
if not float_matches(actual=f, expected=e, rel=rel):
yield Diff(path=path, actual=f, expected=e) # type: ignore
yield FloatDiff(path=path, actual=f, expected=e) # type: ignore
elif actual != expected:
yield Diff(path=path, actual=actual, expected=expected) # type: ignore

Expand Down
156 changes: 130 additions & 26 deletions src/climatevision/generator/refdata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module refdata -- tools to read the reference data used by the generator.
"""

# pyright: strict

from dataclasses import dataclass
Expand Down Expand Up @@ -167,8 +168,8 @@ def add_to(d: dict[str, list[float]], ags: str, e: list[str]):
else:
d[ags] = [float(x) for x in e]

sums_by_sta = {}
sums_by_dis = {}
sums_by_sta: dict[str, list[float]] = {}
sums_by_dis: dict[str, list[float]] = {}
already_in_raw_data: set[str] = set()

for ags, row in df.rows():
Expand Down Expand Up @@ -453,6 +454,96 @@ def load(cls, name: str, datadir: str | None = None) -> "Version":
return cls(public=d["public"], proprietary=d["proprietary"])


def filename(year_ref: int, what: str) -> str:
    """Map a data set name to the name of the file holding it for ``year_ref``.

    Files are normally named after the reference year itself (e.g. ``2018``).
    Data sets listed in the override table below are stored under a different
    name, e.g. because only older data was available or because no update
    could be obtained and the file of an earlier year is reused.

    Returns the override registered for ``(year_ref, what)`` if there is one,
    otherwise ``str(year_ref)``.
    """
    overrides: dict[int, dict[str, str]] = {
        2018: {
            # This is a bit unfortunate: that file should have been named by
            # year as well.
            "ags": "master",
            "nat_organic_agri": "2016",
        },
        # Commented-out alternative mapping for 2021, kept for reference:
        # 2021: {
        #     "ags": "2021",
        #     "area": "2021",
        #     "area_kinds": "2021",
        #     "assumptions": "2021",
        #     "buildings": "2018",  # Building census is delayed
        #     "co2path": "2018",  # We can use this unchanged.
        #     "destatis": "2018",  # TODO: What about this? (public transport at district granularity)
        #     # Have to check for the above that we can use the traffic code to do the transplant
        #     "facts": "2018",  # TODO: Bene is late
        #     "flats": "2018",  # TODO: Building census is delayed
        #     "industry_facilites": "2018",  # TODO: Jan
        #     "nat_agri": "2021",
        #     "nat_energy": "2021",
        #     "nat_organic_agri": "2020",
        #     "nat_res_buildings": "2018",  # TODO: Building census is delayed
        #     "population": "2021",
        #     "renewable_energy": "2018",  # TODO: What about this?
        #     "traffic": "2018",  # TODO: We did write code to transplant this, must still check in the work
        #     "traffic_air": "2018",  # TODO: ? Can we use the transplant code for this as well?!
        #     "traffic_rail": "2018",  # TODO: ? Can we use the transplant code for this as well?!
        # },
        # For testing
        2021: {
            "ags": "master",
            "area": "2021",  # Checked Germany + Göttingen
            "area_kinds": "2018",
            "assumptions": "2018",
            "buildings": "2018",  # Building census is delayed
            "co2path": "2018",  # TODO: Will we get this?
            "destatis": "2018",  # TODO: What about this?
            "facts": "2018",  # TODO: Bene is late
            "flats": "2018",  # TODO: Building census is delayed
            "industry_facilites": "2018",  # TODO: Jan
            "nat_agri": "2018",
            "nat_energy": "2018",
            "nat_organic_agri": "2016",
            "nat_res_buildings": "2018",  # TODO: Building census is delayed
            "population": "2018",  # Checked Germany
            "renewable_energy": "2018",  # TODO: What about this?
            "traffic": "2018",  # TODO: We did write code to transplant this, must still check in the work
            "traffic_air": "2018",  # TODO: ?
            "traffic_rail": "2018",  # TODO: ?
        },
    }
    # Years without any override behave like an empty override table.
    per_year = overrides.get(year_ref, {})
    if what in per_year:
        return per_year[what]
    return str(year_ref)


def load_data_frame_ags(
    datadir: str,
    year_ref: int,
    what: str,
    set_nans_to_0_in_columns: list[str] | None = None,
) -> DataFrame[str]:
    """Load the data set ``what`` for the reference year ``year_ref``.

    The file to read is chosen by ``filename(year_ref, what)`` and the actual
    loading is delegated to ``DataFrame.load_ags``.

    Args:
        datadir: Directory handed through to ``DataFrame.load_ags``.
        year_ref: Reference year used to select the file.
        what: Name of the data set.
        set_nans_to_0_in_columns: Column names handed through to
            ``DataFrame.load_ags``; ``None`` (the default) means no columns.
    """
    # Use a fresh list per call instead of a shared mutable default argument,
    # so nothing downstream can leak state between calls.
    columns: list[str] = (
        [] if set_nans_to_0_in_columns is None else set_nans_to_0_in_columns
    )
    return DataFrame.load_ags(
        datadir,
        what,
        filename=filename(year_ref, what),
        set_nans_to_0_in_columns=columns,
    )


def load_data_frame(
    datadir: str,
    year_ref: int,
    what: str,
    key_column: str,
    key_from_raw: Callable[[str], KeyT],
    set_nans_to_0_in_columns: list[str] | None = None,
) -> DataFrame[KeyT]:
    """Load the data set ``what`` for the reference year ``year_ref``.

    The file to read is chosen by ``filename(year_ref, what)`` and the actual
    loading is delegated to ``DataFrame.load``.

    Args:
        datadir: Directory handed through to ``DataFrame.load``.
        year_ref: Reference year used to select the file.
        what: Name of the data set.
        key_column: Name of the key column, handed through to
            ``DataFrame.load``.
        key_from_raw: Converter from the raw string key to ``KeyT``, handed
            through to ``DataFrame.load``.
        set_nans_to_0_in_columns: Column names handed through to
            ``DataFrame.load``; ``None`` (the default) means no columns.
    """
    # Use a fresh list per call instead of a shared mutable default argument,
    # so nothing downstream can leak state between calls.
    columns: list[str] = (
        [] if set_nans_to_0_in_columns is None else set_nans_to_0_in_columns
    )
    return DataFrame.load(
        datadir,
        what,
        key_column,
        key_from_raw,
        filename=filename(year_ref, what),
        set_nans_to_0_in_columns=columns,
    )


@dataclass(kw_only=True)
class RefData:
"""This class gives you a single handle around all the reference data."""
Expand Down Expand Up @@ -638,7 +729,7 @@ def traffic(self, ags: str):
return Row(self._traffic, ags)

def industry_dehst(self, ags: str):
"""TODO Function to read CO2e for each ags from DEHST Table."""
"""Function to read CO2e for each ags from DEHST Table."""
return OptRow(self._industry_dehst, ags)

@classmethod
Expand All @@ -655,6 +746,8 @@ def load(
as we can't yet run the generator without the data.
"""
datadir = datadir_or_default(datadir)
year_ref: int = 2021

area_0_columns = (
[
"land_settlement",
Expand Down Expand Up @@ -687,36 +780,47 @@ def load(
population_0_columns = ["total"] if fix_missing_entries else []
d = cls(
ags_master=DataFrame.load_ags(datadir, "ags", filename="master"),
area=DataFrame.load_ags(
datadir, "area", set_nans_to_0_in_columns=area_0_columns
area=load_data_frame_ags(
datadir, year_ref, "area", set_nans_to_0_in_columns=area_0_columns
),
area_kinds=DataFrame.load_ags(datadir, "area_kinds"),
assumptions=DataFrame.load(
datadir, "assumptions", key_column="label", key_from_raw=lambda k: k
area_kinds=load_data_frame_ags(datadir, year_ref, "area_kinds"),
assumptions=load_data_frame(
datadir,
year_ref,
"assumptions",
key_column="label",
key_from_raw=lambda k: k,
),
buildings=DataFrame.load_ags(datadir, "buildings"),
co2path=DataFrame.load(
datadir, "co2path", key_column="year", key_from_raw=int
buildings=load_data_frame_ags(datadir, year_ref, "buildings"),
co2path=load_data_frame(
datadir, year_ref, "co2path", key_column="year", key_from_raw=int
),
destatis=DataFrame.load_ags(datadir, "destatis"),
facts=DataFrame.load(
datadir, "facts", key_column="label", key_from_raw=lambda k: k
destatis=load_data_frame_ags(datadir, year_ref, "destatis"),
facts=load_data_frame(
datadir,
year_ref,
"facts",
key_column="label",
key_from_raw=lambda k: k,
),
flats=DataFrame.load_ags(
datadir, "flats", set_nans_to_0_in_columns=flats_0_columns
flats=load_data_frame_ags(
datadir, year_ref, "flats", set_nans_to_0_in_columns=flats_0_columns
),
nat_agri=DataFrame.load_ags(datadir, "nat_agri"),
nat_organic_agri=DataFrame.load_ags(
datadir, "nat_organic_agri", filename="2016"
nat_agri=load_data_frame_ags(datadir, year_ref, "nat_agri"),
nat_organic_agri=load_data_frame_ags(datadir, year_ref, "nat_organic_agri"),
nat_energy=load_data_frame_ags(datadir, year_ref, "nat_energy"),
nat_res_buildings=load_data_frame_ags(
datadir, year_ref, "nat_res_buildings"
),
nat_energy=DataFrame.load_ags(datadir, "nat_energy"),
nat_res_buildings=DataFrame.load_ags(datadir, "nat_res_buildings"),
population=DataFrame.load_ags(
datadir, "population", set_nans_to_0_in_columns=population_0_columns
population=load_data_frame_ags(
datadir,
year_ref,
"population",
set_nans_to_0_in_columns=population_0_columns,
),
renewable_energy=DataFrame.load_ags(datadir, "renewable_energy"),
traffic=DataFrame.load_ags(datadir, "traffic"),
industry_dehst=DataFrame.load_ags(datadir, "industry_facilites"),
renewable_energy=load_data_frame_ags(datadir, year_ref, "renewable_energy"),
traffic=load_data_frame_ags(datadir, year_ref, "traffic"),
industry_dehst=load_data_frame_ags(datadir, year_ref, "industry_facilites"),
fix_missing_entries=fix_missing_entries,
)
from . import calculate_derived_facts
Expand Down
Loading

0 comments on commit b6684fc

Please sign in to comment.