diff --git a/CHANGELOG.md b/CHANGELOG.md index 97ac6db..264bb77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,10 @@ Here is a template for new release sections - ``` +## [Unreleased] +### Changed +- "None" values in possible FK column are overwritten by FK mapping + ## [0.21.0] - 2024-06-25 ### Changed - in case of bandwidth values, first value is used for process diff --git a/data_adapter/preprocessing.py b/data_adapter/preprocessing.py index d7b2ee2..7c318dd 100644 --- a/data_adapter/preprocessing.py +++ b/data_adapter/preprocessing.py @@ -261,10 +261,14 @@ def __unpack_bandwidths(self, df: pd.DataFrame) -> pd.DataFrame: Currently only supports to return the first argument of bandwidths Different bandwidth types are not supported yet - Example: - + Parameters + ---------- + df : pd.DataFrame + Data containing bandwidths - Returns: + Returns + ------- + pd.DataFrame Modified Dataframe """ @@ -442,11 +446,16 @@ def _get_foreign_keys(process: str, df: pd.DataFrame) -> dict[str, ForeignKey]: # Check if Fks are unique (cannot have different FKs per process/subprocess) fk_candidates = {} for fk_column in fk_column_candidates: - if len(df[fk_column].unique()) > 1: + column_data_without_none = df[fk_column][~df[fk_column].isnull()] + if len(column_data_without_none.unique()) > 1: continue # no candidate - fk = df[fk_column].iloc[0] + fk = column_data_without_none.iloc[0] if "." not in fk: continue # no candidate + if df[fk_column].isnull().sum() > 0: + logging.warning( + f"None values in column '{fk_column}' of process '{process}' will be overwritten by FK values." + ) fk_candidates[fk_column] = ForeignKey(*fk.split(".")) return fk_candidates