diff --git a/README.md b/README.md index 6030a13b..2b88af81 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,8 @@ python -m teehr.utils.install_spark_jars ``` Use Docker ```bash -$ docker build -t teehr:v0.4.5 . -$ docker run -it --rm --volume $HOME:$HOME -p 8888:8888 teehr:v0.4.5 jupyter lab --ip 0.0.0.0 $HOME +$ docker build -t teehr:v0.4.6 . +$ docker run -it --rm --volume $HOME:$HOME -p 8888:8888 teehr:v0.4.6 jupyter lab --ip 0.0.0.0 $HOME ``` ## Examples diff --git a/docs/sphinx/changelog/index.rst b/docs/sphinx/changelog/index.rst index c45dd0ba..d5f60bf9 100644 --- a/docs/sphinx/changelog/index.rst +++ b/docs/sphinx/changelog/index.rst @@ -2,6 +2,26 @@ Release Notes ============= +0.4.6 - 2024-12-17 +-------------------- + +Added +^^^^^ +* Adds `add_missing_columns` to the `_validate` method in the `BaseTable` class +to allow for adding missing columns to the schema. + - When upgrading from 0.4.4 or earlier, you may need to run the following to add + the missing columns to the secondary_timeseries if you have existing datasets: + ``` + sdf = ev.secondary_timeseries.to_sdf() + validated_sdf = ev.secondary_timeseries._validate(sdf, add_missing_columns=True) + ev.secondary_timeseries._write_spark_df(validated_sdf) + ``` + +Changed +^^^^^^^ +* None + + 0.4.5 - 2024-12-09 -------------------- diff --git a/docs/sphinx/getting_started/index.rst b/docs/sphinx/getting_started/index.rst index 0f300717..cf9df0ea 100644 --- a/docs/sphinx/getting_started/index.rst +++ b/docs/sphinx/getting_started/index.rst @@ -37,8 +37,8 @@ Or, if you do not want to install TEEHR in your own virtual environment, you can .. code-block:: bash - docker build -t teehr:v0.4.5 . - docker run -it --rm --volume $HOME:$HOME -p 8888:8888 teehr:v0.4.5 jupyter lab --ip 0.0.0.0 $HOME + docker build -t teehr:v0.4.6 . 
+ docker run -it --rm --volume $HOME:$HOME -p 8888:8888 teehr:v0.4.6 jupyter lab --ip 0.0.0.0 $HOME Project Objectives ------------------ diff --git a/pyproject.toml b/pyproject.toml index 294ea861..62b948fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "teehr" -version = "0.4.5" +version = "0.4.6" description = "Tools for Exploratory Evaluation in Hydrologic Research" authors = [ "RTI International", diff --git a/src/teehr/__init__.py b/src/teehr/__init__.py index 42f64f1d..9c339420 100644 --- a/src/teehr/__init__.py +++ b/src/teehr/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.4.5" +__version__ = "0.4.6" from teehr.evaluation.evaluation import Evaluation # noqa from teehr.models.metrics.metric_models import Metrics # noqa diff --git a/src/teehr/evaluation/tables/base_table.py b/src/teehr/evaluation/tables/base_table.py index c782df94..41d1dbc5 100644 --- a/src/teehr/evaluation/tables/base_table.py +++ b/src/teehr/evaluation/tables/base_table.py @@ -10,6 +10,7 @@ from teehr.utils.utils import to_path_or_s3path, path_to_spark from teehr.models.filters import FilterBaseModel import logging +from pyspark.sql.functions import lit, col logger = logging.getLogger(__name__) @@ -145,7 +146,12 @@ def _get_schema(self, type: str = "pyspark"): return self.schema_func() - def _validate(self, df: ps.DataFrame, strict: bool = True) -> ps.DataFrame: + def _validate( + self, + df: ps.DataFrame, + strict: bool = True, + add_missing_columns: bool = False + ) -> ps.DataFrame: """Validate a DataFrame against the table schema. Parameters @@ -156,13 +162,25 @@ def _validate(self, df: ps.DataFrame, strict: bool = True) -> ps.DataFrame: If True, any extra columns will be dropped before validation. If False, will be validated as-is. The default is True. + + Returns + ------- + validated_df : ps.DataFrame + The validated DataFrame. 
""" schema = self._get_schema() logger.info(f"Validating DataFrame with {schema.columns}.") + schema_cols = schema.columns.keys() + + # Add missing columns + if add_missing_columns: + for col_name in schema_cols: + if col_name not in df.columns: + df = df.withColumn(col_name, lit(None)) + if strict: - schema_cols = schema.columns.keys() df = df.select(*schema_cols) validated_df = schema.validate(df) diff --git a/version.txt b/version.txt index c8a5397f..c0a1ac19 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.4.5 \ No newline at end of file +0.4.6 \ No newline at end of file