From c24d1cd6a9590684ca05572d6638fac9e939b761 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 28 Oct 2024 19:42:00 +0100 Subject: [PATCH 01/22] wip: add config --- .gitignore | 1 + README.md | 19 +++++ scystream/sdk/config/__init__.py | 0 scystream/sdk/config/config_loader.py | 103 ++++++++++++++++++++++++++ scystream/sdk/core.py | 2 + scystream/sdk/scheduler.py | 4 +- setup.py | 4 +- tests/example_config.yaml | 31 ++++++++ tests/test_core.py | 2 + 9 files changed, 163 insertions(+), 3 deletions(-) create mode 100644 scystream/sdk/config/__init__.py create mode 100644 scystream/sdk/config/config_loader.py create mode 100644 tests/example_config.yaml diff --git a/.gitignore b/.gitignore index 0e61186..36baf00 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__/ dist/ build/ venv/ +.venv/ diff --git a/README.md b/README.md index dbfff7e..f236535 100644 --- a/README.md +++ b/README.md @@ -35,3 +35,22 @@ if __name__ == "__main__": main() ``` + +### Development + +1. Create a venv + +```bash +python3 -m venv .venv +``` + +2. Install the package within the venv + +> [!INFO] +> This will also install all the install_requirements from the setup.py + +```bash +pip install -e .[dev] +``` + +3. Develop! diff --git a/scystream/sdk/config/__init__.py b/scystream/sdk/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scystream/sdk/config/config_loader.py b/scystream/sdk/config/config_loader.py new file mode 100644 index 0000000..215dbab --- /dev/null +++ b/scystream/sdk/config/config_loader.py @@ -0,0 +1,103 @@ +import yaml +from typing import Optional, Dict, Literal, Any, Callable +from pydantic import BaseModel, StrictStr, validator, Field +import os + +""" +This file contains the schema definition, the read function and validation +for the config file. 
+""" + +STRING_TYPE = "string" +INT_TYPE = "int" +FLOAT_TYPE = "float" +BOOL_TYPE = "bool" +LIST_TYPE = "list" +SPARK_TABLE_TYPE = "spark_table" + +CONFIG_FILE_DEFAULT_NAME = "cbc.yaml" + +DataTypes = Literal[STRING_TYPE, INT_TYPE, FLOAT_TYPE, + BOOL_TYPE, LIST_TYPE, SPARK_TABLE_TYPE] + +VALIDATORS: Dict[str, Callable[[Any], bool]] = { + "string": lambda x: isinstance(x, str), + "int": lambda x: isinstance(x, (int)), + "number": lambda x: isinstance(x, (float)), + "bool": lambda x: isinstance(x, (bool)), + "list": lambda x: isinstance(x, (list)), + # spark_table must be of type str + "spark_table": lambda x: isinstance(x, (str)) +} + + +class InputOutputDefinitions(BaseModel): + type: DataTypes + description: Optional[StrictStr] = None + item_type: Optional[DataTypes] = Field( + None, description="Type of items in the list") + table_name: Optional[StrictStr] = Field( + None, description="Name of the spark_table,\ + required if type is spark_table") + example: Optional[DataTypes] = Field( + None, description="Example for the Input/Output" + ) + + """ + If the type is spark_table, table_name must also be set + """ + @validator("table_name", always=True) + def validate_table_name(cls, v, values): + set_type = values.get("type") + if set_type == "spark_table": + if not v: + raise ValueError( + "table_name must be set when type is 'spark_table'") + return v + + """ + Check if the example corresponds with the inputs type + """ + @validator("example") + def validate_example_type(cls, v, values): + expected_type = values.get("type") + + if expected_type in VALIDATORS: + if not VALIDATORS[expected_type](v): + raise ValueError(f"Example must be of type \ + '{expected_type}' when type is '{expected_type}'") + + return v + + +class Entrypoint(BaseModel): + description: StrictStr + inputs: Dict[StrictStr, InputOutputDefinitions] + outputs: Dict[StrictStr, InputOutputDefinitions] + + +class ComputeBlock(BaseModel): + name: StrictStr + description: StrictStr + author: StrictStr + 
entrypoints: Dict[StrictStr, Entrypoint] + + @validator("entrypoints") + def check_entrypoints(cls, v): + if not v: + raise ValueError("At least one entrypoint must be defined.") + return v + + +def load_config(config_path: str = CONFIG_FILE_DEFAULT_NAME) -> ComputeBlock: + """ + Loads a YAML configuration file for workflow unit definitions. + """ + + root_dir = os.path.dirname(os.path.abspath(__file__)) + full_path = os.path.join(root_dir, "..", config_path) + + with open(full_path, "r") as file: + config = yaml.safe_load(file) + + return ComputeBlock(**config) diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index 47efac8..3965d1c 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -2,6 +2,7 @@ _registered_functions = {} + def entrypoint(func): """Decorator to mark a function as an entrypoint.""" @functools.wraps(func) @@ -10,6 +11,7 @@ def wrapper(*args, **kwargs): _registered_functions[func.__name__] = func return wrapper + def get_registered_functions(): """Returns a dictionary of registered entrypoint functions.""" return _registered_functions diff --git a/scystream/sdk/scheduler.py b/scystream/sdk/scheduler.py index 8e91d18..f403c83 100644 --- a/scystream/sdk/scheduler.py +++ b/scystream/sdk/scheduler.py @@ -1,5 +1,6 @@ from .core import get_registered_functions + class Scheduler: @staticmethod def list_entrypoints(): @@ -7,7 +8,7 @@ def list_entrypoints(): functions = get_registered_functions() for name in functions: print(f"'{name}' is available as an entrypoint.") - + @staticmethod def execute_function(name, *args, **kwargs): functions = get_registered_functions() @@ -15,4 +16,3 @@ def execute_function(name, *args, **kwargs): return functions[name](*args, **kwargs) else: raise Exception(f"No entrypoint found with the name: {name}") - diff --git a/setup.py b/setup.py index e66963e..be1c613 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,9 @@ author_email="evers@time.rwth-aachen.de", license="MIT", packages=find_packages(), - 
install_requires=[], + install_requires=[ + "pydantic>=2.9.2" + ], classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", diff --git a/tests/example_config.yaml b/tests/example_config.yaml new file mode 100644 index 0000000..b46befd --- /dev/null +++ b/tests/example_config.yaml @@ -0,0 +1,31 @@ +workflow_unit: + name: "The first Web-Crawler" + description: "This is a web crawler, it crawls text..." + author: "John Doe" + + entrypoints: + crawl: + description: "Crawl text from specified URLs" + inputs: + url_list: + type: "list" + item_type: "string" + description: "List of URLs to crawl. Can be defined by the user." + example: ["https://example.com", "http://one.com"] + outputs: + text_data: + type: "spark_table" + description: "Crawled text data in a spark table" + name: "text_data_spark" + + analyze_url: + description: "Analyzes if data is crawlable" + inputs: + url-list: + type: "list" + item_type: "string" + description: "List of URLS to check" + example: ["https://example.com"] + outputs: + type: "bool" + description: "True if all urls can be crawled" diff --git a/tests/test_core.py b/tests/test_core.py index b9b5030..10fb7ac 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,6 +1,7 @@ import unittest from scystream_sdk.core import entrypoint, get_registered_functions + class TestEntrypoint(unittest.TestCase): def test_entrypoint_registration(self): @entrypoint @@ -11,5 +12,6 @@ def dummy_function(): self.assertIn("dummy_function", registered) self.assertEqual(registered["dummy_function"](), "Hello") + if __name__ == "__main__": unittest.main() From a77f4beac5d5bad363f26f3d51e890d61c63f625 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Tue, 29 Oct 2024 17:46:55 +0100 Subject: [PATCH 02/22] wip: add yaml dependency --- scystream/sdk/config/config_loader.py | 19 +++++++++++++++---- setup.py | 3 ++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/scystream/sdk/config/config_loader.py 
b/scystream/sdk/config/config_loader.py index 215dbab..4146f6a 100644 --- a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -1,6 +1,6 @@ import yaml from typing import Optional, Dict, Literal, Any, Callable -from pydantic import BaseModel, StrictStr, validator, Field +from pydantic import BaseModel, StrictStr, field_validator, Field import os """ @@ -8,6 +8,10 @@ for the config file. """ +PROJECT_ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + +print(PROJECT_ROOT_DIR) + STRING_TYPE = "string" INT_TYPE = "int" FLOAT_TYPE = "float" @@ -46,7 +50,7 @@ class InputOutputDefinitions(BaseModel): """ If the type is spark_table, table_name must also be set """ - @validator("table_name", always=True) + @field_validator("table_name") def validate_table_name(cls, v, values): set_type = values.get("type") if set_type == "spark_table": @@ -58,7 +62,7 @@ def validate_table_name(cls, v, values): """ Check if the example corresponds with the inputs type """ - @validator("example") + @field_validator("example") def validate_example_type(cls, v, values): expected_type = values.get("type") @@ -82,13 +86,20 @@ class ComputeBlock(BaseModel): author: StrictStr entrypoints: Dict[StrictStr, Entrypoint] - @validator("entrypoints") + @field_validator("entrypoints") def check_entrypoints(cls, v): if not v: raise ValueError("At least one entrypoint must be defined.") return v +def validate_config(config_path: str = CONFIG_FILE_DEFAULT_NAME) -> bool: + """ + Reads the passed Compute Block YAML definition. + Returns True if the validation using pydantic was successfull + """ + + def load_config(config_path: str = CONFIG_FILE_DEFAULT_NAME) -> ComputeBlock: """ Loads a YAML configuration file for workflow unit definitions. 
diff --git a/setup.py b/setup.py index be1c613..1a176d3 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,8 @@ license="MIT", packages=find_packages(), install_requires=[ - "pydantic>=2.9.2" + "pydantic>=2.9.2", + "PyYAML>=6.0.2" ], classifiers=[ "Programming Language :: Python :: 3", From d2598559bcc7296c958513207e4464063594eeb0 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 30 Oct 2024 23:05:29 +0100 Subject: [PATCH 03/22] feat: add validation and loading of config file --- README.md | 81 ++++++++++- scystream/sdk/config/config_loader.py | 137 +++++------------- scystream/sdk/config/models.py | 59 ++++++++ setup.py | 2 +- tests/example_config.yaml | 31 ---- tests/test_config.py | 42 ++++++ tests/test_config_files/invalid_datatype.yaml | 29 ++++ .../missing_entrypoints.yaml | 5 + .../test_config_files/missing_table_name.yaml | 16 ++ tests/test_config_files/not_a_yaml.json | 3 + tests/test_config_files/valid_config.yaml | 29 ++++ tests/test_core.py | 2 +- 12 files changed, 302 insertions(+), 134 deletions(-) create mode 100644 scystream/sdk/config/models.py delete mode 100644 tests/example_config.yaml create mode 100644 tests/test_config.py create mode 100644 tests/test_config_files/invalid_datatype.yaml create mode 100644 tests/test_config_files/missing_entrypoints.yaml create mode 100644 tests/test_config_files/missing_table_name.yaml create mode 100644 tests/test_config_files/not_a_yaml.json create mode 100644 tests/test_config_files/valid_config.yaml diff --git a/README.md b/README.md index f236535..fd18e0c 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,75 @@ if __name__ == "__main__": ``` -### Development +### Compute Block Config Files +We expect every repository which will be used within the scystream application +to contain a `Compute Block Config File`, the `cbc.yaml`, within the root directory. + +This yaml-file describes the compute block itself. +It shows the entrypoints, their inputs and outputs. 
+ +This is an example `cbc.yaml`: + +```yaml +name: "The first Web-Crawler" +description: "This is a web crawler, it crawls text..." +author: "John Doe" + +entrypoints: + crawl: + description: "Crawl text from specified URLs" + inputs: + url_list: + type: "list" + item_type: "string" + description: "List of URLs to crawl. Can be defined by the user." + outputs: + text_data: + type: "spark_table" + description: "Crawled text data in a spark table" + table_name: "text_data_spark" + + analyze_url: + description: "Analyzes if data is crawlable" + inputs: + url-list: + type: "list" + item_type: "string" + description: "List of URLS to check" + outputs: + was_sucess: + type: "bool" + description: "True if all urls can be crawled" +``` + +To read and validate such a config file u can proceed as follows: + +```python3 +from scystream.sdk.config.config_loader import load_config + +def main(): + load_config() + +if __name__ == "__main__": + main() +``` + +If you want the file to have another name than `cbc.yaml` or you want the file to be +somewhere else than the root directory you can define that using the parameters the +`load_config` function takes. + +Example: + +```python3 +load_config(config_file_name="test.yaml", config_path="configs/") +``` + +the `config_path` is the path relative to your root directory + + +## Development of the SDK + +### Installation 1. Create a venv @@ -50,7 +118,16 @@ python3 -m venv .venv > This will also install all the install_requirements from the setup.py ```bash -pip install -e .[dev] +pip install -e . ``` 3. Develop! 
+ +### Tests + +To run all the tests run the following command: + +```bash +python3 -m unittest discover -s tests +``` + diff --git a/scystream/sdk/config/config_loader.py b/scystream/sdk/config/config_loader.py index 4146f6a..1826439 100644 --- a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -1,114 +1,53 @@ import yaml -from typing import Optional, Dict, Literal, Any, Callable -from pydantic import BaseModel, StrictStr, field_validator, Field -import os - -""" -This file contains the schema definition, the read function and validation -for the config file. -""" - -PROJECT_ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) - -print(PROJECT_ROOT_DIR) - -STRING_TYPE = "string" -INT_TYPE = "int" -FLOAT_TYPE = "float" -BOOL_TYPE = "bool" -LIST_TYPE = "list" -SPARK_TABLE_TYPE = "spark_table" +from typing import Union +from pydantic import ValidationError +from pathlib import Path +from .models import ComputeBlock CONFIG_FILE_DEFAULT_NAME = "cbc.yaml" -DataTypes = Literal[STRING_TYPE, INT_TYPE, FLOAT_TYPE, - BOOL_TYPE, LIST_TYPE, SPARK_TABLE_TYPE] - -VALIDATORS: Dict[str, Callable[[Any], bool]] = { - "string": lambda x: isinstance(x, str), - "int": lambda x: isinstance(x, (int)), - "number": lambda x: isinstance(x, (float)), - "bool": lambda x: isinstance(x, (bool)), - "list": lambda x: isinstance(x, (list)), - # spark_table must be of type str - "spark_table": lambda x: isinstance(x, (str)) -} - - -class InputOutputDefinitions(BaseModel): - type: DataTypes - description: Optional[StrictStr] = None - item_type: Optional[DataTypes] = Field( - None, description="Type of items in the list") - table_name: Optional[StrictStr] = Field( - None, description="Name of the spark_table,\ - required if type is spark_table") - example: Optional[DataTypes] = Field( - None, description="Example for the Input/Output" - ) - - """ - If the type is spark_table, table_name must also be set - """ - @field_validator("table_name") - def 
validate_table_name(cls, v, values): - set_type = values.get("type") - if set_type == "spark_table": - if not v: - raise ValueError( - "table_name must be set when type is 'spark_table'") - return v +def load_config( + config_file_name: str = CONFIG_FILE_DEFAULT_NAME, + config_path: Union[str, Path] = None +) -> ComputeBlock: """ - Check if the example corresponds with the inputs type + Returns and Validates the Compute Block YAML definition. + Returns a ComputeBlock instance if the validation is successfull """ - @field_validator("example") - def validate_example_type(cls, v, values): - expected_type = values.get("type") + try: + file = _find_and_load_config(config_file_name, config_path) + block = ComputeBlock(**file) + return block + except ValidationError as e: + raise ValueError(f"Configuration file validation error: {e}") - if expected_type in VALIDATORS: - if not VALIDATORS[expected_type](v): - raise ValueError(f"Example must be of type \ - '{expected_type}' when type is '{expected_type}'") - return v - - -class Entrypoint(BaseModel): - description: StrictStr - inputs: Dict[StrictStr, InputOutputDefinitions] - outputs: Dict[StrictStr, InputOutputDefinitions] - - -class ComputeBlock(BaseModel): - name: StrictStr - description: StrictStr - author: StrictStr - entrypoints: Dict[StrictStr, Entrypoint] - - @field_validator("entrypoints") - def check_entrypoints(cls, v): - if not v: - raise ValueError("At least one entrypoint must be defined.") - return v - - -def validate_config(config_path: str = CONFIG_FILE_DEFAULT_NAME) -> bool: +def _find_and_load_config( + config_file_name: str, + config_path: Union[str, Path] = None +): """ - Reads the passed Compute Block YAML definition. 
- Returns True if the validation using pydantic was successfull + Loads the compute block config YAML from the projects root directory + returns the loaded file """ + base_path = Path.cwd() + if config_path: + base_path /= Path(config_path) + full_path = base_path / config_file_name -def load_config(config_path: str = CONFIG_FILE_DEFAULT_NAME) -> ComputeBlock: - """ - Loads a YAML configuration file for workflow unit definitions. - """ - - root_dir = os.path.dirname(os.path.abspath(__file__)) - full_path = os.path.join(root_dir, "..", config_path) + if not full_path.is_file(): + raise FileNotFoundError(f"Configuration file '{ + full_path}' not found.") - with open(full_path, "r") as file: - config = yaml.safe_load(file) + try: + with full_path.open("r") as file: + config_data = yaml.safe_load(file) + except FileNotFoundError: + raise FileNotFoundError(f"Configuration file '{ + full_path}' not found.'") + except yaml.YAMLError as e: + raise ValueError(f"Error parsing YAML file: {e}") - return ComputeBlock(**config) + return config_data diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py new file mode 100644 index 0000000..ab0d53b --- /dev/null +++ b/scystream/sdk/config/models.py @@ -0,0 +1,59 @@ +from typing import Optional, Dict, Literal +from pydantic import BaseModel, StrictStr, field_validator, Field + +""" +This file contains the schema definition for the config file. 
+""" + +STRING_TYPE = "string" +INT_TYPE = "int" +FLOAT_TYPE = "float" +BOOL_TYPE = "bool" +LIST_TYPE = "list" +SPARK_TABLE_TYPE = "spark_table" + +DataTypes = Literal[STRING_TYPE, INT_TYPE, FLOAT_TYPE, + BOOL_TYPE, LIST_TYPE, SPARK_TABLE_TYPE] + + +class InputOutputDefinitions(BaseModel): + type: DataTypes + description: Optional[StrictStr] = None + item_type: Optional[DataTypes] = Field( + None, description="Type of items in the list") + table_name: Optional[StrictStr] = Field( + None, description="Name of the spark_table,\ + required if type is spark_table", validate_default=True) + # TODO: Add an optional example field, this could be very helpful for the + # frontend + + """ + If the type is spark_table, table_name must also be set + """ + @field_validator("table_name") + def validate_table_name(cls, v, info): + set_type = info.data.get("type") + if set_type == "spark_table": + if not v: + raise ValueError( + "table_name must be set when type is 'spark_table'") + return v + + +class Entrypoint(BaseModel): + description: StrictStr + inputs: Dict[StrictStr, InputOutputDefinitions] + outputs: Dict[StrictStr, InputOutputDefinitions] + + +class ComputeBlock(BaseModel): + name: StrictStr + description: StrictStr + author: StrictStr + entrypoints: Dict[StrictStr, Entrypoint] + + @field_validator("entrypoints") + def check_entrypoints(cls, v): + if not v: + raise ValueError("At least one entrypoint must be defined.") + return v diff --git a/setup.py b/setup.py index 1a176d3..76ae135 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name="scystream-sdk", + name="scystream_sdk", version="0.1.4", description="The official SDK for developing scystream compute blocks", long_description=open("README.md", "r", encoding="utf-8").read(), diff --git a/tests/example_config.yaml b/tests/example_config.yaml deleted file mode 100644 index b46befd..0000000 --- a/tests/example_config.yaml +++ /dev/null @@ -1,31 +0,0 @@ 
-workflow_unit: - name: "The first Web-Crawler" - description: "This is a web crawler, it crawls text..." - author: "John Doe" - - entrypoints: - crawl: - description: "Crawl text from specified URLs" - inputs: - url_list: - type: "list" - item_type: "string" - description: "List of URLs to crawl. Can be defined by the user." - example: ["https://example.com", "http://one.com"] - outputs: - text_data: - type: "spark_table" - description: "Crawled text data in a spark table" - name: "text_data_spark" - - analyze_url: - description: "Analyzes if data is crawlable" - inputs: - url-list: - type: "list" - item_type: "string" - description: "List of URLS to check" - example: ["https://example.com"] - outputs: - type: "bool" - description: "True if all urls can be crawled" diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..e5b2d2d --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,42 @@ +import unittest +from scystream.sdk.config.config_loader import load_config, ComputeBlock + + +class TestComputeBlockValidation(unittest.TestCase): + TEST_CONFIG_FOLDER = "tests/test_config_files" + + def test_valid_config(self): + try: + compute_block = load_config( + "valid_config.yaml", config_path=self.TEST_CONFIG_FOLDER) + self.assertIsInstance(compute_block, ComputeBlock) + except Exception: + self.fail("ComputeBlock raised an Exception unexpectedly!") + + def test_missing_entrypoints(self): + with self.assertRaises(ValueError): + load_config("missing_entrypoints.yaml", + config_path=self.TEST_CONFIG_FOLDER) + + def test_missing_table_name_for_spark_table(self): + with self.assertRaises(ValueError): + load_config("missing_table_name.yaml", + config_path=self.TEST_CONFIG_FOLDER) + + def test_invalid_datatypes(self): + with self.assertRaises(ValueError): + load_config("invalid_datatype.yaml", + config_path=self.TEST_CONFIG_FOLDER) + + def test_not_a_yaml(self): + with self.assertRaises(ValueError): + load_config("not_a_yaml.json", + 
config_path=self.TEST_CONFIG_FOLDER) + + def test_file_not_found(self): + with self.assertRaises(FileNotFoundError): + load_config("test.yaml") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_config_files/invalid_datatype.yaml b/tests/test_config_files/invalid_datatype.yaml new file mode 100644 index 0000000..84727af --- /dev/null +++ b/tests/test_config_files/invalid_datatype.yaml @@ -0,0 +1,29 @@ +name: "The first Web-Crawler" +description: "This is a web crawler, it crawls text..." +author: "John Doe" + +entrypoints: + crawl: + description: "Crawl text from specified URLs" + inputs: + url_list: + type: "invalid_type" + item_type: "string" + description: "List of URLs to crawl. Can be defined by the user." + outputs: + text_data: + type: "spark_table" + description: "Crawled text data in a spark table" + name: "text_data_spark" + + analyze_url: + description: "Analyzes if data is crawlable" + inputs: + url-list: + type: "list" + item_type: "string" + description: "List of URLS to check" + outputs: + was_sucess: + type: "bool" + description: "True if all urls can be crawled" diff --git a/tests/test_config_files/missing_entrypoints.yaml b/tests/test_config_files/missing_entrypoints.yaml new file mode 100644 index 0000000..32cf852 --- /dev/null +++ b/tests/test_config_files/missing_entrypoints.yaml @@ -0,0 +1,5 @@ +name: "The first Web-Crawler" +description: "This is a web crawler, it crawls text..." +author: "John Doe" +# Missing `entrypoints` field, which should cause validation to fail. + diff --git a/tests/test_config_files/missing_table_name.yaml b/tests/test_config_files/missing_table_name.yaml new file mode 100644 index 0000000..a22fa16 --- /dev/null +++ b/tests/test_config_files/missing_table_name.yaml @@ -0,0 +1,16 @@ +name: "The first Web-Crawler" +description: "This is a web crawler, it crawls text..." 
+author: "John Doe" +entrypoints: + crawl: + description: "Crawl text from specified URLs" + inputs: + url_list: + type: "list" + item_type: "string" + description: "List of URLs to crawl. Can be defined by the user." + outputs: + text_data: + type: "spark_table" + description: "Crawled text data in a spark table" + # Missing `table_name`, which should cause validation to fail. diff --git a/tests/test_config_files/not_a_yaml.json b/tests/test_config_files/not_a_yaml.json new file mode 100644 index 0000000..21da3b2 --- /dev/null +++ b/tests/test_config_files/not_a_yaml.json @@ -0,0 +1,3 @@ +{ + "key": "value" +} diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml new file mode 100644 index 0000000..0e8c994 --- /dev/null +++ b/tests/test_config_files/valid_config.yaml @@ -0,0 +1,29 @@ +name: "The first Web-Crawler" +description: "This is a web crawler, it crawls text..." +author: "John Doe" + +entrypoints: + crawl: + description: "Crawl text from specified URLs" + inputs: + url_list: + type: "list" + item_type: "string" + description: "List of URLs to crawl. Can be defined by the user." 
+ outputs: + text_data: + type: "spark_table" + description: "Crawled text data in a spark table" + table_name: "text_data_spark" + + analyze_url: + description: "Analyzes if data is crawlable" + inputs: + url-list: + type: "list" + item_type: "string" + description: "List of URLS to check" + outputs: + was_sucess: + type: "bool" + description: "True if all urls can be crawled" diff --git a/tests/test_core.py b/tests/test_core.py index 10fb7ac..775ae75 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,5 +1,5 @@ import unittest -from scystream_sdk.core import entrypoint, get_registered_functions +from scystream.sdk.core import entrypoint, get_registered_functions class TestEntrypoint(unittest.TestCase): From 46d2f9e315232e38560df79ce26352cb861f58e8 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 30 Oct 2024 23:15:07 +0100 Subject: [PATCH 04/22] style: fix linting --- scystream/sdk/config/config_loader.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scystream/sdk/config/config_loader.py b/scystream/sdk/config/config_loader.py index 1826439..f68e565 100644 --- a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -38,15 +38,17 @@ def _find_and_load_config( full_path = base_path / config_file_name if not full_path.is_file(): - raise FileNotFoundError(f"Configuration file '{ - full_path}' not found.") + raise FileNotFoundError( + f"Configuration file '{full_path}' not found." 
+ ) try: with full_path.open("r") as file: config_data = yaml.safe_load(file) except FileNotFoundError: - raise FileNotFoundError(f"Configuration file '{ - full_path}' not found.'") + raise FileNotFoundError( + f"Configuration file '{full_path}' not found.'" + ) except yaml.YAMLError as e: raise ValueError(f"Error parsing YAML file: {e}") From 1fbbd39c63f76fe483de93bb88fc3092c66c08d3 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 30 Oct 2024 23:21:46 +0100 Subject: [PATCH 05/22] style: remove line --- tests/test_core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index acd8055..775ae75 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -2,7 +2,6 @@ from scystream.sdk.core import entrypoint, get_registered_functions - class TestEntrypoint(unittest.TestCase): def test_entrypoint_registration(self): @entrypoint From f6e715058a8e79f40ed70fae65268fd5523a6732 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Wed, 30 Oct 2024 23:25:05 +0100 Subject: [PATCH 06/22] docs: fix note --- README.md | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fd18e0c..2e1ef5b 100644 --- a/README.md +++ b/README.md @@ -114,7 +114,7 @@ python3 -m venv .venv 2. 
Install the package within the venv -> [!INFO] +> [!NOTE] > This will also install all the install_requirements from the setup.py ```bash diff --git a/setup.py b/setup.py index 76ae135..1a176d3 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name="scystream_sdk", + name="scystream-sdk", version="0.1.4", description="The official SDK for developing scystream compute blocks", long_description=open("README.md", "r", encoding="utf-8").read(), From 73c88433b03c16f7bc7170c119c37c2956b9006d Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 4 Nov 2024 14:40:45 +0100 Subject: [PATCH 07/22] feat: add optional to inputs validation --- scystream/sdk/config/models.py | 63 ++++++++++++------- tests/test_config.py | 10 +++ tests/test_config_files/invalid_datatype.yaml | 2 - .../optional_invalid_default.yaml | 31 +++++++++ .../optional_no_default.yaml | 30 +++++++++ tests/test_config_files/valid_config.yaml | 9 +-- 6 files changed, 117 insertions(+), 28 deletions(-) create mode 100644 tests/test_config_files/optional_invalid_default.yaml create mode 100644 tests/test_config_files/optional_no_default.yaml diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index ab0d53b..c83400e 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Literal +from typing import Optional, Dict, Literal, Any, Callable from pydantic import BaseModel, StrictStr, field_validator, Field """ @@ -15,35 +15,53 @@ DataTypes = Literal[STRING_TYPE, INT_TYPE, FLOAT_TYPE, BOOL_TYPE, LIST_TYPE, SPARK_TABLE_TYPE] +VALIDATORS: Dict[DataTypes, Callable[[Any], bool]] = { + STRING_TYPE: lambda v: isinstance(v, str), + INT_TYPE: lambda v: isinstance(v, str), + FLOAT_TYPE: lambda v: isinstance(v, float), + BOOL_TYPE: lambda v: isinstance(v, bool), + LIST_TYPE: lambda v: isinstance(v, list), + # SPARK_TABLE_TYPE should be the name of the spark table (str) + 
SPARK_TABLE_TYPE: lambda v: isinstance(v, str) +} -class InputOutputDefinitions(BaseModel): + +class BaseIOModel(BaseModel): type: DataTypes description: Optional[StrictStr] = None - item_type: Optional[DataTypes] = Field( - None, description="Type of items in the list") - table_name: Optional[StrictStr] = Field( - None, description="Name of the spark_table,\ - required if type is spark_table", validate_default=True) - # TODO: Add an optional example field, this could be very helpful for the - # frontend - - """ - If the type is spark_table, table_name must also be set - """ - @field_validator("table_name") - def validate_table_name(cls, v, info): - set_type = info.data.get("type") - if set_type == "spark_table": - if not v: - raise ValueError( - "table_name must be set when type is 'spark_table'") + + +class InputDefinitions(BaseIOModel): + optional: bool + default_value: Optional[Any] = Field(default=None, validate_default=True) + + @field_validator("default_value") + def validate_default_value(cls, v, info): + optional = info.data.get("optional") + expected_type = info.data.get("type") + + if not optional: + # If field is not optional, default_value does not have to be set + return v + + if v is None: + raise ValueError("default_value must be set when optional is True") + + validator = VALIDATORS.get(expected_type) + if validator and not validator(v): + raise TypeError(f"default_value must be of type {expected_type}") + return v +class OutputDefinitions(BaseIOModel): + pass + + class Entrypoint(BaseModel): description: StrictStr - inputs: Dict[StrictStr, InputOutputDefinitions] - outputs: Dict[StrictStr, InputOutputDefinitions] + inputs: Dict[StrictStr, InputDefinitions] + outputs: Dict[StrictStr, OutputDefinitions] class ComputeBlock(BaseModel): @@ -51,6 +69,7 @@ class ComputeBlock(BaseModel): description: StrictStr author: StrictStr entrypoints: Dict[StrictStr, Entrypoint] + docker_image: Optional[StrictStr] @field_validator("entrypoints") def 
check_entrypoints(cls, v): diff --git a/tests/test_config.py b/tests/test_config.py index e5b2d2d..212cb04 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -37,6 +37,16 @@ def test_file_not_found(self): with self.assertRaises(FileNotFoundError): load_config("test.yaml") + def test_optional_invalid_default(self): + with self.assertRaises(TypeError): + load_config("optional_invalid_default.yaml", + config_path=self.TEST_CONFIG_FOLDER) + + def test_optional_no_default(self): + with self.assertRaises(ValueError): + load_config("optional_no_default.yaml", + config_path=self.TEST_CONFIG_FOLDER) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_config_files/invalid_datatype.yaml b/tests/test_config_files/invalid_datatype.yaml index 84727af..3083e9b 100644 --- a/tests/test_config_files/invalid_datatype.yaml +++ b/tests/test_config_files/invalid_datatype.yaml @@ -8,7 +8,6 @@ entrypoints: inputs: url_list: type: "invalid_type" - item_type: "string" description: "List of URLs to crawl. Can be defined by the user." outputs: text_data: @@ -21,7 +20,6 @@ entrypoints: inputs: url-list: type: "list" - item_type: "string" description: "List of URLS to check" outputs: was_sucess: diff --git a/tests/test_config_files/optional_invalid_default.yaml b/tests/test_config_files/optional_invalid_default.yaml new file mode 100644 index 0000000..7d9c9cf --- /dev/null +++ b/tests/test_config_files/optional_invalid_default.yaml @@ -0,0 +1,31 @@ +name: "The first Web-Crawler" +description: "This is a web crawler, it crawls text..." +author: "John Doe" +docker_image: "https://ghcr.io/sycstream" + +entrypoints: + crawl: + description: "Crawl text from specified URLs" + inputs: + url_list: + type: "list" + description: "List of URLs to crawl. Can be defined by the user." 
+ optional: True + default_value: "a string" + outputs: + text_data: + type: "spark_table" + description: "Crawled text data in a spark table" + table_name: "text_data_spark" + optional: False + + analyze_url: + description: "Analyzes if data is crawlable" + inputs: + url-list: + type: "list" + description: "List of URLS to check" + outputs: + was_sucess: + type: "bool" + description: "True if all urls can be crawled" diff --git a/tests/test_config_files/optional_no_default.yaml b/tests/test_config_files/optional_no_default.yaml new file mode 100644 index 0000000..98c9503 --- /dev/null +++ b/tests/test_config_files/optional_no_default.yaml @@ -0,0 +1,30 @@ +name: "The first Web-Crawler" +description: "This is a web crawler, it crawls text..." +author: "John Doe" +docker_image: "https://ghcr.io/sycstream" + +entrypoints: + crawl: + description: "Crawl text from specified URLs" + inputs: + url_list: + type: "list" + description: "List of URLs to crawl. Can be defined by the user." + optional: True + outputs: + text_data: + type: "spark_table" + description: "Crawled text data in a spark table" + table_name: "text_data_spark" + optional: False + + analyze_url: + description: "Analyzes if data is crawlable" + inputs: + url-list: + type: "list" + description: "List of URLS to check" + outputs: + was_sucess: + type: "bool" + description: "True if all urls can be crawled" diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml index 0e8c994..12bae4d 100644 --- a/tests/test_config_files/valid_config.yaml +++ b/tests/test_config_files/valid_config.yaml @@ -1,6 +1,7 @@ name: "The first Web-Crawler" description: "This is a web crawler, it crawls text..." author: "John Doe" +docker_image: "https://ghcr.io/sycstream" entrypoints: crawl: @@ -8,21 +9,21 @@ entrypoints: inputs: url_list: type: "list" - item_type: "string" description: "List of URLs to crawl. Can be defined by the user." 
+ optional: True + default_value: ["test", "1234"] outputs: text_data: type: "spark_table" description: "Crawled text data in a spark table" - table_name: "text_data_spark" analyze_url: description: "Analyzes if data is crawlable" inputs: - url-list: + url_list: type: "list" - item_type: "string" description: "List of URLS to check" + optional: False outputs: was_sucess: type: "bool" From f0868239391b31d01a15de92c32c805c7db57f51 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 4 Nov 2024 18:02:30 +0100 Subject: [PATCH 08/22] feat: remodel config structure --- README.md | 67 ++++--- scystream/sdk/config/models.py | 164 +++++++++++++----- tests/test_config.py | 15 +- tests/test_config_files/invalid_datatype.yaml | 59 +++++-- .../missing_entrypoints.yaml | 6 +- .../missing_required_fields.yaml | 51 ++++++ .../test_config_files/missing_table_name.yaml | 16 -- .../optional_default_not_set.yaml | 54 ++++++ .../optional_env_key_not_set.yaml | 49 ++++++ .../optional_invalid_default.yaml | 31 ---- .../optional_no_default.yaml | 30 ---- tests/test_config_files/valid_config.yaml | 64 ++++--- 12 files changed, 414 insertions(+), 192 deletions(-) create mode 100644 tests/test_config_files/missing_required_fields.yaml delete mode 100644 tests/test_config_files/missing_table_name.yaml create mode 100644 tests/test_config_files/optional_default_not_set.yaml create mode 100644 tests/test_config_files/optional_env_key_not_set.yaml delete mode 100644 tests/test_config_files/optional_invalid_default.yaml delete mode 100644 tests/test_config_files/optional_no_default.yaml diff --git a/README.md b/README.md index 2e1ef5b..70138bf 100644 --- a/README.md +++ b/README.md @@ -46,35 +46,58 @@ It shows the entrypoints, their inputs and outputs. This is an example `cbc.yaml`: ```yaml -name: "The first Web-Crawler" -description: "This is a web crawler, it crawls text..." +name: "NLP toolbox" +description: "Contains NLP algorithms..." 
author: "John Doe" +docker_image: "https://ghcr.io/nlp-toolbox" entrypoints: - crawl: - description: "Crawl text from specified URLs" + topic_modelling: + description: "Run topic modelling" inputs: - url_list: - type: "list" - item_type: "string" - description: "List of URLs to crawl. Can be defined by the user." - outputs: + language: + description: "The language to use" + type: "env" + env_key: "LANG" + optional: True + default_value: "de" text_data: - type: "spark_table" - description: "Crawled text data in a spark table" - table_name: "text_data_spark" - - analyze_url: - description: "Analyzes if data is crawlable" + description: "Text file. Can be uploaded by the user." + type: "file" + env_key: "TXT_SRC_PATH" + optional: False + db_data: + description: "Information in a database" + type: "db_table" + env_key: "DATA_TABLE_NAME" + table_name: "nlp_information" + optional: True + outputs: + topic_model: + type: "file" + description: "Topic model file" + env_key: "OUTPUT_PATH_TOPIC_MODEL" + file_path: "outputs/output.pkl" + run_durations: + type: "db_table" + description: "Table that contains the run durations per day." 
+ env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + + analyze_runtime: + description: "Analyze the runtimes" inputs: - url-list: - type: "list" - item_type: "string" - description: "List of URLS to check" + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + optional: True outputs: - was_sucess: - type: "bool" - description: "True if all urls can be crawled" + csv_output: + type: "file" + description: "A csv containing statistical information" + env_key: "CSV_OUTPUT_PATH" + file_path: "outputs/statistics.csv" ``` To read and validate such a config file u can proceed as follows: diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index c83400e..b786a25 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -1,67 +1,149 @@ -from typing import Optional, Dict, Literal, Any, Callable +from typing import Optional, Dict, Literal, Union from pydantic import BaseModel, StrictStr, field_validator, Field """ This file contains the schema definition for the config file. 
""" -STRING_TYPE = "string" -INT_TYPE = "int" -FLOAT_TYPE = "float" -BOOL_TYPE = "bool" -LIST_TYPE = "list" -SPARK_TABLE_TYPE = "spark_table" - -DataTypes = Literal[STRING_TYPE, INT_TYPE, FLOAT_TYPE, - BOOL_TYPE, LIST_TYPE, SPARK_TABLE_TYPE] - -VALIDATORS: Dict[DataTypes, Callable[[Any], bool]] = { - STRING_TYPE: lambda v: isinstance(v, str), - INT_TYPE: lambda v: isinstance(v, str), - FLOAT_TYPE: lambda v: isinstance(v, float), - BOOL_TYPE: lambda v: isinstance(v, bool), - LIST_TYPE: lambda v: isinstance(v, list), - # SPARK_TABLE_TYPE should be the name of the spark table (str) - SPARK_TABLE_TYPE: lambda v: isinstance(v, str) -} - - -class BaseIOModel(BaseModel): - type: DataTypes + +class BaseInputModel(BaseModel): description: Optional[StrictStr] = None + optional: bool = False + env_key: Optional[StrictStr] = Field( + default=None, validate_default=True, + description="The env_key describes the key of the environment variable\ + which can be set to override the default value" + ) + + @field_validator("env_key") + def validate_env_key(cls, v, info): + """a + If optional == False, the env_key must be set! As the user must have + the possibility to define the variable. + """ + optional = info.data.get("optional") + + if not optional and v is None: + raise ValueError("If optional is False, the env_key must be set.") + + return v + +class EnvInput(BaseInputModel): + """ + The EnvInput type describes the input of an ENV variable + It should describe one env-variable the compute unit accesses. -class InputDefinitions(BaseIOModel): - optional: bool - default_value: Optional[Any] = Field(default=None, validate_default=True) + The default_value can be overridden, if the env_key is set. + """ + type: Literal["env"] + default_value: Optional[StrictStr] = Field( + default=None, validate_default=True) @field_validator("default_value") def validate_default_value(cls, v, info): + """ + If optional == True, default_value must be set! 
+ """ optional = info.data.get("optional") - expected_type = info.data.get("type") - if not optional: - # If field is not optional, default_value does not have to be set - return v + if optional and v is None: + raise ValueError("If optional is True, default_value must be set.") - if v is None: - raise ValueError("default_value must be set when optional is True") + return v - validator = VALIDATORS.get(expected_type) - if validator and not validator(v): - raise TypeError(f"default_value must be of type {expected_type}") - return v +class FileInput(BaseInputModel): + """ + The FileInput type describes the input for files. + The file_path describes the path to a file on the S3 bucket, + it can be overriden by using the env_key, if set. + + This makes sense, if a user should be able to manually upload + files the compute units wants to access. It does not know the + path to the file while writing the defintion. + """ + type: Literal["file"] + file_path: Optional[StrictStr] = Field( + default=None, validate_default=True, + description="The default value of the FileInput type.\ + Can be overriden." + ) + + @field_validator("file_path") + def validate_file_path(cls, v, info): + """ + If optional == True, file_path must be set! + """ + optional = info.data.get("optional") + + if optional and v is None: + raise ValueError("If optional is True, file_path must be set.") + + +class BaseOutputModel(BaseModel): + description: StrictStr + env_key: StrictStr = Field( + description="The env_key describes the key of the environment variable\ + which can be set to override the default value" + ) + + +class FileOutput(BaseOutputModel): + """ + The FileOutput type describes the output of a file. + The file_path describes the path to a file on the S3 bucket. + """ + type: Literal["file"] + file_path: StrictStr = Field( + desscription="The path to the file on the S3 bucket." 
+ ) + + +class DBTableOutput(BaseOutputModel): + """ + The DBTableOutput type defines a table that provides output data. + The table_name refers to the output table name. + """ + type: Literal["db_table"] + table_name: StrictStr = Field( + description="The name of the output database table." + ) + + +class DBTableInput(BaseInputModel): + """ + The DBTableInput type defines a table that provides input data. + The table_name can be overriden by using the env_key, if set. + + This makes sense, if a previous compute units output db_table should + be used as an input. This table_name is then not known while writing the + definition. + """ + type: Literal["db_table"] + table_name: Optional[StrictStr] = Field( + default=None, validate_default=True, + description="The default value of the DBTableInput type.\ + Can be overriden." + ) + + @field_validator("table_name") + def validate_table_name(cls, v, info): + """ + If optional == True, table_name must be set! + """ + + optional = info.data.get("optional") -class OutputDefinitions(BaseIOModel): - pass + if optional and v is None: + raise ValueError("If optional is True, table_name must be set.") class Entrypoint(BaseModel): description: StrictStr - inputs: Dict[StrictStr, InputDefinitions] - outputs: Dict[StrictStr, OutputDefinitions] + inputs: Dict[StrictStr, Union[EnvInput, FileInput, DBTableInput]] + outputs: Dict[StrictStr, Union[FileInput, DBTableOutput]] class ComputeBlock(BaseModel): diff --git a/tests/test_config.py b/tests/test_config.py index 212cb04..3a426f2 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -18,11 +18,6 @@ def test_missing_entrypoints(self): load_config("missing_entrypoints.yaml", config_path=self.TEST_CONFIG_FOLDER) - def test_missing_table_name_for_spark_table(self): - with self.assertRaises(ValueError): - load_config("missing_table_name.yaml", - config_path=self.TEST_CONFIG_FOLDER) - def test_invalid_datatypes(self): with self.assertRaises(ValueError): 
load_config("invalid_datatype.yaml", @@ -37,14 +32,14 @@ def test_file_not_found(self): with self.assertRaises(FileNotFoundError): load_config("test.yaml") - def test_optional_invalid_default(self): - with self.assertRaises(TypeError): - load_config("optional_invalid_default.yaml", + def test_optional_default_not_set(self): + with self.assertRaises(ValueError): + load_config("optional_default_not_set.yaml", config_path=self.TEST_CONFIG_FOLDER) - def test_optional_no_default(self): + def test_optional_env_key_not_set(self): with self.assertRaises(ValueError): - load_config("optional_no_default.yaml", + load_config("optional_env_key_not_set.yaml", config_path=self.TEST_CONFIG_FOLDER) diff --git a/tests/test_config_files/invalid_datatype.yaml b/tests/test_config_files/invalid_datatype.yaml index 3083e9b..8df5b73 100644 --- a/tests/test_config_files/invalid_datatype.yaml +++ b/tests/test_config_files/invalid_datatype.yaml @@ -1,27 +1,50 @@ -name: "The first Web-Crawler" -description: "This is a web crawler, it crawls text..." +name: "NLP toolbox" +description: "Contains NLP algorithms..." author: "John Doe" +docker_image: "https://ghcr.io/nlp-toolbox" entrypoints: - crawl: - description: "Crawl text from specified URLs" + topic_modelling: + description: "Run topic modelling" inputs: - url_list: + language: + description: "The language to use" type: "invalid_type" - description: "List of URLs to crawl. Can be defined by the user." - outputs: + env_key: "LANG" + optional: True + default_value: "de" text_data: - type: "spark_table" - description: "Crawled text data in a spark table" - name: "text_data_spark" + description: "Text file. Can be uploaded by the user." 
+ type: "file" + env_key: "TXT_SRC_PATH" + optional: False + db_data: + description: "Information in a database" + type: "db_table" + env_key: "DATA_TABLE_NAME" + optional: True + outputs: + topic_model: + type: "file" + description: "Topic model file" + env_key: "OUTPUT_PATH_TOPIC_MODEL" + # Missing file_path here will trigger validation error + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" - analyze_url: - description: "Analyzes if data is crawlable" + analyze_runtime: + description: "Analyze the runtimes" inputs: - url-list: - type: "list" - description: "List of URLS to check" + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + optional: True outputs: - was_sucess: - type: "bool" - description: "True if all urls can be crawled" + csv_output: + type: "file" + description: "A csv containing statistical information" + env_key: "CSV_OUTPUT_PATH" + file_path: "outputs/statistics.csv" diff --git a/tests/test_config_files/missing_entrypoints.yaml b/tests/test_config_files/missing_entrypoints.yaml index 32cf852..e02ed4f 100644 --- a/tests/test_config_files/missing_entrypoints.yaml +++ b/tests/test_config_files/missing_entrypoints.yaml @@ -1,5 +1,5 @@ -name: "The first Web-Crawler" -description: "This is a web crawler, it crawls text..." +name: "NLP toolbox" +description: "Contains NLP algorithms..." author: "John Doe" -# Missing `entrypoints` field, which should cause validation to fail. +docker_image: "https://ghcr.io/nlp-toolbox" diff --git a/tests/test_config_files/missing_required_fields.yaml b/tests/test_config_files/missing_required_fields.yaml new file mode 100644 index 0000000..5187f95 --- /dev/null +++ b/tests/test_config_files/missing_required_fields.yaml @@ -0,0 +1,51 @@ +name: "NLP toolbox" +description: "Contains NLP algorithms..." 
+author: "John Doe" +docker_image: "https://ghcr.io/nlp-toolbox" + +entrypoints: + topic_modelling: + description: "Run topic modelling" + inputs: + language: + description: "The language to use" + type: "env" + env_key: "LANG" + optional: True + default_value: "de" + text_data: + description: "Text file. Can be uploaded by the user." + type: "file" + env_key: "TXT_SRC_PATH" + optional: False + db_data: + description: "Information in a database" + type: "db_table" + env_key: "DATA_TABLE_NAME" + optional: True + outputs: + topic_model: + type: "file" + description: "Topic model file" + env_key: "OUTPUT_PATH_TOPIC_MODEL" + # Missing file_path here will trigger validation error + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + + analyze_runtime: + description: "Analyze the runtimes" + inputs: + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + optional: True + outputs: + csv_output: + type: "file" + description: "A csv containing statistical information" + env_key: "CSV_OUTPUT_PATH" + file_path: "outputs/statistics.csv" + diff --git a/tests/test_config_files/missing_table_name.yaml b/tests/test_config_files/missing_table_name.yaml deleted file mode 100644 index a22fa16..0000000 --- a/tests/test_config_files/missing_table_name.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: "The first Web-Crawler" -description: "This is a web crawler, it crawls text..." -author: "John Doe" -entrypoints: - crawl: - description: "Crawl text from specified URLs" - inputs: - url_list: - type: "list" - item_type: "string" - description: "List of URLs to crawl. Can be defined by the user." - outputs: - text_data: - type: "spark_table" - description: "Crawled text data in a spark table" - # Missing `table_name`, which should cause validation to fail. 
diff --git a/tests/test_config_files/optional_default_not_set.yaml b/tests/test_config_files/optional_default_not_set.yaml new file mode 100644 index 0000000..83ac369 --- /dev/null +++ b/tests/test_config_files/optional_default_not_set.yaml @@ -0,0 +1,54 @@ +name: "NLP toolbox" +description: "Contains NLP algorithms..." +author: "John Doe" +docker_image: "https://ghcr.io/nlp-toolbox" + +entrypoints: + topic_modelling: + description: "Run topic modelling" + inputs: + language: + description: "The language to use" + type: "env" + env_key: "LANG" + optional: True + # Missing default_value here will trigger validation error + text_data: + description: "Text file. Can be uploaded by the user." + type: "file" + env_key: "TXT_SRC_PATH" + optional: False + db_data: + description: "Information in a database" + type: "db_table" + env_key: "DATA_TABLE_NAME" + table_name: "nlp_information" + optional: True + outputs: + topic_model: + type: "file" + description: "Topic model file" + env_key: "OUTPUT_PATH_TOPIC_MODEL" + file_path: "outputs/output.pkl" + run_durations: + description: "A table which contains the run durations per day." + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + + analyze_runtime: + description: "Analyze the runtimes" + inputs: + run_durations: + description: "A table which contains the run durations per day." + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + optional: True + outputs: + csv_output: + type: "file" + description: "A csv containing statistical information" + env_key: "CSV_OUTPUT_PATH" + file_path: "outputs/statistics.csv" + diff --git a/tests/test_config_files/optional_env_key_not_set.yaml b/tests/test_config_files/optional_env_key_not_set.yaml new file mode 100644 index 0000000..b6e2040 --- /dev/null +++ b/tests/test_config_files/optional_env_key_not_set.yaml @@ -0,0 +1,49 @@ +name: "NLP toolbox" +description: "Contains NLP algorithms..." 
+author: "John Doe" +docker_image: "https://ghcr.io/nlp-toolbox" + +entrypoints: + topic_modelling: + description: "Run topic modelling" + inputs: + language: + description: "The language to use" + type: "env" + optional: False + text_data: + description: "Text file. Can be uploaded by the user." + type: "file" + env_key: "TXT_SRC_PATH" + optional: False + db_data: + description: "Information in a database" + type: "db_table" + env_key: "DATA_TABLE_NAME" + optional: True + outputs: + topic_model: + type: "file" + description: "Topic model file" + env_key: "OUTPUT_PATH_TOPIC_MODEL" + file_path: "test/test.pkj" + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + + analyze_runtime: + description: "Analyze the runtimes" + inputs: + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + optional: True + outputs: + csv_output: + type: "file" + description: "A csv containing statistical information" + env_key: "CSV_OUTPUT_PATH" + file_path: "outputs/statistics.csv" + diff --git a/tests/test_config_files/optional_invalid_default.yaml b/tests/test_config_files/optional_invalid_default.yaml deleted file mode 100644 index 7d9c9cf..0000000 --- a/tests/test_config_files/optional_invalid_default.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: "The first Web-Crawler" -description: "This is a web crawler, it crawls text..." -author: "John Doe" -docker_image: "https://ghcr.io/sycstream" - -entrypoints: - crawl: - description: "Crawl text from specified URLs" - inputs: - url_list: - type: "list" - description: "List of URLs to crawl. Can be defined by the user." 
- optional: True - default_value: "a string" - outputs: - text_data: - type: "spark_table" - description: "Crawled text data in a spark table" - table_name: "text_data_spark" - optional: False - - analyze_url: - description: "Analyzes if data is crawlable" - inputs: - url-list: - type: "list" - description: "List of URLS to check" - outputs: - was_sucess: - type: "bool" - description: "True if all urls can be crawled" diff --git a/tests/test_config_files/optional_no_default.yaml b/tests/test_config_files/optional_no_default.yaml deleted file mode 100644 index 98c9503..0000000 --- a/tests/test_config_files/optional_no_default.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: "The first Web-Crawler" -description: "This is a web crawler, it crawls text..." -author: "John Doe" -docker_image: "https://ghcr.io/sycstream" - -entrypoints: - crawl: - description: "Crawl text from specified URLs" - inputs: - url_list: - type: "list" - description: "List of URLs to crawl. Can be defined by the user." - optional: True - outputs: - text_data: - type: "spark_table" - description: "Crawled text data in a spark table" - table_name: "text_data_spark" - optional: False - - analyze_url: - description: "Analyzes if data is crawlable" - inputs: - url-list: - type: "list" - description: "List of URLS to check" - outputs: - was_sucess: - type: "bool" - description: "True if all urls can be crawled" diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml index 12bae4d..5a15dd0 100644 --- a/tests/test_config_files/valid_config.yaml +++ b/tests/test_config_files/valid_config.yaml @@ -1,30 +1,52 @@ -name: "The first Web-Crawler" -description: "This is a web crawler, it crawls text..." +name: "NLP toolbox" +description: "Contains NLP algorithms..." 
author: "John Doe" -docker_image: "https://ghcr.io/sycstream" +docker_image: "https://ghcr.io/nlp-toolbox" entrypoints: - crawl: - description: "Crawl text from specified URLs" + topic_modelling: + description: "Run topic modelling" inputs: - url_list: - type: "list" - description: "List of URLs to crawl. Can be defined by the user." + language: + description: "The language to use" + type: "env" + env_key: "LANG" optional: True - default_value: ["test", "1234"] - outputs: + default_value: "de" text_data: - type: "spark_table" - description: "Crawled text data in a spark table" + description: "Text file. Can be uploaded by the user." + type: "file" + env_key: "TXT_SRC_PATH" + optional: False + db_data: + description: "Information in a database" + type: "db_table" + env_key: "DATA_TABLE_NAME" + table_name: "nlp_information" + optional: True + outputs: + topic_model: + type: "file" + description: "Topic model file" + env_key: "OUTPUT_PATH_TOPIC_MODEL" + file_path: "outputs/output.pkl" + run_durations: + type: "db_table" + description: "Table that contains the run durations per day." 
+ env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" - analyze_url: - description: "Analyzes if data is crawlable" + analyze_runtime: + description: "Analyze the runtimes" inputs: - url_list: - type: "list" - description: "List of URLS to check" - optional: False + run_durations: + type: "db_table" + env_key: "RUN_DURATIONS_TABLE_NAME" + table_name: "run_durations_nlp" + optional: True outputs: - was_sucess: - type: "bool" - description: "True if all urls can be crawled" + csv_output: + type: "file" + description: "A csv containing statistical information" + env_key: "CSV_OUTPUT_PATH" + file_path: "outputs/statistics.csv" From 8d8574af97a766984e11770e5cfd494270e08a34 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 4 Nov 2024 20:30:13 +0100 Subject: [PATCH 09/22] feat: ensure env_key is set --- scystream/sdk/config/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index b786a25..31b1428 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -9,15 +9,15 @@ class BaseInputModel(BaseModel): description: Optional[StrictStr] = None optional: bool = False - env_key: Optional[StrictStr] = Field( - default=None, validate_default=True, + env_key: StrictStr = Field( + validate_default=True, description="The env_key describes the key of the environment variable\ which can be set to override the default value" ) @field_validator("env_key") def validate_env_key(cls, v, info): - """a + """ If optional == False, the env_key must be set! As the user must have the possibility to define the variable. 
""" From 6b56af76107fd1f6af5aebf0bb01d70d3f0f8ccb Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Thu, 7 Nov 2024 15:22:55 +0100 Subject: [PATCH 10/22] feat: allow multiple envs for input configuration --- scystream/sdk/config/models.py | 146 ++---------------- tests/test_config.py | 10 -- tests/test_config_files/invalid_datatype.yaml | 39 ++--- tests/test_config_files/missing_fields.yaml | 42 +++++ .../missing_required_fields.yaml | 51 ------ .../optional_default_not_set.yaml | 54 ------- .../optional_env_key_not_set.yaml | 49 ------ tests/test_config_files/valid_config.yaml | 35 ++--- 8 files changed, 85 insertions(+), 341 deletions(-) create mode 100644 tests/test_config_files/missing_fields.yaml delete mode 100644 tests/test_config_files/missing_required_fields.yaml delete mode 100644 tests/test_config_files/optional_default_not_set.yaml delete mode 100644 tests/test_config_files/optional_env_key_not_set.yaml diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index 31b1428..f57b07f 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -1,149 +1,29 @@ -from typing import Optional, Dict, Literal, Union +from typing import Optional, Dict, Literal from pydantic import BaseModel, StrictStr, field_validator, Field +FILE_TYPE_IDENTIFIER = "file" +DB_TABLE_TYPE_IDENTIFIER = "db_table" + """ This file contains the schema definition for the config file. """ -class BaseInputModel(BaseModel): +class InputOutputModel(BaseModel): + type: Literal[FILE_TYPE_IDENTIFIER, DB_TABLE_TYPE_IDENTIFIER] description: Optional[StrictStr] = None - optional: bool = False - env_key: StrictStr = Field( - validate_default=True, - description="The env_key describes the key of the environment variable\ - which can be set to override the default value" - ) - - @field_validator("env_key") - def validate_env_key(cls, v, info): - """ - If optional == False, the env_key must be set! As the user must have - the possibility to define the variable. 
- """ - optional = info.data.get("optional") - - if not optional and v is None: - raise ValueError("If optional is False, the env_key must be set.") - - return v - - -class EnvInput(BaseInputModel): - """ - The EnvInput type describes the input of an ENV variable - It should describe one env-variable the compute unit accesses. - - The default_value can be overridden, if the env_key is set. - """ - type: Literal["env"] - default_value: Optional[StrictStr] = Field( - default=None, validate_default=True) - - @field_validator("default_value") - def validate_default_value(cls, v, info): - """ - If optional == True, default_value must be set! - """ - optional = info.data.get("optional") - - if optional and v is None: - raise ValueError("If optional is True, default_value must be set.") - - return v - - -class FileInput(BaseInputModel): - """ - The FileInput type describes the input for files. - The file_path describes the path to a file on the S3 bucket, - it can be overriden by using the env_key, if set. - - This makes sense, if a user should be able to manually upload - files the compute units wants to access. It does not know the - path to the file while writing the defintion. - """ - type: Literal["file"] - file_path: Optional[StrictStr] = Field( - default=None, validate_default=True, - description="The default value of the FileInput type.\ - Can be overriden." - ) - - @field_validator("file_path") - def validate_file_path(cls, v, info): - """ - If optional == True, file_path must be set! - """ - - optional = info.data.get("optional") - - if optional and v is None: - raise ValueError("If optional is True, file_path must be set.") - - -class BaseOutputModel(BaseModel): - description: StrictStr - env_key: StrictStr = Field( - description="The env_key describes the key of the environment variable\ - which can be set to override the default value" - ) - - -class FileOutput(BaseOutputModel): - """ - The FileOutput type describes the output of a file. 
- The file_path describes the path to a file on the S3 bucket. - """ - type: Literal["file"] - file_path: StrictStr = Field( - desscription="The path to the file on the S3 bucket." - ) - - -class DBTableOutput(BaseOutputModel): - """ - The DBTableOutput type defines a table that provides output data. - The table_name refers to the output table name. - """ - type: Literal["db_table"] - table_name: StrictStr = Field( - description="The name of the output database table." + config: Optional[Dict[StrictStr, Optional[StrictStr]]] = Field( + default=None, + description="The configuration for the input values\ + (file_path, table_name, etc.)" ) -class DBTableInput(BaseInputModel): - """ - The DBTableInput type defines a table that provides input data. - The table_name can be overriden by using the env_key, if set. - - This makes sense, if a previous compute units output db_table should - be used as an input. This table_name is then not known while writing the - definition. - """ - type: Literal["db_table"] - table_name: Optional[StrictStr] = Field( - default=None, validate_default=True, - description="The default value of the DBTableInput type.\ - Can be overriden." - ) - - @field_validator("table_name") - def validate_table_name(cls, v, info): - """ - If optional == True, table_name must be set! 
- """ - - optional = info.data.get("optional") - - if optional and v is None: - raise ValueError("If optional is True, table_name must be set.") - - class Entrypoint(BaseModel): description: StrictStr - inputs: Dict[StrictStr, Union[EnvInput, FileInput, DBTableInput]] - outputs: Dict[StrictStr, Union[FileInput, DBTableOutput]] + envs: Optional[Dict[StrictStr, StrictStr]] = None + inputs: Dict[StrictStr, InputOutputModel] + outputs: Dict[StrictStr, InputOutputModel] class ComputeBlock(BaseModel): diff --git a/tests/test_config.py b/tests/test_config.py index 3a426f2..149812e 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -32,16 +32,6 @@ def test_file_not_found(self): with self.assertRaises(FileNotFoundError): load_config("test.yaml") - def test_optional_default_not_set(self): - with self.assertRaises(ValueError): - load_config("optional_default_not_set.yaml", - config_path=self.TEST_CONFIG_FOLDER) - - def test_optional_env_key_not_set(self): - with self.assertRaises(ValueError): - load_config("optional_env_key_not_set.yaml", - config_path=self.TEST_CONFIG_FOLDER) - if __name__ == "__main__": unittest.main() diff --git a/tests/test_config_files/invalid_datatype.yaml b/tests/test_config_files/invalid_datatype.yaml index 8df5b73..e38fe43 100644 --- a/tests/test_config_files/invalid_datatype.yaml +++ b/tests/test_config_files/invalid_datatype.yaml @@ -6,45 +6,36 @@ docker_image: "https://ghcr.io/nlp-toolbox" entrypoints: topic_modelling: description: "Run topic modelling" - inputs: - language: - description: "The language to use" - type: "invalid_type" - env_key: "LANG" - optional: True - default_value: "de" + envs: + LANG: "de" + inputs: text_data: - description: "Text file. Can be uploaded by the user." - type: "file" - env_key: "TXT_SRC_PATH" - optional: False + description: "Text file. Must be uploaded by the user." 
+ type: "invalid_type" + config: + TXT_SRC_PATH: null db_data: description: "Information in a database" type: "db_table" - env_key: "DATA_TABLE_NAME" - optional: True + config: + DATA_TABLE_NAME: "test_db_table" outputs: topic_model: type: "file" description: "Topic model file" - env_key: "OUTPUT_PATH_TOPIC_MODEL" - # Missing file_path here will trigger validation error + config: + OUTPUT_PATH_TOPIC_MODEL: null run_durations: type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" + config: + DURATIONS_TABLE_NAME: "run_durations_table" analyze_runtime: description: "Analyze the runtimes" inputs: run_durations: - type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - optional: True + type: "db_table" outputs: csv_output: type: "file" - description: "A csv containing statistical information" - env_key: "CSV_OUTPUT_PATH" - file_path: "outputs/statistics.csv" + description: "A csv containing statistical information" diff --git a/tests/test_config_files/missing_fields.yaml b/tests/test_config_files/missing_fields.yaml new file mode 100644 index 0000000..db761de --- /dev/null +++ b/tests/test_config_files/missing_fields.yaml @@ -0,0 +1,42 @@ +name: "NLP toolbox" +description: "Contains NLP algorithms..." +author: "John Doe" +docker_image: "https://ghcr.io/nlp-toolbox" + +entrypoints: + topic_modelling: + envs: + LANGUAGE: "de" + inputs: + text_data: + description: "Text file. Can be uploaded by the user." + config: + TXT_SRC_PATH: null + db_data: + description: "Information in a database" + type: "db_table" + config: + DATA_TABLE_NAME: "nlp_information" + outputs: + topic_model: + description: "Topic model file" + config: + OUTPUT_PATH_TOPIC_MODEL: null + run_durations: + type: "db_table" + description: "Table that contains the run durations per day." 
+ config: + RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" + + analyze_runtime: + description: "Analyze the runtimes" + inputs: + run_durations: + config: + RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" + outputs: + csv_output: + type: "file" + description: "A csv containing statistical information" + config: + CSV_OUTPUT_PATH: "outputs/statistics.csv" diff --git a/tests/test_config_files/missing_required_fields.yaml b/tests/test_config_files/missing_required_fields.yaml deleted file mode 100644 index 5187f95..0000000 --- a/tests/test_config_files/missing_required_fields.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: "NLP toolbox" -description: "Contains NLP algorithms..." -author: "John Doe" -docker_image: "https://ghcr.io/nlp-toolbox" - -entrypoints: - topic_modelling: - description: "Run topic modelling" - inputs: - language: - description: "The language to use" - type: "env" - env_key: "LANG" - optional: True - default_value: "de" - text_data: - description: "Text file. Can be uploaded by the user." 
- type: "file" - env_key: "TXT_SRC_PATH" - optional: False - db_data: - description: "Information in a database" - type: "db_table" - env_key: "DATA_TABLE_NAME" - optional: True - outputs: - topic_model: - type: "file" - description: "Topic model file" - env_key: "OUTPUT_PATH_TOPIC_MODEL" - # Missing file_path here will trigger validation error - run_durations: - type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - - analyze_runtime: - description: "Analyze the runtimes" - inputs: - run_durations: - type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - optional: True - outputs: - csv_output: - type: "file" - description: "A csv containing statistical information" - env_key: "CSV_OUTPUT_PATH" - file_path: "outputs/statistics.csv" - diff --git a/tests/test_config_files/optional_default_not_set.yaml b/tests/test_config_files/optional_default_not_set.yaml deleted file mode 100644 index 83ac369..0000000 --- a/tests/test_config_files/optional_default_not_set.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: "NLP toolbox" -description: "Contains NLP algorithms..." -author: "John Doe" -docker_image: "https://ghcr.io/nlp-toolbox" - -entrypoints: - topic_modelling: - description: "Run topic modelling" - inputs: - language: - description: "The language to use" - type: "env" - env_key: "LANG" - optional: True - # Missing default_value here will trigger validation error - text_data: - description: "Text file. Can be uploaded by the user." - type: "file" - env_key: "TXT_SRC_PATH" - optional: False - db_data: - description: "Information in a database" - type: "db_table" - env_key: "DATA_TABLE_NAME" - table_name: "nlp_information" - optional: True - outputs: - topic_model: - type: "file" - description: "Topic model file" - env_key: "OUTPUT_PATH_TOPIC_MODEL" - file_path: "outputs/output.pkl" - run_durations: - description: "A table which contains the run durations per day." 
- type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - - analyze_runtime: - description: "Analyze the runtimes" - inputs: - run_durations: - description: "A table which contains the run durations per day." - type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - optional: True - outputs: - csv_output: - type: "file" - description: "A csv containing statistical information" - env_key: "CSV_OUTPUT_PATH" - file_path: "outputs/statistics.csv" - diff --git a/tests/test_config_files/optional_env_key_not_set.yaml b/tests/test_config_files/optional_env_key_not_set.yaml deleted file mode 100644 index b6e2040..0000000 --- a/tests/test_config_files/optional_env_key_not_set.yaml +++ /dev/null @@ -1,49 +0,0 @@ -name: "NLP toolbox" -description: "Contains NLP algorithms..." -author: "John Doe" -docker_image: "https://ghcr.io/nlp-toolbox" - -entrypoints: - topic_modelling: - description: "Run topic modelling" - inputs: - language: - description: "The language to use" - type: "env" - optional: False - text_data: - description: "Text file. Can be uploaded by the user." 
- type: "file" - env_key: "TXT_SRC_PATH" - optional: False - db_data: - description: "Information in a database" - type: "db_table" - env_key: "DATA_TABLE_NAME" - optional: True - outputs: - topic_model: - type: "file" - description: "Topic model file" - env_key: "OUTPUT_PATH_TOPIC_MODEL" - file_path: "test/test.pkj" - run_durations: - type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - - analyze_runtime: - description: "Analyze the runtimes" - inputs: - run_durations: - type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - optional: True - outputs: - csv_output: - type: "file" - description: "A csv containing statistical information" - env_key: "CSV_OUTPUT_PATH" - file_path: "outputs/statistics.csv" - diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml index 5a15dd0..6d05169 100644 --- a/tests/test_config_files/valid_config.yaml +++ b/tests/test_config_files/valid_config.yaml @@ -6,47 +6,42 @@ docker_image: "https://ghcr.io/nlp-toolbox" entrypoints: topic_modelling: description: "Run topic modelling" + envs: + LANGUAGE: "de" inputs: - language: - description: "The language to use" - type: "env" - env_key: "LANG" - optional: True - default_value: "de" text_data: description: "Text file. Can be uploaded by the user." type: "file" - env_key: "TXT_SRC_PATH" - optional: False + config: + TXT_SRC_PATH: null db_data: description: "Information in a database" type: "db_table" - env_key: "DATA_TABLE_NAME" - table_name: "nlp_information" - optional: True + config: + DATA_TABLE_NAME: "nlp_information" outputs: topic_model: type: "file" description: "Topic model file" - env_key: "OUTPUT_PATH_TOPIC_MODEL" - file_path: "outputs/output.pkl" + config: + OUTPUT_PATH_TOPIC_MODEL: null run_durations: type: "db_table" description: "Table that contains the run durations per day." 
- env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" + config: + RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" analyze_runtime: description: "Analyze the runtimes" inputs: run_durations: + description: "Teble that contains all runtimes and dates" type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - optional: True + config: + RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" outputs: csv_output: type: "file" description: "A csv containing statistical information" - env_key: "CSV_OUTPUT_PATH" - file_path: "outputs/statistics.csv" + config: + CSV_OUTPUT_PATH: "outputs/statistics.csv" From 8cc227000d4c061fda4f8e726a11d5b8bc487273 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Thu, 7 Nov 2024 15:32:19 +0100 Subject: [PATCH 11/22] docs: update readme --- README.md | 37 +++++++++++------------ scystream/sdk/config/models.py | 15 ++++++--- tests/test_config_files/valid_config.yaml | 2 ++ 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 70138bf..50052fa 100644 --- a/README.md +++ b/README.md @@ -54,50 +54,47 @@ docker_image: "https://ghcr.io/nlp-toolbox" entrypoints: topic_modelling: description: "Run topic modelling" + envs: + LANGUAGE: "de" inputs: - language: - description: "The language to use" - type: "env" - env_key: "LANG" - optional: True - default_value: "de" text_data: description: "Text file. Can be uploaded by the user." 
type: "file" - env_key: "TXT_SRC_PATH" - optional: False + config: + TXT_SRC_PATH: null db_data: description: "Information in a database" type: "db_table" - env_key: "DATA_TABLE_NAME" - table_name: "nlp_information" - optional: True + config: + DATA_TABLE_NAME: "nlp_information" + DB_HOST: "time.rwth-aachen.de" + DB_PORT: 1234 outputs: topic_model: type: "file" description: "Topic model file" - env_key: "OUTPUT_PATH_TOPIC_MODEL" - file_path: "outputs/output.pkl" + config: + OUTPUT_PATH_TOPIC_MODEL: null run_durations: type: "db_table" description: "Table that contains the run durations per day." - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" + config: + RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" analyze_runtime: description: "Analyze the runtimes" inputs: run_durations: + description: "Teble that contains all runtimes and dates" type: "db_table" - env_key: "RUN_DURATIONS_TABLE_NAME" - table_name: "run_durations_nlp" - optional: True + config: + RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" outputs: csv_output: type: "file" description: "A csv containing statistical information" - env_key: "CSV_OUTPUT_PATH" - file_path: "outputs/statistics.csv" + config: + CSV_OUTPUT_PATH: "outputs/statistics.csv" ``` To read and validate such a config file u can proceed as follows: diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index f57b07f..18cdc9a 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -1,5 +1,6 @@ -from typing import Optional, Dict, Literal -from pydantic import BaseModel, StrictStr, field_validator, Field +from typing import Optional, Dict, Literal, Union +from pydantic import BaseModel, StrictStr, field_validator, Field, \ + StrictInt, StrictFloat FILE_TYPE_IDENTIFIER = "file" DB_TABLE_TYPE_IDENTIFIER = "db_table" @@ -12,9 +13,13 @@ class InputOutputModel(BaseModel): type: Literal[FILE_TYPE_IDENTIFIER, DB_TABLE_TYPE_IDENTIFIER] description: Optional[StrictStr] = None - 
config: Optional[Dict[StrictStr, Optional[StrictStr]]] = Field( - default=None, - description="The configuration for the input values\ + config: Optional[ + Dict[ + StrictStr, + Optional[Union[StrictStr, StrictInt, StrictFloat]] + ]] = Field( + default=None, + description="The configuration for the input values\ (file_path, table_name, etc.)" ) diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml index 6d05169..e479e71 100644 --- a/tests/test_config_files/valid_config.yaml +++ b/tests/test_config_files/valid_config.yaml @@ -19,6 +19,8 @@ entrypoints: type: "db_table" config: DATA_TABLE_NAME: "nlp_information" + DB_HOST: "time.rwth-aachen.de" + DB_PORT: 1234 outputs: topic_model: type: "file" From 61738de87e69b7f2a032d5daca5cbd8712d2c5fc Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Thu, 7 Nov 2024 16:40:04 +0100 Subject: [PATCH 12/22] style: add better comments --- scystream/sdk/config/models.py | 42 +++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index 18cdc9a..22859c9 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -11,6 +11,18 @@ class InputOutputModel(BaseModel): + """ + Represents configuration for inputs or outputs in a ComputeBlock. + + The configuration is defined as a dictionary with key-value pairs, where: + - The key is the name of an environment variable (e.., `FILE_PATH`, + `TABLE_NAME`). + - The value is the default value for that environment variable, which can + be a string, integer, or float. + + If a value is explicitly set to `null`, validation will fail unless the + ENV-Variable is manually set by the ComputeBlock user. 
+ """ type: Literal[FILE_TYPE_IDENTIFIER, DB_TABLE_TYPE_IDENTIFIER] description: Optional[StrictStr] = None config: Optional[ @@ -25,13 +37,41 @@ class InputOutputModel(BaseModel): class Entrypoint(BaseModel): + """ + Represents an entrypoint within a ComputeBlock. + + An entrypoint includes: + - A description of the entrypoint's purpose. + - A dictionary of environment variables (`envs`), where each key-value + pair represents an environment variable and its default value. + - These variables should be shared variables across the entrypoint + - Input and output configurations, each described by the + `InputOutputModel`. + + If an environment variable’s value is set to `None` in the configuration, + the ComputeBlock user must provide that variable during runtime, or else + the process will fail. + """ description: StrictStr - envs: Optional[Dict[StrictStr, StrictStr]] = None + envs: Optional[Dict[StrictStr, StrictStr] + ] = None # Todo can be set to Null inputs: Dict[StrictStr, InputOutputModel] outputs: Dict[StrictStr, InputOutputModel] class ComputeBlock(BaseModel): + """ + Represents a ComputeBlock configuration, which describes the compute + process, including entrypoints, inputs, and outputs. + + A ComputeBlock is defined by: + - A name, description, and author. + - One or more entrypoints that specify how data is passed into and out of + the compute process. + - Optionally, a Docker image to specify the execution environment. + + At least one entrypoint must be defined for the ComputeBlock to be valid. 
+ """ name: StrictStr description: StrictStr author: StrictStr From c5434011074d0caea5550513117bcc0dad12e705 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Sat, 9 Nov 2024 23:41:43 +0100 Subject: [PATCH 13/22] feat: add more datatypes for env keys --- scystream/sdk/config/models.py | 12 ++++++++---- tests/test_config_files/valid_config.yaml | 2 ++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index 22859c9..cca2008 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, Literal, Union +from typing import Optional, Dict, Literal, Union, List from pydantic import BaseModel, StrictStr, field_validator, Field, \ StrictInt, StrictFloat @@ -28,7 +28,7 @@ class InputOutputModel(BaseModel): config: Optional[ Dict[ StrictStr, - Optional[Union[StrictStr, StrictInt, StrictFloat]] + Optional[Union[StrictStr, StrictInt, StrictFloat, List, bool]] ]] = Field( default=None, description="The configuration for the input values\ @@ -53,8 +53,12 @@ class Entrypoint(BaseModel): the process will fail. 
""" description: StrictStr - envs: Optional[Dict[StrictStr, StrictStr] - ] = None # Todo can be set to Null + envs: Optional[ + Dict[ + StrictStr, + Optional[Union[StrictStr, StrictInt, StrictFloat, List, bool]] + ] + ] = None inputs: Dict[StrictStr, InputOutputModel] outputs: Dict[StrictStr, InputOutputModel] diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml index e479e71..ed443a1 100644 --- a/tests/test_config_files/valid_config.yaml +++ b/tests/test_config_files/valid_config.yaml @@ -21,6 +21,8 @@ entrypoints: DATA_TABLE_NAME: "nlp_information" DB_HOST: "time.rwth-aachen.de" DB_PORT: 1234 + TXT_SRC_PATH: ["test.txt", "hi.txt"] # for testing purposes + IS_INDEXED: True outputs: topic_model: type: "file" From 771b5b1e58d2d7ca207d453b5a32274ac296301f Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Sun, 10 Nov 2024 19:23:46 +0100 Subject: [PATCH 14/22] feat: add basic validation --- README.md | 123 ++++++++++++++++------ scystream/sdk/core.py | 33 ++++-- scystream/sdk/env/settings.py | 30 ++++++ setup.py | 3 +- tests/test_config_files/valid_config.yaml | 2 +- tests/test_core.py | 2 +- tests/test_settings.py | 50 +++++++++ 7 files changed, 202 insertions(+), 41 deletions(-) create mode 100644 scystream/sdk/env/settings.py create mode 100644 tests/test_settings.py diff --git a/README.md b/README.md index 50052fa..a5a87dc 100644 --- a/README.md +++ b/README.md @@ -8,40 +8,27 @@ You can install the package via pip once it's published: pip install scystream-sdk ``` -## Usage - -```python3 -from scystream.sdk.core import entrypoint -from scystream.sdk.scheduler import Scheduler - - -@entrypoint -def example_task(): - print("Executing example_task...") +### Compute Blocks and their configs +One of the central concepts of scystream are the so-called **Compute Blocks**. +A Compute Block describes an independent programm, that acts as some kind of worker +which will be scheduled using the scystream-core application. 
+This worker executes a task (e.g. a NLP task, a crwaling task). -@entrypoint -def another_task(task_name): - print(f"Executing another_task with task name: {task_name}") +Each worker can have multiple entrypoints, each aiming to solve one task. +These entrypoints can be configured from the outside using the **Settings**. +These are basically ENV-Variables, which will be parsed & validated using pydantic. +This SDK aims to implement helper functions and other requirements we expect each +Compute Block to have. -def main(): - Scheduler.list_entrypoints() - Scheduler.execute_function("example_task") - Scheduler.execute_function("another_task", "ScheduledTask") +To understand the concept of such a Compute Block even more, take a look at the +config below. - -if __name__ == "__main__": - main() - -``` - -### Compute Block Config Files We expect every repository which will be used within the scystream application -to contain a `Compute Block Config File`, the `cbc.yaml`, within the root directory. - -This yaml-file describes the compute block itself. -It shows the entrypoints, their inputs and outputs. +to contain a **Compute Block Config File**, the `cbc.yaml`, within the root directory. +This `cbc.yaml` will be used to define the entrypoints, the inputs & outputs each +Compute Block offers, necessary for the scystream-frontend to understand. This is an example `cbc.yaml`: @@ -85,7 +72,7 @@ entrypoints: description: "Analyze the runtimes" inputs: run_durations: - description: "Teble that contains all runtimes and dates" + description: "Table that contains all runtimes and dates" type: "db_table" config: RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" @@ -97,7 +84,10 @@ entrypoints: CSV_OUTPUT_PATH: "outputs/statistics.csv" ``` -To read and validate such a config file u can proceed as follows: +For now, you have to write this config file on your own. However, at some +point you will be able to generate this config from your code. 
+ +To read and validate such a config file you can proceed as follows: ```python3 from scystream.sdk.config.config_loader import load_config @@ -121,15 +111,86 @@ load_config(config_file_name="test.yaml", config_path="configs/") the `config_path` is the path relative to your root directory +## Basic Usage of the SDK + +```python3 +from scystream.sdk.core import entrypoint +from scystream.sdk.scheduler import Scheduler + + +@entrypoint +def example_task(): + print("Executing example_task...") + + +@entrypoint +def another_task(task_name): + print(f"Executing another_task with task name: {task_name}") + + +def main(): + Scheduler.list_entrypoints() + Scheduler.execute_function("example_task") + Scheduler.execute_function("another_task", "ScheduledTask") + + +if __name__ == "__main__": + main() + +``` + +## Defining Settings and Using them. + +Earlier, we already wrote about **Settings**. +Each Input & Output can be configured using these settings. +There are also Global Settings, refered to as `envs` in the `cbc.yaml` + +Below you can find a simple example of how we define & validate these settings. +Therefore you should use the `BaseENVSettings` class. + +```python3 +from scystream.sdk.core import entrypoint +from scystream.sdk.env.settings import BaseENVSettings + +class GlobalSettings(BaseENVSettings): + LANGUAGE: str = "de" + +class TopicModellingEntrypointSettings(BaseENVSettings): + TXT_SRC_PATH: str # if no default provided, setting this ENV manually is a MUST + +@entrypoint(TopicModellingEntrypointSettings) # Pass it to the Entrypoint +def topic_modelling(settings): + print(f"Running topic modelling, using file: {settings.TXT_SRC_PATH}") + +@entrypoint +def test_entrypint(): + print("This entrypoint does not have any configs.") +``` + +We recommend defining your `GlobalSettings` in an extra file and "exporting" the loaded +Settings to make them accessible to other files. 
+See an example below: + +```python3 +from scystream.sdk.env.settings import BaseENVSettings + +class GlobalSettings(BaseENVSettings): + LANGUAGE: str = "de" + +GLOBAL_SETTINGS = GlobalSettings.load_settings() +``` + +You can then use the loaded `GLOBAL_SETTINGS` in your other files, by importing them. ## Development of the SDK ### Installation -1. Create a venv +1. Create a venv and use it ```bash python3 -m venv .venv +source .venv/bin/activate ``` 2. Install the package within the venv diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index 3965d1c..07ee5b1 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -1,15 +1,34 @@ import functools +from typing import Callable, Type, Optional +from .env.settings import BaseENVSettings +from pydantic import ValidationError + _registered_functions = {} -def entrypoint(func): - """Decorator to mark a function as an entrypoint.""" - @functools.wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) - _registered_functions[func.__name__] = func - return wrapper +def entrypoint(settings_class: Optional[Type[BaseENVSettings]] = None): + """ + Decorator to mark a function as an entrypoint. + It also loads and injects the settings of the entrypoint. 
+ """ + def decorator(func: Callable): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if settings_class is not None: + # Load settings + try: + settings = settings_class.load_settings() + except ValidationError as e: + raise ValueError(f"Invalid environment configuration: {e}") + + return func(settings, *args, **kwargs) + else: + return func(*args, **kwargs) + + _registered_functions[func.__name__] = wrapper + return wrapper + return decorator def get_registered_functions(): diff --git a/scystream/sdk/env/settings.py b/scystream/sdk/env/settings.py new file mode 100644 index 0000000..9eab887 --- /dev/null +++ b/scystream/sdk/env/settings.py @@ -0,0 +1,30 @@ +from pydantic_settings import BaseSettings, SettingsConfigDict +from typing import Type + +ENV_FILE_ENCODING = "utf-8" + + +class BaseENVSettings(BaseSettings): + """ + This class acts as the BaseClass which can be used to define custom + ENV-Variables which can be used across the ComputeBlock & for entrypoints + This definition, and pydantic, will then take care of validating the envs + """ + + model_config = SettingsConfigDict( + env_file_encoding=ENV_FILE_ENCODING, + case_sensitive=True, + extra="ignore" + ) + + @classmethod + def load_settings( + cls: Type["BaseENVSettings"], + env_file: str = ".env" + ) -> "BaseENVSettings": + """ + load_settings loads the env file. The name of the env_file can be + passed as an argument. 
+ Returns the parsed ENVs + """ + return cls(_env_file=env_file, _env_file_encoding=ENV_FILE_ENCODING) diff --git a/setup.py b/setup.py index 1a176d3..411a077 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,8 @@ packages=find_packages(), install_requires=[ "pydantic>=2.9.2", - "PyYAML>=6.0.2" + "PyYAML>=6.0.2", + "pydantic-settings>=2.6.1" ], classifiers=[ "Programming Language :: Python :: 3", diff --git a/tests/test_config_files/valid_config.yaml b/tests/test_config_files/valid_config.yaml index ed443a1..73d0c3c 100644 --- a/tests/test_config_files/valid_config.yaml +++ b/tests/test_config_files/valid_config.yaml @@ -39,7 +39,7 @@ entrypoints: description: "Analyze the runtimes" inputs: run_durations: - description: "Teble that contains all runtimes and dates" + description: "Table that contains all runtimes and dates" type: "db_table" config: RUN_DURATIONS_TABLE_NAME: "run_durations_nlp" diff --git a/tests/test_core.py b/tests/test_core.py index 775ae75..f9a19ec 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -4,7 +4,7 @@ class TestEntrypoint(unittest.TestCase): def test_entrypoint_registration(self): - @entrypoint + @entrypoint() def dummy_function(): return "Hello" diff --git a/tests/test_settings.py b/tests/test_settings.py new file mode 100644 index 0000000..755f6d9 --- /dev/null +++ b/tests/test_settings.py @@ -0,0 +1,50 @@ +import unittest +import os +from scystream.sdk.core import entrypoint +from scystream.sdk.env.settings import BaseENVSettings + + +class WithDefaultSettings(BaseENVSettings): + DUMMY_SETTING: str = "this is a dummy setting" + + +class NoDefaultSetting(BaseENVSettings): + DUMMY_SETTING: str + + +class TestSettings(unittest.TestCase): + def test_entrypoint_with_setting_default(self): + @entrypoint(WithDefaultSettings) + def with_default_settings(settings): + return settings.DUMMY_SETTING + + result = with_default_settings() + self.assertEqual(result, "this is a dummy setting") + + """ + environment is set + """ + 
os.environ["DUMMY_SETTING"] = "overridden setting" + result = with_default_settings() + self.assertEqual(result, "overridden setting") + del os.environ["DUMMY_SETTING"] + + def test_entrypoint_with_no_setting_default(self): + @entrypoint(NoDefaultSetting) + def with_no_default_settings(settings): + return settings.DUMMY_SETTING + + with self.assertRaises(ValueError): + with_no_default_settings() + + """ + environemnt is set + """ + os.environ["DUMMY_SETTING"] = "required setting" + result = with_no_default_settings() + self.assertEqual(result, "required setting") + del os.environ["DUMMY_SETTING"] + + +if __name__ == "__main__": + unittest.main() From dde0e500b661aa16c4a6abade78c056639ae72cf Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 11 Nov 2024 03:27:19 +0100 Subject: [PATCH 15/22] feat: add validation --- scystream/sdk/core.py | 2 +- scystream/sdk/env/settings.py | 66 +++++++++++++---- tests/test_settings.py | 132 ++++++++++++++++++++++++++++------ 3 files changed, 163 insertions(+), 37 deletions(-) diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index 07ee5b1..bd1414f 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -18,7 +18,7 @@ def wrapper(*args, **kwargs): if settings_class is not None: # Load settings try: - settings = settings_class.load_settings() + settings = settings_class.from_env() except ValidationError as e: raise ValueError(f"Invalid environment configuration: {e}") diff --git a/scystream/sdk/env/settings.py b/scystream/sdk/env/settings.py index 9eab887..d1aba76 100644 --- a/scystream/sdk/env/settings.py +++ b/scystream/sdk/env/settings.py @@ -1,16 +1,18 @@ +from pathlib import Path from pydantic_settings import BaseSettings, SettingsConfigDict -from typing import Type +from typing import Union, List, get_type_hints +from pydantic import Field ENV_FILE_ENCODING = "utf-8" class BaseENVSettings(BaseSettings): """ - This class acts as the BaseClass which can be used to define custom - ENV-Variables which can be used 
across the ComputeBlock & for entrypoints - This definition, and pydantic, will then take care of validating the envs - """ + Allow kwargs to propagate to any fields whose default factory extends + BaseSettings, + This is mostly to allow _env_file to be passed through. + """ model_config = SettingsConfigDict( env_file_encoding=ENV_FILE_ENCODING, case_sensitive=True, @@ -18,13 +20,51 @@ class BaseENVSettings(BaseSettings): ) @classmethod - def load_settings( - cls: Type["BaseENVSettings"], - env_file: str = ".env" - ) -> "BaseENVSettings": + def from_env( + cls, + env_file: Union[str, Path, List[Union[str, Path]]] = None, + *args, + **kwargs + ): + return cls(propagate_kwargs={"_env_file": env_file}, *args, **kwargs) + + @classmethod + def _basesettings_fields(cls): """ - load_settings loads the env file. The name of the env_file can be - passed as an argument. - Returns the parsed ENVs + :return a dict of field_name: default_factory for any fields that + extend BaseSettings """ - return cls(_env_file=env_file, _env_file_encoding=ENV_FILE_ENCODING) + type_hints = get_type_hints(cls) + return { + name: typ for name, typ in type_hints.items() + if isinstance(typ, type) and issubclass(typ, BaseSettings) + + } + + @classmethod + def _propagate_kwargs(cls, kwargs): + """ + Any settings that extend BaseSettings be passed the kwargs. + """ + sub_settings = cls._basesettings_fields() + for name, field_type in sub_settings.items(): + kwargs[name] = field_type(**kwargs) + return kwargs + + def __init_subclass__(cls, **kwargs): + """ + Automatically set up nested settings fields with default_factory. 
+ """ + super().__init_subclass__(**kwargs) + type_hints = get_type_hints(cls) + for field_name, field_type in type_hints.items(): + if isinstance(field_type, type) and issubclass( + field_type, BaseSettings): + # Set a default factory for nested BaseSettings fields + default_field = Field(default_factory=field_type) + setattr(cls, field_name, default_field) + + def __init__(self, propagate_kwargs=None, *args, **kwargs): + if propagate_kwargs: + kwargs = self._propagate_kwargs(propagate_kwargs) + super().__init__(*args, **kwargs) diff --git a/tests/test_settings.py b/tests/test_settings.py index 755f6d9..4aceebd 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -2,48 +2,134 @@ import os from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import BaseENVSettings +from scystream.sdk.scheduler import Scheduler + + +class DummyInputSettings(BaseENVSettings): + DUMMY_INPUT: str = "test" class WithDefaultSettings(BaseENVSettings): - DUMMY_SETTING: str = "this is a dummy setting" + DUMMY_GLOBAL: str = "dummy global var" + + dummy_input_settings: DummyInputSettings + + +class DummyInputSettingsNoDef(BaseENVSettings): + DUMMY_INPUT: str + + +class WithoutDefaultSettings(BaseENVSettings): + DUMMY_GLOBAL: str + + dummy_input_settings_no_def: DummyInputSettingsNoDef + + +class WithoutDefaultNoNesting(BaseENVSettings): + TEST: str = "teststr" + MUST_SET: str + + +class SubOne(BaseENVSettings): + ONE: str + TWO: str + + +class SubTwo(BaseENVSettings): + TEST: str + NO_DEF: str -class NoDefaultSetting(BaseENVSettings): - DUMMY_SETTING: str +class TwoSubclasses(BaseENVSettings): + GLOBAL: str + + input_one: SubOne + input_two: SubTwo class TestSettings(unittest.TestCase): def test_entrypoint_with_setting_default(self): @entrypoint(WithDefaultSettings) def with_default_settings(settings): - return settings.DUMMY_SETTING + return settings.dummy_input_settings.DUMMY_INPUT result = with_default_settings() - self.assertEqual(result, "this is a 
dummy setting") + self.assertEqual(result, "test") - """ - environment is set - """ - os.environ["DUMMY_SETTING"] = "overridden setting" + # set environ + os.environ["DUMMY_INPUT"] = "overridden setting" result = with_default_settings() + # check if overriding works self.assertEqual(result, "overridden setting") - del os.environ["DUMMY_SETTING"] - def test_entrypoint_with_no_setting_default(self): - @entrypoint(NoDefaultSetting) - def with_no_default_settings(settings): - return settings.DUMMY_SETTING + del os.environ["DUMMY_INPUT"] + + def test_entrypoint_no_setting_default_one(self): + @entrypoint(WithoutDefaultSettings) + def without_def_settings(settings): + print("test...") + # do we fail if environments not set with self.assertRaises(ValueError): - with_no_default_settings() - - """ - environemnt is set - """ - os.environ["DUMMY_SETTING"] = "required setting" - result = with_no_default_settings() - self.assertEqual(result, "required setting") - del os.environ["DUMMY_SETTING"] + Scheduler.execute_function("without_def_settings") + + def test_entrypoint_no_setting_default_two(self): + @entrypoint(WithoutDefaultSettings) + def without_def_settings(settings): + return ( + settings.DUMMY_GLOBAL, + settings.dummy_input_settings_no_def.DUMMY_INPUT + ) + + # set environments + os.environ["DUMMY_GLOBAL"] = "dummy global" + os.environ["DUMMY_INPUT"] = "dummy input" + + # check if environments have been set + result = without_def_settings() + self.assertEqual(result[0], "dummy global") + self.assertEqual(result[1], "dummy input") + + del os.environ["DUMMY_GLOBAL"] + del os.environ["DUMMY_INPUT"] + + def test_entrypoint_no_setting_defautl_three(self): + @entrypoint(WithoutDefaultNoNesting) + def no_nesting(settings): + print("testing...") + + with self.assertRaises(ValueError): + Scheduler.execute_function("no_nesting") + + def test_two_subs(self): + @entrypoint(TwoSubclasses) + def two_subs(settings): + return ( + settings.GLOBAL, + settings.input_one.ONE, + 
settings.input_one.TWO, + settings.input_two.TEST, + settings.input_two.NO_DEF + ) + + os.environ["GLOBAL"] = "global" + os.environ["ONE"] = "one" + os.environ["TWO"] = "two" + os.environ["TEST"] = "test" + os.environ["NO_DEF"] = "no_def" + + result = two_subs() + self.assertEqual(result[0], "global") + self.assertEqual(result[1], "one") + self.assertEqual(result[2], "two") + self.assertEqual(result[3], "test") + self.assertEqual(result[4], "no_def") + + del os.environ["GLOBAL"] + del os.environ["ONE"] + del os.environ["TWO"] + del os.environ["TEST"] + del os.environ["NO_DEF"] if __name__ == "__main__": From 1e797700232b8a3eb0b6a4be7b6767fca4b82be7 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 11 Nov 2024 03:38:26 +0100 Subject: [PATCH 16/22] docs: update readme --- README.md | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index a5a87dc..c8f39d1 100644 --- a/README.md +++ b/README.md @@ -152,36 +152,29 @@ Therefore you should use the `BaseENVSettings` class. 
from scystream.sdk.core import entrypoint from scystream.sdk.env.settings import BaseENVSettings -class GlobalSettings(BaseENVSettings): - LANGUAGE: str = "de" +class TextDataInputSettings(BaseENVSettings): + TXT_SRC_PATH: str # no default provided, manual setting is a MUST + +class DBDataInputSettings(BaseENVSettings): + DATA_TABLE_NAME: str = "nlp_information" + DB_HOST: str = "time.rwth-aachen.de" + DB_PORT: str = 1234 class TopicModellingEntrypointSettings(BaseENVSettings): - TXT_SRC_PATH: str # if no default provided, setting this ENV manually is a MUST + LANGUAGE: str = "de" + + text_data: TextDataInputSettings + db_data: DBDataInputSettings @entrypoint(TopicModellingEntrypointSettings) # Pass it to the Entrypoint def topic_modelling(settings): - print(f"Running topic modelling, using file: {settings.TXT_SRC_PATH}") + print(f"Running topic modelling, using file: {settings.text_data.TXT_SRC_PATH}") @entrypoint def test_entrypint(): print("This entrypoint does not have any configs.") ``` -We recommend defining your `GlobalSettings` in an extra file and "exporting" the loaded -Settings to make them accessible to other files. -See an example below: - -```python3 -from scystream.sdk.env.settings import BaseENVSettings - -class GlobalSettings(BaseENVSettings): - LANGUAGE: str = "de" - -GLOBAL_SETTINGS = GlobalSettings.load_settings() -``` - -You can then use the loaded `GLOBAL_SETTINGS` in your other files, by importing them. 
- ## Development of the SDK ### Installation From f3553ad80e7f04b2d9b7b587e725279ed4f6ce27 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 11 Nov 2024 14:50:37 +0100 Subject: [PATCH 17/22] style: rename base class --- README.md | 22 +++++++++++++--------- scystream/sdk/core.py | 6 +++--- scystream/sdk/env/settings.py | 6 +++++- tests/test_settings.py | 18 +++++++++--------- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index c8f39d1..65034fa 100644 --- a/README.md +++ b/README.md @@ -118,12 +118,12 @@ from scystream.sdk.core import entrypoint from scystream.sdk.scheduler import Scheduler -@entrypoint +@entrypoint() def example_task(): print("Executing example_task...") -@entrypoint +@entrypoint() def another_task(task_name): print(f"Executing another_task with task name: {task_name}") @@ -146,35 +146,39 @@ Each Input & Output can be configured using these settings. There are also Global Settings, refered to as `envs` in the `cbc.yaml` Below you can find a simple example of how we define & validate these settings. -Therefore you should use the `BaseENVSettings` class. +Therefore you should use the `EnvSettings` class. 
```python3 from scystream.sdk.core import entrypoint -from scystream.sdk.env.settings import BaseENVSettings +from scystream.sdk.env.settings import EnvSettings -class TextDataInputSettings(BaseENVSettings): +class TextDataInputSettings(EnvSettings): TXT_SRC_PATH: str # no default provided, manual setting is a MUST -class DBDataInputSettings(BaseENVSettings): +class DBDataInputSettings(EnvSettings): DATA_TABLE_NAME: str = "nlp_information" DB_HOST: str = "time.rwth-aachen.de" DB_PORT: str = 1234 -class TopicModellingEntrypointSettings(BaseENVSettings): +class TopicModellingEntrypointSettings(EnvSettings): LANGUAGE: str = "de" text_data: TextDataInputSettings db_data: DBDataInputSettings @entrypoint(TopicModellingEntrypointSettings) # Pass it to the Entrypoint -def topic_modelling(settings): +def topic_modelling(settings): # The settings param is automatically injected to your function, you can use it print(f"Running topic modelling, using file: {settings.text_data.TXT_SRC_PATH}") -@entrypoint +@entrypoint() def test_entrypint(): print("This entrypoint does not have any configs.") ``` +Of course, you will also be able to use your settings in other files/directories. +For that, just import your desired setting and use the `get_settings()` function. +It will load the configurations correctly. + ## Development of the SDK ### Installation diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index bd1414f..5dd8b12 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -1,13 +1,13 @@ import functools from typing import Callable, Type, Optional -from .env.settings import BaseENVSettings +from .env.settings import EnvSettings from pydantic import ValidationError _registered_functions = {} -def entrypoint(settings_class: Optional[Type[BaseENVSettings]] = None): +def entrypoint(settings_class: Optional[Type[EnvSettings]] = None): """ Decorator to mark a function as an entrypoint. It also loads and injects the settings of the entrypoint. 
@@ -18,7 +18,7 @@ def wrapper(*args, **kwargs): if settings_class is not None: # Load settings try: - settings = settings_class.from_env() + settings = settings_class.get_settings() except ValidationError as e: raise ValueError(f"Invalid environment configuration: {e}") diff --git a/scystream/sdk/env/settings.py b/scystream/sdk/env/settings.py index d1aba76..b217b93 100644 --- a/scystream/sdk/env/settings.py +++ b/scystream/sdk/env/settings.py @@ -6,7 +6,7 @@ ENV_FILE_ENCODING = "utf-8" -class BaseENVSettings(BaseSettings): +class EnvSettings(BaseSettings): """ Allow kwargs to propagate to any fields whose default factory extends BaseSettings, @@ -51,6 +51,10 @@ def _propagate_kwargs(cls, kwargs): kwargs[name] = field_type(**kwargs) return kwargs + @classmethod + def get_settings(cls): + return cls.from_env(env_file=".env") + def __init_subclass__(cls, **kwargs): """ Automatically set up nested settings fields with default_factory. diff --git a/tests/test_settings.py b/tests/test_settings.py index 4aceebd..b6144d2 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -1,46 +1,46 @@ import unittest import os from scystream.sdk.core import entrypoint -from scystream.sdk.env.settings import BaseENVSettings +from scystream.sdk.env.settings import EnvSettings from scystream.sdk.scheduler import Scheduler -class DummyInputSettings(BaseENVSettings): +class DummyInputSettings(EnvSettings): DUMMY_INPUT: str = "test" -class WithDefaultSettings(BaseENVSettings): +class WithDefaultSettings(EnvSettings): DUMMY_GLOBAL: str = "dummy global var" dummy_input_settings: DummyInputSettings -class DummyInputSettingsNoDef(BaseENVSettings): +class DummyInputSettingsNoDef(EnvSettings): DUMMY_INPUT: str -class WithoutDefaultSettings(BaseENVSettings): +class WithoutDefaultSettings(EnvSettings): DUMMY_GLOBAL: str dummy_input_settings_no_def: DummyInputSettingsNoDef -class WithoutDefaultNoNesting(BaseENVSettings): +class WithoutDefaultNoNesting(EnvSettings): TEST: str = 
"teststr" MUST_SET: str -class SubOne(BaseENVSettings): +class SubOne(EnvSettings): ONE: str TWO: str -class SubTwo(BaseENVSettings): +class SubTwo(EnvSettings): TEST: str NO_DEF: str -class TwoSubclasses(BaseENVSettings): +class TwoSubclasses(EnvSettings): GLOBAL: str input_one: SubOne From bf8936164c819a8bc1a47e5043ab35b2820fef40 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Mon, 11 Nov 2024 22:00:58 +0100 Subject: [PATCH 18/22] feat: add input output abstraction to settings --- scystream/sdk/config/config_loader.py | 23 ++++++++ scystream/sdk/config/models.py | 9 ++- scystream/sdk/core.py | 81 ++++++++++++++++++++++++++- scystream/sdk/env/settings.py | 16 ++++++ scystream/sdk/scheduler.py | 2 +- 5 files changed, 124 insertions(+), 7 deletions(-) diff --git a/scystream/sdk/config/config_loader.py b/scystream/sdk/config/config_loader.py index f68e565..13760e2 100644 --- a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -7,6 +7,18 @@ CONFIG_FILE_DEFAULT_NAME = "cbc.yaml" +def _remove_empty_dicts(data): + """ + Remove keys with empty dictionaries from a nested structure. 
+ """ + if isinstance(data, dict): + return {k: _remove_empty_dicts(v) for k, v in data.items() if v != {}} + elif isinstance(data, list): + return [_remove_empty_dicts(i) for i in data] + else: + return data + + def load_config( config_file_name: str = CONFIG_FILE_DEFAULT_NAME, config_path: Union[str, Path] = None @@ -18,11 +30,22 @@ def load_config( try: file = _find_and_load_config(config_file_name, config_path) block = ComputeBlock(**file) + # TODO: Check if envs && input/output configs correspond to the + # loaded one return block except ValidationError as e: raise ValueError(f"Configuration file validation error: {e}") +def generate_yaml_from_compute_block( + compute_block: ComputeBlock, + output_path: Path +): + cleaned_data = _remove_empty_dicts(compute_block.dict()) + with output_path.open("w") as file: + yaml.dump(cleaned_data, file, default_flow_style=False) + + def _find_and_load_config( config_file_name: str, config_path: Union[str, Path] = None diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index cca2008..bcc67de 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -4,6 +4,8 @@ FILE_TYPE_IDENTIFIER = "file" DB_TABLE_TYPE_IDENTIFIER = "db_table" +# TODO: reevaluate the identifier +TODO_TYPE_IDENTIFIER = "TODO: SetType" """ This file contains the schema definition for the config file. @@ -23,7 +25,8 @@ class InputOutputModel(BaseModel): If a value is explicitly set to `null`, validation will fail unless the ENV-Variable is manually set by the ComputeBlock user. 
""" - type: Literal[FILE_TYPE_IDENTIFIER, DB_TABLE_TYPE_IDENTIFIER] + type: Literal[FILE_TYPE_IDENTIFIER, + DB_TABLE_TYPE_IDENTIFIER, TODO_TYPE_IDENTIFIER] description: Optional[StrictStr] = None config: Optional[ Dict[ @@ -59,8 +62,8 @@ class Entrypoint(BaseModel): Optional[Union[StrictStr, StrictInt, StrictFloat, List, bool]] ] ] = None - inputs: Dict[StrictStr, InputOutputModel] - outputs: Dict[StrictStr, InputOutputModel] + inputs: Optional[Dict[StrictStr, InputOutputModel]] = None + outputs: Optional[Dict[StrictStr, InputOutputModel]] = None class ComputeBlock(BaseModel): diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index 5dd8b12..58cf18d 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -1,8 +1,11 @@ import functools - -from typing import Callable, Type, Optional +from typing import Callable, Type, Optional, Union from .env.settings import EnvSettings from pydantic import ValidationError +from scystream.sdk.config.models import ComputeBlock, Entrypoint, \ + InputOutputModel +from pydantic_core import PydanticUndefinedType +from scystream.sdk.env.settings import InputSettings, OutputSettings _registered_functions = {} @@ -18,6 +21,9 @@ def wrapper(*args, **kwargs): if settings_class is not None: # Load settings try: + # TODO: 1. LoadSettings + # TODO: 2. Generate config from settings (only for the entrypoint) + # TODO: 3. 
Validate if generated config and given config are same settings = settings_class.get_settings() except ValidationError as e: raise ValueError(f"Invalid environment configuration: {e}") @@ -26,11 +32,80 @@ def wrapper(*args, **kwargs): else: return func(*args, **kwargs) - _registered_functions[func.__name__] = wrapper + _registered_functions[func.__name__] = { + "function": wrapper, + "settings": settings_class + } return wrapper return decorator def get_registered_functions(): """Returns a dictionary of registered entrypoint functions.""" + print(_registered_functions) return _registered_functions + + +def _get_pydantic_default_value_or_none(value): + if type(value.default) is PydanticUndefinedType: + return None + return value.default + + +def _build_input_output_dict_from_class( + subject: Union[InputSettings, OutputSettings] +): + config_dict = {} + for key, value in subject.model_fields.items(): + config_dict[key] = _get_pydantic_default_value_or_none(value) + return InputOutputModel( + type="TODO: SetType", + description="", + config=config_dict + ) + + +def generate_compute_block() -> ComputeBlock: + """ + Converts the Settings to a ComputeBlock + """ + entrypoints = {} + for entrypoint, func in _registered_functions.items(): + envs = {} + inputs = {} + outputs = {} + + if func["settings"]: + entrypoint_settings_class = func["settings"] + for key, value in entrypoint_settings_class.model_fields.items(): + if ( + isinstance(value.default_factory, type) and + issubclass(value.default_factory, InputSettings) + ): + inputs[key] = _build_input_output_dict_from_class( + value.default_factory + ) + elif ( + isinstance(value.default_factory, type) and + issubclass(value.default_factory, OutputSettings) + ): + outputs[key] = _build_input_output_dict_from_class( + value.default_factory + ) + else: + envs[key] = _get_pydantic_default_value_or_none(value) + + entrypoints[entrypoint] = Entrypoint( + description="", + envs=envs, + inputs=inputs, + outputs=outputs + ) + + 
return ComputeBlock( + name="", + description="", + author="", + entrypoints=entrypoints, + docker_image="" + ) diff --git a/scystream/sdk/env/settings.py b/scystream/sdk/env/settings.py index b217b93..e0c27c8 100644 --- a/scystream/sdk/env/settings.py +++ b/scystream/sdk/env/settings.py @@ -72,3 +72,19 @@ def __init__(self, propagate_kwargs=None, *args, **kwargs): if propagate_kwargs: kwargs = self._propagate_kwargs(propagate_kwargs) super().__init__(*args, **kwargs) + + +class InputSettings(EnvSettings): + """ + Abstraction-Layer for inputs + could be extended + """ + pass + + +class OutputSettings(EnvSettings): + """ + Abstraction-Layer for outputs + could be exended + """ + pass diff --git a/scystream/sdk/scheduler.py b/scystream/sdk/scheduler.py index f403c83..c610897 100644 --- a/scystream/sdk/scheduler.py +++ b/scystream/sdk/scheduler.py @@ -13,6 +13,6 @@ def list_entrypoints(): def execute_function(name, *args, **kwargs): functions = get_registered_functions() if name in functions: - return functions[name](*args, **kwargs) + return functions[name]["function"](*args, **kwargs) else: raise Exception(f"No entrypoint found with the name: {name}") From 55dac98ead5c3b3b1e2fd27b7536a9728d296225 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Tue, 12 Nov 2024 03:08:09 +0100 Subject: [PATCH 19/22] feat: validate in outputs with load_and_validate func --- scystream/sdk/config/compute_block_utils.py | 72 ++++++++++++++++++ scystream/sdk/config/config_loader.py | 24 ++++-- scystream/sdk/config/models.py | 57 ++++++++++++-- scystream/sdk/core.py | 82 ++------------------- scystream/sdk/env/settings.py | 2 - tests/test_config.py | 19 ++--- tests/test_core.py | 2 +- 7 files changed, 158 insertions(+), 100 deletions(-) create mode 100644 scystream/sdk/config/compute_block_utils.py diff --git a/scystream/sdk/config/compute_block_utils.py b/scystream/sdk/config/compute_block_utils.py new file mode 100644 index 0000000..6c6f8bb --- /dev/null +++ 
b/scystream/sdk/config/compute_block_utils.py @@ -0,0 +1,72 @@ +from pydantic_core import PydanticUndefinedType +from scystream.sdk.config.models import ComputeBlock, Entrypoint, \ + InputOutputModel +from scystream.sdk.env.settings import InputSettings, \ + OutputSettings +from typing import Union +from scystream.sdk.core import get_registered_functions + + +def _get_pydantic_default_value_or_none(value): + if type(value.default) is PydanticUndefinedType: + return None + return value.default + + +def _build_input_output_dict_from_class( + subject: Union[InputSettings, OutputSettings] +): + config_dict = {} + for key, value in subject.model_fields.items(): + config_dict[key] = _get_pydantic_default_value_or_none(value) + return InputOutputModel( + type="TODO: SetType", + description="", + config=config_dict + ) + + +def get_compute_block() -> ComputeBlock: + """ + Converts Entrypoints & Settings to a ComputeBlock + """ + entrypoints = {} + for entrypoint, func in get_registered_functions().items(): + envs = {} + inputs = {} + outputs = {} + + if func["settings"]: + entrypoint_settings_class = func["settings"] + for key, value in entrypoint_settings_class.model_fields.items(): + if ( + isinstance(value.default_factory, type) and + issubclass(value.default_factory, InputSettings) + ): + inputs[key] = _build_input_output_dict_from_class( + value.default_factory + ) + elif ( + isinstance(value.default_factory, type) and + issubclass(value.default_factory, OutputSettings) + ): + outputs[key] = _build_input_output_dict_from_class( + value.default_factory + ) + else: + envs[key] = _get_pydantic_default_value_or_none(value) + + entrypoints[entrypoint] = Entrypoint( + description="", + envs=envs, + inputs=inputs, + outputs=outputs + ) + + return ComputeBlock( + name="", + description="", + author="", + entrypoints=entrypoints, + docker_image="" + ) diff --git a/scystream/sdk/config/config_loader.py b/scystream/sdk/config/config_loader.py index 13760e2..49bdc97 100644 --- 
a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -3,6 +3,7 @@ from pydantic import ValidationError from pathlib import Path from .models import ComputeBlock +from scystream.sdk.config.compute_block_utils import get_compute_block CONFIG_FILE_DEFAULT_NAME = "cbc.yaml" @@ -19,9 +20,9 @@ def _remove_empty_dicts(data): return data -def load_config( +def load_and_validate_config( config_file_name: str = CONFIG_FILE_DEFAULT_NAME, - config_path: Union[str, Path] = None + config_path: Union[str, Path] = None, ) -> ComputeBlock: """ Returns and Validates the Compute Block YAML definition. @@ -29,15 +30,24 @@ def load_config( """ try: file = _find_and_load_config(config_file_name, config_path) - block = ComputeBlock(**file) - # TODO: Check if envs && input/output configs correspond to the - # loaded one - return block + block_from_cfg = ComputeBlock(**file) + block_from_code = get_compute_block() + + if ( + block_from_cfg != block_from_code + ): + # check the total config + raise ValueError( + "The entrypoint configs (envs, inputs, outputs) defined in " + "your config yaml do not correspond with the entrypoint " + "settings defined in your code." + ) + return block_from_code except ValidationError as e: raise ValueError(f"Configuration file validation error: {e}") -def generate_yaml_from_compute_block( +def generate_config_from_compute_block( compute_block: ComputeBlock, output_path: Path ): diff --git a/scystream/sdk/config/models.py b/scystream/sdk/config/models.py index bcc67de..29313bf 100644 --- a/scystream/sdk/config/models.py +++ b/scystream/sdk/config/models.py @@ -4,13 +4,9 @@ FILE_TYPE_IDENTIFIER = "file" DB_TABLE_TYPE_IDENTIFIER = "db_table" -# TODO: reevaluate the identifier +# TODO: reevaluate identifier TODO_TYPE_IDENTIFIER = "TODO: SetType" -""" -This file contains the schema definition for the config file. 
-""" - class InputOutputModel(BaseModel): """ @@ -38,6 +34,20 @@ class InputOutputModel(BaseModel): (file_path, table_name, etc.)" ) + def __eq__(self, other): + """ + Compares the configuration keys only, as other attributes + are not relevant for determining equality at this stage. + """ + if isinstance(other, InputOutputModel): + return ( + self._sorted_config() == other._sorted_config() + ) + return False + + def _sorted_config(self): + return dict(sorted(self.config.items() if self.config else {})) + class Entrypoint(BaseModel): """ @@ -65,6 +75,28 @@ class Entrypoint(BaseModel): inputs: Optional[Dict[StrictStr, InputOutputModel]] = None outputs: Optional[Dict[StrictStr, InputOutputModel]] = None + def __eq__(self, other): + """ + Compares the envs, inputs, outputs only, as other attributes + are not relevant for determining equality at this stage. + """ + if isinstance(other, Entrypoint): + return ( + self._sorted_envs() == other._sorted_envs() and + self._sorted_inputs() == other._sorted_inputs() and + self._sorted_outputs() == other._sorted_outputs() + ) + return False + + def _sorted_envs(self): + return dict(sorted(self.envs.items()) if self.envs else {}) + + def _sorted_inputs(self): + return dict(sorted(self.inputs.items()) if self.inputs else {}) + + def _sorted_outputs(self): + return dict(sorted(self.outputs.items()) if self.outputs else {}) + class ComputeBlock(BaseModel): """ @@ -90,3 +122,18 @@ def check_entrypoints(cls, v): if not v: raise ValueError("At least one entrypoint must be defined.") return v + + def __eq__(self, other): + """ + Compares the entrypoints only, as other attributes + are not relevant for determining equality at this stage. 
+ """ + + if isinstance(other, ComputeBlock): + return ( + self._sorted_entrypoints() == other._sorted_entrypoints() + ) + return False + + def _sorted_entrypoints(self): + return {key: value for key, value in sorted(self.entrypoints.items())} diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index 58cf18d..c514d3b 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -1,11 +1,7 @@ import functools -from typing import Callable, Type, Optional, Union -from .env.settings import EnvSettings +from typing import Callable, Type, Optional +from scystream.sdk.env.settings import EnvSettings from pydantic import ValidationError -from scystream.sdk.config.models import ComputeBlock, Entrypoint, \ - InputOutputModel -from pydantic_core import PydanticUndefinedType -from scystream.sdk.env.settings import InputSettings, OutputSettings _registered_functions = {} @@ -19,15 +15,15 @@ def decorator(func: Callable): @functools.wraps(func) def wrapper(*args, **kwargs): if settings_class is not None: - # Load settings + # TODO: validate the entrypoint settings with the config yaml + try: - # TODO: 1. LoadSettings - # TODO: 2. Generate config from settings (only for the entrypoint) - # TODO: 3. 
Validate if generated config and given config are same + # load the settings settings = settings_class.get_settings() except ValidationError as e: raise ValueError(f"Invalid environment configuration: {e}") + # inject the settings return func(settings, *args, **kwargs) else: return func(*args, **kwargs) @@ -42,70 +38,4 @@ def wrapper(*args, **kwargs): def get_registered_functions(): """Returns a dictionary of registered entrypoint functions.""" - print(_registered_functions) return _registered_functions - - -def _get_pydantic_default_value_or_none(value): - if type(value.default) is PydanticUndefinedType: - return None - return value.default - - -def _build_input_output_dict_from_class( - subject: Union[InputSettings, OutputSettings] -): - config_dict = {} - for key, value in subject.model_fields.items(): - config_dict[key] = _get_pydantic_default_value_or_none(value) - return InputOutputModel( - type="TODO: SetType", - description="", - config=config_dict - ) - - -def generate_compute_block() -> ComputeBlock: - """ - Converts the Settings to a ComputeBlock - """ - entrypoints = {} - for entrypoint, func in _registered_functions.items(): - envs = {} - inputs = {} - outputs = {} - - if func["settings"]: - entrypoint_settings_class = func["settings"] - for key, value in entrypoint_settings_class.model_fields.items(): - if ( - isinstance(value.default_factory, type) and - issubclass(value.default_factory, InputSettings) - ): - inputs[key] = _build_input_output_dict_from_class( - value.default_factory - ) - elif ( - isinstance(value.default_factory, type) and - issubclass(value.default_factory, OutputSettings) - ): - outputs[key] = _build_input_output_dict_from_class( - value.default_factory - ) - else: - envs[key] = _get_pydantic_default_value_or_none(value) - - entrypoints[entrypoint] = Entrypoint( - description="", - envs=envs, - inputs=inputs, - outputs=outputs - ) - - return ComputeBlock( - name="", - description="", - author="", - entrypoints=entrypoints, - 
docker_image="" - ) diff --git a/scystream/sdk/env/settings.py b/scystream/sdk/env/settings.py index e0c27c8..981353e 100644 --- a/scystream/sdk/env/settings.py +++ b/scystream/sdk/env/settings.py @@ -79,7 +79,6 @@ class InputSettings(EnvSettings): Abstraction-Layer for inputs could be extended """ - pass class OutputSettings(EnvSettings): @@ -87,4 +86,3 @@ class OutputSettings(EnvSettings): Abstraction-Layer for outputs could be exended """ - pass diff --git a/tests/test_config.py b/tests/test_config.py index 149812e..3426024 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,5 +1,6 @@ import unittest -from scystream.sdk.config.config_loader import load_config, ComputeBlock +from scystream.sdk.config.config_loader import load_and_validate_config, \ + ComputeBlock class TestComputeBlockValidation(unittest.TestCase): @@ -7,7 +8,7 @@ class TestComputeBlockValidation(unittest.TestCase): def test_valid_config(self): try: - compute_block = load_config( + compute_block = load_and_validate_config( "valid_config.yaml", config_path=self.TEST_CONFIG_FOLDER) self.assertIsInstance(compute_block, ComputeBlock) except Exception: @@ -15,22 +16,22 @@ def test_valid_config(self): def test_missing_entrypoints(self): with self.assertRaises(ValueError): - load_config("missing_entrypoints.yaml", - config_path=self.TEST_CONFIG_FOLDER) + load_and_validate_config("missing_entrypoints.yaml", + config_path=self.TEST_CONFIG_FOLDER) def test_invalid_datatypes(self): with self.assertRaises(ValueError): - load_config("invalid_datatype.yaml", - config_path=self.TEST_CONFIG_FOLDER) + load_and_validate_config("invalid_datatype.yaml", + config_path=self.TEST_CONFIG_FOLDER) def test_not_a_yaml(self): with self.assertRaises(ValueError): - load_config("not_a_yaml.json", - config_path=self.TEST_CONFIG_FOLDER) + load_and_validate_config("not_a_yaml.json", + config_path=self.TEST_CONFIG_FOLDER) def test_file_not_found(self): with self.assertRaises(FileNotFoundError): - 
load_config("test.yaml") + load_and_validate_config("test.yaml") if __name__ == "__main__": diff --git a/tests/test_core.py b/tests/test_core.py index f9a19ec..27107ed 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -10,7 +10,7 @@ def dummy_function(): registered = get_registered_functions() self.assertIn("dummy_function", registered) - self.assertEqual(registered["dummy_function"](), "Hello") + self.assertEqual(registered["dummy_function"]["function"](), "Hello") if __name__ == "__main__": From 5e12337d9250f104530638806a570c9cf942bd7d Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Tue, 12 Nov 2024 17:23:52 +0100 Subject: [PATCH 20/22] feat: validate on execute and custom validation function --- README.md | 185 +++++++++++--------- scystream/sdk/config/compute_block_utils.py | 10 +- scystream/sdk/config/config_loader.py | 60 ++++--- scystream/sdk/config/entrypoints.py | 12 ++ scystream/sdk/core.py | 18 +- scystream/sdk/scheduler.py | 9 +- 6 files changed, 168 insertions(+), 126 deletions(-) create mode 100644 scystream/sdk/config/entrypoints.py diff --git a/README.md b/README.md index 65034fa..7bb036c 100644 --- a/README.md +++ b/README.md @@ -8,22 +8,93 @@ You can install the package via pip once it's published: pip install scystream-sdk ``` -### Compute Blocks and their configs +## Introduction + One of the central concepts of scystream are the so-called **Compute Blocks**. A Compute Block describes an independent programm, that acts as some kind of worker which will be scheduled using the scystream-core application. -This worker executes a task (e.g. a NLP task, a crwaling task). +This worker executes a task (e.g. a NLP task, a crawling task). + +This SDK aims to provide helper functions and all other requirements you need to implement +a custom Compute Block on your own. Each worker can have multiple entrypoints, each aiming to solve one task. These entrypoints can be configured from the outside using the **Settings**. 
These are basically ENV-Variables, which will be parsed & validated using pydantic. -This SDK aims to implement helper functions and other requirements we expect each -Compute Block to have. +You can either set "global" Settings (for the entrypoint), by using the `envs` block. +Or you can set "input/output-related" Settings by using the `config` block in each input/output. + +## Basic Usage of the SDK + +```python3 +from scystream.sdk.core import entrypoint +from scystream.sdk.scheduler import Scheduler + + +@entrypoint() +def example_task(): + print("Executing example_task...") + + +@entrypoint() +def another_task(task_name): + print(f"Executing another_task with task name: {task_name}") + + +def main(): + Scheduler.list_entrypoints() + Scheduler.execute_function("example_task") + Scheduler.execute_function("another_task", "ScheduledTask") + + +if __name__ == "__main__": + main() + +``` + +## Defining Settings and Using them. + +Earlier, we already wrote about **Settings**. +Each Input & Output can be configured using these settings. +There are also Global Settings, refered to as `envs` in the `cbc.yaml` + +Below you can find a simple example of how we define & validate these settings. +Therefore you should use the `EnvSettings` class. 
+ +```python3 +from scystream.sdk.core import entrypoint +from scystream.sdk.env.settings import EnvSettings + +class TextDataInputSettings(EnvSettings): + TXT_SRC_PATH: str # no default provided, manual setting is a MUST + +class DBDataInputSettings(EnvSettings): + DATA_TABLE_NAME: str = "nlp_information" + DB_HOST: str = "time.rwth-aachen.de" + DB_PORT: str = 1234 + +class TopicModellingEntrypointSettings(EnvSettings): + LANGUAGE: str = "de" + + text_data: TextDataInputSettings + db_data: DBDataInputSettings + +@entrypoint(TopicModellingEntrypointSettings) # Pass it to the Entrypoint +def topic_modelling(settings): # The settings param is automatically injected to your function, you can use it + print(f"Running topic modelling, using file: {settings.text_data.TXT_SRC_PATH}") + +@entrypoint() +def test_entrypint(): + print("This entrypoint does not have any configs.") +``` + +Of course, you will also be able to use your settings in other files/directories. +For that, just import your desired setting and use the `get_settings()` function. +It will load the configurations correctly. -To understand the concept of such a Compute Block even more, take a look at the -config below. +## Compute Block Config We expect every repository which will be used within the scystream application to contain a **Compute Block Config File**, the `cbc.yaml`, within the root directory. @@ -84,101 +155,53 @@ entrypoints: CSV_OUTPUT_PATH: "outputs/statistics.csv" ``` -For now, you have to write this config file on your own. However, at some -point you will be able to generate this config from your code. 
+### Generating a config -To read and validate such a config file you can proceed as follows: +After writing the functionality of your ComputeBlock (see more below) you can generate +the corresponding `cbc.yaml` by using the following function: ```python3 -from scystream.sdk.config.config_loader import load_config +from scystream.sdk.config.config_loader import generate_config_from_compute_block +from scystream.sdk.config.compute_block_utils import get_compute_block +from pathlib import Path -def main(): - load_config() +@entrypoint() +def example_entrypoint(): + print("Example...") if __name__ == "__main__": - main() + compute_block = get_compute_block() + generate_config_from_compute_block(cb, Path("cbc.yaml")) ``` -If you want the file to have another name than `cbc.yaml` or you want the file to be -somewhere else than the root directory you can define that using the parameters the -`load_config` function takes. +This will take all the entrypoints, their defined settings, and generate a config from them. -Example: +> [!NOTE] +> Make sure to edit the generated config by your user-defined metadata +> (e.g. author, description, docker_image, ...) -```python3 -load_config(config_file_name="test.yaml", config_path="configs/") -``` +### Validating a config -the `config_path` is the path relative to your root directory +Of course, you can also write the config completely on your own. -## Basic Usage of the SDK +> [!NOTE] +> When using `Scheduler.execute_function("entrypoint")` the Settings for the +> entrypoint and the config will be validated. +> If the Settings do not correspond to the definition in the yaml, execution will not be possible. 
-```python3 -from scystream.sdk.core import entrypoint -from scystream.sdk.scheduler import Scheduler +To validate the config, you can also use a helper function like this: +```python3 +from scystream.sdk.config.config_loader import validate_config_with_code @entrypoint() -def example_task(): - print("Executing example_task...") - - -@entrypoint() -def another_task(task_name): - print(f"Executing another_task with task name: {task_name}") - - -def main(): - Scheduler.list_entrypoints() - Scheduler.execute_function("example_task") - Scheduler.execute_function("another_task", "ScheduledTask") - +def example_entrypoint(): + print("Example...") if __name__ == "__main__": - main() - + validate_config_with_code() ``` -## Defining Settings and Using them. - -Earlier, we already wrote about **Settings**. -Each Input & Output can be configured using these settings. -There are also Global Settings, refered to as `envs` in the `cbc.yaml` - -Below you can find a simple example of how we define & validate these settings. -Therefore you should use the `EnvSettings` class. 
- -```python3 -from scystream.sdk.core import entrypoint -from scystream.sdk.env.settings import EnvSettings - -class TextDataInputSettings(EnvSettings): - TXT_SRC_PATH: str # no default provided, manual setting is a MUST - -class DBDataInputSettings(EnvSettings): - DATA_TABLE_NAME: str = "nlp_information" - DB_HOST: str = "time.rwth-aachen.de" - DB_PORT: str = 1234 - -class TopicModellingEntrypointSettings(EnvSettings): - LANGUAGE: str = "de" - - text_data: TextDataInputSettings - db_data: DBDataInputSettings - -@entrypoint(TopicModellingEntrypointSettings) # Pass it to the Entrypoint -def topic_modelling(settings): # The settings param is automatically injected to your function, you can use it - print(f"Running topic modelling, using file: {settings.text_data.TXT_SRC_PATH}") - -@entrypoint() -def test_entrypint(): - print("This entrypoint does not have any configs.") -``` - -Of course, you will also be able to use your settings in other files/directories. -For that, just import your desired setting and use the `get_settings()` function. -It will load the configurations correctly. 
- ## Development of the SDK ### Installation diff --git a/scystream/sdk/config/compute_block_utils.py b/scystream/sdk/config/compute_block_utils.py index 6c6f8bb..cd1ebbe 100644 --- a/scystream/sdk/config/compute_block_utils.py +++ b/scystream/sdk/config/compute_block_utils.py @@ -1,10 +1,10 @@ +from typing import Union from pydantic_core import PydanticUndefinedType from scystream.sdk.config.models import ComputeBlock, Entrypoint, \ InputOutputModel from scystream.sdk.env.settings import InputSettings, \ OutputSettings -from typing import Union -from scystream.sdk.core import get_registered_functions +from scystream.sdk.config.entrypoints import get_registered_functions def _get_pydantic_default_value_or_none(value): @@ -58,9 +58,9 @@ def get_compute_block() -> ComputeBlock: entrypoints[entrypoint] = Entrypoint( description="", - envs=envs, - inputs=inputs, - outputs=outputs + envs=envs if envs != {} else None, + inputs=inputs if inputs != {} else None, + outputs=outputs if outputs != {} else None ) return ComputeBlock( diff --git a/scystream/sdk/config/config_loader.py b/scystream/sdk/config/config_loader.py index 49bdc97..a50aef5 100644 --- a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -2,47 +2,58 @@ from typing import Union from pydantic import ValidationError from pathlib import Path -from .models import ComputeBlock +from scystream.sdk.config.models import ComputeBlock, Entrypoint, \ + InputOutputModel from scystream.sdk.config.compute_block_utils import get_compute_block CONFIG_FILE_DEFAULT_NAME = "cbc.yaml" -def _remove_empty_dicts(data): +def _compare_configs( + config_from_yaml: Union[ComputeBlock, Entrypoint, InputOutputModel], + config_from_code: Union[ComputeBlock, Entrypoint, InputOutputModel], + name="block" +): """ - Remove keys with empty dictionaries from a nested structure. + Compares two configurations and raises a ValueError if they don't match. 
""" - if isinstance(data, dict): - return {k: _remove_empty_dicts(v) for k, v in data.items() if v != {}} - elif isinstance(data, list): - return [_remove_empty_dicts(i) for i in data] + if config_from_yaml != config_from_code: + raise ValueError( + f"The {name} configs (envs, inputs, outputs) defined\ + in your config YAML do not match the settings defined\ + in your code." + ) + + +def validate_config_with_code( + config_file_name: str = CONFIG_FILE_DEFAULT_NAME, + config_path: Union[str, Path] = None, + entrypoint_name: str = None +): + block_from_cfg = load_config(config_file_name, config_path) + block_from_code = get_compute_block() + + if entrypoint_name: + _compare_configs( + block_from_cfg.entrypoints[entrypoint_name], + block_from_code.entrypoints[entrypoint_name] + ) else: - return data + _compare_configs(block_from_cfg, block_from_code) -def load_and_validate_config( +def load_config( config_file_name: str = CONFIG_FILE_DEFAULT_NAME, config_path: Union[str, Path] = None, ) -> ComputeBlock: """ - Returns and Validates the Compute Block YAML definition. - Returns a ComputeBlock instance if the validation is successfull + Returns the Compute Block defined by the passed yaml. + Returns a ComputeBlock instance if the syntax-validation is successfull """ try: file = _find_and_load_config(config_file_name, config_path) block_from_cfg = ComputeBlock(**file) - block_from_code = get_compute_block() - - if ( - block_from_cfg != block_from_code - ): - # check the total config - raise ValueError( - "The entrypoint configs (envs, inputs, outputs) defined in " - "your config yaml do not correspond with the entrypoint " - "settings defined in your code." 
- ) - return block_from_code + return block_from_cfg except ValidationError as e: raise ValueError(f"Configuration file validation error: {e}") @@ -51,9 +62,8 @@ def generate_config_from_compute_block( compute_block: ComputeBlock, output_path: Path ): - cleaned_data = _remove_empty_dicts(compute_block.dict()) with output_path.open("w") as file: - yaml.dump(cleaned_data, file, default_flow_style=False) + yaml.dump(compute_block.dict(), file, default_flow_style=False) def _find_and_load_config( diff --git a/scystream/sdk/config/entrypoints.py b/scystream/sdk/config/entrypoints.py new file mode 100644 index 0000000..ee48825 --- /dev/null +++ b/scystream/sdk/config/entrypoints.py @@ -0,0 +1,12 @@ +_registered_functions = {} + + +def register_entrypoint(func_name, func, settings_class): + _registered_functions[func_name] = { + "function": func, + "settings": settings_class + } + + +def get_registered_functions(): + return _registered_functions diff --git a/scystream/sdk/core.py b/scystream/sdk/core.py index c514d3b..abae394 100644 --- a/scystream/sdk/core.py +++ b/scystream/sdk/core.py @@ -1,9 +1,8 @@ import functools from typing import Callable, Type, Optional -from scystream.sdk.env.settings import EnvSettings from pydantic import ValidationError - -_registered_functions = {} +from scystream.sdk.config.entrypoints import register_entrypoint +from scystream.sdk.env.settings import EnvSettings def entrypoint(settings_class: Optional[Type[EnvSettings]] = None): @@ -15,8 +14,7 @@ def decorator(func: Callable): @functools.wraps(func) def wrapper(*args, **kwargs): if settings_class is not None: - # TODO: validate the entrypoint settings with the config yaml - + # Load the settings try: # load the settings settings = settings_class.get_settings() @@ -28,14 +26,6 @@ def wrapper(*args, **kwargs): else: return func(*args, **kwargs) - _registered_functions[func.__name__] = { - "function": wrapper, - "settings": settings_class - } + register_entrypoint(func.__name__, wrapper, 
settings_class) return wrapper return decorator - - -def get_registered_functions(): - """Returns a dictionary of registered entrypoint functions.""" - return _registered_functions diff --git a/scystream/sdk/scheduler.py b/scystream/sdk/scheduler.py index c610897..5549348 100644 --- a/scystream/sdk/scheduler.py +++ b/scystream/sdk/scheduler.py @@ -1,4 +1,5 @@ -from .core import get_registered_functions +from scystream.sdk.config.entrypoints import get_registered_functions +from scystream.sdk.config.config_loader import validate_config_with_code class Scheduler: @@ -11,6 +12,12 @@ def list_entrypoints(): @staticmethod def execute_function(name, *args, **kwargs): + """ + Validate the in code defined entrypoints + with the settings defined in the cfg file + """ + validate_config_with_code(entrypoint_name=name) + functions = get_registered_functions() if name in functions: return functions[name]["function"](*args, **kwargs) From 1799e7d76c0a85feaca58419921c98af2478a283 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Tue, 12 Nov 2024 19:27:26 +0100 Subject: [PATCH 21/22] tests: add and refactor tests --- scystream/sdk/config/config_loader.py | 47 +++-- scystream/sdk/config/entrypoints.py | 4 + tests/test_config.py | 28 +-- tests/test_core.py | 7 +- tests/test_setting_files/simple_cfg.yaml | 21 +++ .../simple_cfg_entrypoint_inv.yaml | 22 +++ .../simple_cfg_entrypoint_v.yaml | 35 ++++ .../simple_cfg_invalid.yaml | 25 +++ .../without_default_settings.yaml | 15 ++ tests/test_settings.py | 176 ++++++++++-------- 10 files changed, 278 insertions(+), 102 deletions(-) create mode 100644 tests/test_setting_files/simple_cfg.yaml create mode 100644 tests/test_setting_files/simple_cfg_entrypoint_inv.yaml create mode 100644 tests/test_setting_files/simple_cfg_entrypoint_v.yaml create mode 100644 tests/test_setting_files/simple_cfg_invalid.yaml create mode 100644 tests/test_setting_files/without_default_settings.yaml diff --git a/scystream/sdk/config/config_loader.py 
b/scystream/sdk/config/config_loader.py index a50aef5..d09213e 100644 --- a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -9,6 +9,31 @@ CONFIG_FILE_DEFAULT_NAME = "cbc.yaml" +class SDKConfig: + """ + This is a singleton class that holds the configuration of + the sdk. + For now, it only holds the config_path which points to + the cbc.yaml. + """ + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super(SDKConfig, cls).__new__(cls) + cls._instance.config_path = CONFIG_FILE_DEFAULT_NAME + return cls._instance + + def set_config_path(self, config_path: str): + self.config_path = config_path + + def get_config_path(self) -> str: + return self.config_path + + +global_config = SDKConfig() + + def _compare_configs( config_from_yaml: Union[ComputeBlock, Entrypoint, InputOutputModel], config_from_code: Union[ComputeBlock, Entrypoint, InputOutputModel], @@ -26,11 +51,9 @@ def _compare_configs( def validate_config_with_code( - config_file_name: str = CONFIG_FILE_DEFAULT_NAME, - config_path: Union[str, Path] = None, entrypoint_name: str = None ): - block_from_cfg = load_config(config_file_name, config_path) + block_from_cfg = load_config() block_from_code = get_compute_block() if entrypoint_name: @@ -42,16 +65,13 @@ def validate_config_with_code( _compare_configs(block_from_cfg, block_from_code) -def load_config( - config_file_name: str = CONFIG_FILE_DEFAULT_NAME, - config_path: Union[str, Path] = None, -) -> ComputeBlock: +def load_config() -> ComputeBlock: """ Returns the Compute Block defined by the passed yaml. 
Returns a ComputeBlock instance if the syntax-validation is successfull """ try: - file = _find_and_load_config(config_file_name, config_path) + file = _find_and_load_config() block_from_cfg = ComputeBlock(**file) return block_from_cfg except ValidationError as e: @@ -66,19 +86,14 @@ def generate_config_from_compute_block( yaml.dump(compute_block.dict(), file, default_flow_style=False) -def _find_and_load_config( - config_file_name: str, - config_path: Union[str, Path] = None -): +def _find_and_load_config(): """ Loads the compute block config YAML from the projects root directory returns the loaded file """ - base_path = Path.cwd() - if config_path: - base_path /= Path(config_path) + config_path = global_config.get_config_path() - full_path = base_path / config_file_name + full_path = Path.cwd() / config_path if not full_path.is_file(): raise FileNotFoundError( diff --git a/scystream/sdk/config/entrypoints.py b/scystream/sdk/config/entrypoints.py index ee48825..8b285c5 100644 --- a/scystream/sdk/config/entrypoints.py +++ b/scystream/sdk/config/entrypoints.py @@ -10,3 +10,7 @@ def register_entrypoint(func_name, func, settings_class): def get_registered_functions(): return _registered_functions + + +def TEST_reset_registered_functions(): + _registered_functions.clear() diff --git a/tests/test_config.py b/tests/test_config.py index 3426024..e50f3be 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import unittest -from scystream.sdk.config.config_loader import load_and_validate_config, \ - ComputeBlock +from scystream.sdk.config.config_loader import load_config, \ + ComputeBlock, global_config class TestComputeBlockValidation(unittest.TestCase): @@ -8,30 +8,36 @@ class TestComputeBlockValidation(unittest.TestCase): def test_valid_config(self): try: - compute_block = load_and_validate_config( - "valid_config.yaml", config_path=self.TEST_CONFIG_FOLDER) + global_config.set_config_path( + f"{self.TEST_CONFIG_FOLDER}/valid_config.yaml") + 
compute_block = load_config() self.assertIsInstance(compute_block, ComputeBlock) except Exception: self.fail("ComputeBlock raised an Exception unexpectedly!") def test_missing_entrypoints(self): + global_config.set_config_path( + f"{self.TEST_CONFIG_FOLDER}/missing_entrypoints.yaml") with self.assertRaises(ValueError): - load_and_validate_config("missing_entrypoints.yaml", - config_path=self.TEST_CONFIG_FOLDER) + load_config() def test_invalid_datatypes(self): + global_config.set_config_path( + f"{self.TEST_CONFIG_FOLDER}/invalid_datatype.yaml") with self.assertRaises(ValueError): - load_and_validate_config("invalid_datatype.yaml", - config_path=self.TEST_CONFIG_FOLDER) + load_config() def test_not_a_yaml(self): + global_config.set_config_path( + f"{self.TEST_CONFIG_FOLDER}/not_a_yaml.json") with self.assertRaises(ValueError): - load_and_validate_config("not_a_yaml.json", - config_path=self.TEST_CONFIG_FOLDER) + load_config() def test_file_not_found(self): + global_config.set_config_path( + f"{self.TEST_CONFIG_FOLDER}/testyamll") with self.assertRaises(FileNotFoundError): - load_and_validate_config("test.yaml") + load_config() if __name__ == "__main__": diff --git a/tests/test_core.py b/tests/test_core.py index 27107ed..921164b 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,8 +1,13 @@ import unittest -from scystream.sdk.core import entrypoint, get_registered_functions +from scystream.sdk.core import entrypoint +from scystream.sdk.config.entrypoints import get_registered_functions +from scystream.sdk.config.entrypoints import TEST_reset_registered_functions class TestEntrypoint(unittest.TestCase): + def tearDown(self): + TEST_reset_registered_functions() + def test_entrypoint_registration(self): @entrypoint() def dummy_function(): diff --git a/tests/test_setting_files/simple_cfg.yaml b/tests/test_setting_files/simple_cfg.yaml new file mode 100644 index 0000000..6db4ded --- /dev/null +++ b/tests/test_setting_files/simple_cfg.yaml @@ -0,0 +1,21 @@ 
+name: +author: +description: +docker_image: +entrypoints: + example_entrypoint: + description: + envs: + LANGUAGE: de + inputs: + input_one: + config: + TEST: test + description: + type: 'TODO: SetType' + outputs: + output_one: + config: + OUT: out + description: + type: 'TODO: SetType' diff --git a/tests/test_setting_files/simple_cfg_entrypoint_inv.yaml b/tests/test_setting_files/simple_cfg_entrypoint_inv.yaml new file mode 100644 index 0000000..485b8aa --- /dev/null +++ b/tests/test_setting_files/simple_cfg_entrypoint_inv.yaml @@ -0,0 +1,22 @@ +name: +author: +description: +docker_image: +entrypoints: + example_entrypoint: + description: + envs: + LANGUAGE: de + inputs: + input_one: + config: + TEST: test + ADDITIONAL: tesing # SHOULD FAIL BECAUSE NOT IN SETTINGS CLASS + description: + type: 'TODO: SetType' + outputs: + output_one: + config: + OUT: out + description: + type: 'TODO: SetType' diff --git a/tests/test_setting_files/simple_cfg_entrypoint_v.yaml b/tests/test_setting_files/simple_cfg_entrypoint_v.yaml new file mode 100644 index 0000000..342b784 --- /dev/null +++ b/tests/test_setting_files/simple_cfg_entrypoint_v.yaml @@ -0,0 +1,35 @@ +name: +author: +description: +docker_image: +entrypoints: + example_entrypoint: + description: + envs: + LANGUAGE: de + inputs: + input_one: + config: + TEST: test + description: + type: 'TODO: SetType' + outputs: + output_one: + config: + OUT: out + description: + type: 'TODO: SetType' + test_entryping: + # This entrypoint is not defined in the Settings passed to the code; + # however, as example_entrypoint is being called in the test, + # this should not fail + description: + envs: + TEST: null + ONE: test + inputs: + test_inp: + config: + TESTER: test + description: + type: 'TODO: SetType' diff --git a/tests/test_setting_files/simple_cfg_invalid.yaml b/tests/test_setting_files/simple_cfg_invalid.yaml new file mode 100644 index 0000000..3fc49ff --- /dev/null +++ b/tests/test_setting_files/simple_cfg_invalid.yaml @@ -0,0 
+1,25 @@ +name: +author: +description: +docker_image: +entrypoints: + example_entrypoint: + description: + envs: + LANGUAGE: de + inputs: + input_one: + config: + TEST: test + description: + type: 'TODO: SetType' + outputs: + output_one: + config: + OUT: out + description: + type: 'TODO: SetType' + again_entrypoint: + description: + envs: + HI: null diff --git a/tests/test_setting_files/without_default_settings.yaml b/tests/test_setting_files/without_default_settings.yaml new file mode 100644 index 0000000..e688d90 --- /dev/null +++ b/tests/test_setting_files/without_default_settings.yaml @@ -0,0 +1,15 @@ +name: +author: +description: +docker_image: +entrypoints: + without_def_settings: + description: + envs: + LANGUAGE: null + inputs: + input_one: + config: + TEST: null + description: + type: 'TODO: SetType' diff --git a/tests/test_settings.py b/tests/test_settings.py index b6144d2..640d90f 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -1,135 +1,163 @@ import unittest import os from scystream.sdk.core import entrypoint -from scystream.sdk.env.settings import EnvSettings +from scystream.sdk.env.settings import EnvSettings, InputSettings, \ + OutputSettings from scystream.sdk.scheduler import Scheduler +from scystream.sdk.config.config_loader import global_config +from scystream.sdk.config.config_loader import validate_config_with_code +from scystream.sdk.config.entrypoints import TEST_reset_registered_functions +# Validate Cfgs -class DummyInputSettings(EnvSettings): - DUMMY_INPUT: str = "test" +class SimpleSettingsInputOne(InputSettings): + TEST: str = "test" -class WithDefaultSettings(EnvSettings): - DUMMY_GLOBAL: str = "dummy global var" - dummy_input_settings: DummyInputSettings +class SimpleSettingsOutputOne(OutputSettings): + OUT: str = "out" -class DummyInputSettingsNoDef(EnvSettings): - DUMMY_INPUT: str +class SimpleSettings(EnvSettings): + LANGUAGE: str = "de" + input_one: SimpleSettingsInputOne + output_one: SimpleSettingsOutputOne 
-class WithoutDefaultSettings(EnvSettings): - DUMMY_GLOBAL: str +# WithoutDefaults - dummy_input_settings_no_def: DummyInputSettingsNoDef +class WithoutDefaultsInputOne(InputSettings): + TEST: str -class WithoutDefaultNoNesting(EnvSettings): - TEST: str = "teststr" - MUST_SET: str +class WithoutDefaults(EnvSettings): + LANGUAGE: str # MUST BE SET -class SubOne(EnvSettings): - ONE: str - TWO: str + input_one: WithoutDefaultsInputOne -class SubTwo(EnvSettings): - TEST: str - NO_DEF: str +class TestSettings(unittest.TestCase): + TEST_SETTINGS_FILES = "tests/test_setting_files/" + def tearDown(self): + TEST_reset_registered_functions() -class TwoSubclasses(EnvSettings): - GLOBAL: str + def test_entrypoint_yaml_cfg_different_to_code_cfg(self): + # Tests if the passed settings to entrypoint config is different + # to the one in yaml + @entrypoint(SimpleSettings) + def example_entrypoint(settings): + print("Running example_entrypoint...") - input_one: SubOne - input_two: SubTwo + global_config.set_config_path( + f"{self.TEST_SETTINGS_FILES}/simple_cfg_entrypoint_inv.yaml" + ) + with self.assertRaises(ValueError): + Scheduler.execute_function("example_entrypoint") + + def test_entrypoint_yaml_cfg_not_different_to_code_cfg(self): + # Tests if the passed settings to entrypoint config is different + # to the one in yaml + # HINT: TOTAL CONFIG does not fit, only the entrypoint ones fits + @entrypoint(SimpleSettings) + def example_entrypoint(settings): + print("Running example_entrypoint...") + + global_config.set_config_path( + f"{self.TEST_SETTINGS_FILES}/simple_cfg_entrypoint_v.yaml" + ) + + try: + Scheduler.execute_function("example_entrypoint") + except Exception: + self.fail("") + + def test_validate_cfgs_no_error(self): + # Tests if validate_config_with_code works if config and settings + # correspond + @entrypoint(SimpleSettings) + def example_entrypoint(settings): + print(f"{settings}....") + + global_config.set_config_path( + 
f"{self.TEST_SETTINGS_FILES}/simple_cfg.yaml") + + try: + validate_config_with_code() + except Exception: + self.fail( + "validate_config_with_code raised an Exception unexpectedly!") + + def test_validate_cfgs_error(self): + # Tests if validate_config_with_code works if config and settings + # do not correspond + @entrypoint(SimpleSettings) + def example_entrypoint(settings): + print(f"{settings}....") + + global_config.set_config_path( + f"{self.TEST_SETTINGS_FILES}/simple_cfg_invalid.yaml") + + with self.assertRaises(ValueError): + validate_config_with_code() -class TestSettings(unittest.TestCase): def test_entrypoint_with_setting_default(self): - @entrypoint(WithDefaultSettings) + # Tests if defaults and overriding defaults with ENvs works + # We use SimpleSettings as they all have a default + @entrypoint(SimpleSettings) def with_default_settings(settings): - return settings.dummy_input_settings.DUMMY_INPUT + return settings.input_one.TEST + + global_config.set_config_path( + f"{self.TEST_SETTINGS_FILES}/simple_cfg.yaml") result = with_default_settings() self.assertEqual(result, "test") # set environ - os.environ["DUMMY_INPUT"] = "overridden setting" + os.environ["TEST"] = "overridden setting" result = with_default_settings() # check if overriding works self.assertEqual(result, "overridden setting") - del os.environ["DUMMY_INPUT"] + del os.environ["TEST"] def test_entrypoint_no_setting_default_one(self): - @entrypoint(WithoutDefaultSettings) + # Tests if fails, if ENVs that MUST be set, are not set + @entrypoint(WithoutDefaults) def without_def_settings(settings): print("test...") + global_config.set_config_path( + f"{self.TEST_SETTINGS_FILES}/without_default_settings.yaml") + # do we fail if environments not set with self.assertRaises(ValueError): Scheduler.execute_function("without_def_settings") def test_entrypoint_no_setting_default_two(self): - @entrypoint(WithoutDefaultSettings) + # Tests if it works, if ENVs that MUST be set, are actually set + 
@entrypoint(WithoutDefaults) def without_def_settings(settings): return ( - settings.DUMMY_GLOBAL, - settings.dummy_input_settings_no_def.DUMMY_INPUT + settings.LANGUAGE, + settings.input_one.TEST ) # set environments - os.environ["DUMMY_GLOBAL"] = "dummy global" - os.environ["DUMMY_INPUT"] = "dummy input" + os.environ["LANGUAGE"] = "dummy global" + os.environ["TEST"] = "dummy input" # check if environments have been set result = without_def_settings() self.assertEqual(result[0], "dummy global") self.assertEqual(result[1], "dummy input") - del os.environ["DUMMY_GLOBAL"] - del os.environ["DUMMY_INPUT"] - - def test_entrypoint_no_setting_defautl_three(self): - @entrypoint(WithoutDefaultNoNesting) - def no_nesting(settings): - print("testing...") - - with self.assertRaises(ValueError): - Scheduler.execute_function("no_nesting") - - def test_two_subs(self): - @entrypoint(TwoSubclasses) - def two_subs(settings): - return ( - settings.GLOBAL, - settings.input_one.ONE, - settings.input_one.TWO, - settings.input_two.TEST, - settings.input_two.NO_DEF - ) - - os.environ["GLOBAL"] = "global" - os.environ["ONE"] = "one" - os.environ["TWO"] = "two" - os.environ["TEST"] = "test" - os.environ["NO_DEF"] = "no_def" - - result = two_subs() - self.assertEqual(result[0], "global") - self.assertEqual(result[1], "one") - self.assertEqual(result[2], "two") - self.assertEqual(result[3], "test") - self.assertEqual(result[4], "no_def") - - del os.environ["GLOBAL"] - del os.environ["ONE"] - del os.environ["TWO"] + del os.environ["LANGUAGE"] del os.environ["TEST"] - del os.environ["NO_DEF"] if __name__ == "__main__": From e9eb8d67e529149ca8254d97cfba60e26698e1e1 Mon Sep 17 00:00:00 2001 From: PaulKalho Date: Tue, 12 Nov 2024 19:54:05 +0100 Subject: [PATCH 22/22] feat: export important functions directly --- README.md | 16 +++++++++--- scystream/sdk/config/__init__.py | 6 +++++ scystream/sdk/config/config_loader.py | 2 +- scystream/sdk/scheduler.py | 2 +- tests/test_config.py | 4 +-- 
tests/test_setting_files/ref.yaml | 21 ++++++++++++++++ tests/test_settings.py | 36 +++++++++++++++++++++++++-- 7 files changed, 78 insertions(+), 9 deletions(-) create mode 100644 tests/test_setting_files/ref.yaml diff --git a/README.md b/README.md index 7bb036c..c989026 100644 --- a/README.md +++ b/README.md @@ -161,8 +161,7 @@ After writing the functionality of your ComputeBlock (see more below) you can ge the corresponding `cbc.yaml` by using the following function: ```python3 -from scystream.sdk.config.config_loader import generate_config_from_compute_block -from scystream.sdk.config.compute_block_utils import get_compute_block +from scystream.sdk.config import generate_config_from_compute_block, get_compute_block from pathlib import Path @entrypoint() @@ -182,6 +181,17 @@ This will take all the entrypoints, their defined settings, and generate a confi ### Validating a config +If you want your `cbc.yaml` to be located in a different directory or have a different name, you +have to configure that accordingly: + +```python3 +from scystream.sdk.config import global_config + +if __name__ == "__main__": + # Set the config_path + global_config.set_config_path("custom_dir/custom_name.yaml") +``` + Of course, you can also write the config completely on your own. > [!NOTE] @@ -192,7 +202,7 @@ Of course, you can also write the config completely on your own. 
To validate the config, you can also use a helper function like this: ```python3 -from scystream.sdk.config.config_loader import validate_config_with_code +from scystream.sdk.config import validate_config_with_code @entrypoint() def example_entrypoint(): diff --git a/scystream/sdk/config/__init__.py b/scystream/sdk/config/__init__.py index e69de29..94120e7 100644 --- a/scystream/sdk/config/__init__.py +++ b/scystream/sdk/config/__init__.py @@ -0,0 +1,6 @@ +from .config_loader import global_config, \ + validate_config_with_code, load_config +from .compute_block_utils import get_compute_block + +__all__ = ["global_config", "validate_config_with_code", + "load_config", "EnvSettings", "get_compute_block"] diff --git a/scystream/sdk/config/config_loader.py b/scystream/sdk/config/config_loader.py index d09213e..ccaf120 100644 --- a/scystream/sdk/config/config_loader.py +++ b/scystream/sdk/config/config_loader.py @@ -83,7 +83,7 @@ def generate_config_from_compute_block( output_path: Path ): with output_path.open("w") as file: - yaml.dump(compute_block.dict(), file, default_flow_style=False) + yaml.dump(compute_block.model_dump(), file, default_flow_style=False) def _find_and_load_config(): diff --git a/scystream/sdk/scheduler.py b/scystream/sdk/scheduler.py index 5549348..10e6bb0 100644 --- a/scystream/sdk/scheduler.py +++ b/scystream/sdk/scheduler.py @@ -1,5 +1,5 @@ from scystream.sdk.config.entrypoints import get_registered_functions -from scystream.sdk.config.config_loader import validate_config_with_code +from scystream.sdk.config import validate_config_with_code class Scheduler: diff --git a/tests/test_config.py b/tests/test_config.py index e50f3be..fa1b28c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import unittest -from scystream.sdk.config.config_loader import load_config, \ - ComputeBlock, global_config +from scystream.sdk.config import global_config, load_config +from scystream.sdk.config.models import ComputeBlock class 
TestComputeBlockValidation(unittest.TestCase): diff --git a/tests/test_setting_files/ref.yaml b/tests/test_setting_files/ref.yaml new file mode 100644 index 0000000..b722477 --- /dev/null +++ b/tests/test_setting_files/ref.yaml @@ -0,0 +1,21 @@ +author: +description: +docker_image: +entrypoints: + example_entrypoint: + description: + envs: + LANGUAGE: de + inputs: + input_one: + config: + TEST: test + description: + type: 'TODO: SetType' + outputs: + output_one: + config: + OUT: out + description: + type: 'TODO: SetType' +name: diff --git a/tests/test_settings.py b/tests/test_settings.py index 640d90f..27d9fef 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -4,9 +4,12 @@ from scystream.sdk.env.settings import EnvSettings, InputSettings, \ OutputSettings from scystream.sdk.scheduler import Scheduler -from scystream.sdk.config.config_loader import global_config -from scystream.sdk.config.config_loader import validate_config_with_code +from scystream.sdk.config.config_loader import global_config, \ + validate_config_with_code, get_compute_block, \ + generate_config_from_compute_block from scystream.sdk.config.entrypoints import TEST_reset_registered_functions +from pathlib import Path +import yaml # Validate Cfgs @@ -44,6 +47,35 @@ class TestSettings(unittest.TestCase): def tearDown(self): TEST_reset_registered_functions() + def test_generate_config_from_code(self): + generated_config_path = Path(f"{self.TEST_SETTINGS_FILES}/gen.yaml") + reference_config_path = Path(f"{self.TEST_SETTINGS_FILES}/ref.yaml") + + @entrypoint(SimpleSettings) + def example_entrypoint(settings): + print("Running...") + + try: + cb = get_compute_block() + generate_config_from_compute_block( + cb, generated_config_path) + except Exception as e: + self.fail(f"Exception raised unexpectedly: {e}") + + with generated_config_path.open("r") as gen_file: + generated_yaml = yaml.safe_load(gen_file) + + with reference_config_path.open("r") as ref_file: + reference_yaml = 
yaml.safe_load(ref_file) + + # Compare the contents + self.assertEqual( + generated_yaml, reference_yaml, + "Generated YAML does not match the reference YAML" + ) + + generated_config_path.unlink() + def test_entrypoint_yaml_cfg_different_to_code_cfg(self): # Tests if the passed settings to entrypoint config is different # to the one in yaml