Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add compute block config validation #5

Merged
merged 23 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ __pycache__/
dist/
build/
venv/
.venv/
116 changes: 116 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,119 @@ if __name__ == "__main__":
main()

```

### Compute Block Config Files
We expect every repository which will be used within the scystream application
to contain a `Compute Block Config File`, the `cbc.yaml`, within the root directory.

This yaml-file describes the compute block itself.
It shows the entrypoints, their inputs and outputs.

This is an example `cbc.yaml`:

```yaml
name: "NLP toolbox"
description: "Contains NLP algorithms..."
author: "John Doe"
docker_image: "https://ghcr.io/nlp-toolbox"

entrypoints:
topic_modelling:
description: "Run topic modelling"
envs:
LANGUAGE: "de"
inputs:
text_data:
description: "Text file. Can be uploaded by the user."
type: "file"
config:
TXT_SRC_PATH: null
db_data:
description: "Information in a database"
type: "db_table"
config:
DATA_TABLE_NAME: "nlp_information"
DB_HOST: "time.rwth-aachen.de"
DB_PORT: 1234
outputs:
topic_model:
type: "file"
description: "Topic model file"
config:
OUTPUT_PATH_TOPIC_MODEL: null
run_durations:
type: "db_table"
description: "Table that contains the run durations per day."
config:
RUN_DURATIONS_TABLE_NAME: "run_durations_nlp"

analyze_runtime:
description: "Analyze the runtimes"
inputs:
run_durations:
description: "Table that contains all runtimes and dates"
type: "db_table"
config:
RUN_DURATIONS_TABLE_NAME: "run_durations_nlp"
outputs:
csv_output:
type: "file"
description: "A csv containing statistical information"
config:
CSV_OUTPUT_PATH: "outputs/statistics.csv"
```

To read and validate such a config file, you can proceed as follows:

```python3
from scystream.sdk.config.config_loader import load_config

def main():
load_config()

if __name__ == "__main__":
main()
```

If you want the file to have a name other than `cbc.yaml`, or to live somewhere
other than the root directory, pass the corresponding parameters to the
`load_config` function.

Example:

```python3
load_config(config_file_name="test.yaml", config_path="configs/")
```

The `config_path` is interpreted relative to your project's root directory (the current working directory of the running program).


## Development of the SDK

### Installation

1. Create a venv

```bash
python3 -m venv .venv
```

2. Install the package within the venv

> [!NOTE]
> This will also install all the `install_requires` dependencies listed in `setup.py`.

```bash
pip install -e .
```

3. Develop!

### Tests

To run all the tests run the following command:

```bash
python3 -m unittest discover -s tests
```

Empty file.
55 changes: 55 additions & 0 deletions scystream/sdk/config/config_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import yaml
from typing import Union
from pydantic import ValidationError
from pathlib import Path
from .models import ComputeBlock

# File name that `load_config` looks for when no explicit name is passed.
CONFIG_FILE_DEFAULT_NAME = "cbc.yaml"


def load_config(
    config_file_name: str = CONFIG_FILE_DEFAULT_NAME,
    config_path: Union[str, Path, None] = None
) -> ComputeBlock:
    """
    Load the Compute Block YAML definition and validate it.

    :param config_file_name: Name of the config file to load.
    :param config_path: Directory that contains the config file, relative to
        the current working directory. ``None`` means the current working
        directory itself.
    :return: A ComputeBlock instance if the validation is successful.
    :raises FileNotFoundError: If the config file does not exist.
    :raises ValueError: If the file is not valid YAML or its content does not
        match the ComputeBlock schema.
    """
    config_data = _find_and_load_config(config_file_name, config_path)

    try:
        # model_validate (instead of ComputeBlock(**config_data)) also covers
        # an empty file (loaded as None) and a top-level list/scalar: those
        # would make the ** unpacking raise a TypeError rather than the
        # documented ValueError.
        return ComputeBlock.model_validate(config_data)
    except ValidationError as e:
        raise ValueError(f"Configuration file validation error: {e}") from e


def _find_and_load_config(
    config_file_name: str,
    config_path: Union[str, Path, None] = None
):
    """
    Locate the compute block config YAML and return its parsed content.

    The file is looked up relative to the current working directory, which
    is expected to be the project's root directory.

    :param config_file_name: Name of the config file.
    :param config_path: Optional directory (relative to the current working
        directory) that contains the file.
    :return: The object produced by ``yaml.safe_load`` for the file.
    :raises FileNotFoundError: If the path does not point to a regular file.
    :raises ValueError: If the file cannot be parsed as YAML.
    """
    base_path = Path.cwd()
    if config_path:
        base_path /= Path(config_path)

    full_path = base_path / config_file_name
    # One message for both not-found paths so callers see consistent output.
    not_found_message = f"Configuration file '{full_path}' not found."

    if not full_path.is_file():
        raise FileNotFoundError(not_found_message)

    try:
        # Explicit encoding: do not depend on the platform's locale default.
        with full_path.open("r", encoding="utf-8") as file:
            return yaml.safe_load(file)
    except FileNotFoundError as e:
        # The file vanished between the is_file() check and open().
        raise FileNotFoundError(not_found_message) from e
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing YAML file: {e}") from e
85 changes: 85 additions & 0 deletions scystream/sdk/config/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from typing import Optional, Dict, Literal, Union
from pydantic import BaseModel, StrictStr, field_validator, Field, \
StrictInt, StrictFloat

# Allowed values for the `type` field of an input/output definition
# (see InputOutputModel.type).
FILE_TYPE_IDENTIFIER = "file"
DB_TABLE_TYPE_IDENTIFIER = "db_table"

"""
This file contains the schema definition for the config file.
"""


class InputOutputModel(BaseModel):
    """
    Represents configuration for inputs or outputs in a ComputeBlock.

    The configuration is defined as a dictionary with key-value pairs, where:
    - The key is the name of an environment variable (e.g., `FILE_PATH`,
      `TABLE_NAME`).
    - The value is the default value for that environment variable, which can
      be a string, integer, or float.

    A value may be explicitly set to `null`; this schema accepts it. Such a
    variable has no default, so the ENV-Variable is expected to be set
    manually by the ComputeBlock user (that check is not part of this model).
    """
    # Kind of input/output; restricted to the two known identifiers.
    type: Literal[FILE_TYPE_IDENTIFIER, DB_TABLE_TYPE_IDENTIFIER]
    description: Optional[StrictStr] = None
    config: Optional[
        Dict[
            StrictStr,
            Optional[Union[StrictStr, StrictInt, StrictFloat]]
        ]
    ] = Field(
        default=None,
        # Implicit string concatenation instead of a backslash continuation,
        # which would embed the source indentation into the description.
        description=(
            "The configuration for the input values "
            "(file_path, table_name, etc.)"
        )
    )


class Entrypoint(BaseModel):
    """
    Represents an entrypoint within a ComputeBlock.

    An entrypoint includes:
    - A description of the entrypoint's purpose.
    - A dictionary of environment variables (`envs`), where each key-value
      pair represents an environment variable and its default value.
        - These variables should be shared variables across the entrypoint
    - Input and output configurations, each described by the
      `InputOutputModel`.

    If an environment variable's value is set to `None` (`null` in YAML) in
    the configuration, the ComputeBlock user must provide that variable
    during runtime, or else the process will fail.
    """
    description: StrictStr
    # Values are Optional so that `null` (no default, must be supplied at
    # runtime) passes validation, as described in the docstring.
    envs: Optional[Dict[StrictStr, Optional[StrictStr]]] = None
    inputs: Dict[StrictStr, InputOutputModel]
    outputs: Dict[StrictStr, InputOutputModel]


class ComputeBlock(BaseModel):
    """
    Represents a ComputeBlock configuration, which describes the compute
    process, including entrypoints, inputs, and outputs.

    A ComputeBlock is defined by:
    - A name, description, and author.
    - One or more entrypoints that specify how data is passed into and out of
      the compute process.
    - Optionally, a Docker image to specify the execution environment.

    At least one entrypoint must be defined for the ComputeBlock to be valid.
    """
    name: StrictStr
    description: StrictStr
    author: StrictStr
    entrypoints: Dict[StrictStr, Entrypoint]
    # Explicit default: in pydantic v2 an Optional[...] annotation without a
    # default is still a *required* field, which contradicts "optionally".
    docker_image: Optional[StrictStr] = None

    @field_validator("entrypoints")
    @classmethod
    def check_entrypoints(cls, v):
        """Reject an empty `entrypoints` mapping."""
        if not v:
            raise ValueError("At least one entrypoint must be defined.")
        return v
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
author_email="[email protected]",
license="MIT",
packages=find_packages(),
install_requires=[],
install_requires=[
"pydantic>=2.9.2",
"PyYAML>=6.0.2"
],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down
37 changes: 37 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import unittest
from scystream.sdk.config.config_loader import load_config, ComputeBlock


class TestComputeBlockValidation(unittest.TestCase):
    """Tests for loading and validating compute block config files."""

    # Resolved by load_config relative to the current working directory,
    # so the tests are expected to be started from the repository root.
    TEST_CONFIG_FOLDER = "tests/test_config_files"

    def test_valid_config(self):
        # No try/except wrapper: an unexpected exception should surface with
        # its own traceback, and a blanket `except Exception` would also
        # swallow the AssertionError raised by assertIsInstance.
        compute_block = load_config(
            "valid_config.yaml", config_path=self.TEST_CONFIG_FOLDER)
        self.assertIsInstance(compute_block, ComputeBlock)

    def test_missing_entrypoints(self):
        with self.assertRaises(ValueError):
            load_config("missing_entrypoints.yaml",
                        config_path=self.TEST_CONFIG_FOLDER)

    def test_invalid_datatypes(self):
        with self.assertRaises(ValueError):
            load_config("invalid_datatype.yaml",
                        config_path=self.TEST_CONFIG_FOLDER)

    def test_not_a_yaml(self):
        with self.assertRaises(ValueError):
            load_config("not_a_yaml.json",
                        config_path=self.TEST_CONFIG_FOLDER)

    def test_file_not_found(self):
        with self.assertRaises(FileNotFoundError):
            load_config("test.yaml")


if __name__ == "__main__":
unittest.main()
41 changes: 41 additions & 0 deletions tests/test_config_files/invalid_datatype.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
name: "NLP toolbox"
description: "Contains NLP algorithms..."
author: "John Doe"
docker_image: "https://ghcr.io/nlp-toolbox"

entrypoints:
topic_modelling:
description: "Run topic modelling"
envs:
LANG: "de"
inputs:
text_data:
description: "Text file. Must be uploaded by the user."
type: "invalid_type"
config:
TXT_SRC_PATH: null
db_data:
description: "Information in a database"
type: "db_table"
config:
DATA_TABLE_NAME: "test_db_table"
outputs:
topic_model:
type: "file"
description: "Topic model file"
config:
OUTPUT_PATH_TOPIC_MODEL: null
run_durations:
type: "db_table"
config:
DURATIONS_TABLE_NAME: "run_durations_table"

analyze_runtime:
description: "Analyze the runtimes"
inputs:
run_durations:
type: "db_table"
outputs:
csv_output:
type: "file"
description: "A csv containing statistical information"
5 changes: 5 additions & 0 deletions tests/test_config_files/missing_entrypoints.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name: "NLP toolbox"
description: "Contains NLP algorithms..."
author: "John Doe"
docker_image: "https://ghcr.io/nlp-toolbox"

Loading
Loading