Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compute default error duration if omitted #91

Merged
merged 2 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,8 @@ For instance, with an `error_tolerance` of 5 and an `update_interval` of 60, `er

A useful guide is to set `error_duration` as `(error_tolerance + 1) * update_interval`, providing an additional buffer equivalent to one update interval.

If `error_tolerance` is set, but `error_duration` is not, the application will set an `error_duration` that is slightly longer than the minimum required to ensure correct error detection.

#### exit_on_error

`exit_on_error` (default: true) determines if the application should terminate, or disable the failing collector when number of errors exceed the tolerance. If set to `true`, the application will exit. Otherwise, the collector will be disabled for `disable_duration` seconds. For backwards compatibility with previous versions of Zabbix-auto-config, this option defaults to `true`. In a future major version, the default will be changed to `false`.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ dependencies = [
]

[project.optional-dependencies]
test = ["pytest>=7.4.3", "pytest-timeout>=2.2.0", "hypothesis>=6.62.1"]
test = ["pytest>=7.4.3", "pytest-timeout>=2.2.0", "hypothesis>=6.62.1", "inline-snapshot>=0.14.0"]

[project.urls]
Source = "https://github.com/unioslo/zabbix-auto-config"
Expand Down
49 changes: 34 additions & 15 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

import pytest
import tomli
from hypothesis import given
from hypothesis import settings
from hypothesis import strategies as st
from inline_snapshot import snapshot
from pydantic import ValidationError

import zabbix_auto_config.models as models
Expand Down Expand Up @@ -69,32 +73,44 @@ def test_sourcecollectorsettings_no_tolerance() -> None:
error_duration=0,
)
assert settings.error_tolerance == 0
# In case the actual implementaiton changes in the future, we don't
# want to test the _exact_ value, but we know it will not be 0
assert settings.error_duration > 0
assert settings.error_duration == snapshot(9999)


def test_sourcecollectorsettings_no_error_duration():
# TODO: check if we can just remove this test
# In order to not have an error_duration, error_tolerance must be 0 too
settings = models.SourceCollectorSettings(
module_name="foo",
update_interval=60,
error_duration=0,
error_tolerance=0,
)
# See docstring in test_sourcecollectorsettings_no_tolerance
assert settings.error_duration > 0
assert settings.error_duration == snapshot(9999)

# With tolerance raises an error
# NOTE: we test the error message in depth in test_sourcecollectorsettings_invalid_error_duration
with pytest.raises(ValidationError):
models.SourceCollectorSettings(
module_name="foo",
update_interval=60,
error_duration=0,
error_tolerance=5,
)
# With tolerance we get a default value
settings = models.SourceCollectorSettings(
module_name="foo",
update_interval=60,
error_duration=0,
error_tolerance=5,
)

assert settings.error_duration == snapshot(354)


@given(
update_interval=st.integers(min_value=0, max_value=100),
error_tolerance=st.integers(min_value=0, max_value=100),
)
@settings(max_examples=1000)
def test_sourcecollectorsettings_no_error_duration_fuzz(
update_interval: int, error_tolerance: int
):
"""Test model with a variety of update intervals and error tolerances"""
models.SourceCollectorSettings(
module_name="foo",
update_interval=update_interval,
error_tolerance=error_tolerance,
)


def test_sourcecollectorsettings_duration_too_short():
Expand All @@ -112,6 +128,9 @@ def test_sourcecollectorsettings_duration_too_short():
error = errors[0]
assert "greater than 300" in error["msg"]
assert error["type"] == "value_error"
assert error["msg"] == snapshot(
"Value error, Invalid value for error_duration (180). It should be greater than 300: error_tolerance (5) * update_interval (60)"
)


def test_sourcecollectorsettings_duration_negative():
Expand Down
20 changes: 17 additions & 3 deletions zabbix_auto_config/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ class SourceCollectorSettings(ConfigBaseModel, extra="allow"):
description=(
"The duration in seconds that errors are stored."
"If `error_tolerance` errors occur in this period, the collector is marked as failing."
"If `error_tolerance`is set, but this is not, it is set to `round(error_tolerance * update_interval + (update_interval*0.9))`."
),
ge=0,
)
Expand All @@ -234,12 +235,25 @@ def _validate_error_duration_is_greater(self) -> Self:
# hack to ensure RollingErrorCounter.count() doesn't discard the error
# before it is counted
self.error_duration = 9999
elif (
return self

# Set default error duration if not set
if self.error_tolerance > 0 and not self.error_duration:
# Set the error duration to tolerance * update_interval + 90% of update_interval
# so that it's possible to hit the error tolerance within the duration if all
# errors happen in succession.
self.error_duration = round(
self.error_tolerance * self.update_interval
+ (self.update_interval * 0.9)
)

# Ensure the error duration is greater than the product of the error tolerance and update interval
if (
product := self.error_tolerance * self.update_interval
) > self.error_duration:
raise ValueError(
f"Invalid value for error_duration ({self.error_duration}). It should be greater than error_tolerance ({self.error_tolerance}) "
f"times update_interval ({self.update_interval}), i.e., greater than {product}. Please adjust accordingly."
f"Invalid value for error_duration ({self.error_duration}). "
f"It should be greater than {product}: error_tolerance ({self.error_tolerance}) * update_interval ({self.update_interval})"
)
return self

Expand Down
Loading