871 changes: 871 additions & 0 deletions docs/tech-specs/confidence-based-agents.md

Large diffs are not rendered by default.

Binary file added tests/.coverage
Binary file not shown.
1 change: 1 addition & 0 deletions tests/unit/test_agent/test_confidence/__init__.py
@@ -0,0 +1 @@
# Confidence agent unit tests
365 changes: 365 additions & 0 deletions tests/unit/test_agent/test_confidence/test_confidence_evaluator.py
@@ -0,0 +1,365 @@
"""
Unit tests for the confidence evaluator module.

Tests confidence scoring logic for different tool types following TrustGraph testing patterns.
"""

import pytest
import json
from unittest.mock import Mock, AsyncMock

from trustgraph.agent.confidence.confidence import ConfidenceEvaluator
from trustgraph.agent.confidence.types import ConfidenceMetrics
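
# A minimal sketch of the interface exercised below, inferred from the test
# calls themselves rather than from the ConfidenceEvaluator implementation:
#
#     metrics = ConfidenceEvaluator().evaluate(
#         function_name,      # e.g. "GraphQuery", "TextCompletion", "McpTool"
#         arguments,          # dict of tool arguments
#         output,             # raw tool output as a string
#         execution_time_ms,  # execution time in milliseconds
#         retry_count,        # optional; tests that omit it expect 0
#     )
#
# The returned ConfidenceMetrics is expected to carry a `score` in [0.0, 1.0],
# a non-empty `reasoning` string, and a non-negative `retry_count`.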


class TestConfidenceEvaluator:
"""Test cases for the ConfidenceEvaluator class."""

@pytest.fixture
def evaluator(self):
"""Create a ConfidenceEvaluator instance for testing."""
return ConfidenceEvaluator()

# Graph Query Tests

def test_evaluate_graph_query_success_json_results(self, evaluator):
"""Test confidence evaluation for successful GraphQuery with JSON results."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "MATCH (n) RETURN n LIMIT 10"}
output = json.dumps([{"id": 1, "name": "entity1"}, {"id": 2, "name": "entity2"}])
execution_time_ms = 500

# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

# Assert
assert isinstance(result, ConfidenceMetrics)
assert 0.8 <= result.score <= 1.0 # Good result size should have high confidence
assert "2 items" in result.reasoning
assert result.retry_count == 0

def test_evaluate_graph_query_empty_results(self, evaluator):
"""Test confidence evaluation for GraphQuery with empty results."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "MATCH (n) WHERE false RETURN n"}
output = "[]"
execution_time_ms = 200

# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.4 # Empty results should have low confidence
assert "Empty result set" in result.reasoning
assert result.retry_count == 0

    def test_evaluate_graph_query_large_results(self, evaluator):
        """Test confidence evaluation for GraphQuery with very large results."""
        # Arrange
        function_name = "GraphQuery"
        arguments = {"query": "MATCH (n) RETURN n"}
        large_result = [{"id": i} for i in range(1500)]
        output = json.dumps(large_result)
        execution_time_ms = 2000

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert 0.6 <= result.score <= 0.8  # Large results should have moderate confidence
        assert "1500 items" in result.reasoning

    def test_evaluate_graph_query_non_json_output(self, evaluator):
        """Test confidence evaluation for GraphQuery with non-JSON output."""
        # Arrange
        function_name = "GraphQuery"
        arguments = {"query": "RETURN 'hello world'"}
        output = "hello world"
        execution_time_ms = 100

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert 0.5 <= result.score <= 0.7
        assert "Non-JSON response" in result.reasoning

    def test_evaluate_graph_query_very_fast_execution(self, evaluator):
        """Test confidence penalty for very fast execution."""
        # Arrange
        function_name = "GraphQuery"
        arguments = {"query": "RETURN 1"}
        output = json.dumps([{"result": 1}])
        execution_time_ms = 50  # Very fast execution

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert "Very fast execution" in result.reasoning
        # Score should be penalized for suspiciously fast execution

    # Text Completion Tests

    def test_evaluate_text_completion_success(self, evaluator):
        """Test confidence evaluation for successful TextCompletion."""
        # Arrange
        function_name = "TextCompletion"
        arguments = {"prompt": "Explain quantum physics"}
        output = "Quantum physics is the branch of physics that studies matter and energy at the smallest scales. It describes phenomena that occur at atomic and subatomic levels, where classical physics breaks down."
        execution_time_ms = 1500

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score >= 0.7
        assert "Reasonable response length" in result.reasoning
        assert "Well-structured response" in result.reasoning
        assert result.retry_count == 0

    def test_evaluate_text_completion_empty_response(self, evaluator):
        """Test confidence evaluation for TextCompletion with empty response."""
        # Arrange
        function_name = "TextCompletion"
        arguments = {"prompt": "Tell me about AI"}
        output = ""
        execution_time_ms = 500

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.2
        assert "Empty response" in result.reasoning

    def test_evaluate_text_completion_short_response(self, evaluator):
        """Test confidence evaluation for TextCompletion with very short response."""
        # Arrange
        function_name = "TextCompletion"
        arguments = {"prompt": "What is AI?"}
        output = "AI is AI."
        execution_time_ms = 300

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.5
        assert "Very short response" in result.reasoning

    def test_evaluate_text_completion_error_indicators(self, evaluator):
        """Test confidence evaluation for TextCompletion with error indicators."""
        # Arrange
        function_name = "TextCompletion"
        arguments = {"prompt": "Solve this complex equation"}
        output = "I don't know how to solve this equation. Sorry, I cannot help with this."
        execution_time_ms = 800

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.6
        assert "Response indicates uncertainty" in result.reasoning

    def test_evaluate_text_completion_very_long_response(self, evaluator):
        """Test confidence evaluation for TextCompletion with suspiciously long response."""
        # Arrange
        function_name = "TextCompletion"
        arguments = {"prompt": "What is 2+2?"}
        output = "Two plus two equals four. " * 1000  # Very long response for simple question
        execution_time_ms = 3000

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.8  # Overly long responses are only moderately penalized
        assert "Very long response" in result.reasoning

    # MCP Tool Tests

    def test_evaluate_mcp_tool_success(self, evaluator):
        """Test confidence evaluation for successful McpTool execution."""
        # Arrange
        function_name = "McpTool"
        arguments = {"name": "weather_tool", "parameters": {"location": "San Francisco"}}
        output = json.dumps({"temperature": 72, "conditions": "sunny", "success": True})
        execution_time_ms = 1200

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score >= 0.7
        assert "Tool execution successful" in result.reasoning

    def test_evaluate_mcp_tool_error_response(self, evaluator):
        """Test confidence evaluation for McpTool with error response."""
        # Arrange
        function_name = "McpTool"
        arguments = {"name": "broken_tool", "parameters": {}}
        output = json.dumps({"error": "Tool execution failed", "success": False})
        execution_time_ms = 500

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.3
        assert "Tool returned error" in result.reasoning

    def test_evaluate_mcp_tool_text_error(self, evaluator):
        """Test confidence evaluation for McpTool with text error output."""
        # Arrange
        function_name = "McpTool"
        arguments = {"name": "api_tool", "parameters": {"endpoint": "/invalid"}}
        output = "Error: API endpoint not found. Request failed."
        execution_time_ms = 800

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.4
        assert "Error indicated in output" in result.reasoning

    def test_evaluate_mcp_tool_timeout_risk(self, evaluator):
        """Test confidence evaluation for McpTool with long execution time."""
        # Arrange
        function_name = "McpTool"
        arguments = {"name": "slow_tool", "parameters": {}}
        output = json.dumps({"result": "completed", "success": True})
        execution_time_ms = 65000  # Over 1 minute

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.7  # Should be penalized for long execution
        assert "Tool execution timeout risk" in result.reasoning

    # Retry Testing

    def test_evaluate_with_retries(self, evaluator):
        """Test confidence evaluation with retry penalties."""
        # Arrange
        function_name = "GraphQuery"
        arguments = {"query": "MATCH (n) RETURN n"}
        output = json.dumps([{"id": 1}])
        execution_time_ms = 500
        retry_count = 2

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms, retry_count)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.retry_count == retry_count
        assert "Retry penalty applied (2 retries)" in result.reasoning
        # Score should be lower than without retries due to penalty

    # Generic Function Tests

    def test_evaluate_unknown_function(self, evaluator):
        """Test confidence evaluation for unknown function type."""
        # Arrange
        function_name = "UnknownFunction"
        arguments = {"param": "value"}
        output = "Some output"
        execution_time_ms = 1000

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert 0.5 <= result.score <= 0.7
        assert "Generic evaluation for unknown function" in result.reasoning
        assert "Function returned output" in result.reasoning

    def test_evaluate_unknown_function_no_output(self, evaluator):
        """Test confidence evaluation for unknown function with no output."""
        # Arrange
        function_name = "UnknownFunction"
        arguments = {"param": "value"}
        output = ""
        execution_time_ms = 500

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.4
        assert "No output from function" in result.reasoning

    # Prompt Function Tests

    def test_evaluate_prompt_function(self, evaluator):
        """Test confidence evaluation for Prompt function."""
        # Arrange
        function_name = "Prompt"
        arguments = {"prompt": "analyze this data"}
        output = "The data shows a clear upward trend with seasonal variations."
        execution_time_ms = 1200

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score >= 0.7  # Should use text completion evaluation
        assert "Reasonable response length" in result.reasoning

    # Edge Cases

    def test_evaluate_with_exception_in_evaluation(self, evaluator):
        """Test confidence evaluation handles exceptions gracefully."""
        # Arrange
        function_name = "GraphQuery"
        arguments = {"query": "test"}
        # Create problematic output that might cause JSON parsing issues
        output = "{'invalid': json, syntax}"
        execution_time_ms = 500

        # Act
        result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)

        # Assert
        assert isinstance(result, ConfidenceMetrics)
        assert result.score <= 0.7  # Invalid JSON falls back to the non-JSON path and scores moderately
        # Should handle the exception and provide a reasonable confidence score

    def test_confidence_score_bounds(self, evaluator):
        """Test that confidence scores are always within valid bounds."""
        # Arrange
        test_cases = [
            ("GraphQuery", {}, "[]", 100),
            ("TextCompletion", {}, "Great response!", 1000),
            ("McpTool", {}, '{"success": true}', 500),
            ("UnknownFunction", {}, "", 0),
        ]

        # Act & Assert
        for function_name, arguments, output, execution_time_ms in test_cases:
            result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
            assert 0.0 <= result.score <= 1.0, f"Score out of bounds for {function_name}: {result.score}"
            assert isinstance(result.reasoning, str)
            assert len(result.reasoning) > 0
            assert result.retry_count >= 0