diff --git a/docs/howtos/applications/cost.ipynb b/docs/howtos/applications/cost.ipynb index 6642d08c9..237b6c730 100644 --- a/docs/howtos/applications/cost.ipynb +++ b/docs/howtos/applications/cost.ipynb @@ -26,7 +26,19 @@ "cell_type": "code", "metadata": {}, "outputs": [], - "source": "from langchain_openai.chat_models import ChatOpenAI\nfrom langchain_core.prompt_values import StringPromptValue\n# lets import a parser for OpenAI\nfrom ragas.cost import get_token_usage_for_openai\n\ngpt4o = ChatOpenAI(model=\"gpt-4o\")\np = StringPromptValue(text=\"hai there\")\nllm_result = gpt4o.generate_prompt([p])\n\nget_token_usage_for_openai(llm_result)" + "source": [ + "from langchain_openai.chat_models import ChatOpenAI\n", + "from langchain_core.prompt_values import StringPromptValue\n", + "\n", + "# lets import a parser for OpenAI\n", + "from ragas.cost import get_token_usage_for_openai\n", + "\n", + "gpt4o = ChatOpenAI(model=\"gpt-4o\")\n", + "p = StringPromptValue(text=\"hai there\")\n", + "llm_result = gpt4o.generate_prompt([p])\n", + "\n", + "get_token_usage_for_openai(llm_result)" + ] }, { "cell_type": "markdown", diff --git a/docs/howtos/customizations/metrics/cost.ipynb b/docs/howtos/customizations/metrics/cost.ipynb index 9d5664e50..d1730e294 100644 --- a/docs/howtos/customizations/metrics/cost.ipynb +++ b/docs/howtos/customizations/metrics/cost.ipynb @@ -37,7 +37,19 @@ "cell_type": "code", "metadata": {}, "outputs": [], - "source": "from langchain_openai.chat_models import ChatOpenAI\nfrom langchain_core.prompt_values import StringPromptValue\n# lets import a parser for OpenAI\nfrom ragas.cost import get_token_usage_for_openai\n\ngpt4o = ChatOpenAI(model=\"gpt-4o\")\np = StringPromptValue(text=\"hai there\")\nllm_result = gpt4o.generate_prompt([p])\n\nget_token_usage_for_openai(llm_result)" + "source": [ + "from langchain_openai.chat_models import ChatOpenAI\n", + "from langchain_core.prompt_values import StringPromptValue\n", + "\n", + "# lets import a parser for OpenAI\n", + "from ragas.cost import get_token_usage_for_openai\n", + "\n", + "gpt4o = ChatOpenAI(model=\"gpt-4o\")\n", + "p = StringPromptValue(text=\"hai there\")\n", + "llm_result = gpt4o.generate_prompt([p])\n", + "\n", + "get_token_usage_for_openai(llm_result)" + ] }, { "cell_type": "markdown", diff --git a/docs/howtos/integrations/helicone.ipynb b/docs/howtos/integrations/helicone.ipynb index 2eb70a39c..9bf751f1f 100644 --- a/docs/howtos/integrations/helicone.ipynb +++ b/docs/howtos/integrations/helicone.ipynb @@ -47,7 +47,29 @@ "cell_type": "code", "metadata": {}, "outputs": [], - "source": "import os\nfrom datasets import Dataset\nfrom ragas import evaluate\nfrom ragas.metrics import faithfulness, answer_relevancy, context_precision\nfrom ragas.integrations.helicone import helicone_config # import helicone_config\n\n\n# Set up Helicone\nHELICONE_API_KEY = \"your_helicone_api_key_here\" # Replace with your actual Helicone API key\nhelicone_config.api_key = HELICONE_API_KEY\nos.environ[\"OPENAI_API_KEY\"] = (\n \"your_openai_api_key_here\" # Replace with your actual OpenAI API key\n)\n\n# Verify Helicone API key is set\nif HELICONE_API_KEY == \"your_helicone_api_key_here\":\n raise ValueError(\n \"Please replace 'your_helicone_api_key_here' with your actual Helicone API key.\"\n )" + "source": [ + "import os\n", + "from datasets import Dataset\n", + "from ragas import evaluate\n", + "from ragas.metrics import faithfulness, answer_relevancy, context_precision\n", + "from ragas.integrations.helicone import helicone_config 
# import helicone_config\n", + "\n", + "\n", + "# Set up Helicone\n", + "HELICONE_API_KEY = (\n", + " \"your_helicone_api_key_here\" # Replace with your actual Helicone API key\n", + ")\n", + "helicone_config.api_key = HELICONE_API_KEY\n", + "os.environ[\"OPENAI_API_KEY\"] = (\n", + " \"your_openai_api_key_here\" # Replace with your actual OpenAI API key\n", + ")\n", + "\n", + "# Verify Helicone API key is set\n", + "if HELICONE_API_KEY == \"your_helicone_api_key_here\":\n", + " raise ValueError(\n", + " \"Please replace 'your_helicone_api_key_here' with your actual Helicone API key.\"\n", + " )" + ] }, { "cell_type": "markdown", diff --git a/ragas/pyproject.toml b/ragas/pyproject.toml index 573e47705..15f590aef 100644 --- a/ragas/pyproject.toml +++ b/ragas/pyproject.toml @@ -63,6 +63,7 @@ dev = [ "haystack-ai", "sacrebleu", "r2r", + "scipy", ] test = [ "pytest", diff --git a/ragas/src/ragas/embeddings/haystack_wrapper.py b/ragas/src/ragas/embeddings/haystack_wrapper.py index 71ac1e978..4dc3501e9 100644 --- a/ragas/src/ragas/embeddings/haystack_wrapper.py +++ b/ragas/src/ragas/embeddings/haystack_wrapper.py @@ -37,10 +37,18 @@ def __init__( # Lazy Import of required Haystack components try: from haystack import AsyncPipeline - from haystack.components.embedders.azure_text_embedder import AzureOpenAITextEmbedder - from haystack.components.embedders.hugging_face_api_text_embedder import HuggingFaceAPITextEmbedder - from haystack.components.embedders.openai_text_embedder import OpenAITextEmbedder - from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder + from haystack.components.embedders.azure_text_embedder import ( + AzureOpenAITextEmbedder, + ) + from haystack.components.embedders.hugging_face_api_text_embedder import ( + HuggingFaceAPITextEmbedder, + ) + from haystack.components.embedders.openai_text_embedder import ( + OpenAITextEmbedder, + ) + from haystack.components.embedders.sentence_transformers_text_embedder import ( + SentenceTransformersTextEmbedder, + ) except ImportError as exc: raise ImportError( "Haystack is not installed. Please install it with `pip install haystack-ai`." 
@@ -94,10 +102,18 @@ async def aembed_documents(self, texts: t.List[str]) -> t.List[t.List[float]]: def __repr__(self) -> str: try: - from haystack.components.embedders.azure_text_embedder import AzureOpenAITextEmbedder - from haystack.components.embedders.hugging_face_api_text_embedder import HuggingFaceAPITextEmbedder - from haystack.components.embedders.openai_text_embedder import OpenAITextEmbedder - from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder + from haystack.components.embedders.azure_text_embedder import ( + AzureOpenAITextEmbedder, + ) + from haystack.components.embedders.hugging_face_api_text_embedder import ( + HuggingFaceAPITextEmbedder, + ) + from haystack.components.embedders.openai_text_embedder import ( + OpenAITextEmbedder, + ) + from haystack.components.embedders.sentence_transformers_text_embedder import ( + SentenceTransformersTextEmbedder, + ) except ImportError: return f"{self.__class__.__name__}(embeddings=Unknown(...))" diff --git a/ragas/src/ragas/llms/haystack_wrapper.py b/ragas/src/ragas/llms/haystack_wrapper.py index 0c92b3c9a..c31df42f1 100644 --- a/ragas/src/ragas/llms/haystack_wrapper.py +++ b/ragas/src/ragas/llms/haystack_wrapper.py @@ -39,8 +39,12 @@ def __init__( try: from haystack import AsyncPipeline from haystack.components.generators.azure import AzureOpenAIGenerator - from haystack.components.generators.hugging_face_api import HuggingFaceAPIGenerator - from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator + from haystack.components.generators.hugging_face_api import ( + HuggingFaceAPIGenerator, + ) + from haystack.components.generators.hugging_face_local import ( + HuggingFaceLocalGenerator, + ) from haystack.components.generators.openai import OpenAIGenerator except ImportError as exc: raise ImportError( @@ -115,8 +119,12 @@ async def agenerate_text( def __repr__(self) -> str: try: from haystack.components.generators.azure import AzureOpenAIGenerator - from haystack.components.generators.hugging_face_api import HuggingFaceAPIGenerator - from haystack.components.generators.hugging_face_local import HuggingFaceLocalGenerator + from haystack.components.generators.hugging_face_api import ( + HuggingFaceAPIGenerator, + ) + from haystack.components.generators.hugging_face_local import ( + HuggingFaceLocalGenerator, + ) from haystack.components.generators.openai import OpenAIGenerator except ImportError: return f"{self.__class__.__name__}(llm=Unknown(...))" diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py index 8a37081bb..97551616c 100644 --- a/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py +++ b/ragas/src/ragas/testset/transforms/relationship_builders/cosine.py @@ -12,59 +12,111 @@ class CosineSimilarityBuilder(RelationshipBuilder): property_name: str = "embedding" new_property_name: str = "cosine_similarity" threshold: float = 0.9 + block_size: int = 1024 - def _find_similar_embedding_pairs( - self, embeddings: np.ndarray, threshold: float - ) -> t.List[t.Tuple[int, int, float]]: - # Normalize the embeddings - normalized = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis] + def _validate_embedding_shapes(self, embeddings: t.List[t.Any]): + if not embeddings: + raise ValueError(f"No nodes have a valid {self.property_name}") + first_len = len(embeddings[0]) + for idx, emb in enumerate(embeddings): + if len(emb) != first_len: + 
raise ValueError( + f"Embedding at index {idx} has length {len(emb)}, expected {first_len}. " + "All embeddings must have the same length." + ) - # Calculate cosine similarity matrix - similarity_matrix = np.dot(normalized, normalized.T) - # Find pairs with similarity >= threshold - similar_pairs = np.argwhere(similarity_matrix >= threshold) + def _block_cosine_similarity(self, i: np.ndarray, j: np.ndarray): + """Calculate cosine similarity matrix between two sets of embeddings.""" + i_norm = i / np.linalg.norm(i, axis=1, keepdims=True) + j_norm = j / np.linalg.norm(j, axis=1, keepdims=True) + return np.dot(i_norm, j_norm.T) - # Filter out self-comparisons and duplicate pairs - return [ - (pair[0], pair[1], similarity_matrix[pair[0], pair[1]]) - for pair in similar_pairs - if pair[0] < pair[1] - ] + async def _find_similar_embedding_pairs( + self, embeddings: np.ndarray, threshold: float, block_size: int = 1024 + ) -> t.Set[t.Tuple[int, int, float]]: + """Sharded computation of cosine similarity to find similar pairs.""" - async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: - if self.property_name is None: - self.property_name = "embedding" + def process_block(i: int, j: int) -> t.Set[t.Tuple[int, int, float]]: + end_i = min(i + block_size, n_embeddings) + end_j = min(j + block_size, n_embeddings) + block = self._block_cosine_similarity( + embeddings[i:end_i, :], embeddings[j:end_j, :] + ) + similar_idx = np.argwhere(block >= threshold) + return { + (int(i + ii), int(j + jj), float(block[ii, jj])) + for ii, jj in similar_idx + if int(i + ii) < int(j + jj) + } + + n_embeddings, _dimension = embeddings.shape + triplets = set() + for i in range(0, n_embeddings, block_size): + for j in range(i, n_embeddings, block_size): + triplets.update(process_block(i, j)) + + return triplets + + async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: embeddings = [] for node in kg.nodes: embedding = node.get_property(self.property_name) if embedding is None: raise ValueError(f"Node {node.id} has no {self.property_name}") embeddings.append(embedding) - - similar_pairs = self._find_similar_embedding_pairs( - np.array(embeddings), self.threshold + self._validate_embedding_shapes(embeddings) + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size ) - return [ Relationship( source=kg.nodes[i], target=kg.nodes[j], - type="cosine_similarity", + type=self.new_property_name, properties={self.new_property_name: similarity_float}, bidirectional=True, ) for i, j, similarity_float in similar_pairs ] + def generate_execution_plan(self, kg: KnowledgeGraph) -> t.List[t.Coroutine]: + """ + Generates a coroutine task for finding similar embedding pairs, which can be scheduled/executed by an Executor. 
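+
+        A rough usage sketch (an assumption about the caller, which is expected
+        to await the returned coroutines; `builder` and `kg` are placeholders):
+
+            for coroutine in builder.generate_execution_plan(kg):
+                await coroutine  # appends the found Relationships to kg.relationships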
+ """ + embeddings = [] + for node in kg.nodes: + embedding = node.get_property(self.property_name) + if embedding is None: + raise ValueError(f"Node {node.id} has no {self.property_name}") + embeddings.append(embedding) + self._validate_embedding_shapes(embeddings) + + async def find_and_add_relationships(): + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size + ) + for i, j, similarity_float in similar_pairs: + rel = Relationship( + source=kg.nodes[i], + target=kg.nodes[j], + type=self.new_property_name, + properties={self.new_property_name: similarity_float}, + bidirectional=True, + ) + kg.relationships.append(rel) + + return [find_and_add_relationships()] + @dataclass class SummaryCosineSimilarityBuilder(CosineSimilarityBuilder): property_name: str = "summary_embedding" new_property_name: str = "summary_cosine_similarity" threshold: float = 0.1 + block_size: int = 1024 - def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: + def _document_summary_filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: """ Filters the knowledge graph to only include nodes with a summary embedding. """ @@ -78,21 +130,22 @@ def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph: return KnowledgeGraph(nodes=nodes) async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: + filtered_kg = self._document_summary_filter(kg) embeddings = [ node.get_property(self.property_name) - for node in kg.nodes + for node in filtered_kg.nodes if node.get_property(self.property_name) is not None ] if not embeddings: raise ValueError(f"No nodes have a valid {self.property_name}") - similar_pairs = self._find_similar_embedding_pairs( - np.array(embeddings), self.threshold + similar_pairs = await self._find_similar_embedding_pairs( + np.array(embeddings), self.threshold, self.block_size ) return [ Relationship( - source=kg.nodes[i], - target=kg.nodes[j], - type="summary_cosine_similarity", + source=filtered_kg.nodes[i], + target=filtered_kg.nodes[j], + type=self.new_property_name, properties={self.new_property_name: similarity_float}, bidirectional=True, ) diff --git a/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py b/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py index ad33ea42f..5b1a7d6f8 100644 --- a/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py +++ b/ragas/src/ragas/testset/transforms/relationship_builders/traditional.py @@ -1,3 +1,4 @@ +import itertools import typing as t from collections import Counter from dataclasses import dataclass @@ -19,39 +20,62 @@ def _jaccard_similarity(self, set1: t.Set[str], set2: t.Set[str]) -> float: union = len(set1.union(set2)) return intersection / union if union > 0 else 0.0 - async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: - if self.property_name is None: - self.property_name - - similar_pairs = [] - for i, node1 in enumerate(kg.nodes): - for j, node2 in enumerate(kg.nodes): - if i >= j: - continue - items1 = node1.get_property(self.property_name) - items2 = node2.get_property(self.property_name) - if items1 is None or items2 is None: - raise ValueError( - f"Node {node1.id} or {node2.id} has no {self.property_name}" - ) - if self.key_name is not None: - items1 = items1.get(self.key_name, []) - items2 = items2.get(self.key_name, []) - similarity = self._jaccard_similarity(set(items1), set(items2)) - if similarity >= self.threshold: - similar_pairs.append((i, j, similarity)) + async def _find_similar_embedding_pairs( + 
self, kg: KnowledgeGraph + ) -> t.Set[t.Tuple[int, int, float]]: + """ + Finds all node index pairs with Jaccard similarity above the threshold. + Returns a set of (i, j, similarity) tuples. + """ + + similar_pairs = set() + for (i, node1), (j, node2) in itertools.combinations(enumerate(kg.nodes), 2): + items1 = node1.get_property(self.property_name) + items2 = node2.get_property(self.property_name) + if items1 is None or items2 is None: + raise ValueError( + f"Node {node1.id} or {node2.id} has no {self.property_name}" + ) + if self.key_name is not None: + items1 = items1.get(self.key_name, []) + items2 = items2.get(self.key_name, []) + similarity = self._jaccard_similarity(set(items1), set(items2)) + if similarity >= self.threshold: + similar_pairs.add((i, j, similarity)) + return similar_pairs + async def transform(self, kg: KnowledgeGraph) -> t.List[Relationship]: + similar_pairs = await self._find_similar_embedding_pairs(kg) return [ Relationship( source=kg.nodes[i], target=kg.nodes[j], - type="jaccard_similarity", + type=self.new_property_name, properties={self.new_property_name: similarity_float}, bidirectional=True, ) for i, j, similarity_float in similar_pairs ] + def generate_execution_plan(self, kg: KnowledgeGraph) -> t.List[t.Coroutine]: + """ + Generates a coroutine task for finding similar pairs, which can be scheduled/executed by an Executor. + """ + + async def find_and_add_relationships(): + similar_pairs = await self._find_similar_embedding_pairs(kg) + for i, j, similarity_float in similar_pairs: + rel = Relationship( + source=kg.nodes[i], + target=kg.nodes[j], + type=self.new_property_name, + properties={self.new_property_name: similarity_float}, + bidirectional=True, + ) + kg.relationships.append(rel) + + return [find_and_add_relationships()] + @dataclass class OverlapScoreBuilder(RelationshipBuilder): @@ -65,6 +89,7 @@ class OverlapScoreBuilder(RelationshipBuilder): def __post_init__(self): try: from rapidfuzz import distance + except ImportError: raise ImportError( "rapidfuzz is required for string distance. Please install it using `pip install rapidfuzz`" @@ -78,13 +103,11 @@ def __post_init__(self): } def _overlap_score(self, overlaps: t.List[bool]) -> float: - return sum(overlaps) / len(overlaps) if len(overlaps) > 0 else 0.0 def _get_noisy_items( self, nodes: t.List[Node], property_name: str, percent_cut_off: float = 0.05 ) -> t.List[str]: - all_items = [] for node in nodes: items = node.get_property(property_name) diff --git a/ragas/tests/unit/test_cosine_relationship_builders.py b/ragas/tests/unit/test_cosine_relationship_builders.py new file mode 100644 index 000000000..25333ed8c --- /dev/null +++ b/ragas/tests/unit/test_cosine_relationship_builders.py @@ -0,0 +1,435 @@ +import asyncio +import copy +import random +from uuid import UUID + +import numpy as np +import pytest + +from ragas.testset.graph import KnowledgeGraph, Node, NodeType, Relationship +from ragas.testset.transforms.relationship_builders.cosine import ( + CosineSimilarityBuilder, + SummaryCosineSimilarityBuilder, +) + + +def generate_test_vectors( + n: int = 16, + d: int = 32, + min_similarity: float = 0.5, + similar_fraction: float = 0.3, + seed: int | None = None, +) -> np.ndarray: + """ + Generate `n` unit vectors of dimension `d`, where at least `similar_fraction` of them + are similar to each other (cosine similarity > `min_similarity`), and the result is shuffled. + + Parameters: + - n (int): Total number of vectors to generate. + - d (int): Dimensionality of each vector. 
+ - min_similarity (float): Minimum cosine similarity for similar pairs. + - similar_fraction (float): Fraction (0-1) of vectors that should be similar. + - seed (int): Optional random seed for reproducibility. + + Returns: + - np.ndarray: Array of shape (n, d) of unit vectors. + """ + + if seed is not None: + np.random.seed(seed) + random.seed(seed) + + num_similar = max(2, int(n * similar_fraction)) # at least two similar vectors + num_random = n - num_similar + + # Step 1: Create a base vector + base = np.random.randn(d) + base /= np.linalg.norm(base) + + # Step 2: Generate similar vectors + similar_vectors = [base] + angle = np.arccos(min_similarity) + + for _ in range(num_similar - 1): + perturbation = np.random.randn(d) + perturbation -= perturbation.dot(base) * base # make orthogonal + perturbation /= np.linalg.norm(perturbation) + + similar_vec = np.cos(angle * 0.9) * base + np.sin(angle * 0.9) * perturbation + similar_vec /= np.linalg.norm(similar_vec) + similar_vectors.append(similar_vec) + + # Step 3: Generate additional random unit vectors + random_vectors = [] + for _ in range(num_random): + v = np.random.randn(d) + v /= np.linalg.norm(v) + random_vectors.append(v) + + # Step 4: Combine and shuffle + all_vectors = similar_vectors + random_vectors + random.shuffle(all_vectors) + + return np.stack(all_vectors) + + +def cosine_similarity_matrix(embeddings: np.ndarray): + """Calculate cosine similarity matrix for a set of embeddings.""" + from scipy.spatial.distance import cdist + + similarity = 1 - cdist(embeddings, embeddings, metric="cosine") + + # normalized = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis] + # similarity = np.dot(normalized, normalized.T) + return similarity + + +def cosine_similarity_pair(embeddings: np.ndarray, threshold: float): + """Find pairs of embeddings with cosine similarity >= threshold.""" + # Find pairs with similarity >= threshold + similarity_matrix = cosine_similarity_matrix(embeddings) + similar_pairs = np.argwhere(similarity_matrix >= threshold) + + # Filter out self-comparisons and duplicate pairs + return [ + (int(pair[0]), int(pair[1]), float(similarity_matrix[pair[0], pair[1]])) + for pair in similar_pairs + if pair[0] < pair[1] + ] + + +def vector_cosine_similarity(a, b): + """Find pairwise cosine similarity between two vectors.""" + return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + + +@pytest.fixture +def simple_kg(): + # Arrange: create a simple knowledge graph with embeddings + # roughly, we expect the following relationships: + # 1 <-> 2 (0.1928 similarity) + # 2 <-> 3 (0.6520 similarity) + # 1 <-> 3 (0.8258 similarity) + nodes = [ + Node( + id=UUID("4da47a69-539c-49a2-b289-01780989d82c"), + type=NodeType.DOCUMENT, + properties={ + "embedding": [0.2313, -0.362, 0.5875, -0.0526, -0.0954], + "summary_embedding": [0.2313, -0.362, 0.5875, -0.0526, -0.0954], + }, + ), + Node( + id=UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf"), + type=NodeType.DOCUMENT, + properties={ + "embedding": [0.9066, 0.786, 0.6925, 0.8022, 0.5297], + "summary_embedding": [0.9066, 0.786, 0.6925, 0.8022, 0.5297], + }, + ), + Node( + id=UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4"), + type=NodeType.DOCUMENT, + properties={ + "embedding": [0.5555, -0.1074, 0.8454, 0.3499, -0.1669], + "summary_embedding": [0.5555, -0.1074, 0.8454, 0.3499, -0.1669], + }, + ), + ] + return KnowledgeGraph(nodes=nodes) + + +# node order +# UUID("4da47a69-539c-49a2-b289-01780989d82c") +# UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf") +# 
UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4") + + +@pytest.mark.parametrize( + "n_test_embeddings", + [ + (16), + (256), + (1024), + ], +) +def test__cosine_similarity(n_test_embeddings): + """ + Validate that the cosine similarity function correctly computes pairwise similarities + and that the results match expected values. + """ + + threshold = 0.7 + embeddings = generate_test_vectors( + n=n_test_embeddings, + d=64, + min_similarity=min(threshold + 0.025, 1.0), + similar_fraction=0.3, + ) + expected = cosine_similarity_matrix(embeddings) + + builder = CosineSimilarityBuilder(property_name="embedding", threshold=threshold) + result = builder._block_cosine_similarity(embeddings, embeddings) + + assert result.shape == expected.shape, "Result shape does not match expected shape" + assert np.allclose( + result, expected, atol=1e-5 + ), "Cosine similarity does not match expected values" + + +# Test for the internal _find_similar_embedding_pairs method +@pytest.mark.parametrize( + "n_test_embeddings, threshold, block_size", + [ + (16, 0.5, 16), + (16, 0.7, 16), + (16, 0.9, 16), + (16, 0.7, 32), # block size >> n_test_embeddings + (16, 0.7, 37), # block size >> n_test_embeddings + (32, 0.7, 16), # block size 1/2 n_test_embeddings + (37, 0.7, 4), # block size doesn't shard evenly + ], +) +def test__find_similar_embedding_pairs(n_test_embeddings, threshold, block_size): + """Validate that _find_similar_embedding_pairs correctly identifies pairs when compared with scipy's cosine distance.""" + + embeddings = generate_test_vectors( + n=n_test_embeddings, + d=64, + min_similarity=min(threshold + 0.025, 1.0), + similar_fraction=0.3, + ) + expected = cosine_similarity_pair(embeddings, threshold) + + builder = CosineSimilarityBuilder(property_name="embedding", threshold=threshold) + result = asyncio.run( + builder._find_similar_embedding_pairs( + embeddings, threshold=threshold, block_size=block_size + ) + ) + + assert len(result) == len(expected) + + for i, j, similarity_float in result: + assert i < j, "Pairs should be ordered (i < j)" + assert ( + similarity_float >= threshold + ), f"Similarity {similarity_float} should be >= {threshold}" + for x, y, expected_similarity in expected: + if i == x and j == y: + assert similarity_float == pytest.approx( + expected_similarity + ), "Cosine similarity does not match expected value" + + break + + +class TestCosineSimilarityBuilder: + @pytest.mark.asyncio + async def test_no_self_similarity_relationships(self, simple_kg): + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + for r in relationships: + assert ( + r.source.id != r.target.id + ), "Self-relationships should not be created" + + @pytest.mark.asyncio + async def test_no_duplicate_relationships(self, simple_kg): + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + seen = set() + for r in relationships: + pair = tuple(sorted([r.source.id, r.target.id])) + assert pair not in seen, "Duplicate relationships found" + seen.add(pair) + + @pytest.mark.asyncio + async def test_similarity_at_threshold(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=1.0) + relationships = await builder.transform(kg) + assert 
len(relationships) == 1, "Should create relationship at threshold" + + @pytest.mark.asyncio + async def test_all_below_threshold(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": [-1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + relationships = await builder.transform(kg) + assert ( + len(relationships) == 0 + ), "No relationships should be created below threshold" + + @pytest.mark.asyncio + async def test_all_above_threshold(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node3 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2, node3]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.9) + relationships = await builder.transform(kg) + assert len(relationships) == 3 + + @pytest.mark.asyncio + async def test_malformed_embedding_raises(self): + node1 = Node(type=NodeType.CHUNK, properties={"embedding": [1, 0, 0]}) + node2 = Node(type=NodeType.CHUNK, properties={"embedding": ["a", 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + with pytest.raises(Exception): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_empty_graph(self): + kg = KnowledgeGraph(nodes=[]) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError, match="No nodes have a valid embedding"): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_basic(self, simple_kg): + # Act + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + relationships = await builder.transform(simple_kg) + # Assert + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "cosine_similarity" for r in relationships) + # 2 <-> 3 (~0.6520 similarity) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_no_embeddings(self): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={}), + Node(type=NodeType.DOCUMENT, properties={}), + ] + ) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises(ValueError, match="has no embedding"): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_cosine_similarity_builder_shape_validation(self): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={"embedding": [1.0, 0.0]}), + Node( + type=NodeType.DOCUMENT, + properties={"embedding": [0.0, 1.0, 2.0]}, + ), + ] + ) + builder = CosineSimilarityBuilder(property_name="embedding") + with pytest.raises( + ValueError, match="Embedding at index 1 has length 3, expected 2" + ): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_apply_transforms_cosine_similarity_builder(self, simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine 
import apply_transforms + + # CosineSimilarityBuilder should add relationships to the graph + builder = CosineSimilarityBuilder(property_name="embedding", threshold=0.5) + kg = simple_kg + # Should mutate kg in-place + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + # Check that relationships were added + assert any( + r.type == "cosine_similarity" for r in kg.relationships + ), "No cosine_similarity relationships found after apply_transforms" + # Check that expected relationship exists + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + + +class TestSummaryCosineSimilarityBuilder: + @pytest.mark.asyncio + async def test_summary_cosine_similarity_builder_basic(self, simple_kg): + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + relationships = await builder.transform(simple_kg) + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "summary_cosine_similarity" for r in relationships) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_summary_cosine_similarity_only_document_nodes(self): + node1 = Node( + type=NodeType.DOCUMENT, properties={"summary_embedding": [1, 0, 0]} + ) + node2 = Node(type=NodeType.CHUNK, properties={"summary_embedding": [1, 0, 0]}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + relationships = await builder.transform(kg) + assert len(relationships) == 0 + + @pytest.mark.asyncio + async def test_summary_cosine_similarity_builder_filter_and_error(self): + kg = KnowledgeGraph(nodes=[Node(type=NodeType.DOCUMENT, properties={})]) + builder = SummaryCosineSimilarityBuilder(property_name="summary_embedding") + with pytest.raises(ValueError, match="has no summary_embedding"): + await builder.transform(kg) + + +@pytest.mark.asyncio +async def test_apply_transforms_summary_cosine_similarity_builder(simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine import apply_transforms + + builder = SummaryCosineSimilarityBuilder( + property_name="summary_embedding", threshold=0.5 + ) + kg = simple_kg + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + assert any( + r.type == "summary_cosine_similarity" for r in kg.relationships + ), "No summary_cosine_similarity relationships found after apply_transforms" + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) diff --git a/ragas/tests/unit/test_traditional_relationship_builders.py 
b/ragas/tests/unit/test_traditional_relationship_builders.py
new file mode 100644
index 000000000..4bd01d7b7
--- /dev/null
+++ b/ragas/tests/unit/test_traditional_relationship_builders.py
@@ -0,0 +1,396 @@
+import asyncio
+import copy
+import random
+import string
+from typing import List, Set, Tuple
+from uuid import UUID
+
+import numpy as np
+import pytest
+
+from ragas.testset.graph import KnowledgeGraph, Node, NodeType, Relationship
+from ragas.testset.transforms.relationship_builders.traditional import (
+    JaccardSimilarityBuilder,
+)
+
+
+def generate_test_sets(
+    n: int = 16,
+    max_len: int = 32,
+    min_similarity: float = 0.5,
+    similar_fraction: float = 0.3,
+) -> List[Set[str]]:
+    """
+    Generate `n` sets of up to roughly `max_len` elements, where `similar_fraction`
+    of them share a common core of elements (and are therefore mutually similar).
+    The result is shuffled.
+
+    Parameters:
+    - n (int): Total number of sets to generate.
+    - max_len (int): Maximum length of each set.
+    - min_similarity (float): Minimum Jaccard similarity each similar set keeps
+      with the shared core.
+    - similar_fraction (float): Fraction (0-1) of sets that should be similar.
+
+    Returns:
+    - list: List of generated sets.
+    """
+
+    def generate_entity(k: int = 5) -> str:
+        """Generate a random entity of length k."""
+        return "".join(random.choices(string.ascii_lowercase, k=k))
+
+    # Create a core set of shared elements for similar sets
+    core_size = max(1, int(max_len * min_similarity))
+    core = {generate_entity() for _ in range(core_size)}
+
+    # Create a set of unique elements to draw from
+    base_pool = {generate_entity() for _ in range(n * max_len * 8)}
+    base_pool -= core
+
+    n_similar = int(n * similar_fraction)
+    n_dissimilar = n - n_similar
+
+    # Pre-calculate the max number of unique elements that can be added to the
+    # core while keeping each set's similarity to the core >= min_similarity
+    max_unique = int(core_size * ((1 - min_similarity) / min_similarity))
+    if max_unique > max_len:
+        raise ValueError(
+            "max_unique exceeds max_len, cannot guarantee min_similarity with given parameters."
+        )
+
+    # Generate similar sets
+    similar = []
+    for _ in range(n_similar):
+        # Random size for this set, at least the core size
+        set_len = core_size + random.randint(0, max_unique)
+        s = core.copy()
+        # Add random elements from the base pool until we reach set_len
+        while len(s) < set_len:
+            if not base_pool:
+                raise ValueError("Base pool is empty, cannot generate more sets.")
+            element = base_pool.pop()
+            if element not in s:
+                s.add(element)
+        similar.append(s)
+
+    # Generate dissimilar sets
+    dissimilar = []
+    for _ in range(n_dissimilar):
+        set_len = random.randint(0, max_len)
+        s = set()
+        while len(s) < set_len:
+            if not base_pool:
+                raise ValueError("Base pool is empty, cannot generate more sets.")
+            element = base_pool.pop()
+            if element not in s:
+                s.add(element)
+        dissimilar.append(s)
+
+    sets = similar + dissimilar
+    random.shuffle(sets)
+    return sets
+
+
+def validate_sets(sets: list[set[str]], min_similarity: float, similar_fraction: float):
+    n = len(sets)
+    n_similar_needed = int(n * similar_fraction)
+
+    similar_pairs = jaccard_similarity_pair(sets, min_similarity)
+    n_similar_pairs = len(similar_pairs)
+    actual_similar_fraction = n_similar_pairs / (n * (n - 1) // 2)
+
+    print(f"Expected similar pairs: {n_similar_needed}")
+    print(f"Actual similar pairs: {n_similar_pairs}")
+    print(f"Actual similar fraction: {actual_similar_fraction:.2f}")
+    print(f"Similarity threshold: {min_similarity}")
+
+
+def jaccard_similarity_matrix(sets: List[Set[str]]) -> np.ndarray:
+    """Calculate Jaccard similarity matrix for a list of string sets."""
+    n = len(sets)
+    similarity = np.zeros((n, n), dtype=float)
+
+    for i in range(n):
+        for j in range(i, n):
+            intersection = sets[i].intersection(sets[j])
+            union = sets[i].union(sets[j])
+            score = len(intersection) / len(union) if union else 0.0
+            similarity[i, j] = similarity[j, i] = score
+
+    return similarity
+
+
+def jaccard_similarity_pair(
+    sets: List[Set[str]], threshold: float
+) -> List[Tuple[int, int, float]]:
+    """Find pairs of sets with Jaccard similarity >= threshold."""
+    similarity_matrix = jaccard_similarity_matrix(sets)
+    similar_pairs = np.argwhere(similarity_matrix >= threshold)
+
+    return [
+        (int(i), int(j), float(similarity_matrix[i, j]))
+        for i, j in similar_pairs
+        if i < j  # avoid self-pairs and duplicates
+    ]
+
+
+@pytest.fixture
+def simple_kg():
+    # Arrange: create a simple knowledge graph with entity sets
+    # roughly, we expect the following relationships:
+    # 1 <-> 2 (0.0 similarity)
+    # 2 <-> 3 (0.1667 similarity)
+    # 1 <-> 3 (0.25 similarity)
+    nodes = [
+        Node(
+            id=UUID("4da47a69-539c-49a2-b289-01780989d82c"),
+            type=NodeType.DOCUMENT,
+            properties={
+                "entities": {"cat", "dog", "fish", "fox", "bird"},
+            },
+        ),
+        Node(
+            id=UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf"),
+            type=NodeType.DOCUMENT,
+            properties={
+                "entities": {"apple", "banana"},
{"apple", "banana"}, + }, + ), + Node( + id=UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4"), + type=NodeType.DOCUMENT, + properties={ + "entities": {"cat", "banana", "dog", "rock", "tree"}, + }, + ), + ] + return KnowledgeGraph(nodes=nodes) + + +# node order +# UUID("4da47a69-539c-49a2-b289-01780989d82c") +# UUID("f353e5c2-e432-4d1e-84a8-d750c93d4edf") +# UUID("437c8c08-cef6-4ebf-a35f-93d6168b61a4") + + +@pytest.mark.parametrize( + "n_test_sets, max_len, threshold", + [ + (8, 100, 0.2), + (16, 8, 0.1), + (16, 16, 0.5), + (32, 5, 0.3), + ], +) +def test__find_similar_embedding_pairs_jaccard(n_test_sets, max_len, threshold): + """ + Validate that _find_similar_embedding_pairs correctly identifies pairs when compared with scipy's jaccard distance. + """ + sets = generate_test_sets( + n=n_test_sets, + max_len=max_len, + min_similarity=min(threshold + 0.05, 1.0), + similar_fraction=0.3, + ) + expected = jaccard_similarity_pair(sets, threshold) + + kg = KnowledgeGraph( + nodes=[Node(type=NodeType.DOCUMENT, properties={"entities": s}) for s in sets] + ) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=threshold) + result = list(asyncio.run(builder._find_similar_embedding_pairs(kg))) + + assert len(result) == len(expected) + for i, j, similarity_float in result: + assert i < j, "Pairs should be ordered (i < j)" + assert similarity_float >= threshold, ( + f"Similarity {similarity_float} should be >= {threshold}" + ) + for x, y, expected_similarity in expected: + if i == x and j == y: + assert similarity_float == pytest.approx(expected_similarity) + break + + +class TestJaccardSimilarityBuilder: + @pytest.mark.asyncio + async def test_no_self_similarity_relationships(self, simple_kg): + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + for r in relationships: + assert r.source.id != r.target.id, ( + "Self-relationships should not be created" + ) + + @pytest.mark.asyncio + async def test_no_duplicate_relationships(self, simple_kg): + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1) + relationships = await builder.transform(copy.deepcopy(simple_kg)) + seen = set() + for r in relationships: + pair = tuple(sorted([r.source.id, r.target.id])) + assert pair not in seen, "Duplicate relationships found" + seen.add(pair) + + @pytest.mark.asyncio + async def test_similarity_at_threshold(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=1.0) + relationships = await builder.transform(kg) + assert len(relationships) == 1, "Should create relationship at threshold" + + @pytest.mark.asyncio + async def test_all_below_threshold(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, properties={"entities": {"x", "y", "z"}}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.1) + relationships = await builder.transform(kg) + assert len(relationships) == 0, ( + "No relationships should be created below threshold" + ) + + @pytest.mark.asyncio + async def test_all_above_threshold(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, 
properties={"entities": {"a", "b", "c"}}) + node3 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + kg = KnowledgeGraph(nodes=[node1, node2, node3]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.9) + relationships = await builder.transform(kg) + assert len(relationships) == 3 + + @pytest.mark.asyncio + async def test_malformed_entities_raises(self): + node1 = Node(type=NodeType.DOCUMENT, properties={"entities": {"a", "b", "c"}}) + node2 = Node(type=NodeType.DOCUMENT, properties={"entities": None}) + kg = KnowledgeGraph(nodes=[node1, node2]) + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.5) + with pytest.raises(ValueError): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_jaccard_similarity_builder_empty_graph(self): + kg = KnowledgeGraph(nodes=[]) + builder = JaccardSimilarityBuilder(property_name="entities") + relationships = await builder.transform(kg) + assert relationships == [] + + @pytest.mark.asyncio + async def test_jaccard_similarity_builder_basic(self, simple_kg): + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.15) + relationships = await builder.transform(simple_kg) + assert all(isinstance(r, Relationship) for r in relationships) + assert all(r.type == "jaccard_similarity" for r in relationships) + # 2 <-> 3 (~0.1667 similarity) + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + # 1 <-> 3 (~0.25 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in relationships + ) + + @pytest.mark.asyncio + async def test_jaccard_similarity_builder_no_entities(self): + kg = KnowledgeGraph( + nodes=[ + Node(type=NodeType.DOCUMENT, properties={}), + Node(type=NodeType.DOCUMENT, properties={}), + ] + ) + builder = JaccardSimilarityBuilder(property_name="entities") + with pytest.raises(ValueError, match="has no entities"): + await builder.transform(kg) + + @pytest.mark.asyncio + async def test_apply_transforms_cosine_similarity_builder(self, simple_kg): + from ragas.run_config import RunConfig + from ragas.testset.transforms.engine import apply_transforms + + # JaccardSimilarityBuilder should add relationships to the graph + builder = JaccardSimilarityBuilder(property_name="entities", threshold=0.15) + kg = simple_kg + # Should mutate kg in-place + apply_transforms(kg, builder, run_config=RunConfig(max_workers=2)) + # Check that relationships were added + assert any(r.type == "jaccard_similarity" for r in kg.relationships), ( + "No jaccard_similarity relationships found after apply_transforms" + ) + # Check that expected relationship exists + assert any( + str(r.source.id) == "f353e5c2-e432-4d1e-84a8-d750c93d4edf" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + ) + # 1 <-> 3 (~0.8258 similarity) + assert any( + str(r.source.id) == "4da47a69-539c-49a2-b289-01780989d82c" + and str(r.target.id) == "437c8c08-cef6-4ebf-a35f-93d6168b61a4" + for r in kg.relationships + )