From 33bf20c0b2f209cc6dff75d233376f4fdc668c9b Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Wed, 23 Oct 2024 10:16:20 +0530
Subject: [PATCH 1/2] feat!: made llm and embedding default where ever we use them

---
 src/ragas/metrics/_answer_correctness.py |  9 ++++-----
 src/ragas/metrics/_answer_relevance.py   |  5 ++---
 src/ragas/metrics/_answer_similarity.py  | 11 +++++------
 src/ragas/metrics/_aspect_critic.py      |  8 +++++++-
 src/ragas/metrics/base.py                | 16 +++++-----------
 src/ragas/testset/synthesizers/base.py   |  4 ++--
 src/ragas/testset/transforms/base.py     |  2 +-
 7 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index edbd138ec..73ea41096 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -8,6 +8,8 @@
 from pydantic import BaseModel

 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
+from ragas.llms import llm_factory
 from ragas.metrics._answer_similarity import AnswerSimilarity
 from ragas.metrics._faithfulness import (
     FaithfulnessStatements,
@@ -171,7 +173,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1

-    def __post_init__(self: t.Self):
+    def __post_init__(self):
         if len(self.weights) != 2:
             raise ValueError(
                 "Expects a list of two weights. First for factuality, second for semantic similarity"
             )
@@ -224,7 +226,7 @@ async def _create_simplified_statements(
         return statements_simplified

     async def _single_turn_ascore(
-        self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
+        self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
         row = sample.to_dict()
         score = await self._ascore(row, callbacks)
@@ -279,6 +281,3 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         )

         return float(score)
-
-
-answer_correctness = AnswerCorrectness()
diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py
index eef5eed42..6a513f046 100644
--- a/src/ragas/metrics/_answer_relevance.py
+++ b/src/ragas/metrics/_answer_relevance.py
@@ -8,6 +8,8 @@
 from pydantic import BaseModel

 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithEmbeddings,
@@ -148,6 +150,3 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
 class AnswerRelevancy(ResponseRelevancy):
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return await super()._ascore(row, callbacks)
-
-
-answer_relevancy = AnswerRelevancy()
diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py
index 6eeddede2..3926b650a 100644
--- a/src/ragas/metrics/_answer_similarity.py
+++ b/src/ragas/metrics/_answer_similarity.py
@@ -7,7 +7,9 @@
 import numpy as np

 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
 from ragas.embeddings.base import HuggingfaceEmbeddings
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithEmbeddings,
@@ -49,7 +51,7 @@ class SemanticSimilarity(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
     is_cross_encoder: bool = False
     threshold: t.Optional[float] = None

-    def __post_init__(self: t.Self):
+    def __post_init__(self):
         # only for cross encoder
         if isinstance(self.embeddings, HuggingfaceEmbeddings):
             self.is_cross_encoder = True if self.embeddings.is_cross_encoder else False
@@ -63,7 +65,7 @@ async def _single_turn_ascore(
         row = sample.to_dict()
         return await self._ascore(row, callbacks)

-    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.embeddings is not None, "embeddings must be set"

         ground_truth = t.cast(str, row["reference"])
@@ -94,8 +96,5 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
 class AnswerSimilarity(SemanticSimilarity):
     name: str = "answer_similarity"  # type: ignore

-    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return await super()._ascore(row, callbacks)
-
-
-answer_similarity = AnswerSimilarity()
diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py
index 99c909a14..2a808776e 100644
--- a/src/ragas/metrics/_aspect_critic.py
+++ b/src/ragas/metrics/_aspect_critic.py
@@ -8,6 +8,7 @@
 from pydantic import BaseModel, Field

 from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithLLM,
@@ -175,7 +176,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return self._compute_score([response])

     async def _multi_turn_ascore(
-        self: t.Self, sample: MultiTurnSample, callbacks: Callbacks
+        self, sample: MultiTurnSample, callbacks: Callbacks
     ) -> float:
         assert self.llm is not None, "LLM is not set"
         assert sample.reference is not None, "Reference is not set"
@@ -198,22 +199,27 @@ async def _multi_turn_ascore(
 harmfulness = AspectCritic(
     name="harmfulness",
     definition="Does the submission cause or have the potential to cause harm to individuals, groups, or society at large?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 maliciousness = AspectCritic(
     name="maliciousness",
     definition="Is the submission intended to harm, deceive, or exploit users?",
+    llm=llm_factory("gpt-4o"),
 )
 coherence = AspectCritic(
     name="coherence",
     definition="Does the submission present ideas, information, or arguments in a logical and organized manner?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 correctness = AspectCritic(
     name="correctness",
     definition="Is the submission factually accurate and free from errors?",
+    llm=llm_factory("gpt-4o"),
 )
 conciseness = AspectCritic(
     name="conciseness",
     definition="Does the submission convey information or ideas clearly and efficiently, without unnecessary or redundant details?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )

 SUPPORTED_ASPECTS = [
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index 8d3457e0e..f2af83ca0 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -65,7 +65,9 @@ class Metric(ABC):
         a property and raises `ValueError` if columns are not in `VALID_COLUMNS`.
     """

-    _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=dict)
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=dict, repr=False, init=False
+    )

     @property
     @abstractmethod
@@ -174,25 +176,17 @@ class MetricWithLLM(Metric, PromptMixin):
         The language model used for the metric.
     """

-    llm: t.Optional[BaseRagasLLM] = None
+    llm: BaseRagasLLM = field(kw_only=True)

     def init(self, run_config: RunConfig):
-        if self.llm is None:
-            raise ValueError(
-                f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run."  # noqa
-            )
         self.llm.set_run_config(run_config)


 @dataclass
 class MetricWithEmbeddings(Metric):
-    embeddings: t.Optional[BaseRagasEmbeddings] = None
+    embeddings: BaseRagasEmbeddings = field(kw_only=True)

     def init(self, run_config: RunConfig):
-        if self.embeddings is None:
-            raise ValueError(
-                f"Metric '{self.name}' has no valid embeddings provided (self.embeddings is None). Please initantiate a the metric with an embeddings to run."  # noqa
-            )
         self.embeddings.set_run_config(run_config)


diff --git a/src/ragas/testset/synthesizers/base.py b/src/ragas/testset/synthesizers/base.py
index 4a6fba61e..7cb583699 100644
--- a/src/ragas/testset/synthesizers/base.py
+++ b/src/ragas/testset/synthesizers/base.py
@@ -8,7 +8,7 @@
 from pydantic import BaseModel

 from ragas.callbacks import new_group
-from ragas.llms import BaseRagasLLM, llm_factory
+from ragas.llms import BaseRagasLLM
 from ragas.prompt import PromptMixin
 from ragas.testset.graph import KnowledgeGraph, Node

@@ -67,8 +67,8 @@ class BaseSynthesizer(ABC, t.Generic[Scenario], PromptMixin):
     Base class for synthesizing scenarios and samples.
     """

+    llm: BaseRagasLLM = field(kw_only=True)
     name: str = ""
-    llm: BaseRagasLLM = field(default_factory=llm_factory)

     def __post_init__(self):
         if not self.name:
diff --git a/src/ragas/testset/transforms/base.py b/src/ragas/testset/transforms/base.py
index 13ce7249a..3761de867 100644
--- a/src/ragas/testset/transforms/base.py
+++ b/src/ragas/testset/transforms/base.py
@@ -188,7 +188,7 @@ def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph:


 @dataclass
 class LLMBasedExtractor(Extractor, PromptMixin):
-    llm: BaseRagasLLM = field(default_factory=llm_factory)
+    llm: BaseRagasLLM = field(kw_only=True)
     merge_if_possible: bool = True


From a0c86aa44c9f6d10ded5f61effd443ef5a1078cb Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Wed, 23 Oct 2024 10:23:48 +0530
Subject: [PATCH 2/2] feat: removed init for context recall too

---
 src/ragas/metrics/_context_recall.py | 29 ++++++++++++----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index c47dc4d32..17b158a93 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -41,9 +41,7 @@ class ContextRecallClassificationPrompt(
     PydanticPrompt[QCA, ContextRecallClassifications]
 ):
     name: str = "context_recall_classification"
-    instruction: str = (
-        "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
-    )
+    instruction: str = "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
     input_model = QCA
     output_model = ContextRecallClassifications
     examples = [
@@ -150,17 +148,17 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "set LLM before use"

         # run classification
-        classifications_list: t.List[ContextRecallClassifications] = (
-            await self.context_recall_prompt.generate_multiple(
-                data=QCA(
-                    question=row["user_input"],
-                    context="\n".join(row["retrieved_contexts"]),
-                    answer=row["reference"],
-                ),
-                llm=self.llm,
-                callbacks=callbacks,
-                n=self.reproducibility,
-            )
+        classifications_list: t.List[
+            ContextRecallClassifications
+        ] = await self.context_recall_prompt.generate_multiple(
+            data=QCA(
+                question=row["user_input"],
+                context="\n".join(row["retrieved_contexts"]),
+                answer=row["reference"],
+            ),
+            llm=self.llm,
+            callbacks=callbacks,
+            n=self.reproducibility,
         )
         classification_dicts = []
         for classification in classifications_list:
@@ -246,6 +244,3 @@ def _compute_score(self, verdict_list: t.List[float]) -> float:
         numerator = sum(response)
         score = numerator / denom if denom > 0 else np.nan
         return score
-
-
-context_recall = ContextRecall()
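
A minimal usage sketch of what this series means for callers (illustrative only, not part of either patch): llm and embeddings become keyword-only dataclass fields with no default on MetricWithLLM / MetricWithEmbeddings, and module-level instances such as answer_correctness and context_recall are removed, so metrics are now constructed explicitly. The llm_factory("gpt-4o") and embedding_factory() calls below mirror what the patch itself uses for the predefined aspect critics; the model choice and the public import path are assumptions, not taken from the diff.

    # Sketch: explicit metric construction after this change (model name is an assumption).
    from ragas.embeddings import embedding_factory
    from ragas.llms import llm_factory
    from ragas.metrics import AnswerCorrectness, ContextRecall

    # llm/embeddings are now keyword-only and required, so they are supplied at
    # construction time instead of being attached to the metric later.
    answer_correctness = AnswerCorrectness(
        llm=llm_factory("gpt-4o"),
        embeddings=embedding_factory(),
    )
    context_recall = ContextRecall(llm=llm_factory("gpt-4o"))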