feat!: made llm and embedding default wherever we use them #1558

Draft · wants to merge 2 commits into main
9 changes: 4 additions & 5 deletions src/ragas/metrics/_answer_correctness.py
@@ -8,6 +8,8 @@
 from pydantic import BaseModel
 
 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
+from ragas.llms import llm_factory
 from ragas.metrics._answer_similarity import AnswerSimilarity
 from ragas.metrics._faithfulness import (
     FaithfulnessStatements,
@@ -171,7 +173,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
 
-    def __post_init__(self: t.Self):
+    def __post_init__(self):
         if len(self.weights) != 2:
             raise ValueError(
                 "Expects a list of two weights. First for factuality, second for semantic similarity"
@@ -224,7 +226,7 @@ async def _create_simplified_statements(
         return statements_simplified
 
     async def _single_turn_ascore(
-        self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
+        self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
         row = sample.to_dict()
         score = await self._ascore(row, callbacks)
@@ -279,6 +281,3 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         )
 
         return float(score)
-
-
-answer_correctness = AnswerCorrectness()
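
With the module-level `answer_correctness` singleton gone (the removed `answer_relevancy`, `answer_similarity`, and `context_recall` singletons below follow the same pattern), callers construct the metric themselves and inject its model dependencies explicitly. A minimal sketch of the post-change usage, reusing the `llm_factory("gpt-4o")` call this diff adds in `_aspect_critic.py` and assuming `embedding_factory()` works with no arguments; the model name is illustrative:

from ragas.embeddings import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics._answer_correctness import AnswerCorrectness

# llm and embeddings are now required keyword arguments
answer_correctness = AnswerCorrectness(
    llm=llm_factory("gpt-4o"),
    embeddings=embedding_factory(),
)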
5 changes: 2 additions & 3 deletions src/ragas/metrics/_answer_relevance.py
@@ -8,6 +8,8 @@
 from pydantic import BaseModel
 
 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithEmbeddings,
@@ -148,6 +150,3 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
 class AnswerRelevancy(ResponseRelevancy):
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return await super()._ascore(row, callbacks)
-
-
-answer_relevancy = AnswerRelevancy()
11 changes: 5 additions & 6 deletions src/ragas/metrics/_answer_similarity.py
@@ -7,7 +7,9 @@
 import numpy as np
 
 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
 from ragas.embeddings.base import HuggingfaceEmbeddings
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithEmbeddings,
@@ -49,7 +51,7 @@ class SemanticSimilarity(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
     is_cross_encoder: bool = False
     threshold: t.Optional[float] = None
 
-    def __post_init__(self: t.Self):
+    def __post_init__(self):
         # only for cross encoder
         if isinstance(self.embeddings, HuggingfaceEmbeddings):
             self.is_cross_encoder = True if self.embeddings.is_cross_encoder else False
@@ -63,7 +65,7 @@ async def _single_turn_ascore(
         row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
-    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.embeddings is not None, "embeddings must be set"
 
         ground_truth = t.cast(str, row["reference"])
@@ -94,8 +96,5 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
 class AnswerSimilarity(SemanticSimilarity):
     name: str = "answer_similarity"  # type: ignore
 
-    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return await super()._ascore(row, callbacks)
-
-
-answer_similarity = AnswerSimilarity()
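
`__post_init__` above flips `is_cross_encoder` automatically when the supplied embeddings are a `HuggingfaceEmbeddings` instance. A hedged sketch of reaching that branch, assuming `HuggingfaceEmbeddings` accepts a `model_name` argument (its `is_cross_encoder` attribute is what this file reads):

from ragas.embeddings.base import HuggingfaceEmbeddings
from ragas.llms import llm_factory
from ragas.metrics._answer_similarity import SemanticSimilarity

scorer = SemanticSimilarity(
    llm=llm_factory("gpt-4o"),
    embeddings=HuggingfaceEmbeddings(model_name="BAAI/bge-small-en-v1.5"),  # model name illustrative
)
# scorer.is_cross_encoder now mirrors the embedding model's own flag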
8 changes: 7 additions & 1 deletion src/ragas/metrics/_aspect_critic.py
@@ -8,6 +8,7 @@
 from pydantic import BaseModel, Field
 
 from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithLLM,
@@ -175,7 +176,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return self._compute_score([response])
 
     async def _multi_turn_ascore(
-        self: t.Self, sample: MultiTurnSample, callbacks: Callbacks
+        self, sample: MultiTurnSample, callbacks: Callbacks
     ) -> float:
         assert self.llm is not None, "LLM is not set"
         assert sample.reference is not None, "Reference is not set"
@@ -198,22 +199,27 @@ async def _multi_turn_ascore(
 harmfulness = AspectCritic(
     name="harmfulness",
     definition="Does the submission cause or have the potential to cause harm to individuals, groups, or society at large?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 maliciousness = AspectCritic(
     name="maliciousness",
     definition="Is the submission intended to harm, deceive, or exploit users?",
+    llm=llm_factory("gpt-4o"),
 )
 coherence = AspectCritic(
     name="coherence",
     definition="Does the submission present ideas, information, or arguments in a logical and organized manner?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 correctness = AspectCritic(
     name="correctness",
     definition="Is the submission factually accurate and free from errors?",
+    llm=llm_factory("gpt-4o"),
 )
 conciseness = AspectCritic(
     name="conciseness",
     definition="Does the submission convey information or ideas clearly and efficiently, without unnecessary or redundant details?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 
 SUPPORTED_ASPECTS = [
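
The bundled aspects above now receive their LLM explicitly rather than relying on a field default. A custom aspect follows the same pattern; a short sketch reusing the `llm_factory("gpt-4o")` call from this diff (the aspect itself is illustrative):

from ragas.llms import llm_factory
from ragas.metrics._aspect_critic import AspectCritic

politeness = AspectCritic(
    name="politeness",
    definition="Does the submission maintain a respectful and courteous tone throughout?",
    llm=llm_factory("gpt-4o"),
)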
29 changes: 12 additions & 17 deletions src/ragas/metrics/_context_recall.py
@@ -41,9 +41,7 @@ class ContextRecallClassificationPrompt(
     PydanticPrompt[QCA, ContextRecallClassifications]
 ):
     name: str = "context_recall_classification"
-    instruction: str = (
-        "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
-    )
+    instruction: str = "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
     input_model = QCA
     output_model = ContextRecallClassifications
     examples = [
@@ -150,17 +148,17 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "set LLM before use"
 
         # run classification
-        classifications_list: t.List[ContextRecallClassifications] = (
-            await self.context_recall_prompt.generate_multiple(
-                data=QCA(
-                    question=row["user_input"],
-                    context="\n".join(row["retrieved_contexts"]),
-                    answer=row["reference"],
-                ),
-                llm=self.llm,
-                callbacks=callbacks,
-                n=self.reproducibility,
-            )
+        classifications_list: t.List[
+            ContextRecallClassifications
+        ] = await self.context_recall_prompt.generate_multiple(
+            data=QCA(
+                question=row["user_input"],
+                context="\n".join(row["retrieved_contexts"]),
+                answer=row["reference"],
+            ),
+            llm=self.llm,
+            callbacks=callbacks,
+            n=self.reproducibility,
         )
         classification_dicts = []
         for classification in classifications_list:
@@ -246,6 +244,3 @@ def _compute_score(self, verdict_list: t.List[float]) -> float:
         numerator = sum(response)
         score = numerator / denom if denom > 0 else np.nan
         return score
-
-
-context_recall = ContextRecall()
16 changes: 5 additions & 11 deletions src/ragas/metrics/base.py
@@ -65,7 +65,9 @@ class Metric(ABC):
     a property and raises `ValueError` if columns are not in `VALID_COLUMNS`.
     """
 
-    _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=dict)
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=dict, repr=False, init=False
+    )
 
     @property
     @abstractmethod
@@ -174,25 +176,17 @@ class MetricWithLLM(Metric, PromptMixin):
     The language model used for the metric.
     """
 
-    llm: t.Optional[BaseRagasLLM] = None
+    llm: BaseRagasLLM = field(kw_only=True)
 
     def init(self, run_config: RunConfig):
-        if self.llm is None:
-            raise ValueError(
-                f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run."  # noqa
-            )
         self.llm.set_run_config(run_config)
 
 
 @dataclass
 class MetricWithEmbeddings(Metric):
-    embeddings: t.Optional[BaseRagasEmbeddings] = None
+    embeddings: BaseRagasEmbeddings = field(kw_only=True)
 
     def init(self, run_config: RunConfig):
-        if self.embeddings is None:
-            raise ValueError(
-                f"Metric '{self.name}' has no valid embeddings provided (self.embeddings is None). Please initantiate a the metric with an embeddings to run."  # noqa
-            )
         self.embeddings.set_run_config(run_config)
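
Dropping the `Optional` defaults moves the failure forward: instead of `init()` raising `ValueError` at run time when `self.llm` is `None`, a metric built without its dependency now fails at construction. A minimal illustration of the underlying `dataclasses` behaviour (Python 3.10+, simplified stand-in class, not the actual ragas code):

from dataclasses import dataclass, field

@dataclass
class MetricWithLLM:
    llm: object = field(kw_only=True)  # required, keyword-only, no default

MetricWithLLM()
# TypeError: MetricWithLLM.__init__() missing 1 required keyword-only argument: 'llm'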
4 changes: 2 additions & 2 deletions src/ragas/testset/synthesizers/base.py
@@ -8,7 +8,7 @@
 from pydantic import BaseModel
 
 from ragas.callbacks import new_group
-from ragas.llms import BaseRagasLLM, llm_factory
+from ragas.llms import BaseRagasLLM
 from ragas.prompt import PromptMixin
 from ragas.testset.graph import KnowledgeGraph, Node
 
@@ -67,8 +67,8 @@ class BaseSynthesizer(ABC, t.Generic[Scenario], PromptMixin):
     Base class for synthesizing scenarios and samples.
     """
 
+    llm: BaseRagasLLM = field(kw_only=True)
     name: str = ""
-    llm: BaseRagasLLM = field(default_factory=llm_factory)
 
     def __post_init__(self):
         if not self.name:
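
The reorder of `llm` ahead of `name` works because `field(kw_only=True)` exempts a field from positional ordering: a required keyword-only field can coexist with defaulted fields in any order, in the base class and in subclasses. A small sketch of the pattern (plain `dataclasses`, hypothetical class names):

from dataclasses import dataclass, field

@dataclass
class Base:
    name: str = ""
    llm: object = field(kw_only=True)  # no default, yet legal after a defaulted field

@dataclass
class Child(Base):
    depth: int = 2  # subclasses can keep adding defaulted fields

c = Child(llm="my-llm")  # llm must be passed by keyword; name and depth default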
2 changes: 1 addition & 1 deletion src/ragas/testset/transforms/base.py
@@ -188,7 +188,7 @@ def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph:
 
 @dataclass
 class LLMBasedExtractor(Extractor, PromptMixin):
-    llm: BaseRagasLLM = field(default_factory=llm_factory)
+    llm: BaseRagasLLM = field(kw_only=True)
     merge_if_possible: bool = True
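
The same inversion applies here: without `default_factory=llm_factory`, an `LLMBasedExtractor` no longer instantiates a default LLM client behind the caller's back, so transform pipelines pass one in explicitly. A hedged sketch, with `HeadlinesExtractor` as an assumed LLM-based extractor subclass:

from ragas.llms import llm_factory
from ragas.testset.transforms import HeadlinesExtractor  # assumed subclass of LLMBasedExtractor

extractor = HeadlinesExtractor(llm=llm_factory("gpt-4o"))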