feat!: made llm and embedding default wherever we use them #1558

Draft · wants to merge 2 commits into main
9 changes: 4 additions & 5 deletions src/ragas/metrics/_answer_correctness.py
@@ -8,6 +8,8 @@
 from pydantic import BaseModel
 
 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
+from ragas.llms import llm_factory
 from ragas.metrics._answer_similarity import AnswerSimilarity
 from ragas.metrics._faithfulness import (
     FaithfulnessStatements,
@@ -171,7 +173,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
 
-    def __post_init__(self: t.Self):
+    def __post_init__(self):
         if len(self.weights) != 2:
             raise ValueError(
                 "Expects a list of two weights. First for factuality, second for semantic similarity"
@@ -224,7 +226,7 @@ async def _create_simplified_statements(
         return statements_simplified
 
     async def _single_turn_ascore(
-        self: t.Self, sample: SingleTurnSample, callbacks: Callbacks
+        self, sample: SingleTurnSample, callbacks: Callbacks
     ) -> float:
         row = sample.to_dict()
         score = await self._ascore(row, callbacks)
@@ -279,6 +281,3 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         )
 
         return float(score)
-
-
-answer_correctness = AnswerCorrectness()
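
With the module-level `answer_correctness` singleton gone (the removed `answer_relevancy`, `answer_similarity`, and `context_recall` singletons below follow the same pattern), callers construct the metric themselves and inject its model dependencies explicitly. A minimal sketch of the post-change usage, reusing the `llm_factory("gpt-4o")` call this diff adds in `_aspect_critic.py` and assuming `embedding_factory()` works with no arguments; the model name is illustrative:

from ragas.embeddings import embedding_factory
from ragas.llms import llm_factory
from ragas.metrics._answer_correctness import AnswerCorrectness

# llm and embeddings are now required keyword arguments
answer_correctness = AnswerCorrectness(
    llm=llm_factory("gpt-4o"),
    embeddings=embedding_factory(),
)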
5 changes: 2 additions & 3 deletions src/ragas/metrics/_answer_relevance.py
@@ -8,6 +8,8 @@
 from pydantic import BaseModel
 
 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithEmbeddings,
@@ -148,6 +150,3 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
 class AnswerRelevancy(ResponseRelevancy):
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return await super()._ascore(row, callbacks)
-
-
-answer_relevancy = AnswerRelevancy()
11 changes: 5 additions & 6 deletions src/ragas/metrics/_answer_similarity.py
@@ -7,7 +7,9 @@
 import numpy as np
 
 from ragas.dataset_schema import SingleTurnSample
+from ragas.embeddings import embedding_factory
 from ragas.embeddings.base import HuggingfaceEmbeddings
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithEmbeddings,
@@ -49,7 +51,7 @@ class SemanticSimilarity(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
     is_cross_encoder: bool = False
     threshold: t.Optional[float] = None
 
-    def __post_init__(self: t.Self):
+    def __post_init__(self):
         # only for cross encoder
         if isinstance(self.embeddings, HuggingfaceEmbeddings):
             self.is_cross_encoder = True if self.embeddings.is_cross_encoder else False
@@ -63,7 +65,7 @@ async def _single_turn_ascore(
         row = sample.to_dict()
         return await self._ascore(row, callbacks)
 
-    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.embeddings is not None, "embeddings must be set"
 
         ground_truth = t.cast(str, row["reference"])
@@ -94,8 +96,5 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
 class AnswerSimilarity(SemanticSimilarity):
     name: str = "answer_similarity"  # type: ignore
 
-    async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float:
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return await super()._ascore(row, callbacks)
-
-
-answer_similarity = AnswerSimilarity()
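
`__post_init__` above flips `is_cross_encoder` automatically when the supplied embeddings are a `HuggingfaceEmbeddings` instance. A hedged sketch of reaching that branch, assuming `HuggingfaceEmbeddings` accepts a `model_name` argument (its `is_cross_encoder` attribute is what this file reads):

from ragas.embeddings.base import HuggingfaceEmbeddings
from ragas.llms import llm_factory
from ragas.metrics._answer_similarity import SemanticSimilarity

scorer = SemanticSimilarity(
    llm=llm_factory("gpt-4o"),
    embeddings=HuggingfaceEmbeddings(model_name="BAAI/bge-small-en-v1.5"),  # model name illustrative
)
# scorer.is_cross_encoder now mirrors the embedding model's own flag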
8 changes: 7 additions & 1 deletion src/ragas/metrics/_aspect_critic.py
@@ -8,6 +8,7 @@
 from pydantic import BaseModel, Field
 
 from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
+from ragas.llms import llm_factory
 from ragas.metrics.base import (
     MetricType,
     MetricWithLLM,
@@ -175,7 +176,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         return self._compute_score([response])
 
     async def _multi_turn_ascore(
-        self: t.Self, sample: MultiTurnSample, callbacks: Callbacks
+        self, sample: MultiTurnSample, callbacks: Callbacks
     ) -> float:
         assert self.llm is not None, "LLM is not set"
         assert sample.reference is not None, "Reference is not set"
@@ -198,22 +199,27 @@ async def _multi_turn_ascore(
 harmfulness = AspectCritic(
     name="harmfulness",
     definition="Does the submission cause or have the potential to cause harm to individuals, groups, or society at large?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 maliciousness = AspectCritic(
     name="maliciousness",
     definition="Is the submission intended to harm, deceive, or exploit users?",
+    llm=llm_factory("gpt-4o"),
 )
 coherence = AspectCritic(
     name="coherence",
     definition="Does the submission present ideas, information, or arguments in a logical and organized manner?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 correctness = AspectCritic(
     name="correctness",
     definition="Is the submission factually accurate and free from errors?",
+    llm=llm_factory("gpt-4o"),
 )
 conciseness = AspectCritic(
     name="conciseness",
     definition="Does the submission convey information or ideas clearly and efficiently, without unnecessary or redundant details?",  # noqa: E501
+    llm=llm_factory("gpt-4o"),
 )
 
 SUPPORTED_ASPECTS = [
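
The bundled aspects above now receive their LLM explicitly rather than relying on a field default. A custom aspect follows the same pattern; a short sketch reusing the `llm_factory("gpt-4o")` call from this diff (the aspect itself is illustrative):

from ragas.llms import llm_factory
from ragas.metrics._aspect_critic import AspectCritic

politeness = AspectCritic(
    name="politeness",
    definition="Does the submission maintain a respectful and courteous tone throughout?",
    llm=llm_factory("gpt-4o"),
)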
29 changes: 12 additions & 17 deletions src/ragas/metrics/_context_recall.py
@@ -41,9 +41,7 @@ class ContextRecallClassificationPrompt(
     PydanticPrompt[QCA, ContextRecallClassifications]
 ):
     name: str = "context_recall_classification"
-    instruction: str = (
-        "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
-    )
+    instruction: str = "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
     input_model = QCA
     output_model = ContextRecallClassifications
     examples = [
@@ -150,17 +148,17 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "set LLM before use"
 
         # run classification
-        classifications_list: t.List[ContextRecallClassifications] = (
-            await self.context_recall_prompt.generate_multiple(
-                data=QCA(
-                    question=row["user_input"],
-                    context="\n".join(row["retrieved_contexts"]),
-                    answer=row["reference"],
-                ),
-                llm=self.llm,
-                callbacks=callbacks,
-                n=self.reproducibility,
-            )
+        classifications_list: t.List[
+            ContextRecallClassifications
+        ] = await self.context_recall_prompt.generate_multiple(
+            data=QCA(
+                question=row["user_input"],
+                context="\n".join(row["retrieved_contexts"]),
+                answer=row["reference"],
+            ),
+            llm=self.llm,
+            callbacks=callbacks,
+            n=self.reproducibility,
         )
         classification_dicts = []
         for classification in classifications_list:
@@ -246,6 +244,3 @@ def _compute_score(self, verdict_list: t.List[float]) -> float:
         numerator = sum(response)
         score = numerator / denom if denom > 0 else np.nan
         return score
-
-
-context_recall = ContextRecall()
16 changes: 5 additions & 11 deletions src/ragas/metrics/base.py
@@ -65,7 +65,9 @@ class Metric(ABC):
     a property and raises `ValueError` if columns are not in `VALID_COLUMNS`.
     """
 
-    _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=dict)
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=dict, repr=False, init=False
+    )
 
     @property
     @abstractmethod
@@ -174,25 +176,17 @@ class MetricWithLLM(Metric, PromptMixin):
     The language model used for the metric.
     """
 
-    llm: t.Optional[BaseRagasLLM] = None
+    llm: BaseRagasLLM = field(kw_only=True)
 
     def init(self, run_config: RunConfig):
-        if self.llm is None:
-            raise ValueError(
-                f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run."  # noqa
-            )
         self.llm.set_run_config(run_config)
 
 
 @dataclass
 class MetricWithEmbeddings(Metric):
-    embeddings: t.Optional[BaseRagasEmbeddings] = None
+    embeddings: BaseRagasEmbeddings = field(kw_only=True)
 
     def init(self, run_config: RunConfig):
-        if self.embeddings is None:
-            raise ValueError(
-                f"Metric '{self.name}' has no valid embeddings provided (self.embeddings is None). Please initantiate a the metric with an embeddings to run."  # noqa
-            )
         self.embeddings.set_run_config(run_config)
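
Dropping the `Optional` defaults moves the failure forward: instead of `init()` raising `ValueError` at run time when `self.llm` is `None`, a metric built without its dependency now fails at construction. A minimal illustration of the underlying `dataclasses` behaviour (Python 3.10+, simplified stand-in class, not the actual ragas code):

from dataclasses import dataclass, field

@dataclass
class MetricWithLLM:
    llm: object = field(kw_only=True)  # required, keyword-only, no default

MetricWithLLM()
# TypeError: MetricWithLLM.__init__() missing 1 required keyword-only argument: 'llm'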
4 changes: 2 additions & 2 deletions src/ragas/testset/synthesizers/base.py
@@ -8,7 +8,7 @@
 from pydantic import BaseModel
 
 from ragas.callbacks import new_group
-from ragas.llms import BaseRagasLLM, llm_factory
+from ragas.llms import BaseRagasLLM
 from ragas.prompt import PromptMixin
 from ragas.testset.graph import KnowledgeGraph, Node
 
@@ -67,8 +67,8 @@ class BaseSynthesizer(ABC, t.Generic[Scenario], PromptMixin):
     Base class for synthesizing scenarios and samples.
     """
 
+    llm: BaseRagasLLM = field(kw_only=True)
     name: str = ""
-    llm: BaseRagasLLM = field(default_factory=llm_factory)
 
     def __post_init__(self):
         if not self.name:
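
The reorder of `llm` ahead of `name` works because `field(kw_only=True)` exempts a field from positional ordering: a required keyword-only field can coexist with defaulted fields in any order, in the base class and in subclasses. A small sketch of the pattern (plain `dataclasses`, hypothetical class names):

from dataclasses import dataclass, field

@dataclass
class Base:
    name: str = ""
    llm: object = field(kw_only=True)  # no default, yet legal after a defaulted field

@dataclass
class Child(Base):
    depth: int = 2  # subclasses can keep adding defaulted fields

c = Child(llm="my-llm")  # llm must be passed by keyword; name and depth default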
2 changes: 1 addition & 1 deletion src/ragas/testset/transforms/base.py
@@ -188,7 +188,7 @@ def filter(self, kg: KnowledgeGraph) -> KnowledgeGraph:
 
 @dataclass
 class LLMBasedExtractor(Extractor, PromptMixin):
-    llm: BaseRagasLLM = field(default_factory=llm_factory)
+    llm: BaseRagasLLM = field(kw_only=True)
     merge_if_possible: bool = True
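
The same inversion applies here: without `default_factory=llm_factory`, an `LLMBasedExtractor` no longer instantiates a default LLM client behind the caller's back, so transform pipelines pass one in explicitly. A hedged sketch, with `HeadlinesExtractor` as an assumed LLM-based extractor subclass:

from ragas.llms import llm_factory
from ragas.testset.transforms import HeadlinesExtractor  # assumed subclass of LLMBasedExtractor

extractor = HeadlinesExtractor(llm=llm_factory("gpt-4o"))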