From 0c0c2728640b53acb06ecc8f9f183ae57b6d7f63 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Sat, 21 Sep 2024 12:21:19 +0530
Subject: [PATCH 1/5] add components

---
 src/ragas/experimental/metrics/component.py | 158 ++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 src/ragas/experimental/metrics/component.py

diff --git a/src/ragas/experimental/metrics/component.py b/src/ragas/experimental/metrics/component.py
new file mode 100644
index 000000000..1d8d8b746
--- /dev/null
+++ b/src/ragas/experimental/metrics/component.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+import logging
+import typing as t
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from pydantic import BaseModel, Field
+
+from ragas.experimental.llms.prompt import PydanticPrompt
+from ragas.llms.base import BaseRagasLLM
+
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
+logger = logging.getLogger(__name__)
+
+
+class StatementFaithfulnessAnswer(BaseModel):
+    statement: str = Field(..., description="the original statement, word-by-word")
+    reason: str = Field(..., description="the reason for the verdict")
+    verdict: int = Field(..., description="the verdict (0/1) of the faithfulness")
+
+
+class NLIStatementOutput(BaseModel):
+    statements: t.List[StatementFaithfulnessAnswer]
+
+
+class NLIStatementInput(BaseModel):
+    context: str = Field(..., description="The context of the question")
+    statements: t.List[str] = Field(..., description="The statements to judge")
+
+
+class NLIStatementPrompt(PydanticPrompt[NLIStatementInput, NLIStatementOutput]):
+    instruction = "Your task is to judge the faithfulness of a series of statements based on a given context. For each statement you must return a verdict of 1 if the statement can be directly inferred from the context, or 0 if the statement cannot be directly inferred from the context."
+    input_model = NLIStatementInput
+    output_model = NLIStatementOutput
+    examples = [
+        (
+            NLIStatementInput(
+                context="""John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""",
+                statements=[
+                    "John is majoring in Biology.",
+                    "John is taking a course on Artificial Intelligence.",
+                    "John is a dedicated student.",
+                    "John has a part-time job.",
+                ],
+            ),
+            NLIStatementOutput(
+                statements=[
+                    StatementFaithfulnessAnswer(
+                        statement="John is majoring in Biology.",
+                        reason="John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
+                        verdict=0,
+                    ),
+                    StatementFaithfulnessAnswer(
+                        statement="John is taking a course on Artificial Intelligence.",
+                        reason="The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
+                        verdict=0,
+                    ),
+                    StatementFaithfulnessAnswer(
+                        statement="John is a dedicated student.",
+                        reason="The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
+                        verdict=1,
+                    ),
+                    StatementFaithfulnessAnswer(
+                        statement="John has a part-time job.",
+                        reason="There is no information given in the context about John having a part-time job.",
+                        verdict=0,
+                    ),
+                ]
+            ),
+        ),
+        (
+            NLIStatementInput(
+                context="Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.",
+                statements=[
+                    "Albert Einstein was a genius.",
+                ],
+            ),
+            NLIStatementOutput(
+                statements=[
+                    StatementFaithfulnessAnswer(
+                        statement="Albert Einstein was a genius.",
+                        reason="The context and statement are unrelated",
+                        verdict=0,
+                    )
+                ]
+            ),
+        ),
+    ]


+class BaseNLIComponent(ABC):
+    @abstractmethod
+    async def apply(
+        self, hypothesis: str, premises: t.List[str], callbacks: Callbacks
+    ) -> t.List[bool]:
+        """
+        Apply the NLI component to a list of premises and a hypothesis.
+        """
+        raise NotImplementedError("apply method must be implemented by subclasses")
+
+
+@dataclass
+class LLMNLIComponent(BaseNLIComponent):
+    llm: BaseRagasLLM
+    nli_prompt: PydanticPrompt = NLIStatementPrompt()
+
+    async def apply(
+        self, hypothesis: str, premises: t.List[str], callbacks: Callbacks
+    ) -> t.List[bool]:
+        assert self.llm is not None, "LLM must be set"
+        prompt_input = NLIStatementInput(context=hypothesis, statements=premises)
+        response = await self.nli_prompt.generate(
+            data=prompt_input, llm=self.llm, callbacks=callbacks
+        )
+        return [bool(result.verdict) for result in response.statements]
+
+
+@dataclass
+class SequenceClassificationNLIComponent(BaseNLIComponent):
+    pretrained_model_name_or_path: str = "vectara/hallucination_evaluation_model"
+    batch_size: int = 32
+    device: str = "cpu"
+
+    def __post_init__(self):
+        try:
+            from transformers import AutoModelForSequenceClassification
+        except ImportError:
+            raise ImportError(
+                "Huggingface transformers must be installed to use this feature, try `pip install transformers`"
+            )
+        except Exception as e:
+            raise RuntimeError("Failed to load the model") from e
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            self.pretrained_model_name_or_path, trust_remote_code=True
+        )
+        self.model.to(self.device)
+
+    def _create_batch(
+        self, pairs: t.List[t.Tuple[str, str]]
+    ) -> t.Generator[t.List[t.Tuple[str, str]], None, None]:
+        length_of_pairs = len(pairs)
+        for ndx in range(0, length_of_pairs, self.batch_size):
+            yield pairs[ndx : min(ndx + self.batch_size, length_of_pairs)]
+
+    async def apply(
+        self, hypothesis: str, premises: t.List[str], callbacks: Callbacks
+    ) -> t.List[bool]:
+        scores = []
+        pairs = [(hypothesis, premise) for premise in premises]
+        batch_pairs = self._create_batch(pairs)
+        for input_pairs in batch_pairs:  # to avoid OOM
+            batch_scores = self.model.predict(input_pairs).cpu().detach().round()
+            scores += batch_scores
+
+        return [bool(score) for score in scores]
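Usage sketch (not part of the patch): driving the new LLMNLIComponent directly, assuming `evaluator_llm` is an already-configured BaseRagasLLM instance. Note the argument naming at this stage of the series: `hypothesis` carries the single context string and `premises` the list of statements to judge (patch 3 later swaps these roles).

    import asyncio

    from ragas.experimental.metrics.component import LLMNLIComponent

    # `evaluator_llm` is an assumed, already-constructed BaseRagasLLM.
    nli = LLMNLIComponent(llm=evaluator_llm)
    verdicts = asyncio.run(
        nli.apply(
            hypothesis="John is a Computer Science student who studies hard.",
            premises=["John is majoring in Biology.", "John is a dedicated student."],
            callbacks=[],  # or whatever callbacks your evaluation run supplies
        )
    )
    # one bool per statement, e.g. [False, True]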
From 729581b55ded9061576382f5b10426e9d4f6338e Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Sat, 21 Sep 2024 12:21:41 +0530
Subject: [PATCH 2/5] add nli component to experimental faithfulness

---
 .../experimental/metrics/_faithfulness.py | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/ragas/experimental/metrics/_faithfulness.py b/src/ragas/experimental/metrics/_faithfulness.py
index 3794ae75e..8e0429d5d 100644
--- a/src/ragas/experimental/metrics/_faithfulness.py
+++ b/src/ragas/experimental/metrics/_faithfulness.py
@@ -8,6 +8,7 @@
 from pydantic import BaseModel, Field
 
 from ragas.experimental.llms.prompt import PydanticPrompt
+from ragas.experimental.metrics.component import BaseNLIComponent, LLMNLIComponent
 from ragas.metrics.base import (
     MetricType,
     MetricWithLLM,
@@ -166,6 +167,7 @@ class FaithfulnessExperimental(MetricWithLLM, SingleTurnMetric):
             MetricType.SINGLE_TURN: {"user_input", "response", "retrieved_contexts"}
         }
     )
+    nli_component: t.Optional[BaseNLIComponent] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
     _reproducibility: int = 1
@@ -188,7 +190,7 @@ def reproducibility(self, value):
 
     def __post_init__(self):
         self.long_form_answer_prompt = LongFormAnswerPrompt()
-        self.nli_statement_prompt = NLIStatementPrompt()
+        self.nli_component = self.nli_component or LLMNLIComponent(llm=self.llm)  # type: ignore
         if self.sentence_segmenter is None:
             # TODO: make this dynamic, taking language from prompt
             language = "english"
@@ -196,13 +198,14 @@ def __post_init__(self):
 
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "LLM is not set"
+        assert self.nli_component is not None, "NLI component is not set"
         answer, question, contexts = (
             row["response"],
             row["user_input"],
             row["retrieved_contexts"],
         )
-
+        contexts = "\n".join(contexts)
         # get the sentences from the answer
         if self.sentence_segmenter is None:
             raise ValueError("Sentence segmenter is not set")
@@ -226,19 +229,12 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
             for component in sentence_components.sentences
             for statement in component.simpler_statements
         ]
-        verdicts = await self.nli_statement_prompt.generate(
-            data=NLIStatementInput(
-                context="\n".join(contexts),
-                statements=statements,
-            ),
-            llm=self.llm,
-            callbacks=callbacks,
+        verdicts = await self.nli_component.apply(
+            hypothesis=contexts, premises=statements, callbacks=callbacks
         )
 
         # compute the score
-        num_faithful_statements = sum(
-            verdict.verdict for verdict in verdicts.statements
-        )
+        num_faithful_statements = sum(verdicts)
         if len(statements):
            score = num_faithful_statements / len(statements)
         else:
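For reference, a minimal sketch of the wiring this patch introduces: FaithfulnessExperimental now delegates verdicts to a pluggable BaseNLIComponent, and when no component is supplied `__post_init__` falls back to an LLMNLIComponent built from the metric's own LLM. As above, `evaluator_llm` is an assumed, already-configured BaseRagasLLM.

    from ragas.experimental.metrics._faithfulness import FaithfulnessExperimental
    from ragas.experimental.metrics.component import LLMNLIComponent

    metric = FaithfulnessExperimental(llm=evaluator_llm)
    assert isinstance(metric.nli_component, LLMNLIComponent)  # default LLM judge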
From 0860ea195f3da6218fb5bd0babfa5cf0c84ce54a Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Sat, 21 Sep 2024 16:14:49 +0530
Subject: [PATCH 3/5] add textclass pipeline

---
 src/ragas/experimental/metrics/component.py | 79 ++++++++++++---------
 1 file changed, 47 insertions(+), 32 deletions(-)

diff --git a/src/ragas/experimental/metrics/component.py b/src/ragas/experimental/metrics/component.py
index 1d8d8b746..563fc4a20 100644
--- a/src/ragas/experimental/metrics/component.py
+++ b/src/ragas/experimental/metrics/component.py
@@ -3,9 +3,10 @@
 import logging
 import typing as t
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 from pydantic import BaseModel, Field
+from transformers import AutoTokenizer, Pipeline, pipeline
 
 from ragas.experimental.llms.prompt import PydanticPrompt
 from ragas.llms.base import BaseRagasLLM
@@ -94,7 +95,7 @@ class NLIStatementPrompt(PydanticPrompt[NLIStatementInput, NLIStatementOutput]):
 class BaseNLIComponent(ABC):
     @abstractmethod
     async def apply(
-        self, hypothesis: str, premises: t.List[str], callbacks: Callbacks
+        self, hypothesis: t.List[str], premise: str, callbacks: Callbacks
     ) -> t.List[bool]:
         """
         Apply the NLI component to a list of premises and a hypothesis.
@@ -108,10 +109,10 @@ class LLMNLIComponent(BaseNLIComponent):
     nli_prompt: PydanticPrompt = NLIStatementPrompt()
 
     async def apply(
-        self, hypothesis: str, premises: t.List[str], callbacks: Callbacks
+        self, hypothesis: t.List[str], premise: str, callbacks: Callbacks
     ) -> t.List[bool]:
         assert self.llm is not None, "LLM must be set"
-        prompt_input = NLIStatementInput(context=hypothesis, statements=premises)
+        prompt_input = NLIStatementInput(context=premise, statements=hypothesis)
         response = await self.nli_prompt.generate(
             data=prompt_input, llm=self.llm, callbacks=callbacks
         )
@@ -119,40 +120,54 @@ async def apply(
 
 
 @dataclass
-class SequenceClassificationNLIComponent(BaseNLIComponent):
-    pretrained_model_name_or_path: str = "vectara/hallucination_evaluation_model"
+class TextClassificationNLIComponent(BaseNLIComponent):
+    hf_pipeline: Pipeline
+    prompt: str
+    label: str
     batch_size: int = 32
-    device: str = "cpu"
+    model_kwargs: t.Dict[str, t.Any] = field(default_factory=dict)
 
     def __post_init__(self):
-        try:
-            from transformers import AutoModelForSequenceClassification
-        except ImportError:
-            raise ImportError(
-                "Huggingface transformers must be installed to use this feature, try `pip install transformers`"
-            )
-        except Exception as e:
-            raise RuntimeError("Failed to load the model") from e
-        self.model = AutoModelForSequenceClassification.from_pretrained(
-            self.pretrained_model_name_or_path, trust_remote_code=True
+        if "{premise}" not in self.prompt or "{hypothesis}" not in self.prompt:
+            raise ValueError("Prompt must contain '{premise}' and '{hypothesis}'")
+
+        self.model_kwargs["top_k"] = 1
+
+    @classmethod
+    def from_model_id(
+        cls,
+        model_id: str,
+        prompt: str,
+        label: str,
+        model_kwargs: t.Dict[str, t.Any] = {},
+        pipeline_kwargs: t.Dict[str, t.Any] = {},
+    ) -> TextClassificationNLIComponent:
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        hf_pipeline = pipeline(
+            "text-classification",
+            model=model_id,
+            tokenizer=tokenizer,
+            **pipeline_kwargs,
         )
-        self.model.to(self.device)
 
-    def _create_batch(
-        self, pairs: t.List[t.Tuple[str, str]]
-    ) -> t.Generator[t.List[t.Tuple[str, str]], None, None]:
-        length_of_pairs = len(pairs)
-        for ndx in range(0, length_of_pairs, self.batch_size):
-            yield pairs[ndx : min(ndx + self.batch_size, length_of_pairs)]
+        return cls(
+            hf_pipeline=hf_pipeline,
+            prompt=prompt,
+            label=label,
+            model_kwargs=model_kwargs,
+        )
 
     async def apply(
-        self, hypothesis: str, premises: t.List[str], callbacks: Callbacks
+        self, hypothesis: t.List[str], premise: str, callbacks: Callbacks = None
    ) -> t.List[bool]:
         scores = []
-        pairs = [(hypothesis, premise) for premise in premises]
-        batch_pairs = self._create_batch(pairs)
-        for input_pairs in batch_pairs:  # to avoid OOM
-            batch_scores = self.model.predict(input_pairs).cpu().detach().round()
-            scores += batch_scores
-
-        return [bool(score) for score in scores]
+        prompt_input_list = [
+            self.prompt.format(hypothesis=text, premise=premise) for text in hypothesis
+        ]
+        for i in range(0, len(prompt_input_list), self.batch_size):
+            prompt_input_list_batch = prompt_input_list[i : i + self.batch_size]
+            response = self.hf_pipeline(prompt_input_list_batch, **self.model_kwargs)
+            response = [item[0]["label"] == self.label for item in response]
+            scores.extend(response)
+
+        return scores
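Construction sketch for the new component. The model id, prompt template, and label below are placeholders, not tested values: any Hugging Face text-classification NLI model should slot in, with `prompt` describing how premise and hypothesis are packed into a single input string and `label` naming the class the model emits when the statement is entailed.

    from ragas.experimental.metrics.component import TextClassificationNLIComponent

    nli = TextClassificationNLIComponent.from_model_id(
        model_id="some-org/nli-classifier",  # hypothetical checkpoint
        prompt="premise: {premise} hypothesis: {hypothesis}",
        label="entailment",  # top predicted class meaning "supported"
        pipeline_kwargs={"device": -1},  # -1 runs the pipeline on CPU
    )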
From 579fff4d9b343f7234f2fd7cf6c547946bc8e6b3 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Sat, 21 Sep 2024 16:14:58 +0530
Subject: [PATCH 4/5] fix args

---
 src/ragas/experimental/metrics/_faithfulness.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/ragas/experimental/metrics/_faithfulness.py b/src/ragas/experimental/metrics/_faithfulness.py
index 8e0429d5d..0362b6cf9 100644
--- a/src/ragas/experimental/metrics/_faithfulness.py
+++ b/src/ragas/experimental/metrics/_faithfulness.py
@@ -230,7 +230,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
             for statement in component.simpler_statements
         ]
         verdicts = await self.nli_component.apply(
-            hypothesis=contexts, premises=statements, callbacks=callbacks
+            hypothesis=statements, premise=contexts, callbacks=callbacks
         )
 
         # compute the score

From 701784b030431bf1de72a08d3bba014af96daf4c Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Sat, 21 Sep 2024 16:25:24 +0530
Subject: [PATCH 5/5] add assert

---
 src/ragas/experimental/metrics/component.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/ragas/experimental/metrics/component.py b/src/ragas/experimental/metrics/component.py
index 563fc4a20..749be31ef 100644
--- a/src/ragas/experimental/metrics/component.py
+++ b/src/ragas/experimental/metrics/component.py
@@ -167,7 +167,11 @@ async def apply(
         for i in range(0, len(prompt_input_list), self.batch_size):
             prompt_input_list_batch = prompt_input_list[i : i + self.batch_size]
             response = self.hf_pipeline(prompt_input_list_batch, **self.model_kwargs)
-            response = [item[0]["label"] == self.label for item in response]
+            assert isinstance(response, list), "Response should be a list"
+            assert all(
+                isinstance(item, dict) for item in response
+            ), "Items in response should be dictionaries"
+            response = [item.get("label") == self.label for item in response]  # type: ignore
             scores.extend(response)
 
         return scores
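Putting the series together, a hedged end-to-end sketch. As before, `evaluator_llm` and the model details are assumptions, not tested values: the metric still needs an LLM for statement decomposition, while the faithfulness verdicts now come from the local classifier.

    from ragas.dataset_schema import SingleTurnSample
    from ragas.experimental.metrics._faithfulness import FaithfulnessExperimental
    from ragas.experimental.metrics.component import TextClassificationNLIComponent

    nli = TextClassificationNLIComponent.from_model_id(
        model_id="some-org/nli-classifier",  # hypothetical checkpoint
        prompt="premise: {premise} hypothesis: {hypothesis}",
        label="entailment",
    )
    metric = FaithfulnessExperimental(llm=evaluator_llm, nli_component=nli)

    sample = SingleTurnSample(
        user_input="Where does John study?",
        response="John studies Computer Science at XYZ University.",
        retrieved_contexts=["John is a student at XYZ University, majoring in CS."],
    )
    score = await metric.single_turn_ascore(sample)  # run inside an async context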