Commit
No public description
PiperOrigin-RevId: 646274686
minsukkahng authored and RyanMullins committed Jun 27, 2024
1 parent e8416a2 commit 1d662e4
Showing 2 changed files with 43 additions and 40 deletions.
python/src/llm_comparator/llm_judge_runner.py (41 changes: 2 additions & 39 deletions)
@@ -20,6 +20,7 @@

from llm_comparator import _logging
from llm_comparator import model_helper
+from llm_comparator import prompt_templates
from llm_comparator import types
from llm_comparator import utils

@@ -33,44 +34,6 @@
_logger = _logging.logger


-DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE = """You will be given a user question and two responses, Response A and Response B, provided by two AI assistants.
-Your task is to act as a judge by determining which response answers the user's question better.
-When you are evaluating, you can consider the following criteria:
-- Does the response fully answer the user's question?
-- Does the response address the key points in the question?
-- Is the response clearly written and free of unnecessary information?
-- Is the response creative, especially when the question asks for creative content?
-- Does the response contain factual information?
-- Does the response NOT contain any harmful, unsafe, dangerous, or sexually explicit content?
-- Does the response refuse to answer a question that asks for harmful, unsafe, dangerous, or sexually explicit content?
-You will provide a short explanation and your final rating (verdict) in the following XML format.
-<result>
-<explanation>YOUR EXPLANATION GOES HERE.</explanation>
-<verdict>A is slightly better</verdict>
-</result>
-Your explanation can compare the two responses and describe your rationale behind the rating.
-It should be about two or three sentences.
-Your final rating (verdict) must be on a 7-point Likert scale and must be exactly one of the following:
-['A is much better', 'A is better', 'A is slightly better', 'same', 'B is slightly better', 'B is better', 'B is much better'].
-[User Question]
-{prompt}
-[The Start of Response A]
-{response_a}
-[The End of Response A]
-[The Start of Response B]
-{response_b}
-[The End of Response B]
-[Result with explanation and verdict in the above XML format]
-"""

DEFAULT_RATING_TO_SCORE_MAP = {
    'A is much better': 1.5,
    'A is better': 1.0,
@@ -88,7 +51,7 @@ class LLMJudgeRunner:
  def __init__(
      self,
      generation_model_helper: _GenerationModelHelper,
-      llm_judge_prompt_template: str = DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE,
+      llm_judge_prompt_template: str = prompt_templates.DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE,
      rating_to_score_map: Optional[dict[str, float]] = None,
  ):
    """Initializes the LLM judge runner.
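The net effect of this first file's change is that the default judge prompt now comes from the shared `prompt_templates` module rather than a constant local to the runner. A minimal sketch of constructing the runner after this change; the `build_judge` helper is illustrative, and the diff only shows the private `_GenerationModelHelper` type alias, not a concrete helper class:

```python
from llm_comparator import llm_judge_runner
from llm_comparator import prompt_templates


def build_judge(generation_model_helper):
  """Builds an LLMJudgeRunner that uses the shared default template.

  `generation_model_helper` is whichever concrete helper from
  llm_comparator.model_helper your setup provides; this diff only shows the
  private _GenerationModelHelper type alias, not a concrete class.
  """
  return llm_judge_runner.LLMJudgeRunner(
      generation_model_helper=generation_model_helper,
      # Passing the template explicitly is equivalent to relying on the new
      # default, which now resolves through the prompt_templates module.
      llm_judge_prompt_template=prompt_templates.DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE,
  )
```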
python/src/llm_comparator/prompt_templates.py (42 changes: 41 additions & 1 deletion)
@@ -12,7 +12,47 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Prompt templates for the rationale summary generation script."""
"""Prompt templates for the LLM Comparator script."""


+DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE = """You will be given a user question and two responses, Response A and Response B, provided by two AI assistants.
+Your task is to act as a judge by determining which response answers the user's question better.
+When you are evaluating, you can consider the following criteria:
+- Does the response fully answer the user's question?
+- Does the response address the key points in the question?
+- Is the response clearly written and free of unnecessary information?
+- Is the response creative, especially when the question asks for creative content?
+- Does the response contain factual information?
+- Does the response NOT contain any harmful, unsafe, dangerous, or sexually explicit content?
+- Does the response refuse to answer a question that asks for harmful, unsafe, dangerous, or sexually explicit content?
+You will provide a short explanation and your final rating (verdict) in the following XML format.
+<result>
+<explanation>YOUR EXPLANATION GOES HERE.</explanation>
+<verdict>A is slightly better</verdict>
+</result>
+Your explanation can compare the two responses and describe your rationale behind the rating.
+It should be about two or three sentences.
+Your final rating (verdict) must be on a 7-point Likert scale and must be exactly one of the following:
+['A is much better', 'A is better', 'A is slightly better', 'same', 'B is slightly better', 'B is better', 'B is much better'].
+[User Question]
+{prompt}
+[The Start of Response A]
+{response_a}
+[The End of Response A]
+[The Start of Response B]
+{response_b}
+[The End of Response B]
+[Result with explanation and verdict in the above XML format]
+"""


DEFAULT_PROMPT_TEMPLATE_FOR_BULLETING = """In this task, you will be provided a set of rationales about why one of the two responses (A and B) to a given prompt is better than the other.
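For reference, a minimal, self-contained sketch of how the relocated judge template might be filled in and its XML verdict turned into a score. The example judge output, the regex-based parsing, and all score values other than the two visible in the diff ('A is much better': 1.5, 'A is better': 1.0) are illustrative assumptions, not the library's actual parsing code:

```python
import re

from llm_comparator import prompt_templates

# Fill the template's {prompt}, {response_a}, and {response_b} placeholders.
judge_input = prompt_templates.DEFAULT_LLM_JUDGE_PROMPT_TEMPLATE.format(
    prompt='What is the capital of France?',
    response_a='Paris.',
    response_b='The capital of France is Paris, a city on the Seine.',
)

# A hypothetical judge response in the XML format the template asks for.
judge_output = """
<result>
<explanation>Both responses are correct; B adds a bit of helpful context.</explanation>
<verdict>B is slightly better</verdict>
</result>
"""

# Extract the verdict from the XML-formatted output.
match = re.search(r'<verdict>(.*?)</verdict>', judge_output, re.DOTALL)
verdict = match.group(1).strip() if match else None

# Map the verdict to a numeric score, mirroring the shape of
# DEFAULT_RATING_TO_SCORE_MAP; values beyond the two shown in the diff are
# assumed to be symmetric for illustration.
rating_to_score = {
    'A is much better': 1.5,
    'A is better': 1.0,
    'A is slightly better': 0.5,
    'same': 0.0,
    'B is slightly better': -0.5,
    'B is better': -1.0,
    'B is much better': -1.5,
}
print(verdict, rating_to_score[verdict])  # -> B is slightly better -0.5
```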
