From faa8172aab924d41737696c178673f2fc14f8bf6 Mon Sep 17 00:00:00 2001 From: Jinwei <55192557+Monstertail@users.noreply.github.com> Date: Sat, 1 Jun 2024 11:32:33 +0800 Subject: [PATCH] enhance evaluator by polishing prompt and adding info (#181) * outline of alignment criteria * reorganize evaluator prompts * fix bug if no trend provided in paper eval * minor prompt format and change eval parser * enable real tests * ease parser matching * disable real tests in test_eval * Fix issues identified by pre-commit hooks --------- Co-authored-by: chengzr01 Co-authored-by: Haofei Yu <1125027232@qq.com> --- data/dbs/test_agent_profile_db.json | 8 +- data/dbs/test_env_logs_db.json | 9 +- data/dbs/test_paper_profile_db.json | 8 +- data/dbs/test_research_progress_db.json | 4 +- poetry.lock | 2 +- research_town/evaluators/output_format.py | 23 +- research_town/evaluators/quality_evaluator.py | 90 ++++++- research_town/utils/eval_prompter.py | 230 ++++++++---------- 8 files changed, 219 insertions(+), 155 deletions(-) diff --git a/data/dbs/test_agent_profile_db.json b/data/dbs/test_agent_profile_db.json index 973dade6..6c2d7404 100644 --- a/data/dbs/test_agent_profile_db.json +++ b/data/dbs/test_agent_profile_db.json @@ -1,13 +1,13 @@ { - "d544f290-6748-46b5-a82e-fd8f40c1e4cc": { - "pk": "d544f290-6748-46b5-a82e-fd8f40c1e4cc", + "8514bd13-0501-4c4c-bd7c-1a76a6c54c77": { + "pk": "8514bd13-0501-4c4c-bd7c-1a76a6c54c77", "name": "Jane Smith", "bio": "Expert in NLP", "collaborators": [], "institute": "NLP Lab" }, - "9c581b74-86f6-4577-b400-9221df4c3917": { - "pk": "9c581b74-86f6-4577-b400-9221df4c3917", + "5faa7149-d6ed-46e4-95c5-a61041c6c621": { + "pk": "5faa7149-d6ed-46e4-95c5-a61041c6c621", "name": "Alice Johnson", "bio": "Data Scientist", "collaborators": [], diff --git a/data/dbs/test_env_logs_db.json b/data/dbs/test_env_logs_db.json index 737ba49b..02e9f796 100644 --- a/data/dbs/test_env_logs_db.json +++ b/data/dbs/test_env_logs_db.json @@ -1,8 +1,9 @@ { "PaperProfile": [], + "ResearchPaperSubmission": [], "AgentPaperReviewLog": [ { - "pk": "654935ea-be94-4898-80a4-bb5c7c12f286", + "pk": "6dcd86f3-e1cb-4dc7-9989-1e5483475475", "timestep": 0, "paper_pk": "paper2", "agent_pk": "agent2", @@ -12,7 +13,7 @@ ], "AgentPaperRebuttalLog": [ { - "pk": "5387eadb-6a18-44e1-b7a3-55c49c808efd", + "pk": "32760169-07ca-4cb1-9b4d-868ee9f9c04c", "timestep": 0, "paper_pk": "paper1", "agent_pk": "agent1", @@ -21,7 +22,7 @@ ], "AgentPaperMetaReviewLog": [ { - "pk": "f3bffbbc-c67c-40a5-82f1-200989b2bea9", + "pk": "0e9bd890-ba49-4cb6-bedd-01f1d21b2586", "timestep": 0, "paper_pk": "paper1", "agent_pk": "agent1", @@ -31,7 +32,7 @@ ], "AgentAgentDiscussionLog": [ { - "pk": "67a25e19-2182-4671-9005-a3f95dd3f7c0", + "pk": "7a4f1d87-d466-41d6-a105-f282573da839", "timestep": 0, "agent_from_pk": "agent1", "agent_from_name": "Rex Ying", diff --git a/data/dbs/test_paper_profile_db.json b/data/dbs/test_paper_profile_db.json index 2125a5ed..ea27695a 100644 --- a/data/dbs/test_paper_profile_db.json +++ b/data/dbs/test_paper_profile_db.json @@ -1,6 +1,6 @@ { - "43653097-1230-48e5-ba17-6f616bc93380": { - "pk": "43653097-1230-48e5-ba17-6f616bc93380", + "d5e1587d-14aa-4ac4-ab72-c2774cedb2cc": { + "pk": "d5e1587d-14aa-4ac4-ab72-c2774cedb2cc", "title": "Updated Sample Paper 1", "abstract": "This is the abstract for paper 1", "authors": [ @@ -22,8 +22,8 @@ "citation_count": 15, "award": null }, - "37e9c697-bd7b-40da-975f-579eddc9508e": { - "pk": "37e9c697-bd7b-40da-975f-579eddc9508e", + "dfbec221-2cce-4038-b5b8-7925e6ea916f": { + "pk": 
"dfbec221-2cce-4038-b5b8-7925e6ea916f", "title": "Sample Paper 3", "abstract": "This is the abstract for paper 3", "authors": [ diff --git a/data/dbs/test_research_progress_db.json b/data/dbs/test_research_progress_db.json index 38e4622f..c6577dc7 100644 --- a/data/dbs/test_research_progress_db.json +++ b/data/dbs/test_research_progress_db.json @@ -1,11 +1,11 @@ { "ResearchIdea": [ { - "pk": "585e0e17-ae53-44a1-a682-e4ee2883655c", + "pk": "177ca3d9-595f-4147-9f2a-562c9e2b08f1", "content": "Blockchain research proposal" }, { - "pk": "baf40f3b-f14b-48a0-bc1c-d84eaefa9e58", + "pk": "31d65766-93b6-45a3-87ba-0e9fbcf21855", "content": "Updated idea content" } ], diff --git a/poetry.lock b/poetry.lock index ebea61d9..f702ada1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" diff --git a/research_town/evaluators/output_format.py b/research_town/evaluators/output_format.py index 6e341b32..95d4eee4 100644 --- a/research_town/evaluators/output_format.py +++ b/research_town/evaluators/output_format.py @@ -1,4 +1,4 @@ -from beartype.typing import Type, TypeVar +from beartype.typing import List, Type, TypeVar from pydantic import BaseModel, Extra, Field, validator T = TypeVar('T', bound=BaseModel) @@ -7,6 +7,7 @@ class IdeaEvalOutput(BaseModel): overall_score: int = Field(default=-1) pk: str = Field(default='0') + dimension_scores: List[int] = Field(default=[]) class Config: extra = Extra.allow # Allows extra fields to be stored @@ -19,10 +20,17 @@ def validate_overall_score(cls: Type[T], v: int) -> int: raise ValueError('Overall score must be between 0 and 100') return v + @validator('dimension_scores', each_item=True) + def validate_dimension_scores(cls: Type[T], v: int) -> int: + if not (0 <= v <= 10): + raise ValueError('Each dimension score must be between 0 and 10') + return v + class PaperEvalOutput(BaseModel): overall_score: int = Field(default=-1) pk: str = Field(default='0') + dimension_scores: List[int] = Field(default=[]) class Config: extra = Extra.allow # Allows extra fields to be stored @@ -35,10 +43,17 @@ def validate_overall_score(cls: Type[T], v: int) -> int: raise ValueError('Overall score must be between 0 and 100') return v + @validator('dimension_scores', each_item=True) + def validate_dimension_scores(cls: Type[T], v: int) -> int: + if not (0 <= v <= 10): + raise ValueError('Each dimension score must be between 0 and 10') + return v + class ReviewEvalOutput(BaseModel): overall_score: int = Field(default=-1) pk: str = Field(default='0') + dimension_scores: List[int] = Field(default=[]) class Config: extra = Extra.allow # Allows extra fields to be stored @@ -51,6 +66,12 @@ def validate_overall_score(cls: Type[T], v: int) -> int: raise ValueError('Overall score must be between 0 and 100') return v + @validator('dimension_scores', each_item=True) + def validate_dimension_scores(cls: Type[T], v: int) -> int: + if not (0 <= v <= 10): + raise ValueError('Each dimension score must be between 0 and 10') + return v + class OutputFormatError(Exception): def __init__(self, message: str = 'Output format error') -> None: diff --git a/research_town/evaluators/quality_evaluator.py b/research_town/evaluators/quality_evaluator.py index 8c4867c1..4681ee71 100644 --- a/research_town/evaluators/quality_evaluator.py +++ b/research_town/evaluators/quality_evaluator.py @@ -37,14 +37,37 
@@ def eval( return self.parsed_output def parse(self, raw_output: str) -> IdeaEvalOutput: - match = re.search(r'Overall\s*Score\s*\W*(\d+)\W*', raw_output, re.IGNORECASE) - if match: + overall_score_match = re.search( + r'Overall\s*Score\s*\W*(\d+)\W*', raw_output, re.IGNORECASE + ) + dimension_scores_match = re.search( + r'Dimension\s*Scores\s*\W*\s*\[([0-9,\s]+)\]', raw_output, re.IGNORECASE + ) + + if overall_score_match: try: - return IdeaEvalOutput(overall_score=int(match.group(1))) + overall_score = int(overall_score_match.group(1)) except ValueError as e: raise OutputFormatError(f'Invalid overall score: {e}') else: - raise OutputFormatError("Output format error: 'Overall Score' not found") + raise OutputFormatError( + f"Output format error: 'Overall Score' not found. Raw output is {raw_output}." + ) + + if dimension_scores_match: + try: + dimension_scores = list( + map(int, dimension_scores_match.group(1).split(',')) + ) + except ValueError as e: + raise OutputFormatError(f'Invalid dimension scores: {e}') + else: + raise OutputFormatError( + f"Output format error: 'Dimension Scores' not found. Raw output is {raw_output}." + ) + return IdeaEvalOutput( + overall_score=overall_score, dimension_scores=dimension_scores + ) class PaperQualityEvaluator(object): @@ -68,14 +91,37 @@ def eval( return self.parsed_output def parse(self, raw_output: str) -> PaperEvalOutput: - match = re.search(r'Overall\s*Score\s*\W*(\d+)\W*', raw_output, re.IGNORECASE) - if match: + overall_score_match = re.search( + r'Overall\s*Score\s*\W*(\d+)\W*', raw_output, re.IGNORECASE + ) + dimension_scores_match = re.search( + r'Dimension\s*Scores\s*\W*\s*\[([0-9,\s]+)\]', raw_output, re.IGNORECASE + ) + + if overall_score_match: try: - return PaperEvalOutput(overall_score=int(match.group(1))) + overall_score = int(overall_score_match.group(1)) except ValueError as e: raise OutputFormatError(f'Invalid overall score: {e}') else: - raise OutputFormatError("Output format error: 'Overall Score' not found") + raise OutputFormatError( + f"Output format error: 'Overall Score' not found. Raw output is {raw_output}." + ) + + if dimension_scores_match: + try: + dimension_scores = list( + map(int, dimension_scores_match.group(1).split(',')) + ) + except ValueError as e: + raise OutputFormatError(f'Invalid dimension scores: {e}') + else: + raise OutputFormatError( + f"Output format error: 'Dimension Scores' not found. Raw output is {raw_output}." + ) + return PaperEvalOutput( + overall_score=overall_score, dimension_scores=dimension_scores + ) class ReviewQualityEvaluator(object): @@ -104,13 +150,35 @@ def eval( return self.parsed_output def parse(self, raw_output: str) -> ReviewEvalOutput: - match = re.search(r'Overall\s*Score\s*\W*(\d+)\W*', raw_output, re.IGNORECASE) - if match: + overall_score_match = re.search( + r'Overall\s*Score\s*\W*(\d+)\W*', raw_output, re.IGNORECASE + ) + dimension_scores_match = re.search( + r'Dimension\s*Scores\s*\W*\s*\[([0-9,\s]+)\]', raw_output, re.IGNORECASE + ) + + if overall_score_match: try: - return ReviewEvalOutput(overall_score=int(match.group(1))) + overall_score = int(overall_score_match.group(1)) except ValueError as e: raise OutputFormatError(f'Invalid overall score: {e}') else: raise OutputFormatError( f"Output format error: 'Overall Score' not found. Raw output is {raw_output}." 
) + + if dimension_scores_match: + try: + dimension_scores = list( + map(int, dimension_scores_match.group(1).split(',')) + ) + except ValueError as e: + raise OutputFormatError(f'Invalid dimension scores: {e}') + else: + raise OutputFormatError( + f"Output format error: 'Dimension Scores' not found. Raw output is {raw_output}." + ) + + return ReviewEvalOutput( + overall_score=overall_score, dimension_scores=dimension_scores + ) diff --git a/research_town/utils/eval_prompter.py b/research_town/utils/eval_prompter.py index 74bff7ae..8bf38a0b 100644 --- a/research_town/utils/eval_prompter.py +++ b/research_town/utils/eval_prompter.py @@ -1,5 +1,5 @@ from beartype import beartype -from beartype.typing import Dict, List +from beartype.typing import Dict, List, Optional from .model_prompting import model_prompting @@ -10,69 +10,57 @@ def idea_quality_eval_prompting( trend: str, model_name: str, ) -> str: - prompt_idea = ( - " Please evaluate the idea based on the following dimensions, considering the current research trend within the ML community. If the research trend field is left blank, please use your common knowledge to assess the trend. Finally, give an overall score (0-100) and 10 dimension scores (for each dimension, provide a rating (1-10)) as the evaluation for the idea. The output format should follow these rules: Overall Score of an idea (0-100), with 10 Dimension Scores: [d1, d2, d3, ..., d10], where di is the score of the i-th dimension. An example of output is: 'Overall Score=89. Dimension Scores=[8,9,9,9,9,9,9,9,9,9]'.\n" - ' The details of rating are as follow:\n' - '1. Novelty\n' - 'Rating (1-10):\n' - 'Comments:\n' - 'How original and unique is the idea?\n' - 'Does it introduce a new perspective or significant advancement compared to existing methods?\n' - 'How does it align with or diverge from the innovations highlighted in the trend?\n' - '2. Technical Depth\n' - 'Rating (1-10):\n' - 'Comments:\n' - 'Assess the technical rigor of the idea.\n' - 'Does it include solid theoretical foundations, robust algorithms, and detailed methodologies?\n' - 'Is the technical depth in line with the state-of-the-art techniques noted in the trend?\n' - '3. Impact and Significance\n' - 'Rating (1-10):\n' - 'Comments:\n' - 'Evaluate the potential impact of the idea on the ML community and beyond.\n' - 'How significant is its contribution to advancing the field?\n' - 'Does it address high-impact problems or gaps identified in the trend?\n' - '4. Feasibility and Practicality\n' - 'Rating (1-10):\n' - 'Comments:\n' - 'Assess the feasibility of implementing the idea.\n' - 'Is it practically applicable in real-world scenarios?\n' - 'Does it consider efficiency and scalability, in line with the practical application focus of the trend?\n' - '5. Theoretical Foundation and Conceptual Soundness\n' - 'Rating (1-10):\n' - 'Comments:\n' - 'Evaluate the theoretical foundation and conceptual soundness of the idea.\n' - 'Are the underlying principles well-defined and logically consistent?\n' - 'Does the idea demonstrate a deep understanding of relevant theories and concepts?\n' - 'How does it contribute to advancing theoretical understanding within the field?\n' - '6. Clarity and Presentation\n' - 'Rating (1-10):\n' - 'Comments:\n' - 'Assess the clarity, organization, and presentation quality of the idea.\n' - 'Is the idea communicated effectively, adhering to high presentation standards seen in top-tier ML conferences?\n' - '7. 
Potential for Real-world Applications\n'
- 'Rating (1-10):\n'
- 'Comments:\n'
- 'Evaluate the potential of the idea to be applied in real-world scenarios.\n'
- 'How applicable is it in practical settings and industry contexts?\n'
- 'Does it address real-world problems or challenges identified in the trend?\n'
- '8. Innovation Potential\n'
- 'Rating (1-10):\n'
- 'Comments:\n'
- 'Assess the potential of the idea to inspire further research and innovation within the ML community.\n'
- 'Does it open up new avenues for research or provide a novel framework aligning with the emerging trends and future directions of the trend?\n'
- '9. Ethical Considerations\n'
- 'Rating (1-10):\n'
- 'Comments:\n'
- 'Consider the ethical implications and societal impact of the idea.\n'
- 'Does it adhere to the growing emphasis on ethical AI and responsible ML practices as highlighted in the trend?\n'
- '10. Interdisciplinary Connections\n'
- 'Rating (1-10):\n'
- 'Comments:\n'
- 'Evaluate the potential for the idea to connect with and contribute to other disciplines beyond ML.\n'
- 'Does it align with the trend of interdisciplinary research and collaboration, integrating with fields such as data science, neuroscience, or social sciences?\n'
- 'Here is the idea to evaluate: {idea}.\n'
- 'Here is the research trend: {trend}.\n'
- )
+ prompt_idea = """
+ Please evaluate the idea based on the following dimensions, considering the current research trend within the research community. If the research trend field is left blank, please use your common knowledge to assess the trend. Finally, give an overall score (0-100) and 6 dimension scores (for each dimension, provide a rating (1-10)) as the evaluation for the idea.
+
+
+ Here is the idea to evaluate: {idea}.
+ Here is the research trend: {trend}.
+
+
+
+ The output format should follow these rules: Overall Score of an idea (0-100), with 6 Dimension Scores: [d1, d2, d3, ..., d6], where di is the score of the i-th dimension. An example of output is: Overall Score=89 Dimension Scores=[8,9,9,9,9,9].
+
+
+ The details of rating are as follows:
+ 1. Novelty
+ Rating (1-10):
+ Comments:
+ How original and unique is the idea?
+ Does it introduce a new perspective or significant advancement compared to existing methods?
+ How does it align with or diverge from the innovations highlighted in the trend?
+ 2. Validity
+ Rating (1-10):
+ Comments:
+ Does it include solid theoretical foundations, robust algorithms, and detailed methodologies?
+ Is the method in line with the state-of-the-art techniques noted in the trend?
+ Are the underlying principles well-defined and logically consistent?
+ Does the idea demonstrate a deep understanding of relevant theories and concepts?
+ 3. Significance
+ Rating (1-10):
+ Comments:
+ Evaluate the potential impact of the idea on the specific domain of the research community that the idea belongs to and beyond.
+ How significant is its contribution to advancing the field?
+ Does it address high-impact problems or gaps identified in the trend?
+ How applicable is it in practical settings and industry contexts?
+ 4. Feasibility
+ Rating (1-10):
+ Comments:
+ Assess the feasibility of implementing the idea.
+ Is it practically applicable in real-world scenarios?
+ Does it consider efficiency and scalability, in line with the practical application focus of the trend?
+ 5. Clarity
+ Rating (1-10):
+ Comments:
+ Assess the clarity, organization, and presentation quality of the idea.
+ Is the idea communicated effectively, adhering to high presentation standards seen in top-tier conferences?
+ 6. Ethical Considerations
+ Rating (1-10):
+ Comments:
+ Consider the ethical implications and societal impact of the idea.
+ Does it adhere to the growing emphasis on ethical research practices as highlighted in the trend?
+
+ """
 input_data = {'idea': idea, 'trend': trend}
 prompt = prompt_idea.format_map(input_data)
@@ -85,93 +73,71 @@ def idea_quality_eval_prompting(
 @beartype
 def paper_quality_eval_prompting(
- idea: str, paper: Dict[str, str], model_name: str
+ idea: str, paper: Dict[str, str], model_name: str, trend: Optional[str] = None
 ) -> str:
+ # Refer to the idea eval prompt, but replace what is not needed and paraphrase the parts that overlap.
 paper_prompt = """
- Please evaluate the paper draft based on the following dimensions. Finally, give an overall score (0-100) and 10 dimension scores (for each dimension, provide a rating (1-10)) as the evaluation for the draft. The output format should follow these rules: Overall Score of a paper draft (0-100), with 10 Dimension Scores: [d1, d2, d3, ..., d10], where di is the score of the i-th dimension. An example of output is: 'Overall Score=85. Dimension Scores=[7,8,9,7,8,9,8,8,8,9]'.
+ Please evaluate the paper draft based on the following dimensions. Finally, give an overall score (0-100) and 6 dimension scores (for each dimension, provide a rating (1-10)) as the evaluation for the draft.
+
- The details of rating are as follows:
+
+ Here is the paper draft to evaluate:
+ Title: {title}
+ Abstract: {abstract}
+ Idea: {idea}
+ Research Trend: {trend}
+
- 1. Title Appeal
- Rating (1-10):
- Comments:
- Does the title grab attention and generate interest?
- Is it informative and reflective of the paper's content?
+
+ The output format should follow these rules: Overall Score of a paper draft (0-100), with 6 Dimension Scores: [d1, d2, d3, ..., d6], where di is the score of the i-th dimension. An example of output is: Overall Score=89 Dimension Scores=[8,9,9,9,9,9].
+
- 2. Abstract Quality
+ The details of rating are as follows:
+ 1. Novelty
 Rating (1-10):
 Comments:
- How well does the abstract summarize the paper?
- Is it clear, concise, and informative?
- Does it effectively convey the significance and main contributions of the paper?
-
- 3. Title and Abstract Consistency
+ Does the paper introduce a novel problem or new perspective that has not been explored before?
+ Does it introduce new techniques or a significant advancement compared to existing methods?
+ How does it align with or diverge from the innovations highlighted in the trend?
+ 2. Validity
 Rating (1-10):
 Comments:
- How well do the title and abstract align with each other?
- Do they accurately represent the core idea and content of the paper?
-
- 4. Literature Review and Background
+ Does it include solid theoretical foundations, robust algorithms, and detailed methodologies in addressing the research problem?
+ Are the underlying principles well-defined and logically consistent?
+ 3. Significance
 Rating (1-10):
 Comments:
- Assess the thoroughness of the literature review and background provided.
- Is the context and relevance of the research well-established?
- Does it cover key works and current trends in the field?
-
- 5. Methodology
+ Evaluate the potential contribution and impact of the paper on the specific domain of the research community that the paper belongs to and beyond.
+ How does it compare to existing works in terms of impact?
+ 4. 
Rigorousness Rating (1-10): Comments: - Evaluate the soundness and appropriateness of the methodology used. Are the research design and methods clearly described and justified? Is the methodology robust and suitable for addressing the research questions? - - 6. Results and Analysis - Rating (1-10): - Comments: - Assess the quality and clarity of the results presented. Are the results well-analyzed and interpreted? Do the findings support the claims made in the paper? - - 7. Clarity and Presentation + 5. Clarity Rating (1-10): Comments: Evaluate the clarity, organization, and presentation quality of the paper. + How well do the title and abstract summarize the paper? Are they clear, concise, and informative? + Does it effectively convey the significance and main contributions of the paper? + How well do the title and abstract align with each other? Do they accurately represent the core idea and content of the paper? Is the content well-structured and easy to follow? - Are figures, tables, and references used effectively? - - 8. Contribution to the Field - Rating (1-10): - Comments: - Evaluate the significance of the paper's contributions to the field. - Does it advance knowledge or offer new insights? - How does it compare to existing works in terms of impact? - - 9. Ethical Considerations + 6. Ethical Considerations Rating (1-10): Comments: - Consider the ethical implications and societal impact of the work. + Consider the ethical implications and societal impact of the paper. Does it adhere to ethical guidelines and responsible research practices? Are potential negative consequences or biases addressed? - - 10. Interdisciplinary Connections - Rating (1-10): - Comments: - Evaluate the potential for the work to connect with and contribute to other disciplines. - Does it integrate knowledge from other fields or offer insights relevant to them? - How well does it align with the trend of interdisciplinary research and collaboration? - - Here is the paper draft to evaluate: - - Title: {title} - Abstract: {abstract} - Idea: {idea} - + """ input_data = { 'idea': idea, 'title': paper['title'], 'abstract': paper['abstract'], + 'trend': trend if trend is not None else '', # Provide default value if None } prompt = paper_prompt.format_map(input_data) evaluation_result = model_prompting(model_name, prompt) @@ -191,22 +157,30 @@ def review_quality_eval_prompting( ) -> str: review_prompt = """ - Please evaluate the review based on the following dimensions. Finally, give an overall score (0-100) and 10 dimension scores (for each dimension, provide a rating (1-10)) as the evaluation for the review. The output format should follow these rules: Overall Score of a review (0-100), with 10 Dimension Scores: [d1, d2, d3, ..., d10], where di is the score of the i-th dimension. An example of output is: 'Overall Score=92. Dimension Scores=[9,9,9,9,9,9,9,9,9,9]'. - Output format: + Please evaluate the review based on the following dimensions. Finally, give an overall score (0-100) and 10 dimension scores (for each dimension, provide a rating (1-10)) as the evaluation for the review. - The details of rating are as follows: - {regulations} - + + Here is the review to evaluate: idea: {idea} research trend: {trend} paper: title-- {title}; abstract-- {abstract}. reviews: {review} final_decision:{final_decision} + + + + Output format: + The output format should follow these rules: Overall Score of a review (0-100), with 10 Dimension Scores: [d1, d2, d3, ..., d10], where di is the score of the i-th dimension. 
An example of output is: Overall Score=91. Dimension Scores=[9,9,9,9,9,9,9,9,9,10]. + + + The details of rating are as follows: + {regulations} + """ regulations = """ - 1. Review Summarization + 1. Summarization - Rating (1-10): - Comments: - Does the review accurately summarize the paper's motivation? @@ -243,7 +217,7 @@ def review_quality_eval_prompting( - Is the exposition of the paper clear? - What parts of the paper need revision to improve clarity? - 6. Relation to Prior Work + 6. Originality - Rating (1-10): - Comments: - Is it clearly discussed how this work differs from previous contributions? @@ -257,7 +231,7 @@ def review_quality_eval_prompting( - Is the work reasonably reproducible? - If not, are the reproducibility issues listed among the weaknesses? - 8. Impacts and Implications + 8. Significance - Rating (1-10): - Comments: - Have the authors adequately addressed the broader impact of their work?
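For reviewers: a minimal, self-contained sketch (not part of the diff) of the parsing contract that the updated parse() methods enforce. The two regular expressions are copied from the patched quality_evaluator.py; the sample string and variable names are illustrative only.

import re

# Sample model output in the format the polished prompts now request.
raw_output = 'Overall Score=89 Dimension Scores=[8, 9, 9, 9, 9, 9]'

# Same patterns as the patched parse() methods.
overall_score_match = re.search(r'Overall\s*Score\s*\W*(\d+)\W*', raw_output, re.IGNORECASE)
dimension_scores_match = re.search(
    r'Dimension\s*Scores\s*\W*\s*\[([0-9,\s]+)\]', raw_output, re.IGNORECASE
)

assert overall_score_match is not None and dimension_scores_match is not None
overall_score = int(overall_score_match.group(1))  # 89
dimension_scores = list(map(int, dimension_scores_match.group(1).split(',')))  # [8, 9, 9, 9, 9, 9]
print(overall_score, dimension_scores)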
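Likewise, a hedged sketch of how the new dimension_scores field behaves under the pydantic v1-style validator added in output_format.py. Only that validator is reproduced here, and the construction calls at the bottom are illustrative, not taken from the patch.

from typing import List

from pydantic import BaseModel, Extra, Field, ValidationError, validator


class IdeaEvalOutput(BaseModel):
    overall_score: int = Field(default=-1)
    pk: str = Field(default='0')
    dimension_scores: List[int] = Field(default=[])

    class Config:
        extra = Extra.allow  # extra fields returned by the parser are kept

    @validator('dimension_scores', each_item=True)
    def validate_dimension_scores(cls, v: int) -> int:
        if not (0 <= v <= 10):
            raise ValueError('Each dimension score must be between 0 and 10')
        return v


IdeaEvalOutput(overall_score=89, dimension_scores=[8, 9, 9, 9, 9, 9])  # accepted
try:
    IdeaEvalOutput(overall_score=89, dimension_scores=[8, 9, 11])  # 11 is out of range
except ValidationError as exc:
    print(exc)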
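Finally, a small sketch of the trend-default behavior behind the "fix bug if no trend provided in paper eval" item: when trend is None, the {trend} placeholder is filled with an empty string rather than the literal string "None". The function name, template, and values below are toy stand-ins, not the real prompt in eval_prompter.py.

from typing import Dict, Optional


def build_paper_prompt(idea: str, paper: Dict[str, str], trend: Optional[str] = None) -> str:
    # Toy template; the real one lives in research_town/utils/eval_prompter.py.
    paper_prompt = 'Title: {title}\nAbstract: {abstract}\nIdea: {idea}\nResearch Trend: {trend}\n'
    input_data = {
        'idea': idea,
        'title': paper['title'],
        'abstract': paper['abstract'],
        'trend': trend if trend is not None else '',  # default value when no trend is given
    }
    return paper_prompt.format_map(input_data)


print(build_paper_prompt('A new evaluation idea', {'title': 'Sample Paper', 'abstract': 'Sample abstract'}))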