diff --git a/poetry.lock b/poetry.lock
index fc2cc132..43658fc4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -217,6 +217,24 @@ files = [
     {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"},
 ]
 
+[[package]]
+name = "beartype"
+version = "0.18.5"
+description = "Unbearably fast runtime type checking in pure Python."
+optional = false
+python-versions = ">=3.8.0"
+files = [
+    {file = "beartype-0.18.5-py3-none-any.whl", hash = "sha256:5301a14f2a9a5540fe47ec6d34d758e9cd8331d36c4760fc7a5499ab86310089"},
+    {file = "beartype-0.18.5.tar.gz", hash = "sha256:264ddc2f1da9ec94ff639141fbe33d22e12a9f75aa863b83b7046ffff1381927"},
+]
+
+[package.extras]
+all = ["typing-extensions (>=3.10.0.0)"]
+dev = ["autoapi (>=0.9.0)", "coverage (>=5.5)", "equinox", "mypy (>=0.800)", "numpy", "pandera", "pydata-sphinx-theme (<=0.7.2)", "pytest (>=4.0.0)", "sphinx", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)", "tox (>=3.20.1)", "typing-extensions (>=3.10.0.0)"]
+doc-rtd = ["autoapi (>=0.9.0)", "pydata-sphinx-theme (<=0.7.2)", "sphinx (>=4.2.0,<6.0.0)", "sphinxext-opengraph (>=0.7.5)"]
+test-tox = ["equinox", "mypy (>=0.800)", "numpy", "pandera", "pytest (>=4.0.0)", "sphinx", "typing-extensions (>=3.10.0.0)"]
+test-tox-coverage = ["coverage (>=5.5)"]
+
 [[package]]
 name = "certifi"
 version = "2024.2.2"
@@ -3311,4 +3329,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8, <3.12"
-content-hash = "276770471894b6a393eaf8fb0d53d3fc97d4f646aaa67c9639906dcefaae0904"
+content-hash = "2a034a3d9e009e88b7312fb78b7a60779b679a365f53fe5faea0c058cdf6c6da"
diff --git a/research_town/evaluators/output_format.py b/research_town/evaluators/output_format.py
index 2ea3b030..e3b3ee04 100644
--- a/research_town/evaluators/output_format.py
+++ b/research_town/evaluators/output_format.py
@@ -47,7 +47,7 @@ def validate_overall_score(cls, v)-> int:
         if not (0 <= v <= 100):
             raise ValueError("Overall score must be between 0 and 100")
         return v
-    
+
 class OutputFormatError(Exception):
     def __init__(self, message:str="Output format error")-> None:
         self.message = message
diff --git a/research_town/evaluators/quality_evaluator.py b/research_town/evaluators/quality_evaluator.py
index bc50e82d..b6af83c9 100644
--- a/research_town/evaluators/quality_evaluator.py
+++ b/research_town/evaluators/quality_evaluator.py
@@ -1,19 +1,18 @@
 import re
-from typing import Any, Dict, List
+from typing import Any
 
 from ..utils.decorator import parsing_error_exponential_backoff
 from ..utils.eval_prompter import (
     idea_quality_eval_prompting,
     paper_quality_eval_prompting,
-    review_quality_eval_prompting
+    review_quality_eval_prompting,
 )
-
 from .output_format import (
     IdeaEvalOutput,
     OutputFormatError,
     PaperEvalOutput,
-    ReviewEvalOutput
+    ReviewEvalOutput,
 )
diff --git a/research_town/utils/eval_prompter.py b/research_town/utils/eval_prompter.py
index fb75c400..40ee990b 100644
--- a/research_town/utils/eval_prompter.py
+++ b/research_town/utils/eval_prompter.py
@@ -204,11 +204,11 @@ def review_quality_eval_prompting(
     review_prompt = """
        Please evaluate the review based on the following dimensions. Finally, give an overall score (0-100) and 10 dimension scores (for each dimension, provide a rating (1-10)) as the evaluation for the review. The output format should follow these rules: Overall Score of a review (0-100), with 10 Dimension Scores: [d1, d2, d3, ..., d10], where di is the score of the i-th dimension. An example of output is: 'Overall Score=92. Dimension Scores=[9,9,9,9,9,9,9,9,9,9]'.
-        Output format: 
+        Output format:
         The details of rating are as follows:
         {regulations}
-        
+
         Here is the review to evaluate:
         idea: {idea}
         research trend: {trend}
@@ -307,4 +307,4 @@ def review_quality_eval_prompting(
 
     # merge results from List[Str] to Str
     combined_result = "\n".join(evaluation_result)
-    return combined_result
\ No newline at end of file
+    return combined_result
diff --git a/tests/test_eval.py b/tests/test_eval.py
index 89f25e7d..588eeed9 100644
--- a/tests/test_eval.py
+++ b/tests/test_eval.py
@@ -1,4 +1,3 @@
-from typing import Any
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -6,7 +5,7 @@
 from research_town.evaluators.quality_evaluator import (
     IdeaQualityEvaluator,
     PaperQualityEvaluator,
-    ReviewQualityEvaluator
+    ReviewQualityEvaluator,
 )
 
 idea = "The idea behind Mamba is to improve upon existing foundation models in deep learning, which typically rely on the Transformer architecture and its attention mechanism. While subquadratic-time architectures like linear attention, gated convolution, recurrent models, and structured state space models (SSMs) have been developed to address the inefficiency of Transformers on long sequences, they have not matched the performance of attention-based models in key areas such as language processing. Mamba addresses the shortcomings of these models by enabling content-based reasoning and making several key improvements: Adaptive SSM Parameters: By allowing SSM parameters to be functions of the input, Mamba effectively handles discrete modalities. This enables the model to selectively propagate or forget information along the sequence based on the current token.Parallel Recurrent Algorithm: Despite the changes preventing the use of efficient convolutions, Mamba employs a hardware-aware parallel algorithm in recurrent mode to maintain efficiency.Simplified Architecture: Mamba integrates these selective SSMs into a streamlined neural network architecture that does not rely on attention or MLP blocks."
@@ -162,7 +161,7 @@ def model_name(request: pytest.FixtureRequest) -> str:
 # Note(jinwei): please make sure the OPENAI API key is set for real tests with "use_mock=False".
 @pytest.mark.parametrize("use_mock", [True])
 def test_evaluator_eval_idea(use_mock:bool, model_name: str) -> None:
-    
+
     evaluator = IdeaQualityEvaluator(model_name= model_name)
     input_dict = {'idea': idea, 'trend': trend,'pk':0}
@@ -182,8 +181,8 @@ def test_evaluator_eval_idea(use_mock:bool, model_name: str) -> None:
 # Note(jinwei): please make sure the OPENAI API key is set for real tests with "use_mock=False".
 @pytest.mark.parametrize("use_mock", [True])
 def test_evaluator_eval_paper(use_mock:bool,model_name: str) -> None:
-    
-    
+
+
     paper = {'title': paper_title, 'abstract':paper_abstract}
     input_dict = {'idea': idea, 'paper': paper,'pk':0}
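Beyond the whitespace and import cleanup, the lockfile change above pulls in beartype 0.18.5 as a new dependency. The following is a minimal, hypothetical sketch of how beartype's runtime type checking is typically applied to an annotated function; the function name and the 0-100 bound are illustrative (they mirror the overall-score validation in output_format.py) and are not code taken from this diff.

```python
# Hypothetical usage sketch for the newly added beartype dependency (not part of this diff).
from beartype import beartype
from beartype.roar import BeartypeCallHintParamViolation


@beartype
def overall_score(value: int) -> int:
    """Validate a 0-100 score; the `int` annotations are enforced at call time by beartype."""
    if not (0 <= value <= 100):
        raise ValueError("Overall score must be between 0 and 100")
    return value


overall_score(92)  # OK: the argument matches the `int` annotation

try:
    overall_score("92")  # wrong argument type on purpose
except BeartypeCallHintParamViolation:
    # beartype rejects the call at runtime because "92" is a str, not an int
    pass
```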