diff --git a/.gitignore b/.gitignore index 47be7db0..bfd3c6d1 100644 --- a/.gitignore +++ b/.gitignore @@ -185,6 +185,7 @@ research_bench/data/arxiv_ai_papers/output_with_references.json research_bench/data/arxiv_ai_papers/paper_info.json research_bench/crossbench/*.json research_bench/mlbench/*.json +research_bench/iclrbench/*.json research_bench/profile_dbs/* research_bench/results/* research_bench/profile_dbs_old diff --git a/README-CN.md b/README-CN.md index 77db4bd1..fe905d8e 100644 --- a/README-CN.md +++ b/README-CN.md @@ -114,3 +114,13 @@ pre-commit install

+ +## ResearchBench + +要执行ResearchBench实验，请运行 `research_bench/run_review_eval.sh` 脚本。你可以在脚本中调整参数，例如将 `INPUT_PATH` 设置为实际的输入路径。 + +如果遇到 `openreview` 未找到的错误，请通过运行 `pip install openreview-py` 安装该包。如果遇到与 `requests` 相关的问题，请将其版本更改为 `2.26`： + +```bash +pip install requests==2.26 +``` \ No newline at end of file diff --git a/README.md b/README.md index e4ccefbd..099d8338 100644 --- a/README.md +++ b/README.md @@ -121,3 +121,13 @@ Check the github action result to make sure all tests pass. If not, fix the erro

+ +## ResearchBench + +To run ResearchBench experiments, please execute the `research_bench/run_review_eval.sh` script. You can adjust the parameters in the script, e.g., set `INPUT_PATH` to your actual input path. + +If you encounter an `openreview` not found error, please install the package by running `pip install openreview-py`. If any issues come up regarding `requests`, please change its version to `2.26`: + +```bash +pip install requests==2.26 +``` \ No newline at end of file diff --git a/configs/agent_prompt/write_metareview_decision.yaml deleted file mode 100644 index a379e762..00000000 --- a/configs/agent_prompt/write_metareview_decision.yaml +++ /dev/null @@ -1,58 +0,0 @@ -fewshot_examples: -- "Here is the proposal: We present a novel approach to sentiment analysis using a hybrid model that combines transformer-based language models with traditional lexicon-based methods. Our method, named SentiHybrid, achieves state-of-the-art performance on multiple benchmark datasets while maintaining interpretability. - - Here are the reviews: - Reviewer 1 (Score: 7/10): The paper presents an interesting combination of deep learning and traditional methods. Strengths include improved performance and interpretability. Weaknesses are the limited analysis of computational complexity and lack of comparison with some recent methods. - - Reviewer 2 (Score: 8/10): Novel approach with promising results. The integration of lexicon-based methods with transformers is well-executed. However, the paper would benefit from more ablation studies and a larger-scale evaluation. - - Here is the summary of the reviews: Both reviewers acknowledge the novelty and potential of the hybrid approach. The main concerns are about the depth of analysis and scale of evaluation. - - Here is the strength of the submission: Innovative combination of deep learning and traditional methods, improved performance, and maintained interpretability. - - Here is the weakness of the submission: Initially limited analysis of computational complexity, lack of comprehensive comparisons, and need for more extensive evaluation. - - Please begin writing the decision of accept / reject." - -- "Decision: Accept" - -- "Here is the proposal: This paper introduces a new graph neural network architecture, DynamicGNN, designed for temporal graph representation learning. Our method adapts to changing graph structures over time and demonstrates superior performance on various temporal graph tasks, including link prediction and node classification. - - Here are the reviews: - Reviewer 1 (Score: 6/10): The paper addresses an important problem in temporal graph learning. The proposed method shows promise, but the experimental evaluation is limited. The comparison with baseline methods is insufficient, and the scalability of the approach is not well-demonstrated. - - Reviewer 2 (Score: 5/10): While the idea is interesting, the novelty of the approach is questionable. The paper lacks a thorough theoretical analysis, and the empirical results are not convincing enough. The writing could also be improved for clarity. - - Here is the summary of the reviews: Both reviewers acknowledge the importance of the problem but express concerns about the depth of evaluation, novelty, and theoretical grounding of the approach. - - Here is the strength of the submission: Addresses an important problem in temporal graph learning with a potentially adaptive approach. 
- - Here is the weakness of the submission: Limited experimental evaluation, questionable novelty, lack of theoretical analysis, and issues with clarity in writing. - - Please begin writing the decision of accept / reject." - -- "Decision: Reject" - -sys_prompt: > - You are an autonomous intelligent agent tasked to make a review decision. You need to decide whether the following submission should be accepted or rejected by an academic conference. - You will be provided with the following information: - Submission - The abstract of the paper submitted to this conference. - Reviews - It typically contains the score, a short summary, strength, and weakness of the submission. - Summary of Reviews - A short summary of the review. - Strength of Submission - Strength of the submission based on the review. - Weakness of Submission - Weakness of the submission based on the review. - - You should provide the following information: - Decision - 'accept' for accepting the submission to the conference, and 'reject' for rejecting this submission to the conference. -template: | - Here is the proposal: {proposal} - - Here are the reviews: {reviews} - - Here is the summary of the reviews: {summary} - - Here is the strength of the submission: {strength} - - Here is the weakness of the submission: {weakness} - - Please begin writing the decision of accept / reject. diff --git a/configs/agent_prompt/write_metareview_ethical.yaml b/configs/agent_prompt/write_metareview_ethical.yaml deleted file mode 100644 index 5d1fe502..00000000 --- a/configs/agent_prompt/write_metareview_ethical.yaml +++ /dev/null @@ -1,49 +0,0 @@ -fewshot_examples: -- "Here is the proposal: We present a novel deep learning model for predicting criminal recidivism rates. Our model, CrimePredictAI, uses a combination of demographic data, criminal history, and social media activity to achieve state-of-the-art accuracy in predicting repeat offenses. - - Here are the reviews: - Reviewer 1: The paper presents an interesting approach, but raises serious ethical concerns about privacy and potential bias in the model. - Reviewer 2: While technically sound, the paper doesn't adequately address the ethical implications of using social media data without explicit consent. - - Here is the summary of the reviews: Both reviewers highlight ethical concerns regarding privacy, consent, and potential bias in the proposed model. - - Please begin writing the ethical meta-review." - -- "Ethical Meta-Review: This paper raises significant ethical concerns that need to be addressed before publication. The primary issues are the potential for bias and discrimination, the use of personal data without explicit consent, and the broader societal implications of predictive policing technologies. While the authors have attempted to address some concerns in their rebuttal, the fundamental ethical issues with the approach remain unresolved. A more comprehensive ethical framework and safeguards against misuse need to be developed before this research can be considered for publication." - -- "Here is the proposal: Our paper introduces a new technique for generating synthetic human faces using advanced GANs. Our method, FaceGenPro, produces highly realistic faces that are indistinguishable from real photographs, with applications in entertainment, marketing, and privacy-preserving data generation. 
- - Here are the reviews: - Reviewer 1: The technical contribution is impressive, but the paper lacks a thorough discussion of the ethical implications, particularly regarding deepfakes and potential misuse. - Reviewer 2: While the method is novel, there are concerns about the dataset used for training and the potential for this technology to be used maliciously. - - Here is the summary of the reviews: Both reviewers acknowledge the technical merit but express concerns about ethical implications and potential misuse of the technology. - - Please begin writing the ethical meta-review." - -- "Ethical Meta-Review: The paper addresses minor ethical concerns raised by reviewers and includes a commendable discussion on potential misuse, along with proposed guidelines for responsible use. To further strengthen the ethical stance, the authors should expand these guidelines, potentially including technical measures to detect synthetic images, discuss the legal and policy implications of their technology, and add a section on the ethical limitations of their approach." - -sys_prompt: > - Please write an ethical meta-review for the following submission to an academic conference. Your meta-review should summarize the ethical considerations raised in the reviews, author rebuttals, and any ethical review conducted. Consider the following aspects: - - 1. Academic integrity: Any concerns about plagiarism or improper citations. - 2. Ethical implications of AI/ML technologies: Potential risks, harms, or long-term implications of the proposed methods or models. - 3. Human subjects research: Risks to participants, consent issues, or privacy concerns. - 4. Data usage and rights: Legitimacy of data sources, privacy issues, or proper attribution. - 5. Language and content: Any offensive or culturally insensitive language used. - 6. Broader ethical considerations: Potential dual use, environmental impacts, or societal implications. - - Your meta-review should help in making a final decision regarding the ethical soundness of the paper. Conclude with one of these statements: - - "This paper raises significant ethical concerns that need to be addressed before publication." - - "This paper has minor ethical concerns that should be addressed but do not prevent publication." - - "This paper does not raise significant ethical concerns and is ethically sound for publication." - - Provide a brief explanation for your conclusion, noting which specific ethical aspects informed your decision. -template: | - Here is the proposal: {proposal} - - Here are the reviews: {reviews} - - Here is the summary of the reviews: {summary} - - Please begin writing the ethical meta-review. diff --git a/configs/agent_prompt/write_metareview_strength.yaml b/configs/agent_prompt/write_metareview_strength.yaml index 8782bde2..9074e679 100644 --- a/configs/agent_prompt/write_metareview_strength.yaml +++ b/configs/agent_prompt/write_metareview_strength.yaml @@ -1,44 +1,17 @@ -fewshot_examples: -- "Here is the proposal: We present a novel deep learning architecture, TransformerX, for natural language processing tasks. Our model achieves state-of-the-art performance on multiple benchmarks while requiring significantly less computational resources than existing models. - - Here are the reviews: - Reviewer 1 (Score: 8/10): The paper presents an innovative approach to efficient NLP modeling. The results are impressive, showing both performance gains and reduced computational requirements. However, the theoretical analysis could be more rigorous. 
- - Reviewer 2 (Score: 9/10): This is a strong paper with clear contributions. The TransformerX architecture is well-designed and the extensive experiments demonstrate its effectiveness. The paper could benefit from more ablation studies. - - Here is the summary of the reviews: Both reviewers acknowledge the novelty and effectiveness of the proposed TransformerX architecture, with minor suggestions for improvement. - - Please begin writing the strength of the submission based on the review." - -- "Strength of the submission: The submission presents a strong, innovative approach to NLP modeling with clear empirical advantages and thorough evaluation, making it a valuable contribution to the field." - -- "Here is the proposal: Our paper introduces a novel graph neural network algorithm, GraphFusion, for multi-modal data integration in bioinformatics. We demonstrate its effectiveness in predicting protein-protein interactions and drug-target affinities, outperforming existing methods on several benchmark datasets. - - Here are the reviews: - Reviewer 1 (Score: 7/10): The paper presents an interesting approach to multi-modal data integration. The results on protein-protein interaction prediction are promising. However, the comparison with some recent methods is missing, and the scalability of the approach needs more discussion. - - Reviewer 2 (Score: 8/10): This is a solid contribution to bioinformatics and graph neural networks. The GraphFusion algorithm is well-designed and the experiments are comprehensive. The paper would benefit from a more in-depth analysis of the model's interpretability. - - Here is the summary of the reviews: Both reviewers recognize the value of the GraphFusion algorithm for multi-modal data integration in bioinformatics, with suggestions for additional comparisons and analyses. - - Please begin writing the strength of the submission based on the review." - -- "Strength of the submission: The submission presents a novel and effective approach to multi-modal data integration in bioinformatics, with clear empirical advantages, comprehensive evaluation, and potential for significant impact in both theoretical and applied research in the field." +fewshot_examples: [] sys_prompt: > You are an autonomous intelligent agent tasked to write the strength of the submission for the following submission you have made to an academic conference. Your summary of strength should summarize the reviews to help the reviewers to make a decision. You will be provided with the following information: - Submission - The abstract of the paper submitted to this conference. - Reviews - It typically contains the score, a short summary, strength, and weakness of the submission. - Summary of Reviews - A short summary of the review. + Submission - Full content of the paper submitted to this conference. + Reviews - It typically contains the score, strength, and weakness of the submission, each by a different reviewer. You should provide the following information: - Strength - The strength of the submission based on the review. -template: | - Here is the proposal: {proposal} + Strength - The strength of the submission based on the reviews. +template: | Here are the reviews: {reviews} - Here is the summary of the reviews: {summary} + Please summarize the important points from the 'strength' section of the reviews. - Please begin writing the strength of the submission based on the review. + Please write in bullet points. It should be 200 words long. 
diff --git a/configs/agent_prompt/write_metareview_summary.yaml b/configs/agent_prompt/write_metareview_summary.yaml deleted file mode 100644 index 74917d89..00000000 --- a/configs/agent_prompt/write_metareview_summary.yaml +++ /dev/null @@ -1,37 +0,0 @@ -fewshot_examples: -- "Here is the proposal: We present a novel federated learning algorithm, SecureFL, designed to enhance privacy and security in distributed machine learning. Our method achieves state-of-the-art performance on benchmark datasets while providing provable privacy guarantees and resistance to adversarial attacks. - - Here are the reviews: - Reviewer 1 (Score: 7/10): The paper addresses an important problem in federated learning. The proposed SecureFL algorithm shows promise in terms of privacy preservation. However, the empirical evaluation is limited, and the comparison with some recent methods is missing. - - Reviewer 2 (Score: 8/10): This is a solid contribution to privacy-preserving federated learning. The theoretical analysis is sound, and the privacy guarantees are well-established. The paper could benefit from more extensive experiments on real-world datasets and a discussion on computational overhead. - - Please begin writing the summary of reviews." - -- "Summary of reviews: The rebuttal appears to address all the main weaknesses mentioned in the reviews. The expanded empirical evaluation and additional comparisons should strengthen the paper's contribution. The inclusion of a computational overhead analysis will provide readers with a more comprehensive understanding of the proposed method's practical implications." - -- "Here is the proposal: Our paper introduces a new approach to automatic music generation using a combination of transformer models and reinforcement learning. We demonstrate that our method, MusicAI, can generate coherent and aesthetically pleasing musical compositions in various genres, outperforming existing methods in human evaluation studies. - - Here are the reviews: - Reviewer 1 (Score: 6/10): The paper presents an interesting approach to music generation. The combination of transformers and RL is novel in this context. However, the evaluation metrics are not well-justified, and the comparison with state-of-the-art methods is insufficient. The paper also lacks a discussion on the ethical implications of AI-generated music. - - Reviewer 2 (Score: 8/10): This is an innovative contribution to the field of AI-generated music. The MusicAI architecture is well-designed, and the human evaluation studies are a strong point. The paper would benefit from a more rigorous analysis of the RL component and its impact on the quality of generated music. - - Please begin writing the summary of reviews." - -- "Summary of reviews: The rebuttal appears to address all the main weaknesses mentioned in the reviews. The revised evaluation metrics, expanded comparisons, and added discussions on ethics and the RL component should significantly strengthen the paper. The inclusion of an ablation study for the RL component will provide readers with a better understanding of its contribution to the overall system." - -sys_prompt: > - You are an autonomous intelligent agent tasked to write a summary of the reviews for the following submission you have made to an academic conference. Your summary should summarize the reviews to help the reviewers to make a decision. - You will be provided with the following information: - Submission - The abstract of the paper submitted to this conference. 
- Reviews - It typically contains the score, a short summary, strength, and weakness of the submission. - - You should provide the following information: - Summary - A summary of the reviews. -template: | - Here is the proposal: {proposal} - - Here are the reviews: {reviews} - - Please begin writing the summary of reviews. diff --git a/configs/agent_prompt/write_metareview_weakness.yaml b/configs/agent_prompt/write_metareview_weakness.yaml index 23c257f0..3a7fa95d 100644 --- a/configs/agent_prompt/write_metareview_weakness.yaml +++ b/configs/agent_prompt/write_metareview_weakness.yaml @@ -1,44 +1,17 @@ -fewshot_examples: -- "Here is the proposal: We present a novel deep reinforcement learning algorithm, AdaptRL, for robotic manipulation tasks. Our method demonstrates superior performance and generalization across a variety of complex manipulation scenarios, outperforming existing approaches on standard benchmarks. - - Here are the reviews: - Reviewer 1 (Score: 7/10): The paper introduces an interesting approach to robotic manipulation. The results on benchmark tasks are impressive. However, the theoretical justification for the algorithm is somewhat lacking, and the comparison with some recent methods is missing. - - Reviewer 2 (Score: 8/10): This is a solid contribution to reinforcement learning for robotics. The AdaptRL algorithm shows promise in terms of generalization. The paper could benefit from more ablation studies and a discussion on real-world applicability. - - Here is the summary of the reviews: Both reviewers acknowledge the potential of AdaptRL for robotic manipulation tasks, with suggestions for improved theoretical analysis and more comprehensive evaluations. - - Please begin writing the weakness of the submission based on the review." - -- "Weakness of the submission: It's worth noting that the authors have addressed these weaknesses in their rebuttal, expanding the theoretical analysis, adding comparisons with recent methods, including more ablation studies, and discussing real-world applicability. However, the effectiveness of these additions would need to be evaluated in the revised manuscript." - -- "Here is the proposal: Our paper introduces a novel natural language processing model, MultiLingNLU, designed for multilingual natural language understanding. We demonstrate state-of-the-art performance on various NLU tasks across 100 languages, while maintaining a compact model size. - - Here are the reviews: - Reviewer 1 (Score: 7/10): The paper presents an impressive multilingual NLU model. The performance across 100 languages is noteworthy. However, the paper lacks a detailed analysis of the model's performance variations across language families. The energy efficiency claims need more substantiation. - - Reviewer 2 (Score: 8/10): This is a significant contribution to multilingual NLP. The compact model size is particularly impressive. The paper would benefit from more ablation studies and a discussion on potential biases in low-resource languages. - - Here is the summary of the reviews: Both reviewers recognize the significance of MultiLingNLU in multilingual NLP, particularly noting its performance across many languages and compact size. They suggest additional analyses and discussions to strengthen the paper. - - Please begin writing the weakness of the submission based on the review." 
- -- "Weakness of the submission: It's important to note that the authors have addressed these weaknesses in their rebuttal, adding analyses of performance across language families, providing energy consumption measurements, including ablation studies, and discussing biases in low-resource languages. The effectiveness of these additions would need to be evaluated in the revised manuscript." +fewshot_examples: [] sys_prompt: > You are an autonomous intelligent agent tasked to write the weakness of the submission for the following submission you have made to an academic conference. Your summary of weakness should summarize the reviews to help the reviewers to make a decision. You will be provided with the following information: - Submission - The abstract of the paper submitted to this conference. - Reviews - It typically contains the score, a short summary, strength, and weakness of the submission. - Summary of Reviews - A short summary of the review. + Submission - Full content of the paper submitted to this conference. + Reviews - It typically contains the score, weakness, and weakness of the submission, each by a different reviewer. You should provide the following information: - Weakness - The weaknesses of the submission based on the reviews. You should notice that the abstract might not cover every detail, so you shouldn't be overly strict. -template: | - Here is the proposal: {proposal} + Weakness - The weakness of the submission based on the reviews. - Here are the reviews: {reviews} +template: | + Here are the reviews: {reviews} - Here is the summary of the reviews: {summary} + Please summarize the important points from the 'weakness' section of the reviews. - Please begin writing the weakness of the submission based on the review. + Please write in bullet points. It should be 200 words long. diff --git a/configs/agent_prompt/write_review_ethical.yaml b/configs/agent_prompt/write_review_ethical.yaml deleted file mode 100644 index 4c423c78..00000000 --- a/configs/agent_prompt/write_review_ethical.yaml +++ /dev/null @@ -1,59 +0,0 @@ -fewshot_examples: -- "Here is the proposal: Our research introduces a novel explainable AI framework called XAI-Viz, designed to improve transparency in deep learning models used for medical image analysis. XAI-Viz generates visual explanations that highlight which parts of an image contributed to the model's decision, allowing clinicians to better understand the model’s reasoning process. The system incorporates a saliency mapping technique and a gradient-based attention mechanism to produce clear and interpretable visualizations. In a series of tests on medical imaging datasets, XAI-Viz consistently produced explanations that aligned well with expert annotations, demonstrating its potential to improve trust and reliability in AI-assisted diagnostics. - - Here is the summary of the paper: This paper introduces XAI-Viz, a framework aimed at enhancing the interpretability of deep learning models applied in medical image analysis. XAI-Viz generates visual explanations using saliency mapping and gradient-based attention, offering clinicians insights into how the model arrives at its decisions. The framework was evaluated on medical imaging datasets, and its explanations were found to align with expert opinions, suggesting that XAI-Viz can increase trust in AI-driven diagnostic tools by providing transparency and clarity. - - Please conduct an ethical review of the above submission." 
- -- "Ethical Review: This paper addresses crucial ethical considerations in the use of AI in healthcare, particularly concerning transparency and interpretability. The potential for improved clinician understanding of AI decisions is significant; however, the paper should explore the implications of relying on AI systems in clinical settings. It must address how misinterpretations of visual explanations could lead to incorrect clinical decisions, and propose guidelines to ensure that clinicians are adequately trained to use such systems responsibly." - -- "Here is the proposal: This research explores the use of AI in hiring processes, focusing on bias in algorithmic decision-making. We present an analysis of existing algorithms and propose a new framework for bias mitigation. - - Here is the summary of the paper: The study highlights the risks of bias in AI hiring tools and suggests methods for creating fairer algorithms. - - Please conduct an ethical review of the above submission." - -- "Ethical Review: This paper raises significant ethical concerns in the area(s) of bias in AI and fairness in employment practices. The potential for discrimination against certain groups must be addressed more thoroughly, and the paper should include a discussion on the ethical implications of deploying such algorithms in real-world hiring." - -sys_prompt: > - Please conduct an ethical review of the following paper submission for an academic conference. Consider these key ethical concerns: - - 1. Academic integrity: Check for any signs of plagiarism or improper citation. This is crucial for maintaining the integrity of academic research. - - 2. Ethical implications of AI/ML technologies: Evaluate if the paper involves technologies like large language models, pre-trained models, or generation models. Consider: - - Is the technology deployed or only theoretical? - - Is there potential for immediate harm or misuse? - - Are there long-term ethical implications even if not immediately deployed? - - 3. Human subjects research: If the study involves human participants: - - Are there any risks to the participants? - - Has proper consent been obtained? - - Are there adequate safeguards for participant privacy and data protection? - - 4. Data usage and rights: Examine the data sources used in the research: - - Is the data properly cited? - - Do the researchers have legitimate rights to use this data? - - Are there any privacy concerns related to the data? - - 5. Language and content: Review the paper for any potentially offensive or inappropriate language, considering: - - Cultural sensitivity - - Potential biases in terminology - - Respectful discussion of sensitive topics - - 6. Broader ethical considerations: Look for any other ethical issues that might not fit into the above categories but could be significant. This might include: - - Potential for dual use (benign and harmful applications) - - Environmental impacts of the research - - Societal implications of the technology or findings - - After your review, conclude with one of these statements: - - "This paper raises significant ethical concerns in the area(s) of [specific concern(s)]." OR - - "This paper does not raise significant ethical concerns." - - Provide a brief explanation for your conclusion, noting which specific guideline(s) informed your decision. - -template: | - Here is the proposal: {proposal} - - Here is the summary of the paper: {summary} - - Please conduct an ethical review of the above submission. 
diff --git a/configs/agent_prompt/write_review_score.yaml b/configs/agent_prompt/write_review_score.yaml index 77363877..5584e466 100644 --- a/configs/agent_prompt/write_review_score.yaml +++ b/configs/agent_prompt/write_review_score.yaml @@ -1,45 +1,52 @@ -fewshot_examples: -- "Here is the proposal: Our research introduces a novel explainable AI framework called XAI-Viz, designed to improve transparency in deep learning models used for medical image analysis. XAI-Viz generates visual explanations that highlight which parts of an image contributed to the model's decision, allowing clinicians to better understand the model’s reasoning process. The system incorporates a saliency mapping technique and a gradient-based attention mechanism to produce clear and interpretable visualizations. In a series of tests on medical imaging datasets, XAI-Viz consistently produced explanations that aligned well with expert annotations, demonstrating its potential to improve trust and reliability in AI-assisted diagnostics. - - Here is the summary of the paper: This paper introduces XAI-Viz, a framework aimed at enhancing the interpretability of deep learning models applied in medical image analysis. XAI-Viz generates visual explanations using saliency mapping and gradient-based attention, offering clinicians insights into how the model arrives at its decisions. The framework was evaluated on medical imaging datasets, and its explanations were found to align with expert opinions, suggesting that XAI-Viz can increase trust in AI-driven diagnostic tools by providing transparency and clarity. - - Here is the strength of the paper: The submission presents a groundbreaking contribution to the explainability of AI in medical image analysis. Key strengths include: (1) The innovative integration of explainability techniques like saliency mapping and gradient-based attention, making the system interpretable to clinicians. (2) The potential for significant real-world impact by enhancing trust in AI-assisted diagnostics, as XAI-Viz enables better clinician understanding of model decisions. (3) The alignment of visual explanations with expert annotations, indicating the system’s reliability and accuracy. (4) The focus on explainability, a critical need in healthcare, adds both social and practical value to the technological solution. - - Here is the weakness of the paper: While XAI-Viz demonstrates strong interpretability and alignment with expert annotations, it does not fully address potential concerns around computational complexity. The framework’s scalability in handling large datasets or more complex image modalities is also not discussed in depth, which might limit its applicability in broader clinical settings. - - Please begin scoring the submission." - -- "Based on the given information, I would give this submission a score of 8 out of 10." - -- "Here is the proposal: We present a new approach to autonomous driving using reinforcement learning techniques combined with advanced sensor fusion. Our method, DriveSmart, demonstrates improved decision-making in complex traffic scenarios and enhanced safety measures compared to existing autonomous driving systems. - - Here is the summary of the paper: The paper introduces DriveSmart, a novel autonomous driving system that leverages reinforcement learning and sensor fusion to improve decision-making and safety in complex traffic scenarios. 
- - Here is the strength of the paper: The submission offers a significant advancement in autonomous driving technology, with robust experimental results showing improved performance in challenging scenarios. The integration of reinforcement learning with sensor fusion is innovative and well-executed. - - Here is the weakness of the paper: The paper could benefit from more extensive real-world testing and a more thorough comparison with current industry standards. The computational requirements of the system are not adequately addressed, which raises questions about its practical implementation. - - Please begin scoring the submission." - -- "Based on the given information, I would give this submission a score of 7 out of 10." +fewshot_examples: [] sys_prompt: > You are an autonomous intelligent agent tasked to score the following submission. You should act as a professional and fair member of that conference to score. The score should be between 1 and 10, where 1 is the lowest and 10 is the highest. You will be provided with the following information: - Paper - Abstract of a submission to an academic conference. - Summary - A summary of the submission. + Paper - Full content of a submission to an academic conference. Strengths - Strengths of the submission. Weakness - Weakness of the submission. You should provide the following information: Score - A score between 1 to 10 to evaluate the overall quality of the submission to an academic journal. It should be one of 1, 2, ..., 10. 1 is the lowest score while 10 is the highest score. + + You should just provide one number as the score and nothing else. + + Please evaluate the submission based on the summarized strengths and weaknesses provided. The score should depend more heavily on the weaknesses. If a critical weakness exists in the submission, you should give a lower score. If the submission has a minor weakness, you can give a higher score. If the submission has no weakness, you should give a high score. But the strengths should also be considered in the evaluation. + You should use this format: Based on the given information, I would give this submission a score of [score] out of 10. Here [score] should be replaced with your score. template: | - Here is the proposal: {proposal} - Here is the summary of the paper: {summary} + Here is your profile: {bio} + Here is the strength of the paper: {strength} + Here is the weakness of the paper: {weakness} - Please begin scoring the submission. + Please refer to the rubrics below to evaluate the submission: + + 10/10: The submission is in the top 2% of all the papers. It changed my thinking on its topic, being one of the most thorough, convincing, and well-written papers I have ever read. + I will fight for this paper to be accepted. + + 8/10: The submission is among the top 10% of all the papers. It provides sufficient justification for all its arguments and claims. + Some extra experiments are needed, but they are not essential. + The proposed method is very original and it can also generalize to various fields. + This submission deepens the understanding of some phenomena, or lowers the barrier to future research on an existing problem. + + 6/10: The submission gives sufficient support for its major arguments or claims. + However, some minor points are not well justified and need extra support or details. + The proposed method is moderately original, and it is generalizable to various fields. + The submission itself is not particularly innovative, so it would not be a significant loss if it were not accepted. + + 5/10: Some of the major arguments or claims are not sufficiently justified. + There exist major weaknesses in technical or methodological aspects. + The proposed method is somewhat original, and it is generalizable to various fields. + I am more on the side of rejection, but I can be convinced otherwise. + + 3/10: The submission makes only marginal contributions to the field. + + 1/10: The submission is not sufficiently thorough for publication, or it is not relevant to the conference. + + You should not assume by default that the paper is acceptable. If the paper is really bad and has a critical weakness, you should give a low score like 3. If the paper is really good and has important strengths, we encourage you to give a high score like 9. + + Your score is: diff --git a/configs/agent_prompt/write_review_strength.yaml index b9995f61..371e1d33 100644 --- a/configs/agent_prompt/write_review_strength.yaml +++ b/configs/agent_prompt/write_review_strength.yaml @@ -1,31 +1,38 @@ -fewshot_examples: -- "Here is the proposal: We present a novel deep learning architecture, TransformerX, for natural language processing tasks. Our model achieves state-of-the-art performance on multiple benchmarks while requiring significantly less computational resources than existing models. +fewshot_examples: [] - Here is the summary of the paper: This paper introduces TransformerX, a new deep learning model for NLP tasks that offers superior performance with reduced computational demands compared to current state-of-the-art models. +sys_prompt: > + You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the strength of this paper. + You will be provided with the following information: + Submission - Full content of the submitted paper. - Please begin writing the strength of the submission." + You should provide the following information: + Strength - Advantages and strengths of the submission that can improve its chances to be accepted. -- "Strength: The submission demonstrates significant innovation in NLP modeling by introducing TransformerX, which achieves two critical advancements: (1) state-of-the-art performance across multiple benchmarks, indicating broad applicability and effectiveness, and (2) notably reduced computational resource requirements, addressing a major challenge in current deep learning approaches. This combination of improved performance and efficiency has the potential to make advanced NLP capabilities more accessible and practical for a wider range of applications and researchers." -- "Here is the proposal: Our research introduces a novel explainable AI framework called XAI-Viz, designed to improve transparency in deep learning models used for medical image analysis. XAI-Viz generates visual explanations that highlight which parts of an image contributed to the model's decision, allowing clinicians to better understand the model’s reasoning process. The system incorporates a saliency mapping technique and a gradient-based attention mechanism to produce clear and interpretable visualizations. In a series of tests on medical imaging datasets, XAI-Viz consistently produced explanations that aligned well with expert annotations, demonstrating its potential to improve trust and reliability in AI-assisted diagnostics. 
+template: | + Here is your profile: {bio} - Here is the summary of the paper: This paper introduces XAI-Viz, a framework aimed at enhancing the interpretability of deep learning models applied in medical image analysis. XAI-Viz generates visual explanations using saliency mapping and gradient-based attention, offering clinicians insights into how the model arrives at its decisions. The framework was evaluated on medical imaging datasets, and its explanations were found to align with expert opinions, suggesting that XAI-Viz can increase trust in AI-driven diagnostic tools by providing transparency and clarity. + Here is the submission: {proposal} - Please begin writing the strength of the submission." + Here are the abstracts of the cited papers: {citations} -- "Strength: The submission presents a groundbreaking contribution to the explainability of AI in medical image analysis. Key strengths include: (1) The innovative integration of explainability techniques like saliency mapping and gradient-based attention, making the system interpretable to clinicians. (2) The potential for significant real-world impact by enhancing trust in AI-assisted diagnostics, as XAI-Viz enables better clinician understanding of model decisions. (3) The alignment of visual explanations with expert annotations, indicating the system’s reliability and accuracy. (4) The focus on explainability, a critical need in healthcare, adds both social and practical value to the technological solution." + Please evaluate the submission based on the following criteria: sys_prompt: > - You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the strength of this paper. - You will be provided with the following information: - Submission - Abstract of the submitted paper. + Clarity: Is the writing clear, structured, and terms defined? + Baselines: Are baseline comparisons relevant, sufficient, and not excessive? + Novelty: Is the approach innovative or distinct from prior work? + Results: Are improvements significant, well-supported, and statistically robust? + Limitations: Are weaknesses acknowledged and future work discussed? + Related Work: Are key references cited and connections made? + Technical: Are methods detailed enough for replication? - You should provide the following information: - Strength - Advantages and strengths of the submission that can improve its chances to be accepted. + Please combine both the ideas and the experiments in the submission when evaluating it. + When commenting on the experiments, refer to the exact numbers from the experiments. -template: | - Here is the proposal: {proposal} + Please begin writing the strength of the submission. It should be about 200 words long. - Here is the summary of the paper: {summary} + Please write in bullet points. Do not limit yourself to the aforementioned criteria (clarity, baselines, novelty, results, limitations, related work, technical). - Please begin writing the strength of the submission. + You should also use your previous experience in your profile when analyzing the submission. diff --git a/configs/agent_prompt/write_review_summary.yaml deleted file mode 100644 index 39e7b58c..00000000 --- a/configs/agent_prompt/write_review_summary.yaml +++ /dev/null @@ -1,25 +0,0 @@ -fewshot_examples: -- "Here is the proposal: We present a novel deep learning architecture, TransformerX, for natural language processing tasks. 
Our model achieves state-of-the-art performance on multiple benchmarks while requiring significantly less computational resources than existing models. TransformerX incorporates a new attention mechanism and sparse activation function, enabling efficient processing of long sequences. We demonstrate its effectiveness on tasks including machine translation, text summarization, and sentiment analysis, consistently outperforming current leading models while using 30% less computing power and memory. - - Please begin writing the summary of the submission." - -- "Summary: This paper introduces TransformerX, an innovative deep learning architecture for natural language processing. The model's key features include a novel attention mechanism and sparse activation function, which enable it to process long sequences more efficiently. TransformerX achieves state-of-the-art performance across various NLP tasks such as machine translation, text summarization, and sentiment analysis. Notably, it accomplishes this while using 30% less computational resources compared to existing top-performing models, addressing the critical issue of efficiency in deep learning approaches for NLP." - -- "Here is the proposal: Our research introduces a novel explainable AI framework called XAI-Viz, designed to improve transparency in deep learning models used for medical image analysis. XAI-Viz generates visual explanations that highlight which parts of an image contributed to the model's decision, allowing clinicians to better understand the model’s reasoning process. The system incorporates a saliency mapping technique and a gradient-based attention mechanism to produce clear and interpretable visualizations. In a series of tests on medical imaging datasets, XAI-Viz consistently produced explanations that aligned well with expert annotations, demonstrating its potential to improve trust and reliability in AI-assisted diagnostics. - - Please begin writing the summary of the submission." - -- "Summary: This paper introduces XAI-Viz, a framework aimed at enhancing the interpretability of deep learning models applied in medical image analysis. XAI-Viz generates visual explanations using saliency mapping and gradient-based attention, offering clinicians insights into how the model arrives at its decisions. The framework was evaluated on medical imaging datasets, and its explanations were found to align with expert opinions, suggesting that XAI-Viz can increase trust in AI-driven diagnostic tools by providing transparency and clarity." - -sys_prompt: > - You are an autonomous intelligent agent tasked to review a submission to an academic conference. To help you and other reviewers, you should write a summary of the submission. - You will be provided with the following information: - Submission - Abstract of the submitted paper. - - You should provide the following information: - Summary - A summary of the submission. - -template: | - Here is the proposal: {proposal} - - Please begin writing the summary of the submission. diff --git a/configs/agent_prompt/write_review_weakness.yaml b/configs/agent_prompt/write_review_weakness.yaml index b942606a..09496b59 100644 --- a/configs/agent_prompt/write_review_weakness.yaml +++ b/configs/agent_prompt/write_review_weakness.yaml @@ -1,30 +1,37 @@ -fewshot_examples: -- "Here is the proposal: We present a novel deep learning architecture, TransformerX, for natural language processing tasks. 
Our model achieves state-of-the-art performance on multiple benchmarks while requiring significantly less computational resources than existing models. TransformerX incorporates a new attention mechanism and sparse activation function, enabling efficient processing of long sequences. We demonstrate its effectiveness on tasks including machine translation, text summarization, and sentiment analysis, consistently outperforming current leading models while using 30% less computing power and memory. +fewshot_examples: [] - Here is the summary of the paper: This paper introduces TransformerX, an innovative deep learning architecture for natural language processing. The model's key features include a novel attention mechanism and sparse activation function, which enable it to process long sequences more efficiently. TransformerX achieves state-of-the-art performance across various NLP tasks such as machine translation, text summarization, and sentiment analysis. Notably, it accomplishes this while using 30% less computational resources compared to existing top-performing models, addressing the critical issue of efficiency in deep learning approaches for NLP. +sys_prompt: > + You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the weaknesses of this paper. + You will be provided with the following information: + Submission - Full content of the submitted paper. - Please begin writing the weakness of the submission." + You should provide the following information: + Weakness - Disadvantages and drawbacks of the submission that must be improved before it can be accepted. You should notice that the abstract might not cover every detail, so you shouldn't be overly strict. -- "Weakness: Although TransformerX shows significant improvement in efficiency and performance, the paper lacks detailed comparison with other transformer variants, such as Transformer-XL or Longformer, that specifically address the issue of long-sequence processing. Additionally, there is no discussion on the model's potential limitations in lower-resource languages or domains with less training data. The abstract does not mention how TransformerX generalizes across various datasets, which might limit the scope of its claims." +template: | + Here is your profile: {bio} -- "Here is the proposal: Our research introduces a novel explainable AI framework called XAI-Viz, designed to improve transparency in deep learning models used for medical image analysis. XAI-Viz generates visual explanations that highlight which parts of an image contributed to the model's decision, allowing clinicians to better understand the model’s reasoning process. The system incorporates a saliency mapping technique and a gradient-based attention mechanism to produce clear and interpretable visualizations. In a series of tests on medical imaging datasets, XAI-Viz consistently produced explanations that aligned well with expert annotations, demonstrating its potential to improve trust and reliability in AI-assisted diagnostics. + Here is the submission: {proposal} - Here is the summary of the paper: This paper introduces XAI-Viz, a framework aimed at enhancing the interpretability of deep learning models applied in medical image analysis. XAI-Viz generates visual explanations using saliency mapping and gradient-based attention, offering clinicians insights into how the model arrives at its decisions. 
The framework was evaluated on medical imaging datasets, and its explanations were found to align with expert opinions, suggesting that XAI-Viz can increase trust in AI-driven diagnostic tools by providing transparency and clarity. + Here are the abstracts of the cited papers: {citations} - Please begin writing the weakness of the submission." + Please evaluate the submission based on the following criteria: -- "Weakness: Although XAI-Viz improves model interpretability, the abstract does not mention how the system handles noisy or ambiguous data, which is common in medical imaging. Furthermore, the abstract lacks details on the computational overhead introduced by generating these visual explanations, which could be a concern in real-time diagnostic settings. The framework's alignment with expert annotations is promising, but there is no discussion about how XAI-Viz would generalize across different types of medical images or varying imaging modalities, which may limit its broader applicability." + Clarity: Is the writing clear, structured, and terms defined? + Baselines: Are baseline comparisons relevant, sufficient, and not excessive? + Novelty: Is the approach innovative or distinct from prior work? + Results: Are improvements significant, well-supported, and statistically robust? + Limitations: Are weaknesses acknowledged and future work discussed? + Related Work: Are key references cited and connections made? + Technical: Are methods detailed enough for replication? -sys_prompt: > - You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the weaknesses of this paper. - You will be provided with the following information: - Submission - Abstract of the submitted paper. + Please combine both the ideas and the experiments in the submission when evaluating it. + When commenting on the experiments, refer to the exact numbers from the experiments. - You should provide the following information: - Weakness - Disadvantages and drawbacks of the submission that must be improved before it can be accepted. You should notice that the abstract might not cover every detail, so you shouldn't be overly strict. -template: | - Here is the proposal: {proposal} + Please begin writing the weakness of the submission. It should be about 200 words long. - Here is the summary of the paper: {summary} + Please write in bullet points. Do not limit yourself to the aforementioned criteria (clarity, baselines, novelty, results, limitations, related work, technical). - Please begin writing the weakness of the submission. + You should also use your previous experience in your profile when analyzing the submission. 
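For context, the reworked agent prompts above are plain YAML: a `sys_prompt` string plus a `template` containing `{bio}`, `{proposal}`, `{citations}`, or `{reviews}` placeholders. A minimal sketch of how one of these templates might be rendered into chat messages, assuming PyYAML and illustrative placeholder values (this is not the repository's actual prompt loader):

```python
import yaml  # PyYAML, assumed available

# Hypothetical rendering of the reworked write_review_strength prompt; this is
# an illustration, not the repository's actual prompt-loading code.
with open('configs/agent_prompt/write_review_strength.yaml') as f:
    cfg = yaml.safe_load(f)

# Placeholder names (bio, proposal, citations) come from the template above;
# the values below are illustrative only.
user_prompt = cfg['template'].format(
    bio='Reviewer profile: expertise in efficient NLP architectures.',
    proposal='Full content of the submitted paper...',
    citations='Abstracts of the cited papers...',
)
messages = [
    {'role': 'system', 'content': cfg['sys_prompt']},
    {'role': 'user', 'content': user_prompt},
]
```

Since `fewshot_examples` is now `[]` in these files, the rendered conversation consists only of the system prompt and this single user turn.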
diff --git a/configs/param.yaml b/configs/param.yaml index 94bef813..2a9a952c 100644 --- a/configs/param.yaml +++ b/configs/param.yaml @@ -5,7 +5,7 @@ related_paper_num: 5 return_num: 1 reviewer_num: 1 stream: null -temperature: 0.6 +temperature: 0.0 top_p: null write_proposal_strategy: default max_env_run_num: 1 diff --git a/research_bench/create_bench_from_conference.py b/research_bench/create_bench_from_conference.py index 4259101a..568f1e8b 100644 --- a/research_bench/create_bench_from_conference.py +++ b/research_bench/create_bench_from_conference.py @@ -150,4 +150,4 @@ def main() -> None: if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/research_bench/eval.py b/research_bench/eval.py index 9d347b9b..66a3340f 100644 --- a/research_bench/eval.py +++ b/research_bench/eval.py @@ -1,13 +1,13 @@ import re -from typing import Dict, List +from typing import Any, Dict, List import nltk import numpy as np -import voyageai from bert_score import score from litellm import embedding from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu from rouge_score import rouge_scorer +from voyageai.client import Client from research_town.utils.model_prompting import model_prompting @@ -102,7 +102,7 @@ def compute_openai_embedding_similarity(reference: str, hypothesis: str) -> floa def compute_voyageai_embedding_similarity(reference: str, hypothesis: str) -> float: - vo = voyageai.Client() + vo = Client() try: response_ref = vo.embed( model='voyage-3', texts=[reference], input_type='document' @@ -182,6 +182,36 @@ def compute_openai_embedding_similarity_per_question( return [0.0] * len(questions) +def compute_openai_embedding_similarity_per_section( + reference: str, hypothesis: str +) -> List[float]: + try: + sections = [ + 'Strength -', + 'Weakness -', + ] + + ref_sections = extract_and_clean_question_content(reference, sections) + hyp_sections = extract_and_clean_question_content(hypothesis, sections) + + similarities = [] + + for ref_text, hyp_text in zip(ref_sections, hyp_sections): + if not ref_text or not hyp_text: + print(f'Empty section: {ref_text} vs {hyp_text}') + similarities.append(0.0) + continue + + cosine_sim = compute_openai_embedding_similarity(ref_text, hyp_text) + similarities.append(float(cosine_sim)) + + return similarities + + except Exception as e: + print(f'Error computing embedding similarity per section: {e}') + return [0.0] * len(sections) + + def compute_voyageai_embedding_similarity_per_question( reference: str, hypothesis: str ) -> List[float]: @@ -215,6 +245,36 @@ def compute_voyageai_embedding_similarity_per_question( return [0.0] * len(questions) +def compute_voyageai_embedding_similarity_per_section( + reference: str, hypothesis: str +) -> List[float]: + try: + sections = [ + 'Strength -', + 'Weakness -', + ] + + ref_sections = extract_and_clean_question_content(reference, sections) + hyp_sections = extract_and_clean_question_content(hypothesis, sections) + + similarities = [] + + for ref_text, hyp_text in zip(ref_sections, hyp_sections): + if not ref_text or not hyp_text: + print(f'Empty section: {ref_text} vs {hyp_text}') + similarities.append(0.0) + continue + + cosine_sim = compute_voyageai_embedding_similarity(ref_text, hyp_text) + similarities.append(float(cosine_sim)) + + return similarities + + except Exception as e: + print(f'Error computing embedding similarity per section: {e}') + return [0.0] * len(sections) + + def compute_proposal_metrics(reference: str, generation: str) -> Dict[str, float]: bleu = 
compute_bleu(reference, generation) rouge_l = compute_rouge_l(reference, generation) @@ -249,19 +309,187 @@ def compute_proposal_metrics(reference: str, generation: str) -> Dict[str, float] } -def compute_review_metrics(reference: str, generation: str) -> Dict[str, float]: - bleu = compute_bleu(reference, generation) - rouge_l = compute_rouge_l(reference, generation) - bert_score = compute_bertscore(reference, generation) - gpt_metric = compute_review_gpt_metric(reference, generation) - openai_sim = compute_openai_embedding_similarity(reference, generation) - voyageai_sim = compute_voyageai_embedding_similarity(reference, generation) - - return { - 'bleu': bleu, - 'rouge_l': rouge_l, - 'gpt_metric_score': gpt_metric, - 'bert_score': bert_score, - 'openai_sim': openai_sim, - 'voyageai_sim': voyageai_sim, +def compute_review_metrics( + strengths: List[str], + weaknesses: List[str], + generated_strength: str, + generated_weakness: str, +) -> Dict[str, Any]: + metrics_raw: Dict[str, Any] = { + # 'bleu': [], + # 'rouge_l': [], + # 'bert_score': [], + # 'openai_sim': [], + # 'voyageai_sim': [], + # 'openai_sim_strength': [], + # 'openai_sim_weakness': [], + # 'voyageai_sim_strength': [], + # 'voyageai_sim_weakness': [], + 'openai_strength': 0.0, + 'openai_weakness': 0.0, + 'voyageai_strength': 0.0, + 'voyageai_weakness': 0.0, + 'openai_sim_strengths': [], + 'openai_sim_weaknesses': [], + 'voyageai_sim_strengths': [], + 'voyageai_sim_weaknesses': [], + 'generated_strength_matchings_openai': [], + 'generated_weakness_matchings_openai': [], + 'generated_strength_matchings_voyageai': [], + 'generated_weakness_matchings_voyageai': [], } + # for each generated strength and weakness + # find most similar strength or weakness + # put it in openai_sim_strengths or openai_sim_weaknesses, or voyageai_sim_strengths or voyageai_sim_weaknesses + # then compute the average similarity + + openai_sim_strengths = [] + openai_sim_weaknesses = [] + voyageai_sim_strengths = [] + voyageai_sim_weaknesses = [] + + all_strength_openai_embeddings = [] + all_weakness_openai_embeddings = [] + all_strength_voyageai_embeddings = [] + all_weakness_voyageai_embeddings = [] + generated_strength_matchings_openai = [] + generated_weakness_matchings_openai = [] + generated_strength_matchings_voyageai = [] + generated_weakness_matchings_voyageai = [] + vo = Client() + + # as a precaution, remove all ''s and Nones from strengths and weaknesses + strengths = [strength for strength in strengths if strength] + weaknesses = [weakness for weakness in weaknesses if weakness] + + for strength in strengths: + response = embedding(model='text-embedding-3-large', input=[strength]) + embedding_strength = response['data'][0]['embedding'] + all_strength_openai_embeddings.append(embedding_strength) + + response = vo.embed(model='voyage-3', texts=[strength], input_type='document') + embedding_strength = response.embeddings[0] + all_strength_voyageai_embeddings.append(embedding_strength) + + for weakness in weaknesses: + response = embedding(model='text-embedding-3-large', input=[weakness]) + embedding_weakness = response['data'][0]['embedding'] + all_weakness_openai_embeddings.append(embedding_weakness) + + response = vo.embed(model='voyage-3', texts=[weakness], input_type='document') + embedding_weakness = response.embeddings[0] + all_weakness_voyageai_embeddings.append(embedding_weakness) + + for generated_strength in generated_strength.split('\n'): + if generated_strength: + response = embedding( + model='text-embedding-3-large', 
input=[generated_strength] + ) + embedding_strength = response['data'][0]['embedding'] + max_sim = -1.0 + max_idx = -1 + for i, strength in enumerate(all_strength_openai_embeddings): + cosine_sim = np.dot(embedding_strength, strength) / ( + np.linalg.norm(embedding_strength) * np.linalg.norm(strength) + ) + if cosine_sim > max_sim: + max_sim = cosine_sim + max_idx = i + openai_sim_strengths.append(max_sim) + generated_strength_matchings_openai.append( + { + 'generated': generated_strength, + 'matched': strengths[max_idx], + 'similarity': max_sim, + } + ) + + response = vo.embed( + model='voyage-3', texts=[generated_strength], input_type='document' + ) + embedding_strength = response.embeddings[0] + max_sim = -1.0 + max_idx = -1 + for i, strength in enumerate(all_strength_voyageai_embeddings): + cosine_sim = np.dot(embedding_strength, strength) / ( + np.linalg.norm(embedding_strength) * np.linalg.norm(strength) + ) + if cosine_sim > max_sim: + max_sim = cosine_sim + max_idx = i + voyageai_sim_strengths.append(max_sim) + generated_strength_matchings_voyageai.append( + { + 'generated': generated_strength, + 'matched': strengths[max_idx], + 'similarity': max_sim, + } + ) + + for generated_weakness in generated_weakness.split('\n'): + if generated_weakness: + response = embedding( + model='text-embedding-3-large', input=[generated_weakness] + ) + embedding_weakness = response['data'][0]['embedding'] + max_sim = -1.0 + max_idx = -1 + for i, weakness in enumerate(all_weakness_openai_embeddings): + cosine_sim = np.dot(embedding_weakness, weakness) / ( + np.linalg.norm(embedding_weakness) * np.linalg.norm(weakness) + ) + if cosine_sim > max_sim: + max_sim = cosine_sim + max_idx = i + openai_sim_weaknesses.append(max_sim) + generated_weakness_matchings_openai.append( + { + 'generated': generated_weakness, + 'matched': weaknesses[max_idx], + 'similarity': max_sim, + } + ) + + response = vo.embed( + model='voyage-3', texts=[generated_weakness], input_type='document' + ) + embedding_weakness = response.embeddings[0] + max_sim = -1.0 + max_idx = -1 + for i, weakness in enumerate(all_weakness_voyageai_embeddings): + cosine_sim = np.dot(embedding_weakness, weakness) / ( + np.linalg.norm(embedding_weakness) * np.linalg.norm(weakness) + ) + if cosine_sim > max_sim: + max_sim = cosine_sim + max_idx = i + voyageai_sim_weaknesses.append(max_sim) + generated_weakness_matchings_voyageai.append( + { + 'generated': generated_weakness, + 'matched': weaknesses[max_idx], + 'similarity': max_sim, + } + ) + + metrics_raw['openai_sim_strengths'] = openai_sim_strengths + metrics_raw['openai_sim_weaknesses'] = openai_sim_weaknesses + metrics_raw['voyageai_sim_strengths'] = voyageai_sim_strengths + metrics_raw['voyageai_sim_weaknesses'] = voyageai_sim_weaknesses + metrics_raw['openai_strength'] = np.mean(openai_sim_strengths).item() + metrics_raw['openai_weakness'] = np.mean(openai_sim_weaknesses).item() + metrics_raw['voyageai_strength'] = np.mean(voyageai_sim_strengths).item() + metrics_raw['voyageai_weakness'] = np.mean(voyageai_sim_weaknesses).item() + metrics_raw['generated_strength_matchings_openai'] = ( + generated_strength_matchings_openai + ) + metrics_raw['generated_weakness_matchings_openai'] = ( + generated_weakness_matchings_openai + ) + metrics_raw['generated_strength_matchings_voyageai'] = ( + generated_strength_matchings_voyageai + ) + metrics_raw['generated_weakness_matchings_voyageai'] = ( + generated_weakness_matchings_voyageai + ) + return metrics_raw diff --git a/backend/sample_data/manifest.json 
b/research_bench/manifest.json similarity index 91% rename from backend/sample_data/manifest.json rename to research_bench/manifest.json index cdccde5c..6f60ee1f 100644 --- a/backend/sample_data/manifest.json +++ b/research_bench/manifest.json @@ -1,22 +1,22 @@ { "namespaces": [ - "ReviewWritingLog", + "Profile", "Paper", - "IdeaBrainstormLog", - "LiteratureReviewLog", - "Review", "MetaReview", - "MetaReviewWritingLog", - "Proposal", + "Review", "Idea", - "Rebuttal", - "Profile", + "Proposal", "Insight", + "MetaReviewWritingLog", "RebuttalWritingLog", + "LiteratureReviewLog", + "Rebuttal", + "IdeaBrainstormLog", + "ReviewWritingLog", "ProposalWritingLog" ], "embedding_namespaces": [ - "Profile", - "Paper" + "Paper", + "Profile" ] -} +} \ No newline at end of file diff --git a/research_bench/proposal_writing.py b/research_bench/proposal_writing.py index 38ff556b..447325dd 100644 --- a/research_bench/proposal_writing.py +++ b/research_bench/proposal_writing.py @@ -118,7 +118,9 @@ def write_proposal_with_only_profiles(profiles: List[Profile], config: Config) - ), } ] - response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + response = model_prompting( + config.param.base_llm, prompt, max_token_num=config.param.max_token_num + )[0] return response @@ -153,7 +155,9 @@ def write_proposal_with_only_citations(ref_contents: List[str], config: Config) ), } ] - response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + response = model_prompting( + config.param.base_llm, prompt, max_token_num=config.param.max_token_num + )[0] return response @@ -192,7 +196,9 @@ def write_proposal_with_profiles_and_citations( ), } ] - response = model_prompting(config.param.base_llm, prompt, max_token_num=config.param.max_token_num)[0] + response = model_prompting( + config.param.base_llm, prompt, max_token_num=config.param.max_token_num + )[0] return response @@ -288,7 +294,9 @@ def write_proposal_sakana_ai_scientist( conversation.append({'role': 'user', 'content': idea_first_prompt}) - response = model_prompting(config.param.base_llm, conversation, max_token_num=config.param.max_token_num)[0] + response = model_prompting( + config.param.base_llm, conversation, max_token_num=config.param.max_token_num + )[0] conversation.append({'role': 'assistant', 'content': response}) for current_round in range(1, num_reflections + 1): @@ -297,21 +305,30 @@ def write_proposal_sakana_ai_scientist( ) conversation.append({'role': 'user', 'content': formatted_reflection_prompt}) - response = model_prompting(config.param.base_llm, conversation, max_token_num=config.param.max_token_num)[0] - + response = model_prompting( + config.param.base_llm, + conversation, + max_token_num=config.param.max_token_num, + )[0] + conversation.append({'role': 'assistant', 'content': response}) if 'I am done' in response: break - if 'I am done' in conversation[-1]['content'] and "[Question 1]" not in conversation[-1]['content']: + if ( + 'I am done' in conversation[-1]['content'] + and '[Question 1]' not in conversation[-1]['content'] + ): if 'NEW IDEA:' in conversation[-2]['content']: return conversation[-2]['content'].split('NEW IDEA:')[1] else: return conversation[-2]['content'] else: if 'NEW IDEA:' in conversation[-1]['content']: - return conversation[-1]['content'].split('NEW IDEA:')[1].split('I am done')[0] + return ( + conversation[-1]['content'].split('NEW IDEA:')[1].split('I am done')[0] + ) else: return conversation[-1]['content'].split('I am done')[0] 
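The matching metric added to `research_bench/eval.py` above pairs each generated review bullet with its most similar reference bullet via cosine similarity and reports the mean of the per-bullet maxima. A minimal self-contained sketch of that greedy matching, assuming embeddings are precomputed; `embed` here is a hypothetical stand-in for the OpenAI `embedding(...)` / VoyageAI `vo.embed(...)` calls used in the actual code:

```python
from typing import Any, Callable, Dict, List, Tuple

import numpy as np


def greedy_match_score(
    generated: List[str],
    reference: List[str],
    embed: Callable[[str], np.ndarray],  # hypothetical stand-in for the embedding API
) -> Tuple[float, List[Dict[str, Any]]]:
    """Match each generated bullet to its nearest reference bullet by cosine similarity."""
    if not reference:
        return 0.0, []
    ref_vecs = [embed(r) for r in reference]
    sims: List[float] = []
    matchings: List[Dict[str, Any]] = []
    for g in generated:
        g_vec = embed(g)
        cos = [
            float(np.dot(g_vec, r) / (np.linalg.norm(g_vec) * np.linalg.norm(r)))
            for r in ref_vecs
        ]
        best = int(np.argmax(cos))
        sims.append(cos[best])
        matchings.append(
            {'generated': g, 'matched': reference[best], 'similarity': cos[best]}
        )
    # mean of the per-bullet maxima, mirroring compute_review_metrics above
    return (float(np.mean(sims)) if sims else 0.0), matchings
```

`compute_review_metrics` applies this matching once per embedding backend and once each for strengths and weaknesses, which is why it keeps four similarity lists and four matching lists.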
diff --git a/research_bench/review_writing.py b/research_bench/review_writing.py
new file mode 100644
index 00000000..4b409f2c
--- /dev/null
+++ b/research_bench/review_writing.py
@@ -0,0 +1,716 @@
+"""
+Review Writing Process Evaluation
+Input: Real-world Papers
+Process: Match Reviewers to Papers with Similar Interests
+Evaluate Reviewers' Reviews using Similarity Metrics
+Output: Reviewers' Similarity Scores
+"""
+
+from typing import Any, Dict, List, Tuple
+
+from litellm.utils import token_counter
+
+from research_town.agents import AgentManager
+from research_town.configs import Config
+from research_town.data import Profile, Proposal, Review
+from research_town.dbs import LogDB, PaperDB, ProfileDB, ProgressDB
+from research_town.envs import ReviewWritingEnv
+from research_town.utils.model_prompting import model_prompting
+
+
+def write_review_research_town(
+    paper_content: str,
+    profiles_reviewers: List[Profile],
+    ref_contents: List[str],
+    config: Config,
+    top_k_reviewers: int = 5,
+) -> Tuple[str, str, List[int], Dict[str, Dict[str, Any]]]:
+    log_db = LogDB(config=config.database)
+    progress_db = ProgressDB(config=config.database)
+    paper_db = PaperDB(config=config.database)
+    profile_db = ProfileDB(config=config.database)
+    agent_manager = AgentManager(config=config.param, profile_db=profile_db)
+
+    env = ReviewWritingEnv(
+        name='review_writing',
+        log_db=log_db,
+        progress_db=progress_db,
+        paper_db=paper_db,
+        config=config,
+        agent_manager=agent_manager,
+    )
+
+    # chair_profile = profile_db.get(name=profiles[0].name)[0]
+    chair_profile = profiles_reviewers[0]
+    if not chair_profile:
+        raise ValueError('Failed to create chair agent: no chair profile available')
+    chair = agent_manager.create_agent(chair_profile, role='chair')
+
+    reviewers = [
+        agent_manager.create_agent(profile, role='reviewer')
+        for profile in profiles_reviewers[0:top_k_reviewers]
+    ]
+
+    ref_contents = [ref if ref else '' for ref in ref_contents]
+
+    env.on_enter(
+        leader=None,
+        chair=chair,
+        reviewers=reviewers,
+        proposals=[Proposal(content=paper_content, citations=ref_contents)],
+    )
+
+    run_result = env.run()
+
+    strength: str = ''
+    weakness: str = ''
+    scores: List[int] = []
+
+    review_per_reviewer = {}
+
+    if run_result is not None:
+        for progress, agent in run_result:
+            if isinstance(progress, Review):
+                assert progress.score is not None
+                scores.append(progress.score)
+                agent_name = agent.profile.name
+                assert progress.strength is not None
+                assert progress.weakness is not None
+                strength = progress.strength
+                weakness = progress.weakness
+                review_per_reviewer[agent_name] = {
+                    'strength': strength,
+                    'weakness': weakness,
+                    'score': progress.score,
+                }
+
+    exit_status, exit_dict = env.on_exit()
+    # s_meta_review = f"{exit_dict.get('metareviews', '')}"
+    return strength, weakness, scores, review_per_reviewer
+
+
+# Baselines
+def write_review_zero_shot(
+    paper_content: str, config: Config
+) -> Tuple[str, str, List[int], Dict[str, Dict[str, Any]]]:
+    prompt = [
+        {
+            'role': 'system',
+            'content': (
+                'You are an autonomous intelligent agent tasked to review a submission to an academic conference.
You should write the strength of this paper.\n' + 'You will be provided with the following information:\n' + 'Submission - Full content of the submitted paper.\n\n' + 'You should provide the following information:\n' + 'Strength - Advantages and strengths of the submission that can improve its chances to be accepted.\n\n' + ), + }, + { + 'role': 'user', + 'content': ( + f'Here is the submission: {paper_content}\n\n' + 'Please evaluate the submission based on the following criteria:\n\n' + 'Clarity: Is the writing clear, structured, and terms defined?\n' + 'Baselines: Are baseline comparisons relevant, sufficient, and not excessive?\n' + 'Novelty: Is the approach innovative or distinct from prior work?\n' + 'Results: Are improvements significant, well-supported, and statistically robust?\n' + 'Limitations: Are weaknesses acknowledged and future work discussed?\n' + 'Related Work: Are key references cited and connections made?\n' + 'Technical: Are methods detailed enough for replication?\n\n' + 'Please combine both the ideas and the experiments in the submission when evaluating it.\n' + 'When commenting on the experiments, refer to the exact numbers from the experiments.\n\n' + 'Write the strength in 200 words.\n\n' + 'Please begin writing the strength of the submission. It should be 200 words long.\n\n' + 'Please write in bullet points. Do not limit yourself to the aforementioned criteria, like clarity, baselines, novelty, results, limitations, related work, and technical.\n\n' + 'You should also use your previous experience in your profile when analyzing the submission.' + ), + }, + ] + response = model_prompting( + config.param.base_llm, + prompt, + max_token_num=config.param.max_token_num, + temperature=config.param.temperature, + )[0] + strength = response + + prompt = [ + { + 'role': 'system', + 'content': ( + 'You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the weakness of this paper.\n' + 'You will be provided with the following information:\n' + 'Submission - Full content of the submitted paper.\n\n' + 'You should provide the following information:\n' + "Weakness - Disadvantages and drawbacks of the submission that must be improved before it can be accepted. You should notice that the abstract might not cover every detail, so you shouldn't be overly strict.\n\n" + ), + }, + { + 'role': 'user', + 'content': ( + f'Here is the submission: {paper_content}\n\n' + 'Please evaluate the submission based on the following criteria:\n' + 'Clarity: Is the writing clear, structured, and terms defined?\n' + 'Baselines: Are baseline comparisons relevant, sufficient, and not excessive?\n' + 'Novelty: Is the approach innovative or distinct from prior work?\n' + 'Results: Are improvements significant, well-supported, and statistically robust?\n' + 'Limitations: Are weaknesses acknowledged and future work discussed?\n' + 'Related Work: Are key references cited and connections made?\n' + 'Technical: Are methods detailed enough for replication?\n\n' + 'Please combine both the ideas and the experiments in the submission when evaluating it.\n' + 'When commenting on the experiments, refer to the exact numbers from the experiments.\n\n' + 'Write the weakness in 200 words.\n' + 'Please begin writing the weakness of the submission. It should be 200 words long.\n\n' + 'Please write in bullet points. 
Do not limit yourself to the aforementioned criteria, like clarity, baselines, novelty, results, limitations, related work, and technical.\n\n' + 'You should also use your previous experience in your profile when analyzing the submission.' + ), + }, + ] + response = model_prompting( + config.param.base_llm, + prompt, + max_token_num=config.param.max_token_num, + temperature=config.param.temperature, + )[0] + weakness = response + + prompt = [ + { + 'role': 'system', + 'content': ( + 'You are an autonomous intelligent agent tasked to score the following submission. You should act as a professional and fair member of that conference to score. ' + 'The score should be between 1 and 10, where 1 is the lowest and 10 is the highest.\n' + 'You will be provided with the following information:\n' + 'Paper - Full content of a submission to an academic conference.\n' + 'Strengths - Strengths of the submission.\n' + 'Weakness - Weakness of the submission.\n' + 'You should provide the following information:\n' + 'Score - A score between 1 to 10 to evaluate the overall quality of the submission to an academic journal. It should be one of 1, 2, ..., 10. 1 is the lowest score while 10 is the highest score.\n\n' + 'Please evaluate the submission based on the summarized strengths and weaknesses provided. The score should be more related to weaknesses. ' + 'If there is a critical weakness in the submission, you should give a lower score. If the submission has a minor weakness, you can give a higher score. ' + 'If the submission has no weaknesses, you should give a high score. But the strengths should also be considered in the evaluation.\n\n' + 'You should use this format:\n' + 'Based on the given information, I would give this submission a score of [score] out of 10.\n' + 'Here [score] should be replaced with your score.' + ), + }, + { + 'role': 'user', + 'content': ( + f'Here is the strength of the paper: {strength}\n\n' + f'Here is the weakness of the paper: {weakness}\n\n' + 'Please refer to the rubrics below to evaluate the submission:\n\n' + '10/10: The submission is in 2% of all the papers. It changed my thinking on its topic, being one of the most thorough, convincing, and well-written papers I have ever read. ' + 'I will fight for this paper to be accepted.\n\n' + '8/10: The submission is among the top 10% of all the papers. It provides sufficient justification for all its arguments and claims. ' + 'Some extra experimentation is needed, but they are not essential. ' + 'The proposed method is very original and it can also generalize to various fields. ' + 'This submission deepens the understanding of some phenomena, or lowers the bar for future research on an existing problem.\n\n' + '6/10: The submission gives sufficient support for its major arguments or claims. ' + 'However, some minor points are not well justified and need extra support, or details. ' + 'The proposed method is moderately original, and it is generalizable to various fields. ' + 'The submission itself is not particularly innovative, so it would not be a significant loss if it were not accepted.\n\n' + '5/10: Some of the major arguments or claims are not sufficiently justified. ' + 'There exist major weaknesses in technical, or methodological aspects. ' + 'The proposed method is somewhat original, and it is generalizable to various fields. 
'
+                'I am more on the side of rejection, but I can be convinced otherwise.\n\n'
+                '3/10: The submission makes only marginal contributions to the field.\n\n'
+                '1/10: The submission is not sufficiently thorough for publication. Or it is not relevant to the conference.\n\n'
+                'You should not always consider the paper generally as an acceptable one. If the paper is really bad and has critical weaknesses, you should give a low score like 3. '
+                'If the paper is really good and has important strengths, we encourage you to give a high score like 9.\n\n'
+                'Your score is: '
+            ),
+        },
+    ]
+
+    response = model_prompting(
+        config.param.base_llm,
+        prompt,
+        max_token_num=config.param.max_token_num,
+        temperature=config.param.temperature,
+    )[0]
+    score_options = ['10', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+    score = 0
+    for score_option in score_options:
+        if score_option in response:
+            score = int(score_option)
+
+    return strength, weakness, [score], {}
+
+
+def write_review_with_only_profiles(
+    paper_content: str,
+    profiles_reviewers: List[Profile],
+    config: Config,
+    top_k_reviewers: int = 5,
+) -> Tuple[str, str, List[int], Dict[str, Dict[str, Any]]]:
+    # bio_strs = '\n'.join([profile.bio for profile in profiles_reviewers])
+    strengths: List[str] = []
+    weaknesses: List[str] = []
+    scores: List[int] = []
+
+    profiles_reviewers = profiles_reviewers[:top_k_reviewers]
+
+    for profile in profiles_reviewers:
+        bio_str = profile.bio
+        token_input_count = 0
+        token_output_count = 0
+        prompt = [
+            {
+                'role': 'system',
+                'content': (
+                    'You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the strength of this paper.\n'
+                    'You will be provided with the following information:\n'
+                    'Submission - Full content of the submitted paper.\n\n'
+                    'You should provide the following information:\n'
+                    'Strength - Advantages and strengths of the submission that can improve its chances to be accepted.\n\n'
+                ),
+            },
+            {
+                'role': 'user',
+                'content': (
+                    f'Here is your profile: {bio_str}\n\n'
+                    f'Here is the submission: {paper_content}\n\n'
+                    'Please evaluate the submission based on the following criteria:\n\n'
+                    'Clarity: Is the writing clear, structured, and terms defined?\n'
+                    'Baselines: Are baseline comparisons relevant, sufficient, and not excessive?\n'
+                    'Novelty: Is the approach innovative or distinct from prior work?\n'
+                    'Results: Are improvements significant, well-supported, and statistically robust?\n'
+                    'Limitations: Are weaknesses acknowledged and future work discussed?\n'
+                    'Related Work: Are key references cited and connections made?\n'
+                    'Technical: Are methods detailed enough for replication?\n\n'
+                    'Please combine both the ideas and the experiments in the submission when evaluating it.\n'
+                    'When commenting on the experiments, refer to the exact numbers from the experiments.\n\n'
+                    'Write the strength in 200 words.\n\n'
+                    'Please begin writing the strength of the submission. It should be 200 words long.\n\n'
+                    'Please write in bullet points. Do not limit yourself to the aforementioned criteria, like clarity, baselines, novelty, results, limitations, related work, and technical.\n\n'
+                    'You should also use your previous experience in your profile when analyzing the submission.'
+ ), + }, + ] + response = model_prompting( + config.param.base_llm, + prompt, + max_token_num=config.param.max_token_num, + temperature=config.param.temperature, + )[0] + # write prompt[0]['content'], bio_str, paper_content to json indent 4 + # import time + # import json + # with open(f'./{time.time()}.json', 'w') as f: + # json.dump({'prompt': prompt[0]['content'], 'bio': bio_str, 'prompt': prompt}, f, indent=4) + token_input_count += token_counter(model=config.param.base_llm, messages=prompt) + token_output_count += token_counter(model=config.param.base_llm, text=response) + strength = response + + prompt = [ + { + 'role': 'system', + 'content': ( + 'You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the weakness of this paper.\n' + 'You will be provided with the following information:\n' + 'Submission - Full content of the submitted paper.\n\n' + 'You should provide the following information:\n' + "Weakness - Disadvantages and drawbacks of the submission that must be improved before it can be accepted. You should notice that the abstract might not cover every detail, so you shouldn't be overly strict.\n\n" + ), + }, + { + 'role': 'user', + 'content': ( + f'Here is your profile: {bio_str}\n\n' + f'Here is the submission: {paper_content}\n\n' + 'Please evaluate the submission based on the following criteria:\n\n' + 'Clarity: Is the writing clear, structured, and terms defined?\n' + 'Baselines: Are baseline comparisons relevant, sufficient, and not excessive?\n' + 'Novelty: Is the approach innovative or distinct from prior work?\n' + 'Results: Are improvements significant, well-supported, and statistically robust?\n' + 'Limitations: Are weaknesses acknowledged and future work discussed?\n' + 'Related Work: Are key references cited and connections made?\n' + 'Technical: Are methods detailed enough for replication?\n\n' + 'Please combine both the ideas and the experiments in the submission when evaluating it.\n' + 'When commenting on the experiments, refer to the exact numbers from the experiments.\n\n' + 'Write the weakness in 200 words.\n\n' + 'Please begin writing the weakness of the submission. It should be 200 words long.\n\n' + 'Please write in bullet points. Do not limit yourself to the aforementioned criteria, like clarity, baselines, novelty, results, limitations, related work, and technical.\n\n' + 'You should also use your previous experience in your profile when analyzing the submission.' + ), + }, + ] + response = model_prompting( + config.param.base_llm, + prompt, + max_token_num=config.param.max_token_num, + temperature=config.param.temperature, + )[0] + token_input_count += token_counter(model=config.param.base_llm, messages=prompt) + token_output_count += token_counter(model=config.param.base_llm, text=response) + weakness = response + + # score based on strength and weakness + # A score between 1 to 10 to evaluate the overall quality of the submission to an academic journal. It should be one of 1, 2, ..., 10. 1 is the lowest score while 10 is the highest score. + + prompt = [ + { + 'role': 'system', + 'content': ( + 'You are an autonomous intelligent agent tasked to score the following submission. You should act as a professional and fair member of that conference to score. 
' + 'The score should be between 1 and 10, where 1 is the lowest and 10 is the highest.\n' + 'You will be provided with the following information:\n' + 'Paper - Full content of a submission to an academic conference.\n' + 'Strengths - Strengths of the submission.\n' + 'Weakness - Weakness of the submission.\n' + 'You should provide the following information:\n' + 'Score - A score between 1 to 10 to evaluate the overall quality of the submission to an academic journal. It should be one of 1, 2, ..., 10. 1 is the lowest score while 10 is the highest score.\n\n' + 'Please evaluate the submission based on the summarized strengths and weaknesses provided. The score should be more related to weaknesses. ' + 'If there is a critical weakness in the submission, you should give a lower score. If the submission has a minor weakness, you can give a higher score. ' + 'If the submission has no weaknesses, you should give a high score. But the strengths should also be considered in the evaluation.\n\n' + 'You should use this format:\n' + 'Based on the given information, I would give this submission a score of [score] out of 10.\n' + 'Here [score] should be replaced with your score.' + ), + }, + { + 'role': 'user', + 'content': ( + f'Here is your profile: {bio_str}\n\n' + f'Here is the strength of the paper: {strength}\n\n' + f'Here is the weakness of the paper: {weakness}\n\n' + 'Please refer to the rubrics below to evaluate the submission:\n\n' + '10/10: The submission is in 2% of all the papers. It changed my thinking on its topic, being one of the most thorough, convincing, and well-written papers I have ever read. ' + 'I will fight for this paper to be accepted.\n\n' + '8/10: The submission is among the top 10% of all the papers. It provides sufficient justification for all its arguments and claims. ' + 'Some extra experimentation is needed, but they are not essential. ' + 'The proposed method is very original and it can also generalize to various fields. ' + 'This submission deepens the understanding of some phenomena, or lowers the bar for future research on an existing problem.\n\n' + '6/10: The submission gives sufficient support for its major arguments or claims. ' + 'However, some minor points are not well justified and need extra support, or details. ' + 'The proposed method is moderately original, and it is generalizable to various fields. ' + 'The submission itself is not particularly innovative, so it would not be a significant loss if it were not accepted.\n\n' + '5/10: Some of the major arguments or claims are not sufficiently justified. ' + 'There exist major weaknesses in technical, or methodological aspects. ' + 'The proposed method is somewhat original, and it is generalizable to various fields. ' + 'I am more on the side of rejection, but I can be convinced otherwise.\n\n' + '3/10: The submission makes only marginal contributions to the field.\n\n' + '1/10: The submission is not sufficiently thorough for publication. Or it is not relevant to the conference.\n\n' + 'You should not always consider the paper generally as an acceptable one. If the paper is really bad and has critical weaknesses, you should give a low score like 3. 
'
+                    'If the paper is really good and has important strengths, we encourage you to give a high score like 9.\n\n'
+                    'Your score is: '
+                ),
+            },
+        ]
+        response = model_prompting(
+            config.param.base_llm,
+            prompt,
+            max_token_num=config.param.max_token_num,
+            temperature=config.param.temperature,
+        )[0]
+        token_input_count += token_counter(model=config.param.base_llm, messages=prompt)
+        token_output_count += token_counter(model=config.param.base_llm, text=response)
+        score_raw = response
+
+        score_options = ['10', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+
+        score = 0
+        for score_option in score_options:
+            if score_option in score_raw:
+                score = int(score_option)
+
+        strengths.append(strength)
+        weaknesses.append(weakness)
+        scores.append(int(score))
+
+        print('token_input_count', token_input_count)
+        print('token_output_count', token_output_count)
+
+    token_input_count = 0
+    token_output_count = 0
+    # serialize reviews
+    reviews_str = ''
+    for score, strength, weakness in zip(scores, strengths, weaknesses):
+        reviews_str += f'Score: {score}\nStrength: {strength}\nWeakness: {weakness}\n\n'
+
+    # combine strengths, weaknesses, and scores
+    prompt = [
+        {
+            'role': 'system',
+            'content': (
+                'You are an autonomous intelligent agent tasked to write the strength of the submission for the following submission you have made to an academic conference. '
+                'Your summary of strength should summarize the reviews to help the reviewers to make a decision.\n'
+                'You will be provided with the following information:\n'
+                'Submission - Full content of the paper submitted to this conference.\n'
+                'Reviews - It typically contains the score, strength, and weakness of the submission, each by a different reviewer.\n\n'
+                'You should provide the following information:\n'
+                'Strength - The strength of the submission based on the reviews.\n'
+            ),
+        },
+        {
+            'role': 'user',
+            'content': (
+                'Here are the reviews: \n'
+                f'{reviews_str}\n'
+                'Please summarize the important points from the "strength" section of the reviews.\n'
+                'Please write in bullet points. It should be 200 words long.\n'
+            ),
+        },
+    ]
+    response = model_prompting(
+        config.param.base_llm,
+        prompt,
+        max_token_num=config.param.max_token_num,
+        temperature=config.param.temperature,
+    )[0]
+    token_input_count += token_counter(model=config.param.base_llm, messages=prompt)
+    token_output_count += token_counter(model=config.param.base_llm, text=response)
+    strength = response
+
+    prompt = [
+        {
+            'role': 'system',
+            'content': (
+                'You are an autonomous intelligent agent tasked to write the weakness of the submission for the following submission you have made to an academic conference. '
+                'Your summary of weakness should summarize the reviews to help the reviewers to make a decision.\n'
+                'You will be provided with the following information:\n'
+                'Submission - Full content of the paper submitted to this conference.\n'
+                'Reviews - It typically contains the score, strength, and weakness of the submission, each by a different reviewer.\n\n'
+                'You should provide the following information:\n'
+                'Weakness - The weakness of the submission based on the reviews.\n'
+            ),
+        },
+        {
+            'role': 'user',
+            'content': (
+                'Here are the reviews: \n'
+                f'{reviews_str}\n'
+                'Please summarize the important points from the "weakness" section of the reviews.\n'
+                'Please write in bullet points. It should be 200 words long.\n'
+            ),
+        },
+    ]
+    response = model_prompting(
+        config.param.base_llm,
+        prompt,
+        max_token_num=config.param.max_token_num,
+        temperature=config.param.temperature,
+    )[0]
+    token_input_count += token_counter(model=config.param.base_llm, messages=prompt)
+    token_output_count += token_counter(model=config.param.base_llm, text=response)
+    weakness = response
+
+    print('token_input_count', token_input_count)
+    print('token_output_count', token_output_count)
+
+    return strength, weakness, scores, {}
+
+
+def write_review_with_only_citations(
+    paper_content: str, ref_contents: List[str], config: Config
+) -> Tuple[str, str, List[int], Dict[str, Dict[str, Any]]]:
+    # ref_strs = '\n'.join([ref for ref in ref_contents if ref is not None])
+    # we need a better format:
+    # 1. Cited paper reference: [ref]
+    # 2. Cited paper reference: [ref]
+
+    ref_strs = ''
+    paper_index = 0
+    for ref in ref_contents:
+        if ref:
+            ref_strs += f'Cited abstract {paper_index + 1}: {ref}\n'
+            paper_index += 1
+
+    prompt = [
+        {
+            'role': 'system',
+            'content': (
+                'You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the strength of this paper.\n'
+                'You will be provided with the following information:\n'
+                'Submission - Full content of the submitted paper.\n'
+                'References - Abstracts of the cited papers.\n\n'
+                'You should provide the following information:\n'
+                'Strength - Advantages and strengths of the submission that can improve its chances to be accepted.\n\n'
+            ),
+        },
+        {
+            'role': 'user',
+            'content': (
+                f'Here is the submission: {paper_content}\n\n'
+                f'Here are the abstracts of the cited papers: {ref_strs}\n\n'
+                'Please evaluate the submission based on the following criteria:\n\n'
+                'Clarity: Is the writing clear, structured, and terms defined?\n'
+                'Baselines: Are baseline comparisons relevant, sufficient, and not excessive?\n'
+                'Novelty: Is the approach innovative or distinct from prior work?\n'
+                'Results: Are improvements significant, well-supported, and statistically robust?\n'
+                'Limitations: Are weaknesses acknowledged and future work discussed?\n'
+                'Related Work: Are key references cited and connections made?\n'
+                'Technical: Are methods detailed enough for replication?\n\n'
+                'Please combine both the ideas and the experiments in the submission when evaluating it.\n'
+                'When commenting on the experiments, refer to the exact numbers from the experiments.\n\n'
+                'Write the strength in 200 words.\n\n'
+                'Please begin writing the strength of the submission. It should be 200 words long.\n\n'
+                'Please write in bullet points. Do not limit yourself to the aforementioned criteria, like clarity, baselines, novelty, results, limitations, related work, and technical.\n\n'
+                'You should also use your previous experience in your profile when analyzing the submission.'
+ ), + }, + ] + response = model_prompting( + config.param.base_llm, + prompt, + max_token_num=config.param.max_token_num, + temperature=config.param.temperature, + )[0] + strength = response + + prompt = [ + { + 'role': 'system', + 'content': ( + 'You are an autonomous intelligent agent tasked to review a submission to an academic conference. You should write the weakness of this paper.\n' + 'You will be provided with the following information:\n' + 'Submission - Full content of the submitted paper.\n' + 'References - Abstracts of the cited papers.\n\n' + 'You should provide the following information:\n' + "Weakness - Disadvantages and drawbacks of the submission that must be improved before it can be accepted. You should notice that the abstract might not cover every detail, so you shouldn't be overly strict.\n\n" + ), + }, + { + 'role': 'user', + 'content': ( + f'Here is the submission: {paper_content}\n\n' + f'Here are the abstracts of the cited papers: {ref_strs}\n\n' + 'Please evaluate the submission based on the following criteria:\n\n' + 'Clarity: Is the writing clear, structured, and terms defined?\n' + 'Baselines: Are baseline comparisons relevant, sufficient, and not excessive?\n' + 'Novelty: Is the approach innovative or distinct from prior work?\n' + 'Results: Are improvements significant, well-supported, and statistically robust?\n' + 'Limitations: Are weaknesses acknowledged and future work discussed?\n' + 'Related Work: Are key references cited and connections made?\n' + 'Technical: Are methods detailed enough for replication?\n\n' + 'Please combine both the ideas and the experiments in the submission when evaluating it.\n' + 'When commenting on the experiments, refer to the exact numbers from the experiments.\n\n' + 'Write the weakness in 200 words.\n\n' + 'Please begin writing the weakness of the submission. It should be 200 words long.\n\n' + 'Please write in bullet points. Do not limit yourself to the aforementioned criteria, like clarity, baselines, novelty, results, limitations, related work, and technical.\n\n' + 'You should also use your previous experience in your profile when analyzing the submission.' + ), + }, + ] + response = model_prompting( + config.param.base_llm, + prompt, + max_token_num=config.param.max_token_num, + temperature=config.param.temperature, + )[0] + weakness = response + + prompt = [ + { + 'role': 'system', + 'content': ( + 'You are an autonomous intelligent agent tasked to score the following submission. You should act as a professional and fair member of that conference to score. ' + 'The score should be between 1 and 10, where 1 is the lowest and 10 is the highest.\n' + 'You will be provided with the following information:\n' + 'Paper - Full content of a submission to an academic conference.\n' + 'Strengths - Strengths of the submission.\n' + 'Weakness - Weakness of the submission.\n' + 'You should provide the following information:\n' + 'Score - A score between 1 to 10 to evaluate the overall quality of the submission to an academic journal. It should be one of 1, 2, ..., 10. 1 is the lowest score while 10 is the highest score.\n\n' + 'Please evaluate the submission based on the summarized strengths and weaknesses provided. The score should be more related to weaknesses. ' + 'If there is a critical weakness in the submission, you should give a lower score. If the submission has a minor weakness, you can give a higher score. ' + 'If the submission has no weaknesses, you should give a high score. 
But the strengths should also be considered in the evaluation.\n\n' + 'You should use this format:\n' + 'Based on the given information, I would give this submission a score of [score] out of 10.\n' + 'Here [score] should be replaced with your score.' + ), + }, + { + 'role': 'user', + 'content': ( + f'Here is the strength of the paper: {strength}\n\n' + f'Here is the weakness of the paper: {weakness}\n\n' + 'Please refer to the rubrics below to evaluate the submission:\n\n' + '10/10: The submission is in 2% of all the papers. It changed my thinking on its topic, being one of the most thorough, convincing, and well-written papers I have ever read. ' + 'I will fight for this paper to be accepted.\n\n' + '8/10: The submission is among the top 10% of all the papers. It provides sufficient justification for all its arguments and claims. ' + 'Some extra experimentation is needed, but they are not essential. ' + 'The proposed method is very original and it can also generalize to various fields. ' + 'This submission deepens the understanding of some phenomena, or lowers the bar for future research on an existing problem.\n\n' + '6/10: The submission gives sufficient support for its major arguments or claims. ' + 'However, some minor points are not well justified and need extra support, or details. ' + 'The proposed method is moderately original, and it is generalizable to various fields. ' + 'The submission itself is not particularly innovative, so it would not be a significant loss if it were not accepted.\n\n' + '5/10: Some of the major arguments or claims are not sufficiently justified. ' + 'There exist major weaknesses in technical, or methodological aspects. ' + 'The proposed method is somewhat original, and it is generalizable to various fields. ' + 'I am more on the side of rejection, but I can be convinced otherwise.\n\n' + '3/10: The submission makes only marginal contributions to the field.\n\n' + '1/10: The submission is not sufficiently thorough for publication. Or it is not relevant to the conference.\n\n' + 'You should not always consider the paper generally as an acceptable one. If the paper is really bad and has critical weaknesses, you should give a low score like 3. ' + 'If the paper is really good and has important strengths, we encourage you to give a high score like 9.\n\n' + 'Your score is: ' + ), + }, + ] + response = model_prompting( + config.param.base_llm, + prompt, + max_token_num=config.param.max_token_num, + temperature=config.param.temperature, + )[0] + score_options = ['10', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + score = 0 + for score_option in score_options: + if score_option in response: + score = int(score_option) + + return strength, weakness, [score], {} + + +def write_review( + mode: str, + intro: str, + profiles: List[Profile], + profiles_reviewers: List[Profile], + full_content: Dict[str, Any], + ref_contents: List[str], + config: Config, + top_k_reviewers: int = 5, +) -> Tuple[str, str, List[int], Dict[str, Dict[str, Any]]]: + paper_content = '' + for idx, section in enumerate(full_content): + paper_content += f'{idx + 1}:\n\n' + section_text = full_content[section] + paper_content += section_text + '\n\n' + + if mode == 'reviewer_only': + return write_review_with_only_profiles( + paper_content, profiles_reviewers, config, top_k_reviewers + ) + elif mode == 'citation_only': + print( + f'You are in citation only mode. Top K reviewers parameter valued {top_k_reviewers} will be ignored.' 
+        )
+        return write_review_with_only_citations(paper_content, ref_contents, config)
+    elif mode == 'zero_shot':
+        print(
+            f'You are in zero shot mode. Top K reviewers parameter valued {top_k_reviewers} will be ignored.'
+        )
+        return write_review_zero_shot(paper_content, config)
+    elif mode == 'research_town':
+        return write_review_research_town(
+            paper_content, profiles_reviewers, ref_contents, config, top_k_reviewers
+        )
+    else:
+        raise ValueError(f'Invalid review writing mode: {mode}')
diff --git a/research_bench/run_eval.py b/research_bench/run_proposal_eval.py
similarity index 100%
rename from research_bench/run_eval.py
rename to research_bench/run_proposal_eval.py
diff --git a/research_bench/run_eval.sh b/research_bench/run_proposal_eval.sh
similarity index 77%
rename from research_bench/run_eval.sh
rename to research_bench/run_proposal_eval.sh
index e8663b5c..d2d289c4 100755
--- a/research_bench/run_eval.sh
+++ b/research_bench/run_proposal_eval.sh
@@ -11,7 +11,7 @@ for MODE in "${MODES[@]}"
 do
     OUTPUT_PATH="${OUTPUT_DIR}/mlbench_result_4o_mini_${MODE}.jsonl"
     echo "Running evaluation for mode: $MODE"
-    poetry run python run_eval.py --input "$INPUT_PATH" --output "$OUTPUT_PATH" --mode "$MODE" --num_processes "$NUM_PROCESSES"
+    poetry run python run_proposal_eval.py --input "$INPUT_PATH" --output "$OUTPUT_PATH" --mode "$MODE" --num_processes "$NUM_PROCESSES"
     echo "Finished evaluation for mode: $MODE"
 done
diff --git a/research_bench/run_review_eval.py b/research_bench/run_review_eval.py
new file mode 100644
index 00000000..4ccbea6d
--- /dev/null
+++ b/research_bench/run_review_eval.py
@@ -0,0 +1,207 @@
+import argparse
+import json
+import os
+from multiprocessing import Lock
+from typing import Any, Dict, List, Tuple
+
+from tqdm import tqdm
+
+from research_bench.eval import compute_review_metrics
+from research_bench.review_writing import write_review
+from research_bench.utils import load_benchmark
+from research_town.configs import Config
+from research_town.data import Profile
+from research_town.utils.logger import logger
+
+
+def inference(
+    paper_id: str,
+    paper_data: Dict[str, Any],
+    author_data: Dict[str, Any],
+    reviewer_data: Dict[str, Any],
+    full_content: Dict[str, Any],
+    strengths_bp_flatten: List[str],
+    weaknesses_bp_flatten: List[str],
+    human_scores: List[int],
+    mode: str,
+    config: Config,
+    top_k_reviewers: int,
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    intro = paper_data.get('introduction', '')
+    profiles = [Profile(**data) for data in author_data.values()]
+    profiles_reviewers = [Profile(**data) for data in reviewer_data.values()]
+    ref_abstracts = [ref['abstract'] for ref in paper_data.get('references', [])]
+
+    generated_strength, generated_weakness, score, review_per_reviewer = write_review(
+        mode,
+        intro,
+        profiles,
+        profiles_reviewers,
+        full_content,
+        ref_abstracts,
+        config,
+        top_k_reviewers,
+    )
+
+    metrics = compute_review_metrics(
+        strengths_bp_flatten,
+        weaknesses_bp_flatten,
+        generated_strength,
+        generated_weakness,
+    )
+    avg_score = sum(human_scores) / len(human_scores)
+    avg_generated_score = sum(score) / len(score)
+    dist = abs(avg_score - avg_generated_score)
+    results = {
+        'paper_id': paper_id,
+        'strengths_bp': strengths_bp_flatten,
+        'weaknesses_bp': weaknesses_bp_flatten,
+        'generated_strength': generated_strength,
+        'generated_weakness': generated_weakness,
+        'score': human_scores,
+        'generated_scores': score,
+        'avg_score': avg_score,
+        'avg_generated_score': avg_generated_score,
+        'score_diff': dist,
+        # 'review_per_reviewer': review_per_reviewer,
+    }
+    return results, metrics
+
+
+def load_papers(input_path: str, output_path: str) -> Any:
+    dataset = load_benchmark(input_path)
+
+    if os.path.exists(output_path):
+        with open(output_path, 'r') as f:
+            processed_ids = {json.loads(line)['paper_id'] for line in f}
+        return {k: v for k, v in dataset.items() if k not in processed_ids}
+
+    return dataset
+
+
+def save_results(
+    results: Dict[str, Any], metrics: Dict[str, Any], output_path: str, lock: Any
+) -> None:
+    with lock:
+        with open(output_path, 'a') as f:
+            json.dump({**results, **metrics}, f)
+            f.write('\n')
+
+
+def process_task(
+    task: Tuple[
+        str,
+        Dict[str, Any],
+        Dict[str, Any],
+        Dict[str, Any],
+        Dict[str, Any],
+        List[str],
+        List[str],
+        List[int],
+        str,
+        Config,
+        int,
+    ],
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    return inference(
+        paper_id=task[0],
+        paper_data=task[1],
+        author_data=task[2],
+        reviewer_data=task[3],
+        full_content=task[4],
+        strengths_bp_flatten=task[5],
+        weaknesses_bp_flatten=task[6],
+        human_scores=task[7],
+        mode=task[8],
+        config=task[9],
+        top_k_reviewers=task[10],
+    )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description='Research Review Generator')
+    parser.add_argument(
+        '--input_path', type=str, required=True, help='Input JSON file path'
+    )
+    parser.add_argument(
+        '--output_path', type=str, required=True, help='Output JSONL file path'
+    )
+    parser.add_argument(
+        '--mode',
+        type=str,
+        required=True,
+        choices=[
+            'zero_shot',
+            'reviewer_only',
+            'citation_only',
+            'author_citation',
+            'textgnn',
+            'sakana_ai_scientist',
+            'research_town',
+        ],
+        help='Processing mode',
+    )
+    parser.add_argument(
+        '--config_path',
+        type=str,
+        default='../configs',
+        help='Path to the configuration directory',
+    )
+    parser.add_argument(
+        '--num_processes',
+        type=int,
+        default=os.cpu_count(),
+        help='Number of parallel processes to use',
+    )
+    parser.add_argument(
+        '--top_k_reviewers',
+        type=int,
+        default=5,
+        help='Number of top reviewers to consider',
+    )
+    args = parser.parse_args()
+
+    config = Config(args.config_path)
+    top_k_reviewers = args.top_k_reviewers
+    dataset = load_papers(args.input_path, args.output_path)
+    logger.info(f'Processing {len(dataset)} papers')
+
+    lock = Lock()
+    for paper_id, data in tqdm(dataset.items(), desc='Processing papers'):
+        full_content = data['full_content']
+        paper_data = data['paper_data']
+        author_data = data['author_data']
+        reviewer_data = data['reviewer_data']
+        reference_review = data['reviews']
+        human_scores = [
+            int(review.get('rating').split(':')[0]) for review in reference_review
+        ]
+        strengths_bp = [
+            review.get('strengths_bullet', []) for review in reference_review
+        ]
+        # flatten the per-review bullet lists
+        strengths_bp_flatten = [item for sublist in strengths_bp for item in sublist]
+        weaknesses_bp = [
+            review.get('weaknesses_bullet', []) for review in reference_review
+        ]
+        # flatten the per-review bullet lists
+        weaknesses_bp_flatten = [item for sublist in weaknesses_bp for item in sublist]
+
+        results, metrics = inference(
+            paper_id,
+            paper_data,
+            author_data,
+            reviewer_data,
+            full_content,
+            strengths_bp_flatten,
+            weaknesses_bp_flatten,
+            human_scores,
+            args.mode,
+            config,
+            top_k_reviewers,
+        )
+        save_results(results, metrics, args.output_path, lock)
+
+
+if __name__ == '__main__':
+    main()
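For reference, `main()` above assumes each entry of the benchmark JSON carries the paper's full content, author and reviewer profiles, and human reviews with bulleted strengths and weaknesses. A hypothetical minimal record, inferred only from the fields the loop reads; every value is illustrative:

```python
# Illustrative shape of one INPUT_PATH entry, keyed by paper id. Real records
# may carry more data; only the keys read by main() and inference() are shown.
example_entry = {
    'full_content': {'Introduction': '...', 'Method': '...'},
    'paper_data': {
        'introduction': '...',
        'references': [{'abstract': '...'}],
    },
    'author_data': {'profile_pk_1': {'name': 'A. Author', 'bio': '...'}},
    'reviewer_data': {'profile_pk_2': {'name': 'R. Reviewer', 'bio': '...'}},
    'reviews': [
        {
            # rating format assumed from the int(...split(':')[0]) parse above
            'rating': '6: marginally above the acceptance threshold',
            'strengths_bullet': ['- clear motivation'],
            'weaknesses_bullet': ['- limited baselines'],
        }
    ],
}
```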
diff --git a/research_bench/run_review_eval.sh b/research_bench/run_review_eval.sh
new file mode 100644
index 00000000..a948c1d5
--- /dev/null
+++ b/research_bench/run_review_eval.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Define the input and output paths, along with the modes to test
+INPUT_PATH="./reviewbench/reviewbench_reviewers_full_content.json"
+OUTPUT_DIR="./results"
+MODES=("research_town")
+NUM_PROCESSES=4
+TOP_K_REVIEWERS=5
+
+# Loop through each mode and run the evaluation
+for MODE in "${MODES[@]}"
+do
+    OUTPUT_PATH="${OUTPUT_DIR}/reviewbench_result_4o_mini_${MODE}_top${TOP_K_REVIEWERS}.jsonl"
+    echo "Running evaluation for mode: $MODE"
+    poetry run python run_review_eval.py --input_path "$INPUT_PATH" --output_path "$OUTPUT_PATH" --mode "$MODE" --num_processes "$NUM_PROCESSES" --top_k_reviewers "$TOP_K_REVIEWERS"
+    echo "Finished evaluation for mode: $MODE"
+done
+
+echo "All modes tested successfully."
diff --git a/research_bench/timed.sh b/research_bench/timed.sh
new file mode 100644
index 00000000..97ed4d28
--- /dev/null
+++ b/research_bench/timed.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Wait for 25 minutes before kicking off the evaluation
+
+# sleep 5400  # uncomment to wait 1.5 hours (5400 seconds) instead
+# wait with a countdown
+
+secs=$((1*25*60))
+while [ $secs -gt 0 ]; do
+    echo -ne "Waiting for 25 minutes to finish. Time left: $secs\033[0K\r"
+    sleep 1
+    : $((secs--))
+done
+
+echo "Finished waiting for 25 minutes."
+
+# Copy the file
+cp results/iclrbench_result_4o_mini_research_town_topk_3.jsonl iclrbench/review_evaluation/iclrbench_result_4o_mini_research_town_topk_3.jsonl
+
+# Change directory to review_evaluation
+cd iclrbench/review_evaluation
+
+# Run the evaluation script
+bash auto_eval.sh
+
+echo "Finished running the evaluation script."
diff --git a/research_town/agents/agent.py b/research_town/agents/agent.py
index f235fbb1..cc5a1297 100644
--- a/research_town/agents/agent.py
+++ b/research_town/agents/agent.py
@@ -161,27 +161,25 @@ def write_proposal(
 
     @beartype
     @reviewer_required
-    def write_review(self, proposal: Proposal, config: Config) -> Review:
+    def write_review(
+        self, profile: Profile, proposal: Proposal, config: Config
+    ) -> Review:
         serialized_proposal = self.serializer.serialize(proposal)
+        serialized_profile = self.serializer.serialize(profile)
         (
-            summary,
             strength,
             weakness,
-            ethical_concern,
             score,
-            summary_prompt_messages,
             strength_prompt_messages,
             weakness_prompt_messages,
-            ethical_prompt_messages,
             score_prompt_messages,
         ) = write_review_prompting(
             proposal=serialized_proposal,
             model_name=self.model_name,
-            summary_prompt_template=config.agent_prompt_template.write_review_summary,
+            profile=serialized_profile,
             strength_prompt_template=config.agent_prompt_template.write_review_strength,
             weakness_prompt_template=config.agent_prompt_template.write_review_weakness,
-            ethical_prompt_template=config.agent_prompt_template.write_review_ethical,
             score_prompt_template=config.agent_prompt_template.write_review_score,
             return_num=config.param.return_num,
             max_token_num=config.param.max_token_num,
@@ -192,14 +190,14 @@ def write_review(self, proposal: Proposal, config: Config) -> Review:
         review = Review(
             proposal_pk=proposal.pk,
             reviewer_pk=self.profile.pk,
-            summary=summary,
-            summary_prompt_messages=summary_prompt_messages,
+            summary=None,
+            summary_prompt_messages=None,
             strength=strength,
             strength_prompt_messages=strength_prompt_messages,
             weakness=weakness,
             weakness_prompt_messages=weakness_prompt_messages,
-            ethical_concern=ethical_concern,
-            ethical_concern_prompt_messages=ethical_prompt_messages,
+            ethical_concern=None,
+            ethical_concern_prompt_messages=None,
             score=score,
             score_prompt_messages=score_prompt_messages,
         )
@@ -211,31 +209,21 @@ def write_metareview(
         self,
         proposal: Proposal,
         reviews: List[Review],
+
scores: List[int], config: Config, ) -> MetaReview: - serialized_proposal = self.serializer.serialize(proposal) serialized_reviews = self.serializer.serialize(reviews) ( - summary, strength, weakness, - ethical_concern, - decision, - summary_prompt_messages, strength_prompt_messages, weakness_prompt_messages, - ethical_prompt_messages, - decision_prompt_messages, ) = write_metareview_prompting( - proposal=serialized_proposal, reviews=serialized_reviews, model_name=self.model_name, - summary_prompt_template=config.agent_prompt_template.write_metareview_summary, strength_prompt_template=config.agent_prompt_template.write_metareview_strength, weakness_prompt_template=config.agent_prompt_template.write_metareview_weakness, - ethical_prompt_template=config.agent_prompt_template.write_metareview_ethical, - decision_prompt_template=config.agent_prompt_template.write_metareview_decision, return_num=config.param.return_num, max_token_num=config.param.max_token_num, temperature=config.param.temperature, @@ -243,21 +231,24 @@ def write_metareview( stream=config.param.stream, ) + metareview_threshold = 6 + decision = all(score >= metareview_threshold for score in scores) + metareview = MetaReview( proposal_pk=proposal.pk, chair_pk=self.profile.pk, reviewer_pks=[review.reviewer_pk for review in reviews], author_pk=self.profile.pk, - summary=summary, - summary_prompt_messages=summary_prompt_messages, + summary=None, + summary_prompt_messages=None, strength=strength, strength_prompt_messages=strength_prompt_messages, weakness=weakness, weakness_prompt_messages=weakness_prompt_messages, - ethical_concern=ethical_concern, - ethical_concern_prompt_messages=ethical_prompt_messages, + ethical_concern=None, + ethical_concern_prompt_messages=None, decision=decision, - decision_prompt_messages=decision_prompt_messages, + decision_prompt_messages=None, ) return metareview diff --git a/research_town/configs/config.py b/research_town/configs/config.py index 6949cb0d..f1e1e460 100644 --- a/research_town/configs/config.py +++ b/research_town/configs/config.py @@ -78,16 +78,11 @@ class AgentPromptTemplate(BaseModel): write_proposal_cot: Dict[str, Union[str, List[str]]] write_proposal_react: Dict[str, Union[str, List[str]]] write_proposal_reflexion: Dict[str, Union[str, List[str]]] - write_review_summary: Dict[str, Union[str, List[str]]] write_review_strength: Dict[str, Union[str, List[str]]] write_review_weakness: Dict[str, Union[str, List[str]]] - write_review_ethical: Dict[str, Union[str, List[str]]] write_review_score: Dict[str, Union[str, List[str]]] - write_metareview_summary: Dict[str, Union[str, List[str]]] write_metareview_strength: Dict[str, Union[str, List[str]]] write_metareview_weakness: Dict[str, Union[str, List[str]]] - write_metareview_ethical: Dict[str, Union[str, List[str]]] - write_metareview_decision: Dict[str, Union[str, List[str]]] write_rebuttal: Dict[str, Union[str, List[str]]] @root_validator(pre=True) @@ -101,38 +96,18 @@ def validate_placeholders(cls: Any, values: Dict[str, Any]) -> Dict[str, Any]: 'write_proposal_cot': ['{idea}', '{papers}'], 'write_proposal_react': ['{idea}', '{papers}'], 'write_proposal_reflexion': ['{idea}', '{papers}'], - 'write_review_summary': ['{proposal}'], - 'write_review_strength': ['{proposal}', '{summary}'], - 'write_review_weakness': ['{proposal}', '{summary}'], - 'write_review_ethical': ['{proposal}', '{summary}'], + 'write_review_strength': ['{bio}', '{proposal}', '{citations}'], + 'write_review_weakness': ['{bio}', '{proposal}', '{citations}'], 
'write_review_score': [ - '{proposal}', - '{summary}', + '{bio}', '{strength}', '{weakness}', ], - 'write_metareview_summary': ['{proposal}', '{reviews}'], 'write_metareview_strength': [ - '{proposal}', '{reviews}', - '{summary}', ], 'write_metareview_weakness': [ - '{proposal}', '{reviews}', - '{summary}', - ], - 'write_metareview_ethical': [ - '{proposal}', - '{reviews}', - '{summary}', - ], - 'write_metareview_decision': [ - '{proposal}', - '{reviews}', - '{summary}', - '{strength}', - '{weakness}', ], 'write_rebuttal': ['{proposal}', '{review}'], } diff --git a/research_town/data/data.py b/research_town/data/data.py index a4deecea..189da773 100644 --- a/research_town/data/data.py +++ b/research_town/data/data.py @@ -97,6 +97,7 @@ class Proposal(Progress): q3: Optional[str] = Field(default=None) q4: Optional[str] = Field(default=None) q5: Optional[str] = Field(default=None) + citations: Optional[List[str]] = Field(default=[]) abstract: str = Field(default='') title: Optional[str] = Field(default=None) conference: Optional[str] = Field(default=None) diff --git a/research_town/envs/env_review_writing.py b/research_town/envs/env_review_writing.py index c5098b40..16b0a28f 100644 --- a/research_town/envs/env_review_writing.py +++ b/research_town/envs/env_review_writing.py @@ -3,7 +3,7 @@ from ..agents import Agent, AgentManager from ..configs import Config -from ..data import MetaReview, Progress, Rebuttal, Review +from ..data import MetaReview, Progress, Review from ..dbs import LogDB, PaperDB, ProgressDB from .env_base import BaseEnv @@ -33,7 +33,8 @@ def on_enter( **context: Any, ) -> None: if 'leader' not in context or context['leader'] is None: - context['leader'] = self.agent_manager.sample_leader() + # context['leader'] = self.agent_manager.sample_leader() + context['leader'] = [] if 'chair' not in context or context['chair'] is None: context['chair'] = self.agent_manager.sample_chair() if 'reviewers' not in context or context['reviewers'] is None: @@ -64,28 +65,20 @@ def run(self) -> Generator[Tuple[Progress, Agent], None, None]: self.reviews: List[Review] = [] for reviewer in self.reviewers: review = reviewer.write_review( + profile=reviewer.profile, proposal=proposal, config=self.config, ) self.reviews.append(review) yield review, reviewer - # Rebuttal Submitting - self.rebuttals: List[Rebuttal] = [] - for review in self.reviews: - rebuttal = self.leader.write_rebuttal( - proposal=proposal, - review=review, - config=self.config, - ) - self.rebuttals.append(rebuttal) - yield rebuttal, self.leader - # Paper Meta Reviewing + scores = [review.score for review in self.reviews] metareview = self.chair.write_metareview( proposal=proposal, reviews=self.reviews, config=self.config, + scores=scores, ) self.metareviews.append(metareview) yield metareview, self.chair diff --git a/research_town/utils/agent_prompter.py b/research_town/utils/agent_prompter.py index e55f0c09..3eea632b 100644 --- a/research_town/utils/agent_prompter.py +++ b/research_town/utils/agent_prompter.py @@ -2,10 +2,12 @@ from beartype import beartype from beartype.typing import Dict, List, Optional, Tuple, Union +from litellm.utils import token_counter from .model_prompting import model_prompting from .prompt_constructor import openai_format_prompt_construct from .string_mapper import ( + map_cited_abstracts_to_str, map_idea_list_to_str, map_idea_to_str, map_insight_list_to_str, @@ -165,10 +167,9 @@ def write_proposal_prompting( def write_review_prompting( proposal: Dict[str, str], model_name: str, - 
diff --git a/research_town/data/data.py b/research_town/data/data.py
index a4deecea..189da773 100644
--- a/research_town/data/data.py
+++ b/research_town/data/data.py
@@ -97,6 +97,7 @@ class Proposal(Progress):
     q3: Optional[str] = Field(default=None)
     q4: Optional[str] = Field(default=None)
     q5: Optional[str] = Field(default=None)
+    citations: Optional[List[str]] = Field(default=[])
     abstract: str = Field(default='')
     title: Optional[str] = Field(default=None)
     conference: Optional[str] = Field(default=None)
diff --git a/research_town/envs/env_review_writing.py b/research_town/envs/env_review_writing.py
index c5098b40..16b0a28f 100644
--- a/research_town/envs/env_review_writing.py
+++ b/research_town/envs/env_review_writing.py
@@ -3,7 +3,7 @@

 from ..agents import Agent, AgentManager
 from ..configs import Config
-from ..data import MetaReview, Progress, Rebuttal, Review
+from ..data import MetaReview, Progress, Review
 from ..dbs import LogDB, PaperDB, ProgressDB
 from .env_base import BaseEnv

@@ -33,7 +33,8 @@ def on_enter(
         **context: Any,
     ) -> None:
         if 'leader' not in context or context['leader'] is None:
-            context['leader'] = self.agent_manager.sample_leader()
+            # context['leader'] = self.agent_manager.sample_leader()
+            context['leader'] = []
         if 'chair' not in context or context['chair'] is None:
             context['chair'] = self.agent_manager.sample_chair()
         if 'reviewers' not in context or context['reviewers'] is None:
@@ -64,28 +65,20 @@ def run(self) -> Generator[Tuple[Progress, Agent], None, None]:
         self.reviews: List[Review] = []
         for reviewer in self.reviewers:
             review = reviewer.write_review(
+                profile=reviewer.profile,
                 proposal=proposal,
                 config=self.config,
             )
             self.reviews.append(review)
             yield review, reviewer

-        # Rebuttal Submitting
-        self.rebuttals: List[Rebuttal] = []
-        for review in self.reviews:
-            rebuttal = self.leader.write_rebuttal(
-                proposal=proposal,
-                review=review,
-                config=self.config,
-            )
-            self.rebuttals.append(rebuttal)
-            yield rebuttal, self.leader
-
         # Paper Meta Reviewing
+        scores = [review.score for review in self.reviews]
         metareview = self.chair.write_metareview(
             proposal=proposal,
             reviews=self.reviews,
             config=self.config,
+            scores=scores,
         )
         self.metareviews.append(metareview)
         yield metareview, self.chair
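Putting the environment changes together, the review phase now reduces to roughly the following; the duck-typed arguments stand in for the real `Agent` and `Config` objects, so this is a condensed sketch rather than the actual `run()` body:

```python
from typing import Any, List


def review_phase(reviewers: List[Any], chair: Any, proposal: Any, config: Any) -> Any:
    # Reviewers write first; with the rebuttal step removed, their scores
    # flow straight into the chair's metareview (mirrors run() above).
    reviews = []
    for reviewer in reviewers:
        review = reviewer.write_review(
            profile=reviewer.profile, proposal=proposal, config=config
        )
        reviews.append(review)
    scores = [review.score for review in reviews]
    return chair.write_metareview(
        proposal=proposal, reviews=reviews, config=config, scores=scores
    )
```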
diff --git a/research_town/utils/agent_prompter.py b/research_town/utils/agent_prompter.py
index e55f0c09..3eea632b 100644
--- a/research_town/utils/agent_prompter.py
+++ b/research_town/utils/agent_prompter.py
@@ -2,10 +2,12 @@
 from beartype import beartype
 from beartype.typing import Dict, List, Optional, Tuple, Union
+from litellm.utils import token_counter

 from .model_prompting import model_prompting
 from .prompt_constructor import openai_format_prompt_construct
 from .string_mapper import (
+    map_cited_abstracts_to_str,
     map_idea_list_to_str,
     map_idea_to_str,
     map_insight_list_to_str,
@@ -165,10 +167,9 @@ def write_proposal_prompting(
 def write_review_prompting(
     proposal: Dict[str, str],
     model_name: str,
-    summary_prompt_template: Dict[str, Union[str, List[str]]],
+    profile: Dict[str, str],
     strength_prompt_template: Dict[str, Union[str, List[str]]],
     weakness_prompt_template: Dict[str, Union[str, List[str]]],
-    ethical_prompt_template: Dict[str, Union[str, List[str]]],
     score_prompt_template: Dict[str, Union[str, List[str]]],
     return_num: Optional[int] = 1,
     max_token_num: Optional[int] = 512,
@@ -176,44 +177,37 @@ def write_review_prompting(
     top_p: Optional[float] = None,
     stream: Optional[bool] = None,
 ) -> Tuple[
-    str,
-    str,
     str,
     str,
     int,
     List[Dict[str, str]],
     List[Dict[str, str]],
     List[Dict[str, str]],
-    List[Dict[str, str]],
-    List[Dict[str, str]],
 ]:
+    token_input_count = 0
+    token_output_count = 0
     proposal_str = map_proposal_to_str(proposal)
-    summary_template_input = {'proposal': proposal_str}
-    summary_messages = openai_format_prompt_construct(
-        summary_prompt_template, summary_template_input
-    )
-    summary = model_prompting(
-        model_name,
-        summary_messages,
-        return_num,
-        max_token_num,
-        temperature,
-        top_p,
-        stream,
-    )[0]
-    strength_template_input = {'proposal': proposal_str, 'summary': summary}
+    citations: Union[str, List[str]] = proposal.get('citations', [])
+    assert isinstance(citations, list)
+    citations_str = map_cited_abstracts_to_str(citations)
+
+    strength_template_input = {
+        'proposal': proposal_str,
+        'bio': profile['bio'],
+        'citations': citations_str,
+    }
     strength_messages = openai_format_prompt_construct(
         strength_prompt_template, strength_template_input
     )
-    weakness_template_input = {'proposal': proposal_str, 'summary': summary}
+    weakness_template_input = {
+        'proposal': proposal_str,
+        'bio': profile['bio'],
+        'citations': citations_str,
+    }
     weakness_messages = openai_format_prompt_construct(
         weakness_prompt_template, weakness_template_input
     )
-    ethical_template_input = {'proposal': proposal_str, 'summary': summary}
-    ethical_messages = openai_format_prompt_construct(
-        ethical_prompt_template, ethical_template_input
-    )

     strength = model_prompting(
         model_name,
@@ -233,68 +227,58 @@ def write_review_prompting(
         top_p,
         stream,
     )[0]
-    ethical_concern = model_prompting(
-        model_name,
-        ethical_messages,
-        return_num,
-        max_token_num,
-        temperature,
-        top_p,
-        stream,
-    )[0]
+
+    token_input_count += token_counter(model=model_name, messages=strength_messages)
+    token_input_count += token_counter(model=model_name, messages=weakness_messages)
+    token_output_count += token_counter(model=model_name, text=strength)
+    token_output_count += token_counter(model=model_name, text=weakness)

     score_template_input = {
-        'proposal': proposal_str,
-        'summary': summary,
         'strength': strength,
         'weakness': weakness,
-        'ethical_concern': ethical_concern,
+        'bio': profile['bio'],
     }
     score_messages = openai_format_prompt_construct(
         score_prompt_template, score_template_input
     )
-    score_str = (
-        model_prompting(
-            model_name,
-            score_messages,
-            return_num,
-            max_token_num,
-            temperature,
-            top_p,
-            stream,
-        )[0]
-        .split(
-            'Based on the given information, I would give this submission a score of '
-        )[1]
-        .split(' out of 10')[0]
-    )
-    score = int(score_str[0]) if score_str[0].isdigit() else 0
+    score_response_str = model_prompting(
+        model_name,
+        score_messages,
+        return_num,
+        max_token_num,
+        temperature,
+        top_p,
+        stream,
+    )[0]
+
+    token_input_count += token_counter(model=model_name, messages=score_messages)
+    token_output_count += token_counter(model=model_name, text=score_response_str)
+
+    # Extract the first integer in the model's reply as the review score.
+    score_str = re.findall(r'\d+', score_response_str)
+    score_str_1st = score_str[0] if score_str else '0'
+    score = int(score_str_1st)
+
+    print(f'Token input count: {token_input_count}')
+    print(f'Token output count: {token_output_count}')

     return (
-        summary,
         strength,
         weakness,
-        ethical_concern,
         score,
-        summary_messages,
         strength_messages,
         weakness_messages,
-        ethical_messages,
         score_messages,
     )


 @beartype
 def write_metareview_prompting(
-    proposal: Dict[str, str],
     reviews: List[Dict[str, Union[int, str]]],
     model_name: str,
-    summary_prompt_template: Dict[str, Union[str, List[str]]],
     strength_prompt_template: Dict[str, Union[str, List[str]]],
     weakness_prompt_template: Dict[str, Union[str, List[str]]],
-    ethical_prompt_template: Dict[str, Union[str, List[str]]],
-    decision_prompt_template: Dict[str, Union[str, List[str]]],
     return_num: Optional[int] = 1,
     max_token_num: Optional[int] = 512,
     temperature: Optional[float] = 0.0,
@@ -303,48 +287,18 @@ def write_metareview_prompting(
 ) -> Tuple[
     str,
     str,
-    str,
-    str,
-    bool,
-    List[Dict[str, str]],
-    List[Dict[str, str]],
-    List[Dict[str, str]],
     List[Dict[str, str]],
     List[Dict[str, str]],
 ]:
-    proposal_str = map_proposal_to_str(proposal)
-    reviews_str = map_review_list_to_str(reviews)
-    summary_template_input = {
-        'proposal': proposal_str,
-        'reviews': reviews_str,
-    }
-    summary_messages = openai_format_prompt_construct(
-        summary_prompt_template, summary_template_input
-    )
-    summary = model_prompting(
-        model_name,
-        summary_messages,
-        return_num,
-        max_token_num,
-        temperature,
-        top_p,
-        stream,
-    )[0]
+    token_input_count = 0
+    token_output_count = 0
+
+    reviews_str = map_review_list_to_str(reviews)
     strength_template_input = {
-        'proposal': proposal_str,
         'reviews': reviews_str,
-        'summary': summary,
     }
     weakness_template_input = {
-        'proposal': proposal_str,
-        'reviews': reviews_str,
-        'summary': summary,
-    }
-    ethical_template_input = {
-        'proposal': proposal_str,
         'reviews': reviews_str,
     }
     strength_messages = openai_format_prompt_construct(
         strength_prompt_template, strength_template_input
@@ -352,10 +306,6 @@ def write_metareview_prompting(
     weakness_messages = openai_format_prompt_construct(
         weakness_prompt_template, weakness_template_input
     )
-    ethical_messages = openai_format_prompt_construct(
-        ethical_prompt_template, ethical_template_input
-    )
-
     strength = model_prompting(
         model_name,
         strength_messages,
@@ -374,49 +324,16 @@ def write_metareview_prompting(
         top_p,
         stream,
     )[0]
-    ethical_concern = model_prompting(
-        model_name,
-        ethical_messages,
-        return_num,
-        max_token_num,
-        temperature,
-        top_p,
-        stream,
-    )[0]
-
-    decision_template_input = {
-        'proposal': proposal_str,
-        'reviews': reviews_str,
-        'summary': summary,
-        'strength': strength,
-        'weakness': weakness,
-        'ethical_concern': ethical_concern,
-    }
-    decision_messages = openai_format_prompt_construct(
-        decision_prompt_template, decision_template_input
-    )
-    decision_str = model_prompting(
-        model_name,
-        decision_messages,
-        return_num,
-        max_token_num,
-        temperature,
-        top_p,
-        stream,
-    )
-    decision = 'accept' in decision_str[0].lower()
+    token_input_count += token_counter(model=model_name, messages=strength_messages)
+    token_input_count += token_counter(model=model_name, messages=weakness_messages)
+    token_output_count += token_counter(model=model_name, text=strength)
+    token_output_count += token_counter(model=model_name, text=weakness)

     return (
-        summary,
         strength,
         weakness,
-        ethical_concern,
-        decision,
-        summary_messages,
         strength_messages,
         weakness_messages,
-        ethical_messages,
-        decision_messages,
     )
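The regex-based score parsing above is worth isolating: unlike the old `.split('... a score of ')[1]` chain, which raised `IndexError` whenever the model deviated from the exact phrasing, it degrades to 0. A standalone sketch of the same logic:

```python
import re


def extract_score(response: str) -> int:
    # Take the first integer anywhere in the reply; fall back to 0.
    matches = re.findall(r'\d+', response)
    return int(matches[0]) if matches else 0


assert extract_score(
    'Based on the given information, I would give this submission a score of 6 out of 10.'
) == 6
assert extract_score('No numeric score given.') == 0
# Caveat: a reply that leads with another number ('2 reviewers agree...')
# would be misparsed, so the prompt should ask for the score up front.
```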
diff --git a/research_town/utils/string_mapper.py b/research_town/utils/string_mapper.py
index d59c4a75..6ad90390 100644
--- a/research_town/utils/string_mapper.py
+++ b/research_town/utils/string_mapper.py
@@ -18,6 +18,13 @@ def map_proposal_to_str(paper: Dict[str, str]) -> str:
     return f"{paper['content']}"


+def map_cited_abstracts_to_str(abstracts: List[str]) -> str:
+    result = ''
+    for i, abstract in enumerate(abstracts):
+        result += f'Cited abstract {i + 1}: ' + abstract + '\n'
+    return result
+
+
 def map_paper_list_to_str(papers: List[Dict[str, str]]) -> str:
     result = ''
     for i, paper in enumerate(papers):
@@ -39,14 +46,14 @@ def map_review_list_to_str(reviews: List[Dict[str, Union[int, str]]]) -> str:

 def map_review_to_str(review: Dict[str, Union[int, str]]) -> str:
     assert 'score' in review
-    assert 'summary' in review
+    # assert 'summary' in review
     assert 'strength' in review
     assert 'weakness' in review
     score = review['score']
-    summary = review['summary']
+    # summary = review['summary']
     strength = review['strength']
     weakness = review['weakness']
-    return f'Score: {score}\nSummary: {summary}\nStrength: {strength}\nWeakness: {weakness}'
+    return f'Score: {score}\nStrength: {strength}\nWeakness: {weakness}\n\n'


 def map_rebuttal_list_to_str(rebuttals: List[Dict[str, str]]) -> str:
@@ -77,7 +84,7 @@ def map_metareview_to_str(metareview: Dict[str, str]) -> str:
     summary = metareview['summary']
     strength = metareview['strength']
     weakness = metareview['weakness']
-    return f'Summary: {summary}\nStrength: {strength}\nWeakness: {weakness}\nDecision: {decision}'
+    return f'Summary: {summary}\nStrength: {strength}\nWeakness: {weakness}\nDecision: {decision}\n'
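A quick demo of the new citation formatter (with the ordinal phrasing corrected from the patch's original "1th cited abstract"); the sample abstracts are made up:

```python
def map_cited_abstracts_to_str(abstracts):
    # Same body as the helper added to string_mapper.py above.
    result = ''
    for i, abstract in enumerate(abstracts):
        result += f'Cited abstract {i + 1}: ' + abstract + '\n'
    return result


print(map_cited_abstracts_to_str(['Temporal GNNs.', 'Hybrid sentiment models.']))
# Cited abstract 1: Temporal GNNs.
# Cited abstract 2: Hybrid sentiment models.
```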
diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py
index 5472818d..c9bcb8db 100644
--- a/tests/agents/test_agents.py
+++ b/tests/agents/test_agents.py
@@ -85,11 +85,11 @@ def test_write_review(mock_model_prompting: MagicMock) -> None:
         role='reviewer',
     )
     review = agent.write_review(
+        profile=profile_A,
         proposal=research_proposal_A,
         config=example_config,
     )
     assert isinstance(review, Review)
-    assert review.summary == 'Summary of the paper1'
     assert review.strength == 'Strength of the paper1'
     assert review.weakness == 'Weakness of the paper1'
     assert review.score == 8
@@ -109,16 +109,18 @@ def test_write_metareview(mock_model_prompting: MagicMock) -> None:
         role='chair',
     )
     review = agent_reviewer.write_review(
+        profile=profile_A,
         proposal=research_proposal_A,
         config=example_config,
     )
+    assert review.score is not None
     metareview = agent_chair.write_metareview(
+        scores=[review.score],
         proposal=research_proposal_A,
         reviews=[review],
         config=example_config,
     )
     assert isinstance(metareview, MetaReview)
-    assert metareview.summary == 'Meta review summary1'
     assert metareview.strength == 'Meta review strength1'
     assert metareview.weakness == 'Meta review weakness1'
     assert metareview.decision is True
@@ -139,6 +141,7 @@ def test_write_rebuttal(mock_model_prompting: MagicMock) -> None:
         role='leader',
     )
     review = agent_reviewer.write_review(
+        profile=profile_A,
         proposal=research_proposal_A,
         config=example_config,
     )
diff --git a/tests/mocks/mocking_func.py b/tests/mocks/mocking_func.py
index 0bd1d845..270550a7 100644
--- a/tests/mocks/mocking_func.py
+++ b/tests/mocks/mocking_func.py
@@ -36,14 +36,6 @@ def mock_prompting(
         return ['Summarized idea1', 'Summarized idea2', 'Summarized idea3']
     elif prompt[0]['content'] == agent_prompt_template.write_proposal['sys_prompt']:
         return ['Paper abstract1', 'Paper abstract2', 'Paper abstract3']
-    elif (
-        prompt[0]['content'] == agent_prompt_template.write_review_summary['sys_prompt']
-    ):
-        return [
-            'Summary of the paper1',
-            'Summary of the paper2',
-            'Summary of the paper3',
-        ]
     elif (
         prompt[0]['content']
         == agent_prompt_template.write_review_strength['sys_prompt']
@@ -68,11 +60,6 @@ def mock_prompting(
             'Based on the given information, I would give this submission a score of 6 out of 10.',
             'Based on the given information, I would give this submission a score of 5 out of 10.',
         ]
-    elif (
-        prompt[0]['content']
-        == agent_prompt_template.write_metareview_summary['sys_prompt']
-    ):
-        return ['Meta review summary1', 'Meta review summary2', 'Meta review summary3']
     elif (
         prompt[0]['content']
         == agent_prompt_template.write_metareview_strength['sys_prompt']
@@ -91,11 +78,6 @@ def mock_prompting(
             'Meta review weakness2',
             'Meta review weakness3',
         ]
-    elif (
-        prompt[0]['content']
-        == agent_prompt_template.write_metareview_decision['sys_prompt']
-    ):
-        return ['accept', 'accept', 'reject']
     elif prompt[0]['content'] == agent_prompt_template.write_rebuttal['sys_prompt']:
         return ['Rebuttal text1', 'Rebuttal text2', 'Rebuttal text3']
     elif prompt[0]['content'] == eval_prompt_template.insight_quality['sys_prompt']:
diff --git a/tests/utils/test_string_mapper.py b/tests/utils/test_string_mapper.py
index 7ac548d4..ce470efb 100644
--- a/tests/utils/test_string_mapper.py
+++ b/tests/utils/test_string_mapper.py
@@ -60,7 +60,7 @@ def test_map_review_list_to_str() -> None:
             'weakness': 'Weakness 3',
         },
     ]
-    expected_result = 'Score: 5\nSummary: Review 1\nStrength: Strength 1\nWeakness: Weakness 1Score: 3\nSummary: Review 2\nStrength: Strength 2\nWeakness: Weakness 2Score: 4\nSummary: Review 3\nStrength: Strength 3\nWeakness: Weakness 3'
+    expected_result = 'Score: 5\nStrength: Strength 1\nWeakness: Weakness 1\n\nScore: 3\nStrength: Strength 2\nWeakness: Weakness 2\n\nScore: 4\nStrength: Strength 3\nWeakness: Weakness 3\n\n'
     assert map_review_list_to_str(reviews) == expected_result


@@ -83,9 +83,7 @@ def test_map_review_to_str() -> None:
         'strength': 'Strength',
         'weakness': 'Weakness',
     }
-    expected_result = (
-        'Score: 4\nSummary: This is a review\nStrength: Strength\nWeakness: Weakness'
-    )
+    expected_result = 'Score: 4\nStrength: Strength\nWeakness: Weakness\n\n'
     assert map_review_to_str(review) == expected_result


@@ -96,7 +94,7 @@ def test_map_metareview_to_str() -> None:
         'weakness': 'Weakness',
         'decision': 'accept',
     }
-    expected_result = 'Summary: This is a meta review\nStrength: Strength\nWeakness: Weakness\nDecision: accept'
+    expected_result = 'Summary: This is a meta review\nStrength: Strength\nWeakness: Weakness\nDecision: accept\n'
     assert map_metareview_to_str(metareview) == expected_result


@@ -121,7 +119,7 @@ def test_map_metareview_list_to_str() -> None:
             'decision': 'accept',
         },
     ]
-    expected_result = 'Summary: Meta review 1\nStrength: Strength 1\nWeakness: Weakness 1\nDecision: acceptSummary: Meta review 2\nStrength: Strength 2\nWeakness: Weakness 2\nDecision: rejectSummary: Meta review 3\nStrength: Strength 3\nWeakness: Weakness 3\nDecision: accept'
+    expected_result = 'Summary: Meta review 1\nStrength: Strength 1\nWeakness: Weakness 1\nDecision: accept\nSummary: Meta review 2\nStrength: Strength 2\nWeakness: Weakness 2\nDecision: reject\nSummary: Meta review 3\nStrength: Strength 3\nWeakness: Weakness 3\nDecision: accept\n'
     assert map_metareview_list_to_str(metareviews) == expected_result
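Finally, the token bookkeeping added in `agent_prompter.py` above can be exercised on its own. This mirrors the patch's `litellm` usage; the model name is chosen only for illustration:

```python
from litellm.utils import token_counter

messages = [{'role': 'user', 'content': 'Please review this proposal.'}]
reply = 'Strength: well-motivated problem. Weakness: limited evaluation.'

# Same keyword style as in write_review_prompting; counts depend on the
# tokenizer litellm associates with the given model.
input_tokens = token_counter(model='gpt-4o-mini', messages=messages)
output_tokens = token_counter(model='gpt-4o-mini', text=reply)
print(f'Token input count: {input_tokens}')
print(f'Token output count: {output_tokens}')
```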