From ddba8ba5ad5f1508c6028402f787c4dc0ada7cd9 Mon Sep 17 00:00:00 2001
From: Zirui Cheng <112863445+chengzr01@users.noreply.github.com>
Date: Thu, 23 May 2024 02:24:36 +0800
Subject: [PATCH] support rebuttal process (#58)

* support paper rebuttal environment (#36)

* fix review paper

* fix review paper test

* fix type errors (#36)

* fix test errors

* fix test errors

* update decision making and paper rebuttal (#36)

* fix test errors

* support return bool decision and int score

* fix ruff

* support full testing

* support full testing

* fix one typo

* fix mypy error

* fix pre-commit error

---------

Co-authored-by: timsanders256 <73690408+timsanders256@users.noreply.github.com>
Co-authored-by: Haofei Yu <1125027232@qq.com>
---
 research_town/agents/agent_base.py       |  31 ++++--
 research_town/envs/env_paper_rebuttal.py |  76 +++++++++++++--
 research_town/utils/agent_prompting.py   | 115 +++++++++++++++++++----
 tests/test_agent_base.py                 |  80 ++++++++++++----
 tests/test_envs.py                       |  21 +++++
 5 files changed, 269 insertions(+), 54 deletions(-)
 create mode 100644 tests/test_envs.py

diff --git a/research_town/agents/agent_base.py b/research_town/agents/agent_base.py
index 40157ab1..c1051662 100644
--- a/research_town/agents/agent_base.py
+++ b/research_town/agents/agent_base.py
@@ -8,7 +8,10 @@
     communicate_with_multiple_researchers_prompting,
     find_collaborators_prompting,
     generate_ideas_prompting,
+    make_review_decision_prompting,
+    rebut_review_prompting,
     review_paper_prompting,
+    review_score_prompting,
     summarize_research_direction_prompting,
     summarize_research_field_prompting,
     write_paper_abstract_prompting,
@@ -44,7 +47,7 @@ def get_profile(self, author_name: str) -> Dict[str, Any]:
             papers_list, papers_by_year = self._get_papers(entries, author_name)
             if len(papers_list) > 40:
                 papers_list = self._select_papers(papers_by_year, author_name)
-            
+
             # Trim the list to the 10 most recent papers
             papers_list = papers_list[:10]
 
@@ -159,7 +162,7 @@ def read_paper(
         trend_output = trend[0]
         return trend_output
 
-    def find_collaborators(self, input: Dict[str, str], parameter: float =0.5, max_number: int =3) -> List[str]:
+    def find_collaborators(self, input: Dict[str, str], parameter: float = 0.5, max_number: int = 3) -> List[str]:
         start_author = [self.name]
         graph, _, _ = bfs(
             author_list=start_author, node_limit=max_number)
@@ -207,11 +210,23 @@ def write_paper(self, input: List[str], external_data: Dict[str, Dict[str, List[
         paper_abstract = write_paper_abstract_prompting(input, external_data)
         return paper_abstract[0]
 
-    def review_paper(self, input: Dict[str, str], external_data: Dict[str, str]) -> str:
-        paper_review = review_paper_prompting(input, external_data)
-        return paper_review[0]
+    def review_paper(self, external_data: Dict[str, str]) -> Tuple[int, str]:
+        """Review the paper(s) in `external_data` and return (score, review text)."""
+        # The first LLM response is the review; a second prompt turns it into a score.
+        paper_review = review_paper_prompting(external_data)[0]
+        review_score = review_score_prompting(paper_review)
+        return review_score, paper_review
 
     def make_review_decision(
-        self, input: Dict[str, str], external_data: Dict[str, str]
-    ) -> str:
-        return "accept"
+        self, submission: Dict[str, str], review: Dict[str, Tuple[int, str]]
+    ) -> Tuple[bool, str]:
+        """Return (accepted, meta-review text) for `submission` given its reviews."""
+        meta_review = make_review_decision_prompting(submission, review)[0]
+        verdict = meta_review.lower()
+        accept_at, reject_at = verdict.find("accept"), verdict.find("reject")
+        # The verdict word that appears first wins, so "Reject; cannot accept" is a reject.
+        return accept_at != -1 and (reject_at == -1 or accept_at < reject_at), meta_review
+
+    def rebut_review(self, submission: Dict[str, str], review: Dict[str, Tuple[int, str]], decision: Dict[str, Tuple[bool, str]]) -> str:
+        """Return the first LLM-written rebuttal to `review` and `decision` for `submission`."""
+        return rebut_review_prompting(submission, review, decision)[0]
diff --git a/research_town/envs/env_paper_rebuttal.py b/research_town/envs/env_paper_rebuttal.py
index 63b6bdee..7a660887 100644
--- a/research_town/envs/env_paper_rebuttal.py
+++ b/research_town/envs/env_paper_rebuttal.py
@@ -1,4 +1,4 @@
-from typing import Dict
+from typing import Dict, Tuple
 
 from .env_base import BaseMultiAgentEnv
 
@@ -6,15 +6,73 @@
 class PaperRebuttalMultiAgentEnv(BaseMultiAgentEnv):
     def __init__(self, agent_dict: Dict[str, str]) -> None:
         super().__init__(agent_dict)
+        self.turn_number = 0  # incremented once per step() call
+        self.turn_max = 1  # step() sets `terminated` once turn_number reaches this
+        self.terminated = False
+        self.roles: Dict[str, str] = {}  # set by assign_roles(); step() filters on the role value
+        self.submission: Dict[str, str] = {}  # set by initialize_submission()
+        self.review = ""  # serialized reviews, written by submit_review()
+        self.decision = ""  # "accept" or "reject" as written by submit_decision(); "" until then
+        self.rebuttal = ""  # serialized rebuttals, written by submit_rebuttal()
+
+    def assign_roles(self, role_dict: Dict[str, str]) -> None:
+        self.roles = role_dict  # agent name -> role; step() acts on "reviewer" and "author"
+
+    def initialize_submission(self, external_data: Dict[str, str]) -> None:
+        self.submission = external_data  # handed to every reviewer and author agent in step()
+
+    def submit_review(self, review_dict: Dict[str, Tuple[int, str]]) -> None:
+        """Flatten each reviewer's (score, review) pair into `self.review`, blank-line separated."""
+        self.review = "\n\n".join(
+            f"Reviewer: {reviewer}\nScore: {entry[0]}\nReview: {entry[1]}" for reviewer, entry in review_dict.items())
+
+    def submit_decision(self, decision_dict: Dict[str, Tuple[bool, str]]) -> None:
+        """Record the majority verdict of `decision_dict` in `self.decision`.
+
+        Each value is an (accepted, meta-review) pair; only the flag is counted.
+        A tie resolves to "accept"; with no votes `self.decision` is left as is.
+        """
+        accept_votes = sum(1 for verdict in decision_dict.values() if verdict[0])
+        reject_votes = len(decision_dict) - accept_votes
+        if accept_votes > 0 and accept_votes >= reject_votes:
+            self.decision = "accept"
+        elif reject_votes > 0:
+            self.decision = "reject"
+
+    def submit_rebuttal(self, rebuttal_dict: Dict[str, str]) -> None:
+        """Flatten each author's rebuttal text into `self.rebuttal`, blank-line separated."""
+        self.rebuttal = "\n\n".join(
+            f"Author: {author}\nRebuttal: {text}" for author, text in rebuttal_dict.items())
 
     def step(self) -> None:
-        external_data = self.kb.get_data(10, "machine learning")
-        for agent_name, agent in self.agents.items():
-            agent.read_paper(external_data=external_data, domain="machine learning")
-            agent.review_paper({}, {})
-            agent.make_review_decision({}, {})
+        # Paper reviewing: every agent whose role is "reviewer" reviews the submission.
+        review_dict: Dict[str, Tuple[int, str]] = {}
+        for name, role in self.roles.items():
+            if role == "reviewer":
+                review_dict[name] = self.agents[name].review_paper(
+                    external_data=self.submission)
+        self.submit_review(review_dict)
+
+        # Decision making: each reviewer gives an accept/reject verdict after seeing all reviews.
+        decision_dict: Dict[str, Tuple[bool, str]] = {}
+        for name, role in self.roles.items():
+            if role == "reviewer":
+                decision_dict[name] = self.agents[name].make_review_decision(
+                    submission=self.submission, review=review_dict)
+        self.submit_decision(decision_dict)
 
-        self.submit_rebuttal()
+        # Rebuttal: each "author" agent answers the reviews and the per-reviewer decisions.
+        rebuttal_dict: Dict[str, str] = {}
+        for name, role in self.roles.items():
+            if role == "author":
+                rebuttal_dict[name] = self.agents[name].rebut_review(
+                    submission=self.submission,
+                    review=review_dict,
+                    decision=decision_dict)
+        self.submit_rebuttal(rebuttal_dict)
 
-    def submit_rebuttal(self) -> None:
-        pass
+        self.turn_number += 1
+        if self.decision == "accept":  # verdict recorded by submit_decision() above
+            self.terminated = True
+        if self.turn_number >= self.turn_max:  # out of rounds: stop even without an accept
+            self.terminated = True
diff --git a/research_town/utils/agent_prompting.py b/research_town/utils/agent_prompting.py
index 0af68916..b3e3cdc2 100644
--- a/research_town/utils/agent_prompting.py
+++ b/research_town/utils/agent_prompting.py
@@ -1,5 +1,5 @@
 import os
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import openai
 
@@ -33,7 +33,8 @@ def get_query_embedding(query: str) -> Any:
 
 
 def find_nearest_neighbors(data_embeddings: List[Any], query_embedding: Any, num_neighbors: int) -> Any:
-    neighbors = neiborhood_search(data_embeddings, query_embedding, num_neighbors)
+    neighbors = neiborhood_search(
+        data_embeddings, query_embedding, num_neighbors)
     neighbors = neighbors.reshape(-1)
 
     return neighbors.tolist()
@@ -60,10 +61,13 @@ def summarize_research_field_prompting(
 
     query_embedding = get_query_embedding(query)
 
-    text_chunks = [abstract for papers in dataset.values() for abstract in papers["abstract"]]
-    data_embeddings = [embedding for embeddings in data_embedding.values() for embedding in embeddings]
+    text_chunks = [abstract for papers in dataset.values()
+                   for abstract in papers["abstract"]]
+    data_embeddings = [embedding for embeddings in data_embedding.values()
+                       for embedding in embeddings]
 
-    nearest_indices = find_nearest_neighbors(data_embeddings, query_embedding, num_neighbors=10)
+    nearest_indices = find_nearest_neighbors(
+        data_embeddings, query_embedding, num_neighbors=10)
     context = [text_chunks[i] for i in nearest_indices]
 
     template_input["papers"] = "; ".join(context)
@@ -71,7 +75,8 @@ def summarize_research_field_prompting(
 
     return openai_prompting(llm_model, prompt)
 
-def find_collaborators_prompting(input: Dict[str, str], self_profile: Dict[str, str], collaborator_profiles: Dict[str, str], parameter: float =0.5, max_number: int =3,  llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1",) -> List[str]:
+
+def find_collaborators_prompting(input: Dict[str, str], self_profile: Dict[str, str], collaborator_profiles: Dict[str, str], parameter: float = 0.5, max_number: int = 3,  llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1",) -> List[str]:
     self_serialize = [
         f"Name: {name}\nProfile: {self_profile[name]}" for _, name in enumerate(self_profile.keys())]
     self_serialize_all = "\n\n".join(self_serialize)
@@ -96,6 +101,7 @@ def find_collaborators_prompting(input: Dict[str, str], self_profile: Dict[str,
     prompt = prompt_qa.format_map(input)
     return openai_prompting(llm_model, prompt)
 
+
 def generate_ideas_prompting(
     trend: str,
     llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -158,40 +164,107 @@ def write_paper_abstract_prompting(
         "Here are the external data, which is a list abstracts of related papers: {papers_serialize_all}"
     )
 
-    template_input = {"ideas_serialize_all": ideas_serialize_all, "papers_serialize_all": papers_serialize_all}
+    template_input = {"ideas_serialize_all": ideas_serialize_all,
+                      "papers_serialize_all": papers_serialize_all}
     prompt = prompt_template.format_map(template_input)
     return openai_prompting(llm_model, prompt)
 
-def review_paper_prompting(titles: Dict[str, str], external_data: Dict[str, str],  llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1") -> List[str]:
+def review_score_prompting(paper_review: str, llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1") -> int:
+    """Ask the LLM to rate `paper_review` from 1 to 10; return 0 if no valid score comes back."""
+    prompt_qa = (
+        "Please provide a score for the following reviews. The score should be between 1 and 10, where 1 is the lowest and 10 is the highest. Only returns one number score."
+        "Here are the reviews: {paper_review}"
+    )
+    prompt = prompt_qa.format_map({"paper_review": paper_review})
+    responses = openai_prompting(llm_model, prompt)
+    # Replies are often padded ("7\n"); isdecimal() also rejects digits int() cannot parse.
+    score_str = responses[0].strip() if responses else ""
+    score = int(score_str) if score_str.isdecimal() else 0
+    return score if 1 <= score <= 10 else 0
+
+def review_paper_prompting(external_data: Dict[str, str],  llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1") -> List[str]:
     """
     Review paper from using list, and external data (published papers)
     """
 
-    titles_serialize = []
-    for _, timestamp in enumerate(titles.keys()):
-        title_entry = f"Time: {timestamp}\nPaper: {external_data[timestamp]}"
-        titles_serialize.append(title_entry)
-    titles_serialize_all = "\n\n".join(titles_serialize)
-
     papers_serialize = []
     for _, timestamp in enumerate(external_data.keys()):
-        paper_entry = f"Time: {timestamp}\nPaper: {external_data[timestamp]}"
+        paper_entry = f"Title: {timestamp}\nPaper: {external_data[timestamp]}"
         papers_serialize.append(paper_entry)
     papers_serialize_all = "\n\n".join(papers_serialize)
 
     prompt_qa = (
         "Please give some reviews based on the following inputs and external data."
         "You might use two or more of these titles if they are related and works well together."
-        "Here are the titles: {titles_serialize_all}"
         "Here are the external data, which is a list of related papers: {papers_serialize_all}"
     )
 
-    input = {"titles_serialize_all": titles_serialize_all,
-             "papers_serialize_all": papers_serialize_all}
+    input = {"papers_serialize_all": papers_serialize_all}
 
     prompt = prompt_qa.format_map(input)
     return openai_prompting(llm_model, prompt)
 
+
+def make_review_decision_prompting(submission: Dict[str, str], review: Dict[str, Tuple[int, str]], llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1") -> List[str]:
+    """Ask the LLM for an accept/reject meta-review of `submission`.
+
+    `submission` maps paper title to abstract; `review` maps reviewer name to a
+    (score, review text) pair. Returns the responses from `openai_prompting`.
+    """
+    submission_serialize_all = "\n\n".join(
+        f"Title: {title}\nAbstract:{abstract}\n"
+        for title, abstract in submission.items()
+    )
+    # Unpack each (score, text) pair: interpolating the tuple itself would put its
+    # repr (parentheses, quotes, escaped newlines) into the prompt.
+    review_serialize_all = "\n\n".join(
+        f"Name: {name}\nScore: {score}\nContent: {content}\n"
+        for name, (score, content) in review.items()
+    )
+    prompt_template = (
+        "Please make an review decision to decide whether the following submission should be accepted or rejected by an academic conference. Here are several reviews from reviewers for this submission. Please indicate your review decision as accept or reject."
+        "Here is the submission: {submission_serialize_all}"
+        "Here are the reviews: {review_serialize_all}"
+    )
+    template_input = {"submission_serialize_all": submission_serialize_all,
+                      "review_serialize_all": review_serialize_all}
+    return openai_prompting(llm_model, prompt_template.format_map(template_input))
+
+
+def rebut_review_prompting(submission: Dict[str, str], review: Dict[str, Tuple[int, str]], decision: Dict[str, Tuple[bool, str]], llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1") -> List[str]:
+    """Ask the LLM to write the authors' rebuttal for `submission`.
+
+    `submission` maps paper title to abstract, `review` maps reviewer name to a
+    (score, review text) pair, and `decision` maps reviewer name to an
+    (accepted, meta-review text) pair. Returns the responses from `openai_prompting`.
+    """
+    submission_serialize_all = "\n\n".join(
+        f"Title: {title}\nAbstract:{abstract}\n"
+        for title, abstract in submission.items()
+    )
+    # Unpack the tuples below: interpolating a tuple directly would leak its repr
+    # (parentheses, quotes, escaped newlines) into the prompt.
+    review_serialize_all = "\n\n".join(
+        f"Name: {name}\nScore: {score}\nContent: {content}\n"
+        for name, (score, content) in review.items()
+    )
+    decision_serialize_all = "\n\n".join(
+        f"Name: {name}\nDecision: {'accept' if accepted else 'reject'}\nMeta-review: {meta_review}\n"
+        for name, (accepted, meta_review) in decision.items()
+    )
+    prompt_template = (
+        "Please write a rebuttal for the following submission you have made to an academic conference. Here are the reviews and decisions from the reviewers. Your rebuttal should rebut the reviews to convince the reviewers to accept your submission."
+        "Here is the submission: {submission_serialize_all}"
+        "Here are the reviews: {review_serialize_all}"
+        "Here are the decisions: {decision_serialize_all}"
+    )
+    template_input = {"submission_serialize_all": submission_serialize_all,
+                      "review_serialize_all": review_serialize_all,
+                      "decision_serialize_all": decision_serialize_all}
+    prompt = prompt_template.format_map(template_input)
+    return openai_prompting(llm_model, prompt)
+
+
 def communicate_with_multiple_researchers_prompting(
     input: Dict[str, str],
     llm_model: Optional[str] = "mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -199,12 +272,14 @@ def communicate_with_multiple_researchers_prompting(
     """
     This is a single-round chat method. One that contains a chat history can better enable
     """
-    single_round_chat_serialize = [f"Message from researcher named {name}: {message}" for name, message in input.items()]
+    single_round_chat_serialize = [
+        f"Message from researcher named {name}: {message}" for name, message in input.items()]
     single_round_chat_serialize_all = "\n".join(single_round_chat_serialize)
     prompt_template = (
         "Please continue in a conversation with other fellow researchers for me, where you will address their concerns in a scholarly way. "
         "Here are the messages from other researchers: {single_round_chat_serialize_all}"
     )
-    template_input = {"single_round_chat_serialize_all": single_round_chat_serialize_all}
+    template_input = {
+        "single_round_chat_serialize_all": single_round_chat_serialize_all}
     prompt = prompt_template.format_map(template_input)
     return openai_prompting(llm_model, prompt)
diff --git a/tests/test_agent_base.py b/tests/test_agent_base.py
index 05495cd4..47e4eb82 100644
--- a/tests/test_agent_base.py
+++ b/tests/test_agent_base.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, List
 from unittest.mock import MagicMock, patch
 
 from research_town.agents.agent_base import BaseResearchAgent
@@ -7,7 +7,8 @@
 @patch("research_town.utils.agent_prompting.openai_prompting")
 def test_get_profile(mock_openai_prompting: MagicMock) -> None:
     mock_response = MagicMock()
-    mock_response.return_value = ["I am a research agent who is interested in machine learning."]
+    mock_response.return_value = [
+        "I am a research agent who is interested in machine learning."]
 
     mock_openai_prompting.return_value = mock_response
 
@@ -16,42 +17,70 @@ def test_get_profile(mock_openai_prompting: MagicMock) -> None:
     assert profile["name"] == "Jiaxuan You"
     assert "profile" in profile.keys()
 
+@patch("research_town.utils.agent_prompting.openai_prompting")
+def test_make_review_decision(mock_openai_prompting: MagicMock) -> None:
+    mock_openai_prompting.return_value = [
+        "Accept. This is a good paper."]  # the same canned reply answers every LLM call below
+
+    research_agent = BaseResearchAgent("Jiaxuan You")
+    submission = {"MambaOut: Do We Really Need Mamba for Vision?": "Mamba, an architecture with RNN-like token mixer of state space model (SSM), was recently introduced to address the quadratic complexity of the attention mechanism and subsequently applied to vision tasks. Nevertheless, the performance of Mamba for vision is often underwhelming when compared with convolutional and attention-based models. In this paper, we delve into the essence of Mamba, and conceptually conclude that Mamba is ideally suited for tasks with long-sequence and autoregressive characteristics. For vision tasks, as image classification does not align with either characteristic, we hypothesize that Mamba is not necessary for this task; Detection and segmentation tasks are also not autoregressive, yet they adhere to the long-sequence characteristic, so we believe it is still worthwhile to explore Mamba's potential for these tasks. To empirically verify our hypotheses, we construct a series of models named \\emph{MambaOut} through stacking Mamba blocks while removing their core token mixer, SSM. Experimental results strongly support our hypotheses. Specifically, our MambaOut model surpasses all visual Mamba models on ImageNet image classification, indicating that Mamba is indeed unnecessary for this task. As for detection and segmentation, MambaOut cannot match the performance of state-of-the-art visual Mamba models, demonstrating the potential of Mamba for long-sequence visual tasks."}
+    review = research_agent.review_paper(external_data=submission)  # (score, text); the canned reply is not a number, so score is 0
+    review_decision, meta_review = research_agent.make_review_decision(
+        submission=submission, review={"Jiaxuan You": review})
+    assert review_decision is True
+    assert meta_review == "Accept. This is a good paper."
+
+@patch("research_town.utils.agent_prompting.openai_prompting")
+def test_review_paper(mock_openai_prompting: MagicMock) -> None:
+    def mock_response(*args: Any, **kwargs: Any) -> List[str]:
+        prompt = args[1]  # openai_prompting(llm_model, prompt) is called positionally
+        if "Please give some reviews based on the following inputs and external data." in prompt:
+            return ["This is a paper review for MambaOut."]
+        elif "Please provide a score for the following reviews." in prompt:
+            return ["2"]
+        return ["Default response"]
+
+    mock_openai_prompting.side_effect = mock_response
+
+    research_agent = BaseResearchAgent("Jiaxuan You")
+    score, review = research_agent.review_paper(external_data={"MambaOut: Do We Really Need Mamba for Vision?": "Mamba, an architecture with RNN-like token mixer of state space model (SSM), was recently introduced to address the quadratic complexity of the attention mechanism and subsequently applied to vision tasks. Nevertheless, the performance of Mamba for vision is often underwhelming when compared with convolutional and attention-based models. In this paper, we delve into the essence of Mamba, and conceptually conclude that Mamba is ideally suited for tasks with long-sequence and autoregressive characteristics. For vision tasks, as image classification does not align with either characteristic, we hypothesize that Mamba is not necessary for this task; Detection and segmentation tasks are also not autoregressive, yet they adhere to the long-sequence characteristic, so we believe it is still worthwhile to explore Mamba's potential for these tasks. To empirically verify our hypotheses, we construct a series of models named \\emph{MambaOut} through stacking Mamba blocks while removing their core token mixer, SSM. Experimental results strongly support our hypotheses. Specifically, our MambaOut model surpasses all visual Mamba models on ImageNet image classification, indicating that Mamba is indeed unnecessary for this task. As for detection and segmentation, MambaOut cannot match the performance of state-of-the-art visual Mamba models, demonstrating the potential of Mamba for long-sequence visual tasks."})
+    # The score and the review come from separate LLM calls, which the mock tells apart by prompt text.
+    assert score == 2
+    assert review == "This is a paper review for MambaOut."
+
 @patch("research_town.utils.agent_prompting.openai_prompting")
 def test_generate_idea(mock_openai_prompting: MagicMock) -> None:
     mock_openai_prompting.return_value = ["This is a generated idea."]
 
     research_agent = BaseResearchAgent("Jiaxuan You")
     ideas = research_agent.generate_idea({"2024-04": {"abstract": ["Believable proxies of human behavior can empower interactive applications ranging from immersive environments to rehearsal spaces for interpersonal communication to prototyping tools. In this paper, we introduce generative agents--computational software agents that simulate believable human behavior. Generative agents wake up, cook breakfast, and head to work; artists paint, while authors write; they form opinions, notice each other, and initiate conversations; they remember and reflect on days past as they plan the next day. To enable generative agents, we describe an architecture that extends a large language model to store a complete record of the agent's experiences using natural language, synthesize those memories over time into higher-level reflections, and retrieve them dynamically to plan behavior. We instantiate generative agents to populate an interactive sandbox environment inspired by The Sims, where end users can interact with a small town of twenty five agents using natural language. In an evaluation, these generative agents produce believable individual and emergent social behaviors: for example, starting with only a single user-specified notion that one agent wants to throw a Valentine's Day party, the agents autonomously spread invitations to the party over the next two days, make new acquaintances, ask each other out on dates to the party, and coordinate to show up for the party together at the right time. We demonstrate through ablation that the components of our agent architecture--observation, planning, and reflection--each contribute critically to the believability of agent behavior. By fusing large language models with computational, interactive agents, this work introduces architectural and interaction patterns for enabling believable simulations of human behavior. "]}}, domain="machine learning")
-    
+
     assert isinstance(ideas, list)
     assert len(ideas) > 0
 
 @patch("research_town.utils.agent_prompting.openai_prompting")
 def test_communicate(mock_openai_prompting: MagicMock) -> None:
-    mock_openai_prompting.return_value = ["I believe in the potential of using automous agents to simulate the current research pipeline."]
+    mock_openai_prompting.return_value = [
+        "I believe in the potential of using automous agents to simulate the current research pipeline."]
 
     research_agent = BaseResearchAgent("Jiaxuan You")
-    response = research_agent.communicate({"Alice": "I believe in the potential of using automous agents to simulate the current research pipeline."})
+    response = research_agent.communicate(
+        {"Alice": "I believe in the potential of using automous agents to simulate the current research pipeline."})
     assert isinstance(response, str)
     assert response != ""
 
+
 @patch("research_town.utils.agent_prompting.openai_prompting")
 def test_write_paper_abstract(mock_openai_prompting: MagicMock) -> None:
     mock_openai_prompting.return_value = ["Believable proxies of human behavior can empower interactive applications ranging from immersive environments to rehearsal spaces for interpersonal communication to prototyping tools. In this paper, we introduce generative agents--computational software agents that simulate believable human behavior. Generative agents wake up, cook breakfast, and head to work; artists paint, while authors write; they form opinions, notice each other, and initiate conversations; they remember and reflect on days past as they plan the next day. To enable generative agents, we describe an architecture that extends a large language model to store a complete record of the agent's experiences using natural language, synthesize those memories over time into higher-level reflections, and retrieve them dynamically to plan behavior. We instantiate generative agents to populate an interactive sandbox environment inspired by The Sims, where end users can interact with a small town of twenty five agents using natural language. In an evaluation, these generative agents produce believable individual and emergent social behaviors: for example, starting with only a single user-specified notion that one agent wants to throw a Valentine's Day party, the agents autonomously spread invitations to the party over the next two days, make new acquaintances, ask each other out on dates to the party, and coordinate to show up for the party together at the right time. We demonstrate through ablation that the components of our agent architecture--observation, planning, and reflection--each contribute critically to the believability of agent behavior. By fusing large language models with computational, interactive agents, this work introduces architectural and interaction patterns for enabling believable simulations of human behavior. "]
 
     research_agent = BaseResearchAgent("Jiaxuan You")
-    abstract = research_agent.write_paper(["We can simulate the scientific research pipeline with agents."], {"2024-04":{"abstract":["Believable proxies of human behavior can empower interactive applications ranging from immersive environments to rehearsal spaces for interpersonal communication to prototyping tools. In this paper, we introduce generative agents--computational software agents that simulate believable human behavior. Generative agents wake up, cook breakfast, and head to work; artists paint, while authors write; they form opinions, notice each other, and initiate conversations; they remember and reflect on days past as they plan the next day. To enable generative agents, we describe an architecture that extends a large language model to store a complete record of the agent's experiences using natural language, synthesize those memories over time into higher-level reflections, and retrieve them dynamically to plan behavior. We instantiate generative agents to populate an interactive sandbox environment inspired by The Sims, where end users can interact with a small town of twenty five agents using natural language. In an evaluation, these generative agents produce believable individual and emergent social behaviors: for example, starting with only a single user-specified notion that one agent wants to throw a Valentine's Day party, the agents autonomously spread invitations to the party over the next two days, make new acquaintances, ask each other out on dates to the party, and coordinate to show up for the party together at the right time. We demonstrate through ablation that the components of our agent architecture--observation, planning, and reflection--each contribute critically to the believability of agent behavior. By fusing large language models with computational, interactive agents, this work introduces architectural and interaction patterns for enabling believable simulations of human behavior. "]}})
+    abstract = research_agent.write_paper(["We can simulate the scientific research pipeline with agents."], {"2024-04": {"abstract": ["Believable proxies of human behavior can empower interactive applications ranging from immersive environments to rehearsal spaces for interpersonal communication to prototyping tools. In this paper, we introduce generative agents--computational software agents that simulate believable human behavior. Generative agents wake up, cook breakfast, and head to work; artists paint, while authors write; they form opinions, notice each other, and initiate conversations; they remember and reflect on days past as they plan the next day. To enable generative agents, we describe an architecture that extends a large language model to store a complete record of the agent's experiences using natural language, synthesize those memories over time into higher-level reflections, and retrieve them dynamically to plan behavior. We instantiate generative agents to populate an interactive sandbox environment inspired by The Sims, where end users can interact with a small town of twenty five agents using natural language. In an evaluation, these generative agents produce believable individual and emergent social behaviors: for example, starting with only a single user-specified notion that one agent wants to throw a Valentine's Day party, the agents autonomously spread invitations to the party over the next two days, make new acquaintances, ask each other out on dates to the party, and coordinate to show up for the party together at the right time. We demonstrate through ablation that the components of our agent architecture--observation, planning, and reflection--each contribute critically to the believability of agent behavior. By fusing large language models with computational, interactive agents, this work introduces architectural and interaction patterns for enabling believable simulations of human behavior. "]}})
     assert isinstance(abstract, str)
     assert abstract != ""
 
-@patch("research_town.utils.agent_prompting.openai_prompting")
-def test_review_paper(mock_openai_prompting: MagicMock) -> None:
-    mock_openai_prompting.return_value = ["This is a paper review for MambaOut."]
-    
-    research_agent = BaseResearchAgent("Jiaxuan You")
-    review = research_agent.review_paper(input={"13 May 2024": "MambaOut: Do We Really Need Mamba for Vision?"}, external_data={"13 May 2024": "Mamba, an architecture with RNN-like token mixer of state space model (SSM), was recently introduced to address the quadratic complexity of the attention mechanism and subsequently applied to vision tasks. Nevertheless, the performance of Mamba for vision is often underwhelming when compared with convolutional and attention-based models. In this paper, we delve into the essence of Mamba, and conceptually conclude that Mamba is ideally suited for tasks with long-sequence and autoregressive characteristics. For vision tasks, as image classification does not align with either characteristic, we hypothesize that Mamba is not necessary for this task; Detection and segmentation tasks are also not autoregressive, yet they adhere to the long-sequence characteristic, so we believe it is still worthwhile to explore Mamba's potential for these tasks. To empirically verify our hypotheses, we construct a series of models named \\emph{MambaOut} through stacking Mamba blocks while removing their core token mixer, SSM. Experimental results strongly support our hypotheses. Specifically, our MambaOut model surpasses all visual Mamba models on ImageNet image classification, indicating that Mamba is indeed unnecessary for this task. As for detection and segmentation, MambaOut cannot match the performance of state-of-the-art visual Mamba models, demonstrating the potential of Mamba for long-sequence visual tasks."})
-    assert isinstance(review, str)
-    assert review != ""
+
+
 
 @patch("research_town.utils.agent_prompting.openai_prompting")
 def test_read_paper(mock_openai_prompting: MagicMock) -> None:
@@ -63,11 +92,28 @@ def test_read_paper(mock_openai_prompting: MagicMock) -> None:
     summary = research_agent.read_paper(external_data, domain)
     assert isinstance(summary, str)
 
+
 @patch("research_town.utils.agent_prompting.openai_prompting")
 def test_find_collaborators(mock_openai_prompting: MagicMock) -> None:
-    mock_openai_prompting.return_value = ["These are collaborators including Jure Leskovec, Rex Ying, Saining Xie, Kaiming He."]
-    
+    """Check that find_collaborators returns a list when openai_prompting is mocked."""
+    mock_openai_prompting.return_value = [
+        "These are collaborators including Jure Leskovec, Rex Ying, Saining Xie, Kaiming He."]
     research_agent = BaseResearchAgent("Jiaxuan You")
     collaborators = research_agent.find_collaborators(
         input={"11 May 2024": "Organize a workshop on how far are we from AGI (artificial general intelligence) at ICLR 2024. This workshop aims to become a melting pot for ideas, discussions, and debates regarding our proximity to AGI."}, parameter=0.5, max_number=3)
-    assert isinstance(collaborators, List)
+    assert isinstance(collaborators, list)
+
+
+@patch("research_town.utils.agent_prompting.openai_prompting")
+def test_rebut_review(mock_openai_prompting: MagicMock) -> None:
+    """Chain review, decision and rebuttal on a mocked prompt and check the rebuttal is a str."""
+    mock_openai_prompting.return_value = ["This is a paper rebuttal"]
+    reviewer = "Jiaxuan You"
+    agent = BaseResearchAgent(reviewer)
+    submission = {"MambaOut: Do We Really Need Mamba for Vision?": "Mamba, an architecture with RNN-like token mixer of state space model (SSM), was recently introduced to address the quadratic complexity of the attention mechanism and subsequently applied to vision tasks. Nevertheless, the performance of Mamba for vision is often underwhelming when compared with convolutional and attention-based models. In this paper, we delve into the essence of Mamba, and conceptually conclude that Mamba is ideally suited for tasks with long-sequence and autoregressive characteristics. For vision tasks, as image classification does not align with either characteristic, we hypothesize that Mamba is not necessary for this task; Detection and segmentation tasks are also not autoregressive, yet they adhere to the long-sequence characteristic, so we believe it is still worthwhile to explore Mamba's potential for these tasks. To empirically verify our hypotheses, we construct a series of models named \\emph{MambaOut} through stacking Mamba blocks while removing their core token mixer, SSM. Experimental results strongly support our hypotheses. Specifically, our MambaOut model surpasses all visual Mamba models on ImageNet image classification, indicating that Mamba is indeed unnecessary for this task. As for detection and segmentation, MambaOut cannot match the performance of state-of-the-art visual Mamba models, demonstrating the potential of Mamba for long-sequence visual tasks."}
+    review = agent.review_paper(external_data=submission)
+    verdict = agent.make_review_decision(
+        submission=submission, review={reviewer: review})
+    rebuttal = agent.rebut_review(
+        submission=submission, review={reviewer: review}, decision={reviewer: verdict})
+    assert isinstance(rebuttal, str)
diff --git a/tests/test_envs.py b/tests/test_envs.py
new file mode 100644
index 00000000..8fd68cf3
--- /dev/null
+++ b/tests/test_envs.py
@@ -0,0 +1,21 @@
+from unittest.mock import MagicMock, patch
+
+from research_town.envs.env_paper_rebuttal import (
+    PaperRebuttalMultiAgentEnv,
+)
+
+
+@patch("research_town.utils.agent_prompting.openai_prompting")
+def test_paper_rebuttal_env(mock_openai_prompting: MagicMock) -> None:
+    """Step the rebuttal environment until it terminates and check its outputs are strings."""
+    mock_openai_prompting.return_value = ["Paper Rebuttal Environment."]
+    roles = {"Jiaxuan You": "author", "Rex Ying": "author", "Jure Leskovec": "reviewer", "Christos Faloutsos": "reviewer"}
+    env = PaperRebuttalMultiAgentEnv(agent_dict={name: name for name in roles})
+    env.assign_roles(roles)
+    env.initialize_submission({"MambaOut: Do We Really Need Mamba for Vision?": "Mamba, an architecture with RNN-like token mixer of state space model (SSM), was recently introduced to address the quadratic complexity of the attention mechanism and subsequently applied to vision tasks. Nevertheless, the performance of Mamba for vision is often underwhelming when compared with convolutional and attention-based models. In this paper, we delve into the essence of Mamba, and conceptually conclude that Mamba is ideally suited for tasks with long-sequence and autoregressive characteristics. For vision tasks, as image classification does not align with either characteristic, we hypothesize that Mamba is not necessary for this task; Detection and segmentation tasks are also not autoregressive, yet they adhere to the long-sequence characteristic, so we believe it is still worthwhile to explore Mamba's potential for these tasks. To empirically verify our hypotheses, we construct a series of models named \\emph{MambaOut} through stacking Mamba blocks while removing their core token mixer, SSM. Experimental results strongly support our hypotheses. Specifically, our MambaOut model surpasses all visual Mamba models on ImageNet image classification, indicating that Mamba is indeed unnecessary for this task. As for detection and segmentation, MambaOut cannot match the performance of state-of-the-art visual Mamba models, demonstrating the potential of Mamba for long-sequence visual tasks."})
+    for _ in range(100):  # cap the steps so a stuck environment fails instead of hanging
+        if env.terminated:
+            break
+        env.step()
+    assert env.terminated, "environment did not terminate within 100 steps"
+    assert all(isinstance(out, str) for out in (env.review, env.decision, env.rebuttal))