From 1ce0df9bfc23192f23b21a0c2603109814cd03f5 Mon Sep 17 00:00:00 2001
From: Dongge Liu
Date: Mon, 21 Oct 2024 14:38:35 +1100
Subject: [PATCH] Prototyper improvements (#661)

## Functionality
1. Validate that the function-under-test is referenced by the fuzz target (fixes #658).
2. Use cached image from `chronos`.
3. Use exact exception matching in decorator `@retryable` to ensure the correct config (e.g., #retries) is applied.
4. Compile the new fuzz target with the old build script once, even if the LLM proposes a new build script, in case the new one is wrong.
5. Show the chat history of each trial for simpler debugging.
6. Show exceptions (e.g., cloud build expiry) in the report.
---
 agent/base_agent.py             |   8 ++-
 agent/prototyper.py             | 107 +++++++++++++++++++++++++-------
 ci/k8s/pr-exp.yaml              |   4 ++
 common/cloud_builder.py         |  43 ++++++++-----
 experiment/oss_fuzz_checkout.py |  44 +++++++++++--
 llm_toolkit/models.py           |  75 ++++++++++++++++++----
 logger.py                       |   7 ++-
 results.py                      |  13 ++--
 stage/execution_stage.py        |  25 ++++----
 tool/container_tool.py          |  68 +++++++++++++++-----
 utils.py                        |  10 +--
 11 files changed, 313 insertions(+), 91 deletions(-)

diff --git a/agent/base_agent.py b/agent/base_agent.py
index 732b5699d3..f4e99300ad 100644
--- a/agent/base_agent.py
+++ b/agent/base_agent.py
@@ -59,10 +59,13 @@ def _filter_code(self, raw_code_block: str) -> str:
     return filtered_code_block

   def _format_bash_execution_result(self, process: sp.CompletedProcess) -> str:
+    stdout = self.llm.truncate_prompt(process.stdout)
+    # TODO(dongge) Share input limit evenly if both stdout and stderr are overlong.
+    stderr = self.llm.truncate_prompt(process.stderr, stdout)
     return (f'\n{process.args}\n\n'
             f'\n{process.returncode}\n\n'
-            f'\n{process.stdout}\n\n'
-            f'\n{process.stderr}\n\n')
+            f'\n{stdout}\n\n'
+            f'\n{stderr}\n\n')

   def _container_handle_bash_command(self, cur_round: int, response: str,
                                      tool: BaseTool) -> Prompt:
@@ -113,6 +116,7 @@ def cloud_main(cls) -> None:
     args = cls._parse_args()
     agent = utils.deserialize_from_dill(args.agent)
+    agent.llm.cloud_setup()
     result_history = utils.deserialize_from_dill(args.result_history)
     result = agent.execute(result_history)
     utils.serialize_to_dill(result, args.result_new)
diff --git a/agent/prototyper.py b/agent/prototyper.py
index 999915d86c..610a6fba3a 100644
--- a/agent/prototyper.py
+++ b/agent/prototyper.py
@@ -6,6 +6,7 @@
 import logger
 from agent.base_agent import BaseAgent
+from experiment.benchmark import Benchmark
 from llm_toolkit.prompt_builder import DefaultTemplateBuilder
 from llm_toolkit.prompts import Prompt
 from results import BuildResult, Result
@@ -24,7 +25,6 @@ def _initial_prompt(self, results: list[Result]) -> Prompt:
     default_prompt_builder = DefaultTemplateBuilder(model=self.llm,
                                                     benchmark=benchmark)
     prompt = default_prompt_builder.build([])
-    # TODO(dongge): Find a way to save prompt and log for agents
     return prompt

   def _update_fuzz_target_and_build_script(self, cur_round: int, response: str,
@@ -43,7 +43,9 @@ def _update_fuzz_target_and_build_script(self, cur_round: int, response: str,

     build_script_source = self._filter_code(
         self._parse_tag(response, 'build script'))
-    build_result.build_script_source = build_script_source
+    # Sometimes LLM adds chronos, which makes no sense for new build scripts.
+ build_result.build_script_source = build_script_source.replace( + 'source /src/chronos.sh', '') if build_script_source: logger.debug('ROUND %02d Parsed build script from LLM: %s', cur_round, build_script_source) @@ -51,18 +53,58 @@ def _update_fuzz_target_and_build_script(self, cur_round: int, response: str, logger.debug('ROUND %02d No build script in conclusion: %s', cur_round, response) - def _update_build_result(self, buid_result: BuildResult, - compile_process: sp.CompletedProcess, - status: bool) -> None: + def _update_build_result(self, build_result: BuildResult, + compile_process: sp.CompletedProcess, status: bool, + referenced: bool) -> None: """Updates the build result with the latest info.""" - buid_result.compiles = status - buid_result.compile_error = compile_process.stderr - buid_result.compile_log = self._format_bash_execution_result( + build_result.compiles = status + build_result.compile_error = compile_process.stderr + build_result.compile_log = self._format_bash_execution_result( compile_process) + build_result.is_function_referenced = referenced def _validate_fuzz_target_and_build_script(self, cur_round: int, build_result: BuildResult) -> None: """Validates the new fuzz target and build script.""" + # Steps: + # 1. Recompile without modifying the build script, in case LLM is wrong. + # 2. Recompile with the modified build script, if any. + build_script_source = build_result.build_script_source + + logger.info('First compile fuzz target without modifying build script.') + build_result.build_script_source = '' + self._validate_fuzz_target_and_build_script_via_recompile( + cur_round, build_result) + + if not build_result.success and build_script_source: + logger.info('Then compile fuzz target with modified build script.') + build_result.build_script_source = build_script_source + self._validate_fuzz_target_and_build_script_via_recompile( + cur_round, build_result, use_recompile=False) + + def _validate_fuzz_target_references_function( + self, compilation_tool: ProjectContainerTool, benchmark: Benchmark, + cur_round: int) -> bool: + """Validates if the LLM generated fuzz target assembly code references + function-under-test.""" + disassemble_result = compilation_tool.execute( + 'objdump --disassemble=LLVMFuzzerTestOneInput -d ' + f'/out/{benchmark.target_name}') + function_referenced = (disassemble_result.returncode == 0 and + benchmark.function_name in disassemble_result.stdout) + logger.debug('ROUND %02d Final fuzz target function referenced: %s', + cur_round, function_referenced) + if not function_referenced: + logger.debug('ROUND %02d Final fuzz target function not referenced', + cur_round) + return function_referenced + + def _validate_fuzz_target_and_build_script_via_recompile( + self, + cur_round: int, + build_result: BuildResult, + use_recompile: bool = True) -> None: + """Validates the new fuzz target and build script by recompiling them.""" benchmark = build_result.benchmark compilation_tool = ProjectContainerTool(benchmark=benchmark) @@ -82,8 +124,7 @@ def _validate_fuzz_target_and_build_script(self, cur_round: int, # Recompile. 
logger.info('===== ROUND %02d Recompile =====', cur_round) - compile_command = 'compile > /dev/null' - compile_process = compilation_tool.execute(compile_command) + compile_process = compilation_tool.compile(use_recompile=use_recompile) compile_succeed = compile_process.returncode == 0 logger.debug('ROUND %02d Fuzz target compile Succeessfully: %s', cur_round, compile_succeed) @@ -93,11 +134,16 @@ def _validate_fuzz_target_and_build_script(self, cur_round: int, binary_exists = ls_result.returncode == 0 logger.debug('ROUND %02d Final fuzz target binary exists: %s', cur_round, binary_exists) - compilation_tool.terminate() + # Validate if function-under-test is referenced by the fuzz target. + function_referenced = self._validate_fuzz_target_references_function( + compilation_tool, benchmark, cur_round) + + compilation_tool.terminate() self._update_build_result(build_result, compile_process=compile_process, - status=compile_succeed and binary_exists) + status=compile_succeed and binary_exists, + referenced=function_referenced) def _container_handle_conclusion( self, cur_round: int, response: str, @@ -109,18 +155,34 @@ def _container_handle_conclusion( self._update_fuzz_target_and_build_script(cur_round, response, build_result) self._validate_fuzz_target_and_build_script(cur_round, build_result) - if build_result.compiles: + if build_result.success: logger.info('***** Prototyper succeded in %02d rounds *****', cur_round) return None - logger.info('***** Failed to recompile in %02d rounds *****', cur_round) - prompt_text = ('Failed to build fuzz target. Here is the fuzz target, build' - ' script, compliation command, and other compilation runtime' - ' output.\n\n' - f'{build_result.fuzz_target_source}\n\n' - f'\n{build_result.build_script_source}\n' - '\n' - f'{build_result.compile_log}') + if not build_result.compiles: + compile_log = self.llm.truncate_prompt(build_result.compile_log) + logger.info('***** Failed to recompile in %02d rounds *****', cur_round) + prompt_text = ( + 'Failed to build fuzz target. Here is the fuzz target, build' + ' script, compliation command, and other compilation runtime' + ' output.\n\n' + f'{build_result.fuzz_target_source}\n\n' + f'\n{build_result.build_script_source}\n' + f'\n\n{compile_log}\n' + '\n') + elif not build_result.is_function_referenced: + logger.info( + '***** Fuzz target does not reference function-under-test in %02d ' + 'rounds *****', cur_round) + prompt_text = ( + 'The fuzz target builds successfully, but the target function ' + f'`{build_result.benchmark.function_signature}` was not used by ' + '`LLVMFuzzerTestOneInput` in fuzz target. 
YOU MUST CALL FUNCTION ' + f'`{build_result.benchmark.function_signature}` INSIDE FUNCTION ' + '`LLVMFuzzerTestOneInput`.') + else: + prompt_text = '' + prompt = DefaultTemplateBuilder(self.llm, initial=prompt_text).build([]) return prompt @@ -140,7 +202,7 @@ def execute(self, result_history: list[Result]) -> BuildResult: prompt = self._initial_prompt(result_history) benchmark = last_result.benchmark self.inspect_tool = ProjectContainerTool(benchmark, name='inspect') - self.inspect_tool.execute('{compile && rm -rf /out/*} > /dev/null') + self.inspect_tool.compile(extra_commands=' && rm -rf /out/* > /dev/null') cur_round = 1 prompt.append(self.inspect_tool.tutorial()) build_result = BuildResult(benchmark=benchmark, @@ -157,7 +219,6 @@ def execute(self, result_history: list[Result]) -> BuildResult: prompt = self._container_tool_reaction(cur_round, response, build_result) cur_round += 1 - self._sleep_random_duration() finally: # Cleanup: stop and remove the container logger.debug('Stopping and removing the inspect container %s', diff --git a/ci/k8s/pr-exp.yaml b/ci/k8s/pr-exp.yaml index f7411de935..46d6c15eaf 100644 --- a/ci/k8s/pr-exp.yaml +++ b/ci/k8s/pr-exp.yaml @@ -47,6 +47,10 @@ spec: value: '10' - name: VERTEX_AI_LOCATIONS value: 'asia-east1,asia-east2,asia-northeast1,asia-northeast3,asia-south1,asia-southeast1,australia-southeast1,europe-central2,europe-north1,europe-southwest1,europe-west1,europe-west2,europe-west3,europe-west4,europe-west6,europe-west8,europe-west9,northamerica-northeast1,southamerica-east1,us-central1,us-east1,us-east4,us-east5,us-south1,us-west1,us-west4' + - name: CLOUD_BUILD_LOCATION + value: 'us-west2' + - name: GCB_BUILDPOOL_NAME + value: projects/oss-fuzz/locations/us-west2/workerPools/buildpool-llm-agents - name: REDIRECT_OUTS value: '${GKE_REDIRECT_OUTS}' # imagePullSecrets: diff --git a/common/cloud_builder.py b/common/cloud_builder.py index 829152811f..32ad0b22d9 100644 --- a/common/cloud_builder.py +++ b/common/cloud_builder.py @@ -23,8 +23,9 @@ OF_REPO = 'https://github.com/google/oss-fuzz.git' OFG_ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) -US_CENTRAL_CLIENT_OPTIONS = google.api_core.client_options.ClientOptions( - api_endpoint='https://us-central1-cloudbuild.googleapis.com/') +REGION = os.getenv('CLOUD_BUILD_LOCATION', 'us-west2') +REGIONAL_CLIENT_OPTIONS = google.api_core.client_options.ClientOptions( + api_endpoint=f'https://{REGION}-cloudbuild.googleapis.com/') _CHAT_HISTORY_PREFIX_PATTERN = r'^Step\s+#(\d+)\s+-\s+"agent-step":\s+' @@ -60,7 +61,7 @@ def __init__(self, args: argparse.Namespace) -> None: 'v1', credentials=self.credentials, cache_discovery=False, - client_options=US_CENTRAL_CLIENT_OPTIONS).projects().builds() + client_options=REGIONAL_CLIENT_OPTIONS).projects().builds() self.storage_client = storage.Client(credentials=self.credentials) def _upload_to_gcs(self, local_file_path: str) -> str: @@ -143,6 +144,9 @@ def _request_cloud_build(self, ofg_repo_url: str, agent_dill_url: str, '/workspace:/workspace', '-v', '/var/run/docker.sock:/var/run/docker.sock', + '-e', + 'VERTEX_AI_LOCATIONS=' + + os.getenv("VERTEX_AI_LOCATIONS", ""), '--network=cloudbuild', # Built from this repo's `Dockerfile.cloudbuild-agent`. 
('us-central1-docker.pkg.dev/oss-fuzz/oss-fuzz-gen/' @@ -192,10 +196,10 @@ def _request_cloud_build(self, ofg_repo_url: str, agent_dill_url: str, body=cloud_build_config).execute() build_id = build_info.get('metadata', {}).get('build', {}).get('id', '') - logging.info('Cloud Build ID: %s', build_id) + logging.info('Created Cloud Build ID %s at %s', build_id, REGION) return build_id - def _wait_for_build(self, build_id: str) -> bool: + def _wait_for_build(self, build_id: str) -> str: """Wait for a GCB build.""" prev_status = status = None while status in [None, 'WORKING', 'QUEUED']: @@ -205,11 +209,11 @@ def _wait_for_build(self, build_id: str) -> bool: if status != prev_status: logging.info('Cloud Build %s Status: %s', build_id, status) prev_status = status - time.sleep(60) # Avoid rate limiting. except (googleapiclient.errors.HttpError, BrokenPipeError) as e: - logging.error('Cloud build %s failed: %s', build_id, e) - return False - return status == 'SUCCESS' + logging.warning('Failed to check cloud build status %s: %s', build_id, + e) + time.sleep(60) # Avoid rate limiting. + return status or '' def _cancel_build(self, build_id: str) -> None: """Cancel a GCB build""" @@ -240,7 +244,7 @@ def _get_build_log(self, build_id: str) -> str: return log_content except NotFound as e: logging.error('Cloud build log %s not found: %s', log_file_uri, e) - return '' + return f'Cloud build log {log_file_uri} not found: {e}.' def _download_from_gcs(self, destination_file_name: str) -> None: """Downloads the result file from GCS.""" @@ -278,22 +282,33 @@ def run(self, agent: BaseAgent, result_history: list[Result], new_result_filename) # Step 4: Download new result dill. + cloud_build_log = '' new_result_dill = os.path.join(dill_dir, new_result_filename) try: - if self._wait_for_build(build_id): + cloud_build_final_status = self._wait_for_build(build_id) + if cloud_build_final_status == 'SUCCESS': self._download_from_gcs(new_result_dill) - except (KeyboardInterrupt, SystemExit): + else: + logging.error('Cloud build %s failed with status: %s', build_id, + cloud_build_final_status) + cloud_build_log += (f'Cloud build {build_id} failed with status: ' + f'{cloud_build_final_status}.\n') + except (KeyboardInterrupt, SystemExit) as e: self._cancel_build(build_id) - build_log = self._get_build_log(build_id) + logging.error('Cloud build %s cancled: %s', build_id, e) + cloud_build_log += f'Cloud build {build_id} cancled: {e}.\n' + + cloud_build_log += self._get_build_log(build_id) # Step 4: Deserialize dilld file. 
result = utils.deserialize_from_dill(new_result_dill) if not result: + cloud_build_log += f'Failed to deserialize from dill {new_result_dill}.\n' last_result = result_history[-1] result = Result(benchmark=last_result.benchmark, trial=last_result.trial, work_dirs=last_result.work_dirs, author=agent) - result.chat_history = {agent.name: build_log} + result.chat_history = {agent.name: cloud_build_log} return result diff --git a/experiment/oss_fuzz_checkout.py b/experiment/oss_fuzz_checkout.py index 17f37041bc..0263672c91 100644 --- a/experiment/oss_fuzz_checkout.py +++ b/experiment/oss_fuzz_checkout.py @@ -375,23 +375,57 @@ def prepare_build(project_name, sanitizer, generated_project): shutil.copy(original_dockerfile, dockerfile_to_use) -def _image_exists(image_name: str) -> bool: - """Checks if the given |image_name| exits.""" +def _image_exists_locally(image_name: str, project_name: str) -> bool: + """Checks if the given |image_name| exits locally.""" try: all_images = sp.run(['docker', 'images', '--format', '{{.Repository}}'], stdout=sp.PIPE, text=True, check=True).stdout.splitlines() + if image_name in all_images: + logger.info('Will use local cached images of %s: %s', project_name, + image_name) + return True except sp.CalledProcessError: - logger.info('Unable to list all docker images') + logger.warning('Unable to use local cached image of %s: %s', project_name, + image_name) + return False + + +def _image_exists_online(image_name: str, project_name: str) -> bool: + """Checks if the given |image_name| exits in the cloud registry.""" + online_image_name = _get_project_cache_image_name(project_name, 'address') + try: + sp.run(['docker', 'pull', online_image_name], + stdout=sp.PIPE, + text=True, + check=True) + logger.info('Pulled online cached images of %s: %s', project_name, + online_image_name) + sp.run([ + 'docker', 'run', '--entrypoint', '/usr/local/bin/recompile', + online_image_name + ], + stdout=sp.PIPE, + text=True, + check=True) + + sp.run(['docker', 'tag', online_image_name, image_name], + stdout=sp.PIPE, + text=True, + check=True) + logger.info('Will use online cached images: %s', project_name) + return True + except sp.CalledProcessError: + logger.warning('Unable to use online cached images: %s', project_name) return False - return image_name in all_images def prepare_project_image(project: str) -> str: """Prepares original image of the |project|'s fuzz target build container.""" image_name = f'gcr.io/oss-fuzz/{project}' - if _image_exists(image_name): + if (_image_exists_locally(image_name, project_name=project) or + _image_exists_online(image_name, project_name=project)): logger.info('Using existing project image for %s', project) return image_name logger.info('Unable to find existing project image for %s', project) diff --git a/llm_toolkit/models.py b/llm_toolkit/models.py index 14937f4bd8..0122d543b9 100644 --- a/llm_toolkit/models.py +++ b/llm_toolkit/models.py @@ -56,6 +56,8 @@ class LLM: # TODO(mihaimaruseac): Should this be MAX_TOKENS or a different global? context_window: int = 2000 # Default token size. 
+  MAX_INPUT_TOKEN: int = sys.maxsize
+
   _max_attempts = 5  # Maximum number of attempts to get prediction response

   def __init__(
@@ -201,6 +203,13 @@ def _save_output(self, index: int, content: str, response_dir: str) -> None:
     with open(raw_output_path, 'w+') as output_file:
       output_file.write(content)

+  def truncate_prompt(self,
+                      raw_prompt_text: Any,
+                      extra_text: Any = None) -> Any:
+    """Truncates the prompt text to fit in MAX_INPUT_TOKEN."""
+    del extra_text
+    return raw_prompt_text
+
   @abstractmethod
   def get_chat_client(self, model: Any) -> Any:
     """Returns a new chat session."""
@@ -408,7 +417,13 @@ def prompt_type(self) -> type[prompts.Prompt]:

   def estimate_token_num(self, text) -> int:
     """Estimates the number of tokens in |text|."""
-    # Roughly 1.5 tokens per word:
+    # A rough estimation for very large prompts: Gemini suggests 4 chars per
+    # token, using 3 here to be safer.
+    text = text or ''
+    if len(text) // 3 > self.MAX_INPUT_TOKEN:
+      return len(text) // 3
+
+    # Otherwise, roughly 1.5 tokens per word:
     return int(len(re.split('[^a-zA-Z0-9]+', text)) * 1.5 + 0.5)

 # ============================== Generation ============================== #
@@ -607,24 +622,62 @@ class GeminiV1D5(GeminiModel):

 class GeminiV1D5Chat(GeminiV1D5):
   """Gemini 1.5 for chat session."""
   name = 'vertex_ai_gemini-1-5-chat'
+  _vertex_ai_model = 'gemini-1.5-pro-002'
+
+  # Avoids sending large prompts.
+  MAX_INPUT_TOKEN: int = 128000  # max 2000000

   def get_chat_client(self, model: GenerativeModel) -> Any:
     return model.start_chat(response_validation=False)

-  @retryable(exceptions=[
-      GoogleAPICallError,
-      InvalidArgument,
-  ],
-             other_exceptions={ResourceExhausted: 100})
+  @retryable(
+      exceptions=[
+          GoogleAPICallError,
+          InvalidArgument,
+          ValueError,  # TODO(dongge): Handle RECITATION specifically.
+          IndexError,  # A known error from vertexai.
+      ],
+      other_exceptions={ResourceExhausted: 100})
   def _do_generate(self, client: ChatSession, prompt: str,
                    config: dict[str, Any]) -> Any:
     """Generates chat response."""
     logger.info('%s generating response with config: %s', self.name, config)
-    return client.send_message(
-        prompt,
-        stream=False,
-        generation_config=config,
-        safety_settings=self.safety_config).text  # type: ignore
+    try:
+      return client.send_message(
+          prompt,
+          stream=False,
+          generation_config=config,
+          safety_settings=self.safety_config).text  # type: ignore
+    except Exception as e:
+      logger.error('%s failed to generate response: %s; Config: %s', self.name,
+                   e, config)
+      return ''
+
+  def truncate_prompt(self,
+                      raw_prompt_text: Any,
+                      extra_text: Any = None) -> Any:
+    """Truncates the prompt text to fit in MAX_INPUT_TOKEN."""
+    original_token_count = self.estimate_token_num(raw_prompt_text)
+
+    token_count = original_token_count
+    if token_count > self.MAX_INPUT_TOKEN:
+      raw_prompt_text = raw_prompt_text[-3 * self.MAX_INPUT_TOKEN:]
+
+    extra_text_token_count = self.estimate_token_num(extra_text)
+    # Reserve 10000 tokens for raw prompt wrappers.
+ max_raw_prompt_token_size = (self.MAX_INPUT_TOKEN - extra_text_token_count - + 10000) + + while token_count > max_raw_prompt_token_size: + estimate_truncate_size = int( + (1 - max_raw_prompt_token_size / token_count) * len(raw_prompt_text)) + raw_prompt_text = raw_prompt_text[estimate_truncate_size + 1:] + + token_count = self.estimate_token_num(raw_prompt_text) + logger.warning('Truncated raw prompt from %d to %d tokens:', + original_token_count, token_count) + + return raw_prompt_text def chat_llm(self, client: ChatSession, prompt: prompts.Prompt) -> str: if self.ai_binary: diff --git a/logger.py b/logger.py index 0ae3a18be8..864ce15f5f 100644 --- a/logger.py +++ b/logger.py @@ -49,11 +49,14 @@ def write_result(self, result_status_dir: str, result: Result) -> None: def write_chat_history(self, result: Result) -> None: """Writes fuzz target.""" # TODO(dongge): Find a proper way to write this. - fuzz_target_path = os.path.join(result.work_dirs.base, 'prompt.txt') + trial_result_dir = os.path.join(result.work_dirs.status, + f'{result.trial:02d}') + os.makedirs(trial_result_dir, exist_ok=True) + chat_history_path = os.path.join(trial_result_dir, 'log.txt') chat_history = '\n'.join( f'{agent_name}\n{chat_history}\n' for agent_name, chat_history in result.chat_history.items()) - self.write_to_file(fuzz_target_path, chat_history) + self.write_to_file(chat_history_path, chat_history) def debug(msg: object, diff --git a/results.py b/results.py index c0ec96e56e..1abe4339d5 100644 --- a/results.py +++ b/results.py @@ -56,6 +56,7 @@ class BuildResult(Result): compiles: bool # Build success/failure. compile_error: str # Build error message. compile_log: str # Build full output. + is_function_referenced: bool # Fuzz target references function-under-test. def __init__(self, benchmark: Benchmark, @@ -64,6 +65,7 @@ def __init__(self, compiles: bool = False, compile_error: str = '', compile_log: str = '', + is_function_referenced: bool = False, fuzz_target_source: str = '', build_script_source: str = '', author: Any = None, @@ -73,17 +75,19 @@ def __init__(self, self.compiles = compiles self.compile_error = compile_error self.compile_log = compile_log + self.is_function_referenced = is_function_referenced def to_dict(self) -> dict: return super().to_dict() | { - 'compiles': self.compiles, + 'compiles': self.success, 'compile_error': self.compile_error, 'compile_log': self.compile_log, + 'is_function_referenced': self.is_function_referenced, } @property def success(self): - return self.compiles + return self.compiles and self.is_function_referenced class RunResult(BuildResult): @@ -110,6 +114,7 @@ def __init__( compiles: bool = False, compile_error: str = '', compile_log: str = '', + is_function_referenced: bool = False, crashes: bool = False, # Runtime crash. run_error: str = '', # Runtime crash error message. run_log: str = '', # Full fuzzing output. 
@@ -128,8 +133,8 @@ def __init__( author: Any = None, chat_history: Optional[dict] = None) -> None: super().__init__(benchmark, trial, work_dirs, compiles, compile_error, - compile_log, fuzz_target_source, build_script_source, - author, chat_history) + compile_log, is_function_referenced, fuzz_target_source, + build_script_source, author, chat_history) self.crashes = crashes self.run_error = run_error self.run_log = run_log diff --git a/stage/execution_stage.py b/stage/execution_stage.py index 0030222c85..cc394eafce 100644 --- a/stage/execution_stage.py +++ b/stage/execution_stage.py @@ -115,10 +115,11 @@ def execute(self, result_history: list[Result]) -> Result: fuzz_target_source=last_result.fuzz_target_source, build_script_source=last_result.build_script_source, chat_history=last_result.chat_history, - author=self, + author=repr(self), compiles=last_result.compiles, compile_error=last_result.compile_error, compile_log=last_result.compile_log, + is_function_referenced=last_result.is_function_referenced, crashes=run_result.crashes, run_error=run_result.crash_info, run_log=run_result.log_path, @@ -134,15 +135,17 @@ def execute(self, result_history: list[Result]) -> Result: total_pcs=run_result.total_pcs) except Exception as e: self.logger.error('Exception %s occurred on %s', e, last_result) - runresult = RunResult(benchmark=benchmark, - trial=last_result.trial, - work_dirs=last_result.work_dirs, - fuzz_target_source=last_result.fuzz_target_source, - build_script_source=last_result.build_script_source, - chat_history=last_result.chat_history, - author=self, - compiles=last_result.compiles, - compile_error=last_result.compile_error, - compile_log=last_result.compile_log) + runresult = RunResult( + benchmark=benchmark, + trial=last_result.trial, + work_dirs=last_result.work_dirs, + fuzz_target_source=last_result.fuzz_target_source, + build_script_source=last_result.build_script_source, + chat_history=last_result.chat_history, + author=repr(self), + compiles=last_result.compiles, + compile_error=last_result.compile_error, + compile_log=last_result.compile_log, + is_function_referenced=last_result.is_function_referenced) return runresult diff --git a/tool/container_tool.py b/tool/container_tool.py index 5781aad2ea..e4bfefd0ff 100644 --- a/tool/container_tool.py +++ b/tool/container_tool.py @@ -30,27 +30,48 @@ def _prepare_project_image(self) -> str: return image_name raise Exception(f'Failed to build image for {self.benchmark.project}') - def _execute_command(self, - command: list[str], - in_container: bool = False) -> sp.CompletedProcess: + def _execute_command_in_container(self, + command: list[str]) -> sp.CompletedProcess: """Executes the |command| in subprocess and log output.""" - result = sp.run(command, - stdout=sp.PIPE, - stderr=sp.PIPE, - check=False, - text=True) + try: + result = sp.run(command, + stdout=sp.PIPE, + stderr=sp.PIPE, + check=False, + text=True, + encoding='utf-8', + errors='ignore') - if in_container: logger.debug( 'Executing command (%s) in container %s: Return code %d. 
STDOUT: %s, ' 'STDERR: %s', command, self.container_id, result.returncode, result.stdout, result.stderr) - else: + return result + except Exception as e: + logger.error( + 'Executing command (%s) in container failed with Exception: %s', + command, e) + return sp.CompletedProcess(command, returncode=1, stdout='', stderr='') + + def _execute_command(self, command: list[str]) -> sp.CompletedProcess: + """Executes the |command| in subprocess and log output.""" + try: + result = sp.run(command, + stdout=sp.PIPE, + stderr=sp.PIPE, + check=False, + text=True, + encoding='utf-8', + errors='ignore') + logger.debug( - 'Executing command (%s): Return code %d. STDOUT: %s, ' - 'STDERR: %s', command, result.returncode, result.stdout, - result.stderr) - return result + 'Executing command (%s): Return code %d. STDOUT: %s, STDERR: %s', + command, result.returncode, result.stdout, result.stderr) + return result + except Exception as e: + logger.error('Executing command (%s) failed with Exception: %s', command, + e) + return sp.CompletedProcess(command, returncode=1, stdout='', stderr='') def _start_docker_container(self) -> str: """Runs the project's OSS-Fuzz image as a background container and returns @@ -60,6 +81,8 @@ def _start_docker_container(self) -> str: f'FUZZING_LANGUAGE={self.benchmark.language}', self.image_name ] result = self._execute_command(run_container_command) + if result.returncode: + logger.error('Failed to start container of image: %s', self.image_name) container_id = result.stdout.strip() return container_id @@ -69,10 +92,25 @@ def execute(self, command: str) -> sp.CompletedProcess: execute_command_in_container = [ 'docker', 'exec', self.container_id, '/bin/bash', '-c', command ] - process = self._execute_command(execute_command_in_container, True) + process = self._execute_command_in_container(execute_command_in_container) process.args = command return process + def compile(self, + use_recompile: bool = True, + extra_commands: str = '') -> sp.CompletedProcess: + """Compiles or recompiles the fuzz target.""" + if use_recompile: + logger.info('Will attempt to use recompile') + self.execute( + '[ -f /usr/local/bin/recompile ] && echo "Will use recompile" ' + '&& mv /usr/local/bin/recompile /usr/local/bin/compile') + else: + logger.info('Will use the original compile') + + command = 'compile > /dev/null' + extra_commands + return self.execute(command) + def terminate(self) -> bool: """Terminates the container.""" terminate_container_command = ['docker', 'stop', self.container_id] diff --git a/utils.py b/utils.py index 4717377da1..af7df201cd 100644 --- a/utils.py +++ b/utils.py @@ -68,16 +68,18 @@ def wrapper(*args, **kwargs): # Expected exceptions and their subclass. num_attempts = next( (attempts for exc_type, attempts in exception_config.items() - if isinstance(e, exc_type)), 1) + if type(e) is exc_type), 1) # pylint: disable=unidiomatic-typecheck logging.error( - 'Exception %s on function %s(args=%s, kwargs=%s), attempt %d/%d', - e, func.__name__, args, kwargs, attempt, num_attempts) + 'Exception %s (%s) on function %s(args=%s, kwargs=%s), attempt ' + '%d/%d', type(e), e, func.__name__, args, kwargs, attempt, + num_attempts) if attempt >= num_attempts: logging.error( 'Max attempts %d/%d reached for %s(args=%s, kwargs=%s) due to ' - '%s', attempt, num_attempts, func.__name__, args, kwargs, e) + '%s (%s)', attempt, num_attempts, func.__name__, args, kwargs, + type(e), e) raise attempt += 1
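
For reviewers, here is a minimal, self-contained sketch of the exact-matching behavior that functionality item 3 and the `utils.py` hunk above rely on. It is not the repository's `utils.retryable`: the decorator below, its `default_attempts` value, and the toy `QuotaError` classes are illustrative assumptions only.

```python
import logging
import time


def retryable(exceptions=None, other_exceptions=None):
  """Illustrative retry decorator using exact exception matching.

  |exceptions| share a default retry budget; |other_exceptions| maps an
  exception class to its own budget, e.g. {ResourceExhausted: 100}.
  """
  expected = tuple(exceptions or [])
  exception_config = other_exceptions or {}
  default_attempts = 5  # Assumed default, not the project's actual value.

  def decorator(func):

    def wrapper(*args, **kwargs):
      attempt = 1
      while True:
        try:
          return func(*args, **kwargs)
        except expected + tuple(exception_config) as e:
          # Exact type match: a subclass of a configured exception falls back
          # to the default budget instead of its parent's larger budget.
          num_attempts = next(
              (attempts for exc_type, attempts in exception_config.items()
               if type(e) is exc_type), default_attempts)
          logging.error('Attempt %d/%d failed with %s: %s', attempt,
                        num_attempts, type(e).__name__, e)
          if attempt >= num_attempts:
            raise
          attempt += 1
          time.sleep(1)  # Back off briefly before retrying.

    return wrapper

  return decorator


class QuotaError(Exception):
  """Stand-in for an exception with a large retry budget."""


class StrictQuotaError(QuotaError):
  """Subclass that should NOT inherit QuotaError's 100-attempt budget."""


@retryable(exceptions=[TimeoutError], other_exceptions={QuotaError: 100})
def flaky_call():
  raise StrictQuotaError('only gets the default attempts, then re-raises')
```

With the previous `isinstance` check, `StrictQuotaError` would have matched the `QuotaError` entry and inherited its 100-attempt budget; with `type(e) is exc_type` it falls back to the default, so each per-exception retry config is applied exactly as written.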