Prototyper improvements (#661)

## Functionality
1. Validate if the function-under-test is referenced by the fuzz target (fixes #658).
2. Use cached image from `chronos`.
3. Use exact exception matching in decorator `@retryable` to ensure the correct config (e.g., #retries) is applied.
4. Compile the new fuzz target with the old build script once even if LLM proposes a new build script, in case the new one is wrong.
5. Show chat history of each trial for simpler debugging.
6. Show exception (e.g., cloud build expiry) in report.
google · Oct 21, 2024 · 1ce0df9 · 1ce0df9
1 parent 6bc5565
commit 1ce0df9
Show file tree

Hide file tree

Showing 11 changed files with 313 additions and 91 deletions.
diff --git a/agent/base_agent.py b/agent/base_agent.py
@@ -59,10 +59,13 @@ def _filter_code(self, raw_code_block: str) -> str:
     return filtered_code_block
 
   def _format_bash_execution_result(self, process: sp.CompletedProcess) -> str:
+    stdout = self.llm.truncate_prompt(process.stdout)
+    # TODO(dongge) Share input limit evenly if both stdout and stderr overlong.
+    stderr = self.llm.truncate_prompt(process.stderr, stdout)
     return (f'<bash>\n{process.args}\n</bash>\n'
             f'<return code>\n{process.returncode}\n</return code>\n'
-            f'<stdout>\n{process.stdout}\n</stdout>\n'
-            f'<stderr>\n{process.stderr}\n</stderr>\n')
+            f'<stdout>\n{stdout}\n</stdout>\n'
+            f'<stderr>\n{stderr}\n</stderr>\n')
 
   def _container_handle_bash_command(self, cur_round: int, response: str,
                                      tool: BaseTool) -> Prompt:
@@ -113,6 +116,7 @@ def cloud_main(cls) -> None:
     args = cls._parse_args()
 
     agent = utils.deserialize_from_dill(args.agent)
+    agent.llm.cloud_setup()
     result_history = utils.deserialize_from_dill(args.result_history)
     result = agent.execute(result_history)
     utils.serialize_to_dill(result, args.result_new)

diff --git a/agent/prototyper.py b/agent/prototyper.py
@@ -6,6 +6,7 @@
 
 import logger
 from agent.base_agent import BaseAgent
+from experiment.benchmark import Benchmark
 from llm_toolkit.prompt_builder import DefaultTemplateBuilder
 from llm_toolkit.prompts import Prompt
 from results import BuildResult, Result
@@ -24,7 +25,6 @@ def _initial_prompt(self, results: list[Result]) -> Prompt:
     default_prompt_builder = DefaultTemplateBuilder(model=self.llm,
                                                     benchmark=benchmark)
     prompt = default_prompt_builder.build([])
-    # TODO(dongge): Find a way to save prompt and log for agents
     return prompt
 
   def _update_fuzz_target_and_build_script(self, cur_round: int, response: str,
@@ -43,26 +43,68 @@ def _update_fuzz_target_and_build_script(self, cur_round: int, response: str,
 
     build_script_source = self._filter_code(
         self._parse_tag(response, 'build script'))
-    build_result.build_script_source = build_script_source
+    # Sometimes LLM adds chronos, which makes no sense for new build scripts.
+    build_result.build_script_source = build_script_source.replace(
+        'source /src/chronos.sh', '')
     if build_script_source:
       logger.debug('ROUND %02d Parsed build script from LLM: %s', cur_round,
                    build_script_source)
     else:
       logger.debug('ROUND %02d No build script in conclusion: %s', cur_round,
                    response)
 
-  def _update_build_result(self, buid_result: BuildResult,
-                           compile_process: sp.CompletedProcess,
-                           status: bool) -> None:
+  def _update_build_result(self, build_result: BuildResult,
+                           compile_process: sp.CompletedProcess, status: bool,
+                           referenced: bool) -> None:
     """Updates the build result with the latest info."""
-    buid_result.compiles = status
-    buid_result.compile_error = compile_process.stderr
-    buid_result.compile_log = self._format_bash_execution_result(
+    build_result.compiles = status
+    build_result.compile_error = compile_process.stderr
+    build_result.compile_log = self._format_bash_execution_result(
         compile_process)
+    build_result.is_function_referenced = referenced
 
   def _validate_fuzz_target_and_build_script(self, cur_round: int,
                                              build_result: BuildResult) -> None:
     """Validates the new fuzz target and build script."""
+    # Steps:
+    #   1. Recompile without modifying the build script, in case LLM is wrong.
+    #   2. Recompile with the modified build script, if any.
+    build_script_source = build_result.build_script_source
+
+    logger.info('First compile fuzz target without modifying build script.')
+    build_result.build_script_source = ''
+    self._validate_fuzz_target_and_build_script_via_recompile(
+        cur_round, build_result)
+
+    if not build_result.success and build_script_source:
+      logger.info('Then compile fuzz target with modified build script.')
+      build_result.build_script_source = build_script_source
+      self._validate_fuzz_target_and_build_script_via_recompile(
+          cur_round, build_result, use_recompile=False)
+
+  def _validate_fuzz_target_references_function(
+      self, compilation_tool: ProjectContainerTool, benchmark: Benchmark,
+      cur_round: int) -> bool:
+    """Validates if the LLM generated fuzz target assembly code references
+    function-under-test."""
+    disassemble_result = compilation_tool.execute(
+        'objdump --disassemble=LLVMFuzzerTestOneInput -d '
+        f'/out/{benchmark.target_name}')
+    function_referenced = (disassemble_result.returncode == 0 and
+                           benchmark.function_name in disassemble_result.stdout)
+    logger.debug('ROUND %02d Final fuzz target function referenced: %s',
+                 cur_round, function_referenced)
+    if not function_referenced:
+      logger.debug('ROUND %02d Final fuzz target function not referenced',
+                   cur_round)
+    return function_referenced
+
+  def _validate_fuzz_target_and_build_script_via_recompile(
+      self,
+      cur_round: int,
+      build_result: BuildResult,
+      use_recompile: bool = True) -> None:
+    """Validates the new fuzz target and build script by recompiling them."""
     benchmark = build_result.benchmark
     compilation_tool = ProjectContainerTool(benchmark=benchmark)
 
@@ -82,8 +124,7 @@ def _validate_fuzz_target_and_build_script(self, cur_round: int,
 
     # Recompile.
     logger.info('===== ROUND %02d Recompile =====', cur_round)
-    compile_command = 'compile > /dev/null'
-    compile_process = compilation_tool.execute(compile_command)
+    compile_process = compilation_tool.compile(use_recompile=use_recompile)
     compile_succeed = compile_process.returncode == 0
     logger.debug('ROUND %02d Fuzz target compile Succeessfully: %s', cur_round,
                  compile_succeed)
@@ -93,11 +134,16 @@ def _validate_fuzz_target_and_build_script(self, cur_round: int,
     binary_exists = ls_result.returncode == 0
     logger.debug('ROUND %02d Final fuzz target binary exists: %s', cur_round,
                  binary_exists)
-    compilation_tool.terminate()
 
+    # Validate if function-under-test is referenced by the fuzz target.
+    function_referenced = self._validate_fuzz_target_references_function(
+        compilation_tool, benchmark, cur_round)
+
+    compilation_tool.terminate()
     self._update_build_result(build_result,
                               compile_process=compile_process,
-                              status=compile_succeed and binary_exists)
+                              status=compile_succeed and binary_exists,
+                              referenced=function_referenced)
 
   def _container_handle_conclusion(
       self, cur_round: int, response: str,
@@ -109,18 +155,34 @@ def _container_handle_conclusion(
     self._update_fuzz_target_and_build_script(cur_round, response, build_result)
 
     self._validate_fuzz_target_and_build_script(cur_round, build_result)
-    if build_result.compiles:
+    if build_result.success:
       logger.info('***** Prototyper succeded in %02d rounds *****', cur_round)
       return None
 
-    logger.info('***** Failed to recompile in %02d rounds *****', cur_round)
-    prompt_text = ('Failed to build fuzz target. Here is the fuzz target, build'
-                   ' script, compliation command, and other compilation runtime'
-                   ' output.\n<fuzz target>\n'
-                   f'{build_result.fuzz_target_source}\n</fuzz target>\n'
-                   f'<build script>\n{build_result.build_script_source}\n'
-                   '</build script>\n'
-                   f'{build_result.compile_log}')
+    if not build_result.compiles:
+      compile_log = self.llm.truncate_prompt(build_result.compile_log)
+      logger.info('***** Failed to recompile in %02d rounds *****', cur_round)
+      prompt_text = (
+          'Failed to build fuzz target. Here is the fuzz target, build'
+          ' script, compliation command, and other compilation runtime'
+          ' output.\n<fuzz target>\n'
+          f'{build_result.fuzz_target_source}\n</fuzz target>\n'
+          f'<build script>\n{build_result.build_script_source}\n'
+          f'</build script>\n<compilation log>\n{compile_log}\n'
+          '</compilation log>\n')
+    elif not build_result.is_function_referenced:
+      logger.info(
+          '***** Fuzz target does not reference function-under-test in %02d '
+          'rounds *****', cur_round)
+      prompt_text = (
+          'The fuzz target builds successfully, but the target function '
+          f'`{build_result.benchmark.function_signature}` was not used by '
+          '`LLVMFuzzerTestOneInput` in fuzz target. YOU MUST CALL FUNCTION '
+          f'`{build_result.benchmark.function_signature}` INSIDE FUNCTION '
+          '`LLVMFuzzerTestOneInput`.')
+    else:
+      prompt_text = ''
+
     prompt = DefaultTemplateBuilder(self.llm, initial=prompt_text).build([])
     return prompt
 
@@ -140,7 +202,7 @@ def execute(self, result_history: list[Result]) -> BuildResult:
     prompt = self._initial_prompt(result_history)
     benchmark = last_result.benchmark
     self.inspect_tool = ProjectContainerTool(benchmark, name='inspect')
-    self.inspect_tool.execute('{compile && rm -rf /out/*} > /dev/null')
+    self.inspect_tool.compile(extra_commands=' && rm -rf /out/* > /dev/null')
     cur_round = 1
     prompt.append(self.inspect_tool.tutorial())
     build_result = BuildResult(benchmark=benchmark,
@@ -157,7 +219,6 @@ def execute(self, result_history: list[Result]) -> BuildResult:
         prompt = self._container_tool_reaction(cur_round, response,
                                                build_result)
         cur_round += 1
-        self._sleep_random_duration()
     finally:
       # Cleanup: stop and remove the container
       logger.debug('Stopping and removing the inspect container %s',

diff --git a/ci/k8s/pr-exp.yaml b/ci/k8s/pr-exp.yaml
@@ -47,6 +47,10 @@ spec:
           value: '10'
         - name: VERTEX_AI_LOCATIONS
           value: 'asia-east1,asia-east2,asia-northeast1,asia-northeast3,asia-south1,asia-southeast1,australia-southeast1,europe-central2,europe-north1,europe-southwest1,europe-west1,europe-west2,europe-west3,europe-west4,europe-west6,europe-west8,europe-west9,northamerica-northeast1,southamerica-east1,us-central1,us-east1,us-east4,us-east5,us-south1,us-west1,us-west4'
+        - name: CLOUD_BUILD_LOCATION
+          value: 'us-west2'
+        - name: GCB_BUILDPOOL_NAME
+          value: projects/oss-fuzz/locations/us-west2/workerPools/buildpool-llm-agents
         - name: REDIRECT_OUTS
           value: '${GKE_REDIRECT_OUTS}'
       # imagePullSecrets:

diff --git a/common/cloud_builder.py b/common/cloud_builder.py
@@ -23,8 +23,9 @@
 
 OF_REPO = 'https://github.com/google/oss-fuzz.git'
 OFG_ROOT_DIR = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
-US_CENTRAL_CLIENT_OPTIONS = google.api_core.client_options.ClientOptions(
-    api_endpoint='https://us-central1-cloudbuild.googleapis.com/')
+REGION = os.getenv('CLOUD_BUILD_LOCATION', 'us-west2')
+REGIONAL_CLIENT_OPTIONS = google.api_core.client_options.ClientOptions(
+    api_endpoint=f'https://{REGION}-cloudbuild.googleapis.com/')
 _CHAT_HISTORY_PREFIX_PATTERN = r'^Step\s+#(\d+)\s+-\s+"agent-step":\s+'
 
 
@@ -60,7 +61,7 @@ def __init__(self, args: argparse.Namespace) -> None:
         'v1',
         credentials=self.credentials,
         cache_discovery=False,
-        client_options=US_CENTRAL_CLIENT_OPTIONS).projects().builds()
+        client_options=REGIONAL_CLIENT_OPTIONS).projects().builds()
     self.storage_client = storage.Client(credentials=self.credentials)
 
   def _upload_to_gcs(self, local_file_path: str) -> str:
@@ -143,6 +144,9 @@ def _request_cloud_build(self, ofg_repo_url: str, agent_dill_url: str,
                     '/workspace:/workspace',
                     '-v',
                     '/var/run/docker.sock:/var/run/docker.sock',
+                    '-e',
+                    'VERTEX_AI_LOCATIONS=' +
+                    os.getenv("VERTEX_AI_LOCATIONS", ""),
                     '--network=cloudbuild',
                     # Built from this repo's `Dockerfile.cloudbuild-agent`.
                     ('us-central1-docker.pkg.dev/oss-fuzz/oss-fuzz-gen/'
@@ -192,10 +196,10 @@ def _request_cloud_build(self, ofg_repo_url: str, agent_dill_url: str,
                                     body=cloud_build_config).execute()
     build_id = build_info.get('metadata', {}).get('build', {}).get('id', '')
 
-    logging.info('Cloud Build ID: %s', build_id)
+    logging.info('Created Cloud Build ID %s at %s', build_id, REGION)
     return build_id
 
-  def _wait_for_build(self, build_id: str) -> bool:
+  def _wait_for_build(self, build_id: str) -> str:
     """Wait for a GCB build."""
     prev_status = status = None
     while status in [None, 'WORKING', 'QUEUED']:
@@ -205,11 +209,11 @@ def _wait_for_build(self, build_id: str) -> bool:
         if status != prev_status:
           logging.info('Cloud Build %s Status: %s', build_id, status)
           prev_status = status
-        time.sleep(60)  # Avoid rate limiting.
       except (googleapiclient.errors.HttpError, BrokenPipeError) as e:
-        logging.error('Cloud build %s failed: %s', build_id, e)
-        return False
-    return status == 'SUCCESS'
+        logging.warning('Failed to check cloud build status %s: %s', build_id,
+                        e)
+      time.sleep(60)  # Avoid rate limiting.
+    return status or ''
 
   def _cancel_build(self, build_id: str) -> None:
     """Cancel a GCB build"""
@@ -240,7 +244,7 @@ def _get_build_log(self, build_id: str) -> str:
       return log_content
     except NotFound as e:
       logging.error('Cloud build log %s not found: %s', log_file_uri, e)
-      return ''
+      return f'Cloud build log {log_file_uri} not found: {e}.'
 
   def _download_from_gcs(self, destination_file_name: str) -> None:
     """Downloads the result file from GCS."""
@@ -278,22 +282,33 @@ def run(self, agent: BaseAgent, result_history: list[Result],
                                          new_result_filename)
 
     # Step 4: Download new result dill.
+    cloud_build_log = ''
     new_result_dill = os.path.join(dill_dir, new_result_filename)
     try:
-      if self._wait_for_build(build_id):
+      cloud_build_final_status = self._wait_for_build(build_id)
+      if cloud_build_final_status == 'SUCCESS':
         self._download_from_gcs(new_result_dill)
-    except (KeyboardInterrupt, SystemExit):
+      else:
+        logging.error('Cloud build %s failed with status: %s', build_id,
+                      cloud_build_final_status)
+        cloud_build_log += (f'Cloud build {build_id} failed with status: '
+                            f'{cloud_build_final_status}.\n')
+    except (KeyboardInterrupt, SystemExit) as e:
       self._cancel_build(build_id)
-    build_log = self._get_build_log(build_id)
+      logging.error('Cloud build %s cancled: %s', build_id, e)
+      cloud_build_log += f'Cloud build {build_id} cancled: {e}.\n'
+
+    cloud_build_log += self._get_build_log(build_id)
 
     # Step 4: Deserialize dilld file.
     result = utils.deserialize_from_dill(new_result_dill)
     if not result:
+      cloud_build_log += f'Failed to deserialize from dill {new_result_dill}.\n'
       last_result = result_history[-1]
       result = Result(benchmark=last_result.benchmark,
                       trial=last_result.trial,
                       work_dirs=last_result.work_dirs,
                       author=agent)
-    result.chat_history = {agent.name: build_log}
+    result.chat_history = {agent.name: cloud_build_log}
 
     return result
diff --git a/experiment/oss_fuzz_checkout.py b/experiment/oss_fuzz_checkout.py
@@ -375,23 +375,57 @@ def prepare_build(project_name, sanitizer, generated_project):
     shutil.copy(original_dockerfile, dockerfile_to_use)
 
 
-def _image_exists(image_name: str) -> bool:
-  """Checks if the given |image_name| exits."""
+def _image_exists_locally(image_name: str, project_name: str) -> bool:
+  """Checks if the given |image_name| exits locally."""
   try:
     all_images = sp.run(['docker', 'images', '--format', '{{.Repository}}'],
                         stdout=sp.PIPE,
                         text=True,
                         check=True).stdout.splitlines()
+    if image_name in all_images:
+      logger.info('Will use local cached images of %s: %s', project_name,
+                  image_name)
+      return True
   except sp.CalledProcessError:
-    logger.info('Unable to list all docker images')
+    logger.warning('Unable to use local cached image of %s: %s', project_name,
+                   image_name)
+  return False
+
+
+def _image_exists_online(image_name: str, project_name: str) -> bool:
+  """Checks if the given |image_name| exits in the cloud registry."""
+  online_image_name = _get_project_cache_image_name(project_name, 'address')
+  try:
+    sp.run(['docker', 'pull', online_image_name],
+           stdout=sp.PIPE,
+           text=True,
+           check=True)
+    logger.info('Pulled online cached images of %s: %s', project_name,
+                online_image_name)
+    sp.run([
+        'docker', 'run', '--entrypoint', '/usr/local/bin/recompile',
+        online_image_name
+    ],
+           stdout=sp.PIPE,
+           text=True,
+           check=True)
+
+    sp.run(['docker', 'tag', online_image_name, image_name],
+           stdout=sp.PIPE,
+           text=True,
+           check=True)
+    logger.info('Will use online cached images: %s', project_name)
+    return True
+  except sp.CalledProcessError:
+    logger.warning('Unable to use online cached images: %s', project_name)
     return False
-  return image_name in all_images
 
 
 def prepare_project_image(project: str) -> str:
   """Prepares original image of the |project|'s fuzz target build container."""
   image_name = f'gcr.io/oss-fuzz/{project}'
-  if _image_exists(image_name):
+  if (_image_exists_locally(image_name, project_name=project) or
+      _image_exists_online(image_name, project_name=project)):
     logger.info('Using existing project image for %s', project)
     return image_name
   logger.info('Unable to find existing project image for %s', project)