[JVM] Add retry logic for no coverage gain (#742)
This PR relocates the calculation of the coverage total and coverage diff
into the checking-and-retry loop. The fix can then use the freshly
calculated coverage diff to determine whether a harness that builds and
runs successfully actually increases the project coverage. If it does not,
the new retry-logic prompt asks the LLM to help fix the harness. Fixes for
errors in generated harnesses and fixes for harnesses with no coverage
increase are counted together. After this change, a higher rate of
generated harnesses build and run successfully, and more of them help
improve project coverage.

*Remark: this coverage-feedback approach currently works only for JVM
projects.*
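
A minimal sketch of the decision the retry loop now makes, with made-up names and plain booleans/floats standing in for the real BuildResult/RunResult plumbing (nothing below is the actual evaluator API):

```python
# Illustrative only: plain values stand in for the evaluator's result objects.
def choose_fix(build_ok: bool, has_run_result: bool, run_ok: bool,
               coverage_diff: float, language: str) -> tuple[bool, bool]:
  """Returns (needs_fix, jvm_coverage_fix)."""
  if not build_ok or not has_run_result:
    return True, False                 # build (or missing-run) errors
  if language == 'jvm':
    if run_ok and coverage_diff <= 0.0:
      return True, True                # ran cleanly but added no coverage
    return False, False                # JVM crashes are tolerated
  return (not run_ok), False           # other languages: fix runtime errors only


assert choose_fix(True, True, True, 0.0, 'jvm') == (True, True)     # no gain -> retry
assert choose_fix(True, True, False, 0.0, 'jvm') == (False, False)  # crash is fine
assert choose_fix(False, False, False, 0.0, 'jvm') == (True, False) # build error
```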

---------

Signed-off-by: Arthur Chan <[email protected]>
arthurscchan authored Dec 7, 2024
1 parent 1eb15f3 commit fe43b0c
Showing 6 changed files with 199 additions and 54 deletions.
6 changes: 4 additions & 2 deletions data_prep/introspector.py
@@ -343,8 +343,10 @@ def query_introspector_public_classes(project: str) -> list[str]:
return _get_data(resp, 'classes', [])


def query_introspector_source_code(project: str, filepath: str, begin_line: int,
end_line: int) -> str:
def query_introspector_source_code(project: str,
filepath: str,
begin_line: int = 0,
end_line: int = 10000) -> str:
"""Queries FuzzIntrospector API for source code of a
file |filepath| between |begin_line| and |end_line|."""

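With the new defaults, callers can omit the line bounds to fetch (up to) the first 10000 lines of a file. A hedged usage sketch, assuming the module is importable as `data_prep.introspector`; the project name and file path are illustrative:

```python
from data_prep import introspector

# Fetch the whole file (lines 0-10000 by default); values are illustrative.
full_source = introspector.query_introspector_source_code(
    'json-sanitizer', 'src/main/java/com/google/json/JsonSanitizer.java')

# An explicit range still works as before.
snippet = introspector.query_introspector_source_code(
    'json-sanitizer', 'src/main/java/com/google/json/JsonSanitizer.java',
    begin_line=100, end_line=160)
```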
106 changes: 61 additions & 45 deletions experiment/evaluator.py
@@ -283,18 +283,23 @@ def _fix_generated_fuzz_target(self, ai_binary: str,
run_result: Optional[RunResult],
dual_logger: _Logger, language: str):
"""Fixes the generated fuzz target."""
if build_result.succeeded and not language == 'jvm':
if run_result:
error_desc, errors = run_result.semantic_check.get_error_info()
jvm_coverage_fix = False
error_desc, errors = '', []
if build_result.succeeded:
if language == 'jvm':
jvm_coverage_fix = True
else:
dual_logger.log(f'Warning: Build succeed but no run_result in '
f'{generated_oss_fuzz_project}.')
error_desc, errors = '', []
if run_result:
error_desc, errors = run_result.semantic_check.get_error_info()
else:
dual_logger.log(f'Warning: Build succeed but no run_result in '
f'{generated_oss_fuzz_project}.')
else:
error_desc, errors = None, build_result.errors

code_fixer.llm_fix(ai_binary, target_path, self.benchmark, iteration,
error_desc, errors, self.builder_runner.fixer_model_name,
language)
language, jvm_coverage_fix)
shutil.copyfile(
target_path,
os.path.join(oss_fuzz_checkout.OSS_FUZZ_DIR, 'projects',
@@ -388,9 +393,57 @@ def check_target(self, ai_binary, target_path: str) -> Result:
build_result = BuildResult()
run_result = None

# 2. Calculate coverage percentage and coverage diff
coverage_summary = None
total_lines = 0
coverage_percent = 0.0
coverage_diff = 0.0
if run_result:
# Gets line coverage (diff) details.
coverage_summary = self._load_existing_coverage_summary()

if self.benchmark.language in ['python', 'jvm'] and run_result.coverage:
# The Jacoco.xml coverage report used to generate summary.json on
# OSS-Fuzz for JVM projects does not trace the source file location.
# Thus the conversion may miss some classes because they are not
# present during coverage report generation. This fix gets the total
# line calculation from the jacoco.xml report of the current run
# directly and compares it with the total_lines retrieved from
# summary.json. Then the larger total_lines is used, which is assumed
# to be more accurate. The same applies to Python projects, where the
# total line count is determined from the all_cov.json file.
total_lines = run_result.coverage.total_lines
elif coverage_summary:
total_lines = compute_total_lines_without_fuzz_targets(
coverage_summary, generated_target_name)
else:
total_lines = 0

if run_result.total_pcs:
coverage_percent = run_result.cov_pcs / run_result.total_pcs
else:
dual_logger.log(
f'Warning: total_pcs == 0 in {generated_oss_fuzz_project}.')
coverage_percent = 0.0

existing_textcov = self.load_existing_textcov()
if run_result.coverage:
run_result.coverage.subtract_covered_lines(existing_textcov)

if total_lines and run_result.coverage:
coverage_diff = run_result.coverage.covered_lines / total_lines
else:
dual_logger.log(
f'Warning: total_lines == 0 in {generated_oss_fuzz_project}.')
coverage_diff = 0.0

if self.benchmark.language == 'jvm':
# Unexpected exceptions that crash JVM fuzzers do not need to be fixed.
# For JVM, the generation is considered a success if either is true:
# 1) Build succeeded and the run crashed (expected for exceptions)
# 2) Build succeeded, the run succeeded and coverage diff > 0
gen_succ = build_result.succeeded and run_result
if gen_succ and run_result and run_result.succeeded:
gen_succ = gen_succ and (coverage_diff > 0)
else:
gen_succ = build_result.succeeded and run_result and run_result.succeeded
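
A worked example of the arithmetic above with made-up numbers, assuming `covered_lines` has already had the project's pre-existing coverage subtracted:

```python
# Made-up numbers purely to illustrate the two ratios computed above.
cov_pcs, total_pcs = 1200, 4800          # edges hit by this run / total edges
covered_lines, total_lines = 150, 30000  # newly covered lines / project total

coverage_percent = cov_pcs / total_pcs if total_pcs else 0.0         # 0.25
coverage_diff = covered_lines / total_lines if total_lines else 0.0  # 0.005

# JVM success rule: a clean run must add coverage; a crashing run still counts.
build_ok, run_ok = True, True
gen_succ = build_ok and (coverage_diff > 0 if run_ok else True)
print(coverage_percent, coverage_diff, gen_succ)  # 0.25 0.005 True
```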

@@ -465,43 +518,6 @@ def check_target(self, ai_binary, target_path: str) -> Result:
run_result.coverage_report_path, run_result.reproducer_path,
True, run_result.semantic_check.type, run_result.triage))

# Gets line coverage (diff) details.
coverage_summary = self._load_existing_coverage_summary()

if self.benchmark.language in ['python', 'jvm']:
# The Jacoco.xml coverage report used to generate summary.json on OSS-Fuzz
# for JVM projects does not trace the source file location. Thus the
# conversion may miss some classes because they are not present during
# coverage report generation. This fix gets the total line calculation
# from the jacoco.xml report of the current run directly and compares it
# with the total_lines retrieved from summary.json. Then the larger
# total_lines is used which is assumed to be more accurate.
# This is the same case for python project which the total line
# is determined from the all_cov.json file.
total_lines = run_result.coverage.total_lines
elif coverage_summary:
total_lines = compute_total_lines_without_fuzz_targets(
coverage_summary, generated_target_name)
else:
total_lines = 0

if run_result.total_pcs:
coverage_percent = run_result.cov_pcs / run_result.total_pcs
else:
dual_logger.log(
f'Warning: total_pcs == 0 in {generated_oss_fuzz_project}.')
coverage_percent = 0.0

existing_textcov = self.load_existing_textcov()
run_result.coverage.subtract_covered_lines(existing_textcov)

if total_lines:
coverage_diff = run_result.coverage.covered_lines / total_lines
else:
dual_logger.log(
f'Warning: total_lines == 0 in {generated_oss_fuzz_project}.')
coverage_diff = 0.0

dual_logger.log(
f'Result for {generated_oss_fuzz_project}: '
f'crashes={run_result.crashes}, coverage={coverage_percent} '
6 changes: 4 additions & 2 deletions llm_toolkit/code_fixer.py
@@ -368,7 +368,7 @@ def group_error_messages(error_lines: list[str]) -> list[str]:

def llm_fix(ai_binary: str, target_path: str, benchmark: benchmarklib.Benchmark,
llm_fix_id: int, error_desc: Optional[str], errors: list[str],
fixer_model_name: str, language: str) -> None:
fixer_model_name: str, language: str, jvm_cov_fix: bool) -> None:
"""Reads and fixes |target_path| in place with LLM based on |error_log|."""
fuzz_target_source_code = parser.parse_code(target_path)

@@ -385,6 +385,7 @@ def llm_fix(ai_binary: str, target_path: str, benchmark: benchmarklib.Benchmark,
prompt_path,
response_dir,
language,
jvm_cov_fix,
fixer_model_name,
temperature=0.5 - llm_fix_id * 0.04)

@@ -427,6 +428,7 @@ def apply_llm_fix(ai_binary: str,
prompt_path: str,
response_dir: str,
language: str,
jvm_cov_fix: bool,
fixer_model_name: str = models.DefaultModel.name,
temperature: float = 0.4):
"""Queries LLM to fix the code."""
@@ -440,7 +442,7 @@ def apply_llm_fix(ai_binary: str,
if language == 'jvm':
builder = prompt_builder.JvmErrorFixingBuilder(fixer_model, benchmark,
fuzz_target_source_code,
errors)
errors, jvm_cov_fix)
prompt = builder.build([], None, None)
prompt.save(prompt_path)
else:
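The net effect of threading `jvm_cov_fix` through `llm_fix` and `apply_llm_fix` is to select which JVM prompt template the fixer builds. A standalone sketch of that selection (the helper below is hypothetical; the template file names are the ones this PR adds and reuses):

```python
def pick_jvm_fixer_template(jvm_cov_fix: bool) -> str:
  """Hypothetical helper mirroring the choice made inside JvmErrorFixingBuilder."""
  if jvm_cov_fix:
    # Harness built and ran but added no coverage: ask for a coverage rewrite.
    return 'jvm_requirement_coverage_fixing.txt'
  # Harness failed to build or run: ask for an error fix.
  return 'jvm_requirement_error_fixing.txt'


assert pick_jvm_fixer_template(True) == 'jvm_requirement_coverage_fixing.txt'
assert pick_jvm_fixer_template(False) == 'jvm_requirement_error_fixing.txt'
```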
39 changes: 35 additions & 4 deletions llm_toolkit/prompt_builder.py
@@ -1062,16 +1062,22 @@ def __init__(self,
benchmark: Benchmark,
generated_harness: str,
errors: list[str],
jvm_cov_fix: bool,
template_dir: str = DEFAULT_TEMPLATE_DIR):
super().__init__(model)
self._template_dir = template_dir
self.benchmark = benchmark
self.generated_harness = generated_harness
self.error_str = '\n'.join(errors)
self.jvm_cov_fix = jvm_cov_fix

# Load templates.
self.template_file = self._find_template(
template_dir, 'jvm_requirement_error_fixing.txt')
if self.jvm_cov_fix:
self.template_file = self._find_template(
template_dir, 'jvm_requirement_coverage_fixing.txt')
else:
self.template_file = self._find_template(
template_dir, 'jvm_requirement_error_fixing.txt')

def _find_template(self, template_dir: str, template_name: str) -> str:
"""Finds template file based on |template_dir|."""
@@ -1099,15 +1105,40 @@ def build(self,
with open(self.template_file, 'r') as f:
prompt_text = f.read()

proj = self.benchmark.project

# Format the repository
target_repository = oss_fuzz_checkout.get_project_repository(
self.benchmark.project)
prompt_text = prompt_text.replace('{TARGET_REPO}', target_repository)
prompt_text = prompt_text.replace('{HARNESS_NAME}',
self.benchmark.target_name)

# Add the generated harness and error string to prompt
# Add the generated harness to prompt
prompt_text = prompt_text.replace('{GENERATED_HARNESS}',
self.generated_harness)
prompt_text = prompt_text.replace('{ERRORS}', self.error_str)

if self.jvm_cov_fix:
# Add source code of all existing harnesses to prompt
source_list = []
harnesses = introspector.query_introspector_for_harness_intrinsics(proj)
for pair in harnesses:
path = pair.get('source', '')
if path:
source = introspector.query_introspector_source_code(proj, path)
if source:
source_list.append(source)

prompt_text = prompt_text.replace('{EXISTING_HARNESS}',
'\n---\n'.join(source_list))

# Add all public candidates to prompt
methods = introspector.query_introspector_jvm_all_public_candidates(proj)
name = [method['function_name'] for method in methods]
prompt_text = prompt_text.replace('{PUBLIC_METHODS}', ','.join(name))
else:
# Add the error string to prompt
prompt_text = prompt_text.replace('{ERRORS}', self.error_str)

self._prompt.add_priming(prompt_text)
return self._prompt
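A toy, self-contained illustration of the placeholder substitution performed above for the coverage-fixing path; the template string and every value below are made up, whereas in the real flow the template is `jvm_requirement_coverage_fixing.txt` and the harness sources and public methods come from FuzzIntrospector:

```python
# All values are placeholders for illustration only.
template = ('Improve harness {HARNESS_NAME} for {TARGET_REPO}.\n'
            '<code>{GENERATED_HARNESS}</code>\n'
            'Existing harnesses:\n{EXISTING_HARNESS}\n'
            'Public methods: {PUBLIC_METHODS}\n')

existing_harnesses = ['class FooFuzzer { /* ... */ }',
                      'class BarFuzzer { /* ... */ }']
public_methods = ['com.example.Foo.parse(byte[])', 'com.example.Bar.load(String)']

prompt_text = (template
               .replace('{TARGET_REPO}', 'https://example.org/project.git')
               .replace('{HARNESS_NAME}', 'ProjectFuzzer')
               .replace('{GENERATED_HARNESS}', 'class ProjectFuzzer { /* ... */ }')
               .replace('{EXISTING_HARNESS}', '\n---\n'.join(existing_harnesses))
               .replace('{PUBLIC_METHODS}', ','.join(public_methods)))
print(prompt_text)
```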
94 changes: 94 additions & 0 deletions prompts/template_xml/jvm_requirement_coverage_fixing.txt
@@ -0,0 +1,94 @@
I'm a security engineer looking to write good fuzzing harnesses. I want you to help me improve my fuzzing harness so it covers more parts of the code.

The target library is {TARGET_REPO}.

The target project is implemented in the Java programming language; therefore, the harness should also be written in Java.
The fuzzing harness must be executable within the Jazzer fuzzing framework.

Below is the source code of the target fuzzing harness that I would like to improve:
<code>
{GENERATED_HARNESS}
</code>

For reference, the source code for all existing harnesses of the project is provided below, separated by `---`:
<code>
{EXISTING_HARNESS}
</code>

Additionally, a list of all public methods and constructors of the project is included for your reference; you should try to expand the fuzzing harness to call these targets and improve the overall fuzzing coverage:
{PUBLIC_METHODS}

Your task is to improve the target fuzzing harness provided above to increase code coverage for additional parts of the project that are not covered by the existing fuzzing harnesses. Please ensure that the changes made are minimal.
In your response, include ONLY the code for the harness, nothing more. You should wrap the code in <code></code> tags.

Here is an additional list of requirements that you MUST follow.
<requirements>
<item>NEVER use any methods from the <code>java.lang.Random</code> class in the generated code.</item>
<item>NEVER use any classes or methods in the <code>java.lang.reflect</code> package in the generated code.</item>
<item>NEVER use the @FuzzTest annotation for specifying the fuzzing method.</item>
<item>NEVER use any assert, printing or logging statements in the generated harness.</item>
<item>NEVER use any multithreading or multi-processing approach.</item>
<item>You MUST create the object before calling the target method.</item>
<item>Please use {HARNESS_NAME} as the Java class name.</item>
<item>You MUST invoke the close method of any resource class objects that implement the java.lang.AutoCloseable interface in the finally block after the target method is invoked.</item>
<item>Always create the fuzzing harness from the following templates:
<code>
import com.code_intelligence.jazzer.api.FuzzedDataProvider;
// Other imports

public class {HARNESS_NAME} {
public static void fuzzerInitialize() {
// Initializing objects for fuzzing
}

public static void fuzzerTearDown() {
// Tear down objects after fuzzing
}

public static void fuzzerTestOneInput(FuzzedDataProvider data) {
// Use the FuzzedDataProvider object to generate random data for fuzzing

// Fuzz by invoking the target method with random parameters / objects generated above.
}
}
</code></item>
<item>
You MUST use ONLY the following methods from the FuzzedDataProvider of the Jazzer framework for generating random data for fuzzing.
If the needed return value is not found in the table, try using constructors or methods to create the needed random object, but you MUST try your best to randomise that object with the methods in the table.

| Method | Return Value |
|---------------------------------------------|---------------------------------------|
| `consumeBytes(int length)` | `byte[]` |
| `consumeRemainingAsBytes()` | `byte[]` |
| `consumeString(int length)` | `String` |
| `consumeRemainingAsString()` | `String` |
| `consumeBoolean()` | `boolean` |
| `consumeInt(int min, int max)` | `int` |
| `consumeInt()` | `int` |
| `consumeLong(long min, long max)` | `long` |
| `consumeLong()` | `long` |
| `consumeFloat(float min, float max)` | `float` |
| `consumeFloat()` | `float` |
| `consumeDouble(double min, double max)` | `double` |
| `consumeDouble()` | `double` |
| `consumeChar()` | `char` |
| `consumeChar(char min, char max)` | `char` |
| `consumeShort(short min, short max)` | `short` |
| `consumeShort()` | `short` |
| `consumeRemainingAsCharSequence()` | `CharSequence` |
| `consumeBytestring()` | `byte[]` |
| `consumeBigInteger(int minNumBits)` | `BigInteger` |
| `consumeEnum(Class<E> enumType)` | `E` (Enum type) |
| `consumeProbabilityDouble()` | `double` |
| `consumeFraction()` | `double` |
| `pickValue(T... values)` | `T` (Type of value) |
| `pickValue(List<T> values)` | `T` (Type of value) |
| `consumeByte()` | `byte` |
| `consumeIntList(int length)` | `List<Integer>` |
| `consumeLongList(int length)` | `List<Long>` |
| `consumeFloatList(int length)` | `List<Float>` |
| `consumeDoubleList(int length)` | `List<Double>` |
| `consumeCharList(int length)` | `List<Character>` |

</item>
</requirements>
2 changes: 1 addition & 1 deletion prompts/template_xml/jvm_requirement_error_fixing.txt
@@ -1,4 +1,4 @@
I'm a security engineer looking to convert unit tests into fuzzing harnesses. I got some compilation errors and want you to help fix them.
I'm a security engineer looking to write good fuzzing harnesses. I got some compilation errors and want you to help fix them.

The target library is {TARGET_REPO}.

