add circleci support; bugfixes #3985

Merged (7 commits, Jun 8, 2024)

Changes from all commits
2 changes: 2 additions & 0 deletions sweepai/config/server.py
@@ -209,3 +209,5 @@

assert OPENAI_API_KEY, "OPENAI_API_KEY is required."
assert COHERE_API_KEY, "COHERE_API_KEY is required."

CIRCLE_CI_PAT = os.environ.get("CIRCLE_CI_PAT", None) # if this is present, we will poll CircleCI and fetch logs from it
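
As a usage sketch (not part of the diff), downstream code can gate CircleCI polling on this setting; get_failing_circleci_logs is defined in sweepai/handlers/on_check_suite.py below, while the wrapper name here is hypothetical:

from sweepai.config.server import CIRCLE_CI_PAT
from sweepai.handlers.on_check_suite import get_failing_circleci_logs

def maybe_fetch_circleci_logs(repo, commit_sha: str) -> str:
    # Hypothetical helper: skip CircleCI polling entirely when no token is configured.
    if not CIRCLE_CI_PAT:
        return ""
    return get_failing_circleci_logs(repo=repo, current_commit=commit_sha)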
34 changes: 5 additions & 29 deletions sweepai/core/sweep_bot.py
@@ -98,7 +98,7 @@ def to_raw_string(s):
{github_actions_logs}
</github_actions_logs>

You have previously already made the following changes:
You have previously made the following changes. The diffs represent the current state of the file/project:
<changes_made>
{changes_made}
</changes_made>
@@ -122,7 +122,7 @@ def to_raw_string(s):
{current_github_actions_logs}
</current_github_actions_logs>

You have previously already made the following changes:
You have previously made the following changes. The diffs represent the current state of the file/project:
<changes_made>
{changes_made}
</changes_made>
@@ -1136,7 +1136,7 @@ def context_get_files_to_change(
content=snippet.expand(300).get_snippet(add_lines=False) if snippet.type_name == "source" else snippet.get_snippet(add_lines=False),
) for i, snippet in enumerate(relevant_snippets)
)
relevant_snippets_message = f"# Relevant codebase files:\nHere are the relevant files from the codebase. We previously summarized each of the files to help you solve the GitHub issue. These will be your primary reference to solve the problem:\n\n<relevant_files>\n{joined_relevant_snippets}\n</relevant_files>"
relevant_snippets_message = f"# Relevant codebase files:\nHere are the relevant files from the codebase. These will be your primary reference to solve the problem:\n\n<relevant_files>\n{joined_relevant_snippets}\n</relevant_files>"
messages.append(
Message(
role="user",
@@ -1450,7 +1450,7 @@ def get_files_to_change_for_gha(
messages.append(
Message(role="system", content=issue_sub_request_system_prompt, key="system")
)

# update the state of the snippets to be current
for relevant_snippet in relevant_snippets:
if relevant_snippet.file_path in updated_files:
relevant_snippet.content = updated_files[relevant_snippet.file_path]["contents"]
@@ -1459,18 +1459,6 @@
if read_only_snippet.file_path in updated_files:
Contributor
The removal of the block of code that reclassified snippets based on the presence of "test" in their file paths could lead to incorrect classification of snippets.

read_only_snippet.content = updated_files[read_only_snippet.file_path]["contents"]

new_relevant_snippets = []
new_read_only_snippets = []
for snippet in relevant_snippets + read_only_snippets:
if snippet in new_relevant_snippets or snippet in new_read_only_snippets:
continue
if "test" not in snippet.file_path:
new_read_only_snippets.append(snippet)
else:
new_relevant_snippets.append(snippet)
relevant_snippets = new_relevant_snippets
read_only_snippets = new_read_only_snippets

interleaved_snippets = []
for i in range(max(len(relevant_snippets), len(read_only_snippets))):
if i < len(relevant_snippets):
@@ -1499,7 +1487,6 @@ def get_files_to_change_for_gha(
key="relevant_snippets",
)
)

relevant_snippet_template = '<relevant_file index="{i}">\n<file_path>\n{file_path}\n</file_path>\n<source>\n{content}\n</source>\n</relevant_file>'
joined_relevant_snippets = "\n".join(
relevant_snippet_template.format(
@@ -1508,25 +1495,14 @@
content=snippet.expand(300).get_snippet(add_lines=False) if snippet.type_name == "source" else snippet.get_snippet(add_lines=False),
) for i, snippet in enumerate(relevant_snippets)
)
relevant_snippets_message = f"# Relevant codebase files:\nHere are the relevant files from the codebase. We previously summarized each of the files to help you solve the GitHub issue. These will be your primary reference to solve the problem:\n\n<relevant_files>\n{joined_relevant_snippets}\n</relevant_files>"
relevant_snippets_message = f"# Relevant codebase files:\nHere are the relevant files from the codebase. These will be your primary reference to solve the problem:\n\n<relevant_files>\n{joined_relevant_snippets}\n</relevant_files>"
messages.append(
Message(
role="user",
content=relevant_snippets_message,
key="relevant_snippets",
)
)
# previous_diffs = get_previous_diffs(
# problem_statement,
# cloned_repo=cloned_repo,
# relevant_file_paths=[snippet.file_path for snippet in relevant_snippets],
# )
# messages.append( # temporarily disable in main
# Message(
# role="user",
# content=previous_diffs,
# )
# )
messages.append(
Message(
role="user",
123 changes: 116 additions & 7 deletions sweepai/handlers/on_check_suite.py
@@ -3,13 +3,22 @@
"""
import io
import re
from time import sleep
import zipfile

from loguru import logger
import requests

from github.Repository import Repository
from github.CommitStatus import CommitStatus
from sweepai.config.server import CIRCLE_CI_PAT
from sweepai.logn.cache import file_cache
from sweepai.utils.github_utils import get_token

MAX_LINES = 500
LINES_TO_KEEP = 100
CIRCLECI_SLEEP_DURATION_SECONDS = 15

log_message = """GitHub actions yielded the following error.

{error_logs}
@@ -28,6 +37,8 @@ def get_files_in_dir(zipfile: zipfile.ZipFile, dir: str):
if file.startswith(dir) and not file.endswith("/")
]

def remove_ansi_tags(logs: str) -> str:
Contributor
Sweep has identified a redundant function: The new remove_ansi_tags function is useless because its exact functionality is already implemented by the existing strip_ansi_codes function in code_validators.py.

return re.sub(r'\x1b\[[0-9;]*[a-zA-Z]', "", logs, flags=re.MULTILINE)
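
A quick sketch of what remove_ansi_tags strips from colored CI output (the sample string is hypothetical):

raw = "\x1b[31mFAIL\x1b[0m tests/test_app.py"  # hypothetical colored log line
assert remove_ansi_tags(raw) == "FAIL tests/test_app.py"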

@file_cache()
def download_logs(repo_full_name: str, run_id: int, installation_id: int, get_errors_only=True):
@@ -71,8 +82,6 @@ def download_logs(repo_full_name: str, run_id: int, installation_id: int, get_errors_only=True):

def clean_gh_logs(logs_str: str):
# Extraction process could be better
MAX_LINES = 500
LINES_TO_KEEP = 100
log_list = logs_str.split("\n")
truncated_logs = [log[log.find(" ") + 1 :] for log in log_list]
logs_str = "\n".join(truncated_logs)
@@ -84,6 +93,13 @@ def clean_gh_logs(logs_str: str):
command_line = match.group(1).strip()
log_content = match.group(2).strip()
error_line = match.group(3).strip() # can be super long
return clean_cicd_logs(
command=command_line,
error=error_line,
logs=log_content,
)

def clean_cicd_logs(command: str, error: str, logs: str):
patterns = [
# for docker
"Already exists",
@@ -111,21 +127,114 @@ def clean_gh_logs(logs_str: str):
]
cleaned_logs = [
log.strip()
for log in log_content.split("\n")
for log in logs.split("\n")
if not any(log.strip().startswith(pattern) for pattern in patterns)
]
if len(cleaned_logs) > MAX_LINES:
# return the first LINES_TO_KEEP and the last LINES_TO_KEEP
cleaned_logs = cleaned_logs[:LINES_TO_KEEP] + ["..."] + cleaned_logs[-LINES_TO_KEEP:]
cleaned_logs_str = "\n".join(cleaned_logs)
error_content = ""
if len(error_line) < 200000:
if len(error) < 200000:
error_content = f"""<errors>
{error_line}
{error}
</errors>"""
cleaned_response = gha_prompt.format(
command_line=command_line,
command_line=command,
error_content=error_content,
cleaned_logs_str=cleaned_logs_str,
)
return cleaned_response
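
With the extraction refactored this way, clean_cicd_logs can be driven directly with provider-agnostic inputs; a minimal sketch, with hypothetical values:

formatted = clean_cicd_logs(
    command="pytest -x",  # hypothetical failing command
    error="AssertionError: expected 200, got 500",
    logs="collected 12 items\ntests/test_api.py ...F",
)
# formatted is the gha_prompt-rendered block with boilerplate lines filtered out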

def get_circleci_job_details(job_number, project_slug, vcs_type='github'):
# project_slug is the repo full name
headers = {'Circle-Token': CIRCLE_CI_PAT}
url = f"https://circleci.com/api/v1.1/project/{vcs_type}/{project_slug}/{job_number}"
response = requests.get(url, headers=headers)
return response.json()
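
For context, a hypothetical call; the job number and slug are made up, and the response is assumed to be a dict whose "steps" list carries "actions" entries with exit codes, as consumed below:

details = get_circleci_job_details(job_number="1234", project_slug="sweepai/sweep")
for step in details.get("steps", []):
    print(step["actions"][0].get("exit_code"))  # non-zero exit codes mark failing steps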

# take a CircleCI run URL and return all failing logs as a single string
def get_failing_circleci_log_from_url(circleci_run_url: str, repo_full_name: str):
if not CIRCLE_CI_PAT:
logger.warning("CIRCLE_CI_PAT not set")
return ""
headers = {'Circle-Token': CIRCLE_CI_PAT}
job_number = circleci_run_url.split("/")[-1]
circleci_run_details = get_circleci_job_details(job_number, repo_full_name)
steps = circleci_run_details['steps']
failing_steps = []
failed_commands_and_logs = ""
for step in steps:
if step['actions'][0]['exit_code'] != 0:
failing_steps.append(step)
for step in failing_steps:
actions = step['actions']
for action in actions:
if action.get("status") != "failed":
continue
if 'output_url' in action:
log_url = action['output_url']
log_response = requests.get(log_url, headers=headers)
log_response = log_response.json()
# these might return in a different order; watch out
log_message = log_response[0]["message"] if len(log_response) > 0 else ""
error_message = log_response[1].get("message", "") if len(log_response) > 1 else ""
log_message = remove_ansi_tags(log_message)
error_message = remove_ansi_tags(error_message)
command = action.get("bash_command", "No command found.") # seems like this is the only command
circle_ci_failing_logs = clean_cicd_logs(
command=command,
error=error_message,
logs=log_message,
)
if circle_ci_failing_logs:
failed_commands_and_logs += circle_ci_failing_logs + "\n"
return failed_commands_and_logs
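
Called in isolation, with a hypothetical run URL (the job number is parsed from the final path segment):

logs = get_failing_circleci_log_from_url(
    circleci_run_url="https://circleci.com/gh/sweepai/sweep/5678",  # hypothetical
    repo_full_name="sweepai/sweep",
)
# returns "" (or the cleaned failures) once the run has finished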

def get_failing_circleci_logs(
repo: Repository,
current_commit: str,
):
# poll the combined status of the pygithub commit object until CircleCI finishes or fails
all_logs = ""
failing_statuses = []
total_poll_attempts = 0
# hacky workaround: CircleCI setups can take a long time, so the combined status may report "success" once setup finishes even though the actual CI is still running
logger.debug("Waiting for 60 seconds before polling for CircleCI status.")
sleep(60)
while True:
commit = repo.get_commit(current_commit)
status = commit.get_combined_status()
# https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#get-the-combined-status-for-a-specific-reference
all_statuses: list[CommitStatus] = status.statuses
# if all are success, break
if all(status.state == "success" for status in all_statuses):
failing_statuses = []
logger.debug(f"Exiting polling for CircleCI as all statuses are success. Statuses were: {all_statuses}")
break
# if any of the statuses are failure, return those statuses
failing_statuses = [status for status in all_statuses if status.state == "failure"]
if failing_statuses:
logger.debug(f"Exiting polling for CircleCI as some statuses are failing. Statuses were: {all_statuses}")
break
# if any of the statuses are pending, sleep and try again
if any(status.state == "pending" for status in all_statuses):
if total_poll_attempts * CIRCLECI_SLEEP_DURATION_SECONDS // 60 >= 60: # give up after roughly 60 minutes of polling
logger.debug("Polling for CircleCI has taken too long, giving up.")
break
# wait between check attempts
total_poll_attempts += 1
logger.debug(f"Polling to see if CircleCI has finished... {total_poll_attempts}.")
sleep(CIRCLECI_SLEEP_DURATION_SECONDS)
continue
# done polling
for status_detail in failing_statuses:
# CircleCI run detected
if 'circleci' in status_detail.context.lower():
failing_circle_ci_log = get_failing_circleci_log_from_url(
circleci_run_url=status_detail.target_url,
repo_full_name=repo.full_name
) # may be empty string
if failing_circle_ci_log:
all_logs += failing_circle_ci_log + "\n"
return all_logs
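
End to end, the handler might be exercised like this (token, repo, and SHA are all hypothetical); note the initial 60-second sleep plus polling means this call can block for several minutes:

from github import Github
from sweepai.handlers.on_check_suite import get_failing_circleci_logs

g = Github("<github-token>")  # hypothetical token
repo = g.get_repo("sweepai/sweep")
failing_logs = get_failing_circleci_logs(repo=repo, current_commit="0123abc")  # hypothetical SHA
if failing_logs:
    print(failing_logs)  # cleaned, prompt-formatted failures ready for the LLM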