def is_file_auto_generated(self, file_name: str) -> tuple[bool, str]:
    """Guess from its name whether a file is likely auto-generated.

    The heuristic flags any name containing a run of four or more
    consecutive digits. The whole ``file_name`` string is searched, so if a
    path is passed, digits in directory components count as well.

    Args:
        file_name: Name (or path) of the file to inspect.

    Returns:
        ``(True, reason)`` when the name matches the heuristic, where
        ``reason`` is a human-readable explanation; ``(False, "")``
        otherwise.
    """
    # The threshold is "four or more" digits (\d{4,}), not "more than four".
    # re.search returns None on no match, so the result is used directly as
    # the condition without wrapping it in bool().
    if re.search(r"\d{4,}", file_name):
        return True, "The filename means that this file is likely auto generated."
    return False, ""