Skip to content

chore: add comment formatting pre-commit hook script #1126

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,19 @@ repos:
args: [--settings-path, pyproject.toml]
exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.*

# Format comment grammar.
- repo: local
hooks:
- id: comment-checker
name: Fix comment grammar and formatting
entry: python
args:
- ./scripts/dev_scripts/format_checker.py
- tests
- src/macaron
language: system
pass_filenames: true

# Add Black code formatters.
- repo: https://github.com/ambv/black
rev: 25.1.0
Expand Down Expand Up @@ -179,7 +192,9 @@ repos:
language: system
pass_filenames: false

# On push to the remote, run the unit tests.


# On push to the remote, run the unit tests.
- repo: local
hooks:
- id: pytest
Expand Down
18 changes: 18 additions & 0 deletions docs/source/pages/developers_guide/style_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,21 @@ For variables of a class: we do not use the ``Attribute`` section as per the `nu
x: float
#: The y coordinate of the point.
y: float


--------
Comments
--------

Comments should use typical grammar where appropriate. Ideally, they should be comprised of sentences that start with capital letters, and end with punctuation.
We use a pre-commit hook script to help enforce this.
The script is a lightweight implementation that can make mistakes in some cases.
Python script files that have been changed in the commit will be passed to the checker, if they reside in either the ``src/macaron`` or ``tests`` directories.

''''''''''''''''''''''''
Working with the script:
''''''''''''''''''''''''

- Try to avoid having Proper nouns in the middle of sentences spill over to a new line, as this will be considered as the start of a sentence instead.
- To prevent the script from changing a particular file, add the following comment after the copyright information: "# grammar: off"
- To disable the script for a particular comment, use a double pound sign, e.g. "## <The comment to ignore>"
281 changes: 281 additions & 0 deletions scripts/dev_scripts/format_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This script checks grammar formatting in Python source files."""
import glob
import os.path
import re
import sys
import tempfile


def main(source_locations: list[str] = None) -> int:
"""Check the grammar formatting of Python source files passed directly, or in directories.

If the passed list is empty, fallback to system arguments.
If source locations include files and directories, this script will only check files within the passed directories.
Otherwise, all files, or files within directories, will be checked.
Macaron's integration test directories are always excluded.
"""
if not source_locations and len(sys.argv) >= 2:
source_locations = []
for arg in sys.argv[1:]:
source_locations.append(arg)

if not source_locations or not isinstance(source_locations, list):
return 0

restrictive_mode = False
source_directories = []
source_files = []
for source_location in source_locations:
if os.path.isfile(source_location):
if not os.path.basename(source_location).lower().endswith(".py"):
continue
source_files.append(source_location)
elif os.path.isdir(source_location):
source_directories.append(source_location)
if source_files and source_directories:
restrictive_mode = True

checker = FormatChecker()
target_files = []

if restrictive_mode:
for source_file in source_files:
for source_directory in source_directories:
if os.path.commonpath([source_file, source_directory]):
target_files.append(source_file)
break
else:
for source_directory in source_directories:
target_files = target_files + glob.glob(f"{source_directory}/**/*.py", recursive=True)
for source_file in source_files:
target_files.append(source_file)

changed_files = 0
for target_file in target_files:
if os.path.join("integration", "cases") in target_file:
# Exclude integration test directories.
continue
with open(target_file, "r", encoding="utf-8") as file:
lines = file.readlines()
changed_files = changed_files + checker.check_file(target_file, lines)

return 1 if changed_files else 0


class FormatChecker:

TOOL_EXCEPTIONS = {"noqa:", "pylint:", "type:", "nosec", "nosec:", "flake8:", "pragma:", "pyarmor:", ":meta"}
LOWER_CASE_EXCEPTIONS = {"npm", "http:", "https:", "git@", "jdk"}
END_PUNCTUATION = {".", "!", "?"}
OTHER_PUNCTUATION = {",", ":", ";"}
SPECIAL_EXCEPTIONS = {"e.g.", "i.e.", "n.b."}
DISABLE_KEYWORD = ["grammar:", "off"]
WRAPPERS = ["'", '"', ")"]

def check_file(self, source_file: str, lines: list[str]) -> int:
"""Check and fix the contents of the passed file."""
grouped_comment_lines = []
current_group = []
inline_comment_lines = []
start_indices = {}
for index, line in enumerate(lines):
if index < 2:
# Copyright headers are checked elsewhere.
continue
if "#" not in line:
continue
line = line.rstrip()

line_split = re.split("(\s+)", line)
start_index = -1
for part_index, part in enumerate(line_split):
if part in {"#", "#:", "##"}:
start_index = part_index
break
if start_index == -1 or line_split[start_index] == "##":
# TODO discuss this per-line disable feature.
continue

# Check for disable keywords of other tools.
if start_index + 2 >= len(line_split):
continue
first_part = line_split[start_index + 2]
if first_part in self.TOOL_EXCEPTIONS:
continue

# Check for this disabler.
if start_index + 4 < len(line_split):
second_part = line_split[start_index + 4]
if first_part == self.DISABLE_KEYWORD[0] and second_part == self.DISABLE_KEYWORD[1]:
return 0

# Separate in-line and regular comments.
start_indices[index] = start_index
if start_index > 3:
# In-line comment.
inline_comment_lines.append(index)
else:
# Regular comments.
if not current_group:
current_group.append(index)
grouped_comment_lines.append(current_group)
else:
if index - current_group[-1] > 1:
current_group = []
grouped_comment_lines.append(current_group)
current_group.append(index)

# TODO create class for handling lines and offsets.
change_count = 0
for index in inline_comment_lines:
line = lines[index].rstrip()
line_split = re.split("(\s+)", line)
start_index = -1
for part_index, part in enumerate(line_split):
if part in {"#", "#:", "##"}:
start_index = part_index
break
if start_index == -1:
continue
changed_line, changed = self.check_line(line_split, None, None, start_index + 2)
if changed:
change_count += 1
lines[index] = "".join(changed_line) + os.linesep

split_lines = {}
for group in grouped_comment_lines:
for index in group:
split_lines[index] = re.split("(\s+)", lines[index].rstrip())

for index, group in enumerate(grouped_comment_lines):
group_changed = False
for group_index, line_index in enumerate(group):
current_line = split_lines[line_index]
prev_line = None
if group_index > 0:
prev_line = split_lines[line_index - 1][-1]
next_line = None
if group_index < len(group) - 1:
next_line = split_lines[line_index + 1][start_indices[line_index + 1] + 2]
current_line, changed = self.check_line(
current_line, prev_line, next_line, start_indices[line_index] + 2
)
if changed:
lines[line_index] = "".join(current_line) + os.linesep
group_changed = group_changed | changed
if group_changed:
change_count += 1

if change_count:
# Save changes.
target_dir = os.path.dirname(source_file)
file_mode = os.stat(source_file).st_mode
print(f"*** Adjusting file: {source_file}")
with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", dir=target_dir, delete=False) as file:
file.writelines(lines)
os.replace(file.name, source_file)
os.chmod(source_file, file_mode)

return 1 if change_count else 0

def check_line(
self, current_line: list[str], prev_line: str | None, next_line: str | None, start_index: int = 0
) -> tuple[list[str], bool]:
"""Check the current line for formatting issues."""
if current_line[start_index] == "-":
# Ignore bullet point style lines.
return current_line, False

has_changed = False
if (
(prev_line and self.has_sentence_end(prev_line) or not prev_line)
and not self.has_sentence_start(current_line[start_index])
and re.match("[a-z]", current_line[start_index][0])
):
current_line[start_index] = current_line[start_index][0].upper() + current_line[start_index][1:]
has_changed = True
if (
next_line
and re.match("^[A-Z][^A-Z]*$", next_line)
and current_line[start_index][-1] not in self.END_PUNCTUATION
):
current_line, line_changed = self.fix_sentence_end(current_line)
has_changed = has_changed | line_changed

for index, current_word in enumerate(current_line):
if index <= start_index:
continue
if not current_word:
continue
prev_word = current_line[index - 2]
if not self.has_sentence_end(prev_word):
continue
if (
prev_word.lower() in self.SPECIAL_EXCEPTIONS
or prev_word.startswith("(")
and prev_word[1:].lower() in self.SPECIAL_EXCEPTIONS
):
continue
if self.has_sentence_start(current_word):
continue
if re.match("[a-z]", current_word[0]):
continue
current_line[index] = current_word[0].upper() + current_word[1:]
has_changed = True

if not next_line:
current_line, line_changed = self.fix_sentence_end(current_line)
has_changed = has_changed | line_changed

return current_line, has_changed

def fix_sentence_end(self, line: list[str]) -> tuple[list[str], bool]:
"""Fix the end of a sentence by adding a period when appropriate."""
if line[-1].endswith('"""') or line[-1].endswith(":"):
return line, False
if line[-1].startswith("https://") or line[-1].startswith("http://") or line[-1].startswith("git@"):
# Allow URLs to end sentences without a period.
return line, False
if (
line[-1][-1] not in self.END_PUNCTUATION
and line[-1][-1] not in self.OTHER_PUNCTUATION
and not (line[-1][-1] in self.WRAPPERS and line[-1][-2] == ".")
and re.search("[a-zA-Z0-9]", line[-1])
):
# Add a period if line does not end with one, or another acceptable punctuation.
# Also check for periods within wrappers, e.g. parenthesis.
# Only add a period if the final word contains at least one alphanumeric character.
line[-1] += "."
return line, True

return line, False

def has_sentence_start(self, word: str) -> bool:
"""Check if the passed line starts with a capital letter, etc."""
# TODO refactor this method to distinguish between when a sentence has a capital letter start, versus when it
# is only exempt from needing one.
if re.search("[_@$+:]", word):
# Ignore non-standard words such as variable references and URLs.
return True
if word in self.LOWER_CASE_EXCEPTIONS:
return True
if len(word) >= 2 and re.match("^[A-Z][0-9A-Z]+$", word):
# Ignore words that are entirely in capitals.
return True
if re.match("^[A-Z]", word):
return True
return False

def has_sentence_end(self, line: str | list[str]) -> bool:
"""Check if the passed line ends with a punctuation mark."""
end_word = line[-1] if isinstance(line, list) else line
if end_word.lower() in self.SPECIAL_EXCEPTIONS:
return False
return end_word[-1] in self.END_PUNCTUATION


if __name__ == "__main__":
raise SystemExit(main())
6 changes: 3 additions & 3 deletions src/macaron/dependency_analyzer/cyclonedx.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def add_latest_version(
latest_deps[key] = item
else:
try:
# These are stored as variables so mypy does not complain about None values (union-attr)
# These are stored as variables so mypy does not complain about None values (union-attr).
latest_value_purl = latest_value.get("purl")
item_purl = item.get("purl")
if (
Expand Down Expand Up @@ -476,7 +476,7 @@ def _resolve_more_dependencies(dependencies: dict[str, DependencyInfo]) -> None:
if outcome not in {RepoFinderInfo.FOUND, RepoFinderInfo.FOUND_FROM_PARENT}:
logger.debug("Failed to find url for purl: %s", item["purl"])
else:
# TODO decide how to handle possible duplicates here
# TODO decide how to handle possible duplicates here.
item["url"] = url
item["available"] = SCMStatus.AVAILABLE
item["note"] = ""
Expand Down Expand Up @@ -692,7 +692,7 @@ def convert_components_to_artifacts(
# submodules that produce development artifacts in the same repo.
if (
"snapshot" in (purl.version or "").lower()
# or "" is not necessary but mypy produces a FP otherwise.
# Or "" is not necessary but mypy produces a FP otherwise.
and root_component
and purl.namespace == root_component.group
):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Define and initialize the base analyzer."""
Expand All @@ -23,7 +23,7 @@ def __init__(
self.name: str = name
self.heuristic: Heuristics = heuristic
self.depends_on: list[tuple[Heuristics, HeuristicResult]] | None = (
depends_on # Contains the dependent heuristics and the expected result of each heuristic
depends_on # Contains the dependent heuristics and the expected result of each heuristic.
)

@abstractmethod
Expand Down
Loading
Loading