Manual Spellcheck Review & Approval #112
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually triggered workflow: runs codespell, applies the fixes with a helper
# script, and opens a pull request containing the corrections.
name: Manual Spellcheck & Auto PR

on:
  workflow_dispatch: # Runs only when manually triggered

permissions:
  contents: write # Needed to push changes to a new branch
  pull-requests: write # Needed to create a PR

jobs:
  spellcheck:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
- name: Install Dependencies
  run: |
    pip install codespell
    # Quote the extras specifier so the shell never treats [..] as a glob.
    pip install "fuzzywuzzy[speedup]"
    pip install nltk
    # sent_tokenize needs the Punkt models. Recent NLTK releases load the
    # "punkt_tab" resource, older ones "punkt"; download both. Appending to
    # nltk.data.path here would be a no-op because this process exits at once.
    python3 -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')"
- name: Verify Spellcheck Ignore List Exists
  run: |
    # Fail fast: the fixer script opens this file unconditionally.
    if [ ! -f .github/spellcheck-ignore.txt ]; then
      # Report on stderr and exit unconditionally (not chained on echo).
      echo "Error: spellcheck-ignore.txt not found!" >&2
      exit 1
    fi
- name: Run Spellcheck and Apply Fixes
  run: |
    set -e # Exit on error
    # Run codespell and save output. codespell exits non-zero when it finds
    # misspellings, hence "|| true". The report file itself is excluded from
    # the scan because it is created in the directory being checked.
    codespell --ignore-words=.github/spellcheck-ignore.txt \
      --skip=".git,*.lock,*.json,*.yaml,*.yml,*.css,*.html,spellcheck_report*.txt" \
      --quiet-level=2 > spellcheck_report_raw.txt || true
    # Process corrections with Python.
    # The delimiter is quoted so bash passes the script through verbatim;
    # with a bare EOF the backticks inside the Python regex would be run as
    # a command substitution and silently removed from the pattern.
    python3 <<'EOF'
import re | |
import nltk | |
from nltk.tokenize import sent_tokenize | |
from fuzzywuzzy import process | |
# Ensure the Punkt sentence models are available before tokenizing.
# Recent NLTK releases load the "punkt_tab" resource while older ones load
# "punkt"; requesting a resource the installed version does not know only
# makes nltk.download return False, so asking for both is safe.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource, quiet=True)
nltk.data.path.append('/usr/local/nltk_data')

# Load the spellcheck ignore list.
# Keys are lowercased for case-insensitive lookup; values keep the exact
# casing written in the file so corrections can restore it.
ignore_list = {}
with open(".github/spellcheck-ignore.txt", "r", encoding="utf-8") as f:
    for line in f:
        word = line.strip()
        if not word:
            # A blank line would otherwise register "" as a known term.
            continue
        ignore_list[word.lower()] = word  # Store lowercase -> correct-case
# Alternatives, in order: backtick-delimited inline code, http(s) URLs,
# www-prefixed hosts, and any slash followed by path-like characters.
_CODE_URL_OR_PATH_RE = re.compile(r'`.*?`|https?://\S+|www\.\S+|/[\w./-]+')


def is_code_or_url_or_file(line):
    """Return True if *line* contains inline code, a URL, or a path fragment.

    The check is made against the whole line, not a single word: any
    backtick-delimited span, http(s) or www address, or slash followed by
    word, dot, slash or hyphen characters anywhere in *line* counts.

    Args:
        line: Text to inspect.

    Returns:
        bool: True when at least one such construct is present.
    """
    return _CODE_URL_OR_PATH_RE.search(line) is not None
def is_markdown_link(line, original):
    """Return True if *original* occurs inside the target of a Markdown link.

    Only the parenthesised target of a [text](target) link is considered;
    a word that appears solely in the link text does not count. The match is
    confined to a single link: the text part may not contain a closing
    bracket and the target part may not contain a closing parenthesis, so a
    word that merely sits between a link and some later parenthesis on the
    same line is not reported.

    Args:
        line: Text to inspect.
        original: Word to look for; it is matched literally.

    Returns:
        bool: True when *original* is found inside a link target.
    """
    pattern = r'\[[^\]]*\]\([^)]*' + re.escape(original) + r'[^)]*\)'
    return re.search(pattern, line) is not None
def should_use_ignore_list(original, suggestion, line):
    """Decide whether the closest ignore-list term should replace *original*.

    The closest key of the module-level ignore_list mapping is found with
    fuzzy matching. It is accepted only when the score is at least 90 and
    the key is not merely a shorter substring of *original*.

    Args:
        original: The misspelled word as reported by codespell.
        suggestion: The dictionary suggestion (not used by this check; kept
            for call compatibility).
        line: The surrounding text (not used by this check; kept for call
            compatibility).

    Returns:
        bool: True when the ignore-list term should be used.
    """
    result = process.extractOne(original, ignore_list.keys())
    if result is None:
        # extractOne yields nothing when there are no choices (empty list).
        return False
    best_match, score = result
    # Must be at least 90% similar to be considered a match
    if score < 90:
        return False
    # Reject if original contains best_match as a substring
    # (e.g., "certifcate" vs "CE"). Keys are stored lowercased, so compare
    # against the lowercased word or capitalised input slips through.
    if best_match in original.lower() and len(original) > len(best_match):
        return False
    return True
# Process spellcheck output and apply fixes.
# Each codespell report line looks like "path:line: wrong ==> right"; for
# every such line the misspelled word is replaced in the named file and the
# decision is logged to spellcheck_report.txt.
with open("spellcheck_report_raw.txt", "r", encoding="utf-8") as infile, open("spellcheck_report.txt", "w", encoding="utf-8") as outfile:
    for line in infile:
        match = re.match(r"(.*):(\d+): (\S+) ==> (\S+)", line)
        if not match:
            continue
        file_path, line_number, original, suggestion = match.groups()
        # Alternatives are listed as "a, b, c", so the first captured token
        # carries a trailing comma that must not end up in the file.
        suggestion = suggestion.rstrip(",")
        corrected_word = suggestion
        line_index = int(line_number) - 1

        # Read the file; newline="" keeps the original line endings intact.
        try:
            with open(file_path, "r", encoding="utf-8", newline="") as file:
                content_lines = file.readlines()
        except (OSError, UnicodeDecodeError) as exc:
            print(f"Skipping {file_path}:{line_number}: {exc}")
            continue
        if not 0 <= line_index < len(content_lines):
            print(f"Skipping {file_path}:{line_number}: line number out of range")
            continue
        source_line = content_lines[line_index]
        context_line = source_line.strip()

        # Tokenize into sentences; the sentence holding the word is used
        # only as context for the code/URL/link checks below.
        sentences = sent_tokenize(context_line)
        relevant_sentence = next((s for s in sentences if original in s), context_line)

        # Fix #1: Case-sensitive correction for ignore list terms
        if original.lower() in ignore_list:
            if is_code_or_url_or_file(relevant_sentence) or is_markdown_link(relevant_sentence, original):
                corrected_word = original.lower()  # Keep lowercase in URLs, links, or file paths
            else:
                corrected_word = ignore_list[original.lower()]  # Use exact case from ignore list
        # Fix #2: Reject weak matches and default to the English dictionary
        elif should_use_ignore_list(original, suggestion, relevant_sentence):
            best_match, _ = process.extractOne(original, ignore_list.keys())
            corrected_word = ignore_list[best_match]

        # Fix #3: Replace the first whole-word occurrence in the ORIGINAL
        # line, so indentation, the other sentences on the line and the line
        # ending all survive. A callable replacement keeps backslashes in
        # the corrected word from being read as regex escapes.
        # No punctuation clean-up follows: only the word itself changes, and
        # collapsing double dots would corrupt ellipses and ../ paths.
        fixed_line = re.sub(r'\b' + re.escape(original) + r'\b', lambda _m: corrected_word, source_line, count=1)

        # Write final output
        outfile.write(f"{file_path}:{line_number}: {original} ==> {corrected_word}\n")

        # Only touch the file when the line actually changed.
        if fixed_line == source_line:
            continue
        content_lines[line_index] = fixed_line
        with open(file_path, "w", encoding="utf-8", newline="") as file:
            file.writelines(content_lines)
EOF
# Check if any files were modified (' M' = tracked file changed, unstaged)
if git status --porcelain | grep -q '^ M'; then
  echo "Corrections applied. Preparing to create PR."
else
  echo "No spelling corrections found. Exiting."
  # Ends this step's script successfully; it does not stop later steps.
  exit 0
fi
- name: Create Pull Request with Corrections
  env:
    GITHUB_TOKEN: ${{ secrets.PAT_GITHUB_ACTIONS }}
  run: |
    # Only changes to tracked files count as corrections. The spellcheck
    # report files written by the previous step are untracked; counting
    # them would make this check always true and get them committed.
    if [ -z "$(git status --porcelain --untracked-files=no)" ]; then
      echo "No changes detected. Skipping PR creation."
      exit 0
    fi

    git config --global user.name "github-actions[bot]"
    git config --global user.email "github-actions[bot]@users.noreply.github.com"

    BRANCH_NAME="spellcheck-fixes-$(date +%s)"
    git checkout -b "$BRANCH_NAME"

    # Stage modifications to tracked files only (leaves report files out).
    git add -u
    git commit -m "Spellcheck: Automatically fixed detected misspellings"
    git push origin "$BRANCH_NAME"

    # Create PR using GitHub CLI
    gh pr create \
      --base main \
      --head "$BRANCH_NAME" \
      --title "Spellcheck Fixes" \
      --body "This PR contains automatically applied spelling corrections."