Skip to content

Commit

Permalink
Tools: Consolidated validators to .tools/validation (#5597)
Browse files Browse the repository at this point in the history
* Consolidated validators to .tools/validation, and reworked file walker
* Better snippet validation workflow name
* bump to python 3.11
* Commented validators
* Cleaned up IGNORE_FILES checks between walk... and git...
  • Loading branch information
DavidSouther authored and ford-at-aws committed Dec 15, 2023
1 parent 02453be commit a7331a7
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 95 deletions.
20 changes: 2 additions & 18 deletions .github/workflows/validate-doc-metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,5 @@ jobs:
validate-doc-metadata:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
persist-credentials: false
fetch-depth: 0
- name: Set up Python 3.9.x
uses: actions/setup-python@v3
with:
python-version: "3.9.x"
- name: Install dependencies
run: |
python -m pip install yamale
python -m pip install yamllint
- name: Lint with yamllint
run: yamllint --format standard -c .tools/validation/.yamllint.yaml .doc_gen/metadata
- name: Lint with yamllint
run: yamllint --format standard -c .tools/validation/.yamllint.yaml .doc_gen/metadata/curated
- name: Validate metadata with yamale
run: python .tools/validation/validate_doc_metadata.py --doc-gen .doc_gen
- name: Pass
run: true
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Run Python checkin script on push or pull request
name: Miscellaneous pre-validation checks
name: Snippet & Repo Validation Checks

# Controls when the action will run.
on:
Expand All @@ -26,7 +26,13 @@ jobs:
- name: setup python
uses: actions/setup-python@v3
with:
python-version: 3.8 #install the python needed
- name: Run Python script to vet code examples # Runs a single command using the runners shell
run: |
python3 .tools/pre_validate/pre_validate.py --q
python-version: 3.11 #install the python needed
- name: Install dependencies
run: >-
python3 -m pip install -r .tools/base_requirements.txt
- name: Lint metadata files
run: >-
yamllint --format standard -c .tools/validation/.yamllint.yaml .doc_gen/metadata .doc_gen/metadata/curated
- name: Validate metadata and repo
run: >-
python3 .tools/validation/validate.py --q --doc-gen .doc_gen
25 changes: 13 additions & 12 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
.venv
build_dir/
target/
*.exe
*.swp
.*~
.DS_Store
.metadata
.recommenders
._*
vendor/
*.exe
.idea
.vscode
venv
.*~
xcuserdata
.metadata
.phpunit.result.cache
.recommenders
.swiftpm
.venv
.vscode
Package.resolved
build_dir
node_modules
super-linter.log
.phpunit.result.cache
target
vendor
venv
xcuserdata
1 change: 1 addition & 0 deletions .tools/base_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
black==23.9.1
flake8==6.1.0
mypy-extensions==1.0.0
pathspec==0.11.2
PyYAML==6.0.1
requests==2.31.0
types-PyYAML==6.0.12.12
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def _is_valid(self, value):

def validate_files(schema_name: Path, meta_names: Iterable[Path], validators):
"""Iterate a list of files and validate each one against a schema."""
success = True
errors = 0

schema = yamale.make_schema(schema_name, validators=validators)
for meta_name in meta_names:
Expand All @@ -163,11 +163,11 @@ def validate_files(schema_name: Path, meta_names: Iterable[Path], validators):
print(f"{meta_name.resolve()} validation success! 👍")
except YamaleError as e:
print(e.message)
success = False
return success
errors += 1
return errors


def validate_all(doc_gen: Path):
def validate_metadata(doc_gen: Path):
# with open(doc_gen / "metadata" / "sdks.yaml") as sdks_file:
# sdks_yaml: dict[str, any] = yaml.safe_load(sdks_file)

Expand Down Expand Up @@ -203,13 +203,13 @@ def validate_all(doc_gen: Path):
("curated_sources_schema.yaml", "curated/sources.yaml"),
("curated_example_schema.yaml", "curated/*_metadata.yaml"),
]
success = True
errors = 0
for schema, metadata in to_validate:
success &= validate_files(
errors += validate_files(
schema_root / schema, (doc_gen / "metadata").glob(metadata), validators
)

return success
return errors


def main():
Expand All @@ -222,9 +222,9 @@ def main():
)
args = parser.parse_args()

success = validate_all(Path(args.doc_gen))
errors = validate_metadata(Path(args.doc_gen))

if success:
if errors == 0:
print("Validation succeeded! 👍👍👍")
else:
print("\n********************************************")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Expand All @@ -22,6 +23,9 @@
import argparse
import logging
import sys
from pathlib import Path
from pathspec import GitIgnoreSpec
from typing import Generator
from words import WORDS

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -57,22 +61,6 @@
# action in a clean environment (aside from testing),
# exhaustive ignore lists shouldn't be necessary.

# Folders to skip.
IGNORE_FOLDERS = {
".doc_gen",
".pytest_cache",
".tools",
".venv",
"__pycache__",
"bin",
"cdk.out",
"dist",
"node_modules",
"obj",
"target",
"venv",
}

# Files to skip.
IGNORE_FILES = {
".moviedata.json",
Expand Down Expand Up @@ -208,53 +196,88 @@
}


def match_path_to_specs(path: Path, specs: list[GitIgnoreSpec]) -> bool:
    """
    Return True if this path should be skipped, i.e. it is matched by at
    least one of the .gitignore specs collected so far.

    :param path: The file or directory path to test.
    :param specs: GitIgnoreSpec objects accumulated from ancestor folders.
    :return: True when any spec matches the path.
    """
    # any() short-circuits exactly like the original explicit loop did.
    return any(spec.match_file(path) for spec in specs)


def walk_with_gitignore(
    root: Path, specs: list[GitIgnoreSpec] | None = None
) -> Generator[Path, None, None]:
    """
    Starting from a root directory, walk the file system yielding a path for each file.
    However, it also reads `.gitignore` files, so that it behaves like `git ls-files`.
    It does not actively use `git ls-files` because it wouldn't catch new files without
    fiddling with a number of flags.

    :param root: The directory to walk.
    :param specs: GitIgnoreSpec objects from ancestor directories; callers
        normally omit this — it is threaded through the recursive calls.
    :yield: The Path of each file not excluded by a .gitignore.
    """
    # None sentinel instead of a mutable `[]` default (shared-state pitfall).
    if specs is None:
        specs = []
    gitignore_path = root / ".gitignore"
    if gitignore_path.exists():
        # Distinct names for the path and the open handle (the original
        # reused `gitignore` for both, shadowing the Path).
        with open(gitignore_path, "r", encoding="utf-8") as ignore_file:
            # Build a *new* list so sibling directories do not inherit
            # this directory's ignore rules.
            specs = [*specs, GitIgnoreSpec.from_lines(ignore_file.readlines())]
    for entry in os.scandir(root):
        # entry.path is a str; GitIgnoreSpec.match_file accepts str paths.
        if not match_path_to_specs(entry.path, specs):
            path = Path(entry.path)
            if entry.is_dir():
                yield from walk_with_gitignore(path, specs)
            else:
                yield path


def get_files(root: Path) -> Generator[Path, None, None]:
    """
    Yield non-skipped files, that is, anything not matching git ls-files and not
    in the "to skip" files that are in git but are machine generated, so we don't
    want to validate them.
    """
    for candidate in walk_with_gitignore(root):
        # Path.suffix includes the leading dot; strip it to match EXT_LOOKUP keys.
        extension = candidate.suffix.lstrip(".").lower()
        if extension in EXT_LOOKUP and candidate.name not in IGNORE_FILES:
            yield candidate


def check_files(root: Path) -> int:
    """
    Walk a folder system, scanning all files with specified extensions.
    Errors are logged and counted and the count of errors is returned.

    :param root: The root folder to start the walk.
    :return: The number of errors found in the scanned files.
    """
    file_count = 0
    error_count = 0
    for file_path in get_files(root):
        file_count += 1
        logger.info("\nChecking File: %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            file_contents = f.read()

        error_count += verify_no_deny_list_words(file_contents, file_path)
        error_count += verify_no_secret_keys(file_contents, file_path)
        # Scan the file *name* as well as the contents — the second call was
        # accidentally duplicated with file_contents, dropping this check.
        error_count += verify_no_secret_keys(file_path.name, file_path)
        error_count += verify_snippet_start_end(file_contents, file_path)

    print(f"{file_count} files scanned in {root}.\n")
    return error_count


def verify_no_deny_list_words(file_contents: str, file_location: Path) -> int:
    """
    Verify no words in the file are in the list of denied words.

    :param file_contents: The full text of the file being scanned.
    :param file_location: The file's path, used only for error reporting.
    :return: The number of denied words found.
    """
    error_count = 0
    for word in file_contents.split():
        if word.lower() in DENY_LIST:
            # Plain %-style lazy formatting — the spurious f-prefix on a
            # %-placeholder string is removed.
            logger.error("Word '%s' in %s is not allowed.", word, file_location)
            error_count += 1
    return error_count


def verify_sample_files(root_path):
def verify_sample_files(root_path: Path) -> int:
"""Verify sample files meet the requirements and have not moved."""
sample_files_folder = os.path.join(root_path, "resources/sample_files")
media_folder = ".sample_media"
Expand All @@ -269,23 +292,23 @@ def verify_sample_files(root_path):
ext = os.path.splitext(file_name)[1].lstrip(".")
if file_name not in EXPECTED_SAMPLE_FILES:
logger.error(
f"File '%s' in %s was not found in the list of expected sample files. If this is a new sample file, add it to the EXPECTED_SAMPLE_FILES list in pre_validate.py.",
"File '%s' in %s was not found in the list of expected sample files. If this is a new sample file, add it to the EXPECTED_SAMPLE_FILES list in pre_validate.py.",
file_name,
sample_files_folder,
)
error_count += 1
if ext.lower() in MEDIA_FILE_TYPES:
if media_folder not in file_path:
logger.error(
f"File '%s' in %s must be in the %s directory.",
"File '%s' in %s must be in the %s directory.",
file_name,
sample_files_folder,
media_folder,
)
error_count += 1
if (os.path.getsize(file_path) / ONE_MB_AS_BYTES) > MAX_FILE_SIZE_MB:
logger.error(
f"File '%s' in %s is larger than the allowed size for a sample file.",
"File '%s' in %s is larger than the allowed size for a sample file.",
file_name,
sample_files_folder,
)
Expand All @@ -294,15 +317,16 @@ def verify_sample_files(root_path):
for sample_file in EXPECTED_SAMPLE_FILES:
if sample_file not in file_list:
logger.error(
f"Expected sample file '%s' was not found in '%s'. If this file was intentionally removed, remove it from the EXPECTED_SAMPLE_FILES list in pre_validate.py.",
"Expected sample file '%s' was not found in '%s'. If this file was intentionally removed, remove it from the EXPECTED_SAMPLE_FILES list in pre_validate.py.",
sample_file,
sample_files_folder,
)
error_count += 1

return error_count


def verify_no_secret_keys(file_contents, file_location):
def verify_no_secret_keys(file_contents: str, file_location: Path) -> int:
"""Verify the file does not contain 20- or 40- length character strings,
which might be secret keys. Allow strings in the allowlist in
https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/.github/pre_validate/pre_validate.py.
Expand Down Expand Up @@ -339,7 +363,7 @@ def verify_no_secret_keys(file_contents, file_location):
return error_count


def verify_snippet_start_end(file_contents, file_location):
def verify_snippet_start_end(file_contents: str, file_location: Path) -> int:
"""Scan the file contents for snippet-start and snippet-end tags and verify
that they are in matched pairs. Log errors and return the count of errors."""
error_count = 0
Expand All @@ -350,7 +374,7 @@ def verify_snippet_start_end(file_contents, file_location):
if snippet_start in word:
tag = word.split("[")[1]
if tag in snippet_tags:
logger.error(f"Duplicate tag {tag[:-1]} found in {file_location}.")
logger.error("Duplicate tag %s found in %s.", tag[:-1], file_location)
error_count += 1
else:
snippet_tags.add(tag)
Expand All @@ -360,8 +384,9 @@ def verify_snippet_start_end(file_contents, file_location):
snippet_tags.remove(tag)
else:
logger.error(
f"End tag {tag[:-1]} with no matching start tag "
f"found in {file_location}."
"End tag %s with no matching start tag " "found in %s.",
tag[:-1],
file_location,
)
error_count += 1

Expand Down Expand Up @@ -391,7 +416,9 @@ def main():
)
args = parser.parse_args()

root_path = os.path.abspath(".") if not args.root else os.path.abspath(args.root)
root_path = Path(
os.path.abspath(".") if not args.root else os.path.abspath(args.root)
)

print("----------\n\nRun Tests\n")
error_count = check_files(root_path, args.quiet)
Expand Down
File renamed without changes.
Loading

0 comments on commit a7331a7

Please sign in to comment.