#!/usr/bin/env python3

import argparse
import glob
import logging
import os
import re
import sys
from urllib.parse import urlparse, parse_qs

import requests
import yaml
from bs4 import BeautifulSoup


def getUrlsFromText(text) -> set:
    """Returns URLs found in a text

    Taken from https://www.geeksforgeeks.org/python-check-url-string/

    Parameters:
        text (string): The text to search for URLs

    Returns:
        urls (set): A set of URLs that have been found in the text
    """
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    result = re.findall(regex, text)
    urls = {x[0] for x in result}
    return urls
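
# Illustrative example (the URL below is hypothetical, not from the original script):
#   getUrlsFromText("See https://documentation.suse.com/sbp/all/single-html/example/#section for details.")
#   -> {'https://documentation.suse.com/sbp/all/single-html/example/#section'}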


def checkUrl(url: str) -> tuple:
    """Checks a URL for validity

    Parameters:
        url (string): The URL to check

    Returns:
        (errors, warnings): lists of error and warning message strings
    """
    warnings = []
    errors = []
    parts = urlparse(url)

    # Formal checks first
    if parts.query:
        query = parse_qs(parts.query)
        if parts.hostname == "cloud.google.com":
            if 'hl' in query:
                errors.append(f"Don't request a specific document language (?hl={query['hl'][0]}) in {url}")
    if parts.hostname == "documentation.suse.com":
        # We should/must use the single-html version for SUSE documentation.
        # Otherwise links might break if the document structure changes.
        if "/single-html/" not in parts.path:
            errors.append(f"Use the single-html version of the document instead! {url}")
        if not parts.fragment:
            errors.append(f"no fragment used in {url}")

    # After the formal checks, try to actually access the URL
    try:
        response = requests.get(url)
    except requests.ConnectionError:
        errors.append(f"Connection failed for {url}")
        return (errors, warnings)
    if not response.ok:
        errors.append(f"status {response.status_code} returned for {url}")
        return (errors, warnings)

    # If a URL gets redirected, we might want to
    # change the documentation link to the redirection target.
    if response.url != url:
        warnings.append(f"got redirected to {response.url} from {url}")

    # Parse the document
    htmlDocument = response.text
    soup = BeautifulSoup(htmlDocument, 'html.parser')

    # If the URL has a fragment, look for it in the document
    if parts.fragment:
        htmlElements = soup.find_all(id=parts.fragment)
        if len(htmlElements) == 0:
            # Not an id, try the name attribute instead
            # (find_all(name=...) would match the tag name, not the attribute)
            htmlElements = soup.find_all(attrs={'name': parts.fragment})
        if len(htmlElements) == 0:
            errors.append(f"fragment \"{parts.fragment}\" not found in {url}")
            return (errors, warnings)

    # Get the canonical link of the document.
    # Documents sometimes contain multiple canonical links; they have never
    # been observed to differ, so only the first one is considered.
    canonical_links = soup.select('head link[rel*=canonical]')
    # canonical_links might be empty
    if len(canonical_links) > 0 and canonical_links[0].has_attr('href'):
        canonical_url = canonical_links[0]['href']
        # For SUSE documentation, remove the index.html suffix
        # (probably works for everyone else too)
        if parts.hostname == "documentation.suse.com":
            # Python 3.9 and newer: canonical_url.removesuffix('index.html')
            if canonical_url.endswith('index.html'):
                canonical_url = canonical_url[:-len('index.html')]
        # Add the fragment to the canonical URL, if any
        if parts.fragment:
            canonical_url += "#" + parts.fragment
        # Warn if the URL used is not the canonical one
        if canonical_url != url:
            if parts.hostname == "documentation.suse.com":
                errors.append(f"Not the canonical URL! Use {canonical_url} instead of {url}")
            else:
                warnings.append(f"Not the canonical URL! Use {canonical_url} instead of {url}")
    return (errors, warnings)
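
# Illustrative usage (hypothetical URL; note this performs a live HTTP request):
#   errors, warnings = checkUrl("https://documentation.suse.com/sbp/all/single-html/example/#section")
#   Both lists contain human-readable messages; an empty errors list means the URL passed.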


def checkGetDocuUrls(check: dict) -> int:
    """Prints the documentation URLs found in a check's remediation text

    Parameters:
        check (dict): The check to extract URLs from

    Returns:
        0 (URLs are only printed, never counted as errors)
    """
    if 'remediation' not in check:
        logging.info(f"Check {check['id']} has no remediation section, skipping check")
        return 0
    urls = getUrlsFromText(check['remediation'])
    logging.debug(f"{len(urls)} URLs found")
    for url in urls:
        print(f"{check['id']}: {url}")
    return 0


def checkCheckUrls(check: dict) -> int:
    """Checks the URLs in a check

    Parameters:
        check (dict): The check whose URLs should be checked

    Returns:
        errorcount (int): the number of errors encountered, if any
    """
    urlErrors = 0
    # Check URLs in remediation text
    if 'remediation' not in check:
        logging.info(f"Check {check['id']} has no remediation section, skipping check")
        return 0
    urls = getUrlsFromText(check['remediation'])
    logging.debug(f"{len(urls)} URLs found")
    for url in urls:
        errors, warnings = checkUrl(url)
        for warning in warnings:
            logging.warning(f"check {check['id']}: {warning}")
        for error in errors:
            logging.error(f"check {check['id']}: {error}")
        urlErrors += len(errors)
    return urlErrors
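
# With the log format configured in main(), findings surface like this
# (illustrative output, using a hypothetical check id and URL):
#   ERROR: check ABC123: no fragment used in https://documentation.suse.com/sbp/all/single-html/example/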


def checkCheck(check: dict) -> int:
    """Checks a check

    Parameters:
        check (dict): The check to check

    Returns:
        errorcount (int): the number of errors encountered, if any
    """
    checkErrors = 0
    logging.info(f"checking check {check['id']}")
    checkErrors += checkCheckUrls(check)
    return checkErrors


def getChecksfromCheckFile(filename) -> list:
    """Parses a checks file and returns a list of the contained check(s) as dicts

    Parameters:
        filename (string): name of the checks file to parse

    Returns:
        checks (list): checks contained in the parsed file (list of dicts)
    """
    logging.debug(f"processing file {filename}")
    checks = []
    try:
        with open(filename, 'r') as file:
            # safe_load_all yields one object per YAML document in the file
            for check in yaml.safe_load_all(file):
                if not isinstance(check, dict):
                    logging.debug("Document is not a YAML mapping, skipping")
                    continue
                if 'id' not in check:
                    logging.debug("Check has no id property, skipping")
                    continue
                checks.append(check)
    except FileNotFoundError:
        logging.debug(f"file \"{filename}\" not found, skipping")
    except (yaml.scanner.ScannerError, yaml.parser.ParserError):
        logging.debug(f"file \"{filename}\" not a valid YAML file, skipping")
    logging.debug(f"found {len(checks)} checks in file \"{filename}\"")
    return checks
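
# Minimal sketch of the check file shape this script expects
# (id and contents are hypothetical; only 'id' and 'remediation' are used here):
#   id: ABC123
#   remediation: |
#     Follow the steps described in the linked documentation.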


def main():
    # Parse arguments
    parser = argparse.ArgumentParser(description='Check Wanda checks')
    parser.add_argument(dest='filenames', metavar='filename', nargs='+', help='checks file')
    parser.add_argument('-l', '--log-level', dest='loglevel', default='WARNING', action='store', help='set log level')
    parser.add_argument('-m', '--match', dest='match', default='', action='store', help='only process files matching this glob')
    parser.add_argument('-d', '--doc-urls', dest='doc_urls', default=False, action='store_true', help='only print documentation URLs')
    args = parser.parse_args()
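
    # Illustrative invocations (file paths are assumptions, not part of this repository):
    #   ./checks-checker checks/*.yaml
    #   ./checks-checker -l DEBUG -m 'checks/*.yaml' 'checks/**'
    #   ./checks-checker --doc-urls checks/*.yaml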
    # Set up logging
    logger = logging.getLogger()
    logFormatter = logging.Formatter(fmt='%(levelname)s: %(message)s')
    logHandler = logging.StreamHandler()
    logHandler.setFormatter(logFormatter)
    logger.addHandler(logHandler)
    try:
        logger.setLevel(args.loglevel)
    except ValueError:
        logging.critical(f"Invalid log level: {args.loglevel} (valid levels: DEBUG, INFO, WARNING, ERROR)")
        sys.exit(255)

    allErrors = 0
    # Check all files given as parameters
    for pattern in args.filenames:
        for filename in glob.iglob(pattern, recursive=True):
            if args.match != '' and filename not in glob.iglob(args.match, recursive=True):
                logging.debug(f"file {filename} does not match glob {args.match}, skipping")
                continue
            if os.path.isdir(filename):
                logging.debug(f"file {filename} is a directory, skipping")
                continue
            try:
                checks = getChecksfromCheckFile(filename)
                for check in checks:
                    if args.doc_urls:
                        allErrors += checkGetDocuUrls(check)
                    else:
                        allErrors += checkCheck(check)
            except IsADirectoryError:
                pass
    logging.shutdown()
    # Exit with 1 if any errors occurred, or 0 if everything was OK.
    # Exiting non-zero on errors is currently disabled:
    # sys.exit(int(allErrors > 0))
    sys.exit(0)


# ---------------------------------
if __name__ == '__main__':
    main()