diff --git a/prospector/client/cli/prospector_client.py b/prospector/client/cli/prospector_client.py
index c45f8b67f..08986ebf3 100644
--- a/prospector/client/cli/prospector_client.py
+++ b/prospector/client/cli/prospector_client.py
@@ -8,7 +8,7 @@ import log
 from client.cli.console import ConsoleWriter, MessageStatus
-from datamodel.advisory import AdvisoryRecord
+from datamodel.advisory import AdvisoryRecord, build_advisory_record
 from datamodel.commit import Commit, apply_ranking, make_from_raw_commit
 from filtering.filter import filter_commits
 from git.git import GIT_CACHE, Git
@@ -26,6 +26,7 @@
 _logger = init_local_logger()
 
+
 SECS_PER_DAY = 86400
 TIME_LIMIT_BEFORE = 3 * 365 * SECS_PER_DAY
 TIME_LIMIT_AFTER = 180 * SECS_PER_DAY
@@ -73,6 +74,7 @@ def prospector(  # noqa: C901
         publication_date,
         advisory_keywords,
         modified_files,
+        filter_extensions,
     )
 
     with ConsoleWriter("Obtaining initial set of candidates") as writer:
@@ -248,49 +250,7 @@ def save_preprocessed_commits(backend_address, payload):
     )
 
 
-def build_advisory_record(
-    vulnerability_id,
-    repository_url,
-    vuln_descr,
-    nvd_rest_endpoint,
-    fetch_references,
-    use_nvd,
-    publication_date,
-    advisory_keywords,
-    modified_files,
-) -> AdvisoryRecord:
-
-    advisory_record = AdvisoryRecord(
-        vulnerability_id=vulnerability_id,
-        repository_url=repository_url,
-        description=vuln_descr,
-        from_nvd=use_nvd,
-        nvd_rest_endpoint=nvd_rest_endpoint,
-    )
-
-    _logger.pretty_log(advisory_record)
-    advisory_record.analyze(use_nvd=use_nvd, fetch_references=fetch_references)
-    _logger.debug(f"{advisory_record.keywords=}")
-
-    if publication_date != "":
-        advisory_record.published_timestamp = int(
-            datetime.fromisoformat(publication_date).timestamp()
-        )
-
-    if len(advisory_keywords) > 0:
-        advisory_record.keywords += tuple(advisory_keywords)
-        # drop duplicates
-        advisory_record.keywords = list(set(advisory_record.keywords))
-
-    if len(modified_files) > 0:
-        advisory_record.paths += modified_files
-
-    _logger.debug(f"{advisory_record.keywords=}")
-    _logger.debug(f"{advisory_record.paths=}")
-
-    return advisory_record
-
-
+# TODO: cleanup; many of these parameters should be recovered from the advisory record object
 def get_candidates(
     advisory_record,
     repository,
diff --git a/prospector/client/cli/prospector_client_test.py b/prospector/client/cli/prospector_client_test.py
index a8544e9fe..2c4c4b0b8 100644
--- a/prospector/client/cli/prospector_client_test.py
+++ b/prospector/client/cli/prospector_client_test.py
@@ -1,6 +1,7 @@
 import pytest
 
 from api import DB_CONNECT_STRING
+from client.cli.prospector_client import build_advisory_record
 from commitdb.postgres import PostgresCommitDB
 from stats.execution import execution_statistics
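A note on the test import just above: client/cli/prospector_client_test.py keeps importing build_advisory_record from client.cli.prospector_client even though the function body now lives in datamodel/advisory.py (next file). This resolves only because prospector_client.py itself imports the name, effectively re-exporting it. A minimal sketch of the pattern, using the module names from this diff:

    # client/cli/prospector_client.py (after this change)
    from datamodel.advisory import AdvisoryRecord, build_advisory_record  # re-exported

    # client/cli/prospector_client_test.py -- still resolves, via the re-export above
    from client.cli.prospector_client import build_advisory_record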
diff --git a/prospector/datamodel/advisory.py b/prospector/datamodel/advisory.py
index a5f295172..abffb42c0 100644
--- a/prospector/datamodel/advisory.py
+++ b/prospector/datamodel/advisory.py
@@ -13,7 +13,7 @@
 from util.http import fetch_url
 
 from .nlp import (
-    extract_affected_files_paths,
+    extract_affected_filenames,
     extract_products,
     extract_special_terms,
     extract_versions,
@@ -79,7 +79,9 @@ class AdvisoryRecord(BaseModel):
     #     self.from_nvd = from_nvd
     #     self.nvd_rest_endpoint = nvd_rest_endpoint
 
-    def analyze(self, use_nvd: bool = False, fetch_references=False):
+    def analyze(
+        self, use_nvd: bool = False, fetch_references=False, relevant_extensions=[]
+    ):
         self.from_nvd = use_nvd
 
         if self.from_nvd:
@@ -93,7 +95,8 @@ def analyze(self, use_nvd: bool = False, fetch_references=False):
         # TODO: if an exact file is found when applying the rules, the relevance must be updated, I think
         self.paths = union_of(
-            self.paths, extract_affected_files_paths(self.description)
+            self.paths,
+            extract_affected_filenames(self.description, relevant_extensions),
         )
         self.keywords = union_of(self.keywords, extract_special_terms(self.description))
 
         _logger.debug("References: " + str(self.references))
@@ -187,6 +190,57 @@ def get_from_nvd(self, vuln_id: str, nvd_rest_endpoint: str = NVD_REST_ENDPOINT)
         return False
 
 
+# Moved here since it is basically a factory method
+def build_advisory_record(
+    vulnerability_id,
+    repository_url,
+    vuln_descr,
+    nvd_rest_endpoint,
+    fetch_references,
+    use_nvd,
+    publication_date,
+    advisory_keywords,
+    modified_files,
+    filter_extensions,
+) -> AdvisoryRecord:
+
+    advisory_record = AdvisoryRecord(
+        vulnerability_id=vulnerability_id,
+        repository_url=repository_url,
+        description=vuln_descr,
+        from_nvd=use_nvd,
+        nvd_rest_endpoint=nvd_rest_endpoint,
+    )
+
+    _logger.pretty_log(advisory_record)
+    advisory_record.analyze(
+        use_nvd=use_nvd,
+        fetch_references=fetch_references,
+        # relevant_extensions=filter_extensions.split(".")[
+        #     1
+        # ],  # the "*." prefix is added early in main and is needed several times by the git module, so we leave it there
+    )
+    _logger.debug(f"{advisory_record.keywords=}")
+
+    if publication_date != "":
+        advisory_record.published_timestamp = int(
+            datetime.fromisoformat(publication_date).timestamp()
+        )
+
+    if len(advisory_keywords) > 0:
+        advisory_record.keywords += tuple(advisory_keywords)
+        # drop duplicates
+        advisory_record.keywords = list(set(advisory_record.keywords))
+
+    if len(modified_files) > 0:
+        advisory_record.paths += modified_files
+
+    _logger.debug(f"{advisory_record.keywords=}")
+    _logger.debug(f"{advisory_record.paths=}")
+
+    return advisory_record
+
+
 # might be used in the future
 # @dataclass
 # class Reference:
diff --git a/prospector/datamodel/advisory_test.py b/prospector/datamodel/advisory_test.py
index da3a8ab27..764b054ca 100644
--- a/prospector/datamodel/advisory_test.py
+++ b/prospector/datamodel/advisory_test.py
@@ -1,5 +1,8 @@
 # from dataclasses import asdict
-from datamodel.advisory import AdvisoryRecord
+import pytest
+
+from datamodel.advisory import AdvisoryRecord, build_advisory_record
+from .nlp import RELEVANT_EXTENSIONS
 
 # import pytest
 
@@ -89,6 +92,56 @@ def test_adv_record_keywords():
     )
 
 
+def test_build():
+    record = build_advisory_record(
+        "CVE-2014-0050", "", "", "", "", True, "", "", "", "*.java"
+    )
+    assert "MultipartStream" in record.paths
+    assert record.vulnerability_id == "CVE-2014-0050"
+
+
+@pytest.mark.skip(reason="Slow connections make it fail")
+def test_filenames_extraction():
+    cve = {
+        "CVE-2014-0050": "MultipartStream",
+        "CVE-2021-22696": "JwtRequestCodeFilter",  # Should match JwtRequestCodeFilter
+        "CVE-2021-27582": "OAuthConfirmationController",
+        "CVE-2021-29425": "FileNameUtils",
+        "CVE-2021-30468": "JsonMapObjectReaderWriter",
+    }
+
+    result1 = build_advisory_record(
+        "CVE-2014-0050", "", "", "", "", True, "", "", "", ""
+    )
+    result2 = build_advisory_record(
+        "CVE-2021-22696", "", "", "", "", True, "", "", "", ""
+    )
+    result3 = build_advisory_record(
+        "CVE-2021-27582", "", "", "", "", True, "", "", "", ""
+    )
+    result4 = build_advisory_record(
+        "CVE-2021-29425", "", "", "", "", True, "", "", "", ""
+    )
+    result5 = build_advisory_record(
+        "CVE-2021-30468", "", "", "", "", True, "", "", "", ""
+    )
+    assert sorted(result1.paths) == sorted(
+        ["MultipartStream", "FileUpload"]
+    )  # "Content-Type" is intentionally not extracted
+    assert sorted(result2.paths) == sorted(["JwtRequestCodeFilter", "request_uri"])
+    assert sorted(result3.paths) == sorted(
+        [
+            "OAuthConfirmationController",
+            "@ModelAttribute",
+            "authorizationRequest",
+        ]
+    )
+    assert sorted(result4.paths) == sorted(["FileNameUtils"])
+    assert sorted(result5.paths) == sorted(["JsonMapObjectReaderWriter"])
+
+    # raise Exception("Test failed")
+
+
 # def test_adv_record_project_data():
 #     record = AdvisoryRecord(vulnerability_id="CVE-XXXX-YYYY", description=ADVISORY_TEXT_2)
 #     record.analyze()
diff --git a/prospector/datamodel/nlp.py b/prospector/datamodel/nlp.py
index 0bf5f65a1..31484ad86 100644
--- a/prospector/datamodel/nlp.py
+++ b/prospector/datamodel/nlp.py
@@ -45,63 +45,88 @@ def extract_products(text: str) -> List[str]:
     return [p for p in result if len(p) > 2]
 
 
-def extract_affected_files_paths(text: str, strict_extensions: bool = False):
-    words = text.split()
-    words = [
-        word.strip("_,.:;-+!?)]}'\"") for word in words
-    ]  # removing common punctuation marks
-    paths = []
-    for word in words:
-        is_xml_tag = word.startswith("<")
-        is_property = word.endswith("=")
-        is_unusual = check_unusual_stuff(word)
-        not_relevant = is_xml_tag or is_property or is_unusual
-
-        if not_relevant:
-            continue
-
-        if check_if_path(word):
-            paths.append(word)
-
-        if check_if_file(word):
-            paths.append(word.split(".")[0].split("::")[0])
-
-    return paths
-
-
-def check_unusual_stuff(text: str) -> bool:
-    return '"' in text or "," in text
-
-
-def check_if_path(text: str) -> bool:
-    return "/" in text or "\\" in text
-
-
-# TODO: look if there are others
-def check_if_file(text: str) -> bool:
-    file = text.split(".")
-    if len(file) == 1:
-        file = file[0].split("::")
-
-    flag = False
-    # Check if there is an extension
-    if file[-1] in RELEVANT_EXTENSIONS:
-        return True
-
-    # Common name pattern for files or methods with underscores
-    if "_" in file[0] or "_" in file[-1]:
-        return True
-
-    # Common way to refer to methods inside a class (e.g. Class.method, Class::method)
-    if ("." in text or "::" in text) and file[0].isalpha():
-        return True
-    # Common name for files or methods with an uppercase letter in the middle
-    if bool(re.match(r"(?=.*[a-z])(?=.*[A-Z])", file[0][1:])) or bool(
-        re.match(r"(?=.*[a-z])(?=.*[A-Z])", file[-1][1:])
-    ):
-        return True
-
-    return flag
+def extract_affected_filenames(
+    text: str, extensions: List[str] = RELEVANT_EXTENSIONS
+) -> List[str]:
+    paths = set()
+    for word in text.split():
+        res = word.strip("_,.:;-+!?()]}'\"")
+        res = extract_filename_from_path(res)
+        res = check_file_class_method_names(res, extensions)
+        if res:
+            paths.add(res)
+
+    return list(paths)
+
+
+# TODO: enhance this with extensions
+# If it looks like a path to a file, try to get "filename.extension" or just "filename"
+def extract_filename_from_path(text: str) -> str:
+    # Pattern //path//to//file or \\path\\to\\file, extract file
+    # res = re.search(r"^(?:(?:\/{,2}|\\{,2})([\w\-\.]+))+$", text)
+    # if res:
+    #     return res.group(1), True
+    # # Else simply return the text
+    # return text, False
+    res = text.split("/")
+
+    return res[-1]  # , len(res) > 1
+
+
+def check_file_class_method_names(text: str, relevant_extensions: List[str]) -> str:
+    # Covers cases like "file.extension" if the extension is relevant;
+    # the extensions come from the CLI parameter
+    extensions_regex = r"^([\w\-]+)\.({})?$".format("|".join(relevant_extensions))
+    res = re.search(extensions_regex, text)
+    if res:
+        return res.group(1)
+
+    # Covers cases like "class::method" and "class.method"
+    res = re.search(r"^(\w+)(?:\.|:{2})(\w+)$", text)  # ^(\w{2,})(?:\.|:{2})(\w{2,})$
+    # Check that the first group is not a number (e.g. a version such as "3.4.3")
+    if res and not bool(re.match(r"^\d+$", res.group(1))):
+        return res.group(1)
+
+    # Covers cases like "className" or "class_name" (plain string with an underscore);
+    # this may yield false positives, but such tokens are often code-related
+    if bool(re.search(r"[a-z]{2}[A-Z]+[a-z]{2}", text)) or "_" in text:
+        return text
+
+    return None
+
+
+# def check_unusual_stuff(text: str) -> bool:
+#     return '"' in text or "," in text
+
+
+# def check_if_path(text: str) -> bool:
+#     return "/" in text or "\\" in text
+
+
+# # TODO: look if there are others
+# def check_if_file(text: str) -> str:
+#     file = text.split(".")
+#     if len(file) == 1:
+#         file = file[0].split("::")
+
+#     flag = False
+#     # Is a filename with extension
+#     # TODO: use the dynamic extension list from the --filter-extensions CLI option to reduce computation
+#     if file[-1] in RELEVANT_EXTENSIONS:
+#         return file[0]
+
+#     # Common name pattern for files or methods with underscores
+#     if "_" in file[0] or "_" in file[-1]:
+#         return True
+
+#     # Contains "." or "::" so it can be a Class.Method (Class::Method), letters only
+#     if ("." in text or "::" in text) and file[0].isalpha():
+#         return True
+#     # Contains UPPERCASE and lowercase letters, excluding the first and last characters
+#     if bool(re.match(r"(?=.*[a-z])(?=.*[A-Z])", file[0][1:-1])) or bool(
+#         re.match(r"(?=.*[a-z])(?=.*[A-Z])", file[-1][1:-1])
+#     ):
+#         return True
+
+#     return flag
 
 
 def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
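Two observations on check_file_class_method_names, with small sketches (plain Python, outside the diff).

First, the extensions parameter must be a list of bare extensions: the function builds its regex with "|".join(relevant_extensions), so passing a plain string such as "java" would yield the alternation "j|a|v|a" — which is exactly what the commented-out filter_extensions.split(".")[1] in build_advisory_record above would produce. A list-valued conversion could look like this (split_filter_extensions is a hypothetical helper, not part of this diff; it assumes a comma-separated CLI value):

    from typing import List


    def split_filter_extensions(filter_extensions: str) -> List[str]:
        # "*.java,*.py" -> ["java", "py"]: drop the "*." glob prefix, keep the bare extension
        return [p.split(".")[-1] for p in filter_extensions.split(",") if p]


    assert split_filter_extensions("*.java") == ["java"]
    assert split_filter_extensions("*.java,*.py") == ["java", "py"]

Second, the three heuristics (relevant extension, Class.method / Class::method, camelCase or snake_case) compose with the path splitting as follows; an invented advisory sentence traced through the pipeline (PathSanitizer and user_input are made-up names, and the default RELEVANT_EXTENSIONS list is assumed to contain "java", as the tests below rely on):

    from datamodel.nlp import extract_affected_filenames

    text = (
        "A flaw in org/acme/util/PathSanitizer.java allows bypass of "
        "PathSanitizer::normalize when user_input contains a crafted URI."
    )

    # "org/acme/util/PathSanitizer.java" -> last path segment "PathSanitizer.java"
    #                                       -> extension regex -> "PathSanitizer"
    # "PathSanitizer::normalize"          -> Class::method regex -> "PathSanitizer" (set dedups)
    # "user_input"                        -> contains "_" -> kept verbatim
    # "URI.", "crafted", ...              -> no heuristic matches -> discarded
    print(sorted(extract_affected_filenames(text)))  # ['PathSanitizer', 'user_input']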
diff --git a/prospector/datamodel/test_nlp.py b/prospector/datamodel/test_nlp.py
index 6d3abccc5..2a5958916 100644
--- a/prospector/datamodel/test_nlp.py
+++ b/prospector/datamodel/test_nlp.py
@@ -3,7 +3,7 @@
 from .nlp import (
     extract_cve_references,
     extract_jira_references,
-    extract_path_tokens,
+    extract_affected_filenames,
     extract_special_terms,
 )
 
@@ -43,30 +43,53 @@ def test_extract_special_terms():
 
 
 def test_adv_record_path_extraction_no_real_paths():
-    result = extract_path_tokens(ADVISORY_TEXT)
+    result = extract_affected_filenames(ADVISORY_TEXT)
 
-    assert result == ["15.26.1", '\\"Radio', "protection,\\"]
+    assert result == []
 
 
+# NOTE: ADVISORY_TEXT is still read by the test above, so it must stay defined
 ADVISORY_TEXT = """Unspecified vulnerability in Uconnect before 15.26.1, as used
     in certain Fiat Chrysler Automobiles (FCA) from 2013 to 2015 models, allows
     remote attackers in the same cellular network to control vehicle movement,
     cause human harm or physical damage, or modify dashboard settings via
     vectors related to modification of entertainment-system firmware and access
     of the CAN bus due to insufficient \\"Radio security protection,\\" as
     demonstrated on a 2014 Jeep Cherokee Limited FWD."""
 
+ADVISORY_TEXT_1 = """CXF supports (via JwtRequestCodeFilter) passing OAuth 2 parameters via a JWT token as opposed to query parameters (see: The OAuth 2.0 Authorization Framework: JWT Secured Authorization Request (JAR)). Instead of sending a JWT token as a "request" parameter, the spec also supports specifying a URI from which to retrieve a JWT token from via the "request_uri" parameter. CXF was not validating the "request_uri" parameter (apart from ensuring it uses "https) and was making a REST request to the parameter in the request to retrieve a token. This means that CXF was vulnerable to DDos attacks on the authorization server, as specified in section 10.4.1 of the spec. This issue affects Apache CXF versions prior to 3.4.3; Apache CXF versions prior to 3.3.10."""
 
-ADVISORY_TEXT_2 = """In Apache Commons IO before 2.7, When invoking the method FileNameUtils.normalize
-with an improper input string, like "//../foo", or "\\..\\foo", the result would be
-the same value, thus possibly providing access to files in the parent directory,
-but not further above (thus "limited" path traversal), if the calling code would
-use the result to construct a path value."""
+ADVISORY_TEXT_2 = """org/mitre/oauth2/web/OAuthConfirmationController.java in the OpenID Connect server implementation for MITREid Connect through 1.3.3 contains a Mass Assignment (aka Autobinding) vulnerability. This arises due to unsafe usage of the @ModelAttribute annotation during the OAuth authorization flow, in which HTTP request parameters affect an authorizationRequest."""
+
+ADVISORY_TEXT_3 = """In Apache Commons IO before 2.7, When invoking the method FileNameUtils.normalize with an improper input string, like "//../foo", or "\\..\\foo", the result would be the same value, thus possibly providing access to files in the parent directory, but not further above (thus "limited" path traversal), if the calling code would use the result to construct a path value."""
+
+ADVISORY_TEXT_4 = """MultipartStream.java in Apache Commons FileUpload before 1.3.1, as used in Apache Tomcat, JBoss Web, and other products, allows remote attackers to cause a denial of service (infinite loop and CPU consumption) via a crafted Content-Type header that bypasses a loop's intended exit conditions."""
+
+ADVISORY_TEXT_5 = """A vulnerability in the JsonMapObjectReaderWriter of Apache CXF allows an attacker to submit malformed JSON to a web service, which results in the thread getting stuck in an infinite loop, consuming CPU indefinitely. This issue affects Apache CXF versions prior to 3.4.4; Apache CXF versions prior to 3.3.11."""
+
+
+def test_extract_affected_filenames():
+    result1 = extract_affected_filenames(ADVISORY_TEXT_1)
+    result2 = extract_affected_filenames(ADVISORY_TEXT_2)
+    result3 = extract_affected_filenames(ADVISORY_TEXT_3)
+    result4 = extract_affected_filenames(ADVISORY_TEXT_4)
+    result5 = extract_affected_filenames(ADVISORY_TEXT_5)
+    assert sorted(result1) == sorted(["JwtRequestCodeFilter", "request_uri"])
+    assert sorted(result2) == sorted(
+        [
+            "OAuthConfirmationController",
+            "@ModelAttribute",
+            "authorizationRequest",
+        ]
+    )
+    assert sorted(result3) == sorted(["FileNameUtils"])
+    assert sorted(result4) == sorted(
+        ["MultipartStream", "FileUpload"]
+    )  # "Content-Type" is intentionally not extracted
+    assert sorted(result5) == sorted(["JsonMapObjectReaderWriter"])
+
+
 def test_adv_record_path_extraction_has_real_paths():
-    result = extract_path_tokens(ADVISORY_TEXT_2)
-
-    assert result == ["2.7", "FileNameUtils.normalize", "//../foo", "\\..\\foo"]
+    # result = extract_affected_files_paths(ADVISORY_TEXT_2)
+    pass
+    # assert result == ["FileNameUtils", "//../foo", "\\..\\foo"]
@@ -74,13 +97,13 @@ def test_adv_record_path_extraction_strict_extensions():
     """
     If strict_extensions is True, it will always extract tokens with (back) slashes,
     but it will only collect single file names if they have the correct extension.
     """
-    result = extract_path_tokens(
-        ADVISORY_TEXT_2
-        + " Developer.gery put something here to check if foo.java and bar.cpp will be found.",
-        strict_extensions=True,
-    )
-
-    assert result == ["//../foo", "\\..\\foo", "foo.java", "bar.cpp"]
+    # result = extract_affected_files_paths(
+    #     ADVISORY_TEXT_2
+    #     + " Developer.gery put something here to check if foo.java and bar.cpp will be found.",
+    #     strict_extensions=True,
+    # )
+    pass
+    # assert result == ["FileNameUtils", "//../foo", "\\..\\foo", "foo", "bar"]
 
 
 @pytest.mark.skip(reason="TODO: implement")
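The rewritten tests can be exercised directly; a typical invocation from the prospector/ directory (pytest is assumed to be installed, as the test modules import it):

    pytest datamodel/test_nlp.py datamodel/advisory_test.py -v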