Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge NLP modifications, NVD failure fallback #324

Merged
merged 7 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@ name: Python

on:
push:
branches: [main]
branches:
- main
- test-fix-nlp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a temporary branch, no need to include it

pull_request:
branches: [main]
branches:
- main
- test-fix-nlp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here


jobs:
build:
Expand Down
18 changes: 7 additions & 11 deletions prospector/client/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def parseArguments(args):
parser.add_argument(
"--use-backend",
default="always",
action="store_true",
choices=["always", "never", "optional"],
help="Use the backend server",
)

Expand Down Expand Up @@ -230,10 +230,8 @@ def main(argv): # noqa: C901

vulnerability_id = args.vulnerability_id
repository_url = args.repository

vuln_descr = args.descr

filter_extensions = "*." + args.filter_extensions
filter_extensions = args.filter_extensions

# if no backend the filters on the advisory do not work
use_nvd = False
Expand All @@ -255,18 +253,16 @@ def main(argv): # noqa: C901
max_candidates = args.max_candidates
modified_files = args.modified_files.split(",") if args.modified_files else []
advisory_keywords = (
args.advisory_keywords.split(",")
if args.advisory_keywords is not None
else []
args.advisory_keywords.split(",") if args.advisory_keywords else []
)

publication_date = ""
if args.pub_date != "":
publication_date = args.pub_date + "T00:00Z"
# if the date is forced manually, the time interval can
# be restricted
# time_limit_before = int(time_limit_before / 5)
# time_limit_after = int(time_limit_after / 2)
# if the date is forced manually, the time interval can
# be restricted
# time_limit_before = int(time_limit_before / 5)
# time_limit_after = int(time_limit_after / 2)

git_cache = os.getenv("GIT_CACHE", default=GIT_CACHE)

Expand Down
66 changes: 13 additions & 53 deletions prospector/client/cli/prospector_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import log
from client.cli.console import ConsoleWriter, MessageStatus
from datamodel.advisory import AdvisoryRecord
from datamodel.advisory import AdvisoryRecord, build_advisory_record
from datamodel.commit import Commit, apply_ranking, make_from_raw_commit
from filtering.filter import filter_commits
from git.git import GIT_CACHE, Git
Expand All @@ -26,6 +26,7 @@

_logger = init_local_logger()


SECS_PER_DAY = 86400
TIME_LIMIT_BEFORE = 3 * 365 * SECS_PER_DAY
TIME_LIMIT_AFTER = 180 * SECS_PER_DAY
Expand Down Expand Up @@ -73,6 +74,7 @@ def prospector( # noqa: C901
publication_date,
advisory_keywords,
modified_files,
filter_extensions,
)

with ConsoleWriter("Obtaining initial set of candidates") as writer:
Expand Down Expand Up @@ -248,58 +250,15 @@ def save_preprocessed_commits(backend_address, payload):
)


def build_advisory_record(
    vulnerability_id,
    repository_url,
    vuln_descr,
    nvd_rest_endpoint,
    fetch_references,
    use_nvd,
    publication_date,
    advisory_keywords,
    modified_files,
) -> AdvisoryRecord:
    """Factory for AdvisoryRecord.

    Builds the record from CLI inputs, enriches it via ``analyze()``
    (NVD lookup + NLP extraction), then applies the user-supplied
    overrides: forced publication date, extra keywords, extra paths.

    Args:
        vulnerability_id: CVE identifier to look up.
        repository_url: URL of the repository under analysis.
        vuln_descr: user-provided vulnerability description (may be empty).
        nvd_rest_endpoint: base URL of the NVD REST service to query.
        fetch_references: whether to download the content of references.
        use_nvd: whether to query NVD at all.
        publication_date: "" or a "%Y-%m-%dT%H:%M%z" timestamp string.
        advisory_keywords: extra keywords to merge into the record.
        modified_files: extra file paths to merge into the record.

    Returns:
        The populated AdvisoryRecord.
    """

    advisory_record = AdvisoryRecord(
        vulnerability_id=vulnerability_id,
        repository_url=repository_url,
        description=vuln_descr,
        from_nvd=use_nvd,
        nvd_rest_endpoint=nvd_rest_endpoint,
    )

    _logger.pretty_log(advisory_record)

    # Enrichment: NVD data (optional) + keyword/path extraction from the text.
    advisory_record.analyze(use_nvd=use_nvd, fetch_references=fetch_references)
    _logger.debug(f"{advisory_record.keywords=}")

    # A manually forced publication date overrides whatever analyze() found.
    if publication_date != "":
        advisory_record.published_timestamp = int(
            datetime.strptime(publication_date, r"%Y-%m-%dT%H:%M%z").timestamp()
        )

    if len(advisory_keywords) > 0:
        advisory_record.keywords += tuple(advisory_keywords)
        # drop duplicates
        # NOTE(review): this turns the tuple field into a list — confirm
        # downstream consumers only iterate over it.
        advisory_record.keywords = list(set(advisory_record.keywords))

    if len(modified_files) > 0:
        advisory_record.paths += modified_files

    _logger.debug(f"{advisory_record.keywords=}")
    _logger.debug(f"{advisory_record.paths=}")

    return advisory_record


# TODO: Cleanup — many of these parameters should be recovered from the advisory record object
def get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
filter_extensions,
advisory_record: AdvisoryRecord,
repository: Git,
tag_interval: str,
version_interval: str,
time_limit_before: int,
time_limit_after: int,
filter_extensions: str,
) -> List[str]:
with ExecutionTimer(
core_statistics.sub_collection(name="retrieval of commit candidates")
Expand All @@ -318,6 +277,7 @@ def get_candidates(
with ConsoleWriter("Candidate commit retrieval"):
prev_tag = None
following_tag = None

if tag_interval != "":
prev_tag, following_tag = tag_interval.split(":")
elif version_interval != "":
Expand All @@ -330,7 +290,7 @@ def get_candidates(
if advisory_record.published_timestamp:
since = advisory_record.published_timestamp - time_limit_before
until = advisory_record.published_timestamp + time_limit_after

# Here I need to strip the GitHub tags of useless stuff
candidates = repository.get_commits(
since=since,
until=until,
Expand Down
2 changes: 2 additions & 0 deletions prospector/client/cli/prospector_client_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from api import DB_CONNECT_STRING
from client.cli.prospector_client import build_advisory_record
from commitdb.postgres import PostgresCommitDB
from stats.execution import execution_statistics

Expand Down Expand Up @@ -35,6 +36,7 @@ def test_main_runonce(setupdb):
"--repository",
"https://github.com/cloudfoundry/uaa",
"--tag-interval=v74.0.0:v74.1.0",
"--use-backend=optional",
]
execution_statistics.drop_all()
main(args)
Expand Down
146 changes: 122 additions & 24 deletions prospector/datamodel/advisory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from util.http import fetch_url

from .nlp import (
extract_path_tokens,
extract_affected_filenames,
extract_products,
extract_special_terms,
extract_versions,
Expand Down Expand Up @@ -48,9 +48,11 @@

_logger = log.util.init_local_logger()

NVD_REST_ENDPOINT = "http://localhost:8000/nvd/vulnerabilities/"
LOCAL_NVD_REST_ENDPOINT = "http://localhost:8000/nvd/vulnerabilities/"
NVD_REST_ENDPOINT = "https://services.nvd.nist.gov/rest/json/cves/2.0?cveId="


# TODO: refactor and clean
class AdvisoryRecord(BaseModel):
"""
The advisory record captures all relevant information on the vulnerability advisory
Expand All @@ -68,24 +70,34 @@ class AdvisoryRecord(BaseModel):
relevant_tags: List[str] = None
versions: List[str] = Field(default_factory=list)
from_nvd: bool = False
nvd_rest_endpoint: str = NVD_REST_ENDPOINT
nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
paths: List[str] = Field(default_factory=list)
keywords: Tuple[str, ...] = Field(default_factory=tuple)

def analyze(self, use_nvd: bool = False, fetch_references=False):
# def __init__(self, vulnerability_id, repository_url, from_nvd, nvd_rest_endpoint):
# self.vulnerability_id = vulnerability_id
# self.repository_url = repository_url
# self.from_nvd = from_nvd
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self, use_nvd: bool = False, fetch_references=False, relevant_extensions=[]
):
self.from_nvd = use_nvd

if self.from_nvd:
self._get_from_nvd(self.vulnerability_id, self.nvd_rest_endpoint)

self.get_advisory(self.vulnerability_id, self.nvd_rest_endpoint)
self.versions = union_of(self.versions, extract_versions(self.description))

self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)
self.paths = union_of(self.paths, extract_path_tokens(self.description))
self.keywords = union_of(self.keywords, extract_special_terms(self.description))

# TODO: if an exact file is found when applying the rules, the relevance must be updated i think
self.paths = union_of(
self.paths,
extract_affected_filenames(self.description, relevant_extensions),
)
self.keywords = union_of(self.keywords, extract_special_terms(self.description))
_logger.debug("References: " + str(self.references))
self.references = [
r for r in self.references if urlparse(r).hostname in ALLOWED_SITES
Expand All @@ -98,32 +110,40 @@ def analyze(self, use_nvd: bool = False, fetch_references=False):
_logger.debug("Fetched content of reference " + r)
self.references_content.append(ref_content)

def _get_from_nvd(self, vuln_id: str, nvd_rest_endpoint: str = NVD_REST_ENDPOINT):
# TODO check behavior when some of the data attributes of the AdvisoryRecord
# class contain data (e.g. passed explicitly as input by the user);
# In that case, shall the data from NVD be appended to the existing data,
# replace it, be ignored? (note: right now, it just replaces it)
def get_advisory(
    self, vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
):
    """Populate this record from the local NVD mirror, falling back to the
    public NVD service when the local lookup fails.

    Sets description, published/last-modified timestamps and references.
    """
    # TODO check behavior when some of the data attributes of the
    # AdvisoryRecord class already contain data (e.g. passed explicitly by
    # the user). Shall the NVD data be appended, replace it, or be ignored?
    # (Right now it just replaces it.)
    found_locally = self.get_from_local_db(vuln_id, nvd_rest_endpoint)
    if not found_locally:
        print("Could not retrieve vulnerability data from local db")
        print("Trying to retrieve data from NVD")
        self.get_from_nvd(vuln_id)

# TODO: refactor this stuff
def get_from_local_db(
    self, vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
):
    """
    Get an advisory from the local NVD database.

    Populates description, published/last-modified timestamps and
    references from the mirror's JSON payload.

    Returns:
        True when the record was found and the fields were populated,
        False otherwise (non-200 response, connection error, parse error).
    """
    try:
        response = requests.get(nvd_rest_endpoint + vuln_id)
        if response.status_code != 200:
            return False
        data = response.json()
        # assumes the local mirror serves ISO-8601 date strings that
        # datetime.fromisoformat() accepts — TODO confirm against the mirror.
        self.published_timestamp = int(
            datetime.fromisoformat(data["publishedDate"]).timestamp()
        )
        self.last_modified_timestamp = int(
            datetime.fromisoformat(data["lastModifiedDate"]).timestamp()
        )
        self.description = data["cve"]["description"]["description_data"][0][
            "value"
        ]
        self.references = [
            r["url"] for r in data["cve"]["references"]["reference_data"]
        ]
        return True
    except Exception as e:
        # Might fail either for a JSON parsing error or a connection error.
        _logger.error(
            "Could not retrieve vulnerability data from NVD for " + vuln_id,
            exc_info=log.config.level < logging.INFO,
        )
        print(e)
        return False
def get_from_nvd(self, vuln_id: str, nvd_rest_endpoint: str = NVD_REST_ENDPOINT):
    """
    Get an advisory from the public NVD database (REST API 2.0).

    Populates description, published/last-modified timestamps and
    references from the first CVE entry in the response.

    Returns:
        True on success, False otherwise (non-200 response, connection
        error, parse error).
    """
    try:
        response = requests.get(nvd_rest_endpoint + vuln_id)
        if response.status_code != 200:
            return False
        data = response.json()["vulnerabilities"][0]["cve"]
        self.published_timestamp = int(
            datetime.fromisoformat(data["published"]).timestamp()
        )
        self.last_modified_timestamp = int(
            datetime.fromisoformat(data["lastModified"]).timestamp()
        )
        self.description = data["descriptions"][0]["value"]
        self.references = [r["url"] for r in data["references"]]
        # Fix: previously fell through and returned None on success, making
        # success indistinguishable from failure (get_from_local_db returns
        # True/False; keep the contract consistent).
        return True
    except Exception as e:
        # Might fail either for a JSON parsing error or a connection error.
        _logger.error(
            "Could not retrieve vulnerability data from NVD for " + vuln_id,
            exc_info=log.config.level < logging.INFO,
        )
        print(e)
        return False


# Moved here since it is basically a factory method
# Moved here since it is basically a factory method
def build_advisory_record(
    vulnerability_id,
    repository_url,
    vuln_descr,
    nvd_rest_endpoint,
    fetch_references,
    use_nvd,
    publication_date,
    advisory_keywords,
    modified_files,
    filter_extensions,
) -> AdvisoryRecord:
    """Factory for AdvisoryRecord.

    Builds the record from CLI inputs, enriches it via ``analyze()``
    (NVD lookup + NLP extraction restricted to ``filter_extensions``),
    then applies the user-supplied overrides: forced publication date,
    extra keywords, extra paths.

    Returns:
        The populated AdvisoryRecord.
    """

    advisory_record = AdvisoryRecord(
        vulnerability_id=vulnerability_id,
        repository_url=repository_url,
        description=vuln_descr,
        from_nvd=use_nvd,
        nvd_rest_endpoint=nvd_rest_endpoint,
    )

    _logger.pretty_log(advisory_record)
    advisory_record.analyze(
        use_nvd=use_nvd,
        fetch_references=fetch_references,
        relevant_extensions=filter_extensions,
    )
    _logger.debug(f"{advisory_record.keywords=}")

    if publication_date != "":
        # The CLI builds this value as pub_date + "T00:00Z";
        # datetime.fromisoformat() rejects a trailing "Z" before
        # Python 3.11, so normalize it to an explicit UTC offset first.
        advisory_record.published_timestamp = int(
            datetime.fromisoformat(
                publication_date.replace("Z", "+00:00")
            ).timestamp()
        )

    if len(advisory_keywords) > 0:
        advisory_record.keywords += tuple(advisory_keywords)
        # drop duplicates
        advisory_record.keywords = list(set(advisory_record.keywords))

    if len(modified_files) > 0:
        advisory_record.paths += modified_files

    _logger.debug(f"{advisory_record.keywords=}")
    _logger.debug(f"{advisory_record.paths=}")

    return advisory_record


# might be used in the future
Expand Down
Loading