Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge NLP modifications, NVD failure fallback #324

Merged
merged 7 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@ name: Python

on:
push:
branches: [main]
branches:
- main
- test-fix-nlp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a temporary branch, no need to include it

pull_request:
branches: [main]
branches:
- main
- test-fix-nlp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here


jobs:
build:
Expand Down
18 changes: 7 additions & 11 deletions prospector/client/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def parseArguments(args):
parser.add_argument(
"--use-backend",
default="always",
action="store_true",
choices=["always", "never", "optional"],
help="Use the backend server",
)

Expand Down Expand Up @@ -230,10 +230,8 @@ def main(argv): # noqa: C901

vulnerability_id = args.vulnerability_id
repository_url = args.repository

vuln_descr = args.descr

filter_extensions = "*." + args.filter_extensions
filter_extensions = args.filter_extensions

# if no backend the filters on the advisory do not work
use_nvd = False
Expand All @@ -255,18 +253,16 @@ def main(argv): # noqa: C901
max_candidates = args.max_candidates
modified_files = args.modified_files.split(",") if args.modified_files else []
advisory_keywords = (
args.advisory_keywords.split(",")
if args.advisory_keywords is not None
else []
args.advisory_keywords.split(",") if args.advisory_keywords else []
)

publication_date = ""
if args.pub_date != "":
publication_date = args.pub_date + "T00:00Z"
# if the date is forced manually, the time interval can
# be restricted
# time_limit_before = int(time_limit_before / 5)
# time_limit_after = int(time_limit_after / 2)
# if the date is forced manually, the time interval can
# be restricted
# time_limit_before = int(time_limit_before / 5)
# time_limit_after = int(time_limit_after / 2)

git_cache = os.getenv("GIT_CACHE", default=GIT_CACHE)

Expand Down
66 changes: 13 additions & 53 deletions prospector/client/cli/prospector_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import log
from client.cli.console import ConsoleWriter, MessageStatus
from datamodel.advisory import AdvisoryRecord
from datamodel.advisory import AdvisoryRecord, build_advisory_record
from datamodel.commit import Commit, apply_ranking, make_from_raw_commit
from filtering.filter import filter_commits
from git.git import GIT_CACHE, Git
Expand All @@ -26,6 +26,7 @@

_logger = init_local_logger()


SECS_PER_DAY = 86400
TIME_LIMIT_BEFORE = 3 * 365 * SECS_PER_DAY
TIME_LIMIT_AFTER = 180 * SECS_PER_DAY
Expand Down Expand Up @@ -73,6 +74,7 @@ def prospector( # noqa: C901
publication_date,
advisory_keywords,
modified_files,
filter_extensions,
)

with ConsoleWriter("Obtaining initial set of candidates") as writer:
Expand Down Expand Up @@ -248,58 +250,15 @@ def save_preprocessed_commits(backend_address, payload):
)


def build_advisory_record(
    vulnerability_id,
    repository_url,
    vuln_descr,
    nvd_rest_endpoint,
    fetch_references,
    use_nvd,
    publication_date,
    advisory_keywords,
    modified_files,
) -> AdvisoryRecord:
    """Factory for AdvisoryRecord.

    Builds the record from CLI inputs, enriches it via ``analyze()``
    (NVD lookup + NLP extraction), then applies the user-supplied
    overrides: forced publication date, extra keywords, extra paths.

    Args:
        vulnerability_id: CVE identifier to look up.
        repository_url: URL of the repository under analysis.
        vuln_descr: user-provided vulnerability description (may be empty).
        nvd_rest_endpoint: base URL of the NVD REST service to query.
        fetch_references: whether to download the content of references.
        use_nvd: whether to query NVD at all.
        publication_date: "" or a "%Y-%m-%dT%H:%M%z" timestamp string.
        advisory_keywords: extra keywords to merge into the record.
        modified_files: extra file paths to merge into the record.

    Returns:
        The populated AdvisoryRecord.
    """

    advisory_record = AdvisoryRecord(
        vulnerability_id=vulnerability_id,
        repository_url=repository_url,
        description=vuln_descr,
        from_nvd=use_nvd,
        nvd_rest_endpoint=nvd_rest_endpoint,
    )

    _logger.pretty_log(advisory_record)

    # Enrichment: NVD data (optional) + keyword/path extraction from the text.
    advisory_record.analyze(use_nvd=use_nvd, fetch_references=fetch_references)
    _logger.debug(f"{advisory_record.keywords=}")

    # A manually forced publication date overrides whatever analyze() found.
    if publication_date != "":
        advisory_record.published_timestamp = int(
            datetime.strptime(publication_date, r"%Y-%m-%dT%H:%M%z").timestamp()
        )

    if len(advisory_keywords) > 0:
        advisory_record.keywords += tuple(advisory_keywords)
        # drop duplicates
        # NOTE(review): this turns the tuple field into a list — confirm
        # downstream consumers only iterate over it.
        advisory_record.keywords = list(set(advisory_record.keywords))

    if len(modified_files) > 0:
        advisory_record.paths += modified_files

    _logger.debug(f"{advisory_record.keywords=}")
    _logger.debug(f"{advisory_record.paths=}")

    return advisory_record


# TODO: Cleanup — many of these parameters should be recovered from the advisory record object
def get_candidates(
advisory_record,
repository,
tag_interval,
version_interval,
time_limit_before,
time_limit_after,
filter_extensions,
advisory_record: AdvisoryRecord,
repository: Git,
tag_interval: str,
version_interval: str,
time_limit_before: int,
time_limit_after: int,
filter_extensions: str,
) -> List[str]:
with ExecutionTimer(
core_statistics.sub_collection(name="retrieval of commit candidates")
Expand All @@ -318,6 +277,7 @@ def get_candidates(
with ConsoleWriter("Candidate commit retrieval"):
prev_tag = None
following_tag = None

if tag_interval != "":
prev_tag, following_tag = tag_interval.split(":")
elif version_interval != "":
Expand All @@ -330,7 +290,7 @@ def get_candidates(
if advisory_record.published_timestamp:
since = advisory_record.published_timestamp - time_limit_before
until = advisory_record.published_timestamp + time_limit_after

# Here I need to strip the GitHub tags of useless stuff
candidates = repository.get_commits(
since=since,
until=until,
Expand Down
2 changes: 2 additions & 0 deletions prospector/client/cli/prospector_client_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from api import DB_CONNECT_STRING
from client.cli.prospector_client import build_advisory_record
from commitdb.postgres import PostgresCommitDB
from stats.execution import execution_statistics

Expand Down Expand Up @@ -35,6 +36,7 @@ def test_main_runonce(setupdb):
"--repository",
"https://github.com/cloudfoundry/uaa",
"--tag-interval=v74.0.0:v74.1.0",
"--use-backend=optional",
]
execution_statistics.drop_all()
main(args)
Expand Down
146 changes: 122 additions & 24 deletions prospector/datamodel/advisory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from util.http import fetch_url

from .nlp import (
extract_path_tokens,
extract_affected_filenames,
extract_products,
extract_special_terms,
extract_versions,
Expand Down Expand Up @@ -48,9 +48,11 @@

_logger = log.util.init_local_logger()

NVD_REST_ENDPOINT = "http://localhost:8000/nvd/vulnerabilities/"
LOCAL_NVD_REST_ENDPOINT = "http://localhost:8000/nvd/vulnerabilities/"
NVD_REST_ENDPOINT = "https://services.nvd.nist.gov/rest/json/cves/2.0?cveId="


# TODO: refactor and clean
class AdvisoryRecord(BaseModel):
"""
The advisory record captures all relevant information on the vulnerability advisory
Expand All @@ -68,24 +70,34 @@ class AdvisoryRecord(BaseModel):
relevant_tags: List[str] = None
versions: List[str] = Field(default_factory=list)
from_nvd: bool = False
nvd_rest_endpoint: str = NVD_REST_ENDPOINT
nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
paths: List[str] = Field(default_factory=list)
keywords: Tuple[str, ...] = Field(default_factory=tuple)

def analyze(self, use_nvd: bool = False, fetch_references=False):
# def __init__(self, vulnerability_id, repository_url, from_nvd, nvd_rest_endpoint):
# self.vulnerability_id = vulnerability_id
# self.repository_url = repository_url
# self.from_nvd = from_nvd
# self.nvd_rest_endpoint = nvd_rest_endpoint

def analyze(
self, use_nvd: bool = False, fetch_references=False, relevant_extensions=[]
):
self.from_nvd = use_nvd

if self.from_nvd:
self._get_from_nvd(self.vulnerability_id, self.nvd_rest_endpoint)

self.get_advisory(self.vulnerability_id, self.nvd_rest_endpoint)
self.versions = union_of(self.versions, extract_versions(self.description))

self.affected_products = union_of(
self.affected_products, extract_products(self.description)
)
self.paths = union_of(self.paths, extract_path_tokens(self.description))
self.keywords = union_of(self.keywords, extract_special_terms(self.description))

# TODO: if an exact file is found when applying the rules, the relevance must be updated i think
self.paths = union_of(
self.paths,
extract_affected_filenames(self.description, relevant_extensions),
)
self.keywords = union_of(self.keywords, extract_special_terms(self.description))
_logger.debug("References: " + str(self.references))
self.references = [
r for r in self.references if urlparse(r).hostname in ALLOWED_SITES
Expand All @@ -98,32 +110,40 @@ def analyze(self, use_nvd: bool = False, fetch_references=False):
_logger.debug("Fetched content of reference " + r)
self.references_content.append(ref_content)

def _get_from_nvd(self, vuln_id: str, nvd_rest_endpoint: str = NVD_REST_ENDPOINT):
# TODO check behavior when some of the data attributes of the AdvisoryRecord
# class contain data (e.g. passed explicitly as input by the user);
# In that case, shall the data from NVD be appended to the existing data,
# replace it, be ignored? (note: right now, it just replaces it)
def get_advisory(
    self, vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
):
    """Populate this record from the local NVD mirror, falling back to the
    public NVD service when the local lookup fails.

    Sets description, published/last-modified timestamps and references.
    """
    # TODO check behavior when some of the data attributes of the
    # AdvisoryRecord class already contain data (e.g. passed explicitly by
    # the user). Shall the NVD data be appended, replace it, or be ignored?
    # (Right now it just replaces it.)
    found_locally = self.get_from_local_db(vuln_id, nvd_rest_endpoint)
    if not found_locally:
        print("Could not retrieve vulnerability data from local db")
        print("Trying to retrieve data from NVD")
        self.get_from_nvd(vuln_id)

# TODO: refactor this stuff
def get_from_local_db(
    self, vuln_id: str, nvd_rest_endpoint: str = LOCAL_NVD_REST_ENDPOINT
):
    """
    Get an advisory from the local NVD database.

    Populates description, published/last-modified timestamps and
    references from the mirror's JSON payload.

    Returns:
        True when the record was found and the fields were populated,
        False otherwise (non-200 response, connection error, parse error).
    """
    try:
        response = requests.get(nvd_rest_endpoint + vuln_id)
        if response.status_code != 200:
            return False
        data = response.json()
        # assumes the local mirror serves ISO-8601 date strings that
        # datetime.fromisoformat() accepts — TODO confirm against the mirror.
        self.published_timestamp = int(
            datetime.fromisoformat(data["publishedDate"]).timestamp()
        )
        self.last_modified_timestamp = int(
            datetime.fromisoformat(data["lastModifiedDate"]).timestamp()
        )
        self.description = data["cve"]["description"]["description_data"][0][
            "value"
        ]
        self.references = [
            r["url"] for r in data["cve"]["references"]["reference_data"]
        ]
        return True
    except Exception as e:
        # Might fail either for a JSON parsing error or a connection error.
        _logger.error(
            "Could not retrieve vulnerability data from NVD for " + vuln_id,
            exc_info=log.config.level < logging.INFO,
        )
        print(e)
        return False
def get_from_nvd(self, vuln_id: str, nvd_rest_endpoint: str = NVD_REST_ENDPOINT):
    """
    Get an advisory from the public NVD database (REST API 2.0).

    Populates description, published/last-modified timestamps and
    references from the first CVE entry in the response.

    Returns:
        True on success, False otherwise (non-200 response, connection
        error, parse error).
    """
    try:
        response = requests.get(nvd_rest_endpoint + vuln_id)
        if response.status_code != 200:
            return False
        data = response.json()["vulnerabilities"][0]["cve"]
        self.published_timestamp = int(
            datetime.fromisoformat(data["published"]).timestamp()
        )
        self.last_modified_timestamp = int(
            datetime.fromisoformat(data["lastModified"]).timestamp()
        )
        self.description = data["descriptions"][0]["value"]
        self.references = [r["url"] for r in data["references"]]
        # Fix: previously fell through and returned None on success, making
        # success indistinguishable from failure (get_from_local_db returns
        # True/False; keep the contract consistent).
        return True
    except Exception as e:
        # Might fail either for a JSON parsing error or a connection error.
        _logger.error(
            "Could not retrieve vulnerability data from NVD for " + vuln_id,
            exc_info=log.config.level < logging.INFO,
        )
        print(e)
        return False


# Moved here since it is basically a factory method
# Moved here since it is basically a factory method
def build_advisory_record(
    vulnerability_id,
    repository_url,
    vuln_descr,
    nvd_rest_endpoint,
    fetch_references,
    use_nvd,
    publication_date,
    advisory_keywords,
    modified_files,
    filter_extensions,
) -> AdvisoryRecord:
    """Factory for AdvisoryRecord.

    Builds the record from CLI inputs, enriches it via ``analyze()``
    (NVD lookup + NLP extraction restricted to ``filter_extensions``),
    then applies the user-supplied overrides: forced publication date,
    extra keywords, extra paths.

    Returns:
        The populated AdvisoryRecord.
    """

    advisory_record = AdvisoryRecord(
        vulnerability_id=vulnerability_id,
        repository_url=repository_url,
        description=vuln_descr,
        from_nvd=use_nvd,
        nvd_rest_endpoint=nvd_rest_endpoint,
    )

    _logger.pretty_log(advisory_record)
    advisory_record.analyze(
        use_nvd=use_nvd,
        fetch_references=fetch_references,
        relevant_extensions=filter_extensions,
    )
    _logger.debug(f"{advisory_record.keywords=}")

    if publication_date != "":
        # The CLI builds this value as pub_date + "T00:00Z";
        # datetime.fromisoformat() rejects a trailing "Z" before
        # Python 3.11, so normalize it to an explicit UTC offset first.
        advisory_record.published_timestamp = int(
            datetime.fromisoformat(
                publication_date.replace("Z", "+00:00")
            ).timestamp()
        )

    if len(advisory_keywords) > 0:
        advisory_record.keywords += tuple(advisory_keywords)
        # drop duplicates
        advisory_record.keywords = list(set(advisory_record.keywords))

    if len(modified_files) > 0:
        advisory_record.paths += modified_files

    _logger.debug(f"{advisory_record.keywords=}")
    _logger.debug(f"{advisory_record.paths=}")

    return advisory_record


# might be used in the future
Expand Down
Loading