Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix for cve_lookup installation RAM usage #1306

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/install/requirements_pre_install.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ requests==2.32.2
pydantic==2.4.0
werkzeug~=3.0.3
toml==0.10.2
# needed during installation of cve_lookup plugin
ijson==3.3.0

git+https://github.com/fkie-cad/common_helper_files.git

3 changes: 1 addition & 2 deletions src/plugins/analysis/cve_lookup/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@ def install_files(self):
Install files for the CVE lookup plugin.
"""
os.chdir('internal')
cve_list = parse_data()
connection = DbConnection()
connection.drop_tables()
db = DbSetup(connection)
db.add_cve_items(cve_list)
db.add_cve_items(parse_data())
os.chdir(self.base_path)


Expand Down
37 changes: 23 additions & 14 deletions src/plugins/analysis/cve_lookup/internal/data_parsing.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,40 @@
from __future__ import annotations

import json
import lzma
import re
from pathlib import Path
from shlex import split
from subprocess import run
from typing import TYPE_CHECKING

import ijson
import requests
from requests.adapters import HTTPAdapter, Retry

from ..internal.helper_functions import CveEntry

if TYPE_CHECKING:
    from collections.abc import Iterator

    from requests.models import Response

FILE_NAME = 'CVE-all.json.xz'
CVE_URL = f'https://github.com/fkie-cad/nvd-json-data-feeds/releases/latest/download/{FILE_NAME}'
DB_DIR = Path(__file__).parent / 'database'
OUTPUT_FILE = DB_DIR / FILE_NAME


def _retrieve_url(download_url: str, target: Path):
    """
    Download ``download_url`` and stream the response body into the file ``target``.

    The request is sent through a session whose transport adapter retries failed requests
    (up to five times with a short backoff). The body is written in 64 KiB chunks so that
    the complete download is never held in memory at once.

    :param download_url: The URL to download.
    :param target: Path of the file the response body is written to.
    :raises requests.HTTPError: If the server responds with an error status code.
    """
    adapter = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1))
    with requests.Session() as session:
        # Mount the adapter for both schemes: the feed URL is https, so mounting it only
        # for "http://" would leave the retry policy unused.
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        # Go through the session; the module-level requests.get() would bypass the adapter.
        with session.get(download_url, stream=True) as response:
            response.raise_for_status()
            with target.open('wb') as fp:
                for chunk in response.iter_content(chunk_size=65_536):
                    fp.write(chunk)


def download_and_decompress_file() -> Path:
    """
    Download the compressed CVE feed into the database directory, decompress it there with
    ``unxz`` and return the path of the decompressed file.
    """
    _retrieve_url(CVE_URL, OUTPUT_FILE)
    unxz_command = split(f'unxz --force {OUTPUT_FILE.name}')
    run(unxz_command, cwd=DB_DIR, check=True)
    # extraction drops the ".xz" suffix, so the result carries the stem of the archive name
    extracted_name = OUTPUT_FILE.stem
    return DB_DIR / extracted_name


def extract_english_summary(descriptions: list) -> str:
def parse_data() -> Iterator[CveEntry]:
    """
    Download the CVE feed and lazily yield one CveEntry object per CVE item.

    This is a generator: the download only starts once iteration begins. The decompressed
    JSON file is huge, so it is parsed incrementally with ijson instead of being loaded as
    a whole. The temporary file is removed when the generator is exhausted, when parsing
    fails, or when the generator is closed before it was fully consumed.
    """
    cve_path = download_and_decompress_file()
    try:
        with cve_path.open('rb') as fp:
            # the file is huge, so we use ijson to stream the data
            for cve_item in ijson.items(fp, 'cve_items.item'):
                yield extract_data_from_cve(cve_item)
    finally:
        # remove the temporary file even if the consumer stops early or an error occurs
        cve_path.unlink()


if __name__ == '__main__':
Expand Down
80 changes: 46 additions & 34 deletions src/plugins/analysis/cve_lookup/internal/database/db_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

import re
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Iterable

from ..helper_functions import CveEntry, replace_wildcards
from .schema import Association, Cpe, Cve
from .schema import Association, Base, Cpe, Cve

if TYPE_CHECKING:
from .db_connection import DbConnection
Expand All @@ -20,6 +20,8 @@ def __init__(self, connection: DbConnection):
self.connection = connection
self.connection.create_tables()
self.session = self.connection.create_session()
self.existing_cve_ids = set()
self.existing_cpe_ids = set()

def create_cve(self, cve_item: CveEntry) -> Cve:
"""
Expand Down Expand Up @@ -51,41 +53,51 @@ def create_cpe(self, cpe_id: str):
update=update,
)

def create_association(self, cve_id: str, cpe_entry: tuple[str, str, str, str, str]) -> Association:
    """
    Build the Association object that links the CVE with the given ID to a CPE entry.

    The CPE entry is a 5-tuple: the CPE ID followed by the version bounds
    (start including, start excluding, end including, end excluding).
    """
    cpe_id, start_including, start_excluding, end_including, end_excluding = cpe_entry
    return Association(
        cve_id=cve_id,
        cpe_id=cpe_id,
        version_start_including=start_including,
        version_start_excluding=start_excluding,
        version_end_including=end_including,
        version_end_excluding=end_excluding,
    )

def add_cve_items(self, cve_list: Iterable[CveEntry], chunk_size: int = 2**12):
    """
    Write the given CVE entries to the database in chunks.

    Entries whose CVE ID is already recorded in ``self.existing_cve_ids`` are skipped. The
    database objects created for the remaining entries are collected in a buffer that is
    saved and emptied whenever it holds at least ``chunk_size`` objects; whatever is left
    after the last entry is saved at the end.
    """
    pending: list[Base] = []

    for entry in cve_list:
        is_new = entry.cve_id not in self.existing_cve_ids
        if is_new:
            pending.extend(self._create_db_objects_for_cve(entry))
        if len(pending) >= chunk_size:
            self._save_objects(pending)
            pending.clear()

    # save the final, partially filled chunk
    if pending:
        self._save_objects(pending)

def _create_db_objects_for_cve(self, cve_item: CveEntry) -> Iterable[Base]:
    """
    Yield all database objects that belong to one CVE entry.

    The Cve object comes first. For each CPE entry a Cpe object is yielded if its ID was
    not seen before, followed by the Association object. The CVE ID is recorded as
    existing only after all objects have been yielded.
    """
    yield self.create_cve(cve_item)
    for cpe_entry in cve_item.cpe_entries:
        cpe_id = cpe_entry[0]
        if cpe_id not in self.existing_cpe_ids:
            yield self.create_cpe(cpe_id)
            self.existing_cpe_ids.add(cpe_id)
        yield self.create_association(cve_item.cve_id, cpe_entry)
    self.existing_cve_ids.add(cve_item.cve_id)

def _save_objects(self, objects: list[Base]):
    """Bulk-save the given objects through the session and commit them right away."""
    session = self.session
    session.bulk_save_objects(objects)
    session.commit()
Loading