From da243af1738e1023028bca55c9c2cb9baee79cd7 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Thu, 15 Jun 2023 08:53:49 -0700 Subject: [PATCH 01/18] Improve drift detection perf, tests, and add some typehints (#1186) Uses a set membership check instead of iterating through each item in the list of results to detect drift. Adds unit test to ensure order of results does not matter in the drift detection check. Also adds some typehints. --- cartography/driftdetect/config.py | 35 ++++++------ cartography/driftdetect/detect_deviations.py | 16 ++++-- cartography/driftdetect/get_states.py | 26 +++++++-- cartography/driftdetect/model.py | 17 +++--- tests/unit/driftdetect/test_detector.py | 57 +++++++++++++++++--- 5 files changed, 109 insertions(+), 42 deletions(-) diff --git a/cartography/driftdetect/config.py b/cartography/driftdetect/config.py index a8b2f4f625..0a32361e6b 100644 --- a/cartography/driftdetect/config.py +++ b/cartography/driftdetect/config.py @@ -1,3 +1,6 @@ +from typing import Optional + + class UpdateConfig: """ A common interface for the drift-detection update configuration. @@ -18,10 +21,10 @@ class UpdateConfig: def __init__( self, - drift_detection_directory, - neo4j_uri, - neo4j_user=None, - neo4j_password=None, + drift_detection_directory: str, + neo4j_uri: str, + neo4j_user: Optional[str] = None, + neo4j_password: Optional[str] = None, ): self.neo4j_uri = neo4j_uri self.neo4j_user = neo4j_user @@ -46,13 +49,13 @@ class GetDriftConfig: def __init__( self, - query_directory, - start_state, - end_state, + query_directory: str, + start_state: str, + end_state: str, ): - self.query_directory = query_directory - self.start_state = start_state - self.end_state = end_state + self.query_directory: str = query_directory + self.start_state: str = start_state + self.end_state: str = end_state class AddShortcutConfig: @@ -72,10 +75,10 @@ class AddShortcutConfig: def __init__( self, - query_directory, - shortcut, - filename, + query_directory: str, + shortcut: str, + filename: str, ): - self.query_directory = query_directory - self.shortcut = shortcut - self.filename = filename + self.query_directory: str = query_directory + self.shortcut: str = shortcut + self.filename: str = filename diff --git a/cartography/driftdetect/detect_deviations.py b/cartography/driftdetect/detect_deviations.py index df0ad86cd4..8155760814 100644 --- a/cartography/driftdetect/detect_deviations.py +++ b/cartography/driftdetect/detect_deviations.py @@ -1,8 +1,12 @@ import logging import os +from typing import List +from typing import Union from marshmallow import ValidationError +from cartography.driftdetect.config import GetDriftConfig +from cartography.driftdetect.model import State from cartography.driftdetect.reporter import report_drift from cartography.driftdetect.serializers import ShortcutSchema from cartography.driftdetect.serializers import StateSchema @@ -12,7 +16,7 @@ logger = logging.getLogger(__name__) -def run_drift_detection(config): +def run_drift_detection(config: GetDriftConfig) -> None: try: if not valid_directory(config.query_directory): logger.error("Invalid Drift Detection Directory") @@ -59,7 +63,7 @@ def run_drift_detection(config): logger.exception(msg) -def perform_drift_detection(start_state, end_state): +def perform_drift_detection(start_state: State, end_state: State): """ Returns differences (additions and missing results) between two States. 
@@ -81,7 +85,7 @@ def perform_drift_detection(start_state, end_state): return new_results, missing_results -def compare_states(start_state, end_state): +def compare_states(start_state: State, end_state: State): """ Helper function for comparing differences between two States. @@ -92,10 +96,12 @@ def compare_states(start_state, end_state): :return: list of tuples of differences between states in the form (dictionary, State object) """ differences = [] + # Use set for faster membership check + start_state_results = {tuple(res) for res in start_state.results} for result in end_state.results: - if result in start_state.results: + if tuple(result) in start_state_results: continue - drift = [] + drift: List[Union[str, List[str]]] = [] for field in result: value = field.split("|") if len(value) > 1: diff --git a/cartography/driftdetect/get_states.py b/cartography/driftdetect/get_states.py index ebb0713171..eaa40bbba9 100644 --- a/cartography/driftdetect/get_states.py +++ b/cartography/driftdetect/get_states.py @@ -1,12 +1,18 @@ import logging import os.path import time +from typing import Any +from typing import Dict +from typing import List import neo4j.exceptions from marshmallow import ValidationError from neo4j import GraphDatabase +from cartography.client.core.tx import read_list_of_dicts_tx from cartography.driftdetect.add_shortcut import add_shortcut +from cartography.driftdetect.config import UpdateConfig +from cartography.driftdetect.model import State from cartography.driftdetect.serializers import ShortcutSchema from cartography.driftdetect.serializers import StateSchema from cartography.driftdetect.storage import FileSystem @@ -15,7 +21,7 @@ logger = logging.getLogger(__name__) -def run_get_states(config): +def run_get_states(config: UpdateConfig) -> None: """ Handles neo4j errors and then updates detectors. @@ -90,7 +96,13 @@ def run_get_states(config): logger.exception(err) -def get_query_state(session, query_directory, state_serializer, storage, filename): +def get_query_state( + session: neo4j.Session, + query_directory: str, + state_serializer: StateSchema, + storage, + filename: str, +) -> State: """ Gets the most recent state of a query. @@ -115,7 +127,7 @@ def get_query_state(session, query_directory, state_serializer, storage, filenam return state -def get_state(session, state): +def get_state(session: neo4j.Session, state: State) -> None: """ Connects to a neo4j session, runs the validation query, then saves the results to a state. 
@@ -126,10 +138,14 @@
 
     :return:
     """
 
-    new_results = session.run(state.validation_query)
+    new_results: List[Dict[str, Any]] = session.read_transaction(
+        read_list_of_dicts_tx,
+        state.validation_query,
+    )
 
     logger.debug(f"Updating results for {state.name}")
 
-    state.properties = new_results.keys()
+    # The keys will be the same across all items in the returned list
+    state.properties = list(new_results[0].keys())
 
     results = []
     for record in new_results:
diff --git a/cartography/driftdetect/model.py b/cartography/driftdetect/model.py
index 8887fbc02d..f92c959271 100644
--- a/cartography/driftdetect/model.py
+++ b/cartography/driftdetect/model.py
@@ -1,4 +1,5 @@
 import logging
+from typing import List
 
 
 logger = logging.getLogger(__name__)
@@ -19,13 +20,13 @@ class State:
 
     def __init__(
         self,
-        name,
-        validation_query,
-        properties,
-        results,
+        name: str,
+        validation_query: str,
+        properties: List[str],
+        results: List[List[str]],
     ):
-        self.name = name
-        self.validation_query = validation_query
-        self.properties = properties
-        self.results = results
+        self.name: str = name
+        self.validation_query: str = validation_query
+        self.properties: List[str] = properties
+        self.results: List[List[str]] = results
diff --git a/tests/unit/driftdetect/test_detector.py b/tests/unit/driftdetect/test_detector.py
index 71c223d145..9e828904d3 100644
--- a/tests/unit/driftdetect/test_detector.py
+++ b/tests/unit/driftdetect/test_detector.py
@@ -1,5 +1,6 @@
 from unittest.mock import MagicMock
 
+from cartography.client.core.tx import read_list_of_dicts_tx
 from cartography.driftdetect.detect_deviations import compare_states
 from cartography.driftdetect.get_states import get_state
 from cartography.driftdetect.model import State
@@ -26,13 +27,13 @@ def test_state_no_drift():
 
     mock_result.__getitem__.side_effect = results.__getitem__
     mock_result.__iter__.side_effect = results.__iter__
-    mock_session.run.return_value = mock_result
+    mock_session.read_transaction.return_value = mock_result
     data = FileSystem.load("tests/data/detectors/test_expectations.json")
     state_old = StateSchema().load(data)
     state_new = State(state_old.name, state_old.validation_query, state_old.properties, [])
     get_state(mock_session, state_new)
     drifts = compare_states(state_old, state_new)
-    mock_session.run.assert_called_with(state_new.validation_query)
+    mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query)
     assert not drifts
 
 
@@ -54,20 +55,60 @@ def test_state_picks_up_drift():
         {key: "7"},
     ]
 
+    # Arrange
     mock_result.__getitem__.side_effect = results.__getitem__
     mock_result.__iter__.side_effect = results.__iter__
-    mock_session.run.return_value = mock_result
+    mock_session.read_transaction.return_value = mock_result
     data = FileSystem.load("tests/data/detectors/test_expectations.json")
     state_old = StateSchema().load(data)
     state_new = State(state_old.name, state_old.validation_query, state_old.properties, [])
     get_state(mock_session, state_new)
     state_new.properties = state_old.properties
+
+    # Act
     drifts = compare_states(state_old, state_new)
-    mock_session.run.assert_called_with(state_new.validation_query)
+
+    # Assert
+    mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query)
     assert drifts
     assert ["7"] in drifts
 
 
+def test_state_order_does_not_matter():
+    """
+    Test that drift detection is not affected by the order of results.
+ :return: + """ + key = "d.test" + mock_session = MagicMock() + mock_result = MagicMock() + results = [ + {key: "1"}, + {key: "2"}, + {key: "6"}, # This one is out of order + {key: "3"}, + {key: "4"}, + {key: "5"}, + ] + + # Arrange + mock_result.__getitem__.side_effect = results.__getitem__ + mock_result.__iter__.side_effect = results.__iter__ + mock_session.read_transaction.return_value = mock_result + data = FileSystem.load("tests/data/detectors/test_expectations.json") + state_old = StateSchema().load(data) + state_new = State(state_old.name, state_old.validation_query, state_old.properties, []) + get_state(mock_session, state_new) + state_new.properties = state_old.properties + + # Act + drifts = compare_states(state_old, state_new) + + # Assert + mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query) + assert not drifts + + def test_state_multiple_expectations(): """ Test that multiple fields runs properly. @@ -89,14 +130,14 @@ def test_state_multiple_expectations(): mock_result.__getitem__.side_effect = results.__getitem__ mock_result.__iter__.side_effect = results.__iter__ - mock_session.run.return_value = mock_result + mock_session.read_transaction.return_value = mock_result data = FileSystem.load("tests/data/detectors/test_multiple_expectations.json") state_old = StateSchema().load(data) state_new = State(state_old.name, state_old.validation_query, state_old.properties, []) get_state(mock_session, state_new) state_new.properties = state_old.properties drifts = compare_states(state_old, state_new) - mock_session.run.assert_called_with(state_new.validation_query) + mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query) assert ["7", "14"] in drifts @@ -121,14 +162,14 @@ def test_drift_from_multiple_properties(): ] mock_result.__getitem__.side_effect = results.__getitem__ mock_result.__iter__.side_effect = results.__iter__ - mock_session.run.return_value = mock_result + mock_session.read_transaction.return_value = mock_result data = FileSystem.load("tests/data/detectors/test_multiple_properties.json") state_old = StateSchema().load(data) state_new = State(state_old.name, state_old.validation_query, state_old.properties, []) get_state(mock_session, state_new) state_new.properties = state_old.properties drifts = compare_states(state_old, state_new) - mock_session.run.assert_called_with(state_new.validation_query) + mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query) assert ["7", "14", ["21", "28", "35"]] in drifts assert ["3", "10", ["17", "24", "31"]] not in drifts From 1e9d4a72b3da793fc3535a66f99e61b3823fbfd4 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Thu, 15 Jun 2023 09:49:36 -0700 Subject: [PATCH 02/18] Move GH outside-collab cleanup job to correct place (#1187) Previously GitHub outside collaborators were cleaned up by the GH user sync. This is incorrect behavior because outside collaborators are actually synced in by the GH repo sync. This can be problematic if the GH sync exhausts API quota and crashes in the middle, leaving the graph in an inconsistent state. This quick patch makes it so that the outside collaborators are properly synced in and cleaned up by the GH repo sync. 
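
Each relocated statement is unchanged and follows cartography's usual stale-data cleanup pattern; for example, the admin-level outside-collaborator cleanup now runs as part of the repos cleanup job:

```json
{
  "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_ADMIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)",
  "iterative": true,
  "iterationsize": 100
}
```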
--- .../jobs/cleanup/github_repos_cleanup.json | 25 +++++++++++++++++++ .../jobs/cleanup/github_users_cleanup.json | 25 ------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/cartography/data/jobs/cleanup/github_repos_cleanup.json b/cartography/data/jobs/cleanup/github_repos_cleanup.json index f840f89c9f..0d5bc41112 100644 --- a/cartography/data/jobs/cleanup/github_repos_cleanup.json +++ b/cartography/data/jobs/cleanup/github_repos_cleanup.json @@ -38,6 +38,31 @@ "query": "MATCH (:GitHubRepository)-[r:REQUIRES]->(:PythonLibrary) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", "iterative": true, "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_ADMIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_MAINTAIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_READ]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_TRIAGE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_WRITE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 }], "name": "cleanup GitHub repos data" } diff --git a/cartography/data/jobs/cleanup/github_users_cleanup.json b/cartography/data/jobs/cleanup/github_users_cleanup.json index a120d387fa..4419d8d650 100644 --- a/cartography/data/jobs/cleanup/github_users_cleanup.json +++ b/cartography/data/jobs/cleanup/github_users_cleanup.json @@ -14,31 +14,6 @@ "iterative": true, "iterationsize": 100 }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_ADMIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_MAINTAIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_READ]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_TRIAGE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_WRITE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, { "query": "MATCH (:GitHubUser)-[r:MEMBER_OF]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", "iterative": true, From 80d764ae75e56bfe3913902a286059cd2160dd58 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Thu, 15 Jun 2023 13:17:14 -0700 Subject: [PATCH 03/18] 0.80.0 (#1188) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 
62846a3c65..774cbae0b3 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import find_packages
 from setuptools import setup
 
-__version__ = '0.79.2'
+__version__ = '0.80.0'
 
 setup(
 
From e8c558178259df4dbc81f92f4b77a480c40aed72 Mon Sep 17 00:00:00 2001
From: Jeremy Chapeau <113923302+resilience-jychp@users.noreply.github.com>
Date: Fri, 23 Jun 2023 18:20:33 +0200
Subject: [PATCH 04/18] Hotfix (#1190)

This PR fixes two bugs in the current version:
- #1189
- #1182
---
 cartography/cli.py                | 3 +++
 cartography/intel/github/repos.py | 5 ++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cartography/cli.py b/cartography/cli.py
index 4ae159e40d..db4041f8f9 100644
--- a/cartography/cli.py
+++ b/cartography/cli.py
@@ -665,6 +665,9 @@ def main(self, argv: str) -> int:
             )
             config.duo_api_key = os.environ.get(config.duo_api_key_env_var)
             config.duo_api_secret = os.environ.get(config.duo_api_secret_env_var)
+        else:
+            config.duo_api_key = None
+            config.duo_api_secret = None
 
         # Run cartography
         try:
diff --git a/cartography/intel/github/repos.py b/cartography/intel/github/repos.py
index 8fcadbd7f0..238537e033 100644
--- a/cartography/intel/github/repos.py
+++ b/cartography/intel/github/repos.py
@@ -312,14 +312,13 @@ def _transform_python_requirements(
             continue
         try:
             req = Requirement(stripped_line)
+            parsed_list.append(req)
         except InvalidRequirement:
             # INFO and not WARN/ERROR as we intentionally don't support all ways to specify Python requirements
             logger.info(
                 f"Failed to parse line \"{line}\" in repo {repo_url}'s requirements.txt; skipping line.",
                 exc_info=True,
             )
-            continue
-        parsed_list.append(req)
 
     for req in parsed_list:
         pinned_version = None
@@ -563,5 +562,5 @@ def sync(
     logger.info("Syncing GitHub repos")
     repos_json = get(github_api_key, github_url, organization)
     repo_data = transform(repos_json)
-    load(neo4j_session, repo_data, common_job_parameters['UPDATE_TAG'])
+    load(neo4j_session, common_job_parameters, repo_data)
     run_cleanup_job('github_repos_cleanup.json', neo4j_session, common_job_parameters)
 
From 143a443fd1a62bff8abb561cb365376a91c16b22 Mon Sep 17 00:00:00 2001
From: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com>
Date: Fri, 23 Jun 2023 12:40:17 -0400
Subject: [PATCH 05/18] 0.81.0 (#1191)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 774cbae0b3..07a7f9cee9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import find_packages
 from setuptools import setup
 
-__version__ = '0.80.0'
+__version__ = '0.81.0'
 
 setup(
 
From 7c2829166c209cb1622669c5da3cbcbbcbb66b20 Mon Sep 17 00:00:00 2001
From: Hector Eryx Paredes Camacho
Date: Mon, 10 Jul 2023 15:56:22 -0600
Subject: [PATCH 06/18] Set to None the gsuite token instead of github one (#1204)

---
 cartography/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cartography/cli.py b/cartography/cli.py
index db4041f8f9..cd01e02409 100644
--- a/cartography/cli.py
+++ b/cartography/cli.py
@@ -638,7 +638,7 @@ def main(self, argv: str) -> int:
             logger.debug(f"Reading config string for GSuite from environment variable {config.gsuite_tokens_env_var}")
             config.gsuite_config = os.environ.get(config.gsuite_tokens_env_var)
         else:
-            config.github_config = None
+            config.gsuite_tokens_env_var = None
 
         # Lastpass config
         if config.lastpass_cid_env_var:
 
From 48f50caed6ac889225e94611b1315f6a95c19470 Mon Sep 17 00:00:00 2001
From: Alex Chantavy
Date: Thu, 13 Jul 2023 19:17:23 -0700
Subject: [PATCH 07/18] Update usage docs and roadmap links 
(#1196) Adds docs on - discovering what other nodes are connected to a given node type - discovering what node properties are present on a given node type Updates docs on - roadmap link - making syncmetadata docs more discoverable --- README.md | 13 +-- .../modules/_cartography-metadata/schema.md | 18 +++ docs/root/usage/schema.md | 1 + docs/root/usage/tutorial.md | 105 +++++++++++++++--- docs/schema/syncmetadata.md | 18 +-- 5 files changed, 113 insertions(+), 42 deletions(-) create mode 100644 docs/root/modules/_cartography-metadata/schema.md diff --git a/README.md b/README.md index 62ae48bbdf..cac7c3d266 100644 --- a/README.md +++ b/README.md @@ -39,16 +39,13 @@ Start [here](https://lyft.github.io/cartography/install.html). ## Usage Start with our [tutorial](https://lyft.github.io/cartography/usage/tutorial.html). Our [data schema](https://lyft.github.io/cartography/usage/schema.html) is a helpful reference when you get stuck. -## Contact +## Community - Join us on `#cartography` on the [Lyft OSS Slack](https://join.slack.com/t/lyftoss/shared_invite/enQtOTYzODg5OTQwNDE2LTFiYjgwZWM3NTNhMTFkZjc4Y2IxOTI4NTdiNTdhNjQ4M2Q5NTIzMjVjOWI4NmVlNjRiZmU2YzA5NTc3MmFjYTQ). - -## Community Meeting - -Talk to us and see what we're working on at our [monthly community meeting](https://calendar.google.com/calendar/embed?src=lyft.com_p10o6ceuiieq9sqcn1ef61v1io%40group.calendar.google.com&ctz=America%2FLos_Angeles). -- Meeting minutes are [here](https://docs.google.com/document/d/1VyRKmB0dpX185I15BmNJZpfAJ_Ooobwz0U1WIhjDxvw). -- Recorded videos are posted [here](https://www.youtube.com/playlist?list=PLMga2YJvAGzidUWJB_fnG7EHI4wsDDsE1). -- Our current project road map is [here](https://docs.google.com/document/d/18MOsGI-isFvag1fGk718Aht7wQPueWd4SqOI9KapBa8/edit#heading=h.15nsmgmjaaml). +- Talk to us and see what we're working on at our [monthly community meeting](https://calendar.google.com/calendar/embed?src=lyft.com_p10o6ceuiieq9sqcn1ef61v1io%40group.calendar.google.com&ctz=America%2FLos_Angeles). + - Meeting minutes are [here](https://docs.google.com/document/d/1VyRKmB0dpX185I15BmNJZpfAJ_Ooobwz0U1WIhjDxvw). + - Recorded videos are posted [here](https://www.youtube.com/playlist?list=PLMga2YJvAGzidUWJB_fnG7EHI4wsDDsE1). +- Our current project roadmap is [here](https://github.com/orgs/lyft/projects/26/views/1). ## Contributing Thank you for considering contributing to Cartography! diff --git a/docs/root/modules/_cartography-metadata/schema.md b/docs/root/modules/_cartography-metadata/schema.md new file mode 100644 index 0000000000..878d8268df --- /dev/null +++ b/docs/root/modules/_cartography-metadata/schema.md @@ -0,0 +1,18 @@ +## Cartography metadata schema + +.. _metadata_schema: + +Some Cartography sync jobs write nodes to convey information about the job itself. See https://github.com/lyft/cartography/issues/758 for more background on this. + +### SyncMetadata:ModuleSyncMetadata + +This is a node to represent metadata about the sync job of a particular module. Its existence indicates that a particular sync job did happen. +The 'types' used here should be actual node labels. For example, if we did sync a particular AWSAccount's S3Buckets, +the `grouptype` is 'AWSAccount', the `groupid` is the particular account's `id`, and the `syncedtype` is 'S3Bucket'. 
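+
+For example, a completed S3Bucket sync of a hypothetical AWSAccount `000000000000` would be recorded with id `AWSAccount_000000000000_S3Bucket`, which you could look up like so:
+
+```cypher
+MATCH (m:ModuleSyncMetadata{id: "AWSAccount_000000000000_S3Bucket"}) RETURN m;
+```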
+ +| Field | Description | Source| +|-------|-------------|------| +|**id**|`{group_type}_{group_id}_{synced_type}`|util.py| +|grouptype| The parent module's type |util.py| +|groupid|The parent module's id|util.py| +|syncedtype|The sub-module's type|util.py| diff --git a/docs/root/usage/schema.md b/docs/root/usage/schema.md index 3d6da845c4..a5f1d101d4 100644 --- a/docs/root/usage/schema.md +++ b/docs/root/usage/schema.md @@ -22,6 +22,7 @@ - In these docs, more specific nodes will be decorated with `GenericNode::SpecificNode` notation. For example, if we have a `Car` node and a `RaceCar` node, we will refer to the `RaceCar` as `Car::RaceCar`. +.. mdinclude:: ../modules/_cartography-metadata/schema.md .. mdinclude:: ../modules/aws/schema.md .. mdinclude:: ../modules/azure/schema.md .. mdinclude:: ../modules/crxcavator/schema.md diff --git a/docs/root/usage/tutorial.md b/docs/root/usage/tutorial.md index 357c8a7f2e..f0cb52fa9b 100644 --- a/docs/root/usage/tutorial.md +++ b/docs/root/usage/tutorial.md @@ -2,24 +2,16 @@ Once everything has been installed and synced, you can view the Neo4j web interface at http://localhost:7474. You can view the reference on this [here](https://neo4j.com/developer/guide-neo4j-browser/#_installing_and_starting_neo4j_browser). -### Permalinking Bookmarklet +If you already know Neo4j and just need to know what are the nodes, attributes, and graph relationships for our representation of infrastructure assets, you can view our [sample queries](samplequeries.html). More sample queries are available at https://github.com/marco-lancini/cartography-queries. -You can set up a bookmarklet that lets you quickly get a permalink to a Cartography query. To do so, add a bookmark with the following contents as the URL - make sure to replace `neo4j.contoso.com:7474` with your instance of Neo4j: +Otherwise, read on for this handhold-y tutorial filled with examples. Suppose we wanted to find out: -```javascript -javascript:(() => { const query = document.querySelectorAll('article label span')[0].innerText; if (query === ':server connect') { console.log('no query has been run!'); return; } const searchParams = new URLSearchParams(); searchParams.append('connectURL', 'bolt://neo4j:neo4j@neo4j.contoso.net:7687'); searchParams.append('cmd', 'edit'); searchParams.append('arg', query.replaceAll(/\r /g, '\r')); newURL = `http://neo4j.contoso.net:7474/browser/?${searchParams}`; window.open(newURL, '_blank', 'noopener'); })() -``` - -Then, any time you are in the web interface, you can click the bookmarklet to open a new tab with a permalink to your most recently executed query in the URL bar. - -### ℹī¸ Already know [how to query Neo4j](https://neo4j.com/developer/cypher-query-language/)? You can skip to our reference material! -If you already know Neo4j and just need to know what are the nodes, attributes, and graph relationships for our representation of infrastructure assets, you can skip this handholdy walkthrough and see our [sample queries](samplequeries.md). - -### What [RDS](https://aws.amazon.com/rds/) instances are installed in my [AWS](https://aws.amazon.com/) accounts? -``` +### What [RDS](https://aws.amazon.com/rds/) instances are installed in my AWS accounts? +```cypher MATCH (aws:AWSAccount)-[r:RESOURCE]->(rds:RDSInstance) return * ``` + ![Visualization of RDS nodes and AWS nodes](../images/accountsandrds.png) In this query we asked Neo4j to find all `[:RESOURCE]` relationships from AWSAccounts to RDSInstances, and return the nodes and the `:RESOURCE` relationships. 
@@ -35,7 +27,7 @@ and then pick options on the menu that shows up at the bottom of the view like t ### Which RDS instances have [encryption](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Overview.Encryption.html) turned off? -``` +```cypher MATCH (a:AWSAccount)-[:RESOURCE]->(rds:RDSInstance{storage_encrypted:false}) RETURN a.name, rds.id ``` @@ -49,7 +41,7 @@ If you want to go back to viewing the graph and not a table, simply make sure yo Let's look at some other AWS assets now. ### Which [EC2](https://aws.amazon.com/ec2/) instances are directly exposed to the internet? -``` +```cypher MATCH (instance:EC2Instance{exposed_internet: true}) RETURN instance.instanceid, instance.publicdnsname ``` @@ -60,7 +52,7 @@ These instances are open to the internet either through permissive inbound IP pe If you know a lot about AWS, you may have noticed that EC2 instances [don't actually have an exposed_internet field](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_Instance.html). We're able to query for this because Cartography performs some [data enrichment](#data-enrichment) to add this field to EC2Instance nodes. ### Which [S3](https://aws.amazon.com/s3/) buckets have a policy granting any level of anonymous access to the bucket? -``` +```cypher MATCH (s:S3Bucket) WHERE s.anonymous_access = true RETURN s @@ -76,13 +68,81 @@ A couple of other things to notice: instead of using the "{}" notation to filter Let's go back to analyzing RDS instances. In an earlier example we queried for RDS instances that have encryption turned off. We can aggregate this data by AWSAccount with a small change: -``` +```cypher MATCH (a:AWSAccount)-[:RESOURCE]->(rds:RDSInstance) WHERE rds.storage_encrypted = false RETURN a.name as AWSAccount, count(rds) as UnencryptedInstances ``` ![Table of unencrypted RDS instances by AWS account](../images/unencryptedcounts.png) + +### Given a node label, what other node labels can be connected to it? + +Suppose we wanted to know what other assets can be connected to a DNSRecord. We would ask the graph like this: + +```cypher +match (d:DNSRecord)--(n) +return distinct labels(n); +``` + +This says "what are the possible labels for all nodes connected to all DNSRecord nodes `d` in my graph?" Your answer might look like this: + +``` +["AWSDNSRecord", "DNSRecord"] +["AWSDNSZone", "DNSZone"] +["LoadBalancerV2"] +["NameServer"] +["ESDomain"] +["LoadBalancer"] +["EC2Instance", "Instance"] +``` + +You can then make the path more specific like this: + +```cypher +match (d:DNSRecord)--(:EC2Instance)--(n) +return distinct labels(n); +``` + +And then you can continue building your query. + +We also include [full schema docs](schema.html), but this way of building a query can be faster and more interactive. + + +### Given a node label, what are the possible property names defined on it? 
+ +We can find what properties are available on an S3Bucket like this: + +```cypher +match (n:S3Bucket) return properties(n) limit 1; +``` + +The result will look like this: + +``` +{ + "bucket_key_enabled": false, + "creationdate": "2022-05-10 00:22:52+00:00", + "ignore_public_acls": true, + "anonymous_access": false, + "firstseen": 1652400141863, + "block_public_policy": true, + "versioning_status": "Enabled", + "block_public_acls": true, + "anonymous_actions": [], + "name": "my-fake-bucket-123", + "lastupdated": 1688605272, + "encryption_algorithm": "AES256", + "default_encryption": true, + "id": "my-fake-bucket-123", + "arn": "arn:aws:s3:::my-fake-bucket-123", + "restrict_public_buckets": false +} +``` + +Our [full schema docs](schema.html) describe all possible fields, but listing out properties this way lets you avoid switching between browser tabs. + + ### Learning more If you want to learn more in depth about Neo4j and Cypher queries you can look at [this tutorial](https://neo4j.com/developer/cypher-query-language/) and see this [reference card](https://neo4j.com/docs/cypher-refcard/current/). @@ -117,3 +177,14 @@ You can add your own custom attributes and relationships without writing Python ### Mapping AWS Access Permissions Cartography can map permissions between IAM Principals and resources in the graph. Here's [how](../modules/aws/permissions-mapping.html). + + +### Permalinking Bookmarklet + +You can set up a bookmarklet that lets you quickly get a permalink to a Cartography query. To do so, add a bookmark with the following contents as the URL - make sure to replace `neo4j.contoso.com:7474` with your instance of Neo4j: + +```javascript +javascript:(() => { const query = document.querySelectorAll('article label span')[0].innerText; if (query === ':server connect') { console.log('no query has been run!'); return; } const searchParams = new URLSearchParams(); searchParams.append('connectURL', 'bolt://neo4j:neo4j@neo4j.contoso.net:7687'); searchParams.append('cmd', 'edit'); searchParams.append('arg', query.replaceAll(/\r /g, '\r')); newURL = `http://neo4j.contoso.net:7474/browser/?${searchParams}`; window.open(newURL, '_blank', 'noopener'); })() +``` + +Then, any time you are in the web interface, you can click the bookmarklet to open a new tab with a permalink to your most recently executed query in the URL bar. diff --git a/docs/schema/syncmetadata.md b/docs/schema/syncmetadata.md index baad4be28e..7572a7ff29 100644 --- a/docs/schema/syncmetadata.md +++ b/docs/schema/syncmetadata.md @@ -1,17 +1 @@ -## SyncMetadata - -SyncMetadata nodes are created by sync jobs to convey information about the job itself. See this doc for how this is -used. - -## SyncMetadata:ModuleSyncMetadata - -This is a node to represent some metadata about the sync job of a particular module or sub-module. Its existence should suggest that a paritcular sync job did happen. -The 'types' used here should be actual node labels. For example, if we did sync a particular AWSAccount's S3Buckets, -the `grouptype` is 'AWSAccount', the `groupid` is the particular account's `id`, and the `syncedtype` is 'S3Bucket'. 
- -| Field | Description | Source| -|-------|-------------|------| -|**id**|`{group_type}_{group_id}_{synced_type}`|util.py| -|grouptype| The parent module's type |util.py| -|groupid|The parent module's id|util.py| -|syncedtype|The sub-module's type|util.py| +This document has been moved [here](https://lyft.github.io/cartography/modules/_cartography-metadata/schema.html) From fb3247a8f902a86bdb3d0a6099f4bdacdeccc0fb Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Fri, 14 Jul 2023 13:16:49 -0700 Subject: [PATCH 08/18] #1210: EBSVolume => new data model, Allow node attr updates from multiple intel modules (#1214) See #1210 for full context. #1154 tried to solve this problem by updating the querybuilder but this was too complex and would not generalize well. This solution is simpler where we use different property classes for each API response so that we don't overwrite properties on a node set by another sync job. This PR can be reviewed commit-by-commit: - c0d9ac4cc35fd93e260964cfa41a813e2a2032a7 shows a repro of the error with a failing integration test. - facb63bcbac6b68eec0fd2ea3f6b0550ac40eb10 shows the solution using multiple classes. --------- Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> --- cartography/intel/aws/ec2/instances.py | 4 +- cartography/intel/aws/ec2/volumes.py | 118 +++++------ cartography/intel/aws/util/arns.py | 18 ++ cartography/models/aws/ec2/volumes.py | 46 ++++- tests/data/aws/ec2/volumes.py | 4 +- .../intel/aws/ec2/test_ec2_volumes.py | 194 +++++++++++------- 6 files changed, 237 insertions(+), 147 deletions(-) create mode 100644 cartography/intel/aws/util/arns.py diff --git a/cartography/intel/aws/ec2/instances.py b/cartography/intel/aws/ec2/instances.py index 1c69eb06a8..87c7e32028 100644 --- a/cartography/intel/aws/ec2/instances.py +++ b/cartography/intel/aws/ec2/instances.py @@ -17,7 +17,7 @@ from cartography.models.aws.ec2.reservations import EC2ReservationSchema from cartography.models.aws.ec2.securitygroups import EC2SecurityGroupSchema from cartography.models.aws.ec2.subnets import EC2SubnetSchema -from cartography.models.aws.ec2.volumes import EBSVolumeSchema +from cartography.models.aws.ec2.volumes import EBSVolumeInstanceSchema from cartography.util import aws_handle_regions from cartography.util import timeit @@ -273,7 +273,7 @@ def load_ec2_instance_ebs_volumes( ) -> None: load( neo4j_session, - EBSVolumeSchema(), + EBSVolumeInstanceSchema(), ebs_data, Region=region, AWS_ID=current_aws_account_id, diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py index 6de03c7d42..3ad50f1186 100644 --- a/cartography/intel/aws/ec2/volumes.py +++ b/cartography/intel/aws/ec2/volumes.py @@ -6,7 +6,9 @@ import boto3 import neo4j +from cartography.client.core.tx import load from cartography.graph.job import GraphJob +from cartography.intel.aws.util.arns import build_arn from cartography.models.aws.ec2.volumes import EBSVolumeSchema from cartography.util import aws_handle_regions from cartography.util import timeit @@ -16,7 +18,7 @@ @timeit @aws_handle_regions -def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict]: +def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict[str, Any]]: client = boto3_session.client('ec2', region_name=region) paginator = client.get_paginator('describe_volumes') volumes: List[Dict] = [] @@ -26,90 +28,76 @@ def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict] def transform_volumes(volumes: 
List[Dict[str, Any]], region: str, current_aws_account_id: str) -> List[Dict[str, Any]]: + result = [] for volume in volumes: - volume['VolumeArn'] = f"arn:aws:ec2:{region}:{current_aws_account_id}:volume/{volume['VolumeId']}" - volume['CreateTime'] = str(volume['CreateTime']) - return volumes + attachments = volume.get('Attachments', []) + active_attachments = [a for a in attachments if a['State'] == 'attached'] + + volume_id = volume['VolumeId'] + raw_vol = ({ + 'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region), + 'AvailabilityZone': volume['AvailabilityZone'], + 'CreateTime': volume['CreateTime'], + 'Encrypted': volume['Encrypted'], + 'Size': volume['Size'], + 'State': volume['State'], + 'OutpostArn': volume['OutpostArn'], + 'SnapshotId': volume['SnapshotId'], + 'Iops': volume['Iops'], + 'FastRestored': volume['FastRestored'], + 'MultiAttachEnabled': volume['MultiAttachEnabled'], + 'VolumeType': volume['VolumeType'], + 'VolumeId': volume_id, + 'KmsKeyId': volume['KmsKeyId'], + }) + + if not active_attachments: + result.append(raw_vol) + continue + + for attachment in active_attachments: + vol_with_attachment = raw_vol.copy() + vol_with_attachment['InstanceId'] = attachment['InstanceId'] + result.append(vol_with_attachment) + + return result @timeit def load_volumes( - neo4j_session: neo4j.Session, data: List[Dict], region: str, current_aws_account_id: str, update_tag: int, + neo4j_session: neo4j.Session, + ebs_data: List[Dict[str, Any]], + region: str, + current_aws_account_id: str, + update_tag: int, ) -> None: - ingest_volumes = """ - UNWIND $volumes_list as volume - MERGE (vol:EBSVolume{id: volume.VolumeId}) - ON CREATE SET vol.firstseen = timestamp() - SET vol.arn = volume.VolumeArn, - vol.lastupdated = $update_tag, - vol.availabilityzone = volume.AvailabilityZone, - vol.createtime = volume.CreateTime, - vol.encrypted = volume.Encrypted, - vol.size = volume.Size, - vol.state = volume.State, - vol.outpostarn = volume.OutpostArn, - vol.snapshotid = volume.SnapshotId, - vol.iops = volume.Iops, - vol.fastrestored = volume.FastRestored, - vol.multiattachenabled = volume.MultiAttachEnabled, - vol.type = volume.VolumeType, - vol.kmskeyid = volume.KmsKeyId, - vol.region=$Region - WITH vol - MATCH (aa:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (aa)-[r:RESOURCE]->(vol) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $update_tag - """ - - neo4j_session.run( - ingest_volumes, - volumes_list=data, - AWS_ACCOUNT_ID=current_aws_account_id, + load( + neo4j_session, + EBSVolumeSchema(), + ebs_data, Region=region, - update_tag=update_tag, + AWS_ID=current_aws_account_id, + lastupdated=update_tag, ) -def load_volume_relationships( - neo4j_session: neo4j.Session, - volumes: List[Dict[str, Any]], - aws_update_tag: int, -) -> None: - add_relationship_query = """ - MATCH (volume:EBSVolume{arn: $VolumeArn}) - WITH volume - MATCH (instance:EC2Instance{instanceid: $InstanceId}) - MERGE (volume)-[r:ATTACHED_TO_EC2_INSTANCE]->(instance) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - """ - for volume in volumes: - for attachment in volume.get('Attachments', []): - if attachment['State'] != 'attached': - continue - neo4j_session.run( - add_relationship_query, - VolumeArn=volume['VolumeArn'], - InstanceId=attachment['InstanceId'], - aws_update_tag=aws_update_tag, - ) - - @timeit -def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: +def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: 
Dict[str, Any]) -> None: GraphJob.from_node_schema(EBSVolumeSchema(), common_job_parameters).run(neo4j_session) @timeit def sync_ebs_volumes( - neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], - current_aws_account_id: str, update_tag: int, common_job_parameters: Dict, + neo4j_session: neo4j.Session, + boto3_session: boto3.session.Session, + regions: List[str], + current_aws_account_id: str, + update_tag: int, + common_job_parameters: Dict[str, Any], ) -> None: for region in regions: logger.debug("Syncing volumes for region '%s' in account '%s'.", region, current_aws_account_id) data = get_volumes(boto3_session, region) transformed_data = transform_volumes(data, region, current_aws_account_id) load_volumes(neo4j_session, transformed_data, region, current_aws_account_id, update_tag) - load_volume_relationships(neo4j_session, transformed_data, update_tag) cleanup_volumes(neo4j_session, common_job_parameters) diff --git a/cartography/intel/aws/util/arns.py b/cartography/intel/aws/util/arns.py new file mode 100644 index 0000000000..e6108b82c3 --- /dev/null +++ b/cartography/intel/aws/util/arns.py @@ -0,0 +1,18 @@ +from typing import Optional + + +def build_arn( + resource: str, + account: str, + typename: str, + name: str, + region: Optional[str] = None, + partition: Optional[str] = None, +) -> str: + if not partition: + # TODO: support partitions from others. Please file an issue on this if needed, would love to hear from you + partition = 'aws' + if not region: + # Some resources are present in all regions, e.g. IAM policies + region = "" + return f"arn:{partition}:{resource}:{region}:{account}:{typename}/{name}" diff --git a/cartography/models/aws/ec2/volumes.py b/cartography/models/aws/ec2/volumes.py index 2140f4fcd0..bb6925780c 100644 --- a/cartography/models/aws/ec2/volumes.py +++ b/cartography/models/aws/ec2/volumes.py @@ -13,10 +13,23 @@ @dataclass(frozen=True) class EBSVolumeNodeProperties(CartographyNodeProperties): + arn: PropertyRef = PropertyRef('Arn', extra_index=True) id: PropertyRef = PropertyRef('VolumeId') + volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True) region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) - deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination') + availabilityzone: PropertyRef = PropertyRef('AvailabilityZone') + createtime: PropertyRef = PropertyRef('CreateTime') + encrypted: PropertyRef = PropertyRef('Encrypted') + size: PropertyRef = PropertyRef('Size') + state: PropertyRef = PropertyRef('State') + outpostarn: PropertyRef = PropertyRef('OutpostArn') + snapshotid: PropertyRef = PropertyRef('SnapshotId') + iops: PropertyRef = PropertyRef('Iops') + fastrestored: PropertyRef = PropertyRef('FastRestored') + multiattachenabled: PropertyRef = PropertyRef('MultiAttachEnabled') + type: PropertyRef = PropertyRef('VolumeType') + kmskeyid: PropertyRef = PropertyRef('KmsKeyId') @dataclass(frozen=True) @@ -53,6 +66,9 @@ class EBSVolumeToEC2Instance(CartographyRelSchema): @dataclass(frozen=True) class EBSVolumeSchema(CartographyNodeSchema): + """ + EBS Volume properties as returned from the EBS Volume API response + """ label: str = 'EBSVolume' properties: EBSVolumeNodeProperties = EBSVolumeNodeProperties() sub_resource_relationship: EBSVolumeToAWSAccount = EBSVolumeToAWSAccount() @@ -61,3 +77,31 @@ class EBSVolumeSchema(CartographyNodeSchema): EBSVolumeToEC2Instance(), ], ) + + +@dataclass(frozen=True) +class 
EBSVolumeInstanceProperties(CartographyNodeProperties): + """ + EBS Volume properties as known by an EC2 instance. + The EC2 instance API response includes a `deleteontermination` field and the volume id. + """ + arn: PropertyRef = PropertyRef('Arn', extra_index=True) + id: PropertyRef = PropertyRef('VolumeId') + volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination') + + +@dataclass(frozen=True) +class EBSVolumeInstanceSchema(CartographyNodeSchema): + """ + EBS Volume from EC2 Instance API response. This is separate from `EBSVolumeSchema` to prevent issue #1210. + """ + label: str = 'EBSVolume' + properties: EBSVolumeInstanceProperties = EBSVolumeInstanceProperties() + sub_resource_relationship: EBSVolumeToAWSAccount = EBSVolumeToAWSAccount() + other_relationships: OtherRelationships = OtherRelationships( + [ + EBSVolumeToEC2Instance(), + ], + ) diff --git a/tests/data/aws/ec2/volumes.py b/tests/data/aws/ec2/volumes.py index e3a29c03fa..6e08003e73 100644 --- a/tests/data/aws/ec2/volumes.py +++ b/tests/data/aws/ec2/volumes.py @@ -14,7 +14,7 @@ 'Size': 123, 'SnapshotId': 'sn-01', 'State': 'available', - 'VolumeId': 'v-01', + 'VolumeId': 'vol-0df', 'Iops': 123, 'VolumeType': 'standard', 'FastRestored': True, @@ -33,7 +33,7 @@ 'OutpostArn': 'arn1', 'Size': 123, 'State': 'available', - 'VolumeId': 'v-02', + 'VolumeId': 'vol-03', 'Iops': 123, 'SnapshotId': 'sn-02', 'VolumeType': 'standard', diff --git a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py index 705d2db3ec..b852581303 100644 --- a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py +++ b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py @@ -3,94 +3,116 @@ import cartography.intel.aws.ec2.instances import cartography.intel.aws.ec2.volumes -import tests.data.aws.ec2.instances -import tests.data.aws.ec2.volumes from cartography.intel.aws.ec2.instances import sync_ec2_instances +from cartography.intel.aws.ec2.volumes import sync_ebs_volumes from tests.data.aws.ec2.instances import DESCRIBE_INSTANCES - +from tests.data.aws.ec2.volumes import DESCRIBE_VOLUMES +from tests.integration.cartography.intel.aws.common import create_test_account +from tests.integration.util import check_nodes +from tests.integration.util import check_rels TEST_ACCOUNT_ID = '000000000000' TEST_REGION = 'eu-west-1' TEST_UPDATE_TAG = 123456789 -def test_load_volumes(neo4j_session): +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes(mock_get_vols, neo4j_session): # Arrange - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) # Act - cartography.intel.aws.ec2.volumes.load_volumes( + sync_ebs_volumes( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) # Assert - expected_nodes = { - "v-01", "v-02", + assert check_nodes(neo4j_session, 'EBSVolume', ['arn']) == { + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-03',), + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-0df',), } - nodes = neo4j_session.run( 
- """ - MATCH (r:EBSVolume) RETURN r.id; - """, - ) - actual_nodes = {n['r.id'] for n in nodes} - - assert actual_nodes == expected_nodes - - -def test_load_volume_to_account_rels(neo4j_session): + # Assert + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + (TEST_ACCOUNT_ID, 'vol-03'), + (TEST_ACCOUNT_ID, 'vol-0df'), + } - # Arrange: Create Test AWSAccount - neo4j_session.run( - """ - MERGE (aws:AWSAccount{id: $aws_account_id}) - ON CREATE SET aws.firstseen = timestamp() - SET aws.lastupdated = $aws_update_tag - """, - aws_account_id=TEST_ACCOUNT_ID, - aws_update_tag=TEST_UPDATE_TAG, - ) - # Act: Load Test Volumes - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) +@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes_e2e(mock_get_vols, mock_get_instances, neo4j_session): + # Arrange + neo4j_session.run('MATCH (n) DETACH DELETE n;') + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) - cartography.intel.aws.ec2.volumes.load_volumes( + # Act: sync_ec2_instances() loads attached ebs volumes + sync_ec2_instances( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Assert - expected = { - (TEST_ACCOUNT_ID, 'v-01'), - (TEST_ACCOUNT_ID, 'v-02'), + # Assert that deleteontermination is set by sync_ec2_instances. The encrypted property isn't returned by this API. 
+ assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + ('vol-03', True, None), + ('vol-04', True, None), + ('vol-09', True, None), + ('vol-0df', True, None), } - result = neo4j_session.run( - """ - MATCH (n1:AWSAccount)-[:RESOURCE]->(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - actual = { - (r['n1.id'], r['n2.id']) for r in result + # Assert that they are attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - assert actual == expected - + # Assert that we created the account to volume rels correctly + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + ('000000000000', 'vol-03'), + ('000000000000', 'vol-04'), + ('000000000000', 'vol-09'), + ('000000000000', 'vol-0df'), + } -@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) -def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): - # Arrange: Load in ec2 instances first - boto3_session = MagicMock() - sync_ec2_instances( + # Act + sync_ebs_volumes( neo4j_session, boto3_session, [TEST_REGION], @@ -98,28 +120,46 @@ def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): TEST_UPDATE_TAG, {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Prep the volume data - raw_volumes = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_volumes = cartography.intel.aws.ec2.volumes.transform_volumes(raw_volumes, TEST_REGION, TEST_ACCOUNT_ID) - # Act - cartography.intel.aws.ec2.volumes.load_volume_relationships( - neo4j_session, - transformed_volumes, - TEST_UPDATE_TAG, - ) + # Assert that additional fields such as `encrypted` have been added by sync_ebs_volumes(), while + # deleteontermination has not been overwritten with None by sync_ebs_volumes() + assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + # Attached to the instances initially + ('vol-04', True, None), + ('vol-09', True, None), + # Added by ebs sync + ('vol-03', True, True), + ('vol-0df', True, True), + } - # Assert - result = neo4j_session.run( - """ - MATCH (n1:EC2Instance)<-[:ATTACHED_TO_EC2_INSTANCE]-(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - expected = { - ('i-01', 'v-01'), - ('i-02', 'v-02'), + # Assert that they are still attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - actual = { - (r['n1.id'], r['n2.id']) for r in result + + # Assert that the account to volume rels still exist + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + ('000000000000', 'vol-03'), + ('000000000000', 'vol-04'), + ('000000000000', 'vol-09'), + ('000000000000', 'vol-0df'), } - assert actual == expected From ff943385c566108b83e972ac5d7788f2685a8e8a Mon Sep 17 00:00:00 2001 From: Dean Liu Date: Mon, 17 Jul 2023 11:08:29 -0700 Subject: [PATCH 09/18] Fix index out of range for drift detection returning no results (#1220) It's possible for neo4j sessions `read_transaction` in 
`get_state` to return an empty list in the drift detection module. This PR ensures that there are entries before referencing index 0. ``` File "/code/venvs/venv/lib/python3.8/site-packages/cartography/driftdetect/get_states.py", line 123, in get_query_state get_state(session, state) File "/code/venvs/venv/lib/python3.8/site-packages/cartography/driftdetect/get_states.py", line 148, in get_state state.properties = list(new_results[0].keys()) IndexError: list index out of range ``` --- cartography/driftdetect/get_states.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cartography/driftdetect/get_states.py b/cartography/driftdetect/get_states.py index eaa40bbba9..b7e0575618 100644 --- a/cartography/driftdetect/get_states.py +++ b/cartography/driftdetect/get_states.py @@ -145,9 +145,9 @@ def get_state(session: neo4j.Session, state: State) -> None: logger.debug(f"Updating results for {state.name}") # The keys will be the same across all items in the returned list - state.properties = list(new_results[0].keys()) - results = [] + state.properties = list(new_results[0].keys()) if len(new_results) > 0 else [] + results = [] for record in new_results: values = [] for field in record.values(): From 50040a9f7421954846e273ef5da410b6e7f10fd4 Mon Sep 17 00:00:00 2001 From: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> Date: Mon, 17 Jul 2023 14:36:53 -0400 Subject: [PATCH 10/18] 0.82.0 (#1221) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 07a7f9cee9..bbfdb79531 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages from setuptools import setup -__version__ = '0.81.0' +__version__ = '0.82.0' setup( From c78da1a1b70027d2ba3e46ce6225fe27e15811ee Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Mon, 24 Jul 2023 08:56:23 -0700 Subject: [PATCH 11/18] Add contributing guidelines for issues (#1226) Instruct how we plan to use the Discussions feature --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cac7c3d266..11208559fb 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,12 @@ Thank you for considering contributing to Cartography! Legal stuff: This project is governed by [Lyft's code of conduct](https://github.com/lyft/code-of-conduct). All contributors and participants agree to abide by its terms. +### Bug reports and feature requests and discussions +Submit a GitHub issue to report a bug or request a new feature. If we decide that the issue needs more discussion - usually because the scope is too large or we need to make careful decision - we will convert the issue to a [GitHub Discussion](https://github.com/lyft/cartography/discussions). + ### Developing Cartography -Get started with our [developer documentation](https://lyft.github.io/cartography/dev/developer-guide.html). +Get started with our [developer documentation](https://lyft.github.io/cartography/dev/developer-guide.html). Please feel free to submit your own PRs to update documentation if you've found a better way to explain something. #### Sign the Contributor License Agreement (CLA) From 361fc5db55fb191a5b25332a1f52900f45e93aae Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Wed, 2 Aug 2023 10:06:35 -0700 Subject: [PATCH 12/18] #1216: AWS SSM->new data model (#1217) Refactors AWS SSM module to use cartography's data model. 
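
At a high level, each SSM loader now follows the same schema-driven pattern as
the EBS volume refactor above. A minimal sketch using the names introduced in
this diff (not a drop-in snippet):

```python
from cartography.client.core.tx import load
from cartography.models.aws.ssm.instance_information import SSMInstanceInformationSchema

# transform_instance_information() converts the datetime fields from the SSM
# API response into epoch ints; load() then generates the ingestion query from
# the schema class instead of hand-written Cypher.
data = transform_instance_information(instance_information)
load(
    neo4j_session,
    SSMInstanceInformationSchema(),
    data,
    Region=region,
    AWS_ID=current_aws_account_id,
    lastupdated=aws_update_tag,
)
```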
--------- Co-authored-by: Ryan Lane Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> --- cartography/data/indexes.cypher | 4 - .../jobs/cleanup/aws_import_ssm_cleanup.json | 25 --- cartography/intel/aws/ssm.py | 145 +++++++----------- cartography/models/aws/ssm/__init__.py | 0 .../models/aws/ssm/instance_information.py | 82 ++++++++++ cartography/models/aws/ssm/instance_patch.py | 70 +++++++++ .../cartography/intel/aws/test_ssm.py | 11 +- 7 files changed, 214 insertions(+), 123 deletions(-) delete mode 100644 cartography/data/jobs/cleanup/aws_import_ssm_cleanup.json create mode 100644 cartography/models/aws/ssm/__init__.py create mode 100644 cartography/models/aws/ssm/instance_information.py create mode 100644 cartography/models/aws/ssm/instance_patch.py diff --git a/cartography/data/indexes.cypher b/cartography/data/indexes.cypher index 072599f719..085f84e270 100644 --- a/cartography/data/indexes.cypher +++ b/cartography/data/indexes.cypher @@ -321,10 +321,6 @@ CREATE INDEX IF NOT EXISTS FOR (n:SpotlightVulnerability) ON (n.host_info_local_ CREATE INDEX IF NOT EXISTS FOR (n:SpotlightVulnerability) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:SQSQueue) ON (n.id); CREATE INDEX IF NOT EXISTS FOR (n:SQSQueue) ON (n.lastupdated); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstanceInformation) ON (n.id); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstanceInformation) ON (n.lastupdated); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstancePatch) ON (n.id); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstancePatch) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:User) ON (n.arn); CREATE INDEX IF NOT EXISTS FOR (n:User) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:AzureTenant) ON (n.id); diff --git a/cartography/data/jobs/cleanup/aws_import_ssm_cleanup.json b/cartography/data/jobs/cleanup/aws_import_ssm_cleanup.json deleted file mode 100644 index 7f77a3c84d..0000000000 --- a/cartography/data/jobs/cleanup/aws_import_ssm_cleanup.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "statements": [ - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(n:SSMInstanceInformation) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(:EC2Instance)-[r:HAS_INFORMATION]->(:SSMInstanceInformation) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(n:SSMInstancePatch) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(:EC2Instance)-[r:HAS_PATCH]->(:SSMInstancePatch) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - } - ], - "name": "cleanup SSM" -} diff --git a/cartography/intel/aws/ssm.py b/cartography/intel/aws/ssm.py index cd69499d6a..937cba52bf 100644 --- a/cartography/intel/aws/ssm.py +++ b/cartography/intel/aws/ssm.py @@ -6,9 +6,12 @@ import boto3 import neo4j +from cartography.client.core.tx import load +from cartography.graph.job import GraphJob +from cartography.models.aws.ssm.instance_information import SSMInstanceInformationSchema +from cartography.models.aws.ssm.instance_patch import SSMInstancePatchSchema from cartography.util import aws_handle_regions from cartography.util import 
dict_date_to_epoch -from cartography.util import run_cleanup_job from cartography.util import timeit logger = logging.getLogger(__name__) @@ -31,7 +34,9 @@ def get_instance_ids(neo4j_session: neo4j.Session, region: str, current_aws_acco @timeit @aws_handle_regions def get_instance_information( - boto3_session: boto3.session.Session, region: str, instance_ids: List[str], + boto3_session: boto3.session.Session, + region: str, + instance_ids: List[str], ) -> List[Dict[str, Any]]: client = boto3_session.client('ssm', region_name=region) instance_information: List[Dict[str, Any]] = [] @@ -46,10 +51,21 @@ def get_instance_information( return instance_information +def transform_instance_information(data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + for ii in data_list: + ii["LastPingDateTime"] = dict_date_to_epoch(ii, "LastPingDateTime") + ii["RegistrationDate"] = dict_date_to_epoch(ii, "RegistrationDate") + ii["LastAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastAssociationExecutionDate") + ii["LastSuccessfulAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastSuccessfulAssociationExecutionDate") + return data_list + + @timeit @aws_handle_regions def get_instance_patches( - boto3_session: boto3.session.Session, region: str, instance_ids: List[str], + boto3_session: boto3.session.Session, + region: str, + instance_ids: List[str], ) -> List[Dict[str, Any]]: client = boto3_session.client('ssm', region_name=region) instance_patches: List[Dict[str, Any]] = [] @@ -65,6 +81,16 @@ def get_instance_patches( return instance_patches +def transform_instance_patches(data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + for p in data_list: + p["Id"] = f"{p['_instance_id']}-{p['Title']}" + p["InstalledTime"] = dict_date_to_epoch(p, "InstalledTime") + # Split the comma separated CVEIds, if they exist, and strip + # the empty string from the list if not. 
+ p["CVEIds"] = list(filter(None, p.get("CVEIds", "").split(","))) + return data_list + + @timeit def load_instance_information( neo4j_session: neo4j.Session, @@ -73,55 +99,13 @@ def load_instance_information( current_aws_account_id: str, aws_update_tag: int, ) -> None: - ingest_query = """ - UNWIND $InstanceInformation AS instance - MERGE (i:SSMInstanceInformation{id: instance.InstanceId}) - ON CREATE SET i.firstseen = timestamp() - SET i.instance_id = instance.InstanceId, - i.ping_status = instance.PingStatus, - i.last_ping_date_time = instance.LastPingDateTime, - i.agent_version = instance.AgentVersion, - i.is_latest_version = instance.IsLatestVersion, - i.platform_type = instance.PlatformType, - i.platform_name = instance.PlatformName, - i.platform_version = instance.PlatformVersion, - i.activation_id = instance.ActivationId, - i.iam_role = instance.IamRole, - i.registration_date = instance.RegistrationDate, - i.resource_type = instance.ResourceType, - i.name = instance.Name, - i.ip_address = instance.IPAddress, - i.computer_name = instance.ComputerName, - i.association_status = instance.AssociationStatus, - i.last_association_execution_date = instance.LastAssociationExecutionDate, - i.last_successful_association_execution_date = instance.LastSuccessfulAssociationExecutionDate, - i.source_id = instance.SourceId, - i.source_type = instance.SourceType, - i.region = $Region, - i.lastupdated = $aws_update_tag - WITH i - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (owner)-[r:RESOURCE]->(i) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - WITH i - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID})-[:RESOURCE]->(ec2_instance:EC2Instance{id: i.instance_id}) - MERGE (ec2_instance)-[r2:HAS_INFORMATION]->(i) - ON CREATE SET r2.firstseen = timestamp() - SET r2.lastupdated = $aws_update_tag - """ - for ii in data: - ii["LastPingDateTime"] = dict_date_to_epoch(ii, "LastPingDateTime") - ii["RegistrationDate"] = dict_date_to_epoch(ii, "RegistrationDate") - ii["LastAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastAssociationExecutionDate") - ii["LastSuccessfulAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastSuccessfulAssociationExecutionDate") - - neo4j_session.run( - ingest_query, - InstanceInformation=data, + load( + neo4j_session, + SSMInstanceInformationSchema(), + data, Region=region, - AWS_ACCOUNT_ID=current_aws_account_id, - aws_update_tag=aws_update_tag, + AWS_ID=current_aws_account_id, + lastupdated=aws_update_tag, ) @@ -133,61 +117,40 @@ def load_instance_patches( current_aws_account_id: str, aws_update_tag: int, ) -> None: - ingest_query = """ - UNWIND $InstancePatch AS patch - MERGE (p:SSMInstancePatch{id: patch._instance_id + "-" + patch.Title}) - ON CREATE SET p.firstseen = timestamp() - SET p.instance_id = patch._instance_id, - p.title = patch.Title, - p.kb_id = patch.KBId, - p.classification = patch.Classification, - p.severity = patch.Severity, - p.state = patch.State, - p.installed_time = patch.InstalledTime, - p.cve_ids = patch.CVEIds, - p.region = $Region, - p.lastupdated = $aws_update_tag - WITH p - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (owner)-[r:RESOURCE]->(p) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - WITH p - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID})-[:RESOURCE]->(ec2_instance:EC2Instance{id: p.instance_id}) - MERGE (ec2_instance)-[r2:HAS_PATCH]->(p) - ON CREATE SET r2.firstseen = timestamp() - SET r2.lastupdated = $aws_update_tag - """ - for p in data: - 
p["InstalledTime"] = dict_date_to_epoch(p, "InstalledTime") - # Split the comma separated CVEIds, if they exist, and strip - # the empty string from the list if not. - p["CVEIds"] = list(filter(None, p.get("CVEIds", "").split(","))) - - neo4j_session.run( - ingest_query, - InstancePatch=data, + load( + neo4j_session, + SSMInstancePatchSchema(), + data, Region=region, - AWS_ACCOUNT_ID=current_aws_account_id, - aws_update_tag=aws_update_tag, + AWS_ID=current_aws_account_id, + lastupdated=aws_update_tag, ) @timeit -def cleanup_ssm(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: - run_cleanup_job('aws_import_ssm_cleanup.json', neo4j_session, common_job_parameters) +def cleanup_ssm(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None: + logger.info("Running SSM cleanup") + GraphJob.from_node_schema(SSMInstanceInformationSchema(), common_job_parameters).run(neo4j_session) + GraphJob.from_node_schema(SSMInstancePatchSchema(), common_job_parameters).run(neo4j_session) @timeit def sync( - neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], current_aws_account_id: str, - update_tag: int, common_job_parameters: Dict, + neo4j_session: neo4j.Session, + boto3_session: boto3.session.Session, + regions: List[str], + current_aws_account_id: str, + update_tag: int, + common_job_parameters: Dict[str, Any], ) -> None: for region in regions: logger.info("Syncing SSM for region '%s' in account '%s'.", region, current_aws_account_id) instance_ids = get_instance_ids(neo4j_session, region, current_aws_account_id) data = get_instance_information(boto3_session, region, instance_ids) + data = transform_instance_information(data) load_instance_information(neo4j_session, data, region, current_aws_account_id, update_tag) + data = get_instance_patches(boto3_session, region, instance_ids) + data = transform_instance_patches(data) load_instance_patches(neo4j_session, data, region, current_aws_account_id, update_tag) cleanup_ssm(neo4j_session, common_job_parameters) diff --git a/cartography/models/aws/ssm/__init__.py b/cartography/models/aws/ssm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cartography/models/aws/ssm/instance_information.py b/cartography/models/aws/ssm/instance_information.py new file mode 100644 index 0000000000..b678ebe30d --- /dev/null +++ b/cartography/models/aws/ssm/instance_information.py @@ -0,0 +1,82 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SSMInstanceInformationNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('InstanceId') + instance_id: PropertyRef = PropertyRef('InstanceId', extra_index=True) + region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + ping_status: PropertyRef = PropertyRef('PingStatus') + last_ping_date_time: PropertyRef = 
PropertyRef('LastPingDateTime') + agent_version: PropertyRef = PropertyRef('AgentVersion') + is_latest_version: PropertyRef = PropertyRef('IsLatestVersion') + platform_type: PropertyRef = PropertyRef('PlatformType') + platform_name: PropertyRef = PropertyRef('PlatformName') + platform_version: PropertyRef = PropertyRef('PlatformVersion') + activation_id: PropertyRef = PropertyRef('ActivationId') + iam_role: PropertyRef = PropertyRef('IamRole') + registration_date: PropertyRef = PropertyRef('RegistrationDate') + resource_type: PropertyRef = PropertyRef('ResourceType') + name: PropertyRef = PropertyRef('Name') + ip_address: PropertyRef = PropertyRef('IPAddress') + computer_name: PropertyRef = PropertyRef('ComputerName') + association_status: PropertyRef = PropertyRef('AssociationStatus') + last_association_execution_date: PropertyRef = PropertyRef('LastAssociationExecutionDate') + last_successful_association_execution_date: PropertyRef = PropertyRef('LastSuccessfulAssociationExecutionDate') + source_id: PropertyRef = PropertyRef('SourceId') + source_type: PropertyRef = PropertyRef('SourceType') + + +@dataclass(frozen=True) +class SSMInstanceInformationToAWSAccountRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstanceInformationToAWSAccount(CartographyRelSchema): + target_node_label: str = 'AWSAccount' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('AWS_ID', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: SSMInstanceInformationToAWSAccountRelProperties = SSMInstanceInformationToAWSAccountRelProperties() + + +@dataclass(frozen=True) +class SSMInstanceInformationToEC2InstanceRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstanceInformationToEC2Instance(CartographyRelSchema): + target_node_label: str = 'EC2Instance' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('InstanceId')}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "HAS_INFORMATION" + properties: SSMInstanceInformationToEC2InstanceRelProperties = SSMInstanceInformationToEC2InstanceRelProperties() + + +@dataclass(frozen=True) +class SSMInstanceInformationSchema(CartographyNodeSchema): + label: str = 'SSMInstanceInformation' + properties: SSMInstanceInformationNodeProperties = SSMInstanceInformationNodeProperties() + sub_resource_relationship: SSMInstanceInformationToAWSAccount = SSMInstanceInformationToAWSAccount() + other_relationships: OtherRelationships = OtherRelationships( + [ + SSMInstanceInformationToEC2Instance(), + ], + ) diff --git a/cartography/models/aws/ssm/instance_patch.py b/cartography/models/aws/ssm/instance_patch.py new file mode 100644 index 0000000000..25686f5b80 --- /dev/null +++ b/cartography/models/aws/ssm/instance_patch.py @@ -0,0 +1,70 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships 
import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SSMInstancePatchNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('Id') + instance_id: PropertyRef = PropertyRef('_instance_id', extra_index=True) + region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + title: PropertyRef = PropertyRef('Title', extra_index=True) + kb_id: PropertyRef = PropertyRef('KBId', extra_index=True) + classification: PropertyRef = PropertyRef('Classification') + severity: PropertyRef = PropertyRef('Severity') + state: PropertyRef = PropertyRef('State') + installed_time: PropertyRef = PropertyRef('InstalledTime') + cve_ids: PropertyRef = PropertyRef('CVEIds') + + +@dataclass(frozen=True) +class SSMInstancePatchToAWSAccountRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstancePatchToAWSAccount(CartographyRelSchema): + target_node_label: str = 'AWSAccount' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('AWS_ID', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: SSMInstancePatchToAWSAccountRelProperties = SSMInstancePatchToAWSAccountRelProperties() + + +@dataclass(frozen=True) +class SSMInstancePatchToEC2InstanceRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstancePatchToEC2Instance(CartographyRelSchema): + target_node_label: str = 'EC2Instance' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('_instance_id')}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "HAS_PATCH" + properties: SSMInstancePatchToEC2InstanceRelProperties = SSMInstancePatchToEC2InstanceRelProperties() + + +@dataclass(frozen=True) +class SSMInstancePatchSchema(CartographyNodeSchema): + label: str = 'SSMInstancePatch' + properties: SSMInstancePatchNodeProperties = SSMInstancePatchNodeProperties() + sub_resource_relationship: SSMInstancePatchToAWSAccount = SSMInstancePatchToAWSAccount() + other_relationships: OtherRelationships = OtherRelationships( + [ + SSMInstancePatchToEC2Instance(), + ], + ) diff --git a/tests/integration/cartography/intel/aws/test_ssm.py b/tests/integration/cartography/intel/aws/test_ssm.py index a27e1a3f4f..8b31471e7c 100644 --- a/tests/integration/cartography/intel/aws/test_ssm.py +++ b/tests/integration/cartography/intel/aws/test_ssm.py @@ -28,13 +28,16 @@ def _ensure_load_instances(neo4j_session): @patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) def test_load_instance_information(mock_get_instances, neo4j_session): + # Arrange # load account and instances, to be able to test relationships create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) _ensure_load_instances(neo4j_session) + # Act + data_list = cartography.intel.aws.ssm.transform_instance_information(tests.data.aws.ssm.INSTANCE_INFORMATION) cartography.intel.aws.ssm.load_instance_information( neo4j_session, - tests.data.aws.ssm.INSTANCE_INFORMATION, + data_list, TEST_REGION, TEST_ACCOUNT_ID, TEST_UPDATE_TAG, @@ -84,15 +87,17 @@ 
def test_load_instance_information(mock_get_instances, neo4j_session): assert actual_nodes == {"i-02"} -def test_load_instance_patches(neo4j_session): +@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) +def test_load_instance_patches(mock_get_instances, neo4j_session): # Arrange: load account and instances, to be able to test relationships create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) _ensure_load_instances(neo4j_session) # Act + data_list = cartography.intel.aws.ssm.transform_instance_patches(tests.data.aws.ssm.INSTANCE_PATCHES) cartography.intel.aws.ssm.load_instance_patches( neo4j_session, - tests.data.aws.ssm.INSTANCE_PATCHES, + data_list, TEST_REGION, TEST_ACCOUNT_ID, TEST_UPDATE_TAG, From b0a58a5d720c53a2fc73121d1e67b66456cbc5cc Mon Sep 17 00:00:00 2001 From: Hector Eryx Paredes Camacho Date: Thu, 3 Aug 2023 13:08:32 -0600 Subject: [PATCH 13/18] Adds support to query Semgrep API to ingest SCA vulns (#1224) Adds a new Schema and Intel Job to query Semgrep Enterprise API and ingest Semgrep Supply Chain (SSC) findings. The schema connects a Semgrep Deployment with an id specific to a customer in Semgrep Enterprise to an SCA finding and location as a sub resource relationships. It also connects to a Github repository to match findings against where they were found. Each finding can have a location with the specific lines of code where the vulnerable dependency is being used. ![SemgrepCartographyfinal](https://github.com/lyft/cartography/assets/9236431/9a99ecdd-b40f-430e-bff5-fe950f4c713e) --------- Co-authored-by: Alex Chantavy --- cartography/cli.py | 16 ++ cartography/config.py | 4 + cartography/intel/semgrep/__init__.py | 23 ++ cartography/intel/semgrep/findings.py | 217 ++++++++++++++++ cartography/models/semgrep/__init__.py | 0 cartography/models/semgrep/deployment.py | 19 ++ cartography/models/semgrep/findings.py | 80 ++++++ cartography/models/semgrep/locations.py | 69 +++++ docs/root/modules/semgrep/config.md | 9 + docs/root/modules/semgrep/index.rst | 13 + docs/root/modules/semgrep/schema.md | 91 +++++++ docs/schema/README.md | 3 + tests/data/semgrep/__init__.py | 0 tests/data/semgrep/sca.py | 117 +++++++++ .../cartography/intel/semgrep/__init__.py | 0 .../intel/semgrep/test_findings.py | 240 ++++++++++++++++++ 16 files changed, 901 insertions(+) create mode 100644 cartography/intel/semgrep/__init__.py create mode 100644 cartography/intel/semgrep/findings.py create mode 100644 cartography/models/semgrep/__init__.py create mode 100644 cartography/models/semgrep/deployment.py create mode 100644 cartography/models/semgrep/findings.py create mode 100644 cartography/models/semgrep/locations.py create mode 100644 docs/root/modules/semgrep/config.md create mode 100644 docs/root/modules/semgrep/index.rst create mode 100644 docs/root/modules/semgrep/schema.md create mode 100644 tests/data/semgrep/__init__.py create mode 100644 tests/data/semgrep/sca.py create mode 100644 tests/integration/cartography/intel/semgrep/__init__.py create mode 100644 tests/integration/cartography/intel/semgrep/test_findings.py diff --git a/cartography/cli.py b/cartography/cli.py index cd01e02409..0a3287eb12 100644 --- a/cartography/cli.py +++ b/cartography/cli.py @@ -500,6 +500,15 @@ def _build_parser(self): 'The Duo api hostname' ), ) + parser.add_argument( + '--semgrep-app-token-env-var', + type=str, + default=None, + help=( + 'The name of environment variable containing the Semgrep app token key. 
' + 'Required if you are using the Semgrep intel module. Ignored otherwise.' + ), + ) return parser def main(self, argv: str) -> int: @@ -669,6 +678,13 @@ def main(self, argv: str) -> int: config.duo_api_key = None config.duo_api_secret = None + # Semgrep config + if config.semgrep_app_token_env_var: + logger.debug(f"Reading Semgrep App Token from environment variable {config.semgrep_app_token_env_var}") + config.semgrep_app_token = os.environ.get(config.semgrep_app_token_env_var) + else: + config.semgrep_app_token = None + # Run cartography try: return cartography.sync.run_with_config(self.sync, config) diff --git a/cartography/config.py b/cartography/config.py index 7485b16058..556d7082a1 100644 --- a/cartography/config.py +++ b/cartography/config.py @@ -103,6 +103,8 @@ class Config: :param duo_api_key: The Duo api secret. Optional. :type duo_api_hostname: str :param duo_api_hostname: The Duo api hostname, e.g. "api-abc123.duosecurity.com". Optional. + :param semgrep_app_token: The Semgrep api token. Optional. + :type semgrep_app_token: str """ def __init__( @@ -157,6 +159,7 @@ def __init__( duo_api_key=None, duo_api_secret=None, duo_api_hostname=None, + semgrep_app_token=None, ): self.neo4j_uri = neo4j_uri self.neo4j_user = neo4j_user @@ -208,3 +211,4 @@ def __init__( self.duo_api_key = duo_api_key self.duo_api_secret = duo_api_secret self.duo_api_hostname = duo_api_hostname + self.semgrep_app_token = semgrep_app_token diff --git a/cartography/intel/semgrep/__init__.py b/cartography/intel/semgrep/__init__.py new file mode 100644 index 0000000000..dbd72f1dd3 --- /dev/null +++ b/cartography/intel/semgrep/__init__.py @@ -0,0 +1,23 @@ +import logging + +import neo4j + +from cartography.config import Config +from cartography.intel.semgrep.findings import sync +from cartography.util import timeit + + +logger = logging.getLogger(__name__) + + +@timeit +def start_semgrep_ingestion( + neo4j_session: neo4j.Session, config: Config, +) -> None: + common_job_parameters = { + "UPDATE_TAG": config.update_tag, + } + if not config.semgrep_app_token: + logger.info('Semgrep import is not configured - skipping this module. See docs to configure.') + return + sync(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters) diff --git a/cartography/intel/semgrep/findings.py b/cartography/intel/semgrep/findings.py new file mode 100644 index 0000000000..8612df28f8 --- /dev/null +++ b/cartography/intel/semgrep/findings.py @@ -0,0 +1,217 @@ +import logging +from typing import Any +from typing import Dict +from typing import List +from typing import Tuple + +import neo4j +import requests + +from cartography.client.core.tx import load +from cartography.graph.job import GraphJob +from cartography.models.semgrep.deployment import SemgrepDeploymentSchema +from cartography.models.semgrep.findings import SemgrepSCAFindingSchema +from cartography.models.semgrep.locations import SemgrepSCALocationSchema +from cartography.stats import get_stats_client +from cartography.util import merge_module_sync_metadata +from cartography.util import timeit + +logger = logging.getLogger(__name__) +stat_handler = get_stats_client(__name__) +_TIMEOUT = (60, 60) + + +@timeit +def get_deployment(semgrep_app_token: str) -> Dict[str, Any]: + """ + Gets the deployment associated with the passed Semgrep App token. + param: semgrep_app_token: The Semgrep App token to use for authentication. 
+ """ + deployment = {} + deployment_url = "https://semgrep.dev/api/v1/deployments" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {semgrep_app_token}", + } + response = requests.get(deployment_url, headers=headers, timeout=_TIMEOUT) + response.raise_for_status() + + data = response.json() + deployment["id"] = data["deployments"][0]["id"] + deployment["name"] = data["deployments"][0]["name"] + deployment["slug"] = data["deployments"][0]["slug"] + + return deployment + + +@timeit +def get_sca_vulns(semgrep_app_token: str, deployment_id: str) -> List[Dict[str, Any]]: + """ + Gets the SCA vulns associated with the passed Semgrep App token and deployment id. + param: semgrep_app_token: The Semgrep App token to use for authentication. + param: deployment_id: The Semgrep deployment id to use for retrieving SCA vulns. + """ + all_vulns = [] + sca_url = f"https://semgrep.dev/api/sca/deployments/{deployment_id}/vulns" + has_more = True + cursor = "" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {semgrep_app_token}", + } + + while has_more: + params = {} + if cursor: + params = {"cursor": cursor} + + response = requests.get(sca_url, params=params, headers=headers, timeout=_TIMEOUT) + response.raise_for_status() + data = response.json() + vulns = data["vulns"] + cursor = data.get("cursor") + has_more = data.get("hasMore", False) + all_vulns.extend(vulns) + + return all_vulns + + +def transform_sca_vulns(raw_vulns: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, str]]]: + """ + Transforms the raw SCA vulns response from Semgrep API into a list of dicts + that can be used to create the SemgrepSCAFinding nodes. + """ + vulns = [] + usages = [] + for vuln in raw_vulns: + sca_vuln: Dict[str, Any] = {} + # Mandatory fields + unique_id = f"{vuln['repositoryName']}|{vuln['advisory']['ruleId']}" + sca_vuln["id"] = unique_id + sca_vuln["repositoryName"] = vuln["repositoryName"] + sca_vuln["ruleId"] = vuln["advisory"]["ruleId"] + sca_vuln["title"] = vuln["advisory"]["title"] + sca_vuln["description"] = vuln["advisory"]["description"] + sca_vuln["ecosystem"] = vuln["advisory"]["ecosystem"] + sca_vuln["severity"] = vuln["advisory"]["severity"] + sca_vuln["reachability"] = vuln["advisory"]["reachability"] + sca_vuln["reachableIf"] = vuln["advisory"]["reachableIf"] + sca_vuln["exposureType"] = vuln["exposureType"] + dependency = f"{vuln['matchedDependency']['name']}|{vuln['matchedDependency']['versionSpecifier']}" + sca_vuln["matchedDependency"] = dependency + sca_vuln["dependencyFileLocation_path"] = vuln["dependencyFileLocation"]["path"] + sca_vuln["dependencyFileLocation_url"] = vuln["dependencyFileLocation"]["url"] + # Optional fields + sca_vuln["transitivity"] = vuln.get("transitivity", None) + cves = vuln.get("advisory", {}).get("references", {}).get("cveIds") + if len(cves) > 0: + # Take the first CVE + sca_vuln["cveId"] = vuln["advisory"]["references"]["cveIds"][0] + if vuln.get('closestSafeDependency'): + dep_fix = f"{vuln['closestSafeDependency']['name']}|{vuln['closestSafeDependency']['versionSpecifier']}" + sca_vuln["closestSafeDependency"] = dep_fix + if vuln["advisory"].get("references", {}).get("urls", []): + sca_vuln["ref_urls"] = vuln["advisory"].get("references", {}).get("urls", []) + sca_vuln["openedAt"] = vuln.get("openedAt", None) + for usage in vuln.get("usages", []): + usage_dict = {} + usage_dict["SCA_ID"] = unique_id + usage_dict["findingId"] = usage["findingId"] + usage_dict["path"] = 
usage["location"]["path"] + usage_dict["startLine"] = usage["location"]["startLine"] + usage_dict["startCol"] = usage["location"]["startCol"] + usage_dict["endLine"] = usage["location"]["endLine"] + usage_dict["endCol"] = usage["location"]["endCol"] + usage_dict["url"] = usage["location"]["url"] + usages.append(usage_dict) + vulns.append(sca_vuln) + return vulns, usages + + +@timeit +def load_semgrep_deployment( + neo4j_session: neo4j.Session, deployment: Dict[str, Any], update_tag: int, +) -> None: + logger.info(f"Loading Semgrep deployment info {deployment} into the graph...") + load( + neo4j_session, + SemgrepDeploymentSchema(), + [deployment], + lastupdated=update_tag, + ) + + +@timeit +def load_semgrep_sca_vulns( + neo4j_session: neo4j.Session, + vulns: List[Dict[str, Any]], + deployment_id: str, + update_tag: int, +) -> None: + logger.info(f"Loading {len(vulns)} Semgrep SCA vulns info into the graph.") + load( + neo4j_session, + SemgrepSCAFindingSchema(), + vulns, + lastupdated=update_tag, + DEPLOYMENT_ID=deployment_id, + ) + + +@timeit +def load_semgrep_sca_usages( + neo4j_session: neo4j.Session, + usages: List[Dict[str, Any]], + deployment_id: str, + update_tag: int, +) -> None: + logger.info(f"Loading {len(usages)} Semgrep SCA usages info into the graph.") + load( + neo4j_session, + SemgrepSCALocationSchema(), + usages, + lastupdated=update_tag, + DEPLOYMENT_ID=deployment_id, + ) + + +@timeit +def cleanup( + neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any], +) -> None: + logger.info("Running Semgrep SCA findings cleanup job.") + findings_cleanup_job = GraphJob.from_node_schema( + SemgrepSCAFindingSchema(), common_job_parameters, + ) + findings_cleanup_job.run(neo4j_session) + logger.info("Running Semgrep SCA Locations cleanup job.") + locations_cleanup_job = GraphJob.from_node_schema( + SemgrepSCALocationSchema(), common_job_parameters, + ) + locations_cleanup_job.run(neo4j_session) + + +@timeit +def sync( + neo4j_sesion: neo4j.Session, + semgrep_app_token: str, + update_tag: int, + common_job_parameters: Dict[str, Any], +) -> None: + logger.info("Running Semgrep SCA findings sync job.") + semgrep_deployment = get_deployment(semgrep_app_token) + load_semgrep_deployment(neo4j_sesion, semgrep_deployment, update_tag) + common_job_parameters["DEPLOYMENT_ID"] = semgrep_deployment["id"] + raw_vulns = get_sca_vulns(semgrep_app_token, semgrep_deployment["id"]) + vulns, usages = transform_sca_vulns(raw_vulns) + load_semgrep_sca_vulns(neo4j_sesion, vulns, semgrep_deployment["id"], update_tag) + load_semgrep_sca_usages(neo4j_sesion, usages, semgrep_deployment["id"], update_tag) + cleanup(neo4j_sesion, common_job_parameters) + merge_module_sync_metadata( + neo4j_session=neo4j_sesion, + group_type='Semgrep', + group_id=common_job_parameters["DEPLOYMENT_ID"], + synced_type='SCA', + update_tag=update_tag, + stat_handler=stat_handler, + ) diff --git a/cartography/models/semgrep/__init__.py b/cartography/models/semgrep/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cartography/models/semgrep/deployment.py b/cartography/models/semgrep/deployment.py new file mode 100644 index 0000000000..6248e01491 --- /dev/null +++ b/cartography/models/semgrep/deployment.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema + + +@dataclass(frozen=True) +class 
SemgrepDeploymentProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('id') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + name: PropertyRef = PropertyRef('name', extra_index=True) + slug: PropertyRef = PropertyRef('slug', extra_index=True) + + +@dataclass(frozen=True) +class SemgrepDeploymentSchema(CartographyNodeSchema): + label: str = 'SemgrepDeployment' + properties: SemgrepDeploymentProperties = SemgrepDeploymentProperties() diff --git a/cartography/models/semgrep/findings.py b/cartography/models/semgrep/findings.py new file mode 100644 index 0000000000..79b2e7011d --- /dev/null +++ b/cartography/models/semgrep/findings.py @@ -0,0 +1,80 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SemgrepSCAFindingNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('id') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + rule_id: PropertyRef = PropertyRef('ruleId', extra_index=True) + repository: PropertyRef = PropertyRef('repositoryName', extra_index=True) + summary: PropertyRef = PropertyRef('title', extra_index=True) + description: PropertyRef = PropertyRef('description') + package_manager: PropertyRef = PropertyRef('ecosystem') + severity: PropertyRef = PropertyRef('severity') + cve_id: PropertyRef = PropertyRef('cveId', extra_index=True) + reachability_check: PropertyRef = PropertyRef('reachability') + reachability_condition: PropertyRef = PropertyRef('reachableIf') + reachability: PropertyRef = PropertyRef('exposureType') + transitivity: PropertyRef = PropertyRef('transitivity') + dependency: PropertyRef = PropertyRef('matchedDependency') + dependency_fix: PropertyRef = PropertyRef('closestSafeDependency') + ref_urls: PropertyRef = PropertyRef('ref_urls') + dependency_file: PropertyRef = PropertyRef('dependencyFileLocation_path', extra_index=True) + dependency_file_url: PropertyRef = PropertyRef('dependencyFileLocation_url', extra_index=True) + scan_time: PropertyRef = PropertyRef('openedAt') + + +@dataclass(frozen=True) +class SemgrepSCAFindingToSemgrepDeploymentRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCAFinding)<-[:RESOURCE]-(:SemgrepDeployment) +class SemgrepSCAFindingToSemgrepDeploymentSchema(CartographyRelSchema): + target_node_label: str = 'SemgrepDeployment' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('DEPLOYMENT_ID', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: SemgrepSCAFindingToSemgrepDeploymentRelProperties = SemgrepSCAFindingToSemgrepDeploymentRelProperties() + + +@dataclass(frozen=True) +class SemgrepSCAFindingToGithubRepoRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = 
PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCAFinding)-[:FOUND_IN]->(:GitHubRepository) +class SemgrepSCAFindingToGithubRepoRel(CartographyRelSchema): + target_node_label: str = 'GitHubRepository' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'fullname': PropertyRef('repositoryName')}, + ) + direction: LinkDirection = LinkDirection.OUTWARD + rel_label: str = "FOUND_IN" + properties: SemgrepSCAFindingToGithubRepoRelProperties = SemgrepSCAFindingToGithubRepoRelProperties() + + +@dataclass(frozen=True) +class SemgrepSCAFindingSchema(CartographyNodeSchema): + label: str = 'SemgrepSCAFinding' + properties: SemgrepSCAFindingNodeProperties = SemgrepSCAFindingNodeProperties() + sub_resource_relationship: SemgrepSCAFindingToSemgrepDeploymentSchema = SemgrepSCAFindingToSemgrepDeploymentSchema() + other_relationships: OtherRelationships = OtherRelationships( + [ + SemgrepSCAFindingToGithubRepoRel(), + ], + ) diff --git a/cartography/models/semgrep/locations.py b/cartography/models/semgrep/locations.py new file mode 100644 index 0000000000..74a054d700 --- /dev/null +++ b/cartography/models/semgrep/locations.py @@ -0,0 +1,69 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SemgrepSCALocationProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('findingId') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + path: PropertyRef = PropertyRef('path', extra_index=True) + start_line: PropertyRef = PropertyRef('startLine') + start_col: PropertyRef = PropertyRef('startCol') + end_line: PropertyRef = PropertyRef('endLine') + end_col: PropertyRef = PropertyRef('endCol') + url: PropertyRef = PropertyRef('url') + + +@dataclass(frozen=True) +class SemgrepSCALocToSemgrepSCAFindingRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCALocation)<-[:USAGE_AT]-(:SemgrepSCAFinding) +class SemgrepSCALocToSemgrepSCAFindingRelSchema(CartographyRelSchema): + target_node_label: str = 'SemgrepSCAFinding' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('SCA_ID')}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "USAGE_AT" + properties: SemgrepSCALocToSemgrepSCAFindingRelProperties = SemgrepSCALocToSemgrepSCAFindingRelProperties() + + +@dataclass(frozen=True) +class SemgrepSCALocToSemgrepSCADeploymentRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCALocation)<-[:RESOURCE]-(:SemgrepSCADeployment) +class SemgrepSCALocToSCADeploymentRelSchema(CartographyRelSchema): + target_node_label: str = 'SemgrepDeployment' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': 
PropertyRef('DEPLOYMENT_ID', set_in_kwargs=True)},
+    )
+    direction: LinkDirection = LinkDirection.INWARD
+    rel_label: str = "RESOURCE"
+    properties: SemgrepSCALocToSemgrepSCADeploymentRelProperties = SemgrepSCALocToSemgrepSCADeploymentRelProperties()
+
+
+@dataclass(frozen=True)
+class SemgrepSCALocationSchema(CartographyNodeSchema):
+    label: str = 'SemgrepSCALocation'
+    properties: SemgrepSCALocationProperties = SemgrepSCALocationProperties()
+    sub_resource_relationship: SemgrepSCALocToSCADeploymentRelSchema = SemgrepSCALocToSCADeploymentRelSchema()
+    other_relationships: OtherRelationships = OtherRelationships(
+        [
+            SemgrepSCALocToSemgrepSCAFindingRelSchema(),
+        ],
+    )
diff --git a/docs/root/modules/semgrep/config.md b/docs/root/modules/semgrep/config.md
new file mode 100644
index 0000000000..6beb6616c5
--- /dev/null
+++ b/docs/root/modules/semgrep/config.md
@@ -0,0 +1,9 @@
+## Semgrep Configuration
+
+.. _semgrep_config:
+
+Follow these steps to ingest Semgrep findings with Cartography.
+
+1. Create a token with *Agent (CI)* and *Web API* scopes: [Creating a SEMGREP_APP_TOKEN](https://semgrep.dev/docs/semgrep-ci/running-semgrep-ci-with-semgrep-cloud-platform/#creating-a-semgrep_app_token).
+1. Populate an environment variable with the secret value of the token.
+1. Pass the environment variable name to the `--semgrep-app-token-env-var` CLI arg.
diff --git a/docs/root/modules/semgrep/index.rst b/docs/root/modules/semgrep/index.rst
new file mode 100644
index 0000000000..54a4d625b9
--- /dev/null
+++ b/docs/root/modules/semgrep/index.rst
@@ -0,0 +1,13 @@
+Semgrep
+#######
+
+The Semgrep module has the following coverage:
+
+* Deployment
+* SCA Findings
+
+.. toctree::
+    :hidden:
+    :glob:
+
+    *
diff --git a/docs/root/modules/semgrep/schema.md b/docs/root/modules/semgrep/schema.md
new file mode 100644
index 0000000000..415e946c2c
--- /dev/null
+++ b/docs/root/modules/semgrep/schema.md
@@ -0,0 +1,91 @@
+## Semgrep Schema
+
+.. _semgrep_schema:
+
+### SemgrepDeployment
+
+Represents a Semgrep [Deployment](https://semgrep.dev/api/v1/docs/#tag/Deployment), a unit encapsulating a security organization inside Semgrep Cloud. Works as the parent of all other Semgrep resources.
+
+| Field | Description |
+|-------|--------------|
+| firstseen | Timestamp of when a sync job first discovered this node |
+| lastupdated | Timestamp of the last time the node was updated |
+| **id** | Unique integer id representing the deployment |
+| **slug** | Lowercase string id representing the deployment to query the API |
+| **name** | Name of the security organization connected to the deployment |
+
+#### Relationships
+
+- A SemgrepDeployment contains SemgrepSCAFindings
+
+    ```
+    (SemgrepDeployment)-[RESOURCE]->(SemgrepSCAFinding)
+    ```
+
+- A SemgrepDeployment contains SemgrepSCALocations
+
+    ```
+    (SemgrepDeployment)-[RESOURCE]->(SemgrepSCALocation)
+    ```
+
+### SemgrepSCAFinding
+
+Represents a [Semgrep Supply Chain](https://semgrep.dev/docs/semgrep-supply-chain/overview/) finding. That is, a vulnerability in a dependency of a project discovered by Semgrep performing software composition analysis (SCA) and code reachability analysis. Before ingesting this node, make sure you have run Semgrep CI and that it's connected to Semgrep Cloud Platform: [Running Semgrep CI with Semgrep Cloud Platform](https://semgrep.dev/docs/semgrep-ci/running-semgrep-ci-with-semgrep-cloud-platform/).
+
+| Field | Description |
+|-------|--------------|
+| firstseen | Timestamp of when a sync job first discovered this node |
+| lastupdated | Timestamp of the last time the node was updated |
+| **id** | A composed id based on the repository path and the rule that triggered the finding |
+| **rule_id** | The rule that triggered the finding |
+| **repository** | The repository path where the finding was discovered |
+| summary | A short title summarizing the finding |
+| description | Description of the vulnerability |
+| package_manager | The ecosystem of the dependency where the finding was discovered (e.g. pypi, npm, maven) |
+| severity | Severity of the finding (e.g. CRITICAL, HIGH, MEDIUM, LOW) |
+| cve_id | CVE id of the vulnerability from NVD. Check [cve_schema](../cve/schema.md) |
+| reachability_check | Whether the vulnerability's reachability is confirmed, not confirmed, or needs to be manually confirmed |
+| reachability_condition | Description of the reachability condition (e.g. reachable if code is used in X way) |
+| reachability | Whether the vulnerability is reachable or not |
+| transitivity | Whether the vulnerable dependency is direct or transitive (e.g. DIRECT, TRANSITIVE) |
+| dependency | Dependency where the finding was discovered. Includes dependency name and version |
+| dependency_fix | Dependency version that fixes the vulnerability |
+| ref_urls | List of reference urls for the finding |
+| dependency_file | Path of the file where the finding was discovered (e.g. lock.json, requirements.txt) |
+| dependency_file_url | URL of the file where the finding was discovered |
+| scan_time | Date and time when the finding was discovered, in UTC |
+
+
+#### Relationships
+
+- A SemgrepSCAFinding is connected to a GitHubRepository (optional)
+
+    ```
+    (SemgrepSCAFinding)-[FOUND_IN]->(GitHubRepository)
+    ```
+
+- A SemgrepSCAFinding has a vulnerable dependency usage at a SemgrepSCALocation (optional)
+
+    ```
+    (SemgrepSCAFinding)-[USAGE_AT]->(SemgrepSCALocation)
+    ```
+
+
+### SemgrepSCALocation
+
+Represents the location in a repository where a vulnerable dependency is used in a way that can trigger the vulnerability.
+
+| Field | Description |
+|-------|--------------|
+| firstseen | Timestamp of when a sync job first discovered this node |
+| lastupdated | Timestamp of the last time the node was updated |
+| **id** | Unique id identifying the location of the finding |
+| path | Path of the file where the usage was discovered |
+| start_line | Line where the usage starts |
+| start_col | Column where the usage starts |
+| end_line | Line where the usage ends |
+| end_col | Column where the usage ends |
+| url | URL of the file where the usage was discovered |
diff --git a/docs/schema/README.md b/docs/schema/README.md
index 8b3edbca7f..60e6c96542 100644
--- a/docs/schema/README.md
+++ b/docs/schema/README.md
@@ -59,5 +59,8 @@
 ## SyncMetadata
 - Click [here](syncmetadata.md)

+## Semgrep
+- Click [here](https://lyft.github.io/cartography/modules/semgrep/schema.html)
+
 ## More to come! 
👍 diff --git a/tests/data/semgrep/__init__.py b/tests/data/semgrep/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/semgrep/sca.py b/tests/data/semgrep/sca.py new file mode 100644 index 0000000000..bda27f1f22 --- /dev/null +++ b/tests/data/semgrep/sca.py @@ -0,0 +1,117 @@ +DEPLOYMENTS = { + "id": "123456", + "name": "YourOrg", + "slug": "yourorg", +} + +SCA_RESPONSE = { + "vulns": [ + { + "title": "Reachable vuln", + "advisory": { + "ruleId": "ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "title": "Reachable vuln", + "description": "description", + "ecosystem": "go", + "severity": "HIGH", + "references": { + "cveIds": ["CVE-2023-37897"], + "cweIds": ["CWE-617: Reachable Assertion"], + "owaspIds": ["A06:2021 - Vulnerable and Outdated Components"], + "urls": [ + "https://github.com/advisories//GHSA-9436-3gmp-4f53", + "https://nvd.nist.gov/vuln/detail/CVE-2023-37897", + ], + }, + "announcedAt": "2023-07-19T21:15:08Z", + "ruleText": '{\n "id": "ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590",\n "languages": [\n "python",\n "java",\n "ruby"\n ],\n "message": "message ",\n }', # noqa E501 + "reachability": "MANUAL_REVIEW_REACHABLE", + "vulnerableDependencies": [ + {"name": "grav", "versionSpecifier": "< 1.7.42.1"}, + ], + "safeDependencies": [ + {"name": "grav", "versionSpecifier": "1.7.42.2"}, + ], + "reachableIf": "a non-administrator, user account that has Admin panel access and Create/Update page permissions", # noqa E501 + }, + "exposureType": "REACHABLE", + "repositoryId": "123456", + "matchedDependency": {"name": "grav", "versionSpecifier": "1.7.42.0"}, + "dependencyFileLocation": { + "path": "go.mod", + "startLine": "111", + "url": "https://github.com/yourorg/yourrepo/blame/71bbed12f950de8335006d7f91112263d8504f1b/go.mod#L111", + "startCol": "0", + "endLine": "0", + "endCol": "0", + }, + "usages": [ + { + "findingId": "20128504", + "location": { + "path": "src/packages/directory/file1.go", + "startLine": "24", + "startCol": "57", + "endLine": "24", + "endCol": "78", + "url": "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file1.go.ts#L24", # noqa E501 + "committedAt": "1970-01-01T00:00:00Z", + }, + }, + { + "findingId": "20128505", + "location": { + "path": "src/packages/directory/file2.go", + "startLine": "24", + "startCol": "37", + "endLine": "24", + "endCol": "54", + "url": "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file2.go.ts#L24", # noqa E501 + "committedAt": "1970-01-01T00:00:00Z", + }, + }, + ], + "triage": { + "status": "NEW", + "dismissReason": "UNKNOWN_REASON", + "issueUrl": "", + "prUrl": "", + }, + "groupKey": "132465::::ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590::reachable", + "closestSafeDependency": {"name": "grav", "versionSpecifier": "1.7.42.2"}, + "repositoryName": "yourorg/yourrepo", + "openedAt": "2023-07-19T12:51:53Z", + "firstTriagedAt": "1970-01-01T00:00:00Z", + "transitivity": "DIRECT", + "subdirectory": "", + "packageManager": "no_package_manager", + }, + ], + "hasMore": True, + "cursor": "123456", +} + +RAW_VULNS = SCA_RESPONSE["vulns"] +VULN_ID = "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590" +USAGES = [ + { + "SCA_ID": VULN_ID, + "findingId": "20128504", + "path": "src/packages/directory/file1.go", + "startLine": "24", + "startCol": "57", + "endLine": "24", + "endCol": "78", + "url": 
"https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file1.go.ts#L24", # noqa E501 + }, + { + "SCA_ID": VULN_ID, + "findingId": "20128505", + "path": "src/packages/directory/file2.go", + "startLine": "24", + "startCol": "37", + "endLine": "24", + "endCol": "54", + "url": "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file2.go.ts#L24", # noqa E501 + }, +] diff --git a/tests/integration/cartography/intel/semgrep/__init__.py b/tests/integration/cartography/intel/semgrep/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/cartography/intel/semgrep/test_findings.py b/tests/integration/cartography/intel/semgrep/test_findings.py new file mode 100644 index 0000000000..2a00783949 --- /dev/null +++ b/tests/integration/cartography/intel/semgrep/test_findings.py @@ -0,0 +1,240 @@ +from string import Template +from typing import List +from unittest.mock import patch + +import neo4j + +import cartography.intel.semgrep.findings +import tests.data.semgrep.sca +from cartography.intel.semgrep.findings import sync +from tests.integration.util import check_nodes +from tests.integration.util import check_rels + +TEST_REPO_ID = "https://github.com/yourorg/yourrepo" +TEST_REPO_FULL_NAME = "yourorg/yourrepo" +TEST_REPO_NAME = "yourrepo" +TEST_UPDATE_TAG = 123456789 + + +def _check_nodes_as_list(neo4j_session: neo4j.Session, node_label: str, attrs: List[str]): + """ + Like tests.integration.util.check_nodes()` but returns a list instead of a set. + """ + if not attrs: + raise ValueError("`attrs` passed to check_nodes() must have at least one element.") + + attrs = ", ".join(f"n.{attr}" for attr in attrs) + query_template = Template("MATCH (n:$NodeLabel) RETURN $Attrs") + result = neo4j_session.run( + query_template.safe_substitute(NodeLabel=node_label, Attrs=attrs), + ) + return sum([row.values() for row in result], []) + + +def _create_github_repos(neo4j_session): + # Creates a set of GitHub repositories in the graph + neo4j_session.run( + """ + MERGE (repo:GitHubRepository{id: $repo_id, fullname: $repo_fullname, name: $repo_name}) + ON CREATE SET repo.firstseen = timestamp() + SET repo.lastupdated = $update_tag + """, + repo_id=TEST_REPO_ID, + repo_fullname=TEST_REPO_FULL_NAME, + update_tag=TEST_UPDATE_TAG, + repo_name=TEST_REPO_NAME, + ) + + +@patch.object( + cartography.intel.semgrep.findings, + "get_deployment", + return_value=tests.data.semgrep.sca.DEPLOYMENTS, +) +@patch.object( + cartography.intel.semgrep.findings, + "get_sca_vulns", + return_value=tests.data.semgrep.sca.RAW_VULNS, +) +def test_sync(mock_get_sca_vulns, mock_get_deployment, neo4j_session): + # Arrange + _create_github_repos(neo4j_session) + semgrep_app_token = "your_semgrep_app_token" + common_job_parameters = { + "UPDATE_TAG": TEST_UPDATE_TAG, + } + + # Act + sync(neo4j_session, semgrep_app_token, TEST_UPDATE_TAG, common_job_parameters) + + # Assert + expected_deployment_nodes = {("123456", "YourOrg", "yourorg")} + + assert ( + check_nodes( + neo4j_session, + "SemgrepDeployment", + ["id", "name", "slug"], + ) == + expected_deployment_nodes + ) + expected_sca_vuln_nodes = [ + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + TEST_UPDATE_TAG, + "yourorg/yourrepo", + "ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "Reachable vuln", + "description", + "go", + "HIGH", + "CVE-2023-37897", + "MANUAL_REVIEW_REACHABLE", + "REACHABLE", + "DIRECT", + "grav|1.7.42.0", + "grav|1.7.42.2", + 
"go.mod", + "https://github.com/yourorg/yourrepo/blame/71bbed12f950de8335006d7f91112263d8504f1b/go.mod#L111", + ["https://github.com/advisories//GHSA-9436-3gmp-4f53", "https://nvd.nist.gov/vuln/detail/CVE-2023-37897"], + "2023-07-19T12:51:53Z", + ] + assert ( + _check_nodes_as_list( + neo4j_session, + "SemgrepSCAFinding", + [ + "id", + "lastupdated", + "repository", + "rule_id", + "summary", + "description", + "package_manager", + "severity", + "cve_id", + "reachability_check", + "reachability", + "transitivity", + "dependency", + "dependency_fix", + "dependency_file", + "dependency_file_url", + "ref_urls", + "scan_time", + ], + ) == + expected_sca_vuln_nodes + ) + expected_sca_location_nodes = { + ( + "20128504", + "src/packages/directory/file1.go", + "24", + "57", + "24", + "78", + "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file1.go.ts#L24", # noqa E501 + ), + ( + "20128505", + "src/packages/directory/file2.go", + "24", + "37", + "24", + "54", + "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file2.go.ts#L24", # noqa E501 + ), + } + assert ( + check_nodes( + neo4j_session, + "SemgrepSCALocation", + [ + "id", + "path", + "start_line", + "start_col", + "end_line", + "end_col", + "url", + ], + ) == + expected_sca_location_nodes + ) + expected_findings_resource_relationships = { + ( + "123456", + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + ), + } + assert ( + check_rels( + neo4j_session, + "SemgrepDeployment", + "id", + "SemgrepSCAFinding", + "id", + "RESOURCE", + ) == + expected_findings_resource_relationships + ) + expected_locations_resource_relationships = { + ( + "123456", + "20128504", + ), + ( + "123456", + "20128505", + ), + } + assert ( + check_rels( + neo4j_session, + "SemgrepDeployment", + "id", + "SemgrepSCALocation", + "id", + "RESOURCE", + ) == + expected_locations_resource_relationships + ) + expected_found_in_relationships = { + ( + "yourorg/yourrepo", + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + ), + } + assert ( + check_rels( + neo4j_session, + "GitHubRepository", + "fullname", + "SemgrepSCAFinding", + "id", + "FOUND_IN", + rel_direction_right=False, + ) == + expected_found_in_relationships + ) + expected_location_relationships = { + ( + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "20128504", + ), + ( + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "20128505", + ), + } + assert ( + check_rels( + neo4j_session, + "SemgrepSCAFinding", + "id", + "SemgrepSCALocation", + "id", + "USAGE_AT", + ) == + expected_location_relationships + ) From 875df3a23ec5d7a50bfd2b1ce36d02ac7fa7f400 Mon Sep 17 00:00:00 2001 From: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> Date: Fri, 4 Aug 2023 12:24:39 -0400 Subject: [PATCH 14/18] 0.83.0 (#1228) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bbfdb79531..28e4125117 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages from setuptools import setup -__version__ = '0.82.0' +__version__ = '0.83.0' setup( From d4c450e307ba9579e18b1a6ac09f0436ec0d402a Mon Sep 17 00:00:00 2001 From: Andrew Womeldorf Date: Mon, 14 Aug 2023 16:42:35 -0500 Subject: [PATCH 15/18] Refactor EC2 Volume to allow optional properties. (#1231) The existing EC2 Volume transformation assumes all properties will be present. 
See [EC2 API Reference for Volumes](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_Volume.html) to see that all properties on the Volume are optional. --- cartography/intel/aws/ec2/volumes.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py index 3ad50f1186..6b8610d3cc 100644 --- a/cartography/intel/aws/ec2/volumes.py +++ b/cartography/intel/aws/ec2/volumes.py @@ -36,19 +36,19 @@ def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_ac volume_id = volume['VolumeId'] raw_vol = ({ 'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region), - 'AvailabilityZone': volume['AvailabilityZone'], - 'CreateTime': volume['CreateTime'], - 'Encrypted': volume['Encrypted'], - 'Size': volume['Size'], - 'State': volume['State'], - 'OutpostArn': volume['OutpostArn'], - 'SnapshotId': volume['SnapshotId'], - 'Iops': volume['Iops'], - 'FastRestored': volume['FastRestored'], - 'MultiAttachEnabled': volume['MultiAttachEnabled'], - 'VolumeType': volume['VolumeType'], + 'AvailabilityZone': volume.get('AvailabilityZone'), + 'CreateTime': volume.get('CreateTime'), + 'Encrypted': volume.get('Encrypted'), + 'Size': volume.get('Size'), + 'State': volume.get('State'), + 'OutpostArn': volume.get('OutpostArn'), + 'SnapshotId': volume.get('SnapshotId'), + 'Iops': volume.get('Iops'), + 'FastRestored': volume.get('FastRestored'), + 'MultiAttachEnabled': volume.get('MultiAttachEnabled'), + 'VolumeType': volume.get('VolumeType'), 'VolumeId': volume_id, - 'KmsKeyId': volume['KmsKeyId'], + 'KmsKeyId': volume.get('KmsKeyId'), }) if not active_attachments: From 07ee314bee7868e779cfb1b891d6ee7e860267a0 Mon Sep 17 00:00:00 2001 From: Surendra Pathak Date: Mon, 14 Aug 2023 14:57:31 -0700 Subject: [PATCH 16/18] Added ZeusCloud as list of companies using cartography (#1230) Details: https://github.com/Zeus-Labs/ZeusCloud --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 11208559fb..ad5242bb80 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ and follow the instructions to sign the CLA. 1. [Thought Machine](https://thoughtmachine.net/) 1. [MessageBird](https://messagebird.com) 1. [Cloudanix](https://www.cloudanix.com/) +1. [ZeusCloud](https://www.zeuscloud.io/) 1. {Your company here} :-) If your organization uses Cartography, please file a PR and update this list. Say hi on Slack too! From af0b60fde29bb0e0923fa609e33286bb9a889946 Mon Sep 17 00:00:00 2001 From: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> Date: Tue, 22 Aug 2023 18:14:10 -0400 Subject: [PATCH 17/18] #1206: add async to aws:s3 and aws:ecr (#1192) Add some helper functions for refactoring existing modules to be more async and use them for aws:s3 and aws:ecr. In a test on one of our larger aws accounts, we've seen ~90% reduction in sync time for s3 and ecr. - `to_async` wraps a regular synchronous function so that it can be used within async functions in a non-blocking fashion - `so_sync` 1. takes a series are [Awaitables](https://docs.python.org/3.8/library/asyncio-task.html#asyncio-awaitables) (e.g., container objects returned when invoking async functions) 2. schedules them all to be run simultaneously 3. blocks until they are all finished 4. returns all the results in the same order There is a caveat when running these functions from a Jupyter notebook. 
There is a caveat when running these functions from a Jupyter notebook.
You must apply a workaround, but hopefully future versions of asyncio will
not need it:

```
# import nest_asyncio
# nest_asyncio.apply()
```

### Testing

Existing unit tests mostly cover these changes, but I also did some
manual testing against a real AWS account.
---
 cartography/intel/aws/ecr.py | 26 ++++-
 cartography/intel/aws/s3.py | 30 ++++--
 cartography/util.py | 96 +++++++++++++++++++
 .../aws/test_permission_relationships.py | 8 +-
 4 files changed, 146 insertions(+), 14 deletions(-)

diff --git a/cartography/intel/aws/ecr.py b/cartography/intel/aws/ecr.py
index 0b5dbb12e2..4569024d52 100644
--- a/cartography/intel/aws/ecr.py
+++ b/cartography/intel/aws/ecr.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Any
 from typing import Dict
 from typing import List
 
@@ -9,6 +10,8 @@
 from cartography.util import batch
 from cartography.util import run_cleanup_job
 from cartography.util import timeit
+from cartography.util import to_asynchronous
+from cartography.util import to_synchronous
 
 logger = logging.getLogger(__name__)
 
@@ -139,6 +142,25 @@ def cleanup(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None:
     run_cleanup_job('aws_import_ecr_cleanup.json', neo4j_session, common_job_parameters)
 
 
+def _get_image_data(
+    boto3_session: boto3.session.Session,
+    region: str,
+    repositories: List[Dict[str, Any]],
+) -> Dict[str, Any]:
+    '''
+    Given a list of repositories, get the image data for each repository,
+    return as a mapping from repositoryUri to image object
+    '''
+    image_data = {}
+
+    async def async_get_images(repo: Dict[str, Any]) -> None:
+        repo_image_obj = await to_asynchronous(get_ecr_repository_images, boto3_session, region, repo['repositoryName'])
+        image_data[repo['repositoryUri']] = repo_image_obj
+    to_synchronous(*[async_get_images(repo) for repo in repositories])
+
+    return image_data
+
+
 @timeit
 def sync(
     neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], current_aws_account_id: str,
@@ -148,9 +170,7 @@ def sync(
         logger.info("Syncing ECR for region '%s' in account '%s'.", region, current_aws_account_id)
         image_data = {}
         repositories = get_ecr_repositories(boto3_session, region)
-        for repo in repositories:
-            repo_image_obj = get_ecr_repository_images(boto3_session, region, repo['repositoryName'])
-            image_data[repo['repositoryUri']] = repo_image_obj
+        image_data = _get_image_data(boto3_session, region, repositories)
         load_ecr_repositories(neo4j_session, repositories, region, current_aws_account_id, update_tag)
         repo_images_list = transform_ecr_repository_images(image_data)
         load_ecr_repository_images(neo4j_session, repo_images_list, region, update_tag)
diff --git a/cartography/intel/aws/s3.py b/cartography/intel/aws/s3.py
index 6524bb4c44..6db11cc9fd 100644
--- a/cartography/intel/aws/s3.py
+++ b/cartography/intel/aws/s3.py
@@ -1,3 +1,4 @@
+import asyncio
 import hashlib
 import json
 import logging
@@ -20,6 +21,8 @@
 from cartography.util import run_analysis_job
 from cartography.util import run_cleanup_job
 from cartography.util import timeit
+from cartography.util import to_asynchronous
+from cartography.util import to_synchronous
 
 logger = logging.getLogger(__name__)
 stat_handler = get_stats_client(__name__)
@@ -55,7 +58,9 @@ def get_s3_bucket_details(
     # a local store for s3 clients so that we may re-use clients for an AWS region
     s3_regional_clients: Dict[Any, Any] = {}
 
-    for bucket in bucket_data['Buckets']:
+    BucketDetail = Tuple[str, Dict[str, Any], Dict[str, Any], Dict[str, Any], Dict[str, Any], Dict[str, Any]]
+
+    async def _get_bucket_detail(bucket: Dict[str, Any]) -> BucketDetail:
         # Note: bucket['Region'] is sometimes None because
         # client.get_bucket_location() does not return a location constraint for buckets
         # in us-east-1 region
@@ -63,12 +68,23 @@
         if not client:
             client = boto3_session.client('s3', bucket['Region'])
             s3_regional_clients[bucket['Region']] = client
-        acl = get_acl(bucket, client)
-        policy = get_policy(bucket, client)
-        encryption = get_encryption(bucket, client)
-        versioning = get_versioning(bucket, client)
-        public_access_block = get_public_access_block(bucket, client)
-        yield bucket['Name'], acl, policy, encryption, versioning, public_access_block
+        (
+            acl,
+            policy,
+            encryption,
+            versioning,
+            public_access_block,
+        ) = await asyncio.gather(
+            to_asynchronous(get_acl, bucket, client),
+            to_asynchronous(get_policy, bucket, client),
+            to_asynchronous(get_encryption, bucket, client),
+            to_asynchronous(get_versioning, bucket, client),
+            to_asynchronous(get_public_access_block, bucket, client),
+        )
+        return bucket['Name'], acl, policy, encryption, versioning, public_access_block
+
+    bucket_details = to_synchronous(*[_get_bucket_detail(bucket) for bucket in bucket_data['Buckets']])
+    yield from bucket_details
 
 
 @timeit
diff --git a/cartography/util.py b/cartography/util.py
index 5d26157643..30b9bccd80 100644
--- a/cartography/util.py
+++ b/cartography/util.py
@@ -1,9 +1,12 @@
+import asyncio
 import logging
 import re
 import sys
+from functools import partial
 from functools import wraps
 from string import Template
 from typing import Any
+from typing import Awaitable
 from typing import BinaryIO
 from typing import Callable
 from typing import cast
@@ -25,6 +28,7 @@
 from cartography.stats import get_stats_client
 from cartography.stats import ScopedStatsClient
 
+
 if sys.version_info >= (3, 7):
     from importlib.resources import open_binary, read_text
 else:
@@ -141,6 +145,7 @@ def load_resource_binary(package: str, resource_name: str) -> BinaryIO:
     return open_binary(package, resource_name)
 
 
+R = TypeVar('R')
 F = TypeVar('F', bound=Callable[..., Any])
 
 
@@ -297,3 +302,94 @@ def batch(items: Iterable, size: int = DEFAULT_BATCH_SIZE) -> List[List]:
         items[i: i + size]
         for i in range(0, len(items), size)
     ]
+
+
+def is_throttling_exception(exc: Exception) -> bool:
+    '''
+    Returns True if the exception is caused by a client library's throttling mechanism
+    '''
+    # https://boto3.amazonaws.com/v1/documentation/api/1.19.9/guide/error-handling.html
+    if isinstance(exc, botocore.exceptions.ClientError):
+        if exc.response['Error']['Code'] in ['LimitExceededException', 'Throttling']:
+            return True
+    # add other exceptions here, if needed, like:
+    # https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout#configuring-retries
+    # if isinstance(exc, google.api_core.exceptions.TooManyRequests):
+    #     return True
+    return False
+
+
+def to_asynchronous(func: Callable[..., R], *args: Any, **kwargs: Any) -> Awaitable[R]:
+    '''
+    Returns a Future that will run a function and its arguments in the default threadpool.
+    Helper until we start using Python 3.9's asyncio.to_thread.
+
+    Calls are also wrapped within a backoff decorator to handle throttling errors.
+ + :param func: the function to be wrapped by the Future + :param args: a series of arguments to be passed into func + :param kwargs: a series of keyword arguments to be passed into func + + example: + def my_func(arg1, arg2, kwarg1): + return arg1 + arg2 + kwarg1 + + # normal synchronous call: + result = my_func(1, 2, kwarg1=3) + + # asynchronous call: + future = to_asynchronous(my_func, 1, 2, kwarg1=3) + + # the result is stored in the future, and can be retrieved + # from within another async function with: + await future + + # or from within a synchronous function with our helper: + to_synchronous(future) + + NOTE: to use this in a Jupyter notebook, you need to do: + # import nest_asyncio + # nest_asyncio.apply() + ''' + CartographyThrottlingException = type('CartographyThrottlingException', (Exception,), {}) + + @wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> R: + try: + return func(*args, **kwargs) + except Exception as exc: + if is_throttling_exception(exc): + raise CartographyThrottlingException from exc + raise + + # don't use @backoff as decorator, to preserve typing + wrapped = backoff.on_exception(backoff.expo, CartographyThrottlingException)(wrapper) + call = partial(wrapped, *args, **kwargs) + return asyncio.get_event_loop().run_in_executor(None, call) + + +def to_synchronous(*awaitables: Awaitable[Any]) -> List[Any]: + ''' + Synchronously waits for the Awaitable(s) to complete and returns their result(s). + See https://docs.python.org/3.8/library/asyncio-task.html#asyncio-awaitables + + :param awaitables: a series of Awaitable objects, with each object being its own argument. + i.e., not a single list of Awaitables + + example: + async def my_async_func(my_arg): + return my_arg + + async def another_async_func(my_arg2): + return my_arg2 + + remember that an invocation of an async function returns a Future (Awaitable), + which needs to be awaited to get the result. 
You cannot await a Future from within
+    a non-async function, so you could use this helper to get the result from a Future
+
+        future_1 = my_async_func(1)
+        future_2 = another_async_func(2)
+
+        results = to_synchronous(future_1, future_2)
+    '''
+    return asyncio.get_event_loop().run_until_complete(asyncio.gather(*awaitables))
diff --git a/tests/unit/cartography/intel/aws/test_permission_relationships.py b/tests/unit/cartography/intel/aws/test_permission_relationships.py
index b19604256a..bc85bdd0e7 100644
--- a/tests/unit/cartography/intel/aws/test_permission_relationships.py
+++ b/tests/unit/cartography/intel/aws/test_permission_relationships.py
@@ -36,7 +36,7 @@ def test_not_action_statement():
             "action": [
                 "*",
             ],
-            "notaction":[
+            "notaction": [
                 "S3:GetObject",
             ],
             "resource": [
@@ -209,7 +209,7 @@ def test_non_matching_notresource():
             "action": [
                 "s3:Get*",
             ],
-            "resource":["*"],
+            "resource": ["*"],
             "notresource": [
                 "arn:aws:s3:::nottest",
             ],
@@ -417,7 +417,7 @@ def test_single_comma():
             "action": [
                 "s3:?et*",
             ],
-            "resource":["arn:aws:s3:::testbucke?"],
+            "resource": ["arn:aws:s3:::testbucke?"],
             "effect": "Allow",
         },
     ]
@@ -432,7 +432,7 @@ def test_multiple_comma():
             "action": [
                 "s3:?et*",
             ],
-            "resource":["arn:aws:s3:::????bucket"],
+            "resource": ["arn:aws:s3:::????bucket"],
             "effect": "Allow",
         },
     ]

From ab04b205ee942700e352e5c8911d8a596c8c38d2 Mon Sep 17 00:00:00 2001
From: Jeremy Chapeau <113923302+resilience-jychp@users.noreply.github.com>
Date: Thu, 24 Aug 2023 00:17:01 +0200
Subject: [PATCH 18/18] fix: GSuite data only populated on match (#1233)

In the GSuite intel module, user and group properties are only populated on
nodes that already exist (ON MATCH); on a first run, only the ID is
populated. This PR fixes this issue. A short illustration of the MERGE
semantics involved follows the api.py diff below.
---
 cartography/intel/gsuite/api.py | 14 +++++++-------
 tests/unit/cartography/intel/gsuite/test_api.py | 10 +++++-----
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/cartography/intel/gsuite/api.py b/cartography/intel/gsuite/api.py
index 62d9cdfdab..92ed829e5e 100644
--- a/cartography/intel/gsuite/api.py
+++ b/cartography/intel/gsuite/api.py
@@ -130,9 +130,9 @@ def load_gsuite_groups(neo4j_session: neo4j.Session, groups: List[Dict], gsuite_
     UNWIND $GroupData as group
     MERGE (g:GSuiteGroup{id: group.id})
     ON CREATE SET
-        g.firstseen = $UpdateTag
-    ON MATCH SET
-        g.group_id = group.id,
+        g.firstseen = $UpdateTag,
+        g.group_id = group.id
+    SET
         g.admin_created = group.adminCreated,
         g.description = group.description,
         g.direct_members_count = group.directMembersCount,
@@ -152,9 +152,9 @@ def load_gsuite_users(neo4j_session: neo4j.Session, users: List[Dict], gsuite_up
     UNWIND $UserData as user
     MERGE (u:GSuiteUser{id: user.id})
     ON CREATE SET
-        u.firstseen = $UpdateTag
-    ON MATCH SET
         u.user_id = user.id,
+        u.firstseen = $UpdateTag
+    SET
         u.agreed_to_terms = user.agreedToTerms,
         u.archived = user.archived,
         u.change_password_at_next_login = user.changePasswordAtNextLogin,
@@ -193,7 +193,7 @@ def load_gsuite_members(neo4j_session: neo4j.Session, group: Dict, members: List
     MERGE (user)-[r:MEMBER_GSUITE_GROUP]->(group)
     ON CREATE SET
     r.firstseen = $UpdateTag
-    ON MATCH SET
+    SET
     r.lastupdated = $UpdateTag
     """
     neo4j_session.run(
@@ -208,7 +208,7 @@ def load_gsuite_members(neo4j_session: neo4j.Session, group: Dict, members: List
     MERGE (group_1)-[r:MEMBER_GSUITE_GROUP]->(group_2)
     ON CREATE SET
     r.firstseen = $UpdateTag
-    ON MATCH SET
+    SET
     r.lastupdated = $UpdateTag
     """
     neo4j_session.run(membership_qry, MemberData=members, GroupID=group.get("id"), UpdateTag=gsuite_update_tag)
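To illustrate the MERGE semantics behind this fix: `ON MATCH SET` runs only
when MERGE matched a pre-existing node, while a bare `SET` runs on both the
create and the match path. A hypothetical minimal sketch (not code from this
patch), written the way this codebase embeds Cypher in Python strings:

```
buggy_query = """
MERGE (u:GSuiteUser{id: $id})
ON CREATE SET u.firstseen = $UpdateTag  // runs only when the node is created
ON MATCH SET u.name = $name             // runs only when the node already existed,
                                        // so a first run leaves u.name unset
"""

fixed_query = """
MERGE (u:GSuiteUser{id: $id})
ON CREATE SET u.firstseen = $UpdateTag  // still create-only
SET u.name = $name                      // bare SET runs on create and on match
"""
```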
diff --git a/tests/unit/cartography/intel/gsuite/test_api.py b/tests/unit/cartography/intel/gsuite/test_api.py
index 04950555a1..3e4a3b7546 100644
--- a/tests/unit/cartography/intel/gsuite/test_api.py
+++ b/tests/unit/cartography/intel/gsuite/test_api.py
@@ -107,9 +107,9 @@ def test_load_gsuite_groups():
     UNWIND $GroupData as group
     MERGE (g:GSuiteGroup{id: group.id})
     ON CREATE SET
-        g.firstseen = $UpdateTag
-    ON MATCH SET
-        g.group_id = group.id,
+        g.firstseen = $UpdateTag,
+        g.group_id = group.id
+    SET
         g.admin_created = group.adminCreated,
         g.description = group.description,
         g.direct_members_count = group.directMembersCount,
@@ -135,9 +135,9 @@ def test_load_gsuite_users():
     UNWIND $UserData as user
     MERGE (u:GSuiteUser{id: user.id})
     ON CREATE SET
-        u.firstseen = $UpdateTag
-    ON MATCH SET
         u.user_id = user.id,
+        u.firstseen = $UpdateTag
+    SET
         u.agreed_to_terms = user.agreedToTerms,
         u.archived = user.archived,
         u.change_password_at_next_login = user.changePasswordAtNextLogin,