diff --git a/README.md b/README.md index 62ae48bbdf..ad5242bb80 100644 --- a/README.md +++ b/README.md @@ -39,16 +39,13 @@ Start [here](https://lyft.github.io/cartography/install.html). ## Usage Start with our [tutorial](https://lyft.github.io/cartography/usage/tutorial.html). Our [data schema](https://lyft.github.io/cartography/usage/schema.html) is a helpful reference when you get stuck. -## Contact +## Community - Join us on `#cartography` on the [Lyft OSS Slack](https://join.slack.com/t/lyftoss/shared_invite/enQtOTYzODg5OTQwNDE2LTFiYjgwZWM3NTNhMTFkZjc4Y2IxOTI4NTdiNTdhNjQ4M2Q5NTIzMjVjOWI4NmVlNjRiZmU2YzA5NTc3MmFjYTQ). - -## Community Meeting - -Talk to us and see what we're working on at our [monthly community meeting](https://calendar.google.com/calendar/embed?src=lyft.com_p10o6ceuiieq9sqcn1ef61v1io%40group.calendar.google.com&ctz=America%2FLos_Angeles). -- Meeting minutes are [here](https://docs.google.com/document/d/1VyRKmB0dpX185I15BmNJZpfAJ_Ooobwz0U1WIhjDxvw). -- Recorded videos are posted [here](https://www.youtube.com/playlist?list=PLMga2YJvAGzidUWJB_fnG7EHI4wsDDsE1). -- Our current project road map is [here](https://docs.google.com/document/d/18MOsGI-isFvag1fGk718Aht7wQPueWd4SqOI9KapBa8/edit#heading=h.15nsmgmjaaml). +- Talk to us and see what we're working on at our [monthly community meeting](https://calendar.google.com/calendar/embed?src=lyft.com_p10o6ceuiieq9sqcn1ef61v1io%40group.calendar.google.com&ctz=America%2FLos_Angeles). + - Meeting minutes are [here](https://docs.google.com/document/d/1VyRKmB0dpX185I15BmNJZpfAJ_Ooobwz0U1WIhjDxvw). + - Recorded videos are posted [here](https://www.youtube.com/playlist?list=PLMga2YJvAGzidUWJB_fnG7EHI4wsDDsE1). +- Our current project roadmap is [here](https://github.com/orgs/lyft/projects/26/views/1). ## Contributing Thank you for considering contributing to Cartography! @@ -57,9 +54,12 @@ Thank you for considering contributing to Cartography! Legal stuff: This project is governed by [Lyft's code of conduct](https://github.com/lyft/code-of-conduct). All contributors and participants agree to abide by its terms. +### Bug reports, feature requests, and discussions +Submit a GitHub issue to report a bug or request a new feature. If we decide that the issue needs more discussion - usually because the scope is too large or we need to make a careful decision - we will convert the issue to a [GitHub Discussion](https://github.com/lyft/cartography/discussions). + ### Developing Cartography -Get started with our [developer documentation](https://lyft.github.io/cartography/dev/developer-guide.html). +Get started with our [developer documentation](https://lyft.github.io/cartography/dev/developer-guide.html). Please feel free to submit your own PRs to update documentation if you've found a better way to explain something. #### Sign the Contributor License Agreement (CLA) @@ -73,6 +73,7 @@ and follow the instructions to sign the CLA. 1. [Thought Machine](https://thoughtmachine.net/) 1. [MessageBird](https://messagebird.com) 1. [Cloudanix](https://www.cloudanix.com/) +1. [ZeusCloud](https://www.zeuscloud.io/) 1. {Your company here} :-) If your organization uses Cartography, please file a PR and update this list. Say hi on Slack too! 
diff --git a/cartography/cli.py b/cartography/cli.py index 54525bc770..e3a4d2ea36 100644 --- a/cartography/cli.py +++ b/cartography/cli.py @@ -500,6 +500,15 @@ def _build_parser(self): 'The Duo api hostname' ), ) + parser.add_argument( + '--semgrep-app-token-env-var', + type=str, + default=None, + help=( + 'The name of the environment variable containing the Semgrep app token. ' + 'Required if you are using the Semgrep intel module. Ignored otherwise.' + ), + ) parser.add_argument( '--gandi-apikey-env-var', type=str, @@ -646,7 +655,7 @@ def main(self, argv: str) -> int: logger.debug(f"Reading config string for GSuite from environment variable {config.gsuite_tokens_env_var}") config.gsuite_config = os.environ.get(config.gsuite_tokens_env_var) else: - config.github_config = None + config.gsuite_config = None # Lastpass config if config.lastpass_cid_env_var: @@ -673,6 +682,16 @@ def main(self, argv: str) -> int: ) config.duo_api_key = os.environ.get(config.duo_api_key_env_var) config.duo_api_secret = os.environ.get(config.duo_api_secret_env_var) + else: + config.duo_api_key = None + config.duo_api_secret = None + + # Semgrep config + if config.semgrep_app_token_env_var: + logger.debug(f"Reading Semgrep App Token from environment variable {config.semgrep_app_token_env_var}") + config.semgrep_app_token = os.environ.get(config.semgrep_app_token_env_var) + else: + config.semgrep_app_token = None # Gandi config if config.gandi_apikey_env_var: diff --git a/cartography/config.py b/cartography/config.py index 54d53389b3..5d45447199 100644 --- a/cartography/config.py +++ b/cartography/config.py @@ -103,6 +103,8 @@ class Config: :param duo_api_key: The Duo api secret. Optional. :type duo_api_hostname: str :param duo_api_hostname: The Duo api hostname, e.g. "api-abc123.duosecurity.com". Optional. + :type semgrep_app_token: str + :param semgrep_app_token: The Semgrep api token. Optional. :type: gandi_apikey: str :param gandi_apikey: API authentication key for Gandi. Optional. 
""" @@ -159,6 +161,7 @@ def __init__( duo_api_key=None, duo_api_secret=None, duo_api_hostname=None, + semgrep_app_token=None, gandi_apikey=None, ): self.neo4j_uri = neo4j_uri @@ -211,4 +214,5 @@ def __init__( self.duo_api_key = duo_api_key self.duo_api_secret = duo_api_secret self.duo_api_hostname = duo_api_hostname + self.semgrep_app_token = semgrep_app_token self.gandi_apikey = gandi_apikey diff --git a/cartography/data/indexes.cypher b/cartography/data/indexes.cypher index 072599f719..085f84e270 100644 --- a/cartography/data/indexes.cypher +++ b/cartography/data/indexes.cypher @@ -321,10 +321,6 @@ CREATE INDEX IF NOT EXISTS FOR (n:SpotlightVulnerability) ON (n.host_info_local_ CREATE INDEX IF NOT EXISTS FOR (n:SpotlightVulnerability) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:SQSQueue) ON (n.id); CREATE INDEX IF NOT EXISTS FOR (n:SQSQueue) ON (n.lastupdated); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstanceInformation) ON (n.id); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstanceInformation) ON (n.lastupdated); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstancePatch) ON (n.id); -CREATE INDEX IF NOT EXISTS FOR (n:SSMInstancePatch) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:User) ON (n.arn); CREATE INDEX IF NOT EXISTS FOR (n:User) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:AzureTenant) ON (n.id); diff --git a/cartography/data/jobs/cleanup/aws_import_ssm_cleanup.json b/cartography/data/jobs/cleanup/aws_import_ssm_cleanup.json deleted file mode 100644 index 7f77a3c84d..0000000000 --- a/cartography/data/jobs/cleanup/aws_import_ssm_cleanup.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "statements": [ - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(n:SSMInstanceInformation) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(:EC2Instance)-[r:HAS_INFORMATION]->(:SSMInstanceInformation) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(n:SSMInstancePatch) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:AWSAccount{id: $AWS_ID})-[:RESOURCE]->(:EC2Instance)-[r:HAS_PATCH]->(:SSMInstancePatch) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - } - ], - "name": "cleanup SSM" -} diff --git a/cartography/data/jobs/cleanup/github_repos_cleanup.json b/cartography/data/jobs/cleanup/github_repos_cleanup.json index f840f89c9f..0d5bc41112 100644 --- a/cartography/data/jobs/cleanup/github_repos_cleanup.json +++ b/cartography/data/jobs/cleanup/github_repos_cleanup.json @@ -38,6 +38,31 @@ "query": "MATCH (:GitHubRepository)-[r:REQUIRES]->(:PythonLibrary) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", "iterative": true, "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_ADMIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_MAINTAIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH 
(:GitHubUser)-[r:OUTSIDE_COLLAB_READ]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_TRIAGE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 + }, + { + "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_WRITE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", + "iterative": true, + "iterationsize": 100 }], "name": "cleanup GitHub repos data" } diff --git a/cartography/data/jobs/cleanup/github_users_cleanup.json b/cartography/data/jobs/cleanup/github_users_cleanup.json index a120d387fa..4419d8d650 100644 --- a/cartography/data/jobs/cleanup/github_users_cleanup.json +++ b/cartography/data/jobs/cleanup/github_users_cleanup.json @@ -14,31 +14,6 @@ "iterative": true, "iterationsize": 100 }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_ADMIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_MAINTAIN]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_READ]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_TRIAGE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_WRITE]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, { "query": "MATCH (:GitHubUser)-[r:MEMBER_OF]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", "iterative": true, diff --git a/cartography/driftdetect/config.py b/cartography/driftdetect/config.py index a8b2f4f625..0a32361e6b 100644 --- a/cartography/driftdetect/config.py +++ b/cartography/driftdetect/config.py @@ -1,3 +1,6 @@ +from typing import Optional + + class UpdateConfig: """ A common interface for the drift-detection update configuration. 
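The cleanup statements moved into github_repos_cleanup.json above all follow cartography's stale-data convention: any relationship whose lastupdated property no longer matches the current run's UPDATE_TAG is deleted, at most LIMIT_SIZE records per pass, until nothing stale remains. A minimal sketch of how one such statement could be driven with the neo4j Python driver; the helper name and loop are illustrative, not cartography's actual job runner:

import neo4j

def run_iterative_cleanup(session: neo4j.Session, query: str, update_tag: int, batch_size: int = 100) -> None:
    # Re-run the batched DELETE until a pass removes nothing more.
    while True:
        summary = session.run(query, UPDATE_TAG=update_tag, LIMIT_SIZE=batch_size).consume()
        counters = summary.counters
        if counters.nodes_deleted == 0 and counters.relationships_deleted == 0:
            break

# e.g. with one of the statements from github_repos_cleanup.json above:
stale_outside_collab = (
    'MATCH (:GitHubUser)-[r:OUTSIDE_COLLAB_ADMIN]->(:GitHubRepository) '
    'WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)'
)
# run_iterative_cleanup(session, stale_outside_collab, update_tag)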
@@ -18,10 +21,10 @@ class UpdateConfig: def __init__( self, - drift_detection_directory, - neo4j_uri, - neo4j_user=None, - neo4j_password=None, + drift_detection_directory: str, + neo4j_uri: str, + neo4j_user: Optional[str] = None, + neo4j_password: Optional[str] = None, ): self.neo4j_uri = neo4j_uri self.neo4j_user = neo4j_user @@ -46,13 +49,13 @@ class GetDriftConfig: def __init__( self, - query_directory, - start_state, - end_state, + query_directory: str, + start_state: str, + end_state: str, ): - self.query_directory = query_directory - self.start_state = start_state - self.end_state = end_state + self.query_directory: str = query_directory + self.start_state: str = start_state + self.end_state: str = end_state class AddShortcutConfig: @@ -72,10 +75,10 @@ class AddShortcutConfig: def __init__( self, - query_directory, - shortcut, - filename, + query_directory: str, + shortcut: str, + filename: str, ): - self.query_directory = query_directory - self.shortcut = shortcut - self.filename = filename + self.query_directory: str = query_directory + self.shortcut: str = shortcut + self.filename: str = filename diff --git a/cartography/driftdetect/detect_deviations.py b/cartography/driftdetect/detect_deviations.py index df0ad86cd4..8155760814 100644 --- a/cartography/driftdetect/detect_deviations.py +++ b/cartography/driftdetect/detect_deviations.py @@ -1,8 +1,12 @@ import logging import os +from typing import List +from typing import Union from marshmallow import ValidationError +from cartography.driftdetect.config import GetDriftConfig +from cartography.driftdetect.model import State from cartography.driftdetect.reporter import report_drift from cartography.driftdetect.serializers import ShortcutSchema from cartography.driftdetect.serializers import StateSchema @@ -12,7 +16,7 @@ logger = logging.getLogger(__name__) -def run_drift_detection(config): +def run_drift_detection(config: GetDriftConfig) -> None: try: if not valid_directory(config.query_directory): logger.error("Invalid Drift Detection Directory") @@ -59,7 +63,7 @@ def run_drift_detection(config): logger.exception(msg) -def perform_drift_detection(start_state, end_state): +def perform_drift_detection(start_state: State, end_state: State): """ Returns differences (additions and missing results) between two States. @@ -81,7 +85,7 @@ def perform_drift_detection(start_state, end_state): return new_results, missing_results -def compare_states(start_state, end_state): +def compare_states(start_state: State, end_state: State): """ Helper function for comparing differences between two States. 
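perform_drift_detection above pairs two saved states of the same query and returns the added and the missing result rows. A sketch of the call pattern with illustrative data (State is the real model class from cartography.driftdetect.model; the query names and rows here are made up):

from cartography.driftdetect.detect_deviations import perform_drift_detection
from cartography.driftdetect.model import State

start = State(
    name='admins',
    validation_query='MATCH (u:User{admin: true}) RETURN u.name',
    properties=['u.name'],
    results=[['alice'], ['bob']],
)
end = State(
    name='admins',
    validation_query='MATCH (u:User{admin: true}) RETURN u.name',
    properties=['u.name'],
    results=[['alice'], ['carol']],
)

new_results, missing_results = perform_drift_detection(start, end)
# new_results describes rows present only in `end` (carol);
# missing_results describes rows present only in `start` (bob).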
@@ -92,10 +96,12 @@ def compare_states(start_state, end_state): :return: list of tuples of differences between states in the form (dictionary, State object) """ differences = [] + # Use set for faster membership check + start_state_results = {tuple(res) for res in start_state.results} for result in end_state.results: - if result in start_state.results: + if tuple(result) in start_state_results: continue - drift = [] + drift: List[Union[str, List[str]]] = [] for field in result: value = field.split("|") if len(value) > 1: diff --git a/cartography/driftdetect/get_states.py b/cartography/driftdetect/get_states.py index ebb0713171..b7e0575618 100644 --- a/cartography/driftdetect/get_states.py +++ b/cartography/driftdetect/get_states.py @@ -1,12 +1,18 @@ import logging import os.path import time +from typing import Any +from typing import Dict +from typing import List import neo4j.exceptions from marshmallow import ValidationError from neo4j import GraphDatabase +from cartography.client.core.tx import read_list_of_dicts_tx from cartography.driftdetect.add_shortcut import add_shortcut +from cartography.driftdetect.config import UpdateConfig +from cartography.driftdetect.model import State from cartography.driftdetect.serializers import ShortcutSchema from cartography.driftdetect.serializers import StateSchema from cartography.driftdetect.storage import FileSystem @@ -15,7 +21,7 @@ logger = logging.getLogger(__name__) -def run_get_states(config): +def run_get_states(config: UpdateConfig) -> None: """ Handles neo4j errors and then updates detectors. @@ -90,7 +96,13 @@ def run_get_states(config): logger.exception(err) -def get_query_state(session, query_directory, state_serializer, storage, filename): +def get_query_state( + session: neo4j.Session, + query_directory: str, + state_serializer: StateSchema, + storage, + filename: str, +) -> State: """ Gets the most recent state of a query. @@ -115,7 +127,7 @@ def get_query_state(session, query_directory, state_serializer, storage, filenam return state -def get_state(session, state): +def get_state(session: neo4j.Session, state: State) -> None: """ Connects to a neo4j session, runs the validation query, then saves the results to a state. 
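The set-based rewrite of compare_states in detect_deviations.py above swaps a per-row list scan for a set lookup. Result rows are lists, which are unhashable, so each row is converted to a tuple before the membership check; the pattern in isolation:

# Rows are lists of strings; tuples are hashable stand-ins for set membership.
start_rows = [['alice', 'admin'], ['bob', 'user']]
end_rows = [['alice', 'admin'], ['carol', 'user']]

start_set = {tuple(row) for row in start_rows}  # built once, O(len(start_rows))
drift = [row for row in end_rows if tuple(row) not in start_set]  # O(1) per check
assert drift == [['carol', 'user']]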
@@ -126,12 +138,16 @@ def get_state(session, state): :return: """ - new_results = session.run(state.validation_query) + new_results: List[Dict[str, Any]] = session.read_transaction( + read_list_of_dicts_tx, + state.validation_query, + ) logger.debug(f"Updating results for {state.name}") - state.properties = new_results.keys() - results = [] + # The keys will be the same across all items in the returned list + state.properties = list(new_results[0].keys()) if len(new_results) > 0 else [] + results = [] for record in new_results: values = [] for field in record.values(): diff --git a/cartography/driftdetect/model.py b/cartography/driftdetect/model.py index 8887fbc02d..f92c959271 100644 --- a/cartography/driftdetect/model.py +++ b/cartography/driftdetect/model.py @@ -1,4 +1,5 @@ import logging +from typing import List logger = logging.getLogger(__name__) @@ -19,13 +20,13 @@ class State: def __init__( self, - name, - validation_query, - properties, - results, + name: str, + validation_query: str, + properties: List[str], + results: List[List[str]], ): - self.name = name - self.validation_query = validation_query - self.properties = properties - self.results = results + self.name: str = name + self.validation_query: str = validation_query + self.properties: List[str] = properties + self.results: List[List[str]] = results diff --git a/cartography/intel/aws/ec2/instances.py b/cartography/intel/aws/ec2/instances.py index 1c69eb06a8..87c7e32028 100644 --- a/cartography/intel/aws/ec2/instances.py +++ b/cartography/intel/aws/ec2/instances.py @@ -17,7 +17,7 @@ from cartography.models.aws.ec2.reservations import EC2ReservationSchema from cartography.models.aws.ec2.securitygroups import EC2SecurityGroupSchema from cartography.models.aws.ec2.subnets import EC2SubnetSchema -from cartography.models.aws.ec2.volumes import EBSVolumeSchema +from cartography.models.aws.ec2.volumes import EBSVolumeInstanceSchema from cartography.util import aws_handle_regions from cartography.util import timeit @@ -273,7 +273,7 @@ def load_ec2_instance_ebs_volumes( ) -> None: load( neo4j_session, - EBSVolumeSchema(), + EBSVolumeInstanceSchema(), ebs_data, Region=region, AWS_ID=current_aws_account_id, diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py index 6de03c7d42..6b8610d3cc 100644 --- a/cartography/intel/aws/ec2/volumes.py +++ b/cartography/intel/aws/ec2/volumes.py @@ -6,7 +6,9 @@ import boto3 import neo4j +from cartography.client.core.tx import load from cartography.graph.job import GraphJob +from cartography.intel.aws.util.arns import build_arn from cartography.models.aws.ec2.volumes import EBSVolumeSchema from cartography.util import aws_handle_regions from cartography.util import timeit @@ -16,7 +18,7 @@ @timeit @aws_handle_regions -def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict]: +def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict[str, Any]]: client = boto3_session.client('ec2', region_name=region) paginator = client.get_paginator('describe_volumes') volumes: List[Dict] = [] @@ -26,90 +28,76 @@ def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict] def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_account_id: str) -> List[Dict[str, Any]]: + result = [] for volume in volumes: - volume['VolumeArn'] = f"arn:aws:ec2:{region}:{current_aws_account_id}:volume/{volume['VolumeId']}" - volume['CreateTime'] = str(volume['CreateTime']) - return volumes + attachments = 
volume.get('Attachments', []) + active_attachments = [a for a in attachments if a['State'] == 'attached'] + + volume_id = volume['VolumeId'] + raw_vol = ({ + 'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region), + 'AvailabilityZone': volume.get('AvailabilityZone'), + 'CreateTime': volume.get('CreateTime'), + 'Encrypted': volume.get('Encrypted'), + 'Size': volume.get('Size'), + 'State': volume.get('State'), + 'OutpostArn': volume.get('OutpostArn'), + 'SnapshotId': volume.get('SnapshotId'), + 'Iops': volume.get('Iops'), + 'FastRestored': volume.get('FastRestored'), + 'MultiAttachEnabled': volume.get('MultiAttachEnabled'), + 'VolumeType': volume.get('VolumeType'), + 'VolumeId': volume_id, + 'KmsKeyId': volume.get('KmsKeyId'), + }) + + if not active_attachments: + result.append(raw_vol) + continue + + for attachment in active_attachments: + vol_with_attachment = raw_vol.copy() + vol_with_attachment['InstanceId'] = attachment['InstanceId'] + result.append(vol_with_attachment) + + return result @timeit def load_volumes( - neo4j_session: neo4j.Session, data: List[Dict], region: str, current_aws_account_id: str, update_tag: int, + neo4j_session: neo4j.Session, + ebs_data: List[Dict[str, Any]], + region: str, + current_aws_account_id: str, + update_tag: int, ) -> None: - ingest_volumes = """ - UNWIND $volumes_list as volume - MERGE (vol:EBSVolume{id: volume.VolumeId}) - ON CREATE SET vol.firstseen = timestamp() - SET vol.arn = volume.VolumeArn, - vol.lastupdated = $update_tag, - vol.availabilityzone = volume.AvailabilityZone, - vol.createtime = volume.CreateTime, - vol.encrypted = volume.Encrypted, - vol.size = volume.Size, - vol.state = volume.State, - vol.outpostarn = volume.OutpostArn, - vol.snapshotid = volume.SnapshotId, - vol.iops = volume.Iops, - vol.fastrestored = volume.FastRestored, - vol.multiattachenabled = volume.MultiAttachEnabled, - vol.type = volume.VolumeType, - vol.kmskeyid = volume.KmsKeyId, - vol.region=$Region - WITH vol - MATCH (aa:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (aa)-[r:RESOURCE]->(vol) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $update_tag - """ - - neo4j_session.run( - ingest_volumes, - volumes_list=data, - AWS_ACCOUNT_ID=current_aws_account_id, + load( + neo4j_session, + EBSVolumeSchema(), + ebs_data, Region=region, - update_tag=update_tag, + AWS_ID=current_aws_account_id, + lastupdated=update_tag, ) -def load_volume_relationships( - neo4j_session: neo4j.Session, - volumes: List[Dict[str, Any]], - aws_update_tag: int, -) -> None: - add_relationship_query = """ - MATCH (volume:EBSVolume{arn: $VolumeArn}) - WITH volume - MATCH (instance:EC2Instance{instanceid: $InstanceId}) - MERGE (volume)-[r:ATTACHED_TO_EC2_INSTANCE]->(instance) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - """ - for volume in volumes: - for attachment in volume.get('Attachments', []): - if attachment['State'] != 'attached': - continue - neo4j_session.run( - add_relationship_query, - VolumeArn=volume['VolumeArn'], - InstanceId=attachment['InstanceId'], - aws_update_tag=aws_update_tag, - ) - - @timeit -def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: +def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None: GraphJob.from_node_schema(EBSVolumeSchema(), common_job_parameters).run(neo4j_session) @timeit def sync_ebs_volumes( - neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], - current_aws_account_id: str, 
update_tag: int, common_job_parameters: Dict, + neo4j_session: neo4j.Session, + boto3_session: boto3.session.Session, + regions: List[str], + current_aws_account_id: str, + update_tag: int, + common_job_parameters: Dict[str, Any], ) -> None: for region in regions: logger.debug("Syncing volumes for region '%s' in account '%s'.", region, current_aws_account_id) data = get_volumes(boto3_session, region) transformed_data = transform_volumes(data, region, current_aws_account_id) load_volumes(neo4j_session, transformed_data, region, current_aws_account_id, update_tag) - load_volume_relationships(neo4j_session, transformed_data, update_tag) cleanup_volumes(neo4j_session, common_job_parameters) diff --git a/cartography/intel/aws/ecr.py b/cartography/intel/aws/ecr.py index 0b5dbb12e2..4569024d52 100644 --- a/cartography/intel/aws/ecr.py +++ b/cartography/intel/aws/ecr.py @@ -1,4 +1,5 @@ import logging +from typing import Any from typing import Dict from typing import List @@ -9,6 +10,8 @@ from cartography.util import batch from cartography.util import run_cleanup_job from cartography.util import timeit +from cartography.util import to_asynchronous +from cartography.util import to_synchronous logger = logging.getLogger(__name__) @@ -139,6 +142,25 @@ def cleanup(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: run_cleanup_job('aws_import_ecr_cleanup.json', neo4j_session, common_job_parameters) +def _get_image_data( + boto3_session: boto3.session.Session, + region: str, + repositories: List[Dict[str, Any]], +) -> Dict[str, Any]: + ''' + Given a list of repositories, get the image data for each repository, + return as a mapping from repositoryUri to image object + ''' + image_data = {} + + async def async_get_images(repo: Dict[str, Any]) -> None: + repo_image_obj = await to_asynchronous(get_ecr_repository_images, boto3_session, region, repo['repositoryName']) + image_data[repo['repositoryUri']] = repo_image_obj + to_synchronous(*[async_get_images(repo) for repo in repositories]) + + return image_data + + @timeit def sync( neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], current_aws_account_id: str, @@ -148,9 +170,7 @@ def sync( logger.info("Syncing ECR for region '%s' in account '%s'.", region, current_aws_account_id) image_data = {} repositories = get_ecr_repositories(boto3_session, region) - for repo in repositories: - repo_image_obj = get_ecr_repository_images(boto3_session, region, repo['repositoryName']) - image_data[repo['repositoryUri']] = repo_image_obj + image_data = _get_image_data(boto3_session, region, repositories) load_ecr_repositories(neo4j_session, repositories, region, current_aws_account_id, update_tag) repo_images_list = transform_ecr_repository_images(image_data) load_ecr_repository_images(neo4j_session, repo_images_list, region, update_tag) diff --git a/cartography/intel/aws/s3.py b/cartography/intel/aws/s3.py index 6524bb4c44..6db11cc9fd 100644 --- a/cartography/intel/aws/s3.py +++ b/cartography/intel/aws/s3.py @@ -1,3 +1,4 @@ +import asyncio import hashlib import json import logging @@ -20,6 +21,8 @@ from cartography.util import run_analysis_job from cartography.util import run_cleanup_job from cartography.util import timeit +from cartography.util import to_asynchronous +from cartography.util import to_synchronous logger = logging.getLogger(__name__) stat_handler = get_stats_client(__name__) @@ -55,7 +58,9 @@ def get_s3_bucket_details( # a local store for s3 clients so that we may re-use clients for an AWS region 
s3_regional_clients: Dict[Any, Any] = {} - for bucket in bucket_data['Buckets']: + BucketDetail = Tuple[str, Dict[str, Any], Dict[str, Any], Dict[str, Any], Dict[str, Any], Dict[str, Any]] + + async def _get_bucket_detail(bucket: Dict[str, Any]) -> BucketDetail: # Note: bucket['Region'] is sometimes None because # client.get_bucket_location() does not return a location constraint for buckets # in us-east-1 region @@ -63,12 +68,23 @@ def get_s3_bucket_details( if not client: client = boto3_session.client('s3', bucket['Region']) s3_regional_clients[bucket['Region']] = client - acl = get_acl(bucket, client) - policy = get_policy(bucket, client) - encryption = get_encryption(bucket, client) - versioning = get_versioning(bucket, client) - public_access_block = get_public_access_block(bucket, client) - yield bucket['Name'], acl, policy, encryption, versioning, public_access_block + ( + acl, + policy, + encryption, + versioning, + public_access_block, + ) = await asyncio.gather( + to_asynchronous(get_acl, bucket, client), + to_asynchronous(get_policy, bucket, client), + to_asynchronous(get_encryption, bucket, client), + to_asynchronous(get_versioning, bucket, client), + to_asynchronous(get_public_access_block, bucket, client), + ) + return bucket['Name'], acl, policy, encryption, versioning, public_access_block + + bucket_details = to_synchronous(*[_get_bucket_detail(bucket) for bucket in bucket_data['Buckets']]) + yield from bucket_details @timeit diff --git a/cartography/intel/aws/ssm.py b/cartography/intel/aws/ssm.py index cd69499d6a..937cba52bf 100644 --- a/cartography/intel/aws/ssm.py +++ b/cartography/intel/aws/ssm.py @@ -6,9 +6,12 @@ import boto3 import neo4j +from cartography.client.core.tx import load +from cartography.graph.job import GraphJob +from cartography.models.aws.ssm.instance_information import SSMInstanceInformationSchema +from cartography.models.aws.ssm.instance_patch import SSMInstancePatchSchema from cartography.util import aws_handle_regions from cartography.util import dict_date_to_epoch -from cartography.util import run_cleanup_job from cartography.util import timeit logger = logging.getLogger(__name__) @@ -31,7 +34,9 @@ def get_instance_ids(neo4j_session: neo4j.Session, region: str, current_aws_acco @timeit @aws_handle_regions def get_instance_information( - boto3_session: boto3.session.Session, region: str, instance_ids: List[str], + boto3_session: boto3.session.Session, + region: str, + instance_ids: List[str], ) -> List[Dict[str, Any]]: client = boto3_session.client('ssm', region_name=region) instance_information: List[Dict[str, Any]] = [] @@ -46,10 +51,21 @@ def get_instance_information( return instance_information +def transform_instance_information(data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + for ii in data_list: + ii["LastPingDateTime"] = dict_date_to_epoch(ii, "LastPingDateTime") + ii["RegistrationDate"] = dict_date_to_epoch(ii, "RegistrationDate") + ii["LastAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastAssociationExecutionDate") + ii["LastSuccessfulAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastSuccessfulAssociationExecutionDate") + return data_list + + @timeit @aws_handle_regions def get_instance_patches( - boto3_session: boto3.session.Session, region: str, instance_ids: List[str], + boto3_session: boto3.session.Session, + region: str, + instance_ids: List[str], ) -> List[Dict[str, Any]]: client = boto3_session.client('ssm', region_name=region) instance_patches: List[Dict[str, Any]] = [] @@ -65,6 +81,16 @@ def 
get_instance_patches( return instance_patches +def transform_instance_patches(data_list: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + for p in data_list: + p["Id"] = f"{p['_instance_id']}-{p['Title']}" + p["InstalledTime"] = dict_date_to_epoch(p, "InstalledTime") + # Split the comma separated CVEIds, if they exist, and strip + # the empty string from the list if not. + p["CVEIds"] = list(filter(None, p.get("CVEIds", "").split(","))) + return data_list + + @timeit def load_instance_information( neo4j_session: neo4j.Session, @@ -73,55 +99,13 @@ def load_instance_information( current_aws_account_id: str, aws_update_tag: int, ) -> None: - ingest_query = """ - UNWIND $InstanceInformation AS instance - MERGE (i:SSMInstanceInformation{id: instance.InstanceId}) - ON CREATE SET i.firstseen = timestamp() - SET i.instance_id = instance.InstanceId, - i.ping_status = instance.PingStatus, - i.last_ping_date_time = instance.LastPingDateTime, - i.agent_version = instance.AgentVersion, - i.is_latest_version = instance.IsLatestVersion, - i.platform_type = instance.PlatformType, - i.platform_name = instance.PlatformName, - i.platform_version = instance.PlatformVersion, - i.activation_id = instance.ActivationId, - i.iam_role = instance.IamRole, - i.registration_date = instance.RegistrationDate, - i.resource_type = instance.ResourceType, - i.name = instance.Name, - i.ip_address = instance.IPAddress, - i.computer_name = instance.ComputerName, - i.association_status = instance.AssociationStatus, - i.last_association_execution_date = instance.LastAssociationExecutionDate, - i.last_successful_association_execution_date = instance.LastSuccessfulAssociationExecutionDate, - i.source_id = instance.SourceId, - i.source_type = instance.SourceType, - i.region = $Region, - i.lastupdated = $aws_update_tag - WITH i - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (owner)-[r:RESOURCE]->(i) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - WITH i - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID})-[:RESOURCE]->(ec2_instance:EC2Instance{id: i.instance_id}) - MERGE (ec2_instance)-[r2:HAS_INFORMATION]->(i) - ON CREATE SET r2.firstseen = timestamp() - SET r2.lastupdated = $aws_update_tag - """ - for ii in data: - ii["LastPingDateTime"] = dict_date_to_epoch(ii, "LastPingDateTime") - ii["RegistrationDate"] = dict_date_to_epoch(ii, "RegistrationDate") - ii["LastAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastAssociationExecutionDate") - ii["LastSuccessfulAssociationExecutionDate"] = dict_date_to_epoch(ii, "LastSuccessfulAssociationExecutionDate") - - neo4j_session.run( - ingest_query, - InstanceInformation=data, + load( + neo4j_session, + SSMInstanceInformationSchema(), + data, Region=region, - AWS_ACCOUNT_ID=current_aws_account_id, - aws_update_tag=aws_update_tag, + AWS_ID=current_aws_account_id, + lastupdated=aws_update_tag, ) @@ -133,61 +117,40 @@ def load_instance_patches( current_aws_account_id: str, aws_update_tag: int, ) -> None: - ingest_query = """ - UNWIND $InstancePatch AS patch - MERGE (p:SSMInstancePatch{id: patch._instance_id + "-" + patch.Title}) - ON CREATE SET p.firstseen = timestamp() - SET p.instance_id = patch._instance_id, - p.title = patch.Title, - p.kb_id = patch.KBId, - p.classification = patch.Classification, - p.severity = patch.Severity, - p.state = patch.State, - p.installed_time = patch.InstalledTime, - p.cve_ids = patch.CVEIds, - p.region = $Region, - p.lastupdated = $aws_update_tag - WITH p - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE 
(owner)-[r:RESOURCE]->(p) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - WITH p - MATCH (owner:AWSAccount{id: $AWS_ACCOUNT_ID})-[:RESOURCE]->(ec2_instance:EC2Instance{id: p.instance_id}) - MERGE (ec2_instance)-[r2:HAS_PATCH]->(p) - ON CREATE SET r2.firstseen = timestamp() - SET r2.lastupdated = $aws_update_tag - """ - for p in data: - p["InstalledTime"] = dict_date_to_epoch(p, "InstalledTime") - # Split the comma separated CVEIds, if they exist, and strip - # the empty string from the list if not. - p["CVEIds"] = list(filter(None, p.get("CVEIds", "").split(","))) - - neo4j_session.run( - ingest_query, - InstancePatch=data, + load( + neo4j_session, + SSMInstancePatchSchema(), + data, Region=region, - AWS_ACCOUNT_ID=current_aws_account_id, - aws_update_tag=aws_update_tag, + AWS_ID=current_aws_account_id, + lastupdated=aws_update_tag, ) @timeit -def cleanup_ssm(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: - run_cleanup_job('aws_import_ssm_cleanup.json', neo4j_session, common_job_parameters) +def cleanup_ssm(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None: + logger.info("Running SSM cleanup") + GraphJob.from_node_schema(SSMInstanceInformationSchema(), common_job_parameters).run(neo4j_session) + GraphJob.from_node_schema(SSMInstancePatchSchema(), common_job_parameters).run(neo4j_session) @timeit def sync( - neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], current_aws_account_id: str, - update_tag: int, common_job_parameters: Dict, + neo4j_session: neo4j.Session, + boto3_session: boto3.session.Session, + regions: List[str], + current_aws_account_id: str, + update_tag: int, + common_job_parameters: Dict[str, Any], ) -> None: for region in regions: logger.info("Syncing SSM for region '%s' in account '%s'.", region, current_aws_account_id) instance_ids = get_instance_ids(neo4j_session, region, current_aws_account_id) data = get_instance_information(boto3_session, region, instance_ids) + data = transform_instance_information(data) load_instance_information(neo4j_session, data, region, current_aws_account_id, update_tag) + data = get_instance_patches(boto3_session, region, instance_ids) + data = transform_instance_patches(data) load_instance_patches(neo4j_session, data, region, current_aws_account_id, update_tag) cleanup_ssm(neo4j_session, common_job_parameters) diff --git a/cartography/intel/aws/util/arns.py b/cartography/intel/aws/util/arns.py new file mode 100644 index 0000000000..e6108b82c3 --- /dev/null +++ b/cartography/intel/aws/util/arns.py @@ -0,0 +1,18 @@ +from typing import Optional + + +def build_arn( + resource: str, + account: str, + typename: str, + name: str, + region: Optional[str] = None, + partition: Optional[str] = None, +) -> str: + if not partition: + # TODO: support other partitions. Please file an issue if you need this; we would love to hear from you. + partition = 'aws' + if not region: + # Some resources are present in all regions, e.g. 
IAM policies + region = "" + return f"arn:{partition}:{resource}:{region}:{account}:{typename}/{name}" diff --git a/cartography/intel/github/repos.py b/cartography/intel/github/repos.py index 8fcadbd7f0..238537e033 100644 --- a/cartography/intel/github/repos.py +++ b/cartography/intel/github/repos.py @@ -312,14 +312,13 @@ def _transform_python_requirements( continue try: req = Requirement(stripped_line) + parsed_list.append(req) except InvalidRequirement: # INFO and not WARN/ERROR as we intentionally don't support all ways to specify Python requirements logger.info( f"Failed to parse line \"{line}\" in repo {repo_url}'s requirements.txt; skipping line.", exc_info=True, ) - continue - parsed_list.append(req) for req in parsed_list: pinned_version = None @@ -563,5 +562,5 @@ def sync( logger.info("Syncing GitHub repos") repos_json = get(github_api_key, github_url, organization) repo_data = transform(repos_json) - load(neo4j_session, repo_data, common_job_parameters['UPDATE_TAG']) + load(neo4j_session, common_job_parameters, repo_data) run_cleanup_job('github_repos_cleanup.json', neo4j_session, common_job_parameters) diff --git a/cartography/intel/gsuite/api.py b/cartography/intel/gsuite/api.py index 62d9cdfdab..92ed829e5e 100644 --- a/cartography/intel/gsuite/api.py +++ b/cartography/intel/gsuite/api.py @@ -130,9 +130,9 @@ def load_gsuite_groups(neo4j_session: neo4j.Session, groups: List[Dict], gsuite_ UNWIND $GroupData as group MERGE (g:GSuiteGroup{id: group.id}) ON CREATE SET - g.firstseen = $UpdateTag - ON MATCH SET - g.group_id = group.id, + g.firstseen = $UpdateTag, + g.group_id = group.id + SET g.admin_created = group.adminCreated, g.description = group.description, g.direct_members_count = group.directMembersCount, @@ -152,9 +152,9 @@ def load_gsuite_users(neo4j_session: neo4j.Session, users: List[Dict], gsuite_up UNWIND $UserData as user MERGE (u:GSuiteUser{id: user.id}) ON CREATE SET - u.firstseen = $UpdateTag - ON MATCH SET u.user_id = user.id, + u.firstseen = $UpdateTag + SET u.agreed_to_terms = user.agreedToTerms, u.archived = user.archived, u.change_password_at_next_login = user.changePasswordAtNextLogin, @@ -193,7 +193,7 @@ def load_gsuite_members(neo4j_session: neo4j.Session, group: Dict, members: List MERGE (user)-[r:MEMBER_GSUITE_GROUP]->(group) ON CREATE SET r.firstseen = $UpdateTag - ON MATCH SET + SET r.lastupdated = $UpdateTag """ neo4j_session.run( @@ -208,7 +208,7 @@ def load_gsuite_members(neo4j_session: neo4j.Session, group: Dict, members: List MERGE (group_1)-[r:MEMBER_GSUITE_GROUP]->(group_2) ON CREATE SET r.firstseen = $UpdateTag - ON MATCH SET + SET r.lastupdated = $UpdateTag """ neo4j_session.run(membership_qry, MemberData=members, GroupID=group.get("id"), UpdateTag=gsuite_update_tag) diff --git a/cartography/intel/semgrep/__init__.py b/cartography/intel/semgrep/__init__.py new file mode 100644 index 0000000000..dbd72f1dd3 --- /dev/null +++ b/cartography/intel/semgrep/__init__.py @@ -0,0 +1,23 @@ +import logging + +import neo4j + +from cartography.config import Config +from cartography.intel.semgrep.findings import sync +from cartography.util import timeit + + +logger = logging.getLogger(__name__) + + +@timeit +def start_semgrep_ingestion( + neo4j_session: neo4j.Session, config: Config, +) -> None: + common_job_parameters = { + "UPDATE_TAG": config.update_tag, + } + if not config.semgrep_app_token: + logger.info('Semgrep import is not configured - skipping this module. 
See docs to configure.') + return + sync(neo4j_session, config.semgrep_app_token, config.update_tag, common_job_parameters) diff --git a/cartography/intel/semgrep/findings.py b/cartography/intel/semgrep/findings.py new file mode 100644 index 0000000000..8612df28f8 --- /dev/null +++ b/cartography/intel/semgrep/findings.py @@ -0,0 +1,217 @@ +import logging +from typing import Any +from typing import Dict +from typing import List +from typing import Tuple + +import neo4j +import requests + +from cartography.client.core.tx import load +from cartography.graph.job import GraphJob +from cartography.models.semgrep.deployment import SemgrepDeploymentSchema +from cartography.models.semgrep.findings import SemgrepSCAFindingSchema +from cartography.models.semgrep.locations import SemgrepSCALocationSchema +from cartography.stats import get_stats_client +from cartography.util import merge_module_sync_metadata +from cartography.util import timeit + +logger = logging.getLogger(__name__) +stat_handler = get_stats_client(__name__) +_TIMEOUT = (60, 60) + + +@timeit +def get_deployment(semgrep_app_token: str) -> Dict[str, Any]: + """ + Gets the deployment associated with the passed Semgrep App token. + param: semgrep_app_token: The Semgrep App token to use for authentication. + """ + deployment = {} + deployment_url = "https://semgrep.dev/api/v1/deployments" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {semgrep_app_token}", + } + response = requests.get(deployment_url, headers=headers, timeout=_TIMEOUT) + response.raise_for_status() + + data = response.json() + deployment["id"] = data["deployments"][0]["id"] + deployment["name"] = data["deployments"][0]["name"] + deployment["slug"] = data["deployments"][0]["slug"] + + return deployment + + +@timeit +def get_sca_vulns(semgrep_app_token: str, deployment_id: str) -> List[Dict[str, Any]]: + """ + Gets the SCA vulns associated with the passed Semgrep App token and deployment id. + param: semgrep_app_token: The Semgrep App token to use for authentication. + param: deployment_id: The Semgrep deployment id to use for retrieving SCA vulns. + """ + all_vulns = [] + sca_url = f"https://semgrep.dev/api/sca/deployments/{deployment_id}/vulns" + has_more = True + cursor = "" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {semgrep_app_token}", + } + + while has_more: + params = {} + if cursor: + params = {"cursor": cursor} + + response = requests.get(sca_url, params=params, headers=headers, timeout=_TIMEOUT) + response.raise_for_status() + data = response.json() + vulns = data["vulns"] + cursor = data.get("cursor") + has_more = data.get("hasMore", False) + all_vulns.extend(vulns) + + return all_vulns + + +def transform_sca_vulns(raw_vulns: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, str]]]: + """ + Transforms the raw SCA vulns response from Semgrep API into a list of dicts + that can be used to create the SemgrepSCAFinding nodes. 
+ """ + vulns = [] + usages = [] + for vuln in raw_vulns: + sca_vuln: Dict[str, Any] = {} + # Mandatory fields + unique_id = f"{vuln['repositoryName']}|{vuln['advisory']['ruleId']}" + sca_vuln["id"] = unique_id + sca_vuln["repositoryName"] = vuln["repositoryName"] + sca_vuln["ruleId"] = vuln["advisory"]["ruleId"] + sca_vuln["title"] = vuln["advisory"]["title"] + sca_vuln["description"] = vuln["advisory"]["description"] + sca_vuln["ecosystem"] = vuln["advisory"]["ecosystem"] + sca_vuln["severity"] = vuln["advisory"]["severity"] + sca_vuln["reachability"] = vuln["advisory"]["reachability"] + sca_vuln["reachableIf"] = vuln["advisory"]["reachableIf"] + sca_vuln["exposureType"] = vuln["exposureType"] + dependency = f"{vuln['matchedDependency']['name']}|{vuln['matchedDependency']['versionSpecifier']}" + sca_vuln["matchedDependency"] = dependency + sca_vuln["dependencyFileLocation_path"] = vuln["dependencyFileLocation"]["path"] + sca_vuln["dependencyFileLocation_url"] = vuln["dependencyFileLocation"]["url"] + # Optional fields + sca_vuln["transitivity"] = vuln.get("transitivity", None) + cves = vuln.get("advisory", {}).get("references", {}).get("cveIds") + if len(cves) > 0: + # Take the first CVE + sca_vuln["cveId"] = vuln["advisory"]["references"]["cveIds"][0] + if vuln.get('closestSafeDependency'): + dep_fix = f"{vuln['closestSafeDependency']['name']}|{vuln['closestSafeDependency']['versionSpecifier']}" + sca_vuln["closestSafeDependency"] = dep_fix + if vuln["advisory"].get("references", {}).get("urls", []): + sca_vuln["ref_urls"] = vuln["advisory"].get("references", {}).get("urls", []) + sca_vuln["openedAt"] = vuln.get("openedAt", None) + for usage in vuln.get("usages", []): + usage_dict = {} + usage_dict["SCA_ID"] = unique_id + usage_dict["findingId"] = usage["findingId"] + usage_dict["path"] = usage["location"]["path"] + usage_dict["startLine"] = usage["location"]["startLine"] + usage_dict["startCol"] = usage["location"]["startCol"] + usage_dict["endLine"] = usage["location"]["endLine"] + usage_dict["endCol"] = usage["location"]["endCol"] + usage_dict["url"] = usage["location"]["url"] + usages.append(usage_dict) + vulns.append(sca_vuln) + return vulns, usages + + +@timeit +def load_semgrep_deployment( + neo4j_session: neo4j.Session, deployment: Dict[str, Any], update_tag: int, +) -> None: + logger.info(f"Loading Semgrep deployment info {deployment} into the graph...") + load( + neo4j_session, + SemgrepDeploymentSchema(), + [deployment], + lastupdated=update_tag, + ) + + +@timeit +def load_semgrep_sca_vulns( + neo4j_session: neo4j.Session, + vulns: List[Dict[str, Any]], + deployment_id: str, + update_tag: int, +) -> None: + logger.info(f"Loading {len(vulns)} Semgrep SCA vulns info into the graph.") + load( + neo4j_session, + SemgrepSCAFindingSchema(), + vulns, + lastupdated=update_tag, + DEPLOYMENT_ID=deployment_id, + ) + + +@timeit +def load_semgrep_sca_usages( + neo4j_session: neo4j.Session, + usages: List[Dict[str, Any]], + deployment_id: str, + update_tag: int, +) -> None: + logger.info(f"Loading {len(usages)} Semgrep SCA usages info into the graph.") + load( + neo4j_session, + SemgrepSCALocationSchema(), + usages, + lastupdated=update_tag, + DEPLOYMENT_ID=deployment_id, + ) + + +@timeit +def cleanup( + neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any], +) -> None: + logger.info("Running Semgrep SCA findings cleanup job.") + findings_cleanup_job = GraphJob.from_node_schema( + SemgrepSCAFindingSchema(), common_job_parameters, + ) + findings_cleanup_job.run(neo4j_session) 
+ logger.info("Running Semgrep SCA Locations cleanup job.") + locations_cleanup_job = GraphJob.from_node_schema( + SemgrepSCALocationSchema(), common_job_parameters, + ) + locations_cleanup_job.run(neo4j_session) + + +@timeit +def sync( + neo4j_sesion: neo4j.Session, + semgrep_app_token: str, + update_tag: int, + common_job_parameters: Dict[str, Any], +) -> None: + logger.info("Running Semgrep SCA findings sync job.") + semgrep_deployment = get_deployment(semgrep_app_token) + load_semgrep_deployment(neo4j_sesion, semgrep_deployment, update_tag) + common_job_parameters["DEPLOYMENT_ID"] = semgrep_deployment["id"] + raw_vulns = get_sca_vulns(semgrep_app_token, semgrep_deployment["id"]) + vulns, usages = transform_sca_vulns(raw_vulns) + load_semgrep_sca_vulns(neo4j_sesion, vulns, semgrep_deployment["id"], update_tag) + load_semgrep_sca_usages(neo4j_sesion, usages, semgrep_deployment["id"], update_tag) + cleanup(neo4j_sesion, common_job_parameters) + merge_module_sync_metadata( + neo4j_session=neo4j_sesion, + group_type='Semgrep', + group_id=common_job_parameters["DEPLOYMENT_ID"], + synced_type='SCA', + update_tag=update_tag, + stat_handler=stat_handler, + ) diff --git a/cartography/models/aws/ec2/volumes.py b/cartography/models/aws/ec2/volumes.py index 2140f4fcd0..bb6925780c 100644 --- a/cartography/models/aws/ec2/volumes.py +++ b/cartography/models/aws/ec2/volumes.py @@ -13,10 +13,23 @@ @dataclass(frozen=True) class EBSVolumeNodeProperties(CartographyNodeProperties): + arn: PropertyRef = PropertyRef('Arn', extra_index=True) id: PropertyRef = PropertyRef('VolumeId') + volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True) region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) - deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination') + availabilityzone: PropertyRef = PropertyRef('AvailabilityZone') + createtime: PropertyRef = PropertyRef('CreateTime') + encrypted: PropertyRef = PropertyRef('Encrypted') + size: PropertyRef = PropertyRef('Size') + state: PropertyRef = PropertyRef('State') + outpostarn: PropertyRef = PropertyRef('OutpostArn') + snapshotid: PropertyRef = PropertyRef('SnapshotId') + iops: PropertyRef = PropertyRef('Iops') + fastrestored: PropertyRef = PropertyRef('FastRestored') + multiattachenabled: PropertyRef = PropertyRef('MultiAttachEnabled') + type: PropertyRef = PropertyRef('VolumeType') + kmskeyid: PropertyRef = PropertyRef('KmsKeyId') @dataclass(frozen=True) @@ -53,6 +66,9 @@ class EBSVolumeToEC2Instance(CartographyRelSchema): @dataclass(frozen=True) class EBSVolumeSchema(CartographyNodeSchema): + """ + EBS Volume properties as returned from the EBS Volume API response + """ label: str = 'EBSVolume' properties: EBSVolumeNodeProperties = EBSVolumeNodeProperties() sub_resource_relationship: EBSVolumeToAWSAccount = EBSVolumeToAWSAccount() @@ -61,3 +77,31 @@ class EBSVolumeSchema(CartographyNodeSchema): EBSVolumeToEC2Instance(), ], ) + + +@dataclass(frozen=True) +class EBSVolumeInstanceProperties(CartographyNodeProperties): + """ + EBS Volume properties as known by an EC2 instance. + The EC2 instance API response includes a `deleteontermination` field and the volume id. 
+ """ + arn: PropertyRef = PropertyRef('Arn', extra_index=True) + id: PropertyRef = PropertyRef('VolumeId') + volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination') + + +@dataclass(frozen=True) +class EBSVolumeInstanceSchema(CartographyNodeSchema): + """ + EBS Volume from EC2 Instance API response. This is separate from `EBSVolumeSchema` to prevent issue #1210. + """ + label: str = 'EBSVolume' + properties: EBSVolumeInstanceProperties = EBSVolumeInstanceProperties() + sub_resource_relationship: EBSVolumeToAWSAccount = EBSVolumeToAWSAccount() + other_relationships: OtherRelationships = OtherRelationships( + [ + EBSVolumeToEC2Instance(), + ], + ) diff --git a/cartography/models/aws/ssm/__init__.py b/cartography/models/aws/ssm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cartography/models/aws/ssm/instance_information.py b/cartography/models/aws/ssm/instance_information.py new file mode 100644 index 0000000000..b678ebe30d --- /dev/null +++ b/cartography/models/aws/ssm/instance_information.py @@ -0,0 +1,82 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SSMInstanceInformationNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('InstanceId') + instance_id: PropertyRef = PropertyRef('InstanceId', extra_index=True) + region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + ping_status: PropertyRef = PropertyRef('PingStatus') + last_ping_date_time: PropertyRef = PropertyRef('LastPingDateTime') + agent_version: PropertyRef = PropertyRef('AgentVersion') + is_latest_version: PropertyRef = PropertyRef('IsLatestVersion') + platform_type: PropertyRef = PropertyRef('PlatformType') + platform_name: PropertyRef = PropertyRef('PlatformName') + platform_version: PropertyRef = PropertyRef('PlatformVersion') + activation_id: PropertyRef = PropertyRef('ActivationId') + iam_role: PropertyRef = PropertyRef('IamRole') + registration_date: PropertyRef = PropertyRef('RegistrationDate') + resource_type: PropertyRef = PropertyRef('ResourceType') + name: PropertyRef = PropertyRef('Name') + ip_address: PropertyRef = PropertyRef('IPAddress') + computer_name: PropertyRef = PropertyRef('ComputerName') + association_status: PropertyRef = PropertyRef('AssociationStatus') + last_association_execution_date: PropertyRef = PropertyRef('LastAssociationExecutionDate') + last_successful_association_execution_date: PropertyRef = PropertyRef('LastSuccessfulAssociationExecutionDate') + source_id: PropertyRef = PropertyRef('SourceId') + source_type: PropertyRef = PropertyRef('SourceType') + + +@dataclass(frozen=True) +class SSMInstanceInformationToAWSAccountRelProperties(CartographyRelProperties): + lastupdated: 
PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstanceInformationToAWSAccount(CartographyRelSchema): + target_node_label: str = 'AWSAccount' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('AWS_ID', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: SSMInstanceInformationToAWSAccountRelProperties = SSMInstanceInformationToAWSAccountRelProperties() + + +@dataclass(frozen=True) +class SSMInstanceInformationToEC2InstanceRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstanceInformationToEC2Instance(CartographyRelSchema): + target_node_label: str = 'EC2Instance' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('InstanceId')}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "HAS_INFORMATION" + properties: SSMInstanceInformationToEC2InstanceRelProperties = SSMInstanceInformationToEC2InstanceRelProperties() + + +@dataclass(frozen=True) +class SSMInstanceInformationSchema(CartographyNodeSchema): + label: str = 'SSMInstanceInformation' + properties: SSMInstanceInformationNodeProperties = SSMInstanceInformationNodeProperties() + sub_resource_relationship: SSMInstanceInformationToAWSAccount = SSMInstanceInformationToAWSAccount() + other_relationships: OtherRelationships = OtherRelationships( + [ + SSMInstanceInformationToEC2Instance(), + ], + ) diff --git a/cartography/models/aws/ssm/instance_patch.py b/cartography/models/aws/ssm/instance_patch.py new file mode 100644 index 0000000000..25686f5b80 --- /dev/null +++ b/cartography/models/aws/ssm/instance_patch.py @@ -0,0 +1,70 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SSMInstancePatchNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('Id') + instance_id: PropertyRef = PropertyRef('_instance_id', extra_index=True) + region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + title: PropertyRef = PropertyRef('Title', extra_index=True) + kb_id: PropertyRef = PropertyRef('KBId', extra_index=True) + classification: PropertyRef = PropertyRef('Classification') + severity: PropertyRef = PropertyRef('Severity') + state: PropertyRef = PropertyRef('State') + installed_time: PropertyRef = PropertyRef('InstalledTime') + cve_ids: PropertyRef = PropertyRef('CVEIds') + + +@dataclass(frozen=True) +class SSMInstancePatchToAWSAccountRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstancePatchToAWSAccount(CartographyRelSchema): + target_node_label: str = 'AWSAccount' + target_node_matcher: 
TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('AWS_ID', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: SSMInstancePatchToAWSAccountRelProperties = SSMInstancePatchToAWSAccountRelProperties() + + +@dataclass(frozen=True) +class SSMInstancePatchToEC2InstanceRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +class SSMInstancePatchToEC2Instance(CartographyRelSchema): + target_node_label: str = 'EC2Instance' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('_instance_id')}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "HAS_PATCH" + properties: SSMInstancePatchToEC2InstanceRelProperties = SSMInstancePatchToEC2InstanceRelProperties() + + +@dataclass(frozen=True) +class SSMInstancePatchSchema(CartographyNodeSchema): + label: str = 'SSMInstancePatch' + properties: SSMInstancePatchNodeProperties = SSMInstancePatchNodeProperties() + sub_resource_relationship: SSMInstancePatchToAWSAccount = SSMInstancePatchToAWSAccount() + other_relationships: OtherRelationships = OtherRelationships( + [ + SSMInstancePatchToEC2Instance(), + ], + ) diff --git a/cartography/models/semgrep/__init__.py b/cartography/models/semgrep/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/cartography/models/semgrep/deployment.py b/cartography/models/semgrep/deployment.py new file mode 100644 index 0000000000..6248e01491 --- /dev/null +++ b/cartography/models/semgrep/deployment.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema + + +@dataclass(frozen=True) +class SemgrepDeploymentProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('id') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + name: PropertyRef = PropertyRef('name', extra_index=True) + slug: PropertyRef = PropertyRef('slug', extra_index=True) + + +@dataclass(frozen=True) +class SemgrepDeploymentSchema(CartographyNodeSchema): + label: str = 'SemgrepDeployment' + properties: SemgrepDeploymentProperties = SemgrepDeploymentProperties() diff --git a/cartography/models/semgrep/findings.py b/cartography/models/semgrep/findings.py new file mode 100644 index 0000000000..79b2e7011d --- /dev/null +++ b/cartography/models/semgrep/findings.py @@ -0,0 +1,80 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SemgrepSCAFindingNodeProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('id') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + rule_id: PropertyRef = PropertyRef('ruleId', 
extra_index=True) + repository: PropertyRef = PropertyRef('repositoryName', extra_index=True) + summary: PropertyRef = PropertyRef('title', extra_index=True) + description: PropertyRef = PropertyRef('description') + package_manager: PropertyRef = PropertyRef('ecosystem') + severity: PropertyRef = PropertyRef('severity') + cve_id: PropertyRef = PropertyRef('cveId', extra_index=True) + reachability_check: PropertyRef = PropertyRef('reachability') + reachability_condition: PropertyRef = PropertyRef('reachableIf') + reachability: PropertyRef = PropertyRef('exposureType') + transitivity: PropertyRef = PropertyRef('transitivity') + dependency: PropertyRef = PropertyRef('matchedDependency') + dependency_fix: PropertyRef = PropertyRef('closestSafeDependency') + ref_urls: PropertyRef = PropertyRef('ref_urls') + dependency_file: PropertyRef = PropertyRef('dependencyFileLocation_path', extra_index=True) + dependency_file_url: PropertyRef = PropertyRef('dependencyFileLocation_url', extra_index=True) + scan_time: PropertyRef = PropertyRef('openedAt') + + +@dataclass(frozen=True) +class SemgrepSCAFindingToSemgrepDeploymentRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCAFinding)<-[:RESOURCE]-(:SemgrepDeployment) +class SemgrepSCAFindingToSemgrepDeploymentSchema(CartographyRelSchema): + target_node_label: str = 'SemgrepDeployment' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('DEPLOYMENT_ID', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: SemgrepSCAFindingToSemgrepDeploymentRelProperties = SemgrepSCAFindingToSemgrepDeploymentRelProperties() + + +@dataclass(frozen=True) +class SemgrepSCAFindingToGithubRepoRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCAFinding)-[:FOUND_IN]->(:GitHubRepository) +class SemgrepSCAFindingToGithubRepoRel(CartographyRelSchema): + target_node_label: str = 'GitHubRepository' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'fullname': PropertyRef('repositoryName')}, + ) + direction: LinkDirection = LinkDirection.OUTWARD + rel_label: str = "FOUND_IN" + properties: SemgrepSCAFindingToGithubRepoRelProperties = SemgrepSCAFindingToGithubRepoRelProperties() + + +@dataclass(frozen=True) +class SemgrepSCAFindingSchema(CartographyNodeSchema): + label: str = 'SemgrepSCAFinding' + properties: SemgrepSCAFindingNodeProperties = SemgrepSCAFindingNodeProperties() + sub_resource_relationship: SemgrepSCAFindingToSemgrepDeploymentSchema = SemgrepSCAFindingToSemgrepDeploymentSchema() + other_relationships: OtherRelationships = OtherRelationships( + [ + SemgrepSCAFindingToGithubRepoRel(), + ], + ) diff --git a/cartography/models/semgrep/locations.py b/cartography/models/semgrep/locations.py new file mode 100644 index 0000000000..74a054d700 --- /dev/null +++ b/cartography/models/semgrep/locations.py @@ -0,0 +1,69 @@ +from dataclasses import dataclass + +from cartography.models.core.common import PropertyRef +from cartography.models.core.nodes import CartographyNodeProperties +from cartography.models.core.nodes import CartographyNodeSchema +from cartography.models.core.relationships import CartographyRelProperties +from cartography.models.core.relationships import CartographyRelSchema +from cartography.models.core.relationships import 
LinkDirection +from cartography.models.core.relationships import make_target_node_matcher +from cartography.models.core.relationships import OtherRelationships +from cartography.models.core.relationships import TargetNodeMatcher + + +@dataclass(frozen=True) +class SemgrepSCALocationProperties(CartographyNodeProperties): + id: PropertyRef = PropertyRef('findingId') + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + path: PropertyRef = PropertyRef('path', extra_index=True) + start_line: PropertyRef = PropertyRef('startLine') + start_col: PropertyRef = PropertyRef('startCol') + end_line: PropertyRef = PropertyRef('endLine') + end_col: PropertyRef = PropertyRef('endCol') + url: PropertyRef = PropertyRef('url') + + +@dataclass(frozen=True) +class SemgrepSCALocToSemgrepSCAFindingRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCALocation)<-[:USAGE_AT]-(:SemgrepSCAFinding) +class SemgrepSCALocToSemgrepSCAFindingRelSchema(CartographyRelSchema): + target_node_label: str = 'SemgrepSCAFinding' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('SCA_ID')}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "USAGE_AT" + properties: SemgrepSCALocToSemgrepSCAFindingRelProperties = SemgrepSCALocToSemgrepSCAFindingRelProperties() + + +@dataclass(frozen=True) +class SemgrepSCALocToSemgrepSCADeploymentRelProperties(CartographyRelProperties): + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + + +@dataclass(frozen=True) +# (:SemgrepSCALocation)<-[:RESOURCE]-(:SemgrepSCADeployment) +class SemgrepSCALocToSCADeploymentRelSchema(CartographyRelSchema): + target_node_label: str = 'SemgrepDeployment' + target_node_matcher: TargetNodeMatcher = make_target_node_matcher( + {'id': PropertyRef('DEPLOYMENT_ID', set_in_kwargs=True)}, + ) + direction: LinkDirection = LinkDirection.INWARD + rel_label: str = "RESOURCE" + properties: SemgrepSCALocToSemgrepSCADeploymentRelProperties = SemgrepSCALocToSemgrepSCADeploymentRelProperties() + + +@dataclass(frozen=True) +class SemgrepSCALocationSchema(CartographyNodeSchema): + label: str = 'SemgrepSCALocation' + properties: SemgrepSCALocationProperties = SemgrepSCALocationProperties() + sub_resource_relationship: SemgrepSCALocToSCADeploymentRelSchema = SemgrepSCALocToSCADeploymentRelSchema() + other_relationships: OtherRelationships = OtherRelationships( + [ + SemgrepSCALocToSemgrepSCAFindingRelSchema(), + ], + ) diff --git a/cartography/util.py b/cartography/util.py index 5d26157643..30b9bccd80 100644 --- a/cartography/util.py +++ b/cartography/util.py @@ -1,9 +1,12 @@ +import asyncio import logging import re import sys +from functools import partial from functools import wraps from string import Template from typing import Any +from typing import Awaitable from typing import BinaryIO from typing import Callable from typing import cast @@ -25,6 +28,7 @@ from cartography.stats import get_stats_client from cartography.stats import ScopedStatsClient + if sys.version_info >= (3, 7): from importlib.resources import open_binary, read_text else: @@ -141,6 +145,7 @@ def load_resource_binary(package: str, resource_name: str) -> BinaryIO: return open_binary(package, resource_name) +R = TypeVar('R') F = TypeVar('F', bound=Callable[..., Any]) @@ -297,3 +302,94 @@ def batch(items: Iterable, size: int = DEFAULT_BATCH_SIZE) -> List[List]: items[i: i + size] for i in range(0, 
len(items), size)
    ]
+
+
+def is_throttling_exception(exc: Exception) -> bool:
+    '''
+    Returns True if the exception is caused by a client library's throttling mechanism
+    '''
+    # https://boto3.amazonaws.com/v1/documentation/api/1.19.9/guide/error-handling.html
+    if isinstance(exc, botocore.exceptions.ClientError):
+        if exc.response['Error']['Code'] in ['LimitExceededException', 'Throttling']:
+            return True
+    # add other exceptions here, if needed, like:
+    # https://cloud.google.com/python/docs/reference/storage/1.39.0/retry_timeout#configuring-retries
+    # if isinstance(exc, google.api_core.exceptions.TooManyRequests):
+    #     return True
+    return False
+
+
+def to_asynchronous(func: Callable[..., R], *args: Any, **kwargs: Any) -> Awaitable[R]:
+    '''
+    Returns a Future that will run a function and its arguments in the default threadpool.
+    Helper until we start using Python 3.9's asyncio.to_thread.
+
+    Calls are also wrapped within a backoff decorator to handle throttling errors.
+
+    :param func: the function to be wrapped by the Future
+    :param args: a series of arguments to be passed into func
+    :param kwargs: a series of keyword arguments to be passed into func
+
+    example:
+    def my_func(arg1, arg2, kwarg1):
+        return arg1 + arg2 + kwarg1
+
+    # normal synchronous call:
+    result = my_func(1, 2, kwarg1=3)
+
+    # asynchronous call:
+    future = to_asynchronous(my_func, 1, 2, kwarg1=3)
+
+    # the result is stored in the future, and can be retrieved
+    # from within another async function with:
+    await future
+
+    # or from within a synchronous function with our helper:
+    to_synchronous(future)
+
+    NOTE: to use this in a Jupyter notebook, you need to do:
+    # import nest_asyncio
+    # nest_asyncio.apply()
+    '''
+    CartographyThrottlingException = type('CartographyThrottlingException', (Exception,), {})
+
+    @wraps(func)
+    def wrapper(*args: Any, **kwargs: Any) -> R:
+        try:
+            return func(*args, **kwargs)
+        except Exception as exc:
+            if is_throttling_exception(exc):
+                raise CartographyThrottlingException from exc
+            raise
+
+    # don't use @backoff as a decorator, to preserve typing
+    wrapped = backoff.on_exception(backoff.expo, CartographyThrottlingException)(wrapper)
+    call = partial(wrapped, *args, **kwargs)
+    return asyncio.get_event_loop().run_in_executor(None, call)
+
+
+def to_synchronous(*awaitables: Awaitable[Any]) -> List[Any]:
+    '''
+    Synchronously waits for the Awaitable(s) to complete and returns their result(s).
+    See https://docs.python.org/3.8/library/asyncio-task.html#asyncio-awaitables
+
+    :param awaitables: a series of Awaitable objects, with each object being its own argument,
+    i.e., not a single list of Awaitables
+
+    example:
+    async def my_async_func(my_arg):
+        return my_arg
+
+    async def another_async_func(my_arg2):
+        return my_arg2
+
+    Remember that an invocation of an async function returns a coroutine (an Awaitable),
+    which needs to be awaited to get the result. You cannot await one from within
+    a non-async function, so you can use this helper to get the result instead:
+
+    future_1 = my_async_func(1)
+    future_2 = another_async_func(2)
+
+    results = to_synchronous(future_1, future_2)
+    '''
+    return asyncio.get_event_loop().run_until_complete(asyncio.gather(*awaitables))
diff --git a/docs/root/modules/_cartography-metadata/schema.md b/docs/root/modules/_cartography-metadata/schema.md
new file mode 100644
index 0000000000..878d8268df
--- /dev/null
+++ b/docs/root/modules/_cartography-metadata/schema.md
@@ -0,0 +1,18 @@
+## Cartography metadata schema
+
+.. _metadata_schema:
+
+Some Cartography sync jobs write nodes to convey information about the job itself. See https://github.com/lyft/cartography/issues/758 for more background on this.
+
+### SyncMetadata:ModuleSyncMetadata
+
+This is a node to represent metadata about the sync job of a particular module. Its existence indicates that a particular sync job did happen.
+The 'types' used here should be actual node labels. For example, if we did sync a particular AWSAccount's S3Buckets,
+the `grouptype` is 'AWSAccount', the `groupid` is the particular account's `id`, and the `syncedtype` is 'S3Bucket'.
+
+| Field | Description | Source|
+|-------|-------------|------|
+|**id**|`{group_type}_{group_id}_{synced_type}`|util.py|
+|grouptype| The parent module's type |util.py|
+|groupid|The parent module's id|util.py|
+|syncedtype|The sub-module's type|util.py|
diff --git a/docs/root/modules/semgrep/config.md b/docs/root/modules/semgrep/config.md
new file mode 100644
index 0000000000..6beb6616c5
--- /dev/null
+++ b/docs/root/modules/semgrep/config.md
@@ -0,0 +1,9 @@
+## Semgrep Configuration
+
+.. _semgrep_config:
+
+Follow these steps to ingest Semgrep findings with Cartography:
+
+1. Create a token with the *Agent (CI)* and *Web API* scopes by following [Creating a SEMGREP_APP_TOKEN](https://semgrep.dev/docs/semgrep-ci/running-semgrep-ci-with-semgrep-cloud-platform/#creating-a-semgrep_app_token).
+1. Populate an environment variable with the secret value of the token.
+1. Pass the environment variable name to the `--semgrep-app-token-env-var` CLI arg.
diff --git a/docs/root/modules/semgrep/index.rst b/docs/root/modules/semgrep/index.rst
new file mode 100644
index 0000000000..54a4d625b9
--- /dev/null
+++ b/docs/root/modules/semgrep/index.rst
@@ -0,0 +1,13 @@
+Semgrep
+#######
+
+The Semgrep module has the following coverage:
+
+* Deployment
+* SCA Findings
+
+.. toctree::
+    :hidden:
+    :glob:
+
+    *
diff --git a/docs/root/modules/semgrep/schema.md b/docs/root/modules/semgrep/schema.md
new file mode 100644
index 0000000000..415e946c2c
--- /dev/null
+++ b/docs/root/modules/semgrep/schema.md
@@ -0,0 +1,89 @@
+## Semgrep Schema
+
+.. _semgrep_schema:
+
+### SemgrepDeployment
+
+Represents a Semgrep [Deployment](https://semgrep.dev/api/v1/docs/#tag/Deployment), a unit encapsulating a security organization inside Semgrep Cloud. Serves as the parent of all other Semgrep resources.
+
+| Field | Description |
+|-------|--------------|
+| firstseen | Timestamp of when a sync job first discovered this node |
+| lastupdated | Timestamp of the last time the node was updated |
+| **id** | Unique integer id representing the deployment |
+| **slug** | Lowercase string id representing the deployment to query the API |
+| **name** | Name of security organization connected to the deployment |
+
+#### Relationships
+
+- A SemgrepDeployment contains SemgrepSCAFindings
+
+    ```
+    (SemgrepDeployment)-[RESOURCE]->(SemgrepSCAFinding)
+    ```
+
+- A SemgrepDeployment contains SemgrepSCALocations
+
+    ```
+    (SemgrepDeployment)-[RESOURCE]->(SemgrepSCALocation)
+    ```
+
+### SemgrepSCAFinding
+
+Represents a [Semgrep Supply Chain](https://semgrep.dev/docs/semgrep-supply-chain/overview/) finding: that is, a vulnerability in a project dependency discovered by Semgrep performing software composition analysis (SCA) and code reachability analysis. Before ingesting this node, make sure you have run Semgrep CI and connected it to Semgrep Cloud Platform; see [Running Semgrep CI with Semgrep Cloud Platform](https://semgrep.dev/docs/semgrep-ci/running-semgrep-ci-with-semgrep-cloud-platform/).
+
+| Field | Description |
+|-------|--------------|
+| firstseen | Timestamp of when a sync job first discovered this node |
+| lastupdated | Timestamp of the last time the node was updated |
+| **id** | A composite id built from the repository path and the rule that triggered the finding |
+| **rule_id** | The rule that triggered the finding |
+| **repository** | The repository path where the finding was discovered |
+| summary | A short title summarizing the finding |
+| description | Description of the vulnerability |
+| package_manager | The ecosystem of the dependency where the finding was discovered (e.g. pypi, npm, maven) |
+| severity | Severity of the finding (e.g. CRITICAL, HIGH, MEDIUM, LOW) |
+| cve_id | CVE id of the vulnerability from NVD. Check [cve_schema](../cve/schema.md) |
+| reachability_check | Whether the vulnerability's reachability is confirmed, not confirmed, or needs manual confirmation |
+| reachability_condition | Description of the reachability condition (e.g. reachable if code is used in X way) |
+| reachability | Whether the vulnerability is reachable or not |
+| transitivity | Whether the dependency is direct or transitive (e.g. direct, transitive) |
+| dependency | Dependency where the finding was discovered. Includes dependency name and version |
+| dependency_fix | Dependency version that fixes the vulnerability |
+| ref_urls | List of reference urls for the finding |
+| dependency_file | Path of the file where the finding was discovered (e.g. lock.json, requirements.txt) |
+| dependency_file_url | URL of the file where the finding was discovered |
+| scan_time | Date and time when the finding was discovered, in UTC |
+
+
+#### Relationships
+
+- A SemgrepSCAFinding is connected to a GitHubRepository (optional)
+
+    ```
+    (SemgrepSCAFinding)-[FOUND_IN]->(GitHubRepository)
+    ```
+
+- A SemgrepSCAFinding's vulnerable dependency is used at a SemgrepSCALocation (optional)
+
+    ```
+    (SemgrepSCAFinding)-[USAGE_AT]->(SemgrepSCALocation)
+    ```
+
+
+### SemgrepSCALocation
+
+Represents the location in a repository where a vulnerable dependency is used in a way that can trigger the vulnerability.
+
+| Field | Description |
+|-------|--------------|
+| firstseen | Timestamp of when a sync job first discovered this node |
+| lastupdated | Timestamp of the last time the node was updated |
+| **id** | Unique id identifying the location of the finding |
+| path | Path of the file where the usage was discovered |
+| start_line | Line where the usage starts |
+| start_col | Column where the usage starts |
+| end_line | Line where the usage ends |
+| end_col | Column where the usage ends |
+| url | URL of the file where the usage was discovered |
diff --git a/docs/root/usage/schema.md b/docs/root/usage/schema.md
index 3d6da845c4..a5f1d101d4 100644
--- a/docs/root/usage/schema.md
+++ b/docs/root/usage/schema.md
@@ -22,6 +22,7 @@
 - In these docs, more specific nodes will be decorated with `GenericNode::SpecificNode` notation. For example, if we have a `Car` node and a `RaceCar` node, we will refer to the `RaceCar` as `Car::RaceCar`.
 
+.. mdinclude:: ../modules/_cartography-metadata/schema.md
 .. mdinclude:: ../modules/aws/schema.md
 .. mdinclude:: ../modules/azure/schema.md
 .. mdinclude:: ../modules/crxcavator/schema.md
diff --git a/docs/root/usage/tutorial.md b/docs/root/usage/tutorial.md
index 357c8a7f2e..f0cb52fa9b 100644
--- a/docs/root/usage/tutorial.md
+++ b/docs/root/usage/tutorial.md
@@ -2,24 +2,16 @@
 
 Once everything has been installed and synced, you can view the Neo4j web interface at http://localhost:7474. You can view the reference on this [here](https://neo4j.com/developer/guide-neo4j-browser/#_installing_and_starting_neo4j_browser).
 
-### Permalinking Bookmarklet
+If you already know Neo4j and just need a reference for the nodes, attributes, and graph relationships in our representation of infrastructure assets, you can view our [sample queries](samplequeries.html). More sample queries are available at https://github.com/marco-lancini/cartography-queries.
 
-You can set up a bookmarklet that lets you quickly get a permalink to a Cartography query. To do so, add a bookmark with the following contents as the URL - make sure to replace `neo4j.contoso.com:7474` with your instance of Neo4j:
+Otherwise, read on for this handhold-y tutorial filled with examples. Suppose we wanted to find out:
 
-```javascript
-javascript:(() => { const query = document.querySelectorAll('article label span')[0].innerText; if (query === ':server connect') { console.log('no query has been run!'); return; } const searchParams = new URLSearchParams(); searchParams.append('connectURL', 'bolt://neo4j:neo4j@neo4j.contoso.net:7687'); searchParams.append('cmd', 'edit'); searchParams.append('arg', query.replaceAll(/\r /g, '\r')); newURL = `http://neo4j.contoso.net:7474/browser/?${searchParams}`; window.open(newURL, '_blank', 'noopener'); })()
-```
-
-Then, any time you are in the web interface, you can click the bookmarklet to open a new tab with a permalink to your most recently executed query in the URL bar.
-
-### ℹī¸ Already know [how to query Neo4j](https://neo4j.com/developer/cypher-query-language/)? You can skip to our reference material!
-If you already know Neo4j and just need to know what are the nodes, attributes, and graph relationships for our representation of infrastructure assets, you can skip this handholdy walkthrough and see our [sample queries](samplequeries.md).
-
-### What [RDS](https://aws.amazon.com/rds/) instances are installed in my [AWS](https://aws.amazon.com/) accounts?
-```
+### What [RDS](https://aws.amazon.com/rds/) instances are installed in my AWS accounts?
+```cypher
 MATCH (aws:AWSAccount)-[r:RESOURCE]->(rds:RDSInstance) return *
 ```
+
 ![Visualization of RDS nodes and AWS nodes](../images/accountsandrds.png)
 
 In this query we asked Neo4j to find all `[:RESOURCE]` relationships from AWSAccounts to RDSInstances, and return the nodes and the `:RESOURCE` relationships.
@@ -35,7 +27,7 @@ and then pick options on the menu that shows up at the bottom of the view like t
 
 ### Which RDS instances have [encryption](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Overview.Encryption.html) turned off?
-```
+```cypher
 MATCH (a:AWSAccount)-[:RESOURCE]->(rds:RDSInstance{storage_encrypted:false}) RETURN a.name, rds.id
 ```
@@ -49,7 +41,7 @@ If you want to go back to viewing the graph and not a table, simply make sure yo
 
 Let's look at some other AWS assets now.
 
 ### Which [EC2](https://aws.amazon.com/ec2/) instances are directly exposed to the internet?
-``` +```cypher MATCH (instance:EC2Instance{exposed_internet: true}) RETURN instance.instanceid, instance.publicdnsname ``` @@ -60,7 +52,7 @@ These instances are open to the internet either through permissive inbound IP pe If you know a lot about AWS, you may have noticed that EC2 instances [don't actually have an exposed_internet field](https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_Instance.html). We're able to query for this because Cartography performs some [data enrichment](#data-enrichment) to add this field to EC2Instance nodes. ### Which [S3](https://aws.amazon.com/s3/) buckets have a policy granting any level of anonymous access to the bucket? -``` +```cypher MATCH (s:S3Bucket) WHERE s.anonymous_access = true RETURN s @@ -76,13 +68,81 @@ A couple of other things to notice: instead of using the "{}" notation to filter Let's go back to analyzing RDS instances. In an earlier example we queried for RDS instances that have encryption turned off. We can aggregate this data by AWSAccount with a small change: -``` +```cypher MATCH (a:AWSAccount)-[:RESOURCE]->(rds:RDSInstance) WHERE rds.storage_encrypted = false RETURN a.name as AWSAccount, count(rds) as UnencryptedInstances ``` ![Table of unencrypted RDS instances by AWS account](../images/unencryptedcounts.png) + +### Given a node label, what other node labels can be connected to it? + +Suppose we wanted to know what other assets can be connected to a DNSRecord. We would ask the graph like this: + +```cypher +match (d:DNSRecord)--(n) +return distinct labels(n); +``` + +This says "what are the possible labels for all nodes connected to all DNSRecord nodes `d` in my graph?" Your answer might look like this: + +``` +["AWSDNSRecord", "DNSRecord"] +["AWSDNSZone", "DNSZone"] +["LoadBalancerV2"] +["NameServer"] +["ESDomain"] +["LoadBalancer"] +["EC2Instance", "Instance"] +``` + +You can then make the path more specific like this: + +```cypher +match (d:DNSRecord)--(:EC2Instance)--(n) +return distinct labels(n); +``` + +And then you can continue building your query. + +We also include [full schema docs](schema.html), but this way of building a query can be faster and more interactive. + + +### Given a node label, what are the possible property names defined on it? + +We can find what properties are available on an S3Bucket like this: + +```cypher +match (n:S3Bucket) return properties(n) limit 1; +``` + +The result will look like this: + +``` +{ + "bucket_key_enabled": false, + "creationdate": "2022-05-10 00:22:52+00:00", + "ignore_public_acls": true, + "anonymous_access": false, + "firstseen": 1652400141863, + "block_public_policy": true, + "versioning_status": "Enabled", + "block_public_acls": true, + "anonymous_actions": [], + "name": "my-fake-bucket-123", + "lastupdated": 1688605272, + "encryption_algorithm": "AES256", + "default_encryption": true, + "id": "my-fake-bucket-123", + "arn": "arn:aws:s3:::my-fake-bucket-123", + "restrict_public_buckets": false +} +``` + +Our [full schema docs](schema.html) describe all possible fields, but listing out properties this way lets you avoid switching between browser tabs. + + ### Learning more If you want to learn more in depth about Neo4j and Cypher queries you can look at [this tutorial](https://neo4j.com/developer/cypher-query-language/) and see this [reference card](https://neo4j.com/docs/cypher-refcard/current/). 
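These exploration queries can also be run from a script instead of the browser. Here's a minimal sketch using the official `neo4j` Python driver; the URI and credentials are assumptions to adjust for your own deployment:

```python
from neo4j import GraphDatabase

# Assumed local connection details; change the URI and credentials for your deployment.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "neo4j"))

# The same unencrypted-RDS aggregation query shown in the tutorial above.
query = """
MATCH (a:AWSAccount)-[:RESOURCE]->(rds:RDSInstance)
WHERE rds.storage_encrypted = false
RETURN a.name AS AWSAccount, count(rds) AS UnencryptedInstances
"""

with driver.session() as session:
    for record in session.run(query):
        print(record["AWSAccount"], record["UnencryptedInstances"])

driver.close()
```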
@@ -117,3 +177,14 @@ You can add your own custom attributes and relationships without writing Python
 
 ### Mapping AWS Access Permissions
 Cartography can map permissions between IAM Principals and resources in the graph. Here's [how](../modules/aws/permissions-mapping.html).
+
+
+### Permalinking Bookmarklet
+
+You can set up a bookmarklet that lets you quickly get a permalink to a Cartography query. To do so, add a bookmark with the following contents as the URL - make sure to replace `neo4j.contoso.net:7474` with your instance of Neo4j:
+
+```javascript
+javascript:(() => { const query = document.querySelectorAll('article label span')[0].innerText; if (query === ':server connect') { console.log('no query has been run!'); return; } const searchParams = new URLSearchParams(); searchParams.append('connectURL', 'bolt://neo4j:neo4j@neo4j.contoso.net:7687'); searchParams.append('cmd', 'edit'); searchParams.append('arg', query.replaceAll(/\r /g, '\r')); newURL = `http://neo4j.contoso.net:7474/browser/?${searchParams}`; window.open(newURL, '_blank', 'noopener'); })()
+```
+
+Then, any time you are in the web interface, you can click the bookmarklet to open a new tab with a permalink to your most recently executed query in the URL bar.
diff --git a/docs/schema/README.md b/docs/schema/README.md
index 8b3edbca7f..60e6c96542 100644
--- a/docs/schema/README.md
+++ b/docs/schema/README.md
@@ -59,5 +59,8 @@
 ## SyncMetadata
 - Click [here](syncmetadata.md)
 
+## Semgrep
+- Click [here](https://lyft.github.io/cartography/modules/semgrep/schema.html)
+
 ## More to come!
 👍
diff --git a/docs/schema/syncmetadata.md b/docs/schema/syncmetadata.md
index baad4be28e..7572a7ff29 100644
--- a/docs/schema/syncmetadata.md
+++ b/docs/schema/syncmetadata.md
@@ -1,17 +1 @@
-## SyncMetadata
-
-SyncMetadata nodes are created by sync jobs to convey information about the job itself. See this doc for how this is
-used.
-
-## SyncMetadata:ModuleSyncMetadata
-
-This is a node to represent some metadata about the sync job of a particular module or sub-module. Its existence should suggest that a paritcular sync job did happen.
-The 'types' used here should be actual node labels. For example, if we did sync a particular AWSAccount's S3Buckets,
-the `grouptype` is 'AWSAccount', the `groupid` is the particular account's `id`, and the `syncedtype` is 'S3Bucket'.
- -| Field | Description | Source| -|-------|-------------|------| -|**id**|`{group_type}_{group_id}_{synced_type}`|util.py| -|grouptype| The parent module's type |util.py| -|groupid|The parent module's id|util.py| -|syncedtype|The sub-module's type|util.py| +This document has been moved [here](https://lyft.github.io/cartography/modules/_cartography-metadata/schema.html) diff --git a/setup.py b/setup.py index 62846a3c65..28e4125117 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import find_packages from setuptools import setup -__version__ = '0.79.2' +__version__ = '0.83.0' setup( diff --git a/tests/data/aws/ec2/volumes.py b/tests/data/aws/ec2/volumes.py index e3a29c03fa..6e08003e73 100644 --- a/tests/data/aws/ec2/volumes.py +++ b/tests/data/aws/ec2/volumes.py @@ -14,7 +14,7 @@ 'Size': 123, 'SnapshotId': 'sn-01', 'State': 'available', - 'VolumeId': 'v-01', + 'VolumeId': 'vol-0df', 'Iops': 123, 'VolumeType': 'standard', 'FastRestored': True, @@ -33,7 +33,7 @@ 'OutpostArn': 'arn1', 'Size': 123, 'State': 'available', - 'VolumeId': 'v-02', + 'VolumeId': 'vol-03', 'Iops': 123, 'SnapshotId': 'sn-02', 'VolumeType': 'standard', diff --git a/tests/data/semgrep/__init__.py b/tests/data/semgrep/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/semgrep/sca.py b/tests/data/semgrep/sca.py new file mode 100644 index 0000000000..bda27f1f22 --- /dev/null +++ b/tests/data/semgrep/sca.py @@ -0,0 +1,117 @@ +DEPLOYMENTS = { + "id": "123456", + "name": "YourOrg", + "slug": "yourorg", +} + +SCA_RESPONSE = { + "vulns": [ + { + "title": "Reachable vuln", + "advisory": { + "ruleId": "ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "title": "Reachable vuln", + "description": "description", + "ecosystem": "go", + "severity": "HIGH", + "references": { + "cveIds": ["CVE-2023-37897"], + "cweIds": ["CWE-617: Reachable Assertion"], + "owaspIds": ["A06:2021 - Vulnerable and Outdated Components"], + "urls": [ + "https://github.com/advisories//GHSA-9436-3gmp-4f53", + "https://nvd.nist.gov/vuln/detail/CVE-2023-37897", + ], + }, + "announcedAt": "2023-07-19T21:15:08Z", + "ruleText": '{\n "id": "ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590",\n "languages": [\n "python",\n "java",\n "ruby"\n ],\n "message": "message ",\n }', # noqa E501 + "reachability": "MANUAL_REVIEW_REACHABLE", + "vulnerableDependencies": [ + {"name": "grav", "versionSpecifier": "< 1.7.42.1"}, + ], + "safeDependencies": [ + {"name": "grav", "versionSpecifier": "1.7.42.2"}, + ], + "reachableIf": "a non-administrator, user account that has Admin panel access and Create/Update page permissions", # noqa E501 + }, + "exposureType": "REACHABLE", + "repositoryId": "123456", + "matchedDependency": {"name": "grav", "versionSpecifier": "1.7.42.0"}, + "dependencyFileLocation": { + "path": "go.mod", + "startLine": "111", + "url": "https://github.com/yourorg/yourrepo/blame/71bbed12f950de8335006d7f91112263d8504f1b/go.mod#L111", + "startCol": "0", + "endLine": "0", + "endCol": "0", + }, + "usages": [ + { + "findingId": "20128504", + "location": { + "path": "src/packages/directory/file1.go", + "startLine": "24", + "startCol": "57", + "endLine": "24", + "endCol": "78", + "url": "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file1.go.ts#L24", # noqa E501 + "committedAt": "1970-01-01T00:00:00Z", + }, + }, + { + "findingId": "20128505", + "location": { + "path": "src/packages/directory/file2.go", + "startLine": "24", + "startCol": "37", + "endLine": "24", + "endCol": 
"54", + "url": "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file2.go.ts#L24", # noqa E501 + "committedAt": "1970-01-01T00:00:00Z", + }, + }, + ], + "triage": { + "status": "NEW", + "dismissReason": "UNKNOWN_REASON", + "issueUrl": "", + "prUrl": "", + }, + "groupKey": "132465::::ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590::reachable", + "closestSafeDependency": {"name": "grav", "versionSpecifier": "1.7.42.2"}, + "repositoryName": "yourorg/yourrepo", + "openedAt": "2023-07-19T12:51:53Z", + "firstTriagedAt": "1970-01-01T00:00:00Z", + "transitivity": "DIRECT", + "subdirectory": "", + "packageManager": "no_package_manager", + }, + ], + "hasMore": True, + "cursor": "123456", +} + +RAW_VULNS = SCA_RESPONSE["vulns"] +VULN_ID = "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590" +USAGES = [ + { + "SCA_ID": VULN_ID, + "findingId": "20128504", + "path": "src/packages/directory/file1.go", + "startLine": "24", + "startCol": "57", + "endLine": "24", + "endCol": "78", + "url": "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file1.go.ts#L24", # noqa E501 + }, + { + "SCA_ID": VULN_ID, + "findingId": "20128505", + "path": "src/packages/directory/file2.go", + "startLine": "24", + "startCol": "37", + "endLine": "24", + "endCol": "54", + "url": "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file2.go.ts#L24", # noqa E501 + }, +] diff --git a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py index 705d2db3ec..b852581303 100644 --- a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py +++ b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py @@ -3,94 +3,116 @@ import cartography.intel.aws.ec2.instances import cartography.intel.aws.ec2.volumes -import tests.data.aws.ec2.instances -import tests.data.aws.ec2.volumes from cartography.intel.aws.ec2.instances import sync_ec2_instances +from cartography.intel.aws.ec2.volumes import sync_ebs_volumes from tests.data.aws.ec2.instances import DESCRIBE_INSTANCES - +from tests.data.aws.ec2.volumes import DESCRIBE_VOLUMES +from tests.integration.cartography.intel.aws.common import create_test_account +from tests.integration.util import check_nodes +from tests.integration.util import check_rels TEST_ACCOUNT_ID = '000000000000' TEST_REGION = 'eu-west-1' TEST_UPDATE_TAG = 123456789 -def test_load_volumes(neo4j_session): +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes(mock_get_vols, neo4j_session): # Arrange - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) # Act - cartography.intel.aws.ec2.volumes.load_volumes( + sync_ebs_volumes( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) # Assert - expected_nodes = { - "v-01", "v-02", + assert check_nodes(neo4j_session, 'EBSVolume', ['arn']) == { + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-03',), + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-0df',), } - nodes = neo4j_session.run( - """ - MATCH (r:EBSVolume) RETURN r.id; - """, - ) 
- actual_nodes = {n['r.id'] for n in nodes} - - assert actual_nodes == expected_nodes - - -def test_load_volume_to_account_rels(neo4j_session): + # Assert + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + (TEST_ACCOUNT_ID, 'vol-03'), + (TEST_ACCOUNT_ID, 'vol-0df'), + } - # Arrange: Create Test AWSAccount - neo4j_session.run( - """ - MERGE (aws:AWSAccount{id: $aws_account_id}) - ON CREATE SET aws.firstseen = timestamp() - SET aws.lastupdated = $aws_update_tag - """, - aws_account_id=TEST_ACCOUNT_ID, - aws_update_tag=TEST_UPDATE_TAG, - ) - # Act: Load Test Volumes - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) +@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes_e2e(mock_get_vols, mock_get_instances, neo4j_session): + # Arrange + neo4j_session.run('MATCH (n) DETACH DELETE n;') + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) - cartography.intel.aws.ec2.volumes.load_volumes( + # Act: sync_ec2_instances() loads attached ebs volumes + sync_ec2_instances( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Assert - expected = { - (TEST_ACCOUNT_ID, 'v-01'), - (TEST_ACCOUNT_ID, 'v-02'), + # Assert that deleteontermination is set by sync_ec2_instances. The encrypted property isn't returned by this API. 
+ assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + ('vol-03', True, None), + ('vol-04', True, None), + ('vol-09', True, None), + ('vol-0df', True, None), } - result = neo4j_session.run( - """ - MATCH (n1:AWSAccount)-[:RESOURCE]->(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - actual = { - (r['n1.id'], r['n2.id']) for r in result + # Assert that they are attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - assert actual == expected - + # Assert that we created the account to volume rels correctly + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + ('000000000000', 'vol-03'), + ('000000000000', 'vol-04'), + ('000000000000', 'vol-09'), + ('000000000000', 'vol-0df'), + } -@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) -def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): - # Arrange: Load in ec2 instances first - boto3_session = MagicMock() - sync_ec2_instances( + # Act + sync_ebs_volumes( neo4j_session, boto3_session, [TEST_REGION], @@ -98,28 +120,46 @@ def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): TEST_UPDATE_TAG, {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Prep the volume data - raw_volumes = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_volumes = cartography.intel.aws.ec2.volumes.transform_volumes(raw_volumes, TEST_REGION, TEST_ACCOUNT_ID) - # Act - cartography.intel.aws.ec2.volumes.load_volume_relationships( - neo4j_session, - transformed_volumes, - TEST_UPDATE_TAG, - ) + # Assert that additional fields such as `encrypted` have been added by sync_ebs_volumes(), while + # deleteontermination has not been overwritten with None by sync_ebs_volumes() + assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + # Attached to the instances initially + ('vol-04', True, None), + ('vol-09', True, None), + # Added by ebs sync + ('vol-03', True, True), + ('vol-0df', True, True), + } - # Assert - result = neo4j_session.run( - """ - MATCH (n1:EC2Instance)<-[:ATTACHED_TO_EC2_INSTANCE]-(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - expected = { - ('i-01', 'v-01'), - ('i-02', 'v-02'), + # Assert that they are still attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - actual = { - (r['n1.id'], r['n2.id']) for r in result + + # Assert that the account to volume rels still exist + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + ('000000000000', 'vol-03'), + ('000000000000', 'vol-04'), + ('000000000000', 'vol-09'), + ('000000000000', 'vol-0df'), } - assert actual == expected diff --git a/tests/integration/cartography/intel/aws/test_ssm.py b/tests/integration/cartography/intel/aws/test_ssm.py index a27e1a3f4f..8b31471e7c 100644 --- a/tests/integration/cartography/intel/aws/test_ssm.py +++ b/tests/integration/cartography/intel/aws/test_ssm.py @@ 
-28,13 +28,16 @@ def _ensure_load_instances(neo4j_session):
 
 @patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations'])
 def test_load_instance_information(mock_get_instances, neo4j_session):
+    # Arrange
     # load account and instances, to be able to test relationships
     create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG)
     _ensure_load_instances(neo4j_session)
 
+    # Act
+    data_list = cartography.intel.aws.ssm.transform_instance_information(tests.data.aws.ssm.INSTANCE_INFORMATION)
     cartography.intel.aws.ssm.load_instance_information(
         neo4j_session,
-        tests.data.aws.ssm.INSTANCE_INFORMATION,
+        data_list,
         TEST_REGION,
         TEST_ACCOUNT_ID,
         TEST_UPDATE_TAG,
@@ -84,15 +87,17 @@ def test_load_instance_information(mock_get_instances, neo4j_session):
     assert actual_nodes == {"i-02"}
 
 
-def test_load_instance_patches(neo4j_session):
+@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations'])
+def test_load_instance_patches(mock_get_instances, neo4j_session):
     # Arrange: load account and instances, to be able to test relationships
     create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG)
     _ensure_load_instances(neo4j_session)
 
     # Act
+    data_list = cartography.intel.aws.ssm.transform_instance_patches(tests.data.aws.ssm.INSTANCE_PATCHES)
     cartography.intel.aws.ssm.load_instance_patches(
         neo4j_session,
-        tests.data.aws.ssm.INSTANCE_PATCHES,
+        data_list,
         TEST_REGION,
         TEST_ACCOUNT_ID,
         TEST_UPDATE_TAG,
diff --git a/tests/integration/cartography/intel/semgrep/__init__.py b/tests/integration/cartography/intel/semgrep/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/integration/cartography/intel/semgrep/test_findings.py b/tests/integration/cartography/intel/semgrep/test_findings.py
new file mode 100644
index 0000000000..2a00783949
--- /dev/null
+++ b/tests/integration/cartography/intel/semgrep/test_findings.py
@@ -0,0 +1,240 @@
+from string import Template
+from typing import List
+from unittest.mock import patch
+
+import neo4j
+
+import cartography.intel.semgrep.findings
+import tests.data.semgrep.sca
+from cartography.intel.semgrep.findings import sync
+from tests.integration.util import check_nodes
+from tests.integration.util import check_rels
+
+TEST_REPO_ID = "https://github.com/yourorg/yourrepo"
+TEST_REPO_FULL_NAME = "yourorg/yourrepo"
+TEST_REPO_NAME = "yourrepo"
+TEST_UPDATE_TAG = 123456789
+
+
+def _check_nodes_as_list(neo4j_session: neo4j.Session, node_label: str, attrs: List[str]):
+    """
+    Like `tests.integration.util.check_nodes()` but returns a list instead of a set.
+ """ + if not attrs: + raise ValueError("`attrs` passed to check_nodes() must have at least one element.") + + attrs = ", ".join(f"n.{attr}" for attr in attrs) + query_template = Template("MATCH (n:$NodeLabel) RETURN $Attrs") + result = neo4j_session.run( + query_template.safe_substitute(NodeLabel=node_label, Attrs=attrs), + ) + return sum([row.values() for row in result], []) + + +def _create_github_repos(neo4j_session): + # Creates a set of GitHub repositories in the graph + neo4j_session.run( + """ + MERGE (repo:GitHubRepository{id: $repo_id, fullname: $repo_fullname, name: $repo_name}) + ON CREATE SET repo.firstseen = timestamp() + SET repo.lastupdated = $update_tag + """, + repo_id=TEST_REPO_ID, + repo_fullname=TEST_REPO_FULL_NAME, + update_tag=TEST_UPDATE_TAG, + repo_name=TEST_REPO_NAME, + ) + + +@patch.object( + cartography.intel.semgrep.findings, + "get_deployment", + return_value=tests.data.semgrep.sca.DEPLOYMENTS, +) +@patch.object( + cartography.intel.semgrep.findings, + "get_sca_vulns", + return_value=tests.data.semgrep.sca.RAW_VULNS, +) +def test_sync(mock_get_sca_vulns, mock_get_deployment, neo4j_session): + # Arrange + _create_github_repos(neo4j_session) + semgrep_app_token = "your_semgrep_app_token" + common_job_parameters = { + "UPDATE_TAG": TEST_UPDATE_TAG, + } + + # Act + sync(neo4j_session, semgrep_app_token, TEST_UPDATE_TAG, common_job_parameters) + + # Assert + expected_deployment_nodes = {("123456", "YourOrg", "yourorg")} + + assert ( + check_nodes( + neo4j_session, + "SemgrepDeployment", + ["id", "name", "slug"], + ) == + expected_deployment_nodes + ) + expected_sca_vuln_nodes = [ + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + TEST_UPDATE_TAG, + "yourorg/yourrepo", + "ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "Reachable vuln", + "description", + "go", + "HIGH", + "CVE-2023-37897", + "MANUAL_REVIEW_REACHABLE", + "REACHABLE", + "DIRECT", + "grav|1.7.42.0", + "grav|1.7.42.2", + "go.mod", + "https://github.com/yourorg/yourrepo/blame/71bbed12f950de8335006d7f91112263d8504f1b/go.mod#L111", + ["https://github.com/advisories//GHSA-9436-3gmp-4f53", "https://nvd.nist.gov/vuln/detail/CVE-2023-37897"], + "2023-07-19T12:51:53Z", + ] + assert ( + _check_nodes_as_list( + neo4j_session, + "SemgrepSCAFinding", + [ + "id", + "lastupdated", + "repository", + "rule_id", + "summary", + "description", + "package_manager", + "severity", + "cve_id", + "reachability_check", + "reachability", + "transitivity", + "dependency", + "dependency_fix", + "dependency_file", + "dependency_file_url", + "ref_urls", + "scan_time", + ], + ) == + expected_sca_vuln_nodes + ) + expected_sca_location_nodes = { + ( + "20128504", + "src/packages/directory/file1.go", + "24", + "57", + "24", + "78", + "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file1.go.ts#L24", # noqa E501 + ), + ( + "20128505", + "src/packages/directory/file2.go", + "24", + "37", + "24", + "54", + "https://github.com/yourorg/yourrepo/blame/6fdee8f2727f4506cfbbe553e23b895e27956588/src/packages/directory/file2.go.ts#L24", # noqa E501 + ), + } + assert ( + check_nodes( + neo4j_session, + "SemgrepSCALocation", + [ + "id", + "path", + "start_line", + "start_col", + "end_line", + "end_col", + "url", + ], + ) == + expected_sca_location_nodes + ) + expected_findings_resource_relationships = { + ( + "123456", + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + ), + } + assert ( + check_rels( + neo4j_session, + "SemgrepDeployment", + "id", + 
"SemgrepSCAFinding", + "id", + "RESOURCE", + ) == + expected_findings_resource_relationships + ) + expected_locations_resource_relationships = { + ( + "123456", + "20128504", + ), + ( + "123456", + "20128505", + ), + } + assert ( + check_rels( + neo4j_session, + "SemgrepDeployment", + "id", + "SemgrepSCALocation", + "id", + "RESOURCE", + ) == + expected_locations_resource_relationships + ) + expected_found_in_relationships = { + ( + "yourorg/yourrepo", + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + ), + } + assert ( + check_rels( + neo4j_session, + "GitHubRepository", + "fullname", + "SemgrepSCAFinding", + "id", + "FOUND_IN", + rel_direction_right=False, + ) == + expected_found_in_relationships + ) + expected_location_relationships = { + ( + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "20128504", + ), + ( + "yourorg/yourrepo|ssc-92af1d99-4fb3-4d4e-a9f4-d57572cd6590", + "20128505", + ), + } + assert ( + check_rels( + neo4j_session, + "SemgrepSCAFinding", + "id", + "SemgrepSCALocation", + "id", + "USAGE_AT", + ) == + expected_location_relationships + ) diff --git a/tests/unit/cartography/intel/aws/test_permission_relationships.py b/tests/unit/cartography/intel/aws/test_permission_relationships.py index b19604256a..bc85bdd0e7 100644 --- a/tests/unit/cartography/intel/aws/test_permission_relationships.py +++ b/tests/unit/cartography/intel/aws/test_permission_relationships.py @@ -36,7 +36,7 @@ def test_not_action_statement(): "action": [ "*", ], - "notaction":[ + "notaction": [ "S3:GetObject", ], "resource": [ @@ -209,7 +209,7 @@ def test_non_matching_notresource(): "action": [ "s3:Get*", ], - "resource":["*"], + "resource": ["*"], "notresource": [ "arn:aws:s3:::nottest", ], @@ -417,7 +417,7 @@ def test_single_comma(): "action": [ "s3:?et*", ], - "resource":["arn:aws:s3:::testbucke?"], + "resource": ["arn:aws:s3:::testbucke?"], "effect": "Allow", }, ] @@ -432,7 +432,7 @@ def test_multiple_comma(): "action": [ "s3:?et*", ], - "resource":["arn:aws:s3:::????bucket"], + "resource": ["arn:aws:s3:::????bucket"], "effect": "Allow", }, ] diff --git a/tests/unit/cartography/intel/gsuite/test_api.py b/tests/unit/cartography/intel/gsuite/test_api.py index 04950555a1..3e4a3b7546 100644 --- a/tests/unit/cartography/intel/gsuite/test_api.py +++ b/tests/unit/cartography/intel/gsuite/test_api.py @@ -107,9 +107,9 @@ def test_load_gsuite_groups(): UNWIND $GroupData as group MERGE (g:GSuiteGroup{id: group.id}) ON CREATE SET - g.firstseen = $UpdateTag - ON MATCH SET - g.group_id = group.id, + g.firstseen = $UpdateTag, + g.group_id = group.id + SET g.admin_created = group.adminCreated, g.description = group.description, g.direct_members_count = group.directMembersCount, @@ -135,9 +135,9 @@ def test_load_gsuite_users(): UNWIND $UserData as user MERGE (u:GSuiteUser{id: user.id}) ON CREATE SET - u.firstseen = $UpdateTag - ON MATCH SET u.user_id = user.id, + u.firstseen = $UpdateTag + SET u.agreed_to_terms = user.agreedToTerms, u.archived = user.archived, u.change_password_at_next_login = user.changePasswordAtNextLogin, diff --git a/tests/unit/driftdetect/test_detector.py b/tests/unit/driftdetect/test_detector.py index 71c223d145..9e828904d3 100644 --- a/tests/unit/driftdetect/test_detector.py +++ b/tests/unit/driftdetect/test_detector.py @@ -1,5 +1,6 @@ from unittest.mock import MagicMock +from cartography.client.core.tx import read_list_of_dicts_tx from cartography.driftdetect.detect_deviations import compare_states from cartography.driftdetect.get_states import get_state from 
cartography.driftdetect.model import State
@@ -26,13 +27,13 @@ def test_state_no_drift():
 
     mock_result.__getitem__.side_effect = results.__getitem__
     mock_result.__iter__.side_effect = results.__iter__
-    mock_session.run.return_value = mock_result
+    mock_session.read_transaction.return_value = mock_result
     data = FileSystem.load("tests/data/detectors/test_expectations.json")
     state_old = StateSchema().load(data)
     state_new = State(state_old.name, state_old.validation_query, state_old.properties, [])
     get_state(mock_session, state_new)
     drifts = compare_states(state_old, state_new)
-    mock_session.run.assert_called_with(state_new.validation_query)
+    mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query)
     assert not drifts
 
 
@@ -54,20 +55,60 @@ def test_state_picks_up_drift():
         {key: "7"},
     ]
 
+    # Arrange
     mock_result.__getitem__.side_effect = results.__getitem__
     mock_result.__iter__.side_effect = results.__iter__
-    mock_session.run.return_value = mock_result
+    mock_session.read_transaction.return_value = mock_result
     data = FileSystem.load("tests/data/detectors/test_expectations.json")
     state_old = StateSchema().load(data)
     state_new = State(state_old.name, state_old.validation_query, state_old.properties, [])
     get_state(mock_session, state_new)
     state_new.properties = state_old.properties
+
+    # Act
     drifts = compare_states(state_old, state_new)
-    mock_session.run.assert_called_with(state_new.validation_query)
+
+    # Assert
+    mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query)
     assert drifts
     assert ["7"] in drifts
 
+
+def test_state_order_does_not_matter():
+    """
+    Test that drift detection does not depend on the order of query results.
+    :return:
+    """
+    key = "d.test"
+    mock_session = MagicMock()
+    mock_result = MagicMock()
+    results = [
+        {key: "1"},
+        {key: "2"},
+        {key: "6"},  # This one is out of order
+        {key: "3"},
+        {key: "4"},
+        {key: "5"},
+    ]
+
+    # Arrange
+    mock_result.__getitem__.side_effect = results.__getitem__
+    mock_result.__iter__.side_effect = results.__iter__
+    mock_session.read_transaction.return_value = mock_result
+    data = FileSystem.load("tests/data/detectors/test_expectations.json")
+    state_old = StateSchema().load(data)
+    state_new = State(state_old.name, state_old.validation_query, state_old.properties, [])
+    get_state(mock_session, state_new)
+    state_new.properties = state_old.properties
+
+    # Act
+    drifts = compare_states(state_old, state_new)
+
+    # Assert
+    mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query)
+    assert not drifts
+
+
 def test_state_multiple_expectations():
     """
     Test that multiple fields runs properly.
@@ -89,14 +130,14 @@ def test_state_multiple_expectations(): mock_result.__getitem__.side_effect = results.__getitem__ mock_result.__iter__.side_effect = results.__iter__ - mock_session.run.return_value = mock_result + mock_session.read_transaction.return_value = mock_result data = FileSystem.load("tests/data/detectors/test_multiple_expectations.json") state_old = StateSchema().load(data) state_new = State(state_old.name, state_old.validation_query, state_old.properties, []) get_state(mock_session, state_new) state_new.properties = state_old.properties drifts = compare_states(state_old, state_new) - mock_session.run.assert_called_with(state_new.validation_query) + mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query) assert ["7", "14"] in drifts @@ -121,14 +162,14 @@ def test_drift_from_multiple_properties(): ] mock_result.__getitem__.side_effect = results.__getitem__ mock_result.__iter__.side_effect = results.__iter__ - mock_session.run.return_value = mock_result + mock_session.read_transaction.return_value = mock_result data = FileSystem.load("tests/data/detectors/test_multiple_properties.json") state_old = StateSchema().load(data) state_new = State(state_old.name, state_old.validation_query, state_old.properties, []) get_state(mock_session, state_new) state_new.properties = state_old.properties drifts = compare_states(state_old, state_new) - mock_session.run.assert_called_with(state_new.validation_query) + mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, state_new.validation_query) assert ["7", "14", ["21", "28", "35"]] in drifts assert ["3", "10", ["17", "24", "31"]] not in drifts
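For context on the `read_transaction` changes asserted throughout these tests: drift detection now fetches query results as a list of dicts via `cartography.client.core.tx.read_list_of_dicts_tx` instead of calling `session.run()` directly. Below is a minimal sketch of that call pattern, mirroring how the test suite mocks it; the `get_validation_results` helper name is hypothetical, not part of the codebase:

```python
from unittest.mock import MagicMock

from cartography.client.core.tx import read_list_of_dicts_tx


def get_validation_results(session, validation_query: str):
    # Hypothetical helper illustrating the pattern the tests assert:
    # results come back as a list of dicts via a read transaction.
    return session.read_transaction(read_list_of_dicts_tx, validation_query)


# Mocked the same way the test suite does it:
mock_session = MagicMock()
mock_session.read_transaction.return_value = [{"d.test": "1"}, {"d.test": "2"}]

results = get_validation_results(mock_session, "MATCH (d:Test) RETURN d.test")
mock_session.read_transaction.assert_called_with(read_list_of_dicts_tx, "MATCH (d:Test) RETURN d.test")
assert results == [{"d.test": "1"}, {"d.test": "2"}]
```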