feat: Enable lightweight scanning option (#136)
* feat: Enable lightweight scanning option

We use a predefined schema to select the most interesting fields
for printing. Users can enable it with the -ls flag.

Additionally:
* Requests now time out after 120 seconds
* Impersonation is now disabled by default
* Unwrapped several unnecessary lists in responses
* Fixed extra null output in GCS scanning results
* [tests] Relaxed and updated unit tests
* [tests] Print the error file on failure

Related to #135
mshudrak authored Apr 7, 2023
1 parent ac11c81 commit a229393
Showing 11 changed files with 2,489 additions and 2,465 deletions.
2 changes: 1 addition & 1 deletion example_config
@@ -81,7 +81,7 @@
   "service_accounts": {
     "fetch": true,
     "comment": "Fetch list of available service accounts",
-    "impersonate": true
+    "impersonate": false
   },
   "dns_policies": {
     "fetch": true
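For context, a minimal sketch (not part of this commit) of how the scanner consumes this setting, mirroring the crawl_loop logic further down in this diff:

    import json

    # Read the impersonation switch the same way crawl_loop does;
    # 'example_config' is the file shown above.
    with open('example_config', encoding='utf-8') as f:
      scan_config = json.load(f)

    impers = scan_config.get('service_accounts', None)
    if impers is not None and impers.get('impersonate', False) is True:
      print('will attempt service-account impersonation')
    else:
      print('impersonation disabled')  # the new default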
8 changes: 7 additions & 1 deletion src/gcp_scanner/arguments.py
@@ -43,7 +43,13 @@ def arg_parser():
       dest='output',
       default='scan_db',
       help='Path to output directory')
-
+  parser.add_argument(
+      '-ls',
+      '--light-scan',
+      default=False,
+      dest='light_scan',
+      action='store_true',
+      help='Return only the most important GCP resource fields in the output.')
   parser.add_argument(
       '-k',
       '--sa-key-path',
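A short, self-contained sketch of the new flag's behavior (argparse only; the real arg_parser defines many more options, and the program name here is hypothetical):

    import argparse

    parser = argparse.ArgumentParser(prog='gcp-scanner')
    parser.add_argument('-ls', '--light-scan', default=False, dest='light_scan',
                        action='store_true',
                        help='Return only the most important GCP resource fields.')

    print(parser.parse_args([]).light_scan)       # False
    print(parser.parse_args(['-ls']).light_scan)  # True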
28 changes: 12 additions & 16 deletions src/gcp_scanner/crawl.py
@@ -368,7 +368,7 @@ def get_bucket_names(project_name: str, credentials: Credentials,
       break

     for bucket in response.get("items", []):
-      buckets_dict[bucket["name"]] = (bucket, None)
+      buckets_dict[bucket["name"]] = bucket
       if dump_fd is not None:
         ret_fields = "nextPageToken,items(name,size,contentType,timeCreated)"

@@ -469,7 +469,8 @@ def get_gke_images(project_name: str, access_token: str) -> Dict[str, Any]:
     gcr_url = f"https://{region}gcr.io/v2/{project_name}/tags/list"
     try:
       res = requests.get(
-          gcr_url, auth=HTTPBasicAuth("oauth2accesstoken", access_token))
+          gcr_url, auth=HTTPBasicAuth("oauth2accesstoken", access_token),
+          timeout=120)
       if not res.ok:
         logging.info("Failed to retrieve gcr images list. Status code: %d",
                      res.status_code)
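The timeout matters because requests.get has no default timeout and can block indefinitely on an unresponsive registry. A hedged sketch of the pattern, with a hypothetical helper name:

    import requests
    from requests.auth import HTTPBasicAuth

    def fetch_gcr_tags(gcr_url: str, access_token: str):
      # Without timeout=, a stalled connection hangs the whole scan;
      # 120 seconds bounds both the connect and the read phase.
      try:
        res = requests.get(gcr_url,
                           auth=HTTPBasicAuth("oauth2accesstoken", access_token),
                           timeout=120)
      except requests.exceptions.Timeout:
        return None  # treat a slow registry as "no images"
      return res.json() if res.ok else None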
@@ -897,7 +898,7 @@ def get_iam_policy(project_name: str,
   return None


-def get_associated_service_accounts(
+def get_sas_for_impersonation(
     iam_policy: List[Dict[str, Any]]) -> List[str]:
   """Extract a list of unique SAs from IAM policy associated with project.
@@ -913,16 +914,11 @@

   list_of_sas = list()
   for entry in iam_policy:
-    for member in entry["members"]:
-      if "deleted:" in member:
-        continue
-      account_name = None
-      for element in member.split(":"):
-        if "@" in element:
-          account_name = element
-          break
-      if account_name and account_name not in list_of_sas:
-        list_of_sas.append(account_name)
+    for sa_name in entry.get("members", []):
+      if sa_name.startswith("serviceAccount") and "@" in sa_name:
+        account_name = sa_name.split(":")[1]
+        if account_name not in list_of_sas:
+          list_of_sas.append(account_name)

   return list_of_sas
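A worked example of the rewritten helper on a hypothetical IAM binding, assuming the function above is importable. Deleted members and non-service-account principals are skipped, and the email is taken from the part after the "serviceAccount:" prefix:

    policy = [{
        "role": "roles/editor",
        "members": [
            "serviceAccount:app-sa@test-project.iam.gserviceaccount.com",
            "user:alice@example.com",
            "deleted:serviceAccount:old-sa@test-project.iam.gserviceaccount.com",
        ],
    }]

    print(get_sas_for_impersonation(policy))
    # ['app-sa@test-project.iam.gserviceaccount.com']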

@@ -983,7 +979,7 @@ def list_services(project_id: str, credentials: Credentials) -> List[Any]:
   try:
     while request is not None:
       response = request.execute()
-      list_of_services.append(response.get("services", None))
+      list_of_services.extend(response.get("services", []))

       request = serviceusage.services().list_next(
           previous_request=request, previous_response=response)
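The append-to-extend change is what removes the extra nesting (and the null entries) from the output: append stores each response page as one nested element, while extend flattens the page's items into the accumulator. A minimal illustration with made-up pages:

    page1 = {"services": [{"name": "svc-a"}]}
    page2 = {"services": [{"name": "svc-b"}]}
    empty = {}  # a page with no "services" key

    nested, flat = [], []
    for page in (page1, page2, empty):
      nested.append(page.get("services", None))  # [[...], [...], None] -- old behavior
      flat.extend(page.get("services", []))      # [{...}, {...}]       -- new behavior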
@@ -1016,7 +1012,7 @@ def list_sourcerepo(project_id: str, credentials: Credentials) -> List[Any]:
   try:
     while request is not None:
       response = request.execute()
-      list_of_repos.append(response.get("repos", None))
+      list_of_repos.extend(response.get("repos", []))

       request = service.projects().repos().list_next(
           previous_request=request,
@@ -1049,7 +1045,7 @@ def list_dns_policies(project_id: str, credentials: Credentials) -> List[Any]:
   try:
     while request is not None:
       response = request.execute()
-      list_of_policies.append(response.get("policies", None))
+      list_of_policies.extend(response.get("policies", []))

       request = service.policies().list_next(
           previous_request=request,
6 changes: 3 additions & 3 deletions src/gcp_scanner/credsdb.py
@@ -95,21 +95,21 @@ def get_creds_from_metadata() -> Tuple[Optional[str], Optional[Credentials]]:
       service-accounts/default/email"
   headers = {"Metadata-Flavor": "Google"}
   try:
-    res = requests.get(token_url, headers=headers)
+    res = requests.get(token_url, headers=headers, timeout=120)
     if not res.ok:
       logging.error("Failed to retrieve instance token. Status code %d",
                     res.status_code)
       return None, None
     token = res.json()["access_token"]

-    res = requests.get(scope_url, headers=headers)
+    res = requests.get(scope_url, headers=headers, timeout=120)
     if not res.ok:
       logging.error("Failed to retrieve instance scopes. Status code %d",
                     res.status_code)
       return None, None
     instance_scopes = res.content.decode("utf-8")

-    res = requests.get(email_url, headers=headers)
+    res = requests.get(email_url, headers=headers, timeout=120)
     if not res.ok:
       logging.error("Failed to retrieve instance email. Status code %d",
                     res.status_code)
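The three metadata calls follow the standard GCE metadata-server pattern; a condensed sketch with a hypothetical helper name (the base URL and header are the documented GCE conventions):

    import requests

    METADATA_BASE = "http://metadata.google.internal/computeMetadata/v1/instance/"
    HEADERS = {"Metadata-Flavor": "Google"}

    def metadata_get(path: str):
      # The 120-second cap keeps the scanner from hanging forever when it
      # runs outside GCE and the metadata host is unreachable.
      try:
        res = requests.get(METADATA_BASE + path, headers=HEADERS, timeout=120)
      except requests.exceptions.RequestException:
        return None
      return res.text if res.ok else None

    email = metadata_get("service-accounts/default/email")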
79 changes: 60 additions & 19 deletions src/gcp_scanner/scanner.py
@@ -35,15 +35,65 @@
 from httplib2 import Credentials
 from .models import SpiderContext

+# We define the schema statically to make it easier for the user and avoid
+# extra config files.
+light_version_scan_schema = {
+    'compute_instances': ['name', 'zone', 'machineType', 'networkInterfaces',
+                          'status'],
+    'compute_images': ['name', 'status', 'diskSizeGb', 'sourceDisk'],
+    'machine_images': ['name', 'description', 'status', 'sourceInstance',
+                       'totalStorageBytes', 'savedDisks'],
+    'compute_disks': ['name', 'sizeGb', 'zone', 'status', 'sourceImage', 'users'],
+    'compute_snapshots': ['name', 'status', 'sourceDisk', 'downloadBytes'],
+    'managed_zones': ['name', 'dnsName', 'description', 'nameServers'],
+    'sql_instances': ['name', 'region', 'ipAddresses', 'databaseVersion',
+                      'state'],
+    'cloud_functions': ['name', 'eventTrigger', 'status', 'entryPoint',
+                        'serviceAccountEmail'],
+    'kms': ['name', 'primary', 'purpose', 'createTime'],
+    'services': ['name'],
+}

 def is_set(config: Optional[dict], config_setting: str) -> Union[dict,bool]:
   if config is None:
     return True
   obj = config.get(config_setting, {})
   return obj.get('fetch', False)

+def save_results(res_data: Dict, res_path: str, is_light: bool):
+  """Save scan results to disk in JSON format.
+
+  Args:
+    res_data: scan results as a dictionary of entries
+    res_path: full path of the output file
+    is_light: save only the most important fields per resource
+  """
+
+  if is_light is True:
+    # Keep only the light version of the scan based on the predefined schema.
+    for gcp_resource, schema in light_version_scan_schema.items():
+      projects = res_data.get('projects', {})
+      for project_name, project_data in projects.items():
+        scan_results = project_data.get(gcp_resource, {})
+        light_results = list()
+        for scan_result in scan_results:
+          light_results.append({key: scan_result.get(key) for key in schema})
+
+        project_data.update({gcp_resource: light_results})
+        projects.update({project_name: project_data})
+      res_data.update({'projects': projects})
+
+  # Write out results to JSON DB
+  sa_results_data = json.dumps(res_data, indent=2, sort_keys=False)
+
+  with open(res_path, 'a', encoding='utf-8') as outfile:
+    outfile.write(sa_results_data)


 def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
                out_dir: str,
                scan_config: Dict,
+               light_scan: bool,
                target_project: Optional[str] = None,
                force_projects: Optional[str] = None):
   """The main loop function to crawl GCP resources.
@@ -108,7 +158,7 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
     output_path = Path(out_dir, output_file_name)

     try:
-      with open(output_path, 'x', encoding='utf-8') as outfile:
+      with open(output_path, 'x', encoding='utf-8'):
         pass

     except FileExistsError:
@@ -117,7 +167,6 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],

     if is_set(scan_config, 'iam_policy'):
       # Get IAM policy
-      iam_client = iam_client_for_credentials(credentials)
       iam_policy = crawl.get_iam_policy(project_id, credentials)
       project_result['iam_policy'] = iam_policy

@@ -256,23 +305,21 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
           credentials
       )

-    # trying to impersonate SAs within project
     if scan_config is not None:
       impers = scan_config.get('service_accounts', None)
     else:
-      impers = {'impersonate': True}
+      impers = {'impersonate': False}  # do not impersonate by default

+    # trying to impersonate SAs within project
     if impers is not None and impers.get('impersonate', False) is True:
+      iam_client = iam_client_for_credentials(credentials)
       if is_set(scan_config, 'iam_policy') is False:
         iam_policy = crawl.get_iam_policy(project_id, credentials)

-      project_service_accounts = crawl.get_associated_service_accounts(
-          iam_policy)
-
+      project_service_accounts = crawl.get_sas_for_impersonation(iam_policy)
       for candidate_service_account in project_service_accounts:
-        logging.info('Trying %s', candidate_service_account)
-        if not candidate_service_account.startswith('serviceAccount'):
-          continue
         try:
+          logging.info('Trying %s', candidate_service_account)
           creds_impersonated = credsdb.impersonate_sa(
               iam_client, candidate_service_account)
           context.service_account_queue.put(
@@ -286,14 +333,9 @@ def crawl_loop(initial_sa_tuples: List[Tuple[str, Credentials, List[str]]],
               candidate_service_account)
           logging.error(sys.exc_info()[1])

-    # Write out results to json DB
     logging.info('Saving results for %s into the file', project_id)
-
-    sa_results_data = json.dumps(sa_results, indent=2, sort_keys=False)
-
-    with open(output_path, 'a', encoding='utf-8') as outfile:
-      outfile.write(sa_results_data)
-
+    save_results(sa_results, output_path, light_scan)
     # Clean memory to avoid leak for large amount projects.
     sa_results.clear()
@@ -400,7 +442,6 @@ def main():
   with open(args.config_path, 'r', encoding='utf-8') as f:
     scan_config = json.load(f)

-
-  crawl_loop(sa_tuples, args.output, scan_config, args.target_project,
-             force_projects_list)
+  crawl_loop(sa_tuples, args.output, scan_config, args.light_scan,
+             args.target_project, force_projects_list)
   return 0
2 changes: 1 addition & 1 deletion src/gcp_scanner/test_acceptance.py
@@ -46,7 +46,7 @@
 CLOUD_FUNCTIONS = 1
 ENDPOINTS_COUNT = 0
 KMS_COUNT = 1
-SERVICES_COUNT = 1
+SERVICES_COUNT = 37
 SERVICE_ACCOUNTS_COUNT = 3

 def check_obj_entry(res_dict, subojects_count, entry_name, volatile = False):
6 changes: 4 additions & 2 deletions src/gcp_scanner/test_unit.py
@@ -61,22 +61,24 @@ def save_to_test_file(res):
 def compare_volatile(f1, f2):
   res = True
   with open(f1, "r", encoding="utf-8") as file_1:
-    file_1_text = file_1.readlines()
+    file_1_text = file_1.read()

   with open(f2, "r", encoding="utf-8") as file_2:
     file_2_text = file_2.readlines()

   for line in file_2_text:
-    # line = line[:-1]
     if not line.startswith("CHECK"):
       continue  # we compare only important part of output
     line = line.replace("CHECK", "")
+    line = line.strip()
     if line in file_1_text:
       continue
     else:
       print(f"The following line was not identified in the output:\n{line}")
       res = False

+  if res is False:
+    print(file_1_text)
   return res
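A usage sketch of the relaxed comparison, with hypothetical temp files: only CHECK-prefixed lines in the expected file must appear, as substrings, anywhere in the actual output, and the whole output is printed when a check fails:

    with open('/tmp/actual', 'w', encoding='utf-8') as f:
      f.write('{\n"name": "test-policy",\n"enableLogging": false\n}\n')

    with open('/tmp/expected', 'w', encoding='utf-8') as f:
      f.write('CHECK "name": "test-policy",\nthis line is ignored\n')

    print(compare_volatile('/tmp/actual', '/tmp/expected'))  # True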
46 changes: 22 additions & 24 deletions test/dns_policies
@@ -1,29 +1,27 @@
 [
-  [
-    {
-      "id": "1199893578059967130",
-CHECK "name": "test-policy",
-CHECK "enableInboundForwarding": true,
-CHECK "description": "A test policy",
-CHECK "networks": [
-      {
-CHECK "networkUrl": "https://www.googleapis.com/compute/v1/projects/test-gcp-scanner/global/networks/test-vpc",
-CHECK "kind": "dns#policyNetwork"
-      }
-    ],
-CHECK "alternativeNameServerConfig": {
-CHECK "targetNameServers": [
-      {
-CHECK "ipv4Address": "8.8.8.8",
-CHECK "forwardingPath": "private",
-CHECK "ipv6Address": "",
-CHECK "kind": "dns#policyAlternativeNameServerConfigTargetNameServer"
-      }
-    ],
-CHECK "kind": "dns#policyAlternativeNameServerConfig"
-    },
-    "enableLogging": false,
-CHECK "kind": "dns#policy"
-    }
-  ]
+  {
+    "id": "1199893578059967130",
+CHECK "name": "test-policy",
+CHECK "enableInboundForwarding": true,
+CHECK "description": "A test policy",
+CHECK "networks": [
+    {
+CHECK "networkUrl": "https://www.googleapis.com/compute/v1/projects/test-gcp-scanner/global/networks/test-vpc",
+CHECK "kind": "dns#policyNetwork"
+    }
+  ],
+CHECK "alternativeNameServerConfig": {
+CHECK "targetNameServers": [
+    {
+CHECK "ipv4Address": "8.8.8.8",
+CHECK "forwardingPath": "private",
+CHECK "ipv6Address": "",
+CHECK "kind": "dns#policyAlternativeNameServerConfigTargetNameServer"
+    }
+  ],
+CHECK "kind": "dns#policyAlternativeNameServerConfig"
+  },
+  "enableLogging": false,
+CHECK "kind": "dns#policy"
+  }
 ]