diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2bba24e..78a8035 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -40,7 +40,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 345596b..9b77968 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -7,12 +7,12 @@ jobs: OS: ubuntu-latest PYTHON: '3.9' steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: '2' - name: Setup Python - uses: actions/setup-python@master + uses: actions/setup-python@v5 with: python-version: 3.9 - name: Generate Report diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a54aa9c..0ec3671 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -10,13 +10,13 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: 3.9 - name: Cache pip - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} diff --git a/poetry.lock b/poetry.lock index 4964528..e394a8c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,36 +1,16 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] -name = "beautifulsoup4" -version = "4.12.2" -description = "Screen-scraping library" -optional = false -python-versions = ">=3.6.0" -files = [ - {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, - {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, -] - -[package.dependencies] -soupsieve = ">1.2" - -[package.extras] -html5lib = ["html5lib"] -lxml = ["lxml"] - -[[package]] -name = "bs4" -version = "0.0.1" -description = "Dummy package for Beautiful Soup" +name = "argparse-formatter" +version = "1.4" +description = "Paragraph-preserving formatter for argparse" optional = false python-versions = "*" files = [ - {file = "bs4-0.0.1.tar.gz", hash = "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a"}, + {file = "argparse_formatter-1.4-py3-none-any.whl", hash = "sha256:c1ce58a68ed83d5204e1515c6f8fb52f5f32fc21bc993faa80f0b17a78090265"}, + {file = "argparse_formatter-1.4.tar.gz", hash = "sha256:35027941a1e75a1a4df21e5c40a3395d311777d9bb1cfd9744d6ff7cc28de216"}, ] -[package.dependencies] -beautifulsoup4 = "*" - [[package]] name = "certifi" version = "2023.11.17" @@ -611,6 +591,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -684,17 +665,6 @@ pygments = ">=2.13.0,<3.0.0" [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] -[[package]] -name = "soupsieve" -version = "2.5" -description = "A modern CSS selector implementation for Beautiful Soup." -optional = false -python-versions = ">=3.8" -files = [ - {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, - {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, -] - [[package]] name = "tomli" version = "2.0.1" @@ -747,4 +717,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "5910a4fc349dff357ba7de75289ea08dc213979257970a208915b2fd77a8cfc5" +content-hash = "0391c932cb211540ec13844167498de9950b2bbcb31ee90c4c051644adc4405a" diff --git a/pyproject.toml b/pyproject.toml index 8522518..98ab767 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ pyYAML = "^6.0" psycopg = "^3.1.8" rich = "^13.7.0" bs4 = "^0.0.1" +argparse-formatter = "^1.4" [tool.poetry.dev-dependencies] pytest = "^7.3.1" @@ -51,4 +52,5 @@ dv-user-import = "datastation.dv_user_import:main" dv-notifications-cleanup="datastation.dv_notifications_cleanup:main" ingest-flow = "datastation.ingest_flow:main" dv-dataverse-root-collect-storage-usage = "datastation.dv_dataverse_root_collect_storage_usage:main" +dv-dataverse-root-collect-permission-overview = "datastation.dv_dataverse_root_collect_permission_overview:main" datastation-get-component-versions = "datastation.datastation_get_component_versions:main" diff --git 
a/src/datastation/dataverse/dataverse_api.py b/src/datastation/dataverse/dataverse_api.py index 9d60978..3c2703d 100644 --- a/src/datastation/dataverse/dataverse_api.py +++ b/src/datastation/dataverse/dataverse_api.py @@ -9,9 +9,10 @@ def __init__(self, server_url, api_token): self.server_url = server_url self.api_token = api_token - def get_contents(self, alias="root", dry_run=False): + # get json data for a specific dataverses API endpoint using an API token + def get_resource_data(self, resource, alias="root", dry_run=False): headers = {"X-Dataverse-key": self.api_token} - url = f"{self.server_url}/api/dataverses/{alias}/contents" + url = f"{self.server_url}/api/dataverses/{alias}/{resource}" if dry_run: print_dry_run_message(method="GET", url=url, headers=headers) @@ -23,6 +24,18 @@ def get_contents(self, alias="root", dry_run=False): resp_data = dv_resp.json()["data"] return resp_data + def get_contents(self, alias="root", dry_run=False): + return self.get_resource_data("contents", alias, dry_run) + + def get_roles(self, alias="root", dry_run=False): + return self.get_resource_data("roles", alias, dry_run) + + def get_assignments(self, alias="root", dry_run=False): + return self.get_resource_data("assignments", alias, dry_run) + + def get_groups(self, alias="root", dry_run=False): + return self.get_resource_data("groups", alias, dry_run) + def get_storage_size(self, alias="root", dry_run=False): """ Get dataverse storage size (bytes). 
""" url = f'{self.server_url}/api/dataverses/{alias}/storagesize' diff --git a/src/datastation/dataverse/permissions_collect.py b/src/datastation/dataverse/permissions_collect.py new file mode 100644 index 0000000..ed2e578 --- /dev/null +++ b/src/datastation/dataverse/permissions_collect.py @@ -0,0 +1,129 @@ +from datastation.common.result_writer import CsvResultWriter, YamlResultWriter, JsonResultWriter +from datastation.dataverse.dataverse_client import DataverseClient +import logging +import re +import csv +import sys +import json +import rich +from datetime import timedelta + + +class PermissionsCollect: + + def __init__(self, dataverse_client: DataverseClient, output_file, output_format, dry_run: bool = False): + self.dataverse_client = dataverse_client + self.output_file = output_file + self.output_format = output_format + self.dry_run = dry_run + + self.writer = None + self.is_first = True # Would be nicer if the Writer does the bookkeeping + self.vpath_delimiter = ' > ' # Note that'/' would tempt people us use it as a real path + + def create_result_writer(self, out_stream): + logging.info(f'Writing output: {self.output_file}, with format : {self.output_format}') + csv_columns = ['depth', 'parentalias', 'alias', 'name', 'id', 'vpath', 'groups', 'roles', 'assignments'] + if self.output_format == 'csv': + return CsvResultWriter(headers=csv_columns, out_stream=out_stream) + else: + return JsonResultWriter(out_stream) + + def write_result_row(self, row): + self.writer.write(row, self.is_first) + self.is_first = False # Only the first time it can be True + + def get_result_row(self, parent_alias, child_alias, child_name, id, vpath, depth): + logging.info(f'Retrieving permission info for dataverse: {parent_alias} / {child_alias} ...') + group_info = self.get_group_info(child_alias) + role_info = self.get_role_info(child_alias) + assignment_info = self.get_assignment_info(child_alias) + row = {'depth': depth, 'parentalias': parent_alias, 'alias': child_alias, 
'name': child_name, + 'id': id, 'vpath': vpath, 'groups': group_info, 'roles': role_info, 'assignments': assignment_info} + return row + + def get_group_info(self, alias): + resp_data = self.dataverse_client.dataverse().get_groups(alias) + # flatten and compact it... no list comprehension though + result_list = [] + for group in resp_data: + # append the number of assignees in braces + result_list.append(group['identifier'] + ' (' + str(len(group['containedRoleAssignees'])) + ')') + return ', '.join(result_list) + + def get_role_info(self, alias): + resp_data = self.dataverse_client.dataverse().get_roles(alias) + # flatten and compact it... no list comprehension though + result_list = [] + for role in resp_data: + # append the number of permissions in braces + result_list.append(role['alias'] + ' (' + str(len(role['permissions'])) + ')') + return ', '.join(result_list) + + def get_assignment_info(self, alias): + resp_data = self.dataverse_client.dataverse().get_assignments(alias) + # flatten and compact it... 
no list comprehension though + result_list = [] + for assignment in resp_data: + # append the role alias in braces + result_list.append(assignment['assignee'] + ' (' + (assignment['_roleAlias']) + ')') + return ', '.join(result_list) + + def collect_permissions_info(self, tree_data, parent_vpath, parent_alias, depth=1): + alias = tree_data['alias'] + name = tree_data['name'] + id = tree_data['id'] + vpath = parent_vpath + self.vpath_delimiter + alias + row = self.get_result_row(parent_alias, alias, name, id, vpath, depth) + self.write_result_row(row) + # only direct descendants (children) + if 'children' in tree_data: + for child_tree_data in tree_data['children']: + self.collect_permissions_info(child_tree_data, vpath, alias, depth + 1) # recurse + + def find_child(self, parent, alias): + result = None + for child in parent['children']: + if child["alias"] == alias: + result = child + break + return result + + def collect_permissions_info_overview(self, selected_dataverse=None): + out_stream = sys.stdout + if self.output_file != '-': + try: + out_stream = open(self.output_file, "w") + except: + logging.error(f"Could not open file: {self.output_file}") + raise + + self.writer = self.create_result_writer(out_stream) + + logging.info(f'Extracting tree for server: {self.dataverse_client.server_url} ...') + tree_data = self.dataverse_client.metrics().get_tree() + alias = tree_data['alias'] + name = tree_data['name'] + id = tree_data['id'] + vpath = alias + logging.info(f'Extracted the tree for the toplevel dataverse: {name} ({alias})') + logging.info("Retrieving the info for this dataverse instance...") + + if selected_dataverse is None: + # do whole tree + logging.info("Retrieving the info for all the dataverse collections...") + self.collect_permissions_info(tree_data, vpath, alias, 1) + else: + # always the 'root' dataverse + row = self.get_result_row("-", alias, name, id, vpath, 0) # The root has no parent + self.write_result_row(row) + # then the selected 
sub-verse tree + logging.info("Retrieving the info for a selected dataverse collection sub-tree...") + selected_tree_data = self.find_child(tree_data, selected_dataverse) + if selected_tree_data is not None: + self.collect_permissions_info(selected_tree_data, vpath, alias, 1) + else: + logging.error(f"Could not find the selected dataverse: {selected_dataverse}") + + self.writer.close() + self.is_first = True diff --git a/src/datastation/dv_dataverse_root_collect_permission_overview.py b/src/datastation/dv_dataverse_root_collect_permission_overview.py new file mode 100644 index 0000000..9ac3307 --- /dev/null +++ b/src/datastation/dv_dataverse_root_collect_permission_overview.py @@ -0,0 +1,46 @@ +import argparse +from argparse_formatter import FlexiFormatter + +from datastation.common.config import init +from datastation.common.utils import add_dry_run_arg +from datastation.dataverse.dataverse_client import DataverseClient +from datastation.dataverse.permissions_collect import PermissionsCollect + + +def main(): + config = init() + + output_explanation = ''' + The output has the following information: + + * depth: The depth in the tree of the dataverse. The top-level ('root') has depth 0. + * parentalias: The alias of the parent dataverse + * alias: The alias of the dataverse + * name: The name of the dataverse + * id: The id of the dataverse, sometimes used in places where the alias is not used + * vpath: The virtual path of the collection, i.e. the path from the root to the dataverse + * groups: A comma-separated list of the explicit groups under the dataverse. + For each group there is the 'identifier' with the number of 'containedRoleAssignees' in braces appended. + * roles: A comma-separated list of the roles defined in the dataverse. + For each role there is the 'alias' with the number of 'permissions' in braces appended. + * assignments: A comma-separated list of the assignments of roles on the dataverse. 
+ For each assignment there is the 'assignee' with the '_roleAlias' in braces appended. + ''' + parser = argparse.ArgumentParser(description='Collect the permissions overview for the dataverses (collections) in a Dataverse installation.', + epilog=output_explanation, formatter_class=FlexiFormatter) + parser.add_argument('-o', '--output-file', dest='output_file', default='-', + help='The file to write the output to or - for stdout') + parser.add_argument('-f', '--format', dest='format', + help='Output format, one of: csv, json (default: json)') + parser.add_argument('-s', '--selected-dataverse', dest='selected_dataverse', default=None, + help='The dataverse (top-level) sub-tree to collect the permissions for, by default all dataverses are collected') + add_dry_run_arg(parser) + args = parser.parse_args() + + selected_dataverse = args.selected_dataverse + dataverse_client = DataverseClient(config['dataverse']) + collector = PermissionsCollect(dataverse_client, args.output_file, args.format, args.dry_run) + collector.collect_permissions_info_overview(selected_dataverse) + +if __name__ == '__main__': + main()