DD-1493: Add command for dataverse role assignment (#53)
* Initial implementation of dv-dataverse-role-assignment

* Completed functionality of dv-dataverse-role-assignment

* Renaming BatchProcessor to DatasetBatchProcessor

* Refactoring of the BatchProcessor classes

* Refactoring; dv_dataverse_role_assignment uses new DataverseRole class

* Refactoring; removed alias as input parameter for the assignment-related members of the dataverse_api

* Refactoring; removed alias as input parameter for all members of the dataverse_api

* Refactoring; moved and renamed new BatchProcessor out of the way

* Cleanup; removed whitespace

* Refactoring; moved part of the assignment code from the roles class back to the command-line interface

* Refactoring; role assignment in DataverseRole uses self.dry_run instead of an input parameter
PaulBoon authored Feb 15, 2024
1 parent cedc59f commit 44cf06e
Showing 11 changed files with 353 additions and 22 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -52,3 +52,4 @@ ingest-flow = "datastation.ingest_flow:main"
dv-dataverse-root-collect-storage-usage = "datastation.dv_dataverse_root_collect_storage_usage:main"
dv-dataverse-root-collect-permission-overview = "datastation.dv_dataverse_root_collect_permission_overview:main"
datastation-get-component-versions = "datastation.datastation_get_component_versions:main"
dv-dataverse-role-assignment = "datastation.dv_dataverse_role_assignment:main"
139 changes: 139 additions & 0 deletions src/datastation/common/common_batch_processing.py
@@ -0,0 +1,139 @@
import logging
import os
import time

from datastation.common.csv import CsvReport
from datastation.common.utils import plural


# Base class for batch processing of items
class CommonBatchProcessor:
    def __init__(self, item_name="item", wait=0.1, fail_on_first_error=True):
        self.item_name = item_name
        self.wait = wait
        self.fail_on_first_error = fail_on_first_error

    def process_items(self, items, callback):
        if type(items) is list:
            num_items = len(items)
            logging.info(f"Start batch processing on {num_items} {plural(self.item_name)}")
        else:
            logging.info(f"Start batch processing on unknown number of {plural(self.item_name)}")
            num_items = -1
        i = 0
        for item in items:
            i += 1
            try:
                if self.wait > 0 and i > 1:
                    logging.debug(f"Waiting {self.wait} seconds before processing next {self.item_name}")
                    time.sleep(self.wait)
                logging.info(f"Processing {i} of {num_items}: {item}")
                callback(item)
            except Exception as e:
                logging.exception("Exception occurred", exc_info=True)
                if self.fail_on_first_error:
                    logging.error(f"Stop processing because of an exception: {e}")
                    break
                logging.debug("fail_on_first_error is False, continuing...")


def get_provided_items_iterator(item_or_items_file, item_name="item"):
    if item_or_items_file is None:
        logging.debug(f"No {plural(item_name)} provided.")
        return None
    elif os.path.isfile(os.path.expanduser(item_or_items_file)):
        items = []
        with open(os.path.expanduser(item_or_items_file)) as f:
            for line in f:
                items.append(line.strip())
        return items
    else:
        return [item_or_items_file]


def get_pids(pid_or_pids_file, search_api=None, query="*", subtree="root", object_type="dataset", dry_run=False):
    """
    Args:
        pid_or_pids_file: The dataset pid, or a file with a list of pids.
        search_api: must be provided if pid_or_pids_file is None
        query: passed on to search_api().search
        object_type: passed on to search_api().search
        subtree: passed on to search_api().search
        dry_run: Do not perform the action, but show what would be done.
                 Only applicable if pid_or_pids_file is None.
    Returns: an iterator with pids;
        if pid_or_pids_file is not provided, it searches for all datasets
        and extracts their pids, fetching the result pages lazily.
    """
    if pid_or_pids_file is None:
        result = search_api.search(query=query, subtree=subtree, object_type=object_type, dry_run=dry_run)
        return map(lambda rec: rec['global_id'], result)
    else:
        return get_provided_items_iterator(pid_or_pids_file, "pid")


def get_aliases(alias_or_aliases_file, dry_run=False):
    """
    Args:
        alias_or_aliases_file: The dataverse alias, or a file with a list of aliases.
        dry_run: Do not perform the action, but show what would be done.
                 Only applicable if alias_or_aliases_file is None.
    Returns: an iterator with aliases
    """
    if alias_or_aliases_file is None:
        # The tree of all (published) dataverses could be retrieved and aliases could recursively be extracted
        # from the tree, but this is not implemented yet.
        logging.warning("No aliases provided, nothing to do.")
        return None
    else:
        return get_provided_items_iterator(alias_or_aliases_file, "alias")


class DatasetBatchProcessor(CommonBatchProcessor):

    def __init__(self, wait=0.1, fail_on_first_error=True):
        super().__init__("pid", wait, fail_on_first_error)

    def process_pids(self, pids, callback):
        super().process_items(pids, callback)


class DatasetBatchProcessorWithReport(DatasetBatchProcessor):

    def __init__(self, report_file=None, headers=None, wait=0.1, fail_on_first_error=True):
        super().__init__(wait, fail_on_first_error)
        if headers is None:
            headers = ["DOI", "Modified", "Change"]
        self.report_file = report_file
        self.headers = headers

    def process_pids(self, pids, callback):
        with CsvReport(os.path.expanduser(self.report_file), self.headers) as csv_report:
            super().process_pids(pids, lambda pid: callback(pid, csv_report))


class DataverseBatchProcessor(CommonBatchProcessor):

    def __init__(self, wait=0.1, fail_on_first_error=True):
        super().__init__("alias", wait, fail_on_first_error)

    def process_aliases(self, aliases, callback):
        super().process_items(aliases, callback)


class DataverseBatchProcessorWithReport(DataverseBatchProcessor):

    def __init__(self, report_file=None, headers=None, wait=0.1, fail_on_first_error=True):
        super().__init__(wait, fail_on_first_error)
        if headers is None:
            headers = ["alias", "Modified", "Change"]
        self.report_file = report_file
        self.headers = headers

    def process_aliases(self, aliases, callback):
        with CsvReport(os.path.expanduser(self.report_file), self.headers) as csv_report:
            super().process_aliases(aliases, lambda alias: callback(alias, csv_report))
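
For context, a minimal sketch of how these batch-processing pieces are meant to fit together (not part of this commit; the alias file, report path and callback are hypothetical):

from datetime import datetime

from datastation.common.common_batch_processing import DataverseBatchProcessorWithReport, get_aliases


def visit(alias, csv_report):
    # Hypothetical callback: just record that the alias was visited.
    csv_report.write({'alias': alias, 'Modified': datetime.now(), 'Change': 'None'})


# The file is assumed to contain one alias per line; a single alias string also works.
aliases = get_aliases("~/aliases.txt")
processor = DataverseBatchProcessorWithReport(report_file="~/report.csv", wait=0.5, fail_on_first_error=False)
processor.process_aliases(aliases, visit)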
8 changes: 8 additions & 0 deletions src/datastation/common/utils.py
@@ -110,3 +110,11 @@ def sizeof_fmt(num, suffix='B'):
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

def plural(word: str):
    if word.endswith('s'):
        return word + "es"
    elif word.endswith('y'):
        return word[:-1] + "ies"
    else:
        return word + "s"
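
For illustration, the helper covers the item names used by the batch processors:

from datastation.common.utils import plural

plural("pid")    # -> "pids"
plural("alias")  # -> "aliases"
plural("query")  # -> "queries"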
55 changes: 42 additions & 13 deletions src/datastation/dataverse/dataverse_api.py
@@ -1,18 +1,23 @@
import requests
import json


from datastation.common.utils import print_dry_run_message


class DataverseApi:
    def __init__(self, server_url, api_token):
    def __init__(self, server_url, api_token, alias):
        self.server_url = server_url
        self.api_token = api_token
        self.alias = alias  # Methods should use this one if specified

    def get_alias(self):
        return self.alias

    # get json data for a specific dataverses API endpoint using an API token
    def get_resource_data(self, resource, alias="root", dry_run=False):
    def get_resource_data(self, resource, dry_run=False):
        headers = {"X-Dataverse-key": self.api_token}
        url = f"{self.server_url}/api/dataverses/{alias}/{resource}"
        url = f"{self.server_url}/api/dataverses/{self.alias}/{resource}"

        if dry_run:
            print_dry_run_message(method="GET", url=url, headers=headers)
@@ -24,21 +29,21 @@ def get_resource_data(self, resource, alias="root", dry_run=False):
        resp_data = dv_resp.json()["data"]
        return resp_data

    def get_contents(self, alias="root", dry_run=False):
        return self.get_resource_data("contents", alias, dry_run)
    def get_contents(self, dry_run=False):
        return self.get_resource_data("contents", dry_run)

    def get_roles(self, alias="root", dry_run=False):
        return self.get_resource_data("roles", alias, dry_run)
    def get_roles(self, dry_run=False):
        return self.get_resource_data("roles", dry_run)

    def get_assignments(self, alias="root", dry_run=False):
        return self.get_resource_data("assignments", alias, dry_run)
    def get_role_assignments(self, dry_run=False):
        return self.get_resource_data("assignments", dry_run)

    def get_groups(self, alias="root", dry_run=False):
        return self.get_resource_data("groups", alias, dry_run)
    def get_groups(self, dry_run=False):
        return self.get_resource_data("groups", dry_run)

    def get_storage_size(self, alias="root", dry_run=False):
    def get_storage_size(self, dry_run=False):
        """ Get dataverse storage size (bytes). """
        url = f'{self.server_url}/api/dataverses/{alias}/storagesize'
        url = f'{self.server_url}/api/dataverses/{self.alias}/storagesize'
        headers = {'X-Dataverse-key': self.api_token}
        if dry_run:
            print_dry_run_message(method='GET', url=url, headers=headers)
@@ -47,3 +52,27 @@ def get_storage_size(self, alias="root", dry_run=False):
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        return r.json()['data']['message']

    def add_role_assignment(self, assignee, role, dry_run=False):
        url = f'{self.server_url}/api/dataverses/{self.alias}/assignments'
        headers = {'X-Dataverse-key': self.api_token, 'Content-type': 'application/json'}
        role_assignment = {"assignee": assignee, "role": role}
        if dry_run:
            print_dry_run_message(method='POST', url=url, headers=headers,
                                  data=json.dumps(role_assignment))
            return None
        else:
            r = requests.post(url, headers=headers, json=role_assignment)
            r.raise_for_status()
            return r

    def remove_role_assignment(self, assignment_id, dry_run=False):
        url = f'{self.server_url}/api/dataverses/{self.alias}/assignments/{assignment_id}'
        headers = {'X-Dataverse-key': self.api_token, 'Content-type': 'application/json'}
        if dry_run:
            print_dry_run_message(method='DELETE', url=url, headers=headers)
            return None
        else:
            r = requests.delete(url, headers=headers)
            r.raise_for_status()
            return r
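
A minimal sketch of the new assignment calls (not part of this commit; server URL, API token, alias, assignee and role below are placeholders, and dry_run=True only prints the request):

from datastation.dataverse.dataverse_api import DataverseApi

api = DataverseApi("https://dataverse.example.org", "0000-api-token", "my-dataverse")
api.add_role_assignment("@someuser", "contributor", dry_run=True)

# Removing needs the id of an existing assignment, which get_role_assignments provides.
for assignment in api.get_role_assignments():
    if assignment.get('assignee') == "@someuser" and assignment.get('_roleAlias') == "contributor":
        api.remove_role_assignment(assignment.get('id'), dry_run=True)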
4 changes: 2 additions & 2 deletions src/datastation/dataverse/dataverse_client.py
@@ -27,8 +27,8 @@ def search_api(self):
    def dataset(self, pid):
        return DatasetApi(pid, self.server_url, self.api_token, self.unblock_key, self.safety_latch)

    def dataverse(self):
        return DataverseApi(self.server_url, self.api_token)
    def dataverse(self, alias=None):
        return DataverseApi(self.server_url, self.api_token, alias)

    def file(self, file_id):
        return FileApi(file_id, self.server_url, self.api_token, self.unblock_key, self.safety_latch)
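
With this change the alias is bound once when the DataverseApi is obtained from the client instead of being passed to every call; a short sketch, assuming a configured DataverseClient instance dataverse_client and a placeholder alias:

dataverse_api = dataverse_client.dataverse("my-dataverse")
roles = dataverse_api.get_roles()
assignments = dataverse_api.get_role_assignments()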
2 changes: 1 addition & 1 deletion src/datastation/dataverse/metrics_collect.py
@@ -46,7 +46,7 @@ def write_result_row(self, row):

    def get_result_row(self, parent_alias, child_alias, child_name, depth):
        logging.info(f'Retrieving size for dataverse: {parent_alias} / {child_alias} ...')
        msg = self.dataverse_client.dataverse().get_storage_size(child_alias)
        msg = self.dataverse_client.dataverse(child_alias).get_storage_size()
        storage_size = extract_size_str(msg)
        logging.info(f'size: {storage_size}')
        row = {'depth': depth, 'parentalias': parent_alias, 'alias': child_alias, 'name': child_name,
6 changes: 3 additions & 3 deletions src/datastation/dataverse/permissions_collect.py
@@ -43,7 +43,7 @@ def get_result_row(self, parent_alias, child_alias, child_name, id, vpath, depth
        return row

    def get_group_info(self, alias):
        resp_data = self.dataverse_client.dataverse().get_groups(alias)
        resp_data = self.dataverse_client.dataverse(alias).get_groups()
        # flatten and compact it... no list comprehension though
        result_list = []
        for group in resp_data:
@@ -52,7 +52,7 @@ def get_group_info(self, alias):
        return ', '.join(result_list)

    def get_role_info(self, alias):
        resp_data = self.dataverse_client.dataverse().get_roles(alias)
        resp_data = self.dataverse_client.dataverse(alias).get_roles()
        # flatten and compact it... no list comprehension though
        result_list = []
        for role in resp_data:
@@ -61,7 +61,7 @@ def get_role_info(self, alias):
        return ', '.join(result_list)

    def get_assignment_info(self, alias):
        resp_data = self.dataverse_client.dataverse().get_assignments(alias)
        resp_data = self.dataverse_client.dataverse(alias).get_role_assignments()
        # flatten and compact it... no list comprehension though
        result_list = []
        for assignment in resp_data:
62 changes: 62 additions & 0 deletions src/datastation/dataverse/roles.py
@@ -0,0 +1,62 @@
import rich
from datetime import datetime

from datastation.common.common_batch_processing import DataverseBatchProcessorWithReport, get_aliases
from datastation.dataverse.dataverse_api import DataverseApi
from datastation.dataverse.dataverse_client import DataverseClient


class DataverseRole:

    def __init__(self, dataverse_client: DataverseClient, dry_run: bool = False):
        self.dataverse_client = dataverse_client
        self.dry_run = dry_run

    def list_role_assignments(self, alias):
        r = self.dataverse_client.dataverse(alias).get_role_assignments()
        if r is not None:
            rich.print_json(data=r)

    def add_role_assignment(self, role_assignment, dataverse_api: DataverseApi, csv_report):
        assignee = role_assignment.split('=')[0]
        role = role_assignment.split('=')[1]
        action = "None"
        if self.in_current_assignments(assignee, role, dataverse_api):
            print("{} is already {} for dataverse {}".format(assignee, role, dataverse_api.get_alias()))
        else:
            print(
                "Adding {} as {} for dataverse {}".format(assignee, role, dataverse_api.get_alias()))
            dataverse_api.add_role_assignment(assignee, role, dry_run=self.dry_run)
            action = "Added"
        csv_report.write(
            {'alias': dataverse_api.get_alias(), 'Modified': datetime.now(), 'Assignee': assignee, 'Role': role,
             'Change': action})

    def in_current_assignments(self, assignee, role, dataverse_api: DataverseApi):
        current_assignments = dataverse_api.get_role_assignments()
        found = False
        for current_assignment in current_assignments:
            if current_assignment.get('assignee') == assignee and current_assignment.get(
                    '_roleAlias') == role:
                found = True
                break
        return found


    def remove_role_assignment(self, role_assignment, dataverse_api: DataverseApi, csv_report):
        assignee = role_assignment.split('=')[0]
        role = role_assignment.split('=')[1]
        action = "None"
        if self.in_current_assignments(assignee, role, dataverse_api):
            print("Removing {} as {} for dataverse {}".format(assignee, role, dataverse_api.get_alias()))
            all_assignments = dataverse_api.get_role_assignments()
            for assignment in all_assignments:
                if assignment.get('assignee') == assignee and assignment.get('_roleAlias') == role:
                    dataverse_api.remove_role_assignment(assignment.get('id'), dry_run=self.dry_run)
                    action = "Removed"
                    break
        else:
            print("{} is not {} for dataverse {}".format(assignee, role, dataverse_api.get_alias()))
        csv_report.write(
            {'alias': dataverse_api.get_alias(), 'Modified': datetime.now(), 'Assignee': assignee, 'Role': role,
             'Change': action})
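
Putting it together, a hedged sketch of how DataverseRole can be driven through the batch processor (not part of this commit; dataverse_client, the alias file, the assignee/role and the report path are placeholders, and the actual dv-dataverse-role-assignment command line is not shown in the diff above):

from datastation.common.common_batch_processing import DataverseBatchProcessorWithReport, get_aliases
from datastation.dataverse.roles import DataverseRole

role = DataverseRole(dataverse_client, dry_run=True)

# Listing prints the current assignments of a single dataverse as JSON.
role.list_role_assignments("my-dataverse")

# Adding goes through the batch processor; the assignment is passed as "<assignee>=<role alias>".
aliases = get_aliases("~/aliases.txt")
processor = DataverseBatchProcessorWithReport(
    report_file="~/role-report.csv",
    headers=["alias", "Modified", "Assignee", "Role", "Change"])
processor.process_aliases(
    aliases,
    lambda alias, csv_report: role.add_role_assignment(
        "@someuser=contributor", dataverse_client.dataverse(alias), csv_report))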
3 changes: 1 addition & 2 deletions src/datastation/dv_dataset_destroy_migration_placeholder.py
@@ -28,8 +28,7 @@ def main():
    batch_processor.process_pids(pids,
                                 callback=lambda pid, csv_report: destroy_placeholder_dataset(dataverse.dataset(pid),
                                                                                              description_text_pattern,
                                                                                              csv_report,
                                                                                              dry_run=args.dry_run))
                                                                                              csv_report))


if __name__ == '__main__':
    main()