diff --git a/bids_prov/afni/afni_parser.py b/bids_prov/afni/afni_parser.py index b4692bab..52ee53de 100644 --- a/bids_prov/afni/afni_parser.py +++ b/bids_prov/afni/afni_parser.py @@ -7,8 +7,11 @@ from itertools import chain from bids_prov.fsl.fsl_parser import get_entities -from bids_prov.utils import get_default_graph, CONTEXT_URL, get_id, label_mapping, compute_sha_256_entity, \ +from bids_prov.utils import ( + get_default_graph, CONTEXT_URL, label_mapping, compute_sha_256_entity, + get_activity_urn, get_agent_urn, get_entity_urn, writing_jsonld + ) # regex to catch inputs # in `cp /fsl/5.0/doc/fsl.css .files no_ext 5.0` --> only `.files` should match @@ -117,12 +120,12 @@ def build_records(commands_block: list, agent_id: str, verbose: bool = False): for (block, cmd) in commands_block: cmd_s = re.split(" |=", cmd) - a_name = cmd_s[0] + activity_name = cmd_s[0] cmd_args_remain = cmd_s[1:] inputs = [] outputs = [] function_in_description_functions = False - command_name_end = os.path.split(a_name)[1] + command_name_end = os.path.split(activity_name)[1] for df in description_functions: if df["Name"] == command_name_end: @@ -182,7 +185,7 @@ def build_records(commands_block: list, agent_id: str, verbose: bool = False): outputs = list(chain(*(attributes.pop(k) for k in attributes.keys() & OUTPUT_TAGS))) entity_names = [_ for _ in re.findall( - INPUT_RE, cmd_without_attributes[len(a_name):])] + INPUT_RE, cmd_without_attributes[len(activity_name):])] if entity_names and entity_names[0] in cmd_without_attributes: outputs.append(entity_names[-1]) @@ -190,11 +193,12 @@ def build_records(commands_block: list, agent_id: str, verbose: bool = False): inputs.append(entity_names[0]) # the file name and possible extension - label = f"{os.path.split(a_name)[1]}" - + activity_label = label_mapping( + f'{os.path.split(activity_name)[1]}', + 'afni/afni_labels.json') activity = { - "@id": f"urn:{get_id()}", - "Label": label_mapping(label, "afni/afni_labels.json"), + "@id": get_activity_urn(activity_label), + "Label": activity_label, "AssociatedWith": "urn:" + agent_id, "Command": cmd, "Parameters": param_dic, @@ -202,7 +206,7 @@ def build_records(commands_block: list, agent_id: str, verbose: bool = False): } for input_path in inputs: - input_id = f"urn:{get_id()}" # def format_id + input_id = get_entity_urn(input_path) existing_input = next( (entity for entity in records["Entities"] if entity["AtLocation"] == input_path), None) @@ -225,7 +229,7 @@ def build_records(commands_block: list, agent_id: str, verbose: bool = False): for output_path in outputs: records["Entities"].append( { - "@id": f"urn:{get_id()}", + "@id": get_entity_urn(output_path), "Label": os.path.split(output_path)[1], "AtLocation": output_path, "GeneratedBy": activity["@id"], @@ -363,7 +367,7 @@ def fusion_activities(activities, label): command += activity["Command"] + "; " return { - "@id": f"urn:{get_id()}", + "@id": get_activity_urn(label), "Label": label, "AssociatedWith": activities[0]["AssociatedWith"], "Command": command, diff --git a/bids_prov/fsl/fsl_parser.py b/bids_prov/fsl/fsl_parser.py index 2aec75bc..0bd97eb4 100644 --- a/bids_prov/fsl/fsl_parser.py +++ b/bids_prov/fsl/fsl_parser.py @@ -8,8 +8,11 @@ from bs4 import BeautifulSoup -from bids_prov.utils import get_default_graph, CONTEXT_URL, get_id, label_mapping, compute_sha_256_entity, \ +from bids_prov.utils import ( + get_default_graph, CONTEXT_URL, label_mapping, compute_sha_256_entity, + get_activity_urn, get_agent_urn, get_entity_urn, writing_jsonld + ) # regex to catch inputs # in `cp /fsl/5.0/doc/fsl.css .files no_ext 5.0` --> only `.files` should match @@ -421,7 +424,7 @@ def build_records(groups: Mapping[str, List[str]], agent_id: str): # process to remove + and - in pngappend command cmd = cmd.replace(" + ", " ").replace(" - ", " ") cmd_s = re.split(" |=", cmd) - a_name = cmd_s[0] + activity_name = cmd_s[0] inputs = [] outputs = [] @@ -430,7 +433,7 @@ def build_records(groups: Mapping[str, List[str]], agent_id: str): function_in_description_functions = False - command_name_end = os.path.split(a_name)[1] + command_name_end = os.path.split(activity_name)[1] for df in description_functions: if df["Name"] == command_name_end: description_of_command = df @@ -457,9 +460,9 @@ def build_records(groups: Mapping[str, List[str]], agent_id: str): outputs = list(chain(*(attributes.pop(k) for k in attributes.keys() & OUTPUT_TAGS))) entity_names = [_ for _ in re.findall( - INPUT_RE, cmd_without_attributes[len(a_name):])] + INPUT_RE, cmd_without_attributes[len(activity_name):])] - # # cmd_conf = get_closest_config(a_name) # with the module boutiques + # # cmd_conf = get_closest_config(activity_name) # with the module boutiques # cmd_conf = None # None because boutiques is not used at this time # # if cmd_conf: # # pos_args = filter(lambda e: not e.startswith("-"), cmd_s) # TODO use "-key value" mappings @@ -471,12 +474,13 @@ def build_records(groups: Mapping[str, List[str]], agent_id: str): if len(entity_names) > 1: inputs.append(entity_names[0]) - # the file name and possible extension - label = f"{os.path.split(a_name)[1]}" - - a = { - "@id": f"urn:{get_id()}", - "Label": label_mapping(label, "fsl/fsl_labels.json"), + # Create activity label & record + activity_label = label_mapping( + f'{os.path.split(activity_name)[1]}', + 'fsl/fsl_labels.json') + activity = { + "@id": get_activity_urn(activity_label), + "Label": activity_label, "AssociatedWith": "urn:" + agent_id, "Command": cmd, # "attributes": [ @@ -487,7 +491,7 @@ def build_records(groups: Mapping[str, List[str]], agent_id: str): for input_path in inputs: # input_name = input_path.replace("/", "_") # TODO - input_id = f"urn:{get_id()}" # def format_id + input_id = get_entity_urn(input_path) existing_input = next( (entity for entity in records["Entities"] if entity["AtLocation"] == input_path), None) @@ -509,7 +513,7 @@ def build_records(groups: Mapping[str, List[str]], agent_id: str): # output_name = output_path.replace("/", "_") # TODO records["Entities"].append( { - "@id": f"urn:{get_id()}", + "@id": get_entity_urn(output_path), "Label": os.path.split(output_path)[1], "AtLocation": output_path, "GeneratedBy": a["@id"], diff --git a/bids_prov/spm/spm_parser.py b/bids_prov/spm/spm_parser.py index 8b75cd6c..30268aa7 100644 --- a/bids_prov/spm/spm_parser.py +++ b/bids_prov/spm/spm_parser.py @@ -5,8 +5,11 @@ from typing import List, Dict, Generator from bids_prov.spm import spm_config as conf -from bids_prov.utils import get_id, get_default_graph, CONTEXT_URL, label_mapping, compute_sha_256_entity, \ +from bids_prov.utils import ( + get_uuid, get_default_graph, CONTEXT_URL, label_mapping, compute_sha_256_entity, + get_activity_urn, get_agent_urn, get_entity_urn, writing_jsonld + ) def format_activity_name(activity_name: str) -> str: @@ -61,7 +64,7 @@ def get_input_entity(right: str) -> List[dict]: file_location = re.sub(r"\,1", "", file_drop_quotes) # ds000052/RESULTS/Sub01/con_0001.nii entity_label_short = "_".join(file_location.split("/")[-2:]) # Sub01_con_0001.nii entity = { - "@id": "urn:" + get_id(), + "@id": get_entity_urn(file_location), "Label": label_mapping(entity_label_short, "spm/spm_activity_labels.json"), "AtLocation": file_location } @@ -170,7 +173,7 @@ def get_entities_from_ext_config(conf_dic: dict, activity_name: str, activity_id # activity_id), None) for output in conf_dic[activity]['outputs']: name = conf_dic[activity]['name'] - entity = {"@id": "urn:" + get_id(), + entity = {"@id": get_entity_urn(output), "Label": label_mapping(name, "spm/spm_activity_labels.json"), "Atlocation": output, "GeneratedBy": activity_id, @@ -217,7 +220,7 @@ def find_output_id_from_closest(closest_activity: dict, records: dict) -> str: Returns ------- - output_id : entity id, if such one has been generated by the closest activity, else new id + output_id : entity id, if such one has been generated by the closest activity, else new id """ for entity in records["Entities"]: if "GeneratedBy" in entity: @@ -225,7 +228,7 @@ def find_output_id_from_closest(closest_activity: dict, records: dict) -> str: output_id = entity["@id"] break else: - output_id = "urn:" + get_id() + output_id = 'urn:uuid:' + get_uuid() # output_id = next( # (entity["@id"] for entity in records["Entities"] # if parts[-1] == entity["Label"] and entity["GeneratedBy"] == closest_activity["@id"] @@ -309,9 +312,10 @@ def get_records(task_groups: dict, agent_id: str, verbose=False) -> dict: command_prefix = command_prefix[:-1] command += '\n'.join([command_prefix + c for c in end_line_list]) - activity_id = "urn:" + get_id() + activity_label = format_activity_name(common_prefix_act) + activity_id = get_activity_urn(activity_label) activity = {"@id": activity_id, - "Label": format_activity_name(common_prefix_act), + "Label": activity_label, "Used": list(), "AssociatedWith": "urn:" + agent_id, "Command": command diff --git a/bids_prov/tests/test_spm_parser.py b/bids_prov/tests/test_spm_parser.py index 91298f8d..8374b8b2 100644 --- a/bids_prov/tests/test_spm_parser.py +++ b/bids_prov/tests/test_spm_parser.py @@ -90,11 +90,10 @@ def test_get_input_entity(): right = "{'ds011/sub-01/func/sub-01_task-tonecounting_bold_trunctest.nii.gz'};" # entity label : sub-01_task-tonecounting_bold.nii.gz entities = [{ - "@id": "urn:c15521b1-b3dc-450a-9daa-37e51b591d75", + "@id": "bids::ds011/sub-01/func/sub-01_task-tonecounting_bold_trunctest.nii.gz", "Label": "func_sub-01_task-tonecounting_bold_trunctest.nii.gz", "AtLocation": "ds011/sub-01/func/sub-01_task-tonecounting_bold_trunctest.nii.gz" }] - init_random_state() right_entity = get_input_entity(right)[0] assert right_entity == entities[0] diff --git a/bids_prov/tests/test_utils.py b/bids_prov/tests/test_utils.py index 404a80b5..77dc2fdc 100644 --- a/bids_prov/tests/test_utils.py +++ b/bids_prov/tests/test_utils.py @@ -6,14 +6,15 @@ import hashlib from bids_prov.utils import ( - get_id, get_rrid, get_default_graph, CONTEXT_URL, label_mapping, get_sha256 + get_uuid, get_random_string, get_rrid, make_alnum, + get_activity_urn, get_agent_urn, get_entity_urn, + get_default_graph, CONTEXT_URL, label_mapping, get_sha256 ) from unittest.mock import mock_open, patch - -def test_get_id(): +def test_get_uuid(): # Test that the function returns a valid UUID string - result = get_id() + result = get_uuid() assert isinstance(result, str) assert isinstance(uuid.UUID(result), uuid.UUID) @@ -21,10 +22,19 @@ def test_get_id(): assert uuid.UUID(result).version == 4 # Test that the function returns a different ID each time it's called - id1 = get_id() - id2 = get_id() + id1 = get_uuid() + id2 = get_uuid() assert id1 != id2 +def test_get_random_string(): + # Test that the function returns a random string + result = get_random_string() + assert isinstance(result, str) + assert len(result) == 8 + assert result.isalnum() + assert len(get_random_string(5)) == 5 + assert result != get_random_string() + def test_get_rrid(): # Test that the function returns a RRID string result = get_rrid('FSL') @@ -35,6 +45,32 @@ def test_get_rrid(): # Test the the function returns None if the software is not referenced assert get_rrid('unreferenced_software') is None +def test_make_alnum(): + # Test that the function that removes all non alphanumeric chars from a string + assert make_alnum('¨^$£$êµ*ad45@') == 'ad45' + assert make_alnum('\\//:!§.;,?[]()}{}') == '' + assert make_alnum('ezeasdsa45ADA5sdas') == 'ezeasdsa45ADA5sdas' + +def test_get_activity_urn(): + # Test that the function that returns URNs for activities + assert 'urn:spm-' in get_activity_urn('SPM') + assert len(get_activity_urn('SPM')) == 16 + assert 'urn:spmv1242-' in get_activity_urn('SPM v. 1242355') + assert len(get_activity_urn('SPM v. 1242355')) == 21 + +def test_get_agent_urn(): + # Test that the function that returns URNs for agents + assert 'urn:bet-' in get_agent_urn('BET') + assert len(get_agent_urn('BET')) == 16 + assert 'urn:movefile-' in get_agent_urn('Move file') + assert len(get_agent_urn('SPM v. 1242355')) == 21 + +def test_get_entity_urn(): + # Test that the function that returns URNs for entities + assert get_entity_urn('') == 'bids::' + assert get_entity_urn('sub-001/func/T1.nii') == 'bids::sub-001/func/T1.nii' + assert get_entity_urn('T1.nii') == 'bids::T1.nii' + def test_get_default_graph(): context_url = "http://example.com/context" spm_ver = "v1.0" diff --git a/bids_prov/utils.py b/bids_prov/utils.py index 9d3959c4..9c363de0 100644 --- a/bids_prov/utils.py +++ b/bids_prov/utils.py @@ -2,6 +2,7 @@ import json import os import random +import string import shutil import uuid from typing import Mapping, Union, Tuple @@ -14,9 +15,25 @@ 'SPM': 'RRID:SCR_007037' } -def get_id(): +def get_uuid() -> str: return str(uuid.UUID(int=random.getrandbits(128), version=4)) +def get_random_string(length: str = 8) -> str: + """ Return a random string of a given length. + The string may contain uppercase letters, + lowercase letters, and digits. + + Parameters + ---------- + length : length of the output string + + Returns + ------- + str : a random string + """ + return ''.join(random.choices( + string.ascii_uppercase + string.ascii_lowercase + string.digits, k=length)) + def get_rrid(soft_label: str): """ Return the RRID (see: https://rrid.site/about/Getting%20Started) for a software @@ -34,6 +51,58 @@ def get_rrid(soft_label: str): return None +def make_alnum(input_string: str) -> str: + """ Remove all non alphanumeric form a string + + Parameters + ---------- + input_string : string to make alphanumeric + + Returns + ------- + str : input string with all non alphanumeric removed + """ + return re.sub(re.compile(r'[^a-zA-Z0-9]'), '', input_string) + +def get_activity_urn(label: str) -> str: + """ Return a randomly generated yet human readable URN for a bids prov Activity + + Parameters + ---------- + label : the label of the Activity + + Returns + ------- + str : a new URN for the Activity + """ + return f'urn:{make_alnum(label)[:8].lower()}-{get_random_string()}' + +def get_agent_urn(label: str) -> str: + """ Return a randomly generated yet human readable URN for a bids prov Agent + + Parameters + ---------- + label : the label of the Agent + + Returns + ------- + str : a new URN for the Agent + """ + return f'urn:{make_alnum(label)[:8].lower()}-{get_random_string()}' + +def get_entity_urn(label: str) -> str: + """ Return a randomly generated URN for a bids prov Entity + + Parameters + ---------- + label : the label of the entity + + Returns + ------- + str : a new URN for the entity + """ + return f'bids::{label}' + def get_default_graph(soft_label: str, soft_version: str = "dev", context_url: str = CONTEXT_URL) \ -> Tuple[Mapping[str, Union[str, Mapping]], str]: # TODO Dict instead of Mapping , see parser graph["Records"].update """ Return the base graph for a bids prov file @@ -50,7 +119,7 @@ def get_default_graph(soft_label: str, soft_version: str = "dev", context_url: s mapping : the base graph str : the id generated for the software """ - agent_id = get_id() + agent_id = get_agent_urn(soft_label) software_record = { "@id": "urn:" + agent_id, "@type": "prov:SoftwareAgent",