diff --git a/sbol_utilities/excel_to_sbol.py b/sbol_utilities/excel_to_sbol.py index b57a60ab..eec555d9 100644 --- a/sbol_utilities/excel_to_sbol.py +++ b/sbol_utilities/excel_to_sbol.py @@ -6,8 +6,8 @@ import sbol3 import openpyxl -import tyto -from .helper_functions import toplevel_named, strip_sbol2_version, type_to_standard_extension, is_plasmid +from .helper_functions import toplevel_named, strip_sbol2_version, type_to_standard_extension, is_plasmid, \ + tyto_lookup_with_caching, string_to_display_id, url_to_identity, strip_filetype_suffix BASIC_PARTS_COLLECTION = 'BasicParts' COMPOSITE_PARTS_COLLECTION = 'CompositeParts' @@ -68,24 +68,6 @@ def expand_configuration(values: dict) -> dict: return values_to_use -# TODO: remove after resolution of https://github.com/SynBioDex/pySBOL3/issues/191 -def string_to_display_id(name): - def sanitize_character(c): - replacements = {' ': '_', '-': '_', '.': '_'} - c = replacements.get(c, c) # first, see if there is a wired replacement - if c.isalnum() or c == '_': # keep allowed characters - return c - else: # all others are changed into a reduced & compatible form of their unicode name - return f'_{unicodedata.name(c).replace(" SIGN","").replace(" ","_")}' - - # make replacements in order to get a compliant displayID - display_id = "".join([sanitize_character(c) for c in name.strip()]) - # prepend underscore if there is an initial digit - if display_id[0].isdigit(): - display_id = "_"+display_id - return display_id - - def read_metadata(wb: openpyxl.Workbook, doc: sbol3.Document, config: dict): """ Extract metadata and build collections @@ -123,26 +105,13 @@ def read_metadata(wb: openpyxl.Workbook, doc: sbol3.Document, config: dict): doc.add(final_products) # also collect any necessary data tables from extra sheets - source_table = {row[config['source_name_col']].value:row[config['source_uri_col']].value + source_table = {row[config['source_name_col']].value: row[config['source_uri_col']].value for row in wb[config['sources_sheet']].iter_rows(min_row=config['sources_first_row']) if row[config['source_literal_col']].value} # return the set of created collections return basic_parts, composite_parts, linear_products, final_products, source_table -# TODO: remove kludge after resolution of https://github.com/SynBioDex/tyto/issues/21 -tyto_cache = {} -def tyto_lookup_with_caching(term: str) -> str: - if term not in tyto_cache: - try: - tyto_cache[term] = tyto.SO.get_uri_by_term(term) - except LookupError as e: - tyto_cache[term] = e - if isinstance(tyto_cache[term], LookupError): - raise tyto_cache[term] - else: - return tyto_cache[term] - def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, linear_products: sbol3.Collection, final_products: sbol3.Collection, config: dict, source_table: dict): @@ -161,8 +130,8 @@ def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, l name = row[config['basic_name_col']].value if name is None: return # skip lines without names - try: - raw_role = row[config['basic_role_col']].value # look up with tyto; if fail, leave blank or add to description + raw_role = row[config['basic_role_col']].value + try: # look up with tyto; if fail, leave blank or add to description role = (tyto_lookup_with_caching(raw_role) if raw_role else None) except LookupError: logging.warning(f'Role "{raw_role}" could not be found in Sequence Ontology') @@ -181,6 +150,8 @@ def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, l # identity comes from source if set to a literal table, from display_id if not set identity = None + display_id = None + was_derived_from = None if source_id and source_prefix: source_prefix = source_prefix.strip() if source_prefix in source_table: @@ -188,7 +159,9 @@ def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, l display_id = string_to_display_id(source_id.strip()) identity = f'{source_table[source_prefix]}/{display_id}' else: # when there is no prefix, use the bare value (in SBOL3 format) - identity = strip_sbol2_version(source_id.strip()) + raw_url = source_id.strip() + identity = url_to_identity(strip_filetype_suffix(strip_sbol2_version(raw_url))) + was_derived_from = raw_url else: logging.info(f'Part "{name}" ignoring non-literal source: {source_prefix}') elif source_id: @@ -202,6 +175,8 @@ def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, l logging.debug(f'Creating basic part "{name}"') component = sbol3.Component(identity or display_id, sbol3.SBO_DNA, name=name, description=f'{design_notes}\n{description}'.strip()) + if was_derived_from: + component.derived_from.append(was_derived_from) doc.add(component) if role: component.roles.append(role) @@ -284,7 +259,7 @@ def make_constraint(constraint, part_list): try: restriction = constraint_dict[m.group(2)] except KeyError: - raise ValueError(f'Do not recognize constraint relation "{restriction}"') + raise ValueError(f'Do not recognize constraint relation in "{constraint}"') x = int(m.group(1)) y = int(m.group(3)) if x is y: diff --git a/sbol_utilities/helper_functions.py b/sbol_utilities/helper_functions.py index a75f9ed3..0a2f42d6 100644 --- a/sbol_utilities/helper_functions.py +++ b/sbol_utilities/helper_functions.py @@ -1,20 +1,23 @@ -from typing import Iterable, Union +import unicodedata +from typing import Iterable, Union, Optional import sbol3 -import filecmp -import difflib +import tyto ######################### # Collection of shared helper functions for utilities package -# Flatten list of lists into a single list -import tyto - def flatten(collection: Iterable[list]) -> list: + """Flatten list of lists into a single list + + :param collection: list of lists + :return: flattened list + """ return [item for sublist in collection for item in sublist] -def toplevel_named(doc: sbol3.Document, name:str) -> sbol3.Identified: + +def toplevel_named(doc: sbol3.Document, name: str) -> Optional[sbol3.Identified]: """Find the unique TopLevel document object with the given name (rather than displayID or URI) :param doc: SBOL document to search @@ -67,11 +70,81 @@ def strip_sbol2_version(identity: str) -> str: last_segment = identity.split('/')[-1] try: sbol2_version = int(last_segment) # if last segment is a number... - return identity.rsplit('/',1)[0] # ... then return everything else + return identity.rsplit('/', 1)[0] # ... then return everything else except ValueError: # if last segment was not a number, there is no version to strip return identity +# TODO: replace with EDAM format entries when SBOL2 and SBOL3 can be differentiated +GENETIC_DESIGN_FILE_TYPES = { + 'FASTA': {'.fasta', '.fa'}, + 'GenBank': {'.genbank', '.gb'}, + 'SBOL2': {'.xml'}, + 'SBOL3': {sbol3.NTRIPLES: {'.nt'}, + sbol3.RDF_XML: {'.rdf'}, + sbol3.TURTLE: {'.ttl'}, + sbol3.JSONLD: {'.json', '.jsonld'} + } +} + + +def design_file_type(name: str) -> Optional[str]: + """Guess a genetic design file's type from its name + + :param name: file name (path allowed) + :return: type name (from GENETIC_DESIGN_FILE_TYPES) if known, None if not + """ + for t, v in GENETIC_DESIGN_FILE_TYPES.items(): + if isinstance(v, set): + if any(x for x in v if name.endswith(x)): + return t + else: # dictionary + if any(sub for sub in v.values() if any(x for x in sub if name.endswith(x))): + return t + return None + + +def strip_filetype_suffix(identity: str) -> str: + """Prettify a URL by stripping standard genetic design file type suffixes off of it + + :param identity: URL to sanitize + :return: sanitized URL + """ + extensions = flatten((flatten(v.values()) if isinstance(v, dict) else v) for v in GENETIC_DESIGN_FILE_TYPES.values()) + for x in extensions: + if identity.endswith(x): + return identity.removesuffix(x) + return identity + + +# TODO: remove after resolution of https://github.com/SynBioDex/pySBOL3/issues/191 +def string_to_display_id(name): + def sanitize_character(c): + replacements = {' ': '_', '-': '_', '.': '_'} + c = replacements.get(c, c) # first, see if there is a wired replacement + if c.isalnum() or c == '_': # keep allowed characters + return c + else: # all others are changed into a reduced & compatible form of their unicode name + return f'_{unicodedata.name(c).replace(" SIGN","").replace(" ","_")}' + + # make replacements in order to get a compliant displayID + display_id = "".join([sanitize_character(c) for c in name.strip()]) + # prepend underscore if there is an initial digit + if display_id[0].isdigit(): + display_id = "_"+display_id + return display_id + + +def url_to_identity(url: str) -> str: + """Sanitize a URL string for use as an identity, turning everything after the last "/" to sanitize as a displayId + + :param url: URL to sanitize + :return: equivalent identity + """ + split = url.rsplit('/',maxsplit=1) + return f'{split[0]}/{string_to_display_id(split[1])}' + + def is_plasmid(obj: Union[sbol3.Component, sbol3.Feature]) -> bool: """Check if an SBOL Component or Feature is a plasmid-like structure, i.e., either circular or having a plasmid role @@ -166,3 +239,17 @@ def replace_feature(component, old, new): for ct in component.constraints: if ct.subject == old.identity: ct.subject = new.identity if ct.object == old.identity: ct.object = new.identity + + +# TODO: remove kludge after resolution of https://github.com/SynBioDex/tyto/issues/21 +tyto_cache = {} +def tyto_lookup_with_caching(term: str) -> str: + if term not in tyto_cache: + try: + tyto_cache[term] = tyto.SO.get_uri_by_term(term) + except LookupError as e: + tyto_cache[term] = e + if isinstance(tyto_cache[term], LookupError): + raise tyto_cache[term] + else: + return tyto_cache[term] diff --git a/setup.py b/setup.py index 1ab171da..96a83ca4 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ long_description_content_type='text/markdown', url='https://github.com/SynBioDex/SBOL-utilities', license='MIT License', - version='1.0a8', + version='1.0a9', # See https://pypi.python.org/pypi?%3Aaction=list_classifiers classifiers=[ # How mature is this project? Common values are diff --git a/test/test_files/simple_library.nt b/test/test_files/simple_library.nt index 6954d151..0b9b4952 100644 --- a/test/test_files/simple_library.nt +++ b/test/test_files/simple_library.nt @@ -671,6 +671,7 @@ "LmrA" . . . + . "Cyan FP; codon optimized using IDT tool \nmCerulean3" . "ATP07149_1" . . diff --git a/test/test_files/simple_library.xlsx b/test/test_files/simple_library.xlsx index 6801f497..28eeb503 100644 Binary files a/test/test_files/simple_library.xlsx and b/test/test_files/simple_library.xlsx differ diff --git a/test/test_files/two_backbones.nt b/test/test_files/two_backbones.nt index aca33318..fd4b4e3c 100644 --- a/test/test_files/two_backbones.nt +++ b/test/test_files/two_backbones.nt @@ -58,8 +58,8 @@ . "VariableFeature1" . . - . . + . . "Backbone_variants" . . @@ -114,8 +114,8 @@ . . . - . . + . "Fluorescence calibration basic parts" . . "Designs for round 1 if iGEM 2021 calibration experimentation" . @@ -182,8 +182,8 @@ . "VariableFeature2" . . - . . + . . "Two_by_six" . . @@ -246,14 +246,6 @@ . . . - "" . - "pOpen_v4" . - . - "pOpen_v4" . - . - . - . - . "pMB1 replicon, 100-300 copy" . "pSB1C3" . . @@ -262,3 +254,12 @@ . . . + "" . + "BBF10K_000589" . + . + "pOpen_v4" . + . + . + . + . + . diff --git a/test/test_files/two_backbones.xlsx b/test/test_files/two_backbones.xlsx index 42fc83a0..1add8b40 100644 Binary files a/test/test_files/two_backbones.xlsx and b/test/test_files/two_backbones.xlsx differ diff --git a/test/test_helpers.py b/test/test_helpers.py index 5757d2be..3f71b038 100644 --- a/test/test_helpers.py +++ b/test/test_helpers.py @@ -1,3 +1,5 @@ +import difflib +import filecmp import unittest from sbol_utilities.helper_functions import * @@ -27,11 +29,24 @@ def test_sequence_validators(self): assert unambiguous_protein_sequence('tklqpntvir') assert not unambiguous_protein_sequence('tklqxpntvir') - def test_sbol2_version_stripping(self): + def test_url_sanitization(self): + # SBOL2 version stripping: assert strip_sbol2_version('https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA/1') == \ 'https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA' assert strip_sbol2_version('https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA') == \ 'https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA' + # displayId cleaning: + assert string_to_display_id('GB30248.1') == 'GB30248_1' + assert url_to_identity('http://foo/bar/baz.qux') == 'http://foo/bar/baz_qux' + + # extension detection and stripping + assert design_file_type('something.fasta') == 'FASTA' + assert design_file_type('something.xlsx') == None + assert design_file_type('something.xml') == 'SBOL2' + assert design_file_type('something.nt') == 'SBOL3' + assert strip_filetype_suffix('http://foo/bar/baz.gb') == 'http://foo/bar/baz' + assert strip_filetype_suffix('http://foo/bar/baz.qux') == 'http://foo/bar/baz.qux' + if __name__ == '__main__': unittest.main()