Skip to content

Commit

Permalink
Merge pull request #28 from SynBioDex/develop
Browse files Browse the repository at this point in the history
Prepare 1.0a9 release
  • Loading branch information
jakebeal authored Sep 18, 2021
2 parents 6a65300 + 10839dc commit e51335d
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 59 deletions.
51 changes: 13 additions & 38 deletions sbol_utilities/excel_to_sbol.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

import sbol3
import openpyxl
import tyto
from .helper_functions import toplevel_named, strip_sbol2_version, type_to_standard_extension, is_plasmid
from .helper_functions import toplevel_named, strip_sbol2_version, type_to_standard_extension, is_plasmid, \
tyto_lookup_with_caching, string_to_display_id, url_to_identity, strip_filetype_suffix

BASIC_PARTS_COLLECTION = 'BasicParts'
COMPOSITE_PARTS_COLLECTION = 'CompositeParts'
Expand Down Expand Up @@ -68,24 +68,6 @@ def expand_configuration(values: dict) -> dict:
return values_to_use


# TODO: remove after resolution of https://github.com/SynBioDex/pySBOL3/issues/191
def string_to_display_id(name):
    """Convert an arbitrary string into a form usable as an SBOL displayId

    Spaces, hyphens, and periods become underscores; alphanumeric characters and underscores are kept;
    every other character is replaced by an underscore followed by a reduced form of its Unicode name
    (e.g., "+" becomes "_PLUS"). An underscore is prepended if the result would begin with a digit.

    :param name: string to sanitize; leading and trailing whitespace is ignored
    :return: sanitized string (empty if name is empty or contains only whitespace)
    """
    def sanitize_character(c):
        replacements = {' ': '_', '-': '_', '.': '_'}
        c = replacements.get(c, c)  # first, see if there is a hard-wired replacement
        if c.isalnum() or c == '_':  # keep allowed characters
            return c
        # all others are changed into a reduced & compatible form of their unicode name;
        # characters that have no unicode name (e.g., control characters) fall back to their code point
        unicode_name = unicodedata.name(c, f'U{ord(c):04X}')
        # unicode names can themselves contain hyphens (e.g., "LESS-THAN SIGN"), which must not leak through
        return '_' + unicode_name.replace(' SIGN', '').replace(' ', '_').replace('-', '_')

    # make replacements in order to get a compliant displayID
    display_id = ''.join(sanitize_character(c) for c in name.strip())
    # prepend underscore if there is an initial digit; slicing keeps an empty result from raising IndexError
    if display_id[:1].isdigit():
        display_id = '_' + display_id
    return display_id


def read_metadata(wb: openpyxl.Workbook, doc: sbol3.Document, config: dict):
"""
Extract metadata and build collections
Expand Down Expand Up @@ -123,26 +105,13 @@ def read_metadata(wb: openpyxl.Workbook, doc: sbol3.Document, config: dict):
doc.add(final_products)

# also collect any necessary data tables from extra sheets
source_table = {row[config['source_name_col']].value:row[config['source_uri_col']].value
source_table = {row[config['source_name_col']].value: row[config['source_uri_col']].value
for row in wb[config['sources_sheet']].iter_rows(min_row=config['sources_first_row'])
if row[config['source_literal_col']].value}

# return the set of created collections
return basic_parts, composite_parts, linear_products, final_products, source_table

# TODO: remove kludge after resolution of https://github.com/SynBioDex/tyto/issues/21
tyto_cache = {}  # term -> URI found for it, or the LookupError raised while looking it up


def tyto_lookup_with_caching(term: str) -> str:
    """Look up a term with tyto.SO.get_uri_by_term, remembering both successes and failures

    :param term: term to look up
    :return: the URI returned by tyto for the term
    :raises LookupError: if the lookup failed, either now or on an earlier call (the cached error is re-raised)
    """
    if term not in tyto_cache:
        try:
            outcome = tyto.SO.get_uri_by_term(term)
        except LookupError as failure:
            outcome = failure  # cache the failure too, so the lookup is not retried
        tyto_cache[term] = outcome
    result = tyto_cache[term]
    if isinstance(result, LookupError):
        raise result
    return result


def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, linear_products: sbol3.Collection,
final_products: sbol3.Collection, config: dict, source_table: dict):
Expand All @@ -161,8 +130,8 @@ def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, l
name = row[config['basic_name_col']].value
if name is None:
return # skip lines without names
try:
raw_role = row[config['basic_role_col']].value # look up with tyto; if fail, leave blank or add to description
raw_role = row[config['basic_role_col']].value
try: # look up with tyto; if fail, leave blank or add to description
role = (tyto_lookup_with_caching(raw_role) if raw_role else None)
except LookupError:
logging.warning(f'Role "{raw_role}" could not be found in Sequence Ontology')
Expand All @@ -181,14 +150,18 @@ def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, l

# identity comes from source if set to a literal table, from display_id if not set
identity = None
display_id = None
was_derived_from = None
if source_id and source_prefix:
source_prefix = source_prefix.strip()
if source_prefix in source_table:
if source_table[source_prefix]:
display_id = string_to_display_id(source_id.strip())
identity = f'{source_table[source_prefix]}/{display_id}'
else: # when there is no prefix, use the bare value (in SBOL3 format)
identity = strip_sbol2_version(source_id.strip())
raw_url = source_id.strip()
identity = url_to_identity(strip_filetype_suffix(strip_sbol2_version(raw_url)))
was_derived_from = raw_url
else:
logging.info(f'Part "{name}" ignoring non-literal source: {source_prefix}')
elif source_id:
Expand All @@ -202,6 +175,8 @@ def row_to_basic_part(doc: sbol3.Document, row, basic_parts: sbol3.Collection, l
logging.debug(f'Creating basic part "{name}"')
component = sbol3.Component(identity or display_id, sbol3.SBO_DNA, name=name,
description=f'{design_notes}\n{description}'.strip())
if was_derived_from:
component.derived_from.append(was_derived_from)
doc.add(component)
if role:
component.roles.append(role)
Expand Down Expand Up @@ -284,7 +259,7 @@ def make_constraint(constraint, part_list):
try:
restriction = constraint_dict[m.group(2)]
except KeyError:
raise ValueError(f'Do not recognize constraint relation "{restriction}"')
raise ValueError(f'Do not recognize constraint relation in "{constraint}"')
x = int(m.group(1))
y = int(m.group(3))
if x is y:
Expand Down
103 changes: 95 additions & 8 deletions sbol_utilities/helper_functions.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
from typing import Iterable, Union
import unicodedata
from typing import Iterable, Union, Optional

import sbol3
import filecmp
import difflib
import tyto

#########################
# Collection of shared helper functions for utilities package

# Flatten list of lists into a single list
import tyto


def flatten(collection: Iterable[list]) -> list:
    """Concatenate the elements of every inner iterable into one flat list, preserving order

    :param collection: iterable of lists (or other iterables) to merge
    :return: single list holding the items of each sublist in turn
    """
    flat = []
    for sublist in collection:
        flat.extend(sublist)
    return flat

def toplevel_named(doc: sbol3.Document, name:str) -> sbol3.Identified:

def toplevel_named(doc: sbol3.Document, name: str) -> Optional[sbol3.Identified]:
"""Find the unique TopLevel document object with the given name (rather than displayID or URI)
:param doc: SBOL document to search
Expand Down Expand Up @@ -67,11 +70,81 @@ def strip_sbol2_version(identity: str) -> str:
last_segment = identity.split('/')[-1]
try:
sbol2_version = int(last_segment) # if last segment is a number...
return identity.rsplit('/',1)[0] # ... then return everything else
return identity.rsplit('/', 1)[0] # ... then return everything else
except ValueError: # if last segment was not a number, there is no version to strip
return identity


# TODO: replace with EDAM format entries when SBOL2 and SBOL3 can be differentiated
# Known genetic design file types and the file name extensions recognized for each.
# Each key is a type name; each value is either a set of extensions (including the leading ".")
# or, for SBOL3, a dictionary keyed by sbol3 format constant whose values are sets of extensions.
# design_file_type and strip_filetype_suffix tell the two value shapes apart with isinstance checks.
GENETIC_DESIGN_FILE_TYPES = {
    'FASTA': {'.fasta', '.fa'},
    'GenBank': {'.genbank', '.gb'},
    'SBOL2': {'.xml'},
    # SBOL3 entry is nested: one set of extensions per sbol3 format constant
    'SBOL3': {sbol3.NTRIPLES: {'.nt'},
              sbol3.RDF_XML: {'.rdf'},
              sbol3.TURTLE: {'.ttl'},
              sbol3.JSONLD: {'.json', '.jsonld'}
              }
}


def design_file_type(name: str) -> Optional[str]:
    """Guess a genetic design file's type from its name

    :param name: file name (path allowed)
    :return: type name (from GENETIC_DESIGN_FILE_TYPES) if known, None if not
    """
    for file_type, extensions in GENETIC_DESIGN_FILE_TYPES.items():
        if isinstance(extensions, dict):
            # nested entry: extensions are grouped into several sets, so pool all of them
            candidates = tuple(x for group in extensions.values() for x in group)
        else:
            candidates = tuple(extensions)
        # str.endswith accepts a tuple and is True when the name ends with any one of the suffixes
        if name.endswith(candidates):
            return file_type
    return None


def strip_filetype_suffix(identity: str) -> str:
    """Prettify a URL by stripping standard genetic design file type suffixes off of it

    At most one suffix is removed; a URL that ends with none of the extensions listed in
    GENETIC_DESIGN_FILE_TYPES is returned unchanged.

    :param identity: URL to sanitize
    :return: sanitized URL
    """
    # gather every known extension, whether its entry is a plain set or a dictionary of sets
    extensions = flatten((flatten(v.values()) if isinstance(v, dict) else v)
                         for v in GENETIC_DESIGN_FILE_TYPES.values())
    for x in extensions:
        if x and identity.endswith(x):
            # slice instead of str.removesuffix, which is only available on Python 3.9 and later
            return identity[:-len(x)]
    return identity


# TODO: remove after resolution of https://github.com/SynBioDex/pySBOL3/issues/191
def string_to_display_id(name):
    """Convert an arbitrary string into a form usable as an SBOL displayId

    Spaces, hyphens, and periods become underscores; alphanumeric characters and underscores are kept;
    every other character is replaced by an underscore followed by a reduced form of its Unicode name
    (e.g., "+" becomes "_PLUS"). An underscore is prepended if the result would begin with a digit.

    :param name: string to sanitize; leading and trailing whitespace is ignored
    :return: sanitized string (empty if name is empty or contains only whitespace)
    """
    def sanitize_character(c):
        replacements = {' ': '_', '-': '_', '.': '_'}
        c = replacements.get(c, c)  # first, see if there is a hard-wired replacement
        if c.isalnum() or c == '_':  # keep allowed characters
            return c
        # all others are changed into a reduced & compatible form of their unicode name;
        # characters that have no unicode name (e.g., control characters) fall back to their code point
        unicode_name = unicodedata.name(c, f'U{ord(c):04X}')
        # unicode names can themselves contain hyphens (e.g., "LESS-THAN SIGN"), which must not leak through
        return '_' + unicode_name.replace(' SIGN', '').replace(' ', '_').replace('-', '_')

    # make replacements in order to get a compliant displayID
    display_id = ''.join(sanitize_character(c) for c in name.strip())
    # prepend underscore if there is an initial digit; slicing keeps an empty result from raising IndexError
    if display_id[:1].isdigit():
        display_id = '_' + display_id
    return display_id


def url_to_identity(url: str) -> str:
    """Turn a URL into an identity by sanitizing its final path segment as a displayId

    Everything up to the last "/" is kept verbatim; the text after it is passed through string_to_display_id.

    :param url: URL to sanitize
    :return: equivalent identity
    """
    pieces = url.rsplit('/', 1)
    prefix = pieces[0]
    display_id = string_to_display_id(pieces[1])
    return prefix + '/' + display_id


def is_plasmid(obj: Union[sbol3.Component, sbol3.Feature]) -> bool:
"""Check if an SBOL Component or Feature is a plasmid-like structure, i.e., either circular or having a plasmid role
Expand Down Expand Up @@ -166,3 +239,17 @@ def replace_feature(component, old, new):
for ct in component.constraints:
if ct.subject == old.identity: ct.subject = new.identity
if ct.object == old.identity: ct.object = new.identity


# TODO: remove kludge after resolution of https://github.com/SynBioDex/tyto/issues/21
tyto_cache = {}  # term -> URI found for it, or the LookupError raised while looking it up


def tyto_lookup_with_caching(term: str) -> str:
    """Look up a term with tyto.SO.get_uri_by_term, remembering both successes and failures

    :param term: term to look up
    :return: the URI returned by tyto for the term
    :raises LookupError: if the lookup failed, either now or on an earlier call (the cached error is re-raised)
    """
    if term not in tyto_cache:
        try:
            outcome = tyto.SO.get_uri_by_term(term)
        except LookupError as failure:
            outcome = failure  # cache the failure too, so the lookup is not retried
        tyto_cache[term] = outcome
    result = tyto_cache[term]
    if isinstance(result, LookupError):
        raise result
    return result
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
long_description_content_type='text/markdown',
url='https://github.com/SynBioDex/SBOL-utilities',
license='MIT License',
version='1.0a8',
version='1.0a9',
# See https://pypi.python.org/pypi?%3Aaction=list_classifiers
classifiers=[
# How mature is this project? Common values are
Expand Down
1 change: 1 addition & 0 deletions test/test_files/simple_library.nt
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,7 @@
<https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA> <http://sbols.org/v3#name> "LmrA" .
<https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA> <http://sbols.org/v3#type> <https://identifiers.org/SBO:0000251> .
<https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#Component> .
<https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA> <http://www.w3.org/ns/prov#wasDerivedFrom> <https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA/1> .
<https://www.ncbi.nlm.nih.gov/nuccore/ATP07149_1> <http://sbols.org/v3#description> "Cyan FP; codon optimized using IDT tool <https://www.idtdna.com/CodonOpt>\nmCerulean3" .
<https://www.ncbi.nlm.nih.gov/nuccore/ATP07149_1> <http://sbols.org/v3#displayId> "ATP07149_1" .
<https://www.ncbi.nlm.nih.gov/nuccore/ATP07149_1> <http://sbols.org/v3#hasNamespace> <http://sbolstandard.org/testfiles/> .
Expand Down
Binary file modified test/test_files/simple_library.xlsx
Binary file not shown.
23 changes: 12 additions & 11 deletions test/test_files/two_backbones.nt
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@
<http://sbolstandard.org/testfiles/Backbone_variants/VariableFeature1> <http://sbols.org/v3#cardinality> <http://sbols.org/v3#one> .
<http://sbolstandard.org/testfiles/Backbone_variants/VariableFeature1> <http://sbols.org/v3#displayId> "VariableFeature1" .
<http://sbolstandard.org/testfiles/Backbone_variants/VariableFeature1> <http://sbols.org/v3#variable> <http://sbolstandard.org/testfiles/Backbone_variants_template/LocalSubComponent1> .
<http://sbolstandard.org/testfiles/Backbone_variants/VariableFeature1> <http://sbols.org/v3#variant> <http://sbolstandard.org/testfiles/pOpen_v4> .
<http://sbolstandard.org/testfiles/Backbone_variants/VariableFeature1> <http://sbols.org/v3#variant> <http://sbolstandard.org/testfiles/pSB1C3> .
<http://sbolstandard.org/testfiles/Backbone_variants/VariableFeature1> <http://sbols.org/v3#variant> <https://freegenes.github.io/genbank/BBF10K_000589> .
<http://sbolstandard.org/testfiles/Backbone_variants/VariableFeature1> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#VariableFeature> .
<http://sbolstandard.org/testfiles/Backbone_variants> <http://sbols.org/v3#displayId> "Backbone_variants" .
<http://sbolstandard.org/testfiles/Backbone_variants> <http://sbols.org/v3#hasNamespace> <http://sbolstandard.org/testfiles/> .
Expand Down Expand Up @@ -114,8 +114,8 @@
<http://sbolstandard.org/testfiles/BasicParts> <http://sbols.org/v3#member> <http://parts.igem.org/J364007> .
<http://sbolstandard.org/testfiles/BasicParts> <http://sbols.org/v3#member> <http://parts.igem.org/J364009> .
<http://sbolstandard.org/testfiles/BasicParts> <http://sbols.org/v3#member> <http://parts.igem.org/R0040> .
<http://sbolstandard.org/testfiles/BasicParts> <http://sbols.org/v3#member> <http://sbolstandard.org/testfiles/pOpen_v4> .
<http://sbolstandard.org/testfiles/BasicParts> <http://sbols.org/v3#member> <http://sbolstandard.org/testfiles/pSB1C3> .
<http://sbolstandard.org/testfiles/BasicParts> <http://sbols.org/v3#member> <https://freegenes.github.io/genbank/BBF10K_000589> .
<http://sbolstandard.org/testfiles/BasicParts> <http://sbols.org/v3#name> "Fluorescence calibration basic parts" .
<http://sbolstandard.org/testfiles/BasicParts> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#Collection> .
<http://sbolstandard.org/testfiles/CompositeParts> <http://sbols.org/v3#description> "Designs for round 1 if iGEM 2021 calibration experimentation" .
Expand Down Expand Up @@ -182,8 +182,8 @@
<http://sbolstandard.org/testfiles/Two_by_six/VariableFeature2> <http://sbols.org/v3#cardinality> <http://sbols.org/v3#one> .
<http://sbolstandard.org/testfiles/Two_by_six/VariableFeature2> <http://sbols.org/v3#displayId> "VariableFeature2" .
<http://sbolstandard.org/testfiles/Two_by_six/VariableFeature2> <http://sbols.org/v3#variable> <http://sbolstandard.org/testfiles/Two_by_six_template/LocalSubComponent2> .
<http://sbolstandard.org/testfiles/Two_by_six/VariableFeature2> <http://sbols.org/v3#variant> <http://sbolstandard.org/testfiles/pOpen_v4> .
<http://sbolstandard.org/testfiles/Two_by_six/VariableFeature2> <http://sbols.org/v3#variant> <http://sbolstandard.org/testfiles/pSB1C3> .
<http://sbolstandard.org/testfiles/Two_by_six/VariableFeature2> <http://sbols.org/v3#variant> <https://freegenes.github.io/genbank/BBF10K_000589> .
<http://sbolstandard.org/testfiles/Two_by_six/VariableFeature2> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#VariableFeature> .
<http://sbolstandard.org/testfiles/Two_by_six> <http://sbols.org/v3#displayId> "Two_by_six" .
<http://sbolstandard.org/testfiles/Two_by_six> <http://sbols.org/v3#hasNamespace> <http://sbolstandard.org/testfiles/> .
Expand Down Expand Up @@ -246,14 +246,6 @@
<http://sbolstandard.org/testfiles/Two_by_six_template> <http://sbols.org/v3#hasNamespace> <http://sbolstandard.org/testfiles/> .
<http://sbolstandard.org/testfiles/Two_by_six_template> <http://sbols.org/v3#type> <https://identifiers.org/SBO:0000251> .
<http://sbolstandard.org/testfiles/Two_by_six_template> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#Component> .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://sbols.org/v3#description> "" .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://sbols.org/v3#displayId> "pOpen_v4" .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://sbols.org/v3#hasNamespace> <http://sbolstandard.org/testfiles/> .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://sbols.org/v3#name> "pOpen_v4" .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://sbols.org/v3#role> <https://identifiers.org/SO:0000155> .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://sbols.org/v3#type> <https://identifiers.org/SBO:0000251> .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://sbols.org/v3#type> <https://identifiers.org/SO:0000988> .
<http://sbolstandard.org/testfiles/pOpen_v4> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#Component> .
<http://sbolstandard.org/testfiles/pSB1C3> <http://sbols.org/v3#description> "pMB1 replicon, 100-300 copy" .
<http://sbolstandard.org/testfiles/pSB1C3> <http://sbols.org/v3#displayId> "pSB1C3" .
<http://sbolstandard.org/testfiles/pSB1C3> <http://sbols.org/v3#hasNamespace> <http://sbolstandard.org/testfiles/> .
Expand All @@ -262,3 +254,12 @@
<http://sbolstandard.org/testfiles/pSB1C3> <http://sbols.org/v3#type> <https://identifiers.org/SBO:0000251> .
<http://sbolstandard.org/testfiles/pSB1C3> <http://sbols.org/v3#type> <https://identifiers.org/SO:0000988> .
<http://sbolstandard.org/testfiles/pSB1C3> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#Component> .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://sbols.org/v3#description> "" .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://sbols.org/v3#displayId> "BBF10K_000589" .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://sbols.org/v3#hasNamespace> <http://sbolstandard.org/testfiles/> .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://sbols.org/v3#name> "pOpen_v4" .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://sbols.org/v3#role> <https://identifiers.org/SO:0000155> .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://sbols.org/v3#type> <https://identifiers.org/SBO:0000251> .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://sbols.org/v3#type> <https://identifiers.org/SO:0000988> .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://sbols.org/v3#Component> .
<https://freegenes.github.io/genbank/BBF10K_000589> <http://www.w3.org/ns/prov#wasDerivedFrom> <https://freegenes.github.io/genbank/BBF10K_000589.gb> .
Binary file modified test/test_files/two_backbones.xlsx
Binary file not shown.
17 changes: 16 additions & 1 deletion test/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import difflib
import filecmp
import unittest

from sbol_utilities.helper_functions import *
Expand Down Expand Up @@ -27,11 +29,24 @@ def test_sequence_validators(self):
assert unambiguous_protein_sequence('tklqpntvir')
assert not unambiguous_protein_sequence('tklqxpntvir')

def test_sbol2_version_stripping(self):
def test_url_sanitization(self):
    """Check the helpers that clean up URLs: version stripping, displayId cleaning, file type suffixes"""
    # SBOL2 version stripping:
    assert strip_sbol2_version('https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA/1') == \
        'https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA'
    assert strip_sbol2_version('https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA') == \
        'https://synbiohub.programmingbiology.org/public/Eco1C1G1T1/LmrA'

    # displayId cleaning:
    assert string_to_display_id('GB30248.1') == 'GB30248_1'
    assert url_to_identity('http://foo/bar/baz.qux') == 'http://foo/bar/baz_qux'

    # extension detection and stripping
    assert design_file_type('something.fasta') == 'FASTA'
    assert design_file_type('something.xlsx') is None  # identity check is the correct comparison for None
    assert design_file_type('something.xml') == 'SBOL2'
    assert design_file_type('something.nt') == 'SBOL3'
    assert strip_filetype_suffix('http://foo/bar/baz.gb') == 'http://foo/bar/baz'
    assert strip_filetype_suffix('http://foo/bar/baz.qux') == 'http://foo/bar/baz.qux'

if __name__ == '__main__':
unittest.main()

0 comments on commit e51335d

Please sign in to comment.