From 3d39d91f0064687d3a6807159c9047ea9136623d Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 21 Mar 2023 18:27:20 +0100 Subject: [PATCH] #281 - Types with array range break JSON typesystem parsing - Fix handling of arrays marked using '[]' - Also generate feature descriptions using this form for arrays - Added test --- cassis/json.py | 30 ++- cassis/typesystem.py | 41 +++- .../tsv3-testSimpleSlotFeature/data-ref.json | 180 ++++++++++++++++++ .../tsv3-testSimpleSlotFeature/data.json | 180 ++++++++++++++++++ .../debug-typesystem.xml | 128 +++++++++++++ .../tsv3-testSimpleSlotFeature/debug.xmi | 17 ++ tests/test_json.py | 6 +- 7 files changed, 575 insertions(+), 7 deletions(-) create mode 100644 tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data-ref.json create mode 100644 tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data.json create mode 100644 tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug-typesystem.xml create mode 100644 tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug.xmi diff --git a/cassis/json.py b/cassis/json.py index 4b94a7f..d2dc270 100644 --- a/cassis/json.py +++ b/cassis/json.py @@ -170,12 +170,18 @@ def _parse_features(self, typesystem: TypeSystem, type_name: str, json_type: Dic for key, json_feature in json_type.items(): if key.startswith(RESERVED_FIELD_PREFIX): continue + + range_type = json_feature[RANGE_FIELD] + element_type = json_feature.get(ELEMENT_TYPE_FIELD) + if range_type.endswith('[]'): + element_type = range_type[:-2] + range_type = array_type_name_for_type(element_type) typesystem.create_feature( new_type, name=key, - rangeType=json_feature[RANGE_FIELD], + rangeType=range_type, + elementType=element_type, description=json_feature.get(DESCRIPTION_FIELD), - elementType=json_feature.get(ELEMENT_TYPE_FIELD), multipleReferencesAllowed=json_feature.get(MULTIPLE_REFERENCES_ALLOWED_FIELD), ) @@ -214,7 +220,10 @@ def 
_parse_sofa(self, cas: Cas, fs_id: int, json_fs: Dict[str, any], feature_str def _parse_feature_structure( self, typesystem: TypeSystem, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any] ): - AnnotationType = typesystem.get_type(json_fs.get(TYPE_FIELD)) + type_name = json_fs.get(TYPE_FIELD) + if type_name.endswith('[]'): + type_name = array_type_name_for_type(type_name) + AnnotationType = typesystem.get_type(type_name) attributes = dict(json_fs) @@ -413,9 +422,20 @@ def _serialize_feature(self, json_type, feature: Feature): if feature._has_reserved_name: feature_name = feature_name[:-1] + range_type_name = self._to_external_type_name(feature.rangeType.name) + skip_element_type = False + if is_array(feature.rangeType): + skip_element_type = True + if is_primitive_array(feature.rangeType): + range_type_name = element_type_name_for_array_type(feature.rangeType) + "[]" + elif feature.elementType: + range_type_name = self._to_external_type_name(feature.elementType.name) + "[]" + else: + range_type_name = TYPE_NAME_TOP + "[]" + json_feature = { NAME_FIELD: feature_name, - RANGE_FIELD: self._to_external_type_name(feature.rangeType.name), + RANGE_FIELD: range_type_name, } if feature.description: @@ -424,7 +444,7 @@ def _serialize_feature(self, json_type, feature: Feature): if feature.multipleReferencesAllowed is not None: json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed - if feature.elementType is not None: + if not skip_element_type and feature.elementType is not None: json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType.name) return json_feature diff --git a/cassis/typesystem.py b/cassis/typesystem.py index dcf904e..1a8a724 100644 --- a/cassis/typesystem.py +++ b/cassis/typesystem.py @@ -198,7 +198,46 @@ class TypeSystemMode(Enum): MINIMAL = auto() NONE = auto() - +def array_type_name_for_type(type_: Union[str, "Type"]) -> str: + type_name = type_ if isinstance(type_, str) else 
type_.name + if type_name == TYPE_NAME_BYTE: + return TYPE_NAME_BYTE_ARRAY + if type_name == TYPE_NAME_FLOAT: + return TYPE_NAME_FLOAT_ARRAY + if type_name == TYPE_NAME_DOUBLE: + return TYPE_NAME_DOUBLE_ARRAY + if type_name == TYPE_NAME_BOOLEAN: + return TYPE_NAME_BOOLEAN_ARRAY + if type_name == TYPE_NAME_INTEGER: + return TYPE_NAME_INTEGER_ARRAY + if type_name == TYPE_NAME_SHORT: + return TYPE_NAME_SHORT_ARRAY + if type_name == TYPE_NAME_LONG: + return TYPE_NAME_LONG_ARRAY + if type_name == TYPE_NAME_STRING: + return TYPE_NAME_STRING_ARRAY + return TYPE_NAME_FS_ARRAY + + +def element_type_name_for_array_type(type_: Union[str, "Type"]) -> str: + type_name = type_ if isinstance(type_, str) else type_.name + if type_name == TYPE_NAME_BYTE_ARRAY: + return TYPE_NAME_BYTE + if type_name == TYPE_NAME_FLOAT_ARRAY: + return TYPE_NAME_FLOAT + if type_name == TYPE_NAME_DOUBLE_ARRAY: + return TYPE_NAME_DOUBLE + if type_name == TYPE_NAME_BOOLEAN_ARRAY: + return TYPE_NAME_BOOLEAN + if type_name == TYPE_NAME_INTEGER_ARRAY: + return TYPE_NAME_INTEGER + if type_name == TYPE_NAME_SHORT_ARRAY: + return TYPE_NAME_SHORT + if type_name == TYPE_NAME_LONG_ARRAY: + return TYPE_NAME_LONG + if type_name == TYPE_NAME_STRING_ARRAY: + return TYPE_NAME_STRING + return TYPE_NAME_TOP def _string_to_valid_classname(name: str): return re.sub("[^a-zA-Z0-9_]", "_", name) diff --git a/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data-ref.json b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data-ref.json new file mode 100644 index 0000000..c33b02a --- /dev/null +++ b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data-ref.json @@ -0,0 +1,180 @@ +{ + "%TYPES" : { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" : { + "%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "%SUPER_TYPE" : "uima.tcas.DocumentAnnotation", + "documentTitle" : { + "%NAME" : "documentTitle", + "%RANGE" : 
"uima.cas.String" + }, + "documentId" : { + "%NAME" : "documentId", + "%RANGE" : "uima.cas.String" + }, + "documentUri" : { + "%NAME" : "documentUri", + "%RANGE" : "uima.cas.String" + }, + "collectionId" : { + "%NAME" : "collectionId", + "%RANGE" : "uima.cas.String" + }, + "documentBaseUri" : { + "%NAME" : "documentBaseUri", + "%RANGE" : "uima.cas.String" + }, + "isLastSegment" : { + "%NAME" : "isLastSegment", + "%RANGE" : "uima.cas.Boolean" + } + }, + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" : { + "%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "%SUPER_TYPE" : "uima.tcas.Annotation", + "id" : { + "%NAME" : "id", + "%RANGE" : "uima.cas.String" + } + }, + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" : { + "%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "%SUPER_TYPE" : "uima.tcas.Annotation", + "parent" : { + "%NAME" : "parent", + "%RANGE" : "uima.tcas.Annotation" + }, + "id" : { + "%NAME" : "id", + "%RANGE" : "uima.cas.String" + }, + "syntacticFunction" : { + "%NAME" : "syntacticFunction", + "%RANGE" : "uima.cas.String" + }, + "order" : { + "%NAME" : "order", + "%RANGE" : "uima.cas.Integer" + } + }, + "webanno.custom.LinkType" : { + "%NAME" : "webanno.custom.LinkType", + "%SUPER_TYPE" : "uima.cas.TOP", + "role" : { + "%NAME" : "role", + "%RANGE" : "uima.cas.String" + }, + "target" : { + "%NAME" : "target", + "%RANGE" : "webanno.custom.SimpleSpan" + } + }, + "webanno.custom.SimpleLinkHost" : { + "%NAME" : "webanno.custom.SimpleLinkHost", + "%SUPER_TYPE" : "uima.tcas.Annotation", + "links" : { + "%NAME" : "links", + "%RANGE" : "webanno.custom.LinkType[]" + } + }, + "webanno.custom.SimpleSpan" : { + "%NAME" : "webanno.custom.SimpleSpan", + "%SUPER_TYPE" : "uima.tcas.Annotation" + } + }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 2, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test ." 
+ }, { + "%ID" : 1, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "@sofa" : 2, + "begin" : 0, + "end" : 16, + "documentId" : "doc", + "isLastSegment" : false + }, { + "%ID" : 3, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "@sofa" : 2, + "begin" : 0, + "end" : 16 + }, { + "%ID" : 4, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 0, + "end" : 4, + "order" : 0 + }, { + "%ID" : 5, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 5, + "end" : 7, + "order" : 0 + }, { + "%ID" : 6, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 8, + "end" : 9, + "order" : 0 + }, { + "%ID" : 7, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 10, + "end" : 14, + "order" : 0 + }, { + "%ID" : 8, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 15, + "end" : 16, + "order" : 0 + }, { + "%ID" : 9, + "%TYPE" : "webanno.custom.LinkType", + "role" : "p2", + "@target" : 10 + }, { + "%ID" : 10, + "%TYPE" : "webanno.custom.SimpleSpan", + "@sofa" : 2, + "begin" : 8, + "end" : 9 + }, { + "%ID" : 11, + "%TYPE" : "webanno.custom.LinkType", + "role" : "p1", + "@target" : 12 + }, { + "%ID" : 12, + "%TYPE" : "webanno.custom.SimpleSpan", + "@sofa" : 2, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 13, + "%TYPE" : "uima.cas.FSArray", + "%ELEMENTS" : [ 11, 9 ] + }, { + "%ID" : 14, + "%TYPE" : "webanno.custom.SimpleLinkHost", + "@sofa" : 2, + "begin" : 0, + "end" : 4, + "@links" : 13 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 2, + "%MEMBERS" : [ 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data.json 
b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data.json new file mode 100644 index 0000000..00d71a2 --- /dev/null +++ b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/data.json @@ -0,0 +1,180 @@ +{ + "%TYPES" : { + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" : { + "%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "%SUPER_TYPE" : "uima.tcas.DocumentAnnotation", + "documentTitle" : { + "%NAME" : "documentTitle", + "%RANGE" : "uima.cas.String" + }, + "documentId" : { + "%NAME" : "documentId", + "%RANGE" : "uima.cas.String" + }, + "documentUri" : { + "%NAME" : "documentUri", + "%RANGE" : "uima.cas.String" + }, + "collectionId" : { + "%NAME" : "collectionId", + "%RANGE" : "uima.cas.String" + }, + "documentBaseUri" : { + "%NAME" : "documentBaseUri", + "%RANGE" : "uima.cas.String" + }, + "isLastSegment" : { + "%NAME" : "isLastSegment", + "%RANGE" : "uima.cas.Boolean" + } + }, + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" : { + "%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "%SUPER_TYPE" : "uima.tcas.Annotation", + "id" : { + "%NAME" : "id", + "%RANGE" : "uima.cas.String" + } + }, + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" : { + "%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "%SUPER_TYPE" : "uima.tcas.Annotation", + "parent" : { + "%NAME" : "parent", + "%RANGE" : "uima.tcas.Annotation" + }, + "id" : { + "%NAME" : "id", + "%RANGE" : "uima.cas.String" + }, + "syntacticFunction" : { + "%NAME" : "syntacticFunction", + "%RANGE" : "uima.cas.String" + }, + "order" : { + "%NAME" : "order", + "%RANGE" : "uima.cas.Integer" + } + }, + "webanno.custom.LinkType" : { + "%NAME" : "webanno.custom.LinkType", + "%SUPER_TYPE" : "uima.cas.TOP", + "role" : { + "%NAME" : "role", + "%RANGE" : "uima.cas.String" + }, + "target" : { + "%NAME" : "target", + "%RANGE" : "webanno.custom.SimpleSpan" + } + }, + 
"webanno.custom.SimpleLinkHost" : { + "%NAME" : "webanno.custom.SimpleLinkHost", + "%SUPER_TYPE" : "uima.tcas.Annotation", + "links" : { + "%NAME" : "links", + "%RANGE" : "webanno.custom.LinkType[]" + } + }, + "webanno.custom.SimpleSpan" : { + "%NAME" : "webanno.custom.SimpleSpan", + "%SUPER_TYPE" : "uima.tcas.Annotation" + } + }, + "%FEATURE_STRUCTURES" : [ { + "%ID" : 2, + "%TYPE" : "uima.cas.Sofa", + "sofaNum" : 1, + "sofaID" : "_InitialView", + "mimeType" : "text", + "sofaString" : "This is a test ." + }, { + "%ID" : 1, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "@sofa" : 2, + "begin" : 0, + "end" : 16, + "documentId" : "doc", + "isLastSegment" : false + }, { + "%ID" : 3, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", + "@sofa" : 2, + "begin" : 0, + "end" : 16 + }, { + "%ID" : 4, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 0, + "end" : 4, + "order" : 0 + }, { + "%ID" : 5, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 5, + "end" : 7, + "order" : 0 + }, { + "%ID" : 6, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 8, + "end" : 9, + "order" : 0 + }, { + "%ID" : 7, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 10, + "end" : 14, + "order" : 0 + }, { + "%ID" : 8, + "%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", + "@sofa" : 2, + "begin" : 15, + "end" : 16, + "order" : 0 + }, { + "%ID" : 9, + "%TYPE" : "webanno.custom.LinkType", + "role" : "p2", + "@target" : 10 + }, { + "%ID" : 11, + "%TYPE" : "webanno.custom.LinkType", + "role" : "p1", + "@target" : 12 + }, { + "%ID" : 13, + "%TYPE" : "webanno.custom.LinkType[]", + "%ELEMENTS" : [ 11, 9 ] + }, { + "%ID" : 14, + "%TYPE" : "webanno.custom.SimpleLinkHost", + "@sofa" : 2, + "begin" : 0, + "end" : 4, + "@links" : 13 + 
}, { + "%ID" : 12, + "%TYPE" : "webanno.custom.SimpleSpan", + "@sofa" : 2, + "begin" : 5, + "end" : 7 + }, { + "%ID" : 10, + "%TYPE" : "webanno.custom.SimpleSpan", + "@sofa" : 2, + "begin" : 8, + "end" : 9 + } ], + "%VIEWS" : { + "_InitialView" : { + "%SOFA" : 2, + "%MEMBERS" : [ 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14 ] + } + } +} \ No newline at end of file diff --git a/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug-typesystem.xml b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug-typesystem.xml new file mode 100644 index 0000000..998e9a3 --- /dev/null +++ b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug-typesystem.xml @@ -0,0 +1,128 @@ + + + + + uima.tcas.DocumentAnnotation + + uima.tcas.Annotation + + + language + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData + + uima.tcas.DocumentAnnotation + + + documentTitle + + uima.cas.String + + + documentId + + uima.cas.String + + + documentUri + + uima.cas.String + + + collectionId + + uima.cas.String + + + documentBaseUri + + uima.cas.String + + + isLastSegment + + uima.cas.Boolean + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence + + uima.tcas.Annotation + + + id + + uima.cas.String + + + + + de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token + + uima.tcas.Annotation + + + parent + + uima.tcas.Annotation + + + id + + uima.cas.String + + + syntacticFunction + + uima.cas.String + + + order + + uima.cas.Integer + + + + + webanno.custom.LinkType + + uima.cas.TOP + + + role + + uima.cas.String + + + target + + webanno.custom.SimpleSpan + + + + + webanno.custom.SimpleLinkHost + + uima.tcas.Annotation + + + links + + uima.cas.FSArray + webanno.custom.LinkType + + + + + webanno.custom.SimpleSpan + + uima.tcas.Annotation + + + diff --git a/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug.xmi 
b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug.xmi new file mode 100644 index 0000000..8ee3788 --- /dev/null +++ b/tests/test_files/json/fs_as_array/one-way/tsv3-testSimpleSlotFeature/debug.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/tests/test_json.py b/tests/test_json.py index 0c22ee8..1093212 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -81,13 +81,17 @@ ( os.path.join(SER_REF_DIR, "casExtendingDocumentAnnotation"), [["de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", 0, 16, "This is a test ."]], - ) + ), ] ONE_WAY_FIXTURES = [ ( os.path.join(ONE_WAY_DIR, "casWithBadSofaFsOrder"), [["de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", 0, 16, "This is a test ."]], + ), + ( + os.path.join(ONE_WAY_DIR, "tsv3-testSimpleSlotFeature"), + [], ) ]