Skip to content

Commit

Permalink
#281 - Types with array range break JSON typesystem parsing
Browse files Browse the repository at this point in the history
- Fix handling of arrays marked using '[]'
- Also generate feature descriptions using this form for arrays
- Added test
  • Loading branch information
reckart committed Mar 21, 2023
1 parent a4d7fef commit 3d39d91
Show file tree
Hide file tree
Showing 7 changed files with 575 additions and 7 deletions.
30 changes: 25 additions & 5 deletions cassis/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,18 @@ def _parse_features(self, typesystem: TypeSystem, type_name: str, json_type: Dic
for key, json_feature in json_type.items():
if key.startswith(RESERVED_FIELD_PREFIX):
continue

range_type = json_feature[RANGE_FIELD]
element_type = json_feature.get(ELEMENT_TYPE_FIELD)
if range_type.endswith('[]'):
element_type = range_type[:-2]
range_type = array_type_name_for_type(element_type)
typesystem.create_feature(
new_type,
name=key,
rangeType=json_feature[RANGE_FIELD],
rangeType=range_type,
elementType=element_type,
description=json_feature.get(DESCRIPTION_FIELD),
elementType=json_feature.get(ELEMENT_TYPE_FIELD),
multipleReferencesAllowed=json_feature.get(MULTIPLE_REFERENCES_ALLOWED_FIELD),
)

Expand Down Expand Up @@ -214,7 +220,10 @@ def _parse_sofa(self, cas: Cas, fs_id: int, json_fs: Dict[str, any], feature_str
def _parse_feature_structure(
self, typesystem: TypeSystem, fs_id: int, json_fs: Dict[str, any], feature_structures: Dict[int, any]
):
AnnotationType = typesystem.get_type(json_fs.get(TYPE_FIELD))
type_name = json_fs.get(TYPE_FIELD)
if type_name.endswith('[]'):
type_name = array_type_name_for_type(type_name)
AnnotationType = typesystem.get_type(type_name)

attributes = dict(json_fs)

Expand Down Expand Up @@ -413,9 +422,20 @@ def _serialize_feature(self, json_type, feature: Feature):
if feature._has_reserved_name:
feature_name = feature_name[:-1]

range_type_name = self._to_external_type_name(feature.rangeType.name)
skip_element_type = False
if is_array(feature.rangeType):
skip_element_type = True
if is_primitive_array(feature.rangeType):
range_type_name = element_type_name_for_array_type(feature.rangeType) + "[]"
elif feature.elementType:
range_type_name = self._to_external_type_name(feature.elementType.name) + "[]"
else:
range_type_name = TYPE_NAME_TOP + "[]"

json_feature = {
NAME_FIELD: feature_name,
RANGE_FIELD: self._to_external_type_name(feature.rangeType.name),
RANGE_FIELD: range_type_name,
}

if feature.description:
Expand All @@ -424,7 +444,7 @@ def _serialize_feature(self, json_type, feature: Feature):
if feature.multipleReferencesAllowed is not None:
json_feature[MULTIPLE_REFERENCES_ALLOWED_FIELD] = feature.multipleReferencesAllowed

if feature.elementType is not None:
if not skip_element_type and feature.elementType is not None:
json_feature[ELEMENT_TYPE_FIELD] = self._to_external_type_name(feature.elementType.name)

return json_feature
Expand Down
41 changes: 40 additions & 1 deletion cassis/typesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,46 @@ class TypeSystemMode(Enum):
MINIMAL = auto()
NONE = auto()


def array_type_name_for_type(type_: Union[str, "Type"]) -> str:
    """Return the name of the array type used to hold elements of ``type_``.

    Args:
        type_: Either a type name or an object exposing the type name via
            its ``name`` attribute.

    Returns:
        The matching primitive array type name when the element type is
        byte, float, double, boolean, integer, short, long or string;
        ``TYPE_NAME_FS_ARRAY`` for every other name.

    Note:
        Lookup is by exact name equality. A name that is not exactly one of
        the element type names above (for instance because it carries extra
        decoration) is therefore treated as a feature structure type.
    """
    if isinstance(type_, str):
        element_name = type_
    else:
        element_name = type_.name

    # Ordered (element type name, array type name) pairs; the first equal
    # entry wins.
    element_to_array = (
        (TYPE_NAME_BYTE, TYPE_NAME_BYTE_ARRAY),
        (TYPE_NAME_FLOAT, TYPE_NAME_FLOAT_ARRAY),
        (TYPE_NAME_DOUBLE, TYPE_NAME_DOUBLE_ARRAY),
        (TYPE_NAME_BOOLEAN, TYPE_NAME_BOOLEAN_ARRAY),
        (TYPE_NAME_INTEGER, TYPE_NAME_INTEGER_ARRAY),
        (TYPE_NAME_SHORT, TYPE_NAME_SHORT_ARRAY),
        (TYPE_NAME_LONG, TYPE_NAME_LONG_ARRAY),
        (TYPE_NAME_STRING, TYPE_NAME_STRING_ARRAY),
    )
    for candidate, array_name in element_to_array:
        if element_name == candidate:
            return array_name

    # Anything that is not one of the listed primitives maps to the generic
    # feature structure array.
    return TYPE_NAME_FS_ARRAY


def element_type_name_for_array_type(type_: Union[str, "Type"]) -> str:
    """Return the element type name for the array type ``type_``.

    Args:
        type_: Either an array type name or an object exposing the type name
            via its ``name`` attribute.

    Returns:
        The primitive element type name when ``type_`` is the byte, float,
        double, boolean, integer, short, long or string array type;
        ``TYPE_NAME_TOP`` for every other name.
    """
    if isinstance(type_, str):
        array_name = type_
    else:
        array_name = type_.name

    # Ordered (array type name, element type name) pairs; the first equal
    # entry wins.
    array_to_element = (
        (TYPE_NAME_BYTE_ARRAY, TYPE_NAME_BYTE),
        (TYPE_NAME_FLOAT_ARRAY, TYPE_NAME_FLOAT),
        (TYPE_NAME_DOUBLE_ARRAY, TYPE_NAME_DOUBLE),
        (TYPE_NAME_BOOLEAN_ARRAY, TYPE_NAME_BOOLEAN),
        (TYPE_NAME_INTEGER_ARRAY, TYPE_NAME_INTEGER),
        (TYPE_NAME_SHORT_ARRAY, TYPE_NAME_SHORT),
        (TYPE_NAME_LONG_ARRAY, TYPE_NAME_LONG),
        (TYPE_NAME_STRING_ARRAY, TYPE_NAME_STRING),
    )
    for candidate, element_name in array_to_element:
        if array_name == candidate:
            return element_name

    # Names that are not one of the listed primitive arrays fall back to the
    # top type.
    return TYPE_NAME_TOP
def _string_to_valid_classname(name: str):
return re.sub("[^a-zA-Z0-9_]", "_", name)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
{
"%TYPES" : {
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" : {
"%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
"%SUPER_TYPE" : "uima.tcas.DocumentAnnotation",
"documentTitle" : {
"%NAME" : "documentTitle",
"%RANGE" : "uima.cas.String"
},
"documentId" : {
"%NAME" : "documentId",
"%RANGE" : "uima.cas.String"
},
"documentUri" : {
"%NAME" : "documentUri",
"%RANGE" : "uima.cas.String"
},
"collectionId" : {
"%NAME" : "collectionId",
"%RANGE" : "uima.cas.String"
},
"documentBaseUri" : {
"%NAME" : "documentBaseUri",
"%RANGE" : "uima.cas.String"
},
"isLastSegment" : {
"%NAME" : "isLastSegment",
"%RANGE" : "uima.cas.Boolean"
}
},
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" : {
"%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"%SUPER_TYPE" : "uima.tcas.Annotation",
"id" : {
"%NAME" : "id",
"%RANGE" : "uima.cas.String"
}
},
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" : {
"%NAME" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"%SUPER_TYPE" : "uima.tcas.Annotation",
"parent" : {
"%NAME" : "parent",
"%RANGE" : "uima.tcas.Annotation"
},
"id" : {
"%NAME" : "id",
"%RANGE" : "uima.cas.String"
},
"syntacticFunction" : {
"%NAME" : "syntacticFunction",
"%RANGE" : "uima.cas.String"
},
"order" : {
"%NAME" : "order",
"%RANGE" : "uima.cas.Integer"
}
},
"webanno.custom.LinkType" : {
"%NAME" : "webanno.custom.LinkType",
"%SUPER_TYPE" : "uima.cas.TOP",
"role" : {
"%NAME" : "role",
"%RANGE" : "uima.cas.String"
},
"target" : {
"%NAME" : "target",
"%RANGE" : "webanno.custom.SimpleSpan"
}
},
"webanno.custom.SimpleLinkHost" : {
"%NAME" : "webanno.custom.SimpleLinkHost",
"%SUPER_TYPE" : "uima.tcas.Annotation",
"links" : {
"%NAME" : "links",
"%RANGE" : "webanno.custom.LinkType[]"
}
},
"webanno.custom.SimpleSpan" : {
"%NAME" : "webanno.custom.SimpleSpan",
"%SUPER_TYPE" : "uima.tcas.Annotation"
}
},
"%FEATURE_STRUCTURES" : [ {
"%ID" : 2,
"%TYPE" : "uima.cas.Sofa",
"sofaNum" : 1,
"sofaID" : "_InitialView",
"mimeType" : "text",
"sofaString" : "This is a test ."
}, {
"%ID" : 1,
"%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
"@sofa" : 2,
"begin" : 0,
"end" : 16,
"documentId" : "doc",
"isLastSegment" : false
}, {
"%ID" : 3,
"%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"@sofa" : 2,
"begin" : 0,
"end" : 16
}, {
"%ID" : 4,
"%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"@sofa" : 2,
"begin" : 0,
"end" : 4,
"order" : 0
}, {
"%ID" : 5,
"%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"@sofa" : 2,
"begin" : 5,
"end" : 7,
"order" : 0
}, {
"%ID" : 6,
"%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"@sofa" : 2,
"begin" : 8,
"end" : 9,
"order" : 0
}, {
"%ID" : 7,
"%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"@sofa" : 2,
"begin" : 10,
"end" : 14,
"order" : 0
}, {
"%ID" : 8,
"%TYPE" : "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"@sofa" : 2,
"begin" : 15,
"end" : 16,
"order" : 0
}, {
"%ID" : 9,
"%TYPE" : "webanno.custom.LinkType",
"role" : "p2",
"@target" : 10
}, {
"%ID" : 10,
"%TYPE" : "webanno.custom.SimpleSpan",
"@sofa" : 2,
"begin" : 8,
"end" : 9
}, {
"%ID" : 11,
"%TYPE" : "webanno.custom.LinkType",
"role" : "p1",
"@target" : 12
}, {
"%ID" : 12,
"%TYPE" : "webanno.custom.SimpleSpan",
"@sofa" : 2,
"begin" : 5,
"end" : 7
}, {
"%ID" : 13,
"%TYPE" : "uima.cas.FSArray",
"%ELEMENTS" : [ 11, 9 ]
}, {
"%ID" : 14,
"%TYPE" : "webanno.custom.SimpleLinkHost",
"@sofa" : 2,
"begin" : 0,
"end" : 4,
"@links" : 13
} ],
"%VIEWS" : {
"_InitialView" : {
"%SOFA" : 2,
"%MEMBERS" : [ 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14 ]
}
}
}
Loading

0 comments on commit 3d39d91

Please sign in to comment.