Skip to content

Commit

Permalink
Merge pull request #377 from GateNLP/various-export-issues
Browse files Browse the repository at this point in the history
Fixes for various export related issues
  • Loading branch information
ianroberts authored Feb 26, 2024
2 parents 834af01 + e289227 commit 6993266
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 61 deletions.
29 changes: 17 additions & 12 deletions backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,22 +980,26 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
if json_format == "raw" or json_format == "csv":
doc_dict = self.data.copy()
elif json_format == "gate":
# GATE JSON format documents are expected to have an existing "features" field
features_dict = dict(self.data["features"]) if "features" in self.data and isinstance(self.data["features"], dict) else {}

ignore_keys = {"text", self.project.document_id_field}
features_dict = {key: value for key, value in self.data.items() if key not in ignore_keys}
# Add any non-compliant top-level fields into the "features" field instead
ignore_keys = {"text", "features", "offset_type", "annotation_sets", self.project.document_id_field}
features_dict.update({key: value for key, value in self.data.items() if key not in ignore_keys})

doc_dict = {
"text": self.data["text"],
"features": features_dict,
"offset_type": "p",
"offset_type": self.data["offset_type"] if "offset_type" in self.data else "p", # Use original offset type
"name": get_value_from_key_path(self.data, self.project.document_id_field)
}

# Insert annotation sets into the doc dict
annotations = self.annotations.filter(status=Annotation.COMPLETED)
if json_format == "csv":
# Gets pre-existing annotations
annotation_sets = dict(self.data["annotations"]) if "annotations" in self.data else {}
# Format annotations for CSV export
annotation_sets = {}
for annotation in annotations:
a_data = annotation.data
annotation_dict = {}
Expand All @@ -1008,34 +1012,35 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
annotation_dict["duration_seconds"] = annotation.time_to_complete

if anonymize:
annotation_sets[str(annotation.user.id)] = annotation_dict
annotation_sets[f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"] = annotation_dict
else:
annotation_sets[annotation.user.username] = annotation_dict

doc_dict["annotations"] = annotation_sets

else:
# Gets pre-existing annotations
annotation_sets = dict(self.data["annotation_sets"]) if "annotation_sets" in self.data else {}
# Format for JSON in line with GATE formatting
annotation_sets = {}
for annotation in annotations:
a_data = annotation.data
anonymized_name = f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"
annotation_set = {
"name": annotation.user.id if anonymize else annotation.user.username,
"name": anonymized_name if anonymize else annotation.user.username,
"annotations": [
{
"type": "Document",
"start": 0,
"end": 0,
"id": 0,
"duration_seconds": annotation.time_to_complete,
"features": {
"label": a_data
}
"features": a_data
}
],
"next_annid": 1,
}
annotation_sets[annotation.user.username] = annotation_set
annotation_sets[anonymized_name if anonymize else annotation.user.username] = annotation_set

doc_dict["annotation_sets"] = annotation_sets

# Add to the export the lists (possibly empty) of users who rejected,
Expand All @@ -1047,7 +1052,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
("aborted", Annotation.ABORTED),
]:
teamware_status[key] = [
annotation.user.id if anonymize else annotation.user.username
f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" if anonymize else annotation.user.username
for annotation in self.annotations.filter(status=status)
]
if json_format == "csv":
Expand Down
166 changes: 137 additions & 29 deletions backend/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1098,10 +1098,11 @@ def test_get_annotations_for_user_in_project(self):
class TestDocumentAnnotationModelExport(TestCase):

def setUp(self):
self.unanonymized_prefix = "namedperson"
self.test_user = get_user_model().objects.create(username="project_creator")
self.annotator_names = [f"anno{i}" for i in range(3)]
self.annotator_names = [f"{self.unanonymized_prefix}{i}" for i in range(3)]
self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names]
self.annotator_ids = [a.id for a in self.annotators]
self.anon_annotator_names = [f"{settings.ANONYMIZATION_PREFIX}{a.id}" for a in self.annotators]
self.project = Project.objects.create(owner=self.test_user)
for i in range(10):
document = Document.objects.create(
Expand All @@ -1112,6 +1113,55 @@ def setUp(self):
"feature1": "Testvalue 1",
"feature2": "Testvalue 1",
"feature3": "Testvalue 1",
"features": {
"gate_format_feature1": "Gate feature test value",
"gate_format_feature2": "Gate feature test value",
"gate_format_feature3": "Gate feature test value",
},
"offset_type": "x",
"annotations": {
"existing_annotator1": {
"sentiment": "positive"
},
f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": {
"sentiment": "positive"
}

},
"annotation_sets": {
"existing_annotator1": {
"name": "existing_annotator1",
"annotations": [
{
"type": "Document",
"start": 0,
"end": 10,
"id": 0,
"features": {
"sentiment": "positive"
}
}
],
"next_annid": 1
},
f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": {
"name": f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}",
"annotations": [
{
"type": "Document",
"start": 0,
"end": 10,
"id": 0,
"features": {
"sentiment": "positive"
}
}
],
"next_annid": 1
}

}


}
)
Expand Down Expand Up @@ -1147,53 +1197,88 @@ def setUp(self):
def test_export_raw(self):

for document in self.project.documents.all():
# Fields should remain exactly the same as what's been uploaded
# aside from annotation_sets
doc_dict = document.get_doc_annotation_dict("raw")
print(doc_dict)
self.assertTrue("id" in doc_dict)
self.assertTrue("text" in doc_dict)
self.assertTrue("feature1" in doc_dict)
self.assertTrue("feature2" in doc_dict)
self.assertTrue("feature3" in doc_dict)
self.assertTrue("features" in doc_dict)
self.assertTrue("offset_type" in doc_dict)
self.assertTrue("annotations" in doc_dict)
doc_features = doc_dict["features"]
self.assertTrue("gate_format_feature1" in doc_features)
self.assertTrue("gate_format_feature2" in doc_features)
self.assertTrue("gate_format_feature3" in doc_features)


self.check_raw_gate_annotation_formatting(doc_dict)
self.check_teamware_status(doc_dict, self.annotator_ids)
self.check_teamware_status(doc_dict, self.anon_annotator_names)

def test_export_gate(self):

for document in self.project.documents.all():
# All top-level fields apart from name, text, features and annotation_sets should be
# nested inside the features field
doc_dict = document.get_doc_annotation_dict("gate")
print(doc_dict)

self.assertTrue("text" in doc_dict)
self.assertTrue("features" in doc_dict)
self.assertFalse("annotations" in doc_dict)
self.assertEqual(doc_dict["offset_type"], "x")
doc_features = doc_dict["features"]
self.assertTrue("id" in doc_features)
self.assertTrue("feature1" in doc_features)
self.assertTrue("feature2" in doc_features)
self.assertTrue("feature3" in doc_features)
self.assertTrue("annotations" in doc_features)
self.assertFalse("features" in doc_features, "Double nesting of features field")
self.assertFalse("offset_type" in doc_features, "Double nesting of offset_type field")
self.assertTrue("gate_format_feature1" in doc_features)
self.assertTrue("gate_format_feature2" in doc_features)
self.assertTrue("gate_format_feature3" in doc_features)

self.check_raw_gate_annotation_formatting(doc_dict)
self.check_teamware_status(doc_features, self.annotator_ids)
self.check_teamware_status(doc_features, self.anon_annotator_names)

def test_export_gate_with_no_offset_type(self):

def check_raw_gate_annotation_formatting(self, doc_dict):
for document in self.project.documents.all():
document.data.pop("offset_type")

doc_dict = document.get_doc_annotation_dict("gate")
self.assertEqual(doc_dict["offset_type"], "p", "offset_type should default to p")


def check_raw_gate_annotation_formatting(self, doc_dict: dict):
self.assertTrue("annotation_sets" in doc_dict)
self.assertTrue(len(doc_dict["annotation_sets"]) == 3)
self.assertEqual(len(doc_dict["annotation_sets"]), 4, doc_dict)

# Test annotation formatting
for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue("name" in aset_data)
self.assertTrue("annotations" in aset_data)
self.assertEqual(len(aset_data["annotations"]), 1)
anno_dict = aset_data["annotations"][0]
self.assertTrue("type" in anno_dict)
self.assertTrue("start" in anno_dict)
self.assertTrue("end" in anno_dict)
self.assertTrue("id" in anno_dict)
self.assertTrue("features" in anno_dict)
self.assertTrue("label" in anno_dict["features"])
label_dict = anno_dict["features"]["label"]
self.assertTrue("text1" in label_dict)
self.assertTrue("checkbox1" in label_dict)
if aset_key != "existing_annotator1":
self.assertTrue("name" in aset_data)
self.assertTrue("annotations" in aset_data)
self.assertEqual(len(aset_data["annotations"]), 1)
anno_dict = aset_data["annotations"][0]
self.assertTrue("type" in anno_dict)
self.assertTrue("start" in anno_dict)
self.assertTrue("end" in anno_dict)
self.assertTrue("id" in anno_dict)
self.assertTrue("features" in anno_dict)
features_dict = anno_dict["features"]
self.assertTrue("text1" in features_dict)
self.assertTrue("checkbox1" in features_dict)
else:
# Check that existing annotation from document upload is carried over
self.assertEqual(aset_data["annotations"][0]["features"]["sentiment"], "positive")




def check_teamware_status(self, containing_dict, expected_value):
self.assertTrue("teamware_status" in containing_dict)
Expand All @@ -1219,31 +1304,44 @@ def test_export_csv(self):
self.assertTrue("feature2" in doc_dict)
self.assertTrue("feature3" in doc_dict)
self.assertTrue("annotations" in doc_dict)
self.assertTrue(len(doc_dict["annotations"]) == 3)
self.assertEqual(len(doc_dict["annotations"]), 4, doc_dict)
anno_set_dict = doc_dict["annotations"]
for set_key in anno_set_dict:
self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
if set_key != "existing_annotator1":
self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
else:
self.assertEqual(anno_set_dict[set_key]["sentiment"], "positive")

self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids))
self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.anon_annotator_names))

def test_export_raw_anonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("raw", anonymize=True)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), int))
self.assertFalse(aset_key.startswith(self.unanonymized_prefix))
self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix))

self.check_teamware_status(doc_dict, self.annotator_ids)
self.check_teamware_status(doc_dict, self.anon_annotator_names)

def test_export_raw_deanonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("raw", anonymize=False)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), str))
self.assertTrue(aset_key.startswith(self.unanonymized_prefix))
self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix))

# for non-anonymized export the rejected/aborted/timed_out status
# uses names rather than ID numbers
Expand All @@ -1252,20 +1350,30 @@ def test_export_raw_deanonymized(self):
def test_export_gate_anonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("gate", anonymize=True)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), int))
self.assertFalse(aset_key.startswith(self.unanonymized_prefix))
self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix))

self.check_teamware_status(doc_dict["features"], self.annotator_ids)
self.check_teamware_status(doc_dict["features"], self.anon_annotator_names)

def test_export_gate_deanonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("gate", anonymize=False)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), str))
self.assertTrue(aset_key.startswith(self.unanonymized_prefix))
self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix))

# for non-anonymized export the rejected/aborted/timed_out status
# uses names rather than ID numbers
Expand Down
Loading

0 comments on commit 6993266

Please sign in to comment.