Skip to content

Commit

Permalink
Merge pull request #399 from GateNLP/export-rejected
Browse files Browse the repository at this point in the history
Include details of rejected/aborted/timed out annotations in the export
  • Loading branch information
ianroberts authored Feb 26, 2024
2 parents 0e64b88 + 41fd491 commit 834af01
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 9 deletions.
24 changes: 22 additions & 2 deletions backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
# Create dictionary for document
doc_dict = None
if json_format == "raw" or json_format == "csv":
doc_dict = self.data
doc_dict = self.data.copy()
elif json_format == "gate":

ignore_keys = {"text", self.project.document_id_field}
Expand All @@ -990,7 +990,6 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
"offset_type": "p",
"name": get_value_from_key_path(self.data, self.project.document_id_field)
}
pass

# Insert annotation sets into the doc dict
annotations = self.annotations.filter(status=Annotation.COMPLETED)
Expand Down Expand Up @@ -1039,6 +1038,27 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
annotation_sets[annotation.user.username] = annotation_set
doc_dict["annotation_sets"] = annotation_sets

# Add to the export the lists (possibly empty) of users who rejected,
# timed out or aborted annotation of this document
teamware_status = {}
for key, status in [
("rejected_by", Annotation.REJECTED),
("timed_out", Annotation.TIMED_OUT),
("aborted", Annotation.ABORTED),
]:
teamware_status[key] = [
annotation.user.id if anonymize else annotation.user.username
for annotation in self.annotations.filter(status=status)
]
if json_format == "csv":
# Flatten list if exporting as CSV
teamware_status[key] = ",".join(str(val) for val in teamware_status[key])

if json_format == "gate":
doc_dict["features"]["teamware_status"] = teamware_status
else:
doc_dict["teamware_status"] = teamware_status

return doc_dict


Expand Down
32 changes: 31 additions & 1 deletion backend/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,7 +1099,9 @@ class TestDocumentAnnotationModelExport(TestCase):

def setUp(self):
self.test_user = get_user_model().objects.create(username="project_creator")
self.annotators = [get_user_model().objects.create(username=f"anno{i}") for i in range(3)]
self.annotator_names = [f"anno{i}" for i in range(3)]
self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names]
self.annotator_ids = [a.id for a in self.annotators]
self.project = Project.objects.create(owner=self.test_user)
for i in range(10):
document = Document.objects.create(
Expand Down Expand Up @@ -1154,6 +1156,7 @@ def test_export_raw(self):
self.assertTrue("feature3" in doc_dict)

self.check_raw_gate_annotation_formatting(doc_dict)
self.check_teamware_status(doc_dict, self.annotator_ids)

def test_export_gate(self):

Expand All @@ -1170,6 +1173,7 @@ def test_export_gate(self):
self.assertTrue("feature3" in doc_features)

self.check_raw_gate_annotation_formatting(doc_dict)
self.check_teamware_status(doc_features, self.annotator_ids)

def check_raw_gate_annotation_formatting(self, doc_dict):
self.assertTrue("annotation_sets" in doc_dict)
Expand All @@ -1191,6 +1195,18 @@ def check_raw_gate_annotation_formatting(self, doc_dict):
self.assertTrue("text1" in label_dict)
self.assertTrue("checkbox1" in label_dict)

def check_teamware_status(self, containing_dict, expected_value):
# Shared assertion helper: checks that containing_dict has a "teamware_status"
# entry and that its "rejected_by", "aborted" and "timed_out" values all match
# expected_value.
self.assertTrue("teamware_status" in containing_dict)
teamware_status = containing_dict["teamware_status"]
# A string expected_value is compared for exact equality with each of the three
# values (the CSV export test passes a comma-joined string of ids).
if isinstance(expected_value, str):
self.assertEqual(teamware_status["rejected_by"], expected_value)
self.assertEqual(teamware_status["aborted"], expected_value)
self.assertEqual(teamware_status["timed_out"], expected_value)
else:
# Any other expected_value is treated as an iterable; both sides are converted
# to sets, so ordering and duplicates are ignored by the comparison.
self.assertSetEqual(set(teamware_status["rejected_by"]), set(expected_value))
self.assertSetEqual(set(teamware_status["aborted"]), set(expected_value))
self.assertSetEqual(set(teamware_status["timed_out"]), set(expected_value))

def test_export_csv(self):

for document in self.project.documents.all():
Expand All @@ -1209,6 +1225,8 @@ def test_export_csv(self):
self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))

self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids))

def test_export_raw_anonymized(self):

for document in self.project.documents.all():
Expand All @@ -1217,6 +1235,8 @@ def test_export_raw_anonymized(self):
for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), int))

self.check_teamware_status(doc_dict, self.annotator_ids)

def test_export_raw_deanonymized(self):

for document in self.project.documents.all():
Expand All @@ -1225,6 +1245,10 @@ def test_export_raw_deanonymized(self):
for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), str))

# for non-anonymized export the rejected/aborted/timed_out status
# uses names rather than ID numbers
self.check_teamware_status(doc_dict, self.annotator_names)

def test_export_gate_anonymized(self):

for document in self.project.documents.all():
Expand All @@ -1233,10 +1257,16 @@ def test_export_gate_anonymized(self):
for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), int))

self.check_teamware_status(doc_dict["features"], self.annotator_ids)

def test_export_gate_deanonymized(self):
# Exports the project's documents in "gate" format with anonymize=False and
# checks that annotators are identified by string names rather than integers.

for document in self.project.documents.all():
doc_dict = document.get_doc_annotation_dict("gate", anonymize=False)

# Every annotation set's "name" must be a str in the non-anonymized export.
for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), str))

# for non-anonymized export the rejected/aborted/timed_out status
# uses names rather than ID numbers
# (in the "gate" format the status dict is read from doc_dict["features"])
self.check_teamware_status(doc_dict["features"], self.annotator_names)
34 changes: 28 additions & 6 deletions docs/docs/manageradminguide/documents_annotations_management.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,14 +178,21 @@ The above column headers will generate the following JSON:
## Exporting documents

Documents and annotations can be exported using the **Export** button. A zip file is generated containing files with 500
documents each. You can choose how documents are exported:
documents each. The option to "anonymize annotators" controls whether the individual annotators are identified with
their numeric ID or by their actual username - since usernames are often personally identifiable information (e.g. an
email address) the anonymous mode is recommended if you intend to share the annotation data with third parties. Note
that the anonymous IDs are consistent within a single installation of Teamware, so even in anonymous mode it is still
possible to determine which documents were annotated by _the same person_, just not who that person was.

You can choose how documents are exported:

* `.json` & `.jsonl` - JSON or JSON Lines files can be generated in the format of:
* `raw` - Exports unmodified JSON. If you've originally uploaded in GATE format then choose this option.

An additional field named `annotation_sets` is added for storing annotations. The annotations are laid out in the
same way as GATE JSON format. For example if a document has been annotated by `user1` with labels and values
`text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`:
`text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`, the non-anonymous export might look
like this:

```json
{
Expand Down Expand Up @@ -216,13 +223,25 @@ documents each. You can choose how documents are exported:
],
"next_annid":1
}
},
"teamware_status": {
"rejected_by": ["user2"],
"timed_out": ["user3"],
"aborted": []
}
}
```

In anonymous mode the name `user1` would instead be the user's opaque numeric identifier (e.g. `105`).

The field `teamware_status` gives the ids or usernames (depending on the "anonymize" setting) of those annotators
who rejected the document, "timed out" because they did not complete their annotation in the time allowed by the
project, or "aborted" for some other reason (e.g. they were removed from the project).

* `gate` - Convert documents to GATE JSON format and export. A `name` field is added that takes the ID value from the
ID field specified in the project configuration. Fields apart from `text` and the ID field specified in the project
config are placed in the `features` field. An `annotation_sets` field is added for storing annotations.
config are placed in the `features` field, as is the `teamware_status` information. An `annotation_sets` field is
added for storing annotations.

For example in the case of this uploaded JSON document:
```json
Expand All @@ -233,21 +252,24 @@ documents each. You can choose how documents are exported:
"feature1": "Feature text"
}
```
The generated output is as follows. The annotations are formatted same as the `raw` output above:
The generated output is as follows. The annotations and `teamware_status` are formatted the same as the `raw` output
above:
```json
{
"name": 32,
"text": "Document text",
"features": {
"text2": "Document text 2",
"feature1": "Feature text"
"feature1": "Feature text",
"teamware_status": {...}
},
"offset_type":"p",
"annotation_sets": {...}
}
```
* `.csv` - The JSON documents will be flattened to CSV's column-based format. Annotations are added as additional
columns with the header of `annotations.username.label`.
columns with the header of `annotations.username.label` and the status information is in columns named
`teamware_status.rejected_by`, `teamware_status.timed_out` and `teamware_status.aborted`.

## Deleting documents and annotations

Expand Down

0 comments on commit 834af01

Please sign in to comment.