From 641eeafa7afe8aa4c6c2d1ee3ef2ed754d58e09e Mon Sep 17 00:00:00 2001 From: Ian Roberts Date: Fri, 23 Feb 2024 13:19:28 +0000 Subject: [PATCH 1/3] Include rejected/aborted/timed out annotations in export Add a "teamware_status" section to the export formats detailing which users (if any) have rejected, aborted or timed out annotations on each document. - for "raw" JSON this is a dict added to the top-level JSON object, with properties whose values are a JSON list of ID numbers (for anonymous) or names (for non-anonymous) of the relevant annotators - for "gate" JSON it's the same dict added under the "features" section - for "csv" the lists of IDs/names are flattened into a string the same way as multi-valued annotation elements --- backend/models.py | 21 +++++++++++++++++++++ backend/tests/test_models.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/backend/models.py b/backend/models.py index 249638a9..8ddb800a 100644 --- a/backend/models.py +++ b/backend/models.py @@ -1039,6 +1039,27 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): annotation_sets[annotation.user.username] = annotation_set doc_dict["annotation_sets"] = annotation_sets + # Add to the export the lists (possibly empty) of users who rejected, + # timed out or aborted annotation of this document + teamware_status = {} + for key, status in [ + ("rejected_by", Annotation.REJECTED), + ("timed_out", Annotation.TIMED_OUT), + ("aborted", Annotation.ABORTED), + ]: + teamware_status[key] = [ + annotation.user.id if anonymize else annotation.user.username + for annotation in self.annotations.filter(status=status) + ] + if json_format == "csv": + # Flatten list if exporting as CSV + teamware_status[key] = ",".join(str(val) for val in teamware_status[key]) + + if json_format == "gate": + doc_dict["features"]["teamware_status"] = teamware_status + else: + doc_dict["teamware_status"] = teamware_status + return doc_dict diff --git 
a/backend/tests/test_models.py b/backend/tests/test_models.py index 0dfa9bcb..2002937e 100644 --- a/backend/tests/test_models.py +++ b/backend/tests/test_models.py @@ -1099,7 +1099,9 @@ class TestDocumentAnnotationModelExport(TestCase): def setUp(self): self.test_user = get_user_model().objects.create(username="project_creator") - self.annotators = [get_user_model().objects.create(username=f"anno{i}") for i in range(3)] + self.annotator_names = [f"anno{i}" for i in range(3)] + self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names] + self.annotator_ids = [a.id for a in self.annotators] self.project = Project.objects.create(owner=self.test_user) for i in range(10): document = Document.objects.create( @@ -1154,6 +1156,7 @@ def test_export_raw(self): self.assertTrue("feature3" in doc_dict) self.check_raw_gate_annotation_formatting(doc_dict) + self.check_teamware_status(doc_dict, self.annotator_ids) def test_export_gate(self): @@ -1170,6 +1173,7 @@ def test_export_gate(self): self.assertTrue("feature3" in doc_features) self.check_raw_gate_annotation_formatting(doc_dict) + self.check_teamware_status(doc_features, self.annotator_ids) def check_raw_gate_annotation_formatting(self, doc_dict): self.assertTrue("annotation_sets" in doc_dict) @@ -1191,6 +1195,18 @@ def check_raw_gate_annotation_formatting(self, doc_dict): self.assertTrue("text1" in label_dict) self.assertTrue("checkbox1" in label_dict) + def check_teamware_status(self, containing_dict, expected_value): + self.assertTrue("teamware_status" in containing_dict) + teamware_status = containing_dict["teamware_status"] + if isinstance(expected_value, str): + self.assertEqual(teamware_status["rejected_by"], expected_value) + self.assertEqual(teamware_status["aborted"], expected_value) + self.assertEqual(teamware_status["timed_out"], expected_value) + else: + self.assertSetEqual(set(teamware_status["rejected_by"]), set(expected_value)) + 
self.assertSetEqual(set(teamware_status["aborted"]), set(expected_value)) + self.assertSetEqual(set(teamware_status["timed_out"]), set(expected_value)) + def test_export_csv(self): for document in self.project.documents.all(): @@ -1209,6 +1225,8 @@ def test_export_csv(self): self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str)) self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str)) + self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids)) + def test_export_raw_anonymized(self): for document in self.project.documents.all(): @@ -1217,6 +1235,8 @@ def test_export_raw_anonymized(self): for aset_key, aset_data in doc_dict["annotation_sets"].items(): self.assertTrue(isinstance(aset_data.get("name", None), int)) + self.check_teamware_status(doc_dict, self.annotator_ids) + def test_export_raw_deanonymized(self): for document in self.project.documents.all(): @@ -1225,6 +1245,10 @@ def test_export_raw_deanonymized(self): for aset_key, aset_data in doc_dict["annotation_sets"].items(): self.assertTrue(isinstance(aset_data.get("name", None), str)) + # for non-anonymized export the rejected/aborted/timed_out status + # uses names rather than ID numbers + self.check_teamware_status(doc_dict, self.annotator_names) + def test_export_gate_anonymized(self): for document in self.project.documents.all(): @@ -1233,6 +1257,8 @@ def test_export_gate_anonymized(self): for aset_key, aset_data in doc_dict["annotation_sets"].items(): self.assertTrue(isinstance(aset_data.get("name", None), int)) + self.check_teamware_status(doc_dict["features"], self.annotator_ids) + def test_export_gate_deanonymized(self): for document in self.project.documents.all(): @@ -1240,3 +1266,7 @@ def test_export_gate_deanonymized(self): for aset_key, aset_data in doc_dict["annotation_sets"].items(): self.assertTrue(isinstance(aset_data.get("name", None), str)) + + # for non-anonymized export the rejected/aborted/timed_out status + # uses names rather than ID 
numbers + self.check_teamware_status(doc_dict["features"], self.annotator_names) From e915ee56bf2c1453aba63b15cd899753658d0c25 Mon Sep 17 00:00:00 2001 From: Ian Roberts Date: Fri, 23 Feb 2024 13:22:02 +0000 Subject: [PATCH 2/3] Copy the self.data dict when generating exports The export process adds entries to the doc_dict, so we should clone it first to ensure such changes are not accidentally persisted to the database if the model object is saved after a call to get_doc_annotation_dict. --- backend/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/models.py b/backend/models.py index 8ddb800a..d8e44ed1 100644 --- a/backend/models.py +++ b/backend/models.py @@ -978,7 +978,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): # Create dictionary for document doc_dict = None if json_format == "raw" or json_format == "csv": - doc_dict = self.data + doc_dict = self.data.copy() elif json_format == "gate": ignore_keys = {"text", self.project.document_id_field} @@ -990,7 +990,6 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): "offset_type": "p", "name": get_value_from_key_path(self.data, self.project.document_id_field) } - pass # Insert annotation sets into the doc dict annotations = self.annotations.filter(status=Annotation.COMPLETED) From 41fd491bf4555de52b7677624b413c3ecc489dbb Mon Sep 17 00:00:00 2001 From: Ian Roberts Date: Fri, 23 Feb 2024 13:50:15 +0000 Subject: [PATCH 3/3] Documentation for the teamware_status information --- .../documents_annotations_management.md | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/docs/docs/manageradminguide/documents_annotations_management.md b/docs/docs/manageradminguide/documents_annotations_management.md index ac010c9f..c73fbb5a 100644 --- a/docs/docs/manageradminguide/documents_annotations_management.md +++ b/docs/docs/manageradminguide/documents_annotations_management.md @@ -178,14 +178,21 @@ The above column 
headers will generate the following JSON: ## Exporting documents Documents and annotations can be exported using the **Export** button. A zip file is generated containing files with 500 -documents each. You can choose how documents are exported: +documents each. The option to "anonymize annotators" controls whether the individual annotators are identified with +their numeric ID or by their actual username - since usernames are often personally identifiable information (e.g. an +email address) the anonymous mode is recommended if you intend to share the annotation data with third parties. Note +that the anonymous IDs are consistent within a single installation of Teamware, so even in anonymous mode it is still +possible to determine which documents were annotated by _the same person_, just not who that person was. + +You can choose how documents are exported: * `.json` & `.jsonl` - JSON or JSON Lines files can be generated in the format of: * `raw` - Exports unmodified JSON. If you've originally uploaded in GATE format then choose this option. An additional field named `annotation_sets` is added for storing annotations. The annotations are laid out in the same way as GATE JSON format. For example if a document has been annotated by `user1` with labels and values - `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`: + `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`, the non-anonymous export might look + like this: ```json { @@ -216,13 +223,25 @@ documents each. You can choose how documents are exported: ], "next_annid":1 } + }, + "teamware_status": { + "rejected_by": ["user2"], + "timed_out": ["user3"], + "aborted": [] } } ``` + In anonymous mode the name `user1` would instead be the user's opaque numeric identifier (e.g. `105`). 
+ + The field `teamware_status` gives the ids or usernames (depending on the "anonymize" setting) of those annotators + who rejected the document, "timed out" because they did not complete their annotation in the time allowed by the + project, or "aborted" for some other reason (e.g. they were removed from the project). + * `gate` - Convert documents to GATE JSON format and export. A `name` field is added that takes the ID value from the ID field specified in the project configuration. Fields apart from `text` and the ID field specified in the project - config are placed in the `features` field. An `annotation_sets` field is added for storing annotations. + config are placed in the `features` field, as is the `teamware_status` information. An `annotation_sets` field is + added for storing annotations. For example in the case of this uploaded JSON document: ```json @@ -233,21 +252,24 @@ documents each. You can choose how documents are exported: "feature1": "Feature text" } ``` - The generated output is as follows. The annotations are formatted same as the `raw` output above: + The generated output is as follows. The annotations and `teamware_status` are formatted the same as the `raw` output + above: ```json { "name": 32, "text": "Document text", "features": { "text2": "Document text 2", - "feature1": "Feature text" + "feature1": "Feature text", + "teamware_status": {...} }, "offset_type":"p", "annotation_sets": {...} } ``` * `.csv` - The JSON documents will be flattened to csv's column based format. Annotations are added as additional - columns with the header of `annotations.username.label`. + columns with the header of `annotations.username.label` and the status information is in columns named + `teamware_status.rejected_by`, `teamware_status.timed_out` and `teamware_status.aborted`. ## Deleting documents and annotations