From 641eeafa7afe8aa4c6c2d1ee3ef2ed754d58e09e Mon Sep 17 00:00:00 2001
From: Ian Roberts <i.roberts@dcs.shef.ac.uk>
Date: Fri, 23 Feb 2024 13:19:28 +0000
Subject: [PATCH 1/3] Include rejected/aborted/timed out annotations in export

Add a "teamware_status" section to the export formats detailing which users (if any) have rejected, aborted or timed out annotations on each document.

- for "raw" JSON this is a dict added to the top-level JSON object, with properties whose values are a JSON list of ID numbers (for anonymous) or names (for non-anonymous) of the relevant annotators
- for "gate" JSON it's the same dict added under the "features" section
- for "csv" the lists of IDs/names are flattened into a string the same way as multi-valued annotation elements
---
 backend/models.py            | 21 +++++++++++++++++++++
 backend/tests/test_models.py | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/backend/models.py b/backend/models.py
index 249638a9..8ddb800a 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -1039,6 +1039,27 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
                 annotation_sets[annotation.user.username] = annotation_set
             doc_dict["annotation_sets"] = annotation_sets
 
+        # Add to the export the lists (possibly empty) of users who rejected,
+        # timed out or aborted annotation of this document
+        teamware_status = {}
+        for key, status in [
+            ("rejected_by", Annotation.REJECTED),
+            ("timed_out", Annotation.TIMED_OUT),
+            ("aborted", Annotation.ABORTED),
+        ]:
+            teamware_status[key] = [
+                annotation.user.id if anonymize else annotation.user.username
+                for annotation in self.annotations.filter(status=status)
+            ]
+            if json_format == "csv":
+                # Flatten list if exporting as CSV
+                teamware_status[key] = ",".join(str(val) for val in teamware_status[key])
+
+        if json_format == "gate":
+            doc_dict["features"]["teamware_status"] = teamware_status
+        else:
+            doc_dict["teamware_status"] = teamware_status
+
         return doc_dict
 
 
diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py
index 0dfa9bcb..2002937e 100644
--- a/backend/tests/test_models.py
+++ b/backend/tests/test_models.py
@@ -1099,7 +1099,9 @@ class TestDocumentAnnotationModelExport(TestCase):
 
     def setUp(self):
         self.test_user = get_user_model().objects.create(username="project_creator")
-        self.annotators = [get_user_model().objects.create(username=f"anno{i}") for i in range(3)]
+        self.annotator_names = [f"anno{i}" for i in range(3)]
+        self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names]
+        self.annotator_ids = [a.id for a in self.annotators]
         self.project = Project.objects.create(owner=self.test_user)
         for i in range(10):
             document = Document.objects.create(
@@ -1154,6 +1156,7 @@ def test_export_raw(self):
             self.assertTrue("feature3" in doc_dict)
 
             self.check_raw_gate_annotation_formatting(doc_dict)
+            self.check_teamware_status(doc_dict, self.annotator_ids)
 
     def test_export_gate(self):
 
@@ -1170,6 +1173,7 @@ def test_export_gate(self):
             self.assertTrue("feature3" in doc_features)
 
             self.check_raw_gate_annotation_formatting(doc_dict)
+            self.check_teamware_status(doc_features, self.annotator_ids)
 
     def check_raw_gate_annotation_formatting(self, doc_dict):
         self.assertTrue("annotation_sets" in doc_dict)
@@ -1191,6 +1195,18 @@ def check_raw_gate_annotation_formatting(self, doc_dict):
             self.assertTrue("text1" in label_dict)
             self.assertTrue("checkbox1" in label_dict)
 
+    def check_teamware_status(self, containing_dict, expected_value):
+        self.assertTrue("teamware_status" in containing_dict)
+        teamware_status = containing_dict["teamware_status"]
+        if isinstance(expected_value, str):
+            self.assertEqual(teamware_status["rejected_by"], expected_value)
+            self.assertEqual(teamware_status["aborted"], expected_value)
+            self.assertEqual(teamware_status["timed_out"], expected_value)
+        else:
+            self.assertSetEqual(set(teamware_status["rejected_by"]), set(expected_value))
+            self.assertSetEqual(set(teamware_status["aborted"]), set(expected_value))
+            self.assertSetEqual(set(teamware_status["timed_out"]), set(expected_value))
+
     def test_export_csv(self):
 
         for document in self.project.documents.all():
@@ -1209,6 +1225,8 @@ def test_export_csv(self):
                 self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
                 self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
 
+            self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids))
+
     def test_export_raw_anonymized(self):
 
         for document in self.project.documents.all():
@@ -1217,6 +1235,8 @@ def test_export_raw_anonymized(self):
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), int))
 
+            self.check_teamware_status(doc_dict, self.annotator_ids)
+
     def test_export_raw_deanonymized(self):
 
         for document in self.project.documents.all():
@@ -1225,6 +1245,10 @@ def test_export_raw_deanonymized(self):
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), str))
 
+            # for non-anonymized export the rejected/aborted/timed_out status
+            # uses names rather than ID numbers
+            self.check_teamware_status(doc_dict, self.annotator_names)
+
     def test_export_gate_anonymized(self):
 
         for document in self.project.documents.all():
@@ -1233,6 +1257,8 @@ def test_export_gate_anonymized(self):
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), int))
 
+            self.check_teamware_status(doc_dict["features"], self.annotator_ids)
+
     def test_export_gate_deanonymized(self):
 
         for document in self.project.documents.all():
@@ -1240,3 +1266,7 @@ def test_export_gate_deanonymized(self):
 
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), str))
+
+            # for non-anonymized export the rejected/aborted/timed_out status
+            # uses names rather than ID numbers
+            self.check_teamware_status(doc_dict["features"], self.annotator_names)

From e915ee56bf2c1453aba63b15cd899753658d0c25 Mon Sep 17 00:00:00 2001
From: Ian Roberts <i.roberts@dcs.shef.ac.uk>
Date: Fri, 23 Feb 2024 13:22:02 +0000
Subject: [PATCH 2/3] Copy the self.data dict when generating exports

The export process adds entries to the doc_dict, so we should clone it first to ensure such changes are not accidentally persisted to the database if the model object is saved after a call to get_doc_annotation_dict.
---
 backend/models.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backend/models.py b/backend/models.py
index 8ddb800a..d8e44ed1 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -978,7 +978,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
         # Create dictionary for document
         doc_dict = None
         if json_format == "raw" or json_format == "csv":
-            doc_dict = self.data
+            doc_dict = self.data.copy()
         elif json_format == "gate":
 
             ignore_keys = {"text", self.project.document_id_field}
@@ -990,7 +990,6 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
                 "offset_type": "p",
                 "name": get_value_from_key_path(self.data, self.project.document_id_field)
             }
-            pass
 
         # Insert annotation sets into the doc dict
         annotations = self.annotations.filter(status=Annotation.COMPLETED)

From 41fd491bf4555de52b7677624b413c3ecc489dbb Mon Sep 17 00:00:00 2001
From: Ian Roberts <i.roberts@dcs.shef.ac.uk>
Date: Fri, 23 Feb 2024 13:50:15 +0000
Subject: [PATCH 3/3] Documentation for the teamware_status information

---
 .../documents_annotations_management.md       | 34 +++++++++++++++----
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/docs/docs/manageradminguide/documents_annotations_management.md b/docs/docs/manageradminguide/documents_annotations_management.md
index ac010c9f..c73fbb5a 100644
--- a/docs/docs/manageradminguide/documents_annotations_management.md
+++ b/docs/docs/manageradminguide/documents_annotations_management.md
@@ -178,14 +178,21 @@ The above column headers will generate the following JSON:
 ## Exporting documents
 
 Documents and annotations can be exported using the **Export** button. A zip file is generated containing files with 500
-documents each. You can choose how documents are exported:
+documents each. The option to "anonymize annotators" controls whether the individual annotators are identified with
+their numeric ID or by their actual username - since usernames are often personally identifiable information (e.g. an
+email address) the anonymous mode is recommended if you intend to share the annotation data with third parties.  Note
+that the anonymous IDs are consistent within a single installation of Teamware, so even in anonymous mode it is still
+possible to determine which documents were annotated by _the same person_, just not who that person was.
+
+You can choose how documents are exported:
 
 * `.json` & `.jsonl` - JSON or JSON Lines files can be generated in the format of:
   * `raw` - Exports unmodified JSON. If you've originally uploaded in GATE format then choose this option.
 
     An additional field named `annotation_sets` is added for storing annotations. The annotations are laid out in the
     same way as GATE JSON format. For example if a document has been annotated by `user1` with labels and values
-    `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`:
+    `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`, the non-anonymous export might look
+    like this:
 
     ```json
     {
@@ -216,13 +223,25 @@ documents each. You can choose how documents are exported:
            ],
            "next_annid":1
         }
+      },
+      "teamware_status": {
+        "rejected_by": ["user2"],
+        "timed_out": ["user3"],
+        "aborted": []
       }
     }
     ```
 
+    In anonymous mode the name `user1` would instead be the user's opaque numeric identifier (e.g. `105`).
+
+    The field `teamware_status` gives the ids or usernames (depending on the "anonymize" setting) of those annotators
+    who rejected the document, "timed out" because they did not complete their annotation in the time allowed by the
+    project, or "aborted" for some other reason (e.g. they were removed from the project).
+
   * `gate` - Convert documents to GATE JSON format and export. A `name` field is added that takes the ID value from the
     ID field specified in the project configuration. Fields apart from `text` and the ID field specified in the project
-    config are placed in the `features` field. An `annotation_sets` field is added for storing annotations.
+    config are placed in the `features` field, as is the `teamware_status` information. An `annotation_sets` field is
+    added for storing annotations.
 
     For example in the case of this uploaded JSON document:
     ```json
@@ -233,21 +252,24 @@ documents each. You can choose how documents are exported:
       "feature1": "Feature text"
     }
     ```
-    The generated output is as follows. The annotations are formatted same as the `raw` output above:
+    The generated output is as follows. The annotations and `teamware_status` are formatted the same as the `raw`
+    output above:
     ```json
     {
       "name": 32,
       "text": "Document text",
       "features": {
         "text2": "Document text 2",
-        "feature1": "Feature text"
+        "feature1": "Feature text",
+        "teamware_status": {...}
       },
       "offset_type":"p",
       "annotation_sets": {...}
     }
     ```
 * `.csv` - The JSON documents will be flattened to csv's column based format. Annotations are added as additional
-  columns with the header of `annotations.username.label`.
+  columns with the header of `annotations.username.label` and the status information is in columns named
+  `teamware_status.rejected_by`, `teamware_status.timed_out` and `teamware_status.aborted`.
 
 ## Deleting documents and annotations