GateNLP · ianroberts · Jan 21, 2024 · Jan 21, 2024 · Jan 21, 2024 · Jan 21, 2024
diff --git a/backend/models.py b/backend/models.py
@@ -978,7 +978,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
         # Create dictionary for document
         doc_dict = None
         if json_format == "raw" or json_format == "csv":
-            doc_dict = self.data
+            doc_dict = self.data.copy()
         elif json_format == "gate":
 
             ignore_keys = {"text", self.project.document_id_field}
@@ -990,7 +990,6 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
                 "offset_type": "p",
                 "name": get_value_from_key_path(self.data, self.project.document_id_field)
             }
-            pass
 
         # Insert annotation sets into the doc dict
         annotations = self.annotations.filter(status=Annotation.COMPLETED)
@@ -1039,6 +1038,27 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
                 annotation_sets[annotation.user.username] = annotation_set
             doc_dict["annotation_sets"] = annotation_sets
 
+        # Add to the export the lists (possibly empty) of users who rejected,
+        # timed out or aborted annotation of this document
+        teamware_status = {}
+        for key, status in [
+            ("rejected_by", Annotation.REJECTED),
+            ("timed_out", Annotation.TIMED_OUT),
+            ("aborted", Annotation.ABORTED),
+        ]:
+            teamware_status[key] = [
+                annotation.user.id if anonymize else annotation.user.username
+                for annotation in self.annotations.filter(status=status)
+            ]
+            if json_format == "csv":
+                # Flatten list if exporting as CSV
+                teamware_status[key] = ",".join(str(val) for val in teamware_status[key])
+
+        if json_format == "gate":
+            doc_dict["features"]["teamware_status"] = teamware_status
+        else:
+            doc_dict["teamware_status"] = teamware_status
+
         return doc_dict
 
 

diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py
@@ -1099,7 +1099,9 @@ class TestDocumentAnnotationModelExport(TestCase):
 
     def setUp(self):
         self.test_user = get_user_model().objects.create(username="project_creator")
-        self.annotators = [get_user_model().objects.create(username=f"anno{i}") for i in range(3)]
+        self.annotator_names = [f"anno{i}" for i in range(3)]
+        self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names]
+        self.annotator_ids = [a.id for a in self.annotators]
         self.project = Project.objects.create(owner=self.test_user)
         for i in range(10):
             document = Document.objects.create(
@@ -1154,6 +1156,7 @@ def test_export_raw(self):
             self.assertTrue("feature3" in doc_dict)
 
             self.check_raw_gate_annotation_formatting(doc_dict)
+            self.check_teamware_status(doc_dict, self.annotator_ids)
 
     def test_export_gate(self):
 
@@ -1170,6 +1173,7 @@ def test_export_gate(self):
             self.assertTrue("feature3" in doc_features)
 
             self.check_raw_gate_annotation_formatting(doc_dict)
+            self.check_teamware_status(doc_features, self.annotator_ids)
 
     def check_raw_gate_annotation_formatting(self, doc_dict):
         self.assertTrue("annotation_sets" in doc_dict)
@@ -1191,6 +1195,18 @@ def check_raw_gate_annotation_formatting(self, doc_dict):
             self.assertTrue("text1" in label_dict)
             self.assertTrue("checkbox1" in label_dict)
 
+    def check_teamware_status(self, containing_dict, expected_value):
+        self.assertTrue("teamware_status" in containing_dict)
+        teamware_status = containing_dict["teamware_status"]
+        if isinstance(expected_value, str):
+            self.assertEqual(teamware_status["rejected_by"], expected_value)
+            self.assertEqual(teamware_status["aborted"], expected_value)
+            self.assertEqual(teamware_status["timed_out"], expected_value)
+        else:
+            self.assertSetEqual(set(teamware_status["rejected_by"]), set(expected_value))
+            self.assertSetEqual(set(teamware_status["aborted"]), set(expected_value))
+            self.assertSetEqual(set(teamware_status["timed_out"]), set(expected_value))
+
     def test_export_csv(self):
 
         for document in self.project.documents.all():
@@ -1209,6 +1225,8 @@ def test_export_csv(self):
                 self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
                 self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
 
+            self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids))
+
     def test_export_raw_anonymized(self):
 
         for document in self.project.documents.all():
@@ -1217,6 +1235,8 @@ def test_export_raw_anonymized(self):
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), int))
 
+            self.check_teamware_status(doc_dict, self.annotator_ids)
+
     def test_export_raw_deanonymized(self):
 
         for document in self.project.documents.all():
@@ -1225,6 +1245,10 @@ def test_export_raw_deanonymized(self):
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), str))
 
+            # for non-anonymized export the rejected/aborted/timed_out status
+            # uses names rather than ID numbers
+            self.check_teamware_status(doc_dict, self.annotator_names)
+
     def test_export_gate_anonymized(self):
 
         for document in self.project.documents.all():
@@ -1233,10 +1257,16 @@ def test_export_gate_anonymized(self):
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), int))
 
+            self.check_teamware_status(doc_dict["features"], self.annotator_ids)
+
     def test_export_gate_deanonymized(self):
 
         for document in self.project.documents.all():
             doc_dict = document.get_doc_annotation_dict("gate", anonymize=False)
 
             for aset_key, aset_data in doc_dict["annotation_sets"].items():
                 self.assertTrue(isinstance(aset_data.get("name", None), str))
+
+            # for non-anonymized export the rejected/aborted/timed_out status
+            # uses names rather than ID numbers
+            self.check_teamware_status(doc_dict["features"], self.annotator_names)
diff --git a/docs/docs/manageradminguide/documents_annotations_management.md b/docs/docs/manageradminguide/documents_annotations_management.md
@@ -178,14 +178,21 @@ The above column headers will generate the following JSON:
 ## Exporting documents
 
 Documents and annotations can be exported using the **Export** button. A zip file is generated containing files with 500
-documents each. You can choose how documents are exported:
+documents each. The option to "anonymize annotators" controls whether the individual annotators are identified with
+their numeric ID or by their actual username - since usernames are often personally identifiable information (e.g. an
+email address) the anonymous mode is recommended if you intend to share the annotation data with third parties.  Note
+that the anonymous IDs are consistent within a single installation of Teamware, so even in anonymous mode it is still
+possible to determine which documents were annotated by _the same person_, just not who that person was.
+
+You can choose how documents are exported:
 
 * `.json` & `.jsonl` - JSON or JSON Lines files can be generated in the format of:
   * `raw` - Exports unmodified JSON. If you've originally uploaded in GATE format then choose this option.
 
     An additional field named `annotation_sets` is added for storing annotations. The annotations are laid out in the
     same way as GATE JSON format. For example if a document has been annotated by `user1` with labels and values
-    `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`:
+    `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`, the non-anonymous export might look
+    like this:
 
     ```json
     {
@@ -216,13 +223,25 @@ documents each. You can choose how documents are exported:
            ],
            "next_annid":1
         }
+      },
+      "teamware_status": {
+        "rejected_by": ["user2"],
+        "timed_out": ["user3"],
+        "aborted": []
       }
     }
     ```
 
+    In anonymous mode the name `user1` would instead be the user's opaque numeric identifier (e.g. `105`).
+
+    The field `teamware_status` gives the ids or usernames (depending on the "anonymize" setting) of those annotators
+    who rejected the document, "timed out" because they did not complete their annotation in the time allowed by the
+    project, or "aborted" for some other reason (e.g. they were removed from the project).
+
   * `gate` - Convert documents to GATE JSON format and export. A `name` field is added that takes the ID value from the
     ID field specified in the project configuration. Fields apart from `text` and the ID field specified in the project
-    config are placed in the `features` field. An `annotation_sets` field is added for storing annotations.
+    config are placed in the `features` field, as is the `teamware_status` information. An `annotation_sets` field is
+    added for storing annotations.
 
     For example in the case of this uploaded JSON document:
     ```json
@@ -233,21 +252,24 @@ documents each. You can choose how documents are exported:
       "feature1": "Feature text"
     }
     ```
-    The generated output is as follows. The annotations are formatted same as the `raw` output above:
+    The generated output is as follows. The annotations and `teamware_status` are formatted the same as the `raw`
+    output above:
     ```json
     {
       "name": 32,
       "text": "Document text",
       "features": {
         "text2": "Document text 2",
-        "feature1": "Feature text"
+        "feature1": "Feature text",
+        "teamware_status": {...}
       },
       "offset_type":"p",
       "annotation_sets": {...}
     }
     ```
 * `.csv` - The JSON documents will be flattened to csv's column based format. Annotations are added as additional
-  columns with the header of `annotations.username.label`.
+  columns with the header of `annotations.username.label` and the status information is in columns named
+  `teamware_status.rejected_by`, `teamware_status.timed_out` and `teamware_status.aborted`.
 
 ## Deleting documents and annotations