From 3464a02ac8e2eb093775e86ba98eb6dfceb763d0 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Thu, 26 Oct 2023 13:04:06 -0700 Subject: [PATCH 01/14] some kind of progress, still need to address overlap in sentences crossing paragraphs --- ...grobid_augment_existing_document_parser.py | 90 +++++++++---------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 24f3ca27..2ac9e3a8 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -4,7 +4,7 @@ """ from grobid_client.grobid_client import GrobidClient -from typing import List, Optional +from typing import List, Optional, Tuple import logging import os @@ -104,32 +104,40 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: # sentences within the body text, also tagged by paragraphs. # We use these to annotate the document in order to provide a hierarchical structure: # e.g. doc.sections.header, doc.sections[0].paragraphs[0].sentences[0] - section_box_groups, heading_box_groups, paragraph_box_groups, sentence_box_groups = \ - self._get_structured_body_text_box_groups(xml_root) - doc.annotate( - sections=box_groups_to_span_groups( - section_box_groups, doc, center=True - ) - ) - doc.annotate( - headings=box_groups_to_span_groups( - heading_box_groups, doc, center=True - ) - ) - doc.annotate( - paragraphs=box_groups_to_span_groups( - paragraph_box_groups, doc, center=True - ) - ) - doc.annotate( - sentences=box_groups_to_span_groups( - sentence_box_groups, doc, center=True - ) - ) + section_headings_and_sentence_box_groups_in_paragraphs = \ + self._get_structured_sentence_box_groups(xml_root) + + heading_span_groups = [] + paragraph_span_groups = [] + section_span_groups = [] + sentence_span_groups = [] + + for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: + if heading_box_group: + heading_span_groups.extend(box_groups_to_span_groups([heading_box_group], doc, center=True)) + this_section_paragraph_span_groups = [] + for sentence_box_groups in paragraphs: + this_paragraph_sentence_span_groups = box_groups_to_span_groups(sentence_box_groups, doc, center=True) + sentence_span_groups.extend(this_paragraph_sentence_span_groups) + paragraph_spans = [] + for sg in this_paragraph_sentence_span_groups: + paragraph_spans.extend(sg.spans) + this_section_paragraph_span_groups.append(SpanGroup(spans=paragraph_spans)) + paragraph_span_groups.extend(this_section_paragraph_span_groups) + section_spans = [] + for sg in this_section_paragraph_span_groups: + section_spans.extend(sg.spans) + section_span_groups.append(SpanGroup(spans=section_spans)) + + doc.annotate(headings=heading_span_groups) + doc.annotate(sentences=sentence_span_groups) + doc.annotate(paragraphs=paragraph_span_groups) + doc.annotate(sections=section_span_groups) + return doc - def _xml_coords_to_boxes(self, coords_attribute: str): + def _xml_coords_to_boxes(self, coords_attribute: str) -> List[Box]: coords_list = coords_attribute.split(";") boxes = [] for coords in coords_list: @@ -218,34 +226,24 @@ def _get_heading_box_group( ) return box_group - def _get_structured_body_text_box_groups( + def _get_structured_sentence_box_groups( self, root: et.Element - ) -> (List[BoxGroup], List[BoxGroup], List[BoxGroup], List[BoxGroup]): + ) -> List[Tuple[Optional[BoxGroup], List[List[BoxGroup]]]]: section_list_root = 
root.find(f".//tei:body", NS) - - body_sections: List[BoxGroup] = [] - body_headings: List[BoxGroup] = [] - body_paragraphs: List[BoxGroup] = [] - body_sentences: List[BoxGroup] = [] - section_divs = section_list_root.findall(f"./tei:div", NS) + + section_structures = [] for div in section_divs: - section_boxes: List[Box] = [] heading_box_group = self._get_heading_box_group(div) - if heading_box_group: - body_headings.append(heading_box_group) - section_boxes.extend(heading_box_group.boxes) + paragraphs: List[List[BoxGroup]] = [] for p in div.findall(f"./tei:p", NS): - paragraph_boxes: List[Box] = [] - paragraph_sentences: List[BoxGroup] = [] + sentence_box_groups: List[BoxGroup] = [] for s in p.findall(f"./tei:s", NS): sentence_boxes = self._xml_coords_to_boxes(s.attrib["coords"]) - paragraph_sentences.append(BoxGroup(boxes=sentence_boxes)) - paragraph_boxes.extend(sentence_boxes) - body_paragraphs.append(BoxGroup(boxes=paragraph_boxes)) - section_boxes.extend(paragraph_boxes) - body_sentences.extend(paragraph_sentences) - body_sections.append(BoxGroup(boxes=section_boxes)) - - return body_sections, body_headings, body_paragraphs, body_sentences + sentence_box_groups.append(BoxGroup(boxes=sentence_boxes)) + paragraphs.append(sentence_box_groups) + + section_structures.append([heading_box_group, paragraphs]) + + return section_structures From a39d0c13531713351492f1bf3f0da9db477450a9 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 13:34:27 -0800 Subject: [PATCH 02/14] ok cool this seems to be working! --- ...grobid_augment_existing_document_parser.py | 21 ++++++-- src/mmda/utils/tools.py | 52 +++++++++++-------- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 2ac9e3a8..034843de 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -4,7 +4,7 @@ """ from grobid_client.grobid_client import GrobidClient -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict import logging import os @@ -112,12 +112,27 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: section_span_groups = [] sentence_span_groups = [] + unallocated_section_tokens_dict: Dict[int, SpanGroup] = dict() + for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: if heading_box_group: - heading_span_groups.extend(box_groups_to_span_groups([heading_box_group], doc, center=True)) + heading_span_group, unallocated_section_tokens_dict = ( + box_groups_to_span_groups( + [heading_box_group], + doc, + center=True, + unallocated_tokens_dict=unallocated_section_tokens_dict + ) + ) + heading_span_groups.extend(heading_span_group) this_section_paragraph_span_groups = [] for sentence_box_groups in paragraphs: - this_paragraph_sentence_span_groups = box_groups_to_span_groups(sentence_box_groups, doc, center=True) + this_paragraph_sentence_span_groups, unallocated_section_tokens_dict = box_groups_to_span_groups( + sentence_box_groups, + doc, + center=True, + unallocated_tokens_dict=unallocated_section_tokens_dict + ) sentence_span_groups.extend(this_paragraph_sentence_span_groups) paragraph_spans = [] for sg in this_paragraph_sentence_span_groups: diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 1095c96c..5cdf1756 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -4,7 +4,7 
@@ from collections import defaultdict from itertools import groupby import itertools -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Optional, Union import numpy as np @@ -41,20 +41,33 @@ def allocate_overlapping_tokens_for_box( def box_groups_to_span_groups( - box_groups: List[BoxGroup], doc: Document, pad_x: bool = False, center: bool = False -) -> List[SpanGroup]: - """Generate SpanGroups from BoxGroups. + box_groups: List[BoxGroup], + doc, + pad_x: bool = False, + center: bool = False, + unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None +) -> Union[List[SpanGroup], Tuple[List[SpanGroup], Dict[int, SpanGroup]]]: + """Generate SpanGroups from BoxGroups given they can only generate spans of tokens not already allocated Args `box_groups` (List[BoxGroup]) `doc` (Document) base document annotated with pages, tokens, rows to `center` (bool) if True, considers tokens to be overlapping with boxes only if their centers overlap + `unallocated_tokens` (Optional[Dict]) of token spangroups keyed by page. If provided, will use as starting + point for determining if token is already allocated. Assumes the tokens within are of the same type as the `doc` + (i.e., tokens from both doc and the dict both have their box data in either Span.box or SpanGroup.boxgroup) Returns - List[SpanGroup] with each SpanGroup.spans corresponding to spans (sans boxes) of allocated tokens per box_group, + Union (either) of: + -List[SpanGroup] with each SpanGroup.spans corresponding to spans (sans boxes) of allocated tokens per box_group, and each SpanGroup.box_group containing original box_groups + or Tuple of: + -List[SpanGroup] as described above, and + -Dictionary of unallocated tokens keyed by page """ assert all([isinstance(group, BoxGroup) for group in box_groups]) - all_page_tokens = dict() + return_unallocated_tokens = unallocated_tokens_dict is not None + + unallocated_tokens = unallocated_tokens_dict if return_unallocated_tokens else dict() avg_token_widths = dict() derived_span_groups = [] token_box_in_box_group = None @@ -66,8 +79,8 @@ def box_groups_to_span_groups( for box in box_group.boxes: # Caching the page tokens to avoid duplicated search - if box.page not in all_page_tokens: - cur_page_tokens = all_page_tokens[box.page] = doc.pages[ + if box.page not in unallocated_tokens: + cur_page_tokens = unallocated_tokens[box.page] = doc.pages[ box.page ].tokens if token_box_in_box_group is None: @@ -89,7 +102,7 @@ def box_groups_to_span_groups( avg_token_widths[box.page] = np.average([t.spans[0].box.w for t in cur_page_tokens]) else: - cur_page_tokens = all_page_tokens[box.page] + cur_page_tokens = unallocated_tokens[box.page] # Find all the tokens within the box tokens_in_box, remaining_tokens = allocate_overlapping_tokens_for_box( @@ -101,7 +114,7 @@ def box_groups_to_span_groups( y=0.0, center=center ) - all_page_tokens[box.page] = remaining_tokens + unallocated_tokens[box.page] = remaining_tokens all_tokens_overlapping_box_group.extend(tokens_in_box) merge_spans = ( @@ -128,10 +141,12 @@ def box_groups_to_span_groups( # a token is not found to be overlapping with the box, but MergeSpans decides it is close enough to be merged) for sg_token in sg_tokens: if sg_token not in all_tokens_overlapping_box_group: - if token_box_in_box_group and sg_token in all_page_tokens[sg_token.box_group.boxes[0].page]: - all_page_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) - elif not token_box_in_box_group and sg_token in all_page_tokens[sg_token.spans[0].box.page]: - 
all_page_tokens[sg_token.spans[0].box.page].remove(sg_token) + if token_box_in_box_group and sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: + unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) + elif not token_box_in_box_group and sg_token in unallocated_tokens[sg_token.spans[0].box.page]: + unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) + + derived_span_groups.append( SpanGroup( @@ -148,20 +163,15 @@ def box_groups_to_span_groups( "future Spans wont contain box). Ensure Document is annotated with tokens " "having box stored in SpanGroup box_group.boxes") - del all_page_tokens - derived_span_groups = sorted( derived_span_groups, key=lambda span_group: span_group.start ) # ensure they are ordered based on span indices - for box_id, span_group in enumerate(derived_span_groups): span_group.id = box_id - # return self._annotate_span_group( - # span_groups=derived_span_groups, field_name=field_name - # ) - return derived_span_groups + return (derived_span_groups, unallocated_tokens) if return_unallocated_tokens else derived_span_groups + class MergeSpans: """ From df9849f645974721651e4e8f25214707b6eada2d Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 13:52:31 -0800 Subject: [PATCH 03/14] make heading spans part of section --- .../parsers/grobid_augment_existing_document_parser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 034843de..202a1684 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -115,8 +115,9 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: unallocated_section_tokens_dict: Dict[int, SpanGroup] = dict() for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: + section_spans = [] if heading_box_group: - heading_span_group, unallocated_section_tokens_dict = ( + heading_span_group_in_list, unallocated_section_tokens_dict = ( box_groups_to_span_groups( [heading_box_group], doc, @@ -124,7 +125,9 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: unallocated_tokens_dict=unallocated_section_tokens_dict ) ) - heading_span_groups.extend(heading_span_group) + heading_span_group = heading_span_group_in_list[0] + heading_span_groups.append(heading_span_group) + section_spans.extend(heading_span_group.spans) this_section_paragraph_span_groups = [] for sentence_box_groups in paragraphs: this_paragraph_sentence_span_groups, unallocated_section_tokens_dict = box_groups_to_span_groups( @@ -137,11 +140,12 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: paragraph_spans = [] for sg in this_paragraph_sentence_span_groups: paragraph_spans.extend(sg.spans) + # TODO add boxes to paragraph spangroups this_section_paragraph_span_groups.append(SpanGroup(spans=paragraph_spans)) paragraph_span_groups.extend(this_section_paragraph_span_groups) - section_spans = [] for sg in this_section_paragraph_span_groups: section_spans.extend(sg.spans) + # TODO add boxes to section spangroups section_span_groups.append(SpanGroup(spans=section_spans)) doc.annotate(headings=heading_span_groups) From 1cf469a5315876d8874a03525618c7c74c729541 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 14:35:56 -0800 Subject: [PATCH 04/14] make sentences have unique ids, give paragraphs and sections 
ids --- .../parsers/grobid_augment_existing_document_parser.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 202a1684..220442f2 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -147,7 +147,13 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: section_spans.extend(sg.spans) # TODO add boxes to section spangroups section_span_groups.append(SpanGroup(spans=section_spans)) - + + # ensure unique IDs within annotations + all_section_span_groups = [heading_span_groups, sentence_span_groups, paragraph_span_groups, section_span_groups] + for span_groups in all_section_span_groups: + for i, span_group in enumerate(span_groups): + span_group.id = i + doc.annotate(headings=heading_span_groups) doc.annotate(sentences=sentence_span_groups) doc.annotate(paragraphs=paragraph_span_groups) From 95a6e5ba609904d79c250ed9a7f7615bf0860498 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 15:40:14 -0800 Subject: [PATCH 05/14] fix 'coords' error --- ...grobid_augment_existing_document_parser.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 220442f2..6283adc8 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -209,7 +209,11 @@ def _get_box_groups( elements = item_list_root.findall(f".//tei:{item_tag}", NS) for e in elements: - coords_string = e.attrib["coords"] + try: + coords_string = e.attrib["coords"] + except KeyError: + logging.warning(f"Element with '{item_tag}' tag missing 'coords' attribute") + continue boxes = self._xml_coords_to_boxes(coords_string) grobid_id = e.attrib[ID_ATTR_KEY] if ID_ATTR_KEY in e.keys() else None @@ -241,7 +245,11 @@ def _get_heading_box_group( box_group = None heading_element = section_div.find(f".//tei:head", NS) if heading_element is not None: # elements evaluate as False if no children - coords_string = heading_element.attrib["coords"] + try: + coords_string = heading_element.attrib["coords"] + except KeyError: + logging.warning(f"Heading element missing 'coords' attribute") + return None boxes = self._xml_coords_to_boxes(coords_string) number = heading_element.attrib["n"] if "n" in heading_element.keys() else None section_title = heading_element.text @@ -265,7 +273,12 @@ def _get_structured_sentence_box_groups( for p in div.findall(f"./tei:p", NS): sentence_box_groups: List[BoxGroup] = [] for s in p.findall(f"./tei:s", NS): - sentence_boxes = self._xml_coords_to_boxes(s.attrib["coords"]) + try: + coords_string = s.attrib["coords"] + except KeyError: + logging.warning(f"Sentence element missing 'coords' attribute") + continue + sentence_boxes = self._xml_coords_to_boxes(coords_string) sentence_box_groups.append(BoxGroup(boxes=sentence_boxes)) paragraphs.append(sentence_box_groups) From 172f07308abc9e752cdb4bd1abc8be1ecb589af4 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 18:01:53 -0800 Subject: [PATCH 06/14] pad_x for sentences --- src/mmda/parsers/grobid_augment_existing_document_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py 
b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 6283adc8..91f2904c 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -12,7 +12,7 @@ from mmda.parsers.parser import Parser from mmda.types import Metadata -from mmda.types.annotation import BoxGroup, Box, SpanGroup +from mmda.types.annotation import BoxGroup, Box, SpanGroup, Span from mmda.types.document import Document from mmda.types.names import PagesField, RowsField, TokensField from mmda.utils.tools import box_groups_to_span_groups @@ -134,6 +134,7 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: sentence_box_groups, doc, center=True, + pad_x=True, unallocated_tokens_dict=unallocated_section_tokens_dict ) sentence_span_groups.extend(this_paragraph_sentence_span_groups) From 952ac9b6d3fc3eb890da57eb260a71d8c4d0e9e1 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 12:46:48 -0800 Subject: [PATCH 07/14] IT WORKS we get nice spans for sentences for this one specific sha now --- src/mmda/utils/tools.py | 45 ++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 5cdf1756..f17fe9f2 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -136,18 +136,49 @@ def box_groups_to_span_groups( # tokens overlapping with derived spans: sg_tokens = doc.find_overlapping(SpanGroup(spans=derived_spans), "tokens") + def update_derived_spans(t_span): + # if the sg_token is in the derived_span, cut it out by updating derived_spans. + # this can happen because merge_spans finds min number of spans and can merge spans that + # cover tokens that were already allocated. We update this to avoid spangroup overlap errors. + for i, d_span in enumerate(derived_spans): + if d_span.start == t_span.start and t_span.end < d_span.end: + # unusable token_span is at start of derived_span + d_span.start = t_span.end + elif d_span.end == t_span.end and d_span.start < t_span.start < d_span.end: + # unusable token_span is at end of derived_span + d_span.end = t_span.end + elif d_span.start < t_span.start < d_span.end and t_span.end < d_span.end: + # unusable token_span is encompassed by derived_span + d_span.end = t_span.start + derived_spans.insert(i+1, Span(t_span.end, d_span.end)) + elif d_span.start == t_span.start and d_span.end == t_span.end: + # unusable token_span is equal to derived_span + derived_spans.remove(d_span) + # remove any additional tokens added to the spangroup via MergeSpans from the list of available page tokens # (this can happen if the MergeSpans algorithm merges tokens that are not adjacent, e.g. 
if `center` is True and # a token is not found to be overlapping with the box, but MergeSpans decides it is close enough to be merged) for sg_token in sg_tokens: if sg_token not in all_tokens_overlapping_box_group: - if token_box_in_box_group and sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: - unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) - elif not token_box_in_box_group and sg_token in unallocated_tokens[sg_token.spans[0].box.page]: - unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) - - - + # if token not removed from unallocated_tokens yet, do it now + if token_box_in_box_group: + if sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: + unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) + # otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens, the assumption + # is that the token has already been allocated by a different box_group, so, we need to remove it from our + # derived spans to avoid 'SpanGroup overlap' error. + else: + update_derived_spans(sg_token.spans[0]) + else: + if sg_token in unallocated_tokens[sg_token.spans[0].box.page]: + unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) + # same scenario as above. + else: + update_derived_spans(sg_token.spans[0]) + + + # if derived_span_group encompasses any tokens that were NOT in unallocated_tokens, we need to REMOVE + # that spangroup here... wait maybe not. derived_span_groups.append( SpanGroup( spans=derived_spans, From 002a8a201a251997dcb6c19e4c81d8029c64034d Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 15:00:24 -0800 Subject: [PATCH 08/14] remove spanless results (useless) --- src/mmda/parsers/grobid_augment_existing_document_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 91f2904c..4d8b0b17 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -137,7 +137,8 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: pad_x=True, unallocated_tokens_dict=unallocated_section_tokens_dict ) - sentence_span_groups.extend(this_paragraph_sentence_span_groups) + if all([sg.spans for sg in this_paragraph_sentence_span_groups]): + sentence_span_groups.extend(this_paragraph_sentence_span_groups) paragraph_spans = [] for sg in this_paragraph_sentence_span_groups: paragraph_spans.extend(sg.spans) From 8ab0d61f36d2026ad463787c766185be462f0767 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 15:16:41 -0800 Subject: [PATCH 09/14] lil rename --- src/mmda/utils/tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index f17fe9f2..8512c696 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -136,7 +136,7 @@ def box_groups_to_span_groups( # tokens overlapping with derived spans: sg_tokens = doc.find_overlapping(SpanGroup(spans=derived_spans), "tokens") - def update_derived_spans(t_span): + def omit_span_from_derived_spans(t_span): # if the sg_token is in the derived_span, cut it out by updating derived_spans. # this can happen because merge_spans finds min number of spans and can merge spans that # cover tokens that were already allocated. We update this to avoid spangroup overlap errors. 
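
The helper above, renamed to omit_span_from_derived_spans in this commit, cuts an already-allocated token range back out of the candidate spans produced by MergeSpans, so that two annotations never claim the same character offsets. Below is a minimal standalone sketch of that trimming behavior, assuming half-open integer spans; the Span dataclass and the cut_out_token_range name are illustrative stand-ins rather than the mmda API, and unlike the in-place loop above this sketch rebuilds the span list:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Span:  # stand-in for mmda.types.annotation.Span: [start, end) character offsets
        start: int
        end: int

    def cut_out_token_range(derived_spans: List[Span], t_start: int, t_end: int) -> List[Span]:
        """Remove the already-allocated range [t_start, t_end) from every candidate span."""
        trimmed: List[Span] = []
        for d in derived_spans:
            if t_end <= d.start or d.end <= t_start:
                trimmed.append(d)                          # no overlap: keep unchanged
            elif t_start <= d.start and d.end <= t_end:
                continue                                   # span fully covered by the token: drop it
            elif t_start <= d.start:
                trimmed.append(Span(t_end, d.end))         # token at the start: trim the front
            elif t_end >= d.end:
                trimmed.append(Span(d.start, t_start))     # token at the end: trim the back
            else:
                trimmed.append(Span(d.start, t_start))     # token in the middle: split in two
                trimmed.append(Span(t_end, d.end))
        return trimmed

    # cutting tokens [10, 14) out of [Span(0, 20)] yields [Span(0, 10), Span(14, 20)]

Rebuilding the list also avoids mutating or removing entries while iterating.
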
@@ -168,13 +168,13 @@ def update_derived_spans(t_span): # is that the token has already been allocated by a different box_group, so, we need to remove it from our # derived spans to avoid 'SpanGroup overlap' error. else: - update_derived_spans(sg_token.spans[0]) + omit_span_from_derived_spans(sg_token.spans[0]) else: if sg_token in unallocated_tokens[sg_token.spans[0].box.page]: unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) # same scenario as above. else: - update_derived_spans(sg_token.spans[0]) + omit_span_from_derived_spans(sg_token.spans[0]) # if derived_span_group encompasses any tokens that were NOT in unallocated_tokens, we need to REMOVE From e16bc8405a117509e85d7c8b2534cde4bee1fac9 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 15:16:48 -0800 Subject: [PATCH 10/14] mmda version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ac07d5ec..847bd5d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 'mmda' -version = '0.9.15' +version = '0.9.16' description = 'MMDA - multimodal document analysis' authors = [ {name = 'Allen Institute for Artificial Intelligence', email = 'contact@allenai.org'}, From 6c534ccab77d4f66b9afe57906fbef05ba6b2dc6 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:10:28 -0800 Subject: [PATCH 11/14] just return list --- src/mmda/utils/tools.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 8512c696..cfb1db71 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -46,7 +46,7 @@ def box_groups_to_span_groups( pad_x: bool = False, center: bool = False, unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None -) -> Union[List[SpanGroup], Tuple[List[SpanGroup], Dict[int, SpanGroup]]]: +) -> List[SpanGroup]: """Generate SpanGroups from BoxGroups given they can only generate spans of tokens not already allocated Args `box_groups` (List[BoxGroup]) @@ -65,9 +65,7 @@ def box_groups_to_span_groups( """ assert all([isinstance(group, BoxGroup) for group in box_groups]) - return_unallocated_tokens = unallocated_tokens_dict is not None - - unallocated_tokens = unallocated_tokens_dict if return_unallocated_tokens else dict() + unallocated_tokens = unallocated_tokens_dict if unallocated_tokens_dict else dict() avg_token_widths = dict() derived_span_groups = [] token_box_in_box_group = None @@ -201,7 +199,7 @@ def omit_span_from_derived_spans(t_span): for box_id, span_group in enumerate(derived_span_groups): span_group.id = box_id - return (derived_span_groups, unallocated_tokens) if return_unallocated_tokens else derived_span_groups + return derived_span_groups class MergeSpans: From 5fb4c0f729e8e12201b79dafd8c26d93d21c0862 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:31:56 -0800 Subject: [PATCH 12/14] oops delete my thoughts --- src/mmda/utils/tools.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index cfb1db71..5ed7aa39 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -174,9 +174,6 @@ def omit_span_from_derived_spans(t_span): else: omit_span_from_derived_spans(sg_token.spans[0]) - - # if derived_span_group encompasses any tokens that were NOT in unallocated_tokens, we need to REMOVE - # that spangroup here... wait maybe not. 
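
Since box_groups_to_span_groups now returns just List[SpanGroup] (PATCH 11/14 above), callers share allocation state by passing the same unallocated_tokens_dict into each successive call rather than unpacking a returned tuple. A rough usage sketch, assuming a doc already annotated with pages, rows, and tokens, and with heading_box_group and sentence_box_groups as placeholder GROBID-derived box groups like those built by the parser above:

    from mmda.utils.tools import box_groups_to_span_groups

    # One shared dict of still-unallocated tokens, keyed by page index; the
    # function fills it in and prunes it as box groups claim their tokens.
    unallocated_tokens_dict = dict()

    heading_span_groups = box_groups_to_span_groups(
        [heading_box_group],        # placeholder BoxGroup for a section heading
        doc,                        # placeholder Document with pages/rows/tokens annotated
        center=True,
        unallocated_tokens_dict=unallocated_tokens_dict,
    )

    sentence_span_groups = box_groups_to_span_groups(
        sentence_box_groups,        # placeholder BoxGroups for one paragraph's sentences
        doc,
        center=True,
        pad_x=True,
        unallocated_tokens_dict=unallocated_tokens_dict,  # same dict, so heading tokens stay claimed
    )

Note that starting from an initially empty dict only behaves this way once the truthiness check is switched to `is not None` (PATCH 13/14 below).
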
derived_span_groups.append( SpanGroup( spans=derived_spans, From 8336d06fbab3c8d2feb83ca19ffa3e6d0c06d68c Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:40:56 -0800 Subject: [PATCH 13/14] oops fix my error made when switching to just list being returned --- src/mmda/utils/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 5ed7aa39..6a156ed4 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -65,7 +65,7 @@ def box_groups_to_span_groups( """ assert all([isinstance(group, BoxGroup) for group in box_groups]) - unallocated_tokens = unallocated_tokens_dict if unallocated_tokens_dict else dict() + unallocated_tokens = unallocated_tokens_dict if unallocated_tokens_dict is not None else dict() avg_token_widths = dict() derived_span_groups = [] token_box_in_box_group = None From 7af1e9370515a87afb4c4b7111d4dbf358fd366a Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:52:22 -0800 Subject: [PATCH 14/14] new fix_overlaps param --- ...grobid_augment_existing_document_parser.py | 10 ++++--- src/mmda/utils/tools.py | 27 ++++++++++++------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 4d8b0b17..2f7406bf 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -117,12 +117,13 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: section_spans = [] if heading_box_group: - heading_span_group_in_list, unallocated_section_tokens_dict = ( + heading_span_group_in_list = ( box_groups_to_span_groups( [heading_box_group], doc, center=True, - unallocated_tokens_dict=unallocated_section_tokens_dict + unallocated_tokens_dict=unallocated_section_tokens_dict, + fix_overlaps=True, ) ) heading_span_group = heading_span_group_in_list[0] @@ -130,12 +131,13 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: section_spans.extend(heading_span_group.spans) this_section_paragraph_span_groups = [] for sentence_box_groups in paragraphs: - this_paragraph_sentence_span_groups, unallocated_section_tokens_dict = box_groups_to_span_groups( + this_paragraph_sentence_span_groups = box_groups_to_span_groups( sentence_box_groups, doc, center=True, pad_x=True, - unallocated_tokens_dict=unallocated_section_tokens_dict + unallocated_tokens_dict=unallocated_section_tokens_dict, + fix_overlaps=True, ) if all([sg.spans for sg in this_paragraph_sentence_span_groups]): sentence_span_groups.extend(this_paragraph_sentence_span_groups) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 6a156ed4..d1482c1b 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -45,16 +45,23 @@ def box_groups_to_span_groups( doc, pad_x: bool = False, center: bool = False, - unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None + unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None, + fix_overlaps: bool = False, ) -> List[SpanGroup]: """Generate SpanGroups from BoxGroups given they can only generate spans of tokens not already allocated Args `box_groups` (List[BoxGroup]) `doc` (Document) base document annotated with pages, tokens, rows to - `center` (bool) if True, considers tokens to be overlapping with boxes only if 
their centers overlap + `center` (bool) if True, considers tokens to be overlapping with boxes only if their centers overlap `unallocated_tokens` (Optional[Dict]) of token spangroups keyed by page. If provided, will use as starting - point for determining if token is already allocated. Assumes the tokens within are of the same type as the `doc` - (i.e., tokens from both doc and the dict both have their box data in either Span.box or SpanGroup.boxgroup) + point for determining if token is already allocated. Assumes the tokens within are of the same type as the + `doc` (i.e., tokens from both doc and the dict both have their box data in either Span.box or + SpanGroup.boxgroup) + `fix_overlaps` (bool) if True, will attempt to fix overlapping spans within a SpanGroup by omitting + spans from already allocated tokens that end up contained in the derived_spans that come from MergeSpans. + This allows for the possibility of a BoxGroup that covers text to end up with a SpanGroup that is missing + spans or even has no spans since a previous BoxGroup already allocated all the underlying tokens. This + reduces the possibility of SpanGroup overlap errors, but may not return the desired SpanGroups. Returns Union (either) of: -List[SpanGroup] with each SpanGroup.spans corresponding to spans (sans boxes) of allocated tokens per box_group, @@ -162,17 +169,19 @@ def omit_span_from_derived_spans(t_span): if token_box_in_box_group: if sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) - # otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens, the assumption - # is that the token has already been allocated by a different box_group, so, we need to remove it from our - # derived spans to avoid 'SpanGroup overlap' error. + # otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens, + # the assumption is that the token has already been allocated by a different box_group, so, we need + # to remove it from our derived spans to avoid 'SpanGroup overlap' error. else: - omit_span_from_derived_spans(sg_token.spans[0]) + if fix_overlaps: + omit_span_from_derived_spans(sg_token.spans[0]) else: if sg_token in unallocated_tokens[sg_token.spans[0].box.page]: unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) # same scenario as above. else: - omit_span_from_derived_spans(sg_token.spans[0]) + if fix_overlaps: + omit_span_from_derived_spans(sg_token.spans[0]) derived_span_groups.append( SpanGroup(