From 3464a02ac8e2eb093775e86ba98eb6dfceb763d0 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Thu, 26 Oct 2023 13:04:06 -0700 Subject: [PATCH 01/14] some kind of progress, still need to address overlap in sentences crossing paragraphs --- ...grobid_augment_existing_document_parser.py | 90 +++++++++---------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 24f3ca27..2ac9e3a8 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -4,7 +4,7 @@ """ from grobid_client.grobid_client import GrobidClient -from typing import List, Optional +from typing import List, Optional, Tuple import logging import os @@ -104,32 +104,40 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: # sentences within the body text, also tagged by paragraphs. # We use these to annotate the document in order to provide a hierarchical structure: # e.g. doc.sections.header, doc.sections[0].paragraphs[0].sentences[0] - section_box_groups, heading_box_groups, paragraph_box_groups, sentence_box_groups = \ - self._get_structured_body_text_box_groups(xml_root) - doc.annotate( - sections=box_groups_to_span_groups( - section_box_groups, doc, center=True - ) - ) - doc.annotate( - headings=box_groups_to_span_groups( - heading_box_groups, doc, center=True - ) - ) - doc.annotate( - paragraphs=box_groups_to_span_groups( - paragraph_box_groups, doc, center=True - ) - ) - doc.annotate( - sentences=box_groups_to_span_groups( - sentence_box_groups, doc, center=True - ) - ) + section_headings_and_sentence_box_groups_in_paragraphs = \ + self._get_structured_sentence_box_groups(xml_root) + + heading_span_groups = [] + paragraph_span_groups = [] + section_span_groups = [] + sentence_span_groups = [] + + for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: + if heading_box_group: + heading_span_groups.extend(box_groups_to_span_groups([heading_box_group], doc, center=True)) + this_section_paragraph_span_groups = [] + for sentence_box_groups in paragraphs: + this_paragraph_sentence_span_groups = box_groups_to_span_groups(sentence_box_groups, doc, center=True) + sentence_span_groups.extend(this_paragraph_sentence_span_groups) + paragraph_spans = [] + for sg in this_paragraph_sentence_span_groups: + paragraph_spans.extend(sg.spans) + this_section_paragraph_span_groups.append(SpanGroup(spans=paragraph_spans)) + paragraph_span_groups.extend(this_section_paragraph_span_groups) + section_spans = [] + for sg in this_section_paragraph_span_groups: + section_spans.extend(sg.spans) + section_span_groups.append(SpanGroup(spans=section_spans)) + + doc.annotate(headings=heading_span_groups) + doc.annotate(sentences=sentence_span_groups) + doc.annotate(paragraphs=paragraph_span_groups) + doc.annotate(sections=section_span_groups) + return doc - def _xml_coords_to_boxes(self, coords_attribute: str): + def _xml_coords_to_boxes(self, coords_attribute: str) -> List[Box]: coords_list = coords_attribute.split(";") boxes = [] for coords in coords_list: @@ -218,34 +226,24 @@ def _get_heading_box_group( ) return box_group - def _get_structured_body_text_box_groups( + def _get_structured_sentence_box_groups( self, root: et.Element - ) -> (List[BoxGroup], List[BoxGroup], List[BoxGroup], List[BoxGroup]): + ) -> List[Tuple[Optional[BoxGroup], List[List[BoxGroup]]]]: section_list_root = 
root.find(f".//tei:body", NS) - - body_sections: List[BoxGroup] = [] - body_headings: List[BoxGroup] = [] - body_paragraphs: List[BoxGroup] = [] - body_sentences: List[BoxGroup] = [] - section_divs = section_list_root.findall(f"./tei:div", NS) + + section_structures = [] for div in section_divs: - section_boxes: List[Box] = [] heading_box_group = self._get_heading_box_group(div) - if heading_box_group: - body_headings.append(heading_box_group) - section_boxes.extend(heading_box_group.boxes) + paragraphs: List[List[BoxGroup]] = [] for p in div.findall(f"./tei:p", NS): - paragraph_boxes: List[Box] = [] - paragraph_sentences: List[BoxGroup] = [] + sentence_box_groups: List[BoxGroup] = [] for s in p.findall(f"./tei:s", NS): sentence_boxes = self._xml_coords_to_boxes(s.attrib["coords"]) - paragraph_sentences.append(BoxGroup(boxes=sentence_boxes)) - paragraph_boxes.extend(sentence_boxes) - body_paragraphs.append(BoxGroup(boxes=paragraph_boxes)) - section_boxes.extend(paragraph_boxes) - body_sentences.extend(paragraph_sentences) - body_sections.append(BoxGroup(boxes=section_boxes)) - - return body_sections, body_headings, body_paragraphs, body_sentences + sentence_box_groups.append(BoxGroup(boxes=sentence_boxes)) + paragraphs.append(sentence_box_groups) + + section_structures.append([heading_box_group, paragraphs]) + + return section_structures From a39d0c13531713351492f1bf3f0da9db477450a9 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 13:34:27 -0800 Subject: [PATCH 02/14] ok cool this seems to be working! --- ...grobid_augment_existing_document_parser.py | 21 ++++++-- src/mmda/utils/tools.py | 52 +++++++++++-------- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 2ac9e3a8..034843de 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -4,7 +4,7 @@ """ from grobid_client.grobid_client import GrobidClient -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict import logging import os @@ -112,12 +112,27 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: section_span_groups = [] sentence_span_groups = [] + unallocated_section_tokens_dict: Dict[int, SpanGroup] = dict() + for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: if heading_box_group: - heading_span_groups.extend(box_groups_to_span_groups([heading_box_group], doc, center=True)) + heading_span_group, unallocated_section_tokens_dict = ( + box_groups_to_span_groups( + [heading_box_group], + doc, + center=True, + unallocated_tokens_dict=unallocated_section_tokens_dict + ) + ) + heading_span_groups.extend(heading_span_group) this_section_paragraph_span_groups = [] for sentence_box_groups in paragraphs: - this_paragraph_sentence_span_groups = box_groups_to_span_groups(sentence_box_groups, doc, center=True) + this_paragraph_sentence_span_groups, unallocated_section_tokens_dict = box_groups_to_span_groups( + sentence_box_groups, + doc, + center=True, + unallocated_tokens_dict=unallocated_section_tokens_dict + ) sentence_span_groups.extend(this_paragraph_sentence_span_groups) paragraph_spans = [] for sg in this_paragraph_sentence_span_groups: diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 1095c96c..5cdf1756 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -4,7 +4,7 
@@ from collections import defaultdict from itertools import groupby import itertools -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Optional, Union import numpy as np @@ -41,20 +41,33 @@ def allocate_overlapping_tokens_for_box( def box_groups_to_span_groups( - box_groups: List[BoxGroup], doc: Document, pad_x: bool = False, center: bool = False -) -> List[SpanGroup]: - """Generate SpanGroups from BoxGroups. + box_groups: List[BoxGroup], + doc, + pad_x: bool = False, + center: bool = False, + unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None +) -> Union[List[SpanGroup], Tuple[List[SpanGroup], Dict[int, SpanGroup]]]: + """Generate SpanGroups from BoxGroups given they can only generate spans of tokens not already allocated Args `box_groups` (List[BoxGroup]) `doc` (Document) base document annotated with pages, tokens, rows to `center` (bool) if True, considers tokens to be overlapping with boxes only if their centers overlap + `unallocated_tokens` (Optional[Dict]) of token spangroups keyed by page. If provided, will use as starting + point for determining if token is already allocated. Assumes the tokens within are of the same type as the `doc` + (i.e., tokens from both doc and the dict both have their box data in either Span.box or SpanGroup.boxgroup) Returns - List[SpanGroup] with each SpanGroup.spans corresponding to spans (sans boxes) of allocated tokens per box_group, + Union (either) of: + -List[SpanGroup] with each SpanGroup.spans corresponding to spans (sans boxes) of allocated tokens per box_group, and each SpanGroup.box_group containing original box_groups + or Tuple of: + -List[SpanGroup] as described above, and + -Dictionary of unallocated tokens keyed by page """ assert all([isinstance(group, BoxGroup) for group in box_groups]) - all_page_tokens = dict() + return_unallocated_tokens = unallocated_tokens_dict is not None + + unallocated_tokens = unallocated_tokens_dict if return_unallocated_tokens else dict() avg_token_widths = dict() derived_span_groups = [] token_box_in_box_group = None @@ -66,8 +79,8 @@ def box_groups_to_span_groups( for box in box_group.boxes: # Caching the page tokens to avoid duplicated search - if box.page not in all_page_tokens: - cur_page_tokens = all_page_tokens[box.page] = doc.pages[ + if box.page not in unallocated_tokens: + cur_page_tokens = unallocated_tokens[box.page] = doc.pages[ box.page ].tokens if token_box_in_box_group is None: @@ -89,7 +102,7 @@ def box_groups_to_span_groups( avg_token_widths[box.page] = np.average([t.spans[0].box.w for t in cur_page_tokens]) else: - cur_page_tokens = all_page_tokens[box.page] + cur_page_tokens = unallocated_tokens[box.page] # Find all the tokens within the box tokens_in_box, remaining_tokens = allocate_overlapping_tokens_for_box( @@ -101,7 +114,7 @@ def box_groups_to_span_groups( y=0.0, center=center ) - all_page_tokens[box.page] = remaining_tokens + unallocated_tokens[box.page] = remaining_tokens all_tokens_overlapping_box_group.extend(tokens_in_box) merge_spans = ( @@ -128,10 +141,12 @@ def box_groups_to_span_groups( # a token is not found to be overlapping with the box, but MergeSpans decides it is close enough to be merged) for sg_token in sg_tokens: if sg_token not in all_tokens_overlapping_box_group: - if token_box_in_box_group and sg_token in all_page_tokens[sg_token.box_group.boxes[0].page]: - all_page_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) - elif not token_box_in_box_group and sg_token in all_page_tokens[sg_token.spans[0].box.page]: - 
all_page_tokens[sg_token.spans[0].box.page].remove(sg_token) + if token_box_in_box_group and sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: + unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) + elif not token_box_in_box_group and sg_token in unallocated_tokens[sg_token.spans[0].box.page]: + unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) + + derived_span_groups.append( SpanGroup( @@ -148,20 +163,15 @@ def box_groups_to_span_groups( "future Spans wont contain box). Ensure Document is annotated with tokens " "having box stored in SpanGroup box_group.boxes") - del all_page_tokens - derived_span_groups = sorted( derived_span_groups, key=lambda span_group: span_group.start ) # ensure they are ordered based on span indices - for box_id, span_group in enumerate(derived_span_groups): span_group.id = box_id - # return self._annotate_span_group( - # span_groups=derived_span_groups, field_name=field_name - # ) - return derived_span_groups + return (derived_span_groups, unallocated_tokens) if return_unallocated_tokens else derived_span_groups + class MergeSpans: """ From df9849f645974721651e4e8f25214707b6eada2d Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 13:52:31 -0800 Subject: [PATCH 03/14] make heading spans part of section --- .../parsers/grobid_augment_existing_document_parser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 034843de..202a1684 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -115,8 +115,9 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: unallocated_section_tokens_dict: Dict[int, SpanGroup] = dict() for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: + section_spans = [] if heading_box_group: - heading_span_group, unallocated_section_tokens_dict = ( + heading_span_group_in_list, unallocated_section_tokens_dict = ( box_groups_to_span_groups( [heading_box_group], doc, @@ -124,7 +125,9 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: unallocated_tokens_dict=unallocated_section_tokens_dict ) ) - heading_span_groups.extend(heading_span_group) + heading_span_group = heading_span_group_in_list[0] + heading_span_groups.append(heading_span_group) + section_spans.extend(heading_span_group.spans) this_section_paragraph_span_groups = [] for sentence_box_groups in paragraphs: this_paragraph_sentence_span_groups, unallocated_section_tokens_dict = box_groups_to_span_groups( @@ -137,11 +140,12 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: paragraph_spans = [] for sg in this_paragraph_sentence_span_groups: paragraph_spans.extend(sg.spans) + # TODO add boxes to paragraph spangroups this_section_paragraph_span_groups.append(SpanGroup(spans=paragraph_spans)) paragraph_span_groups.extend(this_section_paragraph_span_groups) - section_spans = [] for sg in this_section_paragraph_span_groups: section_spans.extend(sg.spans) + # TODO add boxes to section spangroups section_span_groups.append(SpanGroup(spans=section_spans)) doc.annotate(headings=heading_span_groups) From 1cf469a5315876d8874a03525618c7c74c729541 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 14:35:56 -0800 Subject: [PATCH 04/14] make sentences have unique ids, give paragraphs and sections 
ids --- .../parsers/grobid_augment_existing_document_parser.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 202a1684..220442f2 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -147,7 +147,13 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: section_spans.extend(sg.spans) # TODO add boxes to section spangroups section_span_groups.append(SpanGroup(spans=section_spans)) - + + # ensure unique IDs within annotations + all_section_span_groups = [heading_span_groups, sentence_span_groups, paragraph_span_groups, section_span_groups] + for span_groups in all_section_span_groups: + for i, span_group in enumerate(span_groups): + span_group.id = i + doc.annotate(headings=heading_span_groups) doc.annotate(sentences=sentence_span_groups) doc.annotate(paragraphs=paragraph_span_groups) From 95a6e5ba609904d79c250ed9a7f7615bf0860498 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 15:40:14 -0800 Subject: [PATCH 05/14] fix 'coords' error --- ...grobid_augment_existing_document_parser.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 220442f2..6283adc8 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -209,7 +209,11 @@ def _get_box_groups( elements = item_list_root.findall(f".//tei:{item_tag}", NS) for e in elements: - coords_string = e.attrib["coords"] + try: + coords_string = e.attrib["coords"] + except KeyError: + logging.warning(f"Element with '{item_tag}' tag missing 'coords' attribute") + continue boxes = self._xml_coords_to_boxes(coords_string) grobid_id = e.attrib[ID_ATTR_KEY] if ID_ATTR_KEY in e.keys() else None @@ -241,7 +245,11 @@ def _get_heading_box_group( box_group = None heading_element = section_div.find(f".//tei:head", NS) if heading_element is not None: # elements evaluate as False if no children - coords_string = heading_element.attrib["coords"] + try: + coords_string = heading_element.attrib["coords"] + except KeyError: + logging.warning(f"Heading element missing 'coords' attribute") + return None boxes = self._xml_coords_to_boxes(coords_string) number = heading_element.attrib["n"] if "n" in heading_element.keys() else None section_title = heading_element.text @@ -265,7 +273,12 @@ def _get_structured_sentence_box_groups( for p in div.findall(f"./tei:p", NS): sentence_box_groups: List[BoxGroup] = [] for s in p.findall(f"./tei:s", NS): - sentence_boxes = self._xml_coords_to_boxes(s.attrib["coords"]) + try: + coords_string = s.attrib["coords"] + except KeyError: + logging.warning(f"Sentence element missing 'coords' attribute") + continue + sentence_boxes = self._xml_coords_to_boxes(coords_string) sentence_box_groups.append(BoxGroup(boxes=sentence_boxes)) paragraphs.append(sentence_box_groups) From 172f07308abc9e752cdb4bd1abc8be1ecb589af4 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Mon, 6 Nov 2023 18:01:53 -0800 Subject: [PATCH 06/14] pad_x for sentences --- src/mmda/parsers/grobid_augment_existing_document_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py 
b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 6283adc8..91f2904c 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -12,7 +12,7 @@ from mmda.parsers.parser import Parser from mmda.types import Metadata -from mmda.types.annotation import BoxGroup, Box, SpanGroup +from mmda.types.annotation import BoxGroup, Box, SpanGroup, Span from mmda.types.document import Document from mmda.types.names import PagesField, RowsField, TokensField from mmda.utils.tools import box_groups_to_span_groups @@ -134,6 +134,7 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: sentence_box_groups, doc, center=True, + pad_x=True, unallocated_tokens_dict=unallocated_section_tokens_dict ) sentence_span_groups.extend(this_paragraph_sentence_span_groups) From 952ac9b6d3fc3eb890da57eb260a71d8c4d0e9e1 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 12:46:48 -0800 Subject: [PATCH 07/14] IT WORKS we get nice spans for sentences for this one specific sha now --- src/mmda/utils/tools.py | 45 ++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 5cdf1756..f17fe9f2 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -136,18 +136,49 @@ def box_groups_to_span_groups( # tokens overlapping with derived spans: sg_tokens = doc.find_overlapping(SpanGroup(spans=derived_spans), "tokens") + def update_derived_spans(t_span): + # if the sg_token is in the derived_span, cut it out by updating derived_spans. + # this can happen because merge_spans finds min number of spans and can merge spans that + # cover tokens that were already allocated. We update this to avoid spangroup overlap errors. + for i, d_span in enumerate(derived_spans): + if d_span.start == t_span.start and t_span.end < d_span.end: + # unusable token_span is at start of derived_span + d_span.start = t_span.end + elif d_span.end == t_span.end and d_span.start < t_span.start < d_span.end: + # unusable token_span is at end of derived_span + d_span.end = t_span.end + elif d_span.start < t_span.start < d_span.end and t_span.end < d_span.end: + # unusable token_span is encompassed by derived_span + d_span.end = t_span.start + derived_spans.insert(i+1, Span(t_span.end, d_span.end)) + elif d_span.start == t_span.start and d_span.end == t_span.end: + # unusable token_span is equal to derived_span + derived_spans.remove(d_span) + # remove any additional tokens added to the spangroup via MergeSpans from the list of available page tokens # (this can happen if the MergeSpans algorithm merges tokens that are not adjacent, e.g. 
if `center` is True and # a token is not found to be overlapping with the box, but MergeSpans decides it is close enough to be merged) for sg_token in sg_tokens: if sg_token not in all_tokens_overlapping_box_group: - if token_box_in_box_group and sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: - unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) - elif not token_box_in_box_group and sg_token in unallocated_tokens[sg_token.spans[0].box.page]: - unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) - - - + # if token not removed from unallocated_tokens yet, do it now + if token_box_in_box_group: + if sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: + unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) + # otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens, the assumption + # is that the token has already been allocated by a different box_group, so, we need to remove it from our + # derived spans to avoid 'SpanGroup overlap' error. + else: + update_derived_spans(sg_token.spans[0]) + else: + if sg_token in unallocated_tokens[sg_token.spans[0].box.page]: + unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) + # same scenario as above. + else: + update_derived_spans(sg_token.spans[0]) + + + # if derived_span_group encompasses any tokens that were NOT in unallocated_tokens, we need to REMOVE + # that spangroup here... wait maybe not. derived_span_groups.append( SpanGroup( spans=derived_spans, From 002a8a201a251997dcb6c19e4c81d8029c64034d Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 15:00:24 -0800 Subject: [PATCH 08/14] remove spanless results (useless) --- src/mmda/parsers/grobid_augment_existing_document_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 91f2904c..4d8b0b17 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -137,7 +137,8 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: pad_x=True, unallocated_tokens_dict=unallocated_section_tokens_dict ) - sentence_span_groups.extend(this_paragraph_sentence_span_groups) + if all([sg.spans for sg in this_paragraph_sentence_span_groups]): + sentence_span_groups.extend(this_paragraph_sentence_span_groups) paragraph_spans = [] for sg in this_paragraph_sentence_span_groups: paragraph_spans.extend(sg.spans) From 8ab0d61f36d2026ad463787c766185be462f0767 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 15:16:41 -0800 Subject: [PATCH 09/14] lil rename --- src/mmda/utils/tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index f17fe9f2..8512c696 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -136,7 +136,7 @@ def box_groups_to_span_groups( # tokens overlapping with derived spans: sg_tokens = doc.find_overlapping(SpanGroup(spans=derived_spans), "tokens") - def update_derived_spans(t_span): + def omit_span_from_derived_spans(t_span): # if the sg_token is in the derived_span, cut it out by updating derived_spans. # this can happen because merge_spans finds min number of spans and can merge spans that # cover tokens that were already allocated. We update this to avoid spangroup overlap errors. 
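
The helper above, renamed to omit_span_from_derived_spans in this commit, cuts an already-allocated token range back out of the candidate spans produced by MergeSpans, so that two annotations never claim the same character offsets. Below is a minimal standalone sketch of that trimming behavior, assuming half-open integer spans; the Span dataclass and the cut_out_token_range name are illustrative stand-ins rather than the mmda API, and unlike the in-place loop above this sketch rebuilds the span list:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Span:  # stand-in for mmda.types.annotation.Span: [start, end) character offsets
        start: int
        end: int

    def cut_out_token_range(derived_spans: List[Span], t_start: int, t_end: int) -> List[Span]:
        """Remove the already-allocated range [t_start, t_end) from every candidate span."""
        trimmed: List[Span] = []
        for d in derived_spans:
            if t_end <= d.start or d.end <= t_start:
                trimmed.append(d)                          # no overlap: keep unchanged
            elif t_start <= d.start and d.end <= t_end:
                continue                                   # span fully covered by the token: drop it
            elif t_start <= d.start:
                trimmed.append(Span(t_end, d.end))         # token at the start: trim the front
            elif t_end >= d.end:
                trimmed.append(Span(d.start, t_start))     # token at the end: trim the back
            else:
                trimmed.append(Span(d.start, t_start))     # token in the middle: split in two
                trimmed.append(Span(t_end, d.end))
        return trimmed

    # cutting tokens [10, 14) out of [Span(0, 20)] yields [Span(0, 10), Span(14, 20)]

Rebuilding the list also avoids mutating or removing entries while iterating.
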
@@ -168,13 +168,13 @@ def update_derived_spans(t_span): # is that the token has already been allocated by a different box_group, so, we need to remove it from our # derived spans to avoid 'SpanGroup overlap' error. else: - update_derived_spans(sg_token.spans[0]) + omit_span_from_derived_spans(sg_token.spans[0]) else: if sg_token in unallocated_tokens[sg_token.spans[0].box.page]: unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) # same scenario as above. else: - update_derived_spans(sg_token.spans[0]) + omit_span_from_derived_spans(sg_token.spans[0]) # if derived_span_group encompasses any tokens that were NOT in unallocated_tokens, we need to REMOVE From e16bc8405a117509e85d7c8b2534cde4bee1fac9 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 15:16:48 -0800 Subject: [PATCH 10/14] mmda version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ac07d5ec..847bd5d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 'mmda' -version = '0.9.15' +version = '0.9.16' description = 'MMDA - multimodal document analysis' authors = [ {name = 'Allen Institute for Artificial Intelligence', email = 'contact@allenai.org'}, From 6c534ccab77d4f66b9afe57906fbef05ba6b2dc6 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:10:28 -0800 Subject: [PATCH 11/14] just return list --- src/mmda/utils/tools.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 8512c696..cfb1db71 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -46,7 +46,7 @@ def box_groups_to_span_groups( pad_x: bool = False, center: bool = False, unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None -) -> Union[List[SpanGroup], Tuple[List[SpanGroup], Dict[int, SpanGroup]]]: +) -> List[SpanGroup]: """Generate SpanGroups from BoxGroups given they can only generate spans of tokens not already allocated Args `box_groups` (List[BoxGroup]) @@ -65,9 +65,7 @@ def box_groups_to_span_groups( """ assert all([isinstance(group, BoxGroup) for group in box_groups]) - return_unallocated_tokens = unallocated_tokens_dict is not None - - unallocated_tokens = unallocated_tokens_dict if return_unallocated_tokens else dict() + unallocated_tokens = unallocated_tokens_dict if unallocated_tokens_dict else dict() avg_token_widths = dict() derived_span_groups = [] token_box_in_box_group = None @@ -201,7 +199,7 @@ def omit_span_from_derived_spans(t_span): for box_id, span_group in enumerate(derived_span_groups): span_group.id = box_id - return (derived_span_groups, unallocated_tokens) if return_unallocated_tokens else derived_span_groups + return derived_span_groups class MergeSpans: From 5fb4c0f729e8e12201b79dafd8c26d93d21c0862 Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:31:56 -0800 Subject: [PATCH 12/14] oops delete my thoughts --- src/mmda/utils/tools.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index cfb1db71..5ed7aa39 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -174,9 +174,6 @@ def omit_span_from_derived_spans(t_span): else: omit_span_from_derived_spans(sg_token.spans[0]) - - # if derived_span_group encompasses any tokens that were NOT in unallocated_tokens, we need to REMOVE - # that spangroup here... wait maybe not. 
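
Since box_groups_to_span_groups now returns just List[SpanGroup] (PATCH 11/14 above), callers share allocation state by passing the same unallocated_tokens_dict into each successive call rather than unpacking a returned tuple. A rough usage sketch, assuming a doc already annotated with pages, rows, and tokens, and with heading_box_group and sentence_box_groups as placeholder GROBID-derived box groups like those built by the parser above:

    from mmda.utils.tools import box_groups_to_span_groups

    # One shared dict of still-unallocated tokens, keyed by page index; the
    # function fills it in and prunes it as box groups claim their tokens.
    unallocated_tokens_dict = dict()

    heading_span_groups = box_groups_to_span_groups(
        [heading_box_group],        # placeholder BoxGroup for a section heading
        doc,                        # placeholder Document with pages/rows/tokens annotated
        center=True,
        unallocated_tokens_dict=unallocated_tokens_dict,
    )

    sentence_span_groups = box_groups_to_span_groups(
        sentence_box_groups,        # placeholder BoxGroups for one paragraph's sentences
        doc,
        center=True,
        pad_x=True,
        unallocated_tokens_dict=unallocated_tokens_dict,  # same dict, so heading tokens stay claimed
    )

Note that starting from an initially empty dict only behaves this way once the truthiness check is switched to `is not None` (PATCH 13/14 below).
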
derived_span_groups.append( SpanGroup( spans=derived_spans, From 8336d06fbab3c8d2feb83ca19ffa3e6d0c06d68c Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:40:56 -0800 Subject: [PATCH 13/14] oops fix my error made when switching to just list being returned --- src/mmda/utils/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 5ed7aa39..6a156ed4 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -65,7 +65,7 @@ def box_groups_to_span_groups( """ assert all([isinstance(group, BoxGroup) for group in box_groups]) - unallocated_tokens = unallocated_tokens_dict if unallocated_tokens_dict else dict() + unallocated_tokens = unallocated_tokens_dict if unallocated_tokens_dict is not None else dict() avg_token_widths = dict() derived_span_groups = [] token_box_in_box_group = None From 7af1e9370515a87afb4c4b7111d4dbf358fd366a Mon Sep 17 00:00:00 2001 From: Angele Zamarron Date: Tue, 7 Nov 2023 16:52:22 -0800 Subject: [PATCH 14/14] new fix_overlaps param --- ...grobid_augment_existing_document_parser.py | 10 ++++--- src/mmda/utils/tools.py | 27 ++++++++++++------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/src/mmda/parsers/grobid_augment_existing_document_parser.py b/src/mmda/parsers/grobid_augment_existing_document_parser.py index 4d8b0b17..2f7406bf 100644 --- a/src/mmda/parsers/grobid_augment_existing_document_parser.py +++ b/src/mmda/parsers/grobid_augment_existing_document_parser.py @@ -117,12 +117,13 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: for heading_box_group, paragraphs in section_headings_and_sentence_box_groups_in_paragraphs: section_spans = [] if heading_box_group: - heading_span_group_in_list, unallocated_section_tokens_dict = ( + heading_span_group_in_list = ( box_groups_to_span_groups( [heading_box_group], doc, center=True, - unallocated_tokens_dict=unallocated_section_tokens_dict + unallocated_tokens_dict=unallocated_section_tokens_dict, + fix_overlaps=True, ) ) heading_span_group = heading_span_group_in_list[0] @@ -130,12 +131,13 @@ def _parse_xml_onto_doc(self, xml: str, doc: Document) -> Document: section_spans.extend(heading_span_group.spans) this_section_paragraph_span_groups = [] for sentence_box_groups in paragraphs: - this_paragraph_sentence_span_groups, unallocated_section_tokens_dict = box_groups_to_span_groups( + this_paragraph_sentence_span_groups = box_groups_to_span_groups( sentence_box_groups, doc, center=True, pad_x=True, - unallocated_tokens_dict=unallocated_section_tokens_dict + unallocated_tokens_dict=unallocated_section_tokens_dict, + fix_overlaps=True, ) if all([sg.spans for sg in this_paragraph_sentence_span_groups]): sentence_span_groups.extend(this_paragraph_sentence_span_groups) diff --git a/src/mmda/utils/tools.py b/src/mmda/utils/tools.py index 6a156ed4..d1482c1b 100644 --- a/src/mmda/utils/tools.py +++ b/src/mmda/utils/tools.py @@ -45,16 +45,23 @@ def box_groups_to_span_groups( doc, pad_x: bool = False, center: bool = False, - unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None + unallocated_tokens_dict: Optional[Dict[int, SpanGroup]] = None, + fix_overlaps: bool = False, ) -> List[SpanGroup]: """Generate SpanGroups from BoxGroups given they can only generate spans of tokens not already allocated Args `box_groups` (List[BoxGroup]) `doc` (Document) base document annotated with pages, tokens, rows to - `center` (bool) if True, considers tokens to be overlapping with boxes only if 
their centers overlap + `center` (bool) if True, considers tokens to be overlapping with boxes only if their centers overlap `unallocated_tokens` (Optional[Dict]) of token spangroups keyed by page. If provided, will use as starting - point for determining if token is already allocated. Assumes the tokens within are of the same type as the `doc` - (i.e., tokens from both doc and the dict both have their box data in either Span.box or SpanGroup.boxgroup) + point for determining if token is already allocated. Assumes the tokens within are of the same type as the + `doc` (i.e., tokens from both doc and the dict both have their box data in either Span.box or + SpanGroup.boxgroup) + `fix_overlaps` (bool) if True, will attempt to fix overlapping spans within a SpanGroup by omitting + spans from already allocated tokens that end up contained in the derived_spans that come from MergeSpans. + This allows for the possibility of a BoxGroup that covers text to end up with a SpanGroup that is missing + spans or even has no spans since a previous BoxGroup already allocated all the underlying tokens. This + reduces the possibility of SpanGroup overlap errors, but may not return the desired SpanGroups. Returns Union (either) of: -List[SpanGroup] with each SpanGroup.spans corresponding to spans (sans boxes) of allocated tokens per box_group, @@ -162,17 +169,19 @@ def omit_span_from_derived_spans(t_span): if token_box_in_box_group: if sg_token in unallocated_tokens[sg_token.box_group.boxes[0].page]: unallocated_tokens[sg_token.box_group.boxes[0].page].remove(sg_token) - # otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens, the assumption - # is that the token has already been allocated by a different box_group, so, we need to remove it from our - # derived spans to avoid 'SpanGroup overlap' error. + # otherwise, if it is in neither all_tokens_overlapping_box_group nor unallocated_tokens, + # the assumption is that the token has already been allocated by a different box_group, so, we need + # to remove it from our derived spans to avoid 'SpanGroup overlap' error. else: - omit_span_from_derived_spans(sg_token.spans[0]) + if fix_overlaps: + omit_span_from_derived_spans(sg_token.spans[0]) else: if sg_token in unallocated_tokens[sg_token.spans[0].box.page]: unallocated_tokens[sg_token.spans[0].box.page].remove(sg_token) # same scenario as above. else: - omit_span_from_derived_spans(sg_token.spans[0]) + if fix_overlaps: + omit_span_from_derived_spans(sg_token.spans[0]) derived_span_groups.append( SpanGroup(