You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
from mmda.recipes.core_recipe import CoreRecipe
file_name = 'a85f7a895ed9cbe09a90b8b449ad7356fb92de6a.pdf'
doc = recipe_doc.from_path(file_name)
Stack trace:
File ~/Documents/codes/git/ai2/s2/mmda/src/mmda/recipes/core_recipe.py:56, in CoreRecipe.from_path(self, pdfpath)
53 equations = self.effdet_mfd_predictor.predict(document=doc)
55 # we annotate layout info in the document
---> 56 doc.annotate(layout=layout)
58 # list annotations separately
59 doc.annotate(equations=equations)
File ~/Documents/codes/git/ai2/s2/mmda/src/mmda/types/document.py:97, in Document.annotate(self, is_overwrite, **kwargs)
91 span_groups = self._annotate_span_group(
92 span_groups=annotations, field_name=field_name
93 )
94 elif annotation_type == BoxGroup:
95 # TODO: not good. BoxGroups should be stored on their own, not auto-generating SpanGroups.
96 span_groups = self._annotate_span_group(
---> 97 span_groups=box_groups_to_span_groups(annotations, self), field_name=field_name
98 )
99 else:
100 raise NotImplementedError(
101 f"Unsupported annotation type {annotation_type} for {field_name}"
102 )
File ~/Documents/codes/git/ai2/s2/mmda/src/mmda/utils/tools.py:70, in box_groups_to_span_groups(box_groups, doc, pad_x, center)
66 for box in box_group.boxes:
67
68 # Caching the page tokens to avoid duplicated search
69 if box.page not in all_page_tokens:
---> 70 cur_page_tokens = all_page_tokens[box.page] = doc.pages[
71 box.page
72 ].tokens
73 if token_box_in_box_group is None:
74 # Determine whether box is stored on token SpanGroup span.box or in the box_group
75 token_box_in_box_group = all(
76 [
77 (
(...)
82 ]
83 )
IndexError: list index out of range
It appears that the doc has fewer pages than are referenced by the box_groups, e.g.
Here is the code to reproduce the error
Stack trace:
It appears that the doc has fewer pages than are referenced by the box_groups, e.g.
ipdb> set([box.page for box_group in box_groups for box in box_group])
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36}
ipdb> len(doc.pages)
35
It appears that doc.pages is missing some of the pages.
List of shas: 736aea59f4c4d6d52ffe5a5ffabc6f734e142239, a85f7a895ed9cbe09a90b8b449ad7356fb92de6a, 0197e4b6a68e920019b3bb2ae2acde6b61eb96c5
More errors can be found in this Datadog log.
The text was updated successfully, but these errors were encountered: