From 9b5ac32a7a6a21b60d57cc48072d75868c82961d Mon Sep 17 00:00:00 2001 From: 1192119703jzx <147675149+1192119703jzx@users.noreply.github.com> Date: Sun, 23 Jun 2024 01:18:40 -0400 Subject: [PATCH 1/2] Update ocr.py --- ocr.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/ocr.py b/ocr.py index a964296..d63e9ea 100644 --- a/ocr.py +++ b/ocr.py @@ -13,6 +13,13 @@ import cache +""" +Helper function for showing debug information + +def some_function(x): + from utils import app # import inside function + app.logger.debug(x) +""" class OCRFrame(): """ @@ -62,10 +69,10 @@ def add_bounding_box(self, anno, mmif: Mmif): else: for alignment_anns in mmif.get_alignments(AnnotationTypes.BoundingBox, AnnotationTypes.TimePoint).values(): for alignment_ann in alignment_anns: - if alignment_ann.get('source') == anno.id: + if alignment_ann.get('source') == anno.long_id: timepoint_anno = mmif[alignment_ann.get('target')] break - elif alignment_ann.get('target') == anno.id: + elif alignment_ann.get('target') == anno.long_id: timepoint_anno = mmif[alignment_ann.get('source')] break if timepoint_anno: @@ -90,7 +97,7 @@ def add_timeframe(self, anno, mmif): if "targets" in anno.properties: start_id, end_id = anno.properties.get("targets")[0], anno.properties.get("targets")[-1] anno_parent = mmif.get_view_by_id(anno.parent) - start_anno, end_anno = mmif[start_id], mmif[end_id] + start_anno, end_anno = anno_parent.get_annotation_by_id(start_id), anno_parent.get_annotation_by_id(end_id) start = convert_timepoint(mmif, start_anno, "frames") end = convert_timepoint(mmif, end_anno, "frames") start_secs = convert_timepoint(mmif, start_anno, "seconds") @@ -121,10 +128,9 @@ def add_timepoint(self, anno, mmif, skip_if_view_has_frames=True): self.frametype = anno.properties.get("label") def add_text_document(self, anno): - t = anno.properties.get("text_value") or anno.properties.get("text").value - if t: - text_val = re.sub(r'([\\\/\|\"\'])', r'\1 ', t) - self.text = self.text + [text_val] if text_val not in self.text else self.text + t = anno.properties.get("text_value") or anno.text_value + text_val = re.sub(r'([\\\/\|\"\'])', r'\1 ', t) + self.text = self.text + [text_val] if text_val not in self.text else self.text def get_ocr_frames(view, mmif): @@ -139,6 +145,8 @@ def get_ocr_frames(view, mmif): # Account for alignment in either direction frame = OCRFrame(source, mmif) + if target.at_type == DocumentTypes.TextDocument: + frame.add_timepoint(source, mmif, skip_if_view_has_frames=False) frame.update(target, mmif) i = frame.frame_num if frame.frame_num is not None else frame.range From 1be0ee91db5eb1a540654771945d55231e26c4d3 Mon Sep 17 00:00:00 2001 From: 1192119703jzx <147675149+1192119703jzx@users.noreply.github.com> Date: Sun, 23 Jun 2024 01:20:57 -0400 Subject: [PATCH 2/2] Update ocr.py --- ocr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocr.py b/ocr.py index d63e9ea..702884b 100644 --- a/ocr.py +++ b/ocr.py @@ -129,8 +129,9 @@ def add_timepoint(self, anno, mmif, skip_if_view_has_frames=True): def add_text_document(self, anno): t = anno.properties.get("text_value") or anno.text_value - text_val = re.sub(r'([\\\/\|\"\'])', r'\1 ', t) - self.text = self.text + [text_val] if text_val not in self.text else self.text + if t: + text_val = re.sub(r'([\\\/\|\"\'])', r'\1 ', t) + self.text = self.text + [text_val] if text_val not in self.text else self.text def get_ocr_frames(view, mmif):