clamsproject · keighrim · Aug 23, 2024 · Jan 29, 2024 · Jan 30, 2024 · Jul 11, 2024
diff --git a/clams_utils/aapb/converter_aapbjson.py b/clams_utils/aapb/converter_aapbjson.py
@@ -0,0 +1,117 @@
+import argparse
+import json
+import math
+import sys
+import typing
+
+import mmif
+from lapps.discriminators import Uri
+from mmif import AnnotationTypes, DocumentTypes, Document
+from mmif.utils import timeunit_helper as tuh
+
+from clams_utils.aapb import guidhandler
+
+
+def normalize_timeframe_times(tf: mmif.Annotation) -> typing.Tuple[float, float]:
+    """
+    Read the boundaries of a time frame annotation and return them in seconds.
+
+    Both ``start`` and ``end`` are converted from the annotation's own
+    ``timeUnit`` to milliseconds with ``timeunit_helper.convert`` and then
+    divided by 1000.
+
+    :param tf: annotation carrying ``start``, ``end`` and ``timeUnit`` properties
+    :return: ``(start, end)`` as a pair of numbers in seconds
+    """
+    def in_seconds(boundary: str) -> float:
+        # the trailing 0 is handed to `convert` as its last positional argument
+        # (presumably the frame rate, unused for time-based units -- TODO confirm)
+        millis = tuh.convert(tf.get_property(boundary), tf.get_property("timeUnit"), "ms", 0)
+        return millis / 1000
+
+    return in_seconds("start"), in_seconds("end")
+
+
+def get_parts_from_view(view: mmif.View) -> typing.List[typing.Dict[str, typing.Union[str, float]]]:
+    """
+    Build the AAPB-JSON ``parts`` list from the sentences of a single view.
+
+    For every ``Uri.SENTENCE`` annotation in ``view`` the time span is resolved
+    in one of two ways:
+
+    1. from the first ``TimeFrame`` annotation aligned to the sentence itself;
+    2. otherwise from the ``TimeFrame`` annotations aligned to the first and
+       the last annotation listed in the sentence's ``targets`` property,
+       taking the earliest start and the latest end among them.
+
+    :param view: the view holding the sentences, time frames and alignments
+    :return: one dict per sentence with the keys ``start_time`` and
+             ``end_time`` (seconds, formatted with three decimals), ``text``
+             and ``speaker_id`` (a running number starting at 1)
+    :raises ValueError: when neither strategy finds a time frame for a
+                        sentence; without this check the output would carry
+                        the placeholder values ``inf`` and ``-1.000``
+    """
+    parts = []
+    for speaker_id, sent in enumerate(view.get_annotations(Uri.SENTENCE), start=1):
+        span = None
+        # 1. sentence is directly aligned to the time frame (the easier case)
+        for aligned in sent.get_all_aligned():
+            if aligned.at_type == AnnotationTypes.TimeFrame:
+                span = normalize_timeframe_times(aligned)
+                break
+        # 2. sentence has a list of targets of tokens that are aligned to time frames
+        if span is None:
+            targets = sent.get_property("targets")
+            # only the boundary tokens matter for the extent of the sentence
+            boundary_tokens = [
+                view.get_annotation_by_id(targets[0]),
+                view.get_annotation_by_id(targets[-1]),
+            ]
+            token_spans = [
+                normalize_timeframe_times(aligned)
+                for token in boundary_tokens
+                for aligned in token.get_all_aligned()
+                if aligned.at_type == AnnotationTypes.TimeFrame
+            ]
+            if not token_spans:
+                raise ValueError(
+                    f"No TimeFrame found for sentence {sent.id}: neither the sentence "
+                    f"nor its first/last target is aligned to one."
+                )
+            span = (
+                min(tok_s for tok_s, _ in token_spans),
+                max(tok_e for _, tok_e in token_spans),
+            )
+        s, e = span
+        parts.append({
+            "start_time": f"{s:.3f}",
+            "end_time": f"{e:.3f}",
+            "text": sent.get_property("text"),
+            "speaker_id": speaker_id,
+        })
+    return parts
+
+
+def convert_mmif_to_aapbjson(mmif_obj: mmif.Mmif, out_f: typing.IO, pretty=True):
+    """
+    Write the first ASR-looking view of a MMIF object to ``out_f`` as AAPB-JSON.
+
+    A view qualifies when its ``contains`` metadata lists sentences, time
+    frames, alignments and text documents. Only the first qualifying view is
+    converted. The emitted JSON object has the keys ``id`` (the AAPB GUID taken
+    from the location of an audio/video document aligned to a text document of
+    the view), ``language`` and ``parts``.
+
+    :param mmif_obj: the MMIF object to convert
+    :param out_f: writable text stream that receives the JSON
+    :param pretty: indent the JSON with two spaces when true, otherwise write
+                   it on a single line
+    :raises ValueError: when no view qualifies, or when no GUID can be found
+                        for the qualifying view
+    """
+    required_types = (
+        Uri.SENTENCE,
+        AnnotationTypes.TimeFrame,
+        AnnotationTypes.Alignment,
+        DocumentTypes.TextDocument,
+    )
+    media_types = (DocumentTypes.AudioDocument, DocumentTypes.VideoDocument)
+    for view in mmif_obj.views:
+        # TODO (krim @ 8/23/24): is this the best check to grab an ASR view?
+        if not all(at_type in view.metadata.contains for at_type in required_types):
+            continue
+        lang = 'en-US'  # default language
+        guid = None
+        for annotation in view.annotations:
+            if annotation.at_type != DocumentTypes.TextDocument:
+                continue
+            # a falsy (empty or None) language must not wipe out the default
+            lang = annotation.text_language or lang
+            for aligned in annotation.get_all_aligned():
+                if aligned.at_type in media_types:
+                    found = guidhandler.get_aapb_guid_from(Document(aligned.serialize()).location_address())
+                    # never let a media document without a GUID discard one
+                    # that an earlier text document already provided
+                    if found is not None:
+                        guid = found
+                    break
+        if guid is None:
+            raise ValueError("No GUID found in the MMIF file.")
+        out_obj = {
+            'id': guid,
+            'language': lang,
+            'parts': get_parts_from_view(view),
+        }
+        json.dump(out_obj, out_f, indent=2 if pretty else None)
+        return
+    raise ValueError("No ASR view found in the MMIF file.")
+
+
+def main():
+    """
+    Command-line entry point for the MMIF <-> AAPB-JSON converter.
+
+    Provides a single ``convert`` subcommand. Input is read from ``IN_FILE``
+    (or STDIN when something is piped in) and the result is written to
+    ``OUT_FILE`` (or STDOUT). Only the MMIF -> AAPB-JSON direction exists.
+
+    :raises NotImplementedError: when ``--to-mmif`` is requested
+    """
+    parser = argparse.ArgumentParser(description="Convert MMIF <-> AAPB-JSON.")
+    subparsers = parser.add_subparsers(dest='command', help='Subcommands')
+    convert_parser = subparsers.add_parser('convert', help='Convert MMIF <-> AAPB-JSON')
+    # TODO (krim @ 8/23/24): add the inverse conversion from AAPB-JSON to MMIF when the AAPB-JSON format is finalized.
+    # the two direction flags contradict each other, so argparse rejects the combination
+    direction = convert_parser.add_mutually_exclusive_group()
+    direction.add_argument('--from-mmif', action='store_true', help='conversion direction')
+    direction.add_argument('--to-mmif', action='store_true', help='conversion direction')
+    convert_parser.add_argument("-p", '--pretty', action='store_true', help="indent output json (default: False)")
+    convert_parser.add_argument("IN_FILE",
+                                nargs="?", type=argparse.FileType("r"),
+                                default=None if sys.stdin.isatty() else sys.stdin,
+                                help='input MMIF file path, or STDIN if `-` or not provided.')
+    convert_parser.add_argument("OUT_FILE",
+                                nargs="?", type=argparse.FileType("w"),
+                                default=sys.stdout,
+                                help='output file path, or STDOUT if `-` or not provided.')
+    args = parser.parse_args()
+    if args.command != 'convert':
+        # no subcommand given: show the usage instead of exiting silently
+        parser.print_help()
+        return
+    if args.to_mmif:
+        raise NotImplementedError("Conversion from AAPB-JSON to MMIF is not implemented yet.")
+    # from here on it is MMIF -> AAPB-JSON; --from-mmif is optional and only there for clarity
+    if args.IN_FILE is None:
+        # happens when no path is given and STDIN is an interactive terminal
+        convert_parser.error("no input: provide IN_FILE or pipe a MMIF file to STDIN.")
+    convert_mmif_to_aapbjson(mmif.Mmif(args.IN_FILE.read()), args.OUT_FILE, args.pretty)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/clams_utils/aapb/guidhandler.py b/clams_utils/aapb/guidhandler.py
@@ -4,13 +4,16 @@
 """
 
 import re
+from typing import Optional
 
-def get_aapb_guid_from(s: str) -> str:
+
+def get_aapb_guid_from(s: Optional[str]) -> Optional[str]:
     """
     Extracts the AAPB GUID from a string, if present.
     The function returns the GUID if found, otherwise it returns None.
     """
-
+    if s is None:
+        return None
     m = re.search(r'(cpb-aacip[-_][a-z0-9-]+).', s)
     if m is None:
         return m

diff --git a/pyproject.toml b/pyproject.toml
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,3 @@
 requests
+mmif-python>=1.0.19
+lapps
diff --git a/setup.cfg b/setup.cfg
@@ -1,9 +1,16 @@
 [metadata]
+name = clams-utils
 long_description = file: README.md
 license = Apache-2.0
 author = "Brandeis Lab for Linguistics and Computation"
 author_email = "[email protected]"
 
 [options]
 packages = find:
-install_requires = requests
+# keep in sync with requirements.txt
+install_requires =
+    requests
+    mmif-python>=1.0.19
+    lapps
+
+[options.entry_points]
+console_scripts =
+    clams-aapb = clams_utils.aapb.converter_aapbjson:main
diff --git a/setup.py b/setup.py
@@ -0,0 +1,3 @@
+# Packaging is configured declaratively in setup.cfg (metadata, options and
+# entry points); this file only invokes setuptools.
+from setuptools import setup
+
+setup()