diff --git a/clams_utils/aapb/converter_aapbjson.py b/clams_utils/aapb/converter_aapbjson.py
new file mode 100644
index 0000000..80fa007
--- /dev/null
+++ b/clams_utils/aapb/converter_aapbjson.py
@@ -0,0 +1,159 @@
+import argparse
+import json
+import math
+import sys
+import typing
+
+import mmif
+from lapps.discriminators import Uri
+from mmif import AnnotationTypes, DocumentTypes, Document
+from mmif.utils import timeunit_helper as tuh
+
+from clams_utils.aapb import guidhandler
+
+
+def normalize_timeframe_times(tf: mmif.Annotation) -> typing.Tuple[float, float]:
+    """
+    Return the ``(start, end)`` of a TimeFrame annotation in seconds.
+
+    The ``start`` and ``end`` properties are converted from the annotation's
+    own ``timeUnit`` to milliseconds, then divided by 1000.
+
+    NOTE(review): the conversion is called with ``fps=0``, so frame-based time
+    units are presumably unsupported here; confirm against ``timeunit_helper``.
+    """
+    unit = tf.get_property("timeUnit")
+    s = tuh.convert(tf.get_property("start"), unit, "ms", 0) / 1000
+    e = tuh.convert(tf.get_property("end"), unit, "ms", 0) / 1000
+    return s, e
+
+
+def get_parts_from_view(view: mmif.View) -> typing.List[typing.Dict[str, typing.Union[str, float]]]:
+    """
+    Build the AAPB-JSON ``parts`` list from the sentences of an ASR view.
+
+    Each ``Uri.SENTENCE`` annotation becomes one dict with ``start_time`` and
+    ``end_time`` (seconds, formatted with three decimals), ``text`` and
+    ``speaker_id``. No speaker information is read from the view; the id is a
+    running counter that starts at 1 and increases with every sentence.
+
+    :raises ValueError: if a sentence that is not directly aligned to a
+        TimeFrame has an empty ``targets`` list, or if none of its boundary
+        tokens is aligned to a TimeFrame (the span would otherwise be
+        written out as ``inf``/``-1.000``).
+    """
+    parts = []
+    for speaker_id, sent in enumerate(view.get_annotations(Uri.SENTENCE), start=1):
+        s = math.inf
+        e = -1
+        # 1. sentence is directly aligned to the time frame (the easier case)
+        for aligned in sent.get_all_aligned():
+            if aligned.at_type == AnnotationTypes.TimeFrame:
+                s, e = normalize_timeframe_times(aligned)
+                break
+        else:
+            # 2. sentence has a list of targets of tokens that are aligned to time frames;
+            # only the first and the last token are needed to find the sentence boundaries
+            targets = sent.get_property("targets")
+            if not targets:
+                raise ValueError(f"Sentence {sent.id} has no target tokens to take times from.")
+            for token_id in (targets[0], targets[-1]):
+                token = view.get_annotation_by_id(token_id)
+                for aligned in token.get_all_aligned():
+                    if aligned.at_type == AnnotationTypes.TimeFrame:
+                        tok_s, tok_e = normalize_timeframe_times(aligned)
+                        s = min(s, tok_s)
+                        e = max(e, tok_e)
+            if math.isinf(s):
+                raise ValueError(f"No TimeFrame is aligned to the tokens of sentence {sent.id}.")
+        parts.append({
+            "start_time": f"{s:.3f}",
+            "end_time": f"{e:.3f}",
+            "text": sent.get_property("text"),
+            "speaker_id": speaker_id,
+        })
+    return parts
+
+
+def convert_mmif_to_aapbjson(mmif_obj: mmif.Mmif, out_f: typing.IO, pretty=True):
+    """
+    Write the first ASR view of ``mmif_obj`` to ``out_f`` as AAPB-JSON.
+
+    A view counts as an ASR view when its metadata declares sentences,
+    TimeFrames, Alignments and a TextDocument. The output object has the keys
+    ``id`` (the AAPB GUID extracted from the location of the audio/video
+    document that the text document is aligned to), ``language`` and ``parts``.
+
+    :param mmif_obj: MMIF to read from
+    :param out_f: writable text stream that receives the JSON
+    :param pretty: indent the JSON with two spaces when true
+    :raises ValueError: if no ASR view is found, or if no GUID can be found
+        for the first ASR view
+    """
+    asr_types = (
+        Uri.SENTENCE,
+        AnnotationTypes.TimeFrame,
+        AnnotationTypes.Alignment,
+        DocumentTypes.TextDocument,
+    )
+    for view in mmif_obj.views:
+        # TODO (krim @ 8/23/24): is this the best check to grab an ASR view?
+        if not all(at_type in view.metadata.contains for at_type in asr_types):
+            continue
+        lang = 'en-US'  # default language
+        guid = None
+        for annotation in view.annotations:
+            if annotation.at_type == DocumentTypes.TextDocument:
+                lang = annotation.text_language
+                for aligned in annotation.get_all_aligned():
+                    if aligned.at_type in (DocumentTypes.AudioDocument, DocumentTypes.VideoDocument):
+                        guid = guidhandler.get_aapb_guid_from(Document(aligned.serialize()).location_address())
+                        break
+        if guid is None:
+            raise ValueError("No GUID found in the MMIF file.")
+        out_obj = {
+            'id': guid,
+            'language': lang,
+            'parts': get_parts_from_view(view),
+        }
+        json.dump(out_obj, out_f, indent=2 if pretty else None)
+        return
+    raise ValueError("No ASR view found in the MMIF file.")
+
+
+def main():
+    """
+    Command-line entry point (installed as the ``clams-aapb`` console script).
+    """
+    parser = argparse.ArgumentParser(description="Convert MMIF <-> AAPB-JSON.")
+    subparsers = parser.add_subparsers(dest='command', help='Subcommands')
+    convert_parser = subparsers.add_parser('convert', help='Convert MMIF <-> AAPB-JSON')
+    # TODO (krim @ 8/23/24): add the inverse conversion from AAPB-JSON to MMIF when the AAPB-JSON format is finalized.
+    # the two direction flags contradict each other, so argparse rejects the combination
+    direction = convert_parser.add_mutually_exclusive_group()
+    direction.add_argument('--from-mmif', action='store_true', help='conversion direction')
+    direction.add_argument('--to-mmif', action='store_true', help='conversion direction')
+    convert_parser.add_argument("-p", '--pretty', action='store_true', help="indent output json (default: False)")
+    convert_parser.add_argument("IN_FILE",
+                                nargs="?", type=argparse.FileType("r"),
+                                default=None if sys.stdin.isatty() else sys.stdin,
+                                help='input MMIF file path, or STDIN if `-` or not provided.')
+    convert_parser.add_argument("OUT_FILE",
+                                nargs="?", type=argparse.FileType("w"),
+                                default=sys.stdout,
+                                help='output AAPB-JSON file path, or STDOUT if `-` or not provided.')
+    args = parser.parse_args()
+    if args.command != 'convert':
+        # no subcommand given: fail loudly instead of silently doing nothing
+        parser.error("a subcommand is required (choose from: convert)")
+    if args.to_mmif:
+        raise NotImplementedError("Conversion from AAPB-JSON to MMIF is not implemented yet.")
+    # --from-mmif doesn't need to be specified; it is the default and only exists for clarity
+    if args.IN_FILE is None:
+        # neither a file argument nor piped STDIN is available to read from
+        convert_parser.error("no input: give IN_FILE or pipe MMIF into STDIN")
+    convert_mmif_to_aapbjson(mmif.Mmif(args.IN_FILE.read()), args.OUT_FILE, args.pretty)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/clams_utils/aapb/guidhandler.py b/clams_utils/aapb/guidhandler.py
index fdb7aa7..de7bd3d 100644
--- a/clams_utils/aapb/guidhandler.py
+++ b/clams_utils/aapb/guidhandler.py
@@ -4,13 +4,16 @@
 """
 
 import re
+from typing import Optional
 
-def get_aapb_guid_from(s: str) -> str:
+
+def get_aapb_guid_from(s: Optional[str]) -> Optional[str]:
     """
     Extracts the AAPB GUID from a string, if present.
     The function returns the GUID if found, otherwise it returns None.
     """
-
+    if s is None:
+        return None
     m = re.search(r'(cpb-aacip[-_][a-z0-9-]+).', s)
     if m is None:
         return m
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 0a5dca8..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,11 +0,0 @@
-[build-system]
-requires = ["setuptools", "setuptools-git-versioning<2"]
-build-backend = "setuptools.build_meta"
-
-[tool.setuptools-git-versioning]
-enabled = true
-
-[project]
-name = "clams_utils"
-description = "a collection of smaller helper/utility code of CLAMS platform"
-dynamic = ["version", "license", "readme", "authors", "dependencies"]
diff --git a/requirements.txt b/requirements.txt
index f229360..8c598e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 requests
+mmif-python>=1.0.19
+lapps
diff --git a/setup.cfg b/setup.cfg
index 86e5543..85b5e4c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,4 +1,5 @@
 [metadata]
+name = clams-utils
 long_description = file: README.md
 license = Apache-2.0
 author = "Brandeis Lab for Linguistics and Computation"
@@ -6,4 +7,11 @@ author_email = "admin@clams.ai"
 
 [options]
 packages = find:
-install_requires = requests
+install_requires =
+    requests
+    mmif-python>=1.0.19
+    lapps
+
+[options.entry_points]
+console_scripts =
+    clams-aapb = clams_utils.aapb.converter_aapbjson:main
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..fc1f76c
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup
+
+setup()