From 4598d2bb3182f0f39237edd237e1219720018beb Mon Sep 17 00:00:00 2001 From: Harpo Date: Thu, 22 Feb 2024 08:21:59 -0800 Subject: [PATCH] Migrates to use s3 mmif location (#21) --- mario/pipelines/pipeline.py | 2 +- mario/pipelines/utils.py | 44 ++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/mario/pipelines/pipeline.py b/mario/pipelines/pipeline.py index f43ec27..c17f759 100644 --- a/mario/pipelines/pipeline.py +++ b/mario/pipelines/pipeline.py @@ -74,8 +74,8 @@ def run_pipeline(self): def end(self): """Upload the results to S3 and Chowda, cleanup files""" self.s3_path = f'{self.guid}/{self.batch_id}/{self.guid}.mmif' - self.update_database(self.s3_path) self.upload_mmif(self.s3_path) + self.update_database(self.s3_path) self.cleanup() print(f'Successfully processed {self.guid}') diff --git a/mario/pipelines/utils.py b/mario/pipelines/utils.py index 823d19b..12687ea 100644 --- a/mario/pipelines/utils.py +++ b/mario/pipelines/utils.py @@ -46,6 +46,22 @@ def get_asset_id(self) -> None: self.type = asset.type.value.lower() self.filename = join('/m', self.asset_name) + def get_mmif_from_database(self): + from chowda.db import engine + from chowda.models import MediaFile + from sqlmodel import Session, select + + with Session(engine) as db: + media_file = db.exec( + select(MediaFile).where(MediaFile.guid == self.guid) + ).one() + # TODO Ensure this gets the most recent mmif + location = media_file.mmifs[-1].mmif_location + # get the mmif from the S3 bucket + if not location: + raise ValueError(f'No mmif found for {self.guid}') + return self.download_mmif_from_s3(location) + def create_new_mmif(self) -> dict: from requests import post @@ -121,7 +137,7 @@ def upload_mmif(self, s3_path: str) -> None: from boto3 import client - mmif_filename = join('/m', self.guid + '.mmif') + mmif_filename = join(self.guid + '.mmif') with open(mmif_filename, 'w') as f: f.write(dumps(self.output_mmif)) @@ -136,6 +152,32 @@ def upload_mmif(self, s3_path: str) -> None: s3_path, ) print('Uploaded mmif!') + return s3_path + + def download_mmif_from_s3(self, s3_path: str): + from json import loads + from os import remove + from os.path import join + + from boto3 import client + + bucket = self.bucket if self.bucket != 'null' else 'clams-mmif' + mmif_filename = join(self.guid + '.mmif') + + print(f'Downloading {s3_path} to {mmif_filename}') + s3_client = client('s3') + s3_client.download_file( + bucket, + s3_path, + mmif_filename, + ) + + with open(mmif_filename) as file: + file_contents = file.read() + + remove(mmif_filename) + + return loads(file_contents) def cleanup(self) -> None: """delete media file and transcripts"""