Skip to content

Commit

Permalink
Migrates to use s3 mmif location (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
mrharpo authored Feb 22, 2024
1 parent 6f592b6 commit 4598d2b
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 2 deletions.
2 changes: 1 addition & 1 deletion mario/pipelines/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ def run_pipeline(self):
def end(self):
    """Upload the results to S3, record them in Chowda, and clean up files.

    The MMIF is stored under ``<guid>/<batch_id>/<guid>.mmif``; that key is
    kept on ``self.s3_path`` and passed to both the upload and the database
    update.
    """
    self.s3_path = f'{self.guid}/{self.batch_id}/{self.guid}.mmif'
    # Upload before touching the database so a failed upload never leaves
    # the database pointing at an object that does not exist, and record
    # the location exactly once.
    self.upload_mmif(self.s3_path)
    self.update_database(self.s3_path)
    self.cleanup()
    print(f'Successfully processed {self.guid}')

Expand Down
44 changes: 43 additions & 1 deletion mario/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,22 @@ def get_asset_id(self) -> None:
self.type = asset.type.value.lower()
self.filename = join('/m', self.asset_name)

def get_mmif_from_database(self):
    """Load the last-listed MMIF for ``self.guid`` via its stored S3 location.

    Looks up the ``MediaFile`` row whose ``guid`` matches ``self.guid``,
    takes the ``mmif_location`` of the last entry in its ``mmifs``
    collection, and downloads that object with ``download_mmif_from_s3``.

    Returns:
        Whatever ``download_mmif_from_s3`` returns for the stored location.

    Raises:
        ValueError: If the media file has no MMIF entries, or the last
            entry has an empty location.

    Note:
        ``.one()`` also raises if the query does not match exactly one
        row; that exception is left to propagate.
    """
    from chowda.db import engine
    from chowda.models import MediaFile
    from sqlmodel import Session, select

    with Session(engine) as db:
        media_file = db.exec(
            select(MediaFile).where(MediaFile.guid == self.guid)
        ).one()
        # TODO Ensure this gets the most recent mmif
        mmifs = media_file.mmifs
        # Guard the empty collection: indexing [-1] on it would raise a
        # bare IndexError instead of the descriptive error below.
        location = mmifs[-1].mmif_location if mmifs else None
    # get the mmif from the S3 bucket
    if not location:
        raise ValueError(f'No mmif found for {self.guid}')
    return self.download_mmif_from_s3(location)

def create_new_mmif(self) -> dict:
from requests import post

Expand Down Expand Up @@ -121,7 +137,7 @@ def upload_mmif(self, s3_path: str) -> None:

from boto3 import client

mmif_filename = join('/m', self.guid + '.mmif')
mmif_filename = join(self.guid + '.mmif')

with open(mmif_filename, 'w') as f:
f.write(dumps(self.output_mmif))
Expand All @@ -136,6 +152,32 @@ def upload_mmif(self, s3_path: str) -> None:
s3_path,
)
print('Uploaded mmif!')
return s3_path

def download_mmif_from_s3(self, s3_path: str):
    """Download an MMIF file from S3 and return its parsed JSON content.

    The object is downloaded to ``<guid>.mmif`` in the current working
    directory, parsed, and the local copy is always deleted afterwards,
    even when reading or parsing fails.

    Args:
        s3_path: Key of the MMIF object within the bucket.

    Returns:
        The deserialized JSON document.
    """
    from json import load
    from os import remove

    from boto3 import client

    # The literal string 'null' means "no bucket configured": fall back to
    # the default bucket.
    bucket = self.bucket if self.bucket != 'null' else 'clams-mmif'
    mmif_filename = self.guid + '.mmif'

    print(f'Downloading {s3_path} to {mmif_filename}')
    s3_client = client('s3')
    s3_client.download_file(
        bucket,
        s3_path,
        mmif_filename,
    )

    # Only start the cleanup guard once the download has succeeded, so a
    # failed download is not masked by a FileNotFoundError from remove().
    try:
        with open(mmif_filename, encoding='utf-8') as file:
            return load(file)
    finally:
        remove(mmif_filename)

def cleanup(self) -> None:
"""delete media file and transcripts"""
Expand Down

0 comments on commit 4598d2b

Please sign in to comment.