removed reliance on browser session, replaced uuid with sha1 hash (#26)

clamsproject · Aug 27, 2023 · c86e1e2 · c86e1e2
1 parent aeb7f41
commit c86e1e2
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 54 deletions.
diff --git a/app.py b/app.py
@@ -1,16 +1,15 @@
+import hashlib
 import os
 import secrets
 import sys
-from pathlib import Path
 from threading import Thread
 
-import shortuuid
-from flask import request, render_template, flash, send_from_directory, session, redirect
+from flask import request, render_template, flash, send_from_directory, redirect
 from mmif.serialize import Mmif
 
 import cache
 from cache import set_last_access, cleanup
-from utils import app, render_ocr, get_media, prep_annotations, prepare_ocr_visualization
+from utils import app, render_ocr, documents_to_htmls, prep_annotations, prepare_ocr_visualization
 
 
 @app.route('/')
@@ -82,38 +81,58 @@ def send_js(path):
     return send_from_directory("uv", path)
 
 
-def render_mmif(mmif_str):
+def render_mmif(mmif_str, viz_id):
+    """Render the player page for an MMIF string.
+
+    ``viz_id`` is handed on to the helpers that build the media HTML and
+    the annotation tabs for this visualization.
+    """
     mmif = Mmif(mmif_str)
-    media = get_media(mmif)
-    annotations = prep_annotations(mmif)
+    media = documents_to_htmls(mmif, viz_id)
+    annotations = prep_annotations(mmif, viz_id)
     return render_template('player.html',
                            media=media, annotations=annotations)
 
 
-def upload_file(file):
+def upload_file(in_mmif):
+    """Cache an uploaded MMIF file, render it and send the client to it.
+
+    The visualization ID is the SHA-1 hex digest of the uploaded bytes, so
+    the same content always maps to the same cache directory.
+
+    :param in_mmif: object whose ``read()`` returns the raw MMIF bytes
+        (presumably the uploaded file from the request -- confirm at caller)
+    :return: a plain-text message for curl clients, otherwise a redirect
+        to ``/display/<viz_id>``
+    """
     # Save file locally
-    uuid = shortuuid.uuid()
-    session["mmif_id"] = uuid
-    app.logger.debug(uuid)
-    path = Path(app.root_path) / 'static' / 'tmp' / uuid
-    os.makedirs(path)
+    in_mmif_bytes = in_mmif.read()
+    in_mmif_str = in_mmif_bytes.decode('utf-8')
+    viz_id = hashlib.sha1(in_mmif_bytes).hexdigest()
+    app.logger.debug(viz_id)
+    path = cache.get_cache_path() / viz_id
+    os.makedirs(path, exist_ok=True)
     set_last_access(path)
-    file.save(os.path.join(path, "file.mmif"))
-    with open(os.path.join(path, "file.mmif")) as fh:
-        mmif_str = fh.read()
-    html_page = render_mmif(mmif_str)
+    # Write the upload byte-for-byte: text mode would re-encode it with the
+    # platform default encoding, which need not be UTF-8.
+    with open(path / 'file.mmif', 'wb') as in_mmif_file:
+        in_mmif_file.write(in_mmif_bytes)
+    html_page = render_mmif(in_mmif_str, viz_id)
     with open(os.path.join(path, "index.html"), "w") as f:
         f.write(html_page)
-
     # Perform cleanup
     t = Thread(target=cleanup)
     t.daemon = True
-    t.run()
+    # start() spawns the thread; run() would execute cleanup() inline and
+    # block the response until it finished.
+    t.start()
 
-    agent = request.headers.get('User-Agent')
+    # The User-Agent header can be missing; fall back to an empty string so
+    # .lower() is never called on None.
+    agent = request.headers.get('User-Agent', '')
     if 'curl' in agent.lower():
-        return f"Visualization ID is {uuid}\nYou can access the visualized file at /display/{uuid}\n"
-    return redirect(f"/display/{uuid}", code=301)
+        return f"Visualization ID is {viz_id}\nYou can access the visualized file at /display/{viz_id}\n"
+    return redirect(f"/display/{viz_id}", code=301)
 
 
 if __name__ == '__main__':

diff --git a/cache.py b/cache.py
@@ -10,7 +10,20 @@
 
 
 def get_cache_path():
-    return pathlib.Path(app.root_path) / "static" / "tmp"
+    # Root directory of cached visualizations, under the app's static folder.
+    return pathlib.Path(app.static_folder) / "tmp"
+
+
+def get_cache_relpath(full_path):
+    """Return full_path relative to the static folder, as a '/'-rooted string.
+
+    :param full_path: path of a file located under ``app.static_folder``
+    :raises ValueError: if full_path is not inside the static folder
+    """
+    # relative_to() validates the prefix instead of blindly slicing it off,
+    # and as_posix() keeps forward slashes on every platform.
+    rel = pathlib.Path(full_path).relative_to(app.static_folder)
+    return "/" + rel.as_posix()
 
 
 def set_last_access(path):

diff --git a/iiif_utils.py b/iiif_utils.py
@@ -1,17 +1,17 @@
+import datetime
 import json
 import os
-import pathlib
 import tempfile
 from typing import Dict
 
 import mmif
-from flask import url_for, session
+from flask import url_for
 from mmif import AnnotationTypes, DocumentTypes, Mmif
-import datetime
+
 import cache
 
 
-def generate_iiif_manifest(in_mmif: mmif.Mmif):
+def generate_iiif_manifest(in_mmif: mmif.Mmif, viz_id):
     iiif_json = {
         "@context": "http://iiif.io/api/presentation/2/context.json",
         "id": "http://0.0.0.0:5000/mmif_example_manifest.json",
@@ -29,7 +29,7 @@ def generate_iiif_manifest(in_mmif: mmif.Mmif):
     }
     add_canvas_from_documents(in_mmif, iiif_json)
     add_structure_from_timeframe(in_mmif, iiif_json)
-    return save_manifest(iiif_json)
+    return save_manifest(iiif_json, viz_id)
 
 
 def add_canvas_from_documents(in_mmif, iiif_json):
@@ -106,7 +106,7 @@ def add_structure_from_timeframe(in_mmif: Mmif, iiif_json: Dict):
         iiif_json["structures"].append(view_range)
 
 
-def save_manifest(iiif_json: Dict) -> str:
+def save_manifest(iiif_json: Dict, viz_id) -> str:
     # generate a iiif manifest and save output file
     manifest = tempfile.NamedTemporaryFile(
         'w', dir=str(cache.get_cache_path() / viz_id), suffix='.json', delete=False)

diff --git a/utils.py b/utils.py
@@ -2,7 +2,7 @@
 from datetime import timedelta
 from io import StringIO
 
-from flask import Flask, session
+from flask import Flask
 from lapps.discriminators import Uri
 from mmif import DocumentTypes
 from mmif.serialize.annotation import Text
@@ -19,8 +19,11 @@
 app.secret_key = 'your_secret_key_here'
 
 
-def get_alignments(alignment_view):
-    vtt_file = tempfile.NamedTemporaryFile('w', dir=str(cache.get_cache_path() / session['mmif_id']), suffix='.vtt', delete=False)
+def asr_alignments_to_vtt(alignment_view, viz_id):
+    vtt_filename = cache.get_cache_path() / viz_id / f"{alignment_view.id.replace(':', '-')}.vtt" 
+    if vtt_filename.exists():
+        return str(vtt_filename)
+    vtt_file = open(vtt_filename, 'w')
     vtt_file.write("WEBVTT\n\n")
     annotations = alignment_view.annotations
     timeframe_at_type = [at_type for at_type in alignment_view.metadata.contains if at_type.shortname == "TimeFrame"][0]
@@ -51,7 +54,7 @@ def get_alignments(alignment_view):
                 vtt_file.write(f'{vtt_start} --> {vtt_end}\n{" ".join(texts)}\n\n')
                 vtt_start = None
                 texts = []
-    return vtt_file
+    return vtt_file.name
 
 
 def build_alignment(alignment, token_idx, timeframe_idx):
@@ -66,7 +69,7 @@ def build_alignment(alignment, token_idx, timeframe_idx):
         return start, end, text
 
 
-def get_media(mmif):
+def documents_to_htmls(mmif, viz_id):
     # Returns a list of tuples, one for each element in the documents list of
     # the MMIF object, following the order in that list. Each tuple has four
     # elements: document type, document identifier, document path and the HTML
@@ -80,16 +83,16 @@ def get_media(mmif):
         elif document.at_type == DocumentTypes.VideoDocument:
             fa_views = get_alignment_views(mmif)
             fa_view = fa_views[0] if fa_views else None
-            html = html_video(doc_path, fa_view)
+            html = html_video(viz_id, doc_path, fa_view)
         elif document.at_type == DocumentTypes.AudioDocument:
             html = html_audio(doc_path)
         elif document.at_type == DocumentTypes.ImageDocument:
             boxes = get_boxes(mmif)
             html = html_img(doc_path, boxes)
         media.append((document.at_type, document.id, doc_path, html))
-    manifest_filename = generate_iiif_manifest(mmif)
+    manifest_filename = generate_iiif_manifest(mmif, viz_id)
     man = os.path.basename(manifest_filename)
-    temp = render_template("uv_player.html", manifest=man, mmif_id=session["mmif_id"])
+    temp = render_template("uv_player.html", manifest=man, mmif_id=viz_id)
     media.append(('UV', "", "", temp))
     return media
 
@@ -116,7 +119,7 @@ def get_boxes(mmif):
     return boxes
 
 
-def prep_annotations(mmif):
+def prep_annotations(mmif, viz_id):
     """Prepare annotations from the views, and return a list of pairs of tabname
     and tab content. The first tab is alway the full MMIF pretty print."""
     tabs = [("Info", "<pre>" + create_info(mmif) + "</pre>"),
@@ -127,7 +130,7 @@ def prep_annotations(mmif):
     # stuff; it does a loop but for now we assume there is just one file with
     # alignments (generated by Kaldi)
     for fa_view in get_alignment_views(mmif):
-        vtt_file = view_to_vtt(fa_view)
+        vtt_file = asr_alignments_to_vtt(fa_view, viz_id)
         tabs.append(("WebVTT", '<pre>' + open(vtt_file).read() + '</pre>'))
     ner_views = get_ner_views(mmif)
     use_id = True if len(ner_views) > 1 else False
@@ -144,7 +147,7 @@ def prep_annotations(mmif):
         if not ocr_view.annotations:
             continue
         tabname = "Frames-%s" % ocr_view.id
-        visualization = render_template("pre-ocr.html", view_id=ocr_view.id, tabname=tabname, mmif_id=session["mmif_id"])
+        visualization = render_template("pre-ocr.html", view_id=ocr_view.id, tabname=tabname, mmif_id=viz_id)
         tabs.append((tabname, visualization))
     return tabs
 
@@ -217,19 +220,16 @@ def get_alignment_views(mmif):
 
-# Remder Media as HTML ------------
+# Render Media as HTML ------------
 
-def html_video(vpath, vtt_srcview=None):
+def html_video(viz_id, vpath, vtt_srcview=None):
     vpath = url2posix(vpath)
     html = StringIO()
-    html.write('<video id="vid" controls>\n')
+    html.write('<video id="vid" controls crossorigin="anonymous" >\n')
     html.write(f'    <source src=\"{vpath}\">\n')
     if vtt_srcview is not None:
-        vtt_path = view_to_vtt(vtt_srcview)
-        src = "/" + os.sep.join(vtt_path.split(os.sep)[-2:])
-        # use only basename because "static" directory is mapped to '' route by
-        # `static_url_path` param
-        # src = os.path.basename(vtt_path)
-
-        html.write(f'    <track kind="subtitles" srclang="en" src="{src}" label="English" default>\n')
+        vtt_path = asr_alignments_to_vtt(vtt_srcview, viz_id)
+        src = cache.get_cache_relpath(vtt_path)
+        app.logger.debug(f"VTT path: {vtt_path}")
+        html.write(f'    <track kind="captions" srclang="en" src="{src}" label="transcript" default/>\n')
     html.write("</video>\n")
     return html.getvalue()
 
@@ -300,13 +300,6 @@ def get_ner_views(mmif):
     return [v for v in mmif.views if Uri.NE in v.metadata.contains]
 
 
-def view_to_vtt(alignment_view):
-    """Write alignments to a file in VTT style and return the filename."""
-    vtt_file = get_alignments(alignment_view)
-    return vtt_file.name
-    # return os.sep.join(vtt_file.name.split(os.sep)[-3:])
-
-
 def create_ner_visualization(mmif, view):
     metadata = view.metadata.contains.get(Uri.NE)
     try: