Merge pull request #28 from clamsproject/url_access
sub-page/sub-URL generation
haydenmccormick authored Sep 13, 2023
2 parents c1b4c9a + 9a28864 commit 0b1ed87
Showing 11 changed files with 322 additions and 163 deletions.
13 changes: 11 additions & 2 deletions README.md
@@ -47,7 +47,7 @@ In these notes we assume that the data are in a local directory named `/Users/Sh
$ docker run --rm -d -p 5000:5000 -v /Users/Shared/archive:/data clams-mmif-visualizer
```

After this, all you need to do is point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File", select a MMIF file and then click "Visualize". See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure you can use the example MMIF files in the `input` folder.
See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure, you can use the example MMIF files in the `input` folder.

**Some background**

@@ -89,8 +89,17 @@ To run the server do:
$ python app.py
```

Then point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File" and then click "Visualize".

## Uploading Files
MMIF files can be uploaded to the visualization server in one of two ways:
* Point your browser to http://0.0.0.0:5000/upload, click "Choose File" and then click "Visualize". This will generate a static URL containing the visualization of the input file (e.g. `http://localhost:5000/display/HaTxbhDfwakewakmzdXu5e`). Once the file is uploaded, the page will automatically redirect to the file's visualization.
* From the command line, run:
```
curl -X POST -F "file=@<filename>" -s http://localhost:5000/upload
```
This will upload the file and print a unique identifier for its visualization. The visualization can then be accessed at `http://localhost:5000/display/<id>` (see the example below).
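
For example, here is a complete round trip from the command line: upload a file, note the identifier the server prints, and fetch the rendered page. This is a minimal sketch; `<filename>` and `<id>` are placeholders, and `visualization.html` is just an arbitrary output file name.
```
$ curl -X POST -F "file=@<filename>" -s http://localhost:5000/upload
Visualization ID is <id>
You can access the visualized file at /display/<id>

$ curl -s http://localhost:5000/display/<id> -o visualization.html
```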

The server maintains a cache of up to 50MB for these temporary files, so visualizations can be revisited without re-uploading anything. Once this limit is reached, the server deletes stored visualizations, starting with the oldest/least recently accessed pages, until enough space is reclaimed. If you try to access the `/display` URL of a deleted file, you will be redirected back to the upload page instead.


## Data source repository and input MMIF file
108 changes: 82 additions & 26 deletions app.py
@@ -1,15 +1,16 @@
import hashlib
import os
import pathlib
import sys
import secrets
import json
import html
import sys
from threading import Thread

from flask import request, render_template, flash, redirect, send_from_directory, session
from werkzeug.utils import secure_filename
from flask import request, render_template, flash, send_from_directory, redirect
from mmif.serialize import Mmif

from utils import app, render_ocr, get_media, prep_annotations, prepare_ocr_visualization
import cache
from cache import set_last_access, cleanup
from utils import app, render_ocr, documents_to_htmls, prep_annotations, prepare_ocr_visualization


@app.route('/')
def index():
@@ -20,10 +21,10 @@ def index():
def ocr():
try:
data = dict(request.json)
mmif_str = open(session["mmif_file"]).read()
mmif_str = open(cache.get_cache_path() / data["mmif_id"] / "file.mmif").read()
mmif = Mmif(mmif_str)
ocr_view = mmif.get_view_by_id(data["view_id"])
return prepare_ocr_visualization(mmif, ocr_view)
return prepare_ocr_visualization(mmif, ocr_view, data["mmif_id"])
except Exception as e:
return f'<p class="error">{e}</p>'

@@ -32,10 +33,11 @@ def ocr():
def ocrpage():
data = request.json
try:
return (render_ocr(data['vid_path'], data["view_id"], data["page_number"]))
return render_ocr(data["mmif_id"], data['vid_path'], data["view_id"], data["page_number"])
except Exception as e:
return f'<p class="error">Unexpected error of type {type(e)}: {e}</p>'


@app.route('/upload', methods=['GET', 'POST'])
def upload():
# NOTE. Uses of flash() originally gave a RuntimeError (The session is
@@ -44,7 +46,7 @@ def upload():
if request.method == 'POST':
# Check if request is coming from elasticsearch
if 'data' in request.form:
return render_mmif(request.form['data'])
return upload_file(request.form['data'])
# Otherwise, check if the post request has the file part
elif 'file' not in request.files:
flash('WARNING: post request has no file part')
@@ -56,34 +58,87 @@ def upload():
flash('WARNING: no file was selected')
return redirect(request.url)
if file:
filename = secure_filename(file.filename)
file.save(os.path.join('temp', filename))
with open("temp/" + filename) as fh:
session["mmif_file"] = fh.name
mmif_str = fh.read()
return render_mmif(mmif_str)
return upload_file(file)

return render_template('upload.html')


@app.route('/decache', methods=['GET', 'POST'])
def invalidate_cache():
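# With no viz_id argument, wipe the whole cache; otherwise rebuild the given visualization from its stored MMIF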
app.logger.debug(f"Request to invalidate cache on {request.args}")
if not request.args.get('viz_id'):
cache.invalidate_cache()
return redirect("/upload")
viz_id = request.args.get('viz_id')
in_mmif = open(cache.get_cache_path() / viz_id / 'file.mmif', 'rb').read()
cache.invalidate_cache([viz_id])
return upload_file(in_mmif)


@app.route('/display/<viz_id>')
def display(viz_id):
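# Serve the cached index.html for this visualization and refresh its last-access time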
try:
path = cache.get_cache_path() / viz_id
set_last_access(path)
with open(os.path.join(path, "index.html")) as f:
html_file = f.read()
return html_file
except FileNotFoundError:
flash("File not found -- please upload again (it may have been deleted to clear up cache space).")
return redirect("/upload")


@app.route('/uv/<path:path>')
def send_js(path):
return send_from_directory("uv", path)


def render_mmif(mmif_str):
def render_mmif(mmif_str, viz_id):
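# Render the player template from the MMIF's media documents and prepared annotation views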
mmif = Mmif(mmif_str)
media = get_media(mmif)
annotations = prep_annotations(mmif)
media = documents_to_htmls(mmif, viz_id)
app.logger.debug(f"Prepared Media: {[m[0] for m in media]}")
annotations = prep_annotations(mmif, viz_id)
app.logger.debug(f"Prepared Annotations: {[annotation[0] for annotation in annotations]}")
return render_template('player.html',
mmif=mmif, media=media, annotations=annotations)
media=media, viz_id=viz_id, annotations=annotations)


def upload_file(in_mmif):
# Save file locally
in_mmif_bytes = in_mmif.encode('utf-8') if isinstance(in_mmif, str) else (in_mmif if isinstance(in_mmif, bytes) else in_mmif.read())
in_mmif_str = in_mmif_bytes.decode('utf-8')
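# Hash the MMIF contents so identical uploads map to the same cached visualization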
viz_id = hashlib.sha1(in_mmif_bytes).hexdigest()
app.logger.debug(f"Visualization ID: {viz_id}")
path = cache.get_cache_path() / viz_id
app.logger.debug(f"Visualization Directory: {path}")
try:
os.makedirs(path)
set_last_access(path)
with open(path / 'file.mmif', 'w') as in_mmif_file:
app.logger.debug(f"Writing original MMIF to {path / 'file.mmif'}")
in_mmif_file.write(in_mmif_str)
html_page = render_mmif(in_mmif_str, viz_id)
with open(os.path.join(path, "index.html"), "w") as f:
f.write(html_page)
except FileExistsError:
app.logger.debug("Visualization already cached")
finally:
# Perform cleanup
t = Thread(target=cleanup)
t.daemon = True
t.start()

agent = request.headers.get('User-Agent')
if 'curl' in agent.lower():
return f"Visualization ID is {viz_id}\nYou can access the visualized file at /display/{viz_id}\n"
return redirect(f"/display/{viz_id}", code=301)


if __name__ == '__main__':
# Make path for temp files
tmp_path = pathlib.Path(__file__).parent /'static'/'tmp'
if not os.path.exists(tmp_path):
os.makedirs(tmp_path)

cache_path = cache.get_cache_path()
if not os.path.exists(cache_path):
os.makedirs(cache_path)

# to avoid runtime errors for missing keys when using flash()
alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890'
@@ -92,4 +147,5 @@ def render_mmif(mmif_str):
port = 5000
if len(sys.argv) > 2 and sys.argv[1] == '-p':
port = int(sys.argv[2])
app.run(port=port, host='0.0.0.0', debug=True)

app.run(port=port, host='0.0.0.0', debug=True, use_reloader=False)
68 changes: 68 additions & 0 deletions cache.py
@@ -0,0 +1,68 @@
import os
import time
import shutil
import threading
import pathlib

from utils import app

lock = threading.Lock()


def get_cache_path():
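# Root directory for cached visualizations, under the Flask static folder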
return pathlib.Path(app.static_folder) / "tmp"


def get_cache_relpath(full_path):
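# Path relative to the static folder, useful for building URLs to cached files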
return str(full_path)[len(app.static_folder):]


def invalidate_cache(viz_ids=None):
if not viz_ids:
app.logger.debug("Invalidating entire cache.")
shutil.rmtree(get_cache_path())
os.makedirs(get_cache_path())
else:
for v in viz_ids:
app.logger.debug(f"Invalidating {v} from cache.")
shutil.rmtree(get_cache_path() / v)


def set_last_access(path):
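# Record the access time that cleanup() uses to pick eviction candidates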
with open(os.path.join(path, "last_access.txt"), "w") as f:
f.write(str(time.time()))


def scan_tmp_directory():
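# Return the total cache size in bytes and the least recently accessed visualization directory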
oldest_accessed_dir = {"dir": None, "access_time": None}
total_size = sum(f.stat().st_size for f in get_cache_path().glob('**/*') if f.is_file())
# this will be some visualization IDs
for p in get_cache_path().glob('*'):
if not (p / 'last_access.txt').exists():
oldest_accessed_dir = {"dir": p, "access_time": 0}
elif oldest_accessed_dir["dir"] is None:
with open(p / 'last_access.txt') as f:
timestamp = f.read()
if timestamp == '':
continue
oldest_accessed_dir = {"dir": p, "access_time": float(timestamp)}
else:
with open(p / 'last_access.txt') as f:
timestamp = f.read()
if timestamp == '':
continue
if float(timestamp) < oldest_accessed_dir["access_time"]:
oldest_accessed_dir = {"dir": p, "access_time": float(timestamp)}
return total_size, oldest_accessed_dir["dir"]


def cleanup():
with lock:
print("Checking visualization cache...")
# Max tmp size is 500MB
max_size = 500000000
folder_size, oldest_dir = scan_tmp_directory()
while folder_size > max_size:
print(f"Maximum cache size reached. Deleting {os.path.basename(oldest_dir)}.")
shutil.rmtree(oldest_dir)
folder_size, oldest_dir = scan_tmp_directory()
14 changes: 8 additions & 6 deletions iiif_utils.py
@@ -1,16 +1,17 @@
import datetime
import json
import os
import pathlib
import tempfile
from typing import Dict

import mmif
from flask import url_for
from mmif import AnnotationTypes, DocumentTypes, Mmif
import datetime

import cache


def generate_iiif_manifest(in_mmif: mmif.Mmif):
def generate_iiif_manifest(in_mmif: mmif.Mmif, viz_id):
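# Build a IIIF Presentation manifest from the documents and time frames in this MMIF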
iiif_json = {
"@context": "http://iiif.io/api/presentation/2/context.json",
"id": "http://0.0.0.0:5000/mmif_example_manifest.json",
@@ -28,7 +29,7 @@ def generate_iiif_manifest(in_mmif: mmif.Mmif):
}
add_canvas_from_documents(in_mmif, iiif_json)
add_structure_from_timeframe(in_mmif, iiif_json)
return save_manifest(iiif_json)
return save_manifest(iiif_json, viz_id)


def add_canvas_from_documents(in_mmif, iiif_json):
@@ -105,9 +106,10 @@ def add_structure_from_timeframe(in_mmif: Mmif, iiif_json: Dict):
iiif_json["structures"].append(view_range)


def save_manifest(iiif_json: Dict) -> str:
def save_manifest(iiif_json: Dict, viz_id) -> str:
# generate a iiif manifest and save output file
manifest = tempfile.NamedTemporaryFile('w', dir=str(pathlib.Path(__file__).parent /'static'/'tmp'), suffix='.json', delete=False)
manifest = tempfile.NamedTemporaryFile(
'w', dir=str(cache.get_cache_path() / viz_id), suffix='.json', delete=False)
json.dump(iiif_json, manifest, indent=4)
return manifest.name
