Merge pull request #28 from clamsproject/url_access
sub-page/sub-URL generation
haydenmccormick authored Sep 13, 2023
2 parents c1b4c9a + 9a28864 commit 0b1ed87
Showing 11 changed files with 322 additions and 163 deletions.
13 changes: 11 additions & 2 deletions README.md
@@ -47,7 +47,7 @@ In these notes we assume that the data are in a local directory named `/Users/Sh
$ docker run --rm -d -p 5000:5000 -v /Users/Shared/archive:/data clams-mmif-visualizer
```

After this, all you need to do is point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File", select a MMIF file and then click "Visualize". See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure you can use the example MMIF files in the `input` folder.
See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure, you can use the example MMIF files in the `input` folder.

**Some background**

@@ -89,8 +89,17 @@ To run the server do:
$ python app.py
```

Then point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File" and then click "Visualize".

## Uploading Files
MMIF files can be uploaded to the visualization server in one of two ways:
* Point your browser to http://0.0.0.0:5000/upload, click "Choose File" and then click "Visualize". This will generate a static URL containing the visualization of the input file (e.g. `http://localhost:5000/display/HaTxbhDfwakewakmzdXu5e`). Once the file is uploaded, the page will automatically redirect to the file's visualization.
* From the command line, run:
```
curl -X POST -F "file=@<filename>" -s http://localhost:5000/upload
```
This will upload the file and print a unique identifier for its visualization. The visualization can then be accessed at `http://localhost:5000/display/<id>` (see the example below).
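
For example, here is a complete round trip from the command line: upload a file, note the identifier the server prints, and fetch the rendered page. This is a minimal sketch; `<filename>` and `<id>` are placeholders, and `visualization.html` is just an arbitrary output file name.
```
$ curl -X POST -F "file=@<filename>" -s http://localhost:5000/upload
Visualization ID is <id>
You can access the visualized file at /display/<id>

$ curl -s http://localhost:5000/display/<id> -o visualization.html
```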

The server maintains a cache of up to 50MB for these temporary files, so visualizations can be revisited without re-uploading anything. Once this limit is reached, the server deletes stored visualizations, starting with the oldest/least recently accessed pages, until enough space is reclaimed. If you try to access the `/display` URL of a deleted file, you will be redirected back to the upload page instead.


## Data source repository and input MMIF file
108 changes: 82 additions & 26 deletions app.py
@@ -1,15 +1,16 @@
import hashlib
import os
import pathlib
import sys
import secrets
import json
import html
import sys
from threading import Thread

from flask import request, render_template, flash, redirect, send_from_directory, session
from werkzeug.utils import secure_filename
from flask import request, render_template, flash, send_from_directory, redirect
from mmif.serialize import Mmif

from utils import app, render_ocr, get_media, prep_annotations, prepare_ocr_visualization
import cache
from cache import set_last_access, cleanup
from utils import app, render_ocr, documents_to_htmls, prep_annotations, prepare_ocr_visualization


@app.route('/')
def index():
@@ -20,10 +21,10 @@ def index():
def ocr():
try:
data = dict(request.json)
mmif_str = open(session["mmif_file"]).read()
mmif_str = open(cache.get_cache_path() / data["mmif_id"] / "file.mmif").read()
mmif = Mmif(mmif_str)
ocr_view = mmif.get_view_by_id(data["view_id"])
return prepare_ocr_visualization(mmif, ocr_view)
return prepare_ocr_visualization(mmif, ocr_view, data["mmif_id"])
except Exception as e:
return f'<p class="error">{e}</p>'

@@ -32,10 +33,11 @@ def ocr():
def ocrpage():
data = request.json
try:
return (render_ocr(data['vid_path'], data["view_id"], data["page_number"]))
return render_ocr(data["mmif_id"], data['vid_path'], data["view_id"], data["page_number"])
except Exception as e:
return f'<p class="error">Unexpected error of type {type(e)}: {e}</p>'


@app.route('/upload', methods=['GET', 'POST'])
def upload():
# NOTE. Uses of flash() originally gave a RuntimeError (The session is
@@ -44,7 +46,7 @@ def upload():
if request.method == 'POST':
# Check if request is coming from elasticsearch
if 'data' in request.form:
return render_mmif(request.form['data'])
return upload_file(request.form['data'])
# Otherwise, check if the post request has the file part
elif 'file' not in request.files:
flash('WARNING: post request has no file part')
@@ -56,34 +58,87 @@ def upload():
flash('WARNING: no file was selected')
return redirect(request.url)
if file:
filename = secure_filename(file.filename)
file.save(os.path.join('temp', filename))
with open("temp/" + filename) as fh:
session["mmif_file"] = fh.name
mmif_str = fh.read()
return render_mmif(mmif_str)
return upload_file(file)

return render_template('upload.html')


@app.route('/decache', methods=['GET', 'POST'])
def invalidate_cache():
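# With no viz_id argument, wipe the whole cache; otherwise rebuild the given visualization from its stored MMIF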
app.logger.debug(f"Request to invalidate cache on {request.args}")
if not request.args.get('viz_id'):
cache.invalidate_cache()
return redirect("/upload")
viz_id = request.args.get('viz_id')
in_mmif = open(cache.get_cache_path() / viz_id / 'file.mmif', 'rb').read()
cache.invalidate_cache([viz_id])
return upload_file(in_mmif)


@app.route('/display/<viz_id>')
def display(viz_id):
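# Serve the cached index.html for this visualization and refresh its last-access time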
try:
path = cache.get_cache_path() / viz_id
set_last_access(path)
with open(os.path.join(path, "index.html")) as f:
html_file = f.read()
return html_file
except FileNotFoundError:
flash("File not found -- please upload again (it may have been deleted to clear up cache space).")
return redirect("/upload")


@app.route('/uv/<path:path>')
def send_js(path):
return send_from_directory("uv", path)


def render_mmif(mmif_str):
def render_mmif(mmif_str, viz_id):
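# Render the player template from the MMIF's media documents and prepared annotation views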
mmif = Mmif(mmif_str)
media = get_media(mmif)
annotations = prep_annotations(mmif)
media = documents_to_htmls(mmif, viz_id)
app.logger.debug(f"Prepared Media: {[m[0] for m in media]}")
annotations = prep_annotations(mmif, viz_id)
app.logger.debug(f"Prepared Annotations: {[annotation[0] for annotation in annotations]}")
return render_template('player.html',
mmif=mmif, media=media, annotations=annotations)
media=media, viz_id=viz_id, annotations=annotations)


def upload_file(in_mmif):
# Save file locally
in_mmif_bytes = in_mmif.encode('utf-8') if isinstance(in_mmif, str) else (in_mmif if isinstance(in_mmif, bytes) else in_mmif.read())
in_mmif_str = in_mmif_bytes.decode('utf-8')
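# Hash the MMIF contents so identical uploads map to the same cached visualization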
viz_id = hashlib.sha1(in_mmif_bytes).hexdigest()
app.logger.debug(f"Visualization ID: {viz_id}")
path = cache.get_cache_path() / viz_id
app.logger.debug(f"Visualization Directory: {path}")
try:
os.makedirs(path)
set_last_access(path)
with open(path / 'file.mmif', 'w') as in_mmif_file:
app.logger.debug(f"Writing original MMIF to {path / 'file.mmif'}")
in_mmif_file.write(in_mmif_str)
html_page = render_mmif(in_mmif_str, viz_id)
with open(os.path.join(path, "index.html"), "w") as f:
f.write(html_page)
except FileExistsError:
app.logger.debug("Visualization already cached")
finally:
# Perform cleanup
t = Thread(target=cleanup)
t.daemon = True
t.start()

agent = request.headers.get('User-Agent')
if 'curl' in agent.lower():
return f"Visualization ID is {viz_id}\nYou can access the visualized file at /display/{viz_id}\n"
return redirect(f"/display/{viz_id}", code=301)


if __name__ == '__main__':
# Make path for temp files
tmp_path = pathlib.Path(__file__).parent /'static'/'tmp'
if not os.path.exists(tmp_path):
os.makedirs(tmp_path)

cache_path = cache.get_cache_path()
if not os.path.exists(cache_path):
os.makedirs(cache_path)

# to avoid runtime errors for missing keys when using flash()
alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890'
@@ -92,4 +147,5 @@ def render_mmif(mmif_str):
port = 5000
if len(sys.argv) > 2 and sys.argv[1] == '-p':
port = int(sys.argv[2])
app.run(port=port, host='0.0.0.0', debug=True)

app.run(port=port, host='0.0.0.0', debug=True, use_reloader=False)
68 changes: 68 additions & 0 deletions cache.py
@@ -0,0 +1,68 @@
import os
import time
import shutil
import threading
import pathlib

from utils import app

lock = threading.Lock()


def get_cache_path():
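# Root directory for cached visualizations, under the Flask static folder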
return pathlib.Path(app.static_folder) / "tmp"


def get_cache_relpath(full_path):
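# Path relative to the static folder, useful for building URLs to cached files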
return str(full_path)[len(app.static_folder):]


def invalidate_cache(viz_ids=None):
if not viz_ids:
app.logger.debug("Invalidating entire cache.")
shutil.rmtree(get_cache_path())
os.makedirs(get_cache_path())
else:
for v in viz_ids:
app.logger.debug(f"Invalidating {v} from cache.")
shutil.rmtree(get_cache_path() / v)


def set_last_access(path):
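# Record the access time that cleanup() uses to pick eviction candidates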
with open(os.path.join(path, "last_access.txt"), "w") as f:
f.write(str(time.time()))


def scan_tmp_directory():
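# Return the total cache size in bytes and the least recently accessed visualization directory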
oldest_accessed_dir = {"dir": None, "access_time": None}
total_size = sum(f.stat().st_size for f in get_cache_path().glob('**/*') if f.is_file())
# this will be some visualization IDs
for p in get_cache_path().glob('*'):
if not (p / 'last_access.txt').exists():
oldest_accessed_dir = {"dir": p, "access_time": 0}
elif oldest_accessed_dir["dir"] is None:
with open(p / 'last_access.txt') as f:
timestamp = f.read()
if timestamp == '':
continue
oldest_accessed_dir = {"dir": p, "access_time": float(timestamp)}
else:
with open(p / 'last_access.txt') as f:
timestamp = f.read()
if timestamp == '':
continue
if float(timestamp) < oldest_accessed_dir["access_time"]:
oldest_accessed_dir = {"dir": p, "access_time": float(timestamp)}
return total_size, oldest_accessed_dir["dir"]


def cleanup():
with lock:
print("Checking visualization cache...")
# Max tmp size is 500MB
max_size = 500000000
folder_size, oldest_dir = scan_tmp_directory()
while folder_size > max_size:
print(f"Maximum cache size reached. Deleting {os.path.basename(oldest_dir)}.")
shutil.rmtree(oldest_dir)
folder_size, oldest_dir = scan_tmp_directory()
14 changes: 8 additions & 6 deletions iiif_utils.py
@@ -1,16 +1,17 @@
import datetime
import json
import os
import pathlib
import tempfile
from typing import Dict

import mmif
from flask import url_for
from mmif import AnnotationTypes, DocumentTypes, Mmif
import datetime

import cache


def generate_iiif_manifest(in_mmif: mmif.Mmif):
def generate_iiif_manifest(in_mmif: mmif.Mmif, viz_id):
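# Build a IIIF Presentation manifest from the documents and time frames in this MMIF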
iiif_json = {
"@context": "http://iiif.io/api/presentation/2/context.json",
"id": "http://0.0.0.0:5000/mmif_example_manifest.json",
@@ -28,7 +29,7 @@ def generate_iiif_manifest(in_mmif: mmif.Mmif):
}
add_canvas_from_documents(in_mmif, iiif_json)
add_structure_from_timeframe(in_mmif, iiif_json)
return save_manifest(iiif_json)
return save_manifest(iiif_json, viz_id)


def add_canvas_from_documents(in_mmif, iiif_json):
@@ -105,9 +106,10 @@ def add_structure_from_timeframe(in_mmif: Mmif, iiif_json: Dict):
iiif_json["structures"].append(view_range)


def save_manifest(iiif_json: Dict) -> str:
def save_manifest(iiif_json: Dict, viz_id) -> str:
# generate a iiif manifest and save output file
manifest = tempfile.NamedTemporaryFile('w', dir=str(pathlib.Path(__file__).parent /'static'/'tmp'), suffix='.json', delete=False)
manifest = tempfile.NamedTemporaryFile(
'w', dir=str(cache.get_cache_path() / viz_id), suffix='.json', delete=False)
json.dump(iiif_json, manifest, indent=4)
return manifest.name
