diff --git a/README.md b/README.md index f2d9af5..c8984e9 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ In these notes we assume that the data are in a local directory named `/Users/Sh $ docker run --rm -d -p 5000:5000 -v /Users/Shared/archive:/data clams-mmif-visualizer ``` -After this, all you need to do is point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File", select a MMIF file and then click "Visualize". See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure you can use the example MMIF files in the `input` folder. +See the *Data source repository and input MMIF file* section below for a description of the MMIF file. Assuming you have not made any changes to the directory structure you can use the example MMIF files in the `input` folder. **Some background** @@ -89,8 +89,17 @@ To run the server do: $ python app.py ``` -Then point your browser at [http://0.0.0.0:5000/upload](http://0.0.0.0:5000/upload), click "Choose File" and then click "Visualize". +## Uploading Files +MMIF files can be uploaded to the visualization server one of two ways: +* Point your browser to http://0.0.0.0:5000/upload, click "Choose File" and then click "Visualize". This will generate a static URL containing the visualization of the input file (e.g. `http://localhost:5000/display/HaTxbhDfwakewakmzdXu5e`). Once the file is uploaded, the page will automatically redirect to the file's visualization. +* Using a command line, enter: + ``` + curl -X POST -F "file=@<filename>" -s http://localhost:5000/upload + ``` + This will upload the file and print the unique identifier for the file visualization. The visualization can be accessed at `http://localhost:5000/display/<id>` + +The server will maintain a cache of up to 500MB for these temporary files, so the visualizations can be repeatedly accessed without needing to re-upload any files. 
Once this limit is reached, the server will delete stored visualizations until enough space is reclaimed, drawing from oldest/least recently accessed pages first. If you attempt to access the /display URL of a deleted file, you will be redirected back to the upload page instead. ## Data source repository and input MMIF file diff --git a/app.py b/app.py index be37df7..5e53c65 100644 --- a/app.py +++ b/app.py @@ -1,15 +1,16 @@ +import hashlib import os -import pathlib -import sys import secrets -import json -import html +import sys +from threading import Thread -from flask import request, render_template, flash, redirect, send_from_directory, session -from werkzeug.utils import secure_filename +from flask import request, render_template, flash, send_from_directory, redirect from mmif.serialize import Mmif -from utils import app, render_ocr, get_media, prep_annotations, prepare_ocr_visualization +import cache +from cache import set_last_access, cleanup +from utils import app, render_ocr, documents_to_htmls, prep_annotations, prepare_ocr_visualization + @app.route('/') def index(): @@ -20,10 +21,10 @@ def index(): def ocr(): try: data = dict(request.json) - mmif_str = open(session["mmif_file"]).read() + mmif_str = open(cache.get_cache_path() / data["mmif_id"] / "file.mmif").read() mmif = Mmif(mmif_str) ocr_view = mmif.get_view_by_id(data["view_id"]) - return prepare_ocr_visualization(mmif, ocr_view) + return prepare_ocr_visualization(mmif, ocr_view, data["mmif_id"]) except Exception as e: return f'

{e}' @@ -32,10 +33,11 @@ def ocr(): def ocrpage(): data = request.json try: - return (render_ocr(data['vid_path'], data["view_id"], data["page_number"])) + return render_ocr(data["mmif_id"], data['vid_path'], data["view_id"], data["page_number"]) except Exception as e: return f'

Unexpected error of type {type(e)}: {e}' + @app.route('/upload', methods=['GET', 'POST']) def upload(): # NOTE. Uses of flash() originally gaven a RuntimeError (The session is @@ -44,7 +46,7 @@ def upload(): if request.method == 'POST': # Check if request is coming from elasticsearch if 'data' in request.form: - return render_mmif(request.form['data']) + return upload_file(request.form['data']) # Otherwise, check if the post request has the file part elif 'file' not in request.files: flash('WARNING: post request has no file part') @@ -56,34 +58,87 @@ def upload(): flash('WARNING: no file was selected') return redirect(request.url) if file: - filename = secure_filename(file.filename) - file.save(os.path.join('temp', filename)) - with open("temp/" + filename) as fh: - session["mmif_file"] = fh.name - mmif_str = fh.read() - return render_mmif(mmif_str) + return upload_file(file) + return render_template('upload.html') +@app.route('/decache', methods=['GET', 'POST']) +def invalidate_cache(): + app.logger.debug(f"Request to invalidate cache on {request.args}") + if not request.args.get('viz_id'): + cache.invalidate_cache() + return redirect("/upload") + viz_id = request.args.get('viz_id') + in_mmif = open(cache.get_cache_path() / viz_id / 'file.mmif', 'rb').read() + cache.invalidate_cache([viz_id]) + return upload_file(in_mmif) + + +@app.route('/display/') +def display(viz_id): + try: + path = cache.get_cache_path() / viz_id + set_last_access(path) + with open(os.path.join(path, "index.html")) as f: + html_file = f.read() + return html_file + except FileNotFoundError: + flash("File not found -- please upload again (it may have been deleted to clear up cache space).") + return redirect("/upload") + + @app.route('/uv/') def send_js(path): return send_from_directory("uv", path) -def render_mmif(mmif_str): +def render_mmif(mmif_str, viz_id): mmif = Mmif(mmif_str) - media = get_media(mmif) - annotations = prep_annotations(mmif) + media = documents_to_htmls(mmif, viz_id) + 
app.logger.debug(f"Prepared Media: {[m[0] for m in media]}") + annotations = prep_annotations(mmif, viz_id) + app.logger.debug(f"Prepared Annotations: {[annotation[0] for annotation in annotations]}") return render_template('player.html', - mmif=mmif, media=media, annotations=annotations) + media=media, viz_id=viz_id, annotations=annotations) + + +def upload_file(in_mmif): + # Save file locally + in_mmif_bytes = in_mmif if isinstance(in_mmif, bytes) else in_mmif.read() + in_mmif_str = in_mmif_bytes.decode('utf-8') + viz_id = hashlib.sha1(in_mmif_bytes).hexdigest() + app.logger.debug(f"Visualization ID: {viz_id}") + path = cache.get_cache_path() / viz_id + app.logger.debug(f"Visualization Directory: {path}") + try: + os.makedirs(path) + set_last_access(path) + with open(path / 'file.mmif', 'w') as in_mmif_file: + app.logger.debug(f"Writing original MMIF to {path / 'file.mmif'}") + in_mmif_file.write(in_mmif_str) + html_page = render_mmif(in_mmif_str, viz_id) + with open(os.path.join(path, "index.html"), "w") as f: + f.write(html_page) + except FileExistsError: + app.logger.debug("Visualization already cached") + finally: + # Perform cleanup + t = Thread(target=cleanup) + t.daemon = True + t.run() + + agent = request.headers.get('User-Agent') + if 'curl' in agent.lower(): + return f"Visualization ID is {viz_id}\nYou can access the visualized file at /display/{viz_id}\n" + return redirect(f"/display/{viz_id}", code=301) if __name__ == '__main__': # Make path for temp files - tmp_path = pathlib.Path(__file__).parent /'static'/'tmp' - if not os.path.exists(tmp_path): - os.makedirs(tmp_path) - + cache_path = cache.get_cache_path() + if not os.path.exists(cache_path): + os.makedirs(cache_path) # to avoid runtime errors for missing keys when using flash() alphabet = 'abcdefghijklmnopqrstuvwxyz1234567890' @@ -92,4 +147,5 @@ def render_mmif(mmif_str): port = 5000 if len(sys.argv) > 2 and sys.argv[1] == '-p': port = int(sys.argv[2]) - app.run(port=port, host='0.0.0.0', 
debug=True) + + app.run(port=port, host='0.0.0.0', debug=True, use_reloader=False) diff --git a/cache.py b/cache.py new file mode 100644 index 0000000..7c9660b --- /dev/null +++ b/cache.py @@ -0,0 +1,68 @@ +import os +import time +import shutil +import threading +import pathlib + +from utils import app + +lock = threading.Lock() + + +def get_cache_path(): + return pathlib.Path(app.static_folder) / "tmp" + + +def get_cache_relpath(full_path): + return str(full_path)[len(app.static_folder):] + + +def invalidate_cache(viz_ids): + if not viz_ids: + app.logger.debug("Invalidating entire cache.") + shutil.rmtree(get_cache_path()) + os.makedirs(get_cache_path()) + else: + for v in viz_ids: + app.logger.debug(f"Invalidating {v} from cache.") + shutil.rmtree(get_cache_path() / v) + + +def set_last_access(path): + with open(os.path.join(path, "last_access.txt"), "w") as f: + f.write(str(time.time())) + + +def scan_tmp_directory(): + oldest_accessed_dir = {"dir": None, "access_time": None} + total_size = sum(f.stat().st_size for f in get_cache_path().glob('**/*') if f.is_file()) + # this will be some visualization IDs + for p in get_cache_path().glob('*'): + if not (p / 'last_access.txt').exists(): + oldest_accessed_dir = {"dir": p, "access_time": 0} + elif oldest_accessed_dir["dir"] is None: + with open(p / 'last_access.txt') as f: + timestamp = f.read() + if timestamp == '': + continue + oldest_accessed_dir = {"dir": p, "access_time": float(timestamp)} + else: + with open(p / 'last_access.txt') as f: + if float(f.read()) < oldest_accessed_dir["access_time"]: + timestamp = f.read() + if timestamp == '': + continue + oldest_accessed_dir = {"dir": p, "access_time": float(timestamp)} + return total_size, oldest_accessed_dir["dir"] + + +def cleanup(): + with lock: + print("Checking visualization cache...") + # Max tmp size is 500MB + max_size = 500000000 + folder_size, oldest_dir = scan_tmp_directory() + while folder_size > max_size: + print(f"Maximum cache size reached. 
Deleting {os.path.basename(oldest_dir)}.") + shutil.rmtree(oldest_dir) + folder_size, oldest_dir = scan_tmp_directory() diff --git a/iiif_utils.py b/iiif_utils.py index 47a19b7..7be9eb8 100644 --- a/iiif_utils.py +++ b/iiif_utils.py @@ -1,16 +1,17 @@ +import datetime import json import os -import pathlib import tempfile from typing import Dict import mmif from flask import url_for from mmif import AnnotationTypes, DocumentTypes, Mmif -import datetime + +import cache -def generate_iiif_manifest(in_mmif: mmif.Mmif): +def generate_iiif_manifest(in_mmif: mmif.Mmif, viz_id): iiif_json = { "@context": "http://iiif.io/api/presentation/2/context.json", "id": "http://0.0.0.0:5000/mmif_example_manifest.json", @@ -28,7 +29,7 @@ def generate_iiif_manifest(in_mmif: mmif.Mmif): } add_canvas_from_documents(in_mmif, iiif_json) add_structure_from_timeframe(in_mmif, iiif_json) - return save_manifest(iiif_json) + return save_manifest(iiif_json, viz_id) def add_canvas_from_documents(in_mmif, iiif_json): @@ -105,9 +106,10 @@ def add_structure_from_timeframe(in_mmif: Mmif, iiif_json: Dict): iiif_json["structures"].append(view_range) -def save_manifest(iiif_json: Dict) -> str: +def save_manifest(iiif_json: Dict, viz_id) -> str: # generate a iiif manifest and save output file - manifest = tempfile.NamedTemporaryFile('w', dir=str(pathlib.Path(__file__).parent /'static'/'tmp'), suffix='.json', delete=False) + manifest = tempfile.NamedTemporaryFile( + 'w', dir=str(cache.get_cache_path() / viz_id), suffix='.json', delete=False) json.dump(iiif_json, manifest, indent=4) return manifest.name diff --git a/ocr.py b/ocr.py index 2b88360..41aa30e 100644 --- a/ocr.py +++ b/ocr.py @@ -5,11 +5,13 @@ import tempfile import json import re -import html +import os, shutil -from flask import render_template, session +from flask import render_template from mmif.utils.video_document_helper import convert_timepoint, convert_timeframe +import cache + class OCRFrame(): """Class representing an (aligned or 
otherwise) set of OCR annotations for a single frame""" @@ -59,7 +61,6 @@ def add_bounding_box(self, anno, mmif): if anno.properties.get("boxType") and anno.properties.get("boxType") not in self.boxtypes: self.boxtypes.append(anno.properties.get("boxType")) - def add_timeframe(self, anno, mmif): start, end = convert_timeframe(mmif, anno, "frames") start_secs, end_secs = convert_timeframe(mmif, anno, "seconds") @@ -86,7 +87,7 @@ def get_ocr_frames(view, mmif, fps): for alignment in view.get_annotations(full_alignment_type[0]): source = find_annotation(alignment.properties["source"], view, mmif) target = find_annotation(alignment.properties["target"], view, mmif) - + frame = OCRFrame(source, mmif) i = frame.frame_num if frame.frame_num is not None else frame.range if i in frames.keys(): @@ -124,14 +125,16 @@ def paginate(frames_list): return {i: page for (i, page) in enumerate(pages)} -def render_ocr(vid_path, view_id, page_number): + +def render_ocr(mmif_id, vid_path, view_id, page_number): """Iterate through frames and display the contents/alignments.""" # Path for storing temporary images generated by cv2 cv2_vid = cv2.VideoCapture(vid_path) - f = open(session[f"{view_id}-page-file"]) - frames_pages = json.load(f) - page = frames_pages[str(page_number)] + tn_data_fname = cache.get_cache_path() / mmif_id / f"{view_id}-pages.json" + thumbnail_pages = json.load(open(tn_data_fname)) + page = thumbnail_pages[str(page_number)] prev_frame_cap = None + path = make_image_directory(mmif_id) for frame_num, frame in page: # If index is range instead of frame... 
if frame.get("range"): @@ -142,22 +145,28 @@ def render_ocr(vid_path, view_id, page_number): raise FileNotFoundError(f"Video file {vid_path} not found!") # Double check histogram similarity of "repeat" frames -- if they're significantly different, un-mark as repeat - if prev_frame_cap is not None and frame["repeat"] and not is_duplicate_image(prev_frame_cap, frame_cap, cv2_vid): + if prev_frame_cap is not None and frame["repeat"] and not is_duplicate_image(prev_frame_cap, frame_cap, + cv2_vid): frame["repeat"] = False - - with tempfile.NamedTemporaryFile( - prefix=str(pathlib.Path(__file__).parent /'static'/'tmp'), suffix=".jpg", delete=False) as tf: + with tempfile.NamedTemporaryFile(dir=str(path), suffix=".jpg", delete=False) as tf: cv2.imwrite(tf.name, frame_cap) # "id" is just the name of the temp image file frame["id"] = pathlib.Path(tf.name).name prev_frame_cap = frame_cap - return render_template('ocr.html', - vid_path=vid_path, - view_id=view_id, - page=page, - n_pages=len(frames_pages), - page_number=str(page_number)) + tn_page_html = render_template( + 'ocr.html', vid_path=vid_path, view_id=view_id, page=page, + n_pages=len(thumbnail_pages), page_number=str(page_number), mmif_id=mmif_id) + return tn_page_html + + +def make_image_directory(mmif_id): + # Make path for temp OCR image files or clear image files if it exists + path = cache.get_cache_path() / mmif_id / "img" + if os.path.exists(path): + shutil.rmtree(path) + os.makedirs(path) + return path def find_duplicates(frames_list, cv2_vid): @@ -175,15 +184,15 @@ def find_duplicates(frames_list, cv2_vid): def is_duplicate_ocr_frame(prev_frame, frame): if not prev_frame: - return False + return False if prev_frame.get("boxtypes") != frame.get("boxtypes"): return False - if abs(len(prev_frame.get("boxes"))-len(frame.get("boxes"))) > 3: + if abs(len(prev_frame.get("boxes")) - len(frame.get("boxes"))) > 3: return False # Check Boundingbox distances rounded_prev = round_boxes(prev_frame.get("boxes")) for box 
in round_boxes(frame.get("boxes")): - if box in rounded_prev and frame["secs"]-prev_frame["secs"] < 10: + if box in rounded_prev and frame["secs"] - prev_frame["secs"] < 10: return True # Check overlap in text prev_text, text = set(prev_frame.get("text")), set(frame.get("text")) @@ -191,16 +200,16 @@ def is_duplicate_ocr_frame(prev_frame, frame): return True return False -def is_duplicate_image(prev_frame, frame, cv2_vid): +def is_duplicate_image(prev_frame, frame, cv2_vid): # Convert it to HSV img1_hsv = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2HSV) img2_hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV) # Calculate the histogram and normalize it - hist_img1 = cv2.calcHist([img1_hsv], [0,1], None, [180,256], [0,180,0,256]) + hist_img1 = cv2.calcHist([img1_hsv], [0, 1], None, [180, 256], [0, 180, 0, 256]) cv2.normalize(hist_img1, hist_img1, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX); - hist_img2 = cv2.calcHist([img2_hsv], [0,1], None, [180,256], [0,180,0,256]) + hist_img2 = cv2.calcHist([img2_hsv], [0, 1], None, [180, 256], [0, 180, 0, 256]) cv2.normalize(hist_img2, hist_img2, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX); # Find the metric value @@ -208,14 +217,13 @@ def is_duplicate_image(prev_frame, frame, cv2_vid): return metric_val < 50 - def round_boxes(boxes): # To account for jittery bounding boxes in OCR annotations rounded_boxes = [] for box in boxes: rounded_box = [] for coord in box[2]: - rounded_box.append(round(coord/100)*100) + rounded_box.append(round(coord / 100) * 100) rounded_boxes.append(rounded_box) return rounded_boxes @@ -227,7 +235,8 @@ def get_ocr_views(mmif): for view in mmif.views: for anno_type, anno in view.metadata.contains.items(): # Annotation belongs to a CV view if it is a TimeFrame/BB and it refers to a VideoDocument - if anno_type.shortname in required_types and mmif.get_document_by_id(anno["document"]).at_type.shortname == "VideoDocument": + if anno_type.shortname in required_types and mmif.get_document_by_id( + 
anno["document"]).at_type.shortname == "VideoDocument": views.append(view) continue # TODO: Couldn't find a simple way to show if an alignment view is a CV/Frames-type view @@ -236,8 +245,8 @@ def get_ocr_views(mmif): continue return views -def save_json(dict, view_id): - with tempfile.NamedTemporaryFile(prefix=str(pathlib.Path(__file__).parent /'static'/'tmp'), suffix=".json", delete=False) as tf: - pages_json = open(tf.name, "w") - json.dump(dict, pages_json) - session[f"{view_id}-page-file"] = tf.name + +def save_json(data, view_id, mmif_id): + path = cache.get_cache_path() / mmif_id / f"{view_id}-pages.json" + with open(path, 'w') as f: + json.dump(data, f) diff --git a/requirements.txt b/requirements.txt index 563435d..09b2fd3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ -spacy==2.3.2 +spacy==2.* mmif-python==1.0.8 -mmif-python[utils]==1.0.8 +lapps flask-session -opencv-python==4.* \ No newline at end of file +flask[async] +opencv-python==4.* +shortuuid==1.0.11 \ No newline at end of file diff --git a/templates/ocr.html b/templates/ocr.html index 35b624d..ff584eb 100644 --- a/templates/ocr.html +++ b/templates/ocr.html @@ -1,7 +1,7 @@

{% for frame_num, frame in page %} - {% set filename = frame["id"] %} + {% set filename = "/tmp/" + mmif_id + "/img/" + frame["id"] %} {% set id = frame["id"] %} {% set boxes = frame["boxes"] %} {% set secs = frame["secs"] %} @@ -127,12 +127,13 @@

var data = { "vid_path": "{{vid_path}}", "view_id": "{{view_id}}", - "page_number": parseInt("{{page_number}}") + "page_number": parseInt("{{page_number}}"), + "mmif_id": "{{mmif_id}}" } - if (page == BACKWARD) { + if (page === BACKWARD) { data["page_number"] -= 1 } - else if (page == FORWARD) { + else if (page === FORWARD) { data["page_number"] += 1 } else { diff --git a/templates/player.html b/templates/player.html index 1116688..5da61c0 100644 --- a/templates/player.html +++ b/templates/player.html @@ -23,6 +23,13 @@