-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathextract_hypothesis_annotations_from_url.py
110 lines (97 loc) · 4.87 KB
/
extract_hypothesis_annotations_from_url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
# coding: utf-8
# Usage: python extract_hypothesis_annotations_from_url.py --help
# Description: An extensible Python script for extracting Hypothes.is (https://web.hypothes.is/) annotations from a web page located at a given URL
# License: GNU Affero General Public License, Version 3 (https://www.gnu.org/licenses/agpl-3.0.en.html)
import argparse
import sys
import re
import requests
import json
import urllib.parse
from requests.auth import HTTPBasicAuth
from fingerprint import fingerprint
# Commandline interface description
# Build the command-line interface and parse the arguments into `args`.
parser = argparse.ArgumentParser()
# --url is mandatory: every later step reads args.url. Declaring it required
# makes argparse print a usage error instead of the script crashing with a
# TypeError further down when the option is omitted.
parser.add_argument(
    "-url", "--url", required=True,
    help="The URL of the page or path to PDF file you wish to extract "
         "Hypothes.is annotations from")
parser.add_argument(
    "-a", "--apikey",
    help="Your Hypothes.is API key which you can find here: "
         "https://hypothes.is/account/developer")
parser.add_argument(
    "-user", "--username",
    help="If you only wish to extract annotations by a particular user, "
         "specify their Hypothes.is username here")
parser.add_argument(
    "-g", "--groupid",
    help="If you wish to only extract annotations made within a particular "
         "Hypothes.is annotation group, specify the Hypothes.is annotation "
         "group ID here. A Hypothes.is annotation group ID looks like a random "
         "sequence of upper and lower case characters and numbers. It can be "
         "found under the Groups menu after logging into your Hypothes.is "
         "account on the web interface.")
args = parser.parse_args()
# Check if input string is a valid HTTP(S) or FTP(S) URL
def is_valid_url(url):
    """Return True if *url* looks like an absolute HTTP(S) or FTP(S) URL.

    The host part may be a domain name, ``localhost`` or a dotted IPv4
    address, optionally followed by a port and a path/query. Matching is
    case-insensitive.

    Args:
        url: The candidate string. Any non-string value (for example ``None``
            when the command-line option was omitted) is treated as invalid.

    Returns:
        bool: True when the string matches the URL pattern, False otherwise.
    """
    # Guard against None / non-string input, which would otherwise make
    # re.match raise TypeError instead of reporting "invalid".
    if not isinstance(url, str):
        return False
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return regex.match(url) is not None
# Send API search request and return the result
def search_query(api_key, query_url):
    """Send a GET request to the search API and return the decoded JSON body.

    Args:
        api_key: API token sent as a ``Bearer`` Authorization header. When it
            is empty or ``None`` the header is omitted and the request is sent
            unauthenticated.
            NOTE(review): unauthenticated requests are expected to see only
            public annotations - confirm against the Hypothes.is API docs.
        query_url: The complete request URL, including the encoded query
            string.

    Returns:
        The parsed JSON response.

    Raises:
        Exception: If the server answers with a non-success status code.
    """
    headers = {
        "Content-Type": "application/json;charset=utf-8",
    }
    # Only attach credentials when a key was supplied; concatenating None
    # would raise TypeError before the request is even sent.
    if api_key:
        headers["Authorization"] = "Bearer " + api_key
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(query_url, headers=headers, timeout=30)
    # Check the status before decoding: error responses are not guaranteed to
    # carry a JSON body, and decoding them first would mask the real failure.
    if not r.ok:
        raise Exception(
            "Error executing API search query (HTTP status "
            + str(r.status_code) + ")")
    return r.json()
# Check if input URL is valid
# Reject a missing --url up front; slicing None below would raise TypeError.
if not args.url:
    sys.exit("No input URL or PDF path given (see --help)")

# Inputs ending in ".pdf" are handed to fingerprint(); everything else must
# be a well-formed URL.
# NOTE(review): a remote URL that ends in ".pdf" also takes the fingerprint
# path - confirm fingerprint() accepts such input.
is_pdf = args.url[-4:].lower() == '.pdf'
if not is_pdf and not is_valid_url(args.url):
    sys.exit("Input URL is invalid")

# 1. Formulate API query components
api_query = {}
if is_pdf:
    # a) PDF file: get unique PDF fingerprint
    api_query['uri'] = "urn:x-pdf:" + str(fingerprint(args.url))
    # The value already carries the "urn:x-pdf:" prefix, so print it as is.
    print(api_query['uri'])
else:
    # a) Input URL to extract annotations from
    api_query['uri'] = args.url
# b) API base URL
hypothesis_api_base_url = 'https://hypothes.is/api/'
hypothesis_search_api_endpoint = hypothesis_api_base_url + 'search?'
# c) OPTIONAL: only extract annotations from this Hypothes.is user. I.e., the Hypothes.is username of the annotator.
if args.username:
    api_query['user'] = args.username
# d) OPTIONAL: only extract annotations from this Hypothes.is annotation group. I.e., the Hypothes.is group ID from which to extract annotations.
if args.groupid:
    api_query['group'] = args.groupid
# 2. URL encoding of the API query
api_request_url = urllib.parse.urlencode(api_query)
# 3. Attach the API query URL to the base URL of the Hypothes.is search API
final_api_request_url = hypothesis_search_api_endpoint + api_request_url
# 4. Get the API key
api_key = args.apikey
# 5. Execute the API search query
# NOTE(review): no paging parameters are sent, so only the rows of the first
# response are printed - confirm the API's default result limit.
api_request_results = search_query(api_key, final_api_request_url)
# 6. Print the resulting annotations. In this example, we only print the:
# a) date of the annotation,
# b) name of the annotator,
# c) highlighted text,
# d) annotation tags (if any) for this text
count = 1
print()
for item in api_request_results['rows']:
    # Skip rows without any target; only the first target is inspected.
    targets = item.get('target') or []
    if not targets:
        continue
    for selector in targets[0].get('selector') or []:
        # Only text-quote selectors carry the highlighted passage.
        if selector.get('type') != 'TextQuoteSelector':
            continue
        print("Annotation #" + str(count) + ":")
        print("-----------------")
        print("date: " + item['created'])
        print("annotator: " + item['user'])
        print("highlighted text: " + selector.get('exact', ''))
        # Same "| tag | tag | " layout as before, built in one pass; a row
        # without a 'tags' key prints an empty tag list instead of crashing.
        tags = item.get('tags') or []
        print("annotation tags: | " + "".join(tag + " | " for tag in tags))
        count += 1
        print()