-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
210 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
config.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
language: python | ||
python: | ||
- "2.7" | ||
install: | ||
- pip install -r ./config/requirements.txt | ||
script: | ||
- flake8 ./src/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
FROM python:2 | ||
|
||
RUN apt-get update && apt-get upgrade -y && \ | ||
pip install --upgrade pip | ||
|
||
WORKDIR /usr/src/app | ||
|
||
COPY ./config/requirements.txt ./ | ||
RUN pip install --no-cache-dir -r requirements.txt && \ | ||
rm requirements.txt | ||
|
||
COPY ./src/* ./ | ||
|
||
RUN flake8 ./* | ||
|
||
CMD ["python", "run"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2018, Open Book Publishers. | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# OAI URI import | ||
|
||
Query an OAI repository and store found URIs in the identifier translation service | ||
|
||
## Run via crontab | ||
``` | ||
0 0 * * 1 docker run --rm --name "oai_harvester" --env-file /path/to/config.env openbookpublishers/oai_uri_import | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
USER_AGENT="HIRMEOSCrossrefURIImporter/1.0 (mailto:[email protected])" | ||
FILENAME=OAIRepository.xml | ||
CSV_LOC_URL=https://www.openbookpublishers.com/doi_prod_id_mapping.csv | ||
CSV_TITLE_COL=0 | ||
CSV_DOI_COL=1 | ||
CSV_URI_START_COL=1 | ||
CSV_CANONICAL_END=4 | ||
DEFAULT_TYPE=monograph | ||
URI_API_ENDP=https://identifier.translation.service/translate | ||
URI_API_WORKS=https://identifier.translation.service/works | ||
URI_API_URIS=https://identifier.translation.service/uris | ||
AUTH_API_ENDP=https://authentication.service/auth | ||
[email protected] | ||
URI_API_PASS=some_secret_password |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
flake8==3.6.0 | ||
httplib2==0.10.3 | ||
urllib3==1.20 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
import re | ||
import os | ||
import sys | ||
import time | ||
import codecs | ||
import httplib2 | ||
import xml.dom.pulldom | ||
from optparse import OptionParser | ||
|
||
|
||
# Value sent by get_oai() as the User-Agent request header. It is read from
# the environment at import time, so a missing USER_AGENT variable raises
# KeyError before any option parsing happens.
USER_AGENT = os.environ['USER_AGENT']
# Module-level counter of the current request attempt; get_oai() increments
# it on each retry and resets it to 1 after a successful response.
ATTEMPTS = 1
# Upper bound that get_oai() compares ATTEMPTS against before retrying.
MAX_ATTEMPTS = 3
# Command-line option specifications consumed by get_options(): 'val' is the
# option string, the remaining keys are passed through to add_option().
ARGS = [
    dict(val='--repository', dest='repository', action='store',
         help='Root URL of the OAI repository'),
    dict(val='--metadata-prefix', dest='prefix', action='store',
         help='metadataPrefix used by the repository'),
    dict(val='--set', dest='set', action='store',
         help='A particular set to query (optional)'),
]
|
||
|
||
def get_options(args):
    """Parse the process command line according to *args*.

    *args* is a list of dicts with the keys 'val' (the option string),
    'dest', 'action' and 'help'; each one is registered as an optparse
    option. The command line itself is taken from sys.argv.

    Returns the optparse values object. If positional arguments are
    present, or if --repository / --metadata-prefix are missing or empty,
    the parser prints a usage error and exits with status 2.
    """
    parser = OptionParser()
    for arg in args:
        parser.add_option(arg['val'], dest=arg['dest'],
                          action=arg['action'], help=arg['help'])
    options, rest = parser.parse_args()

    # Validate explicitly rather than with assert: assertions are stripped
    # under "python -O" and produce a bare traceback instead of usage help.
    if rest:
        parser.error('unexpected arguments: %s' % ' '.join(rest))
    if not (options.repository and options.prefix):
        parser.error('--repository and --metadata-prefix are required')
    return options
|
||
|
||
def build_url(repository, prefix, oai_set=''):
    """Return the initial ListRecords request URL for *repository*.

    The metadataPrefix argument is always included; the set argument is
    appended only when *oai_set* is truthy. Values are inserted as given,
    without URL-encoding.
    """
    pieces = ['%s?verb=ListRecords&metadataPrefix=%s' % (repository, prefix)]
    if oai_set:
        pieces.append('&set=%s' % (oai_set))
    return ''.join(pieces)
|
||
|
||
def build_resumption_url(repository, token):
    """Return the ListRecords URL that continues a listing with *token*.

    Both values are inserted as given, without URL-encoding.
    """
    template = '%s?verb=ListRecords&resumptionToken=%s'
    return template % (repository, token)
|
||
|
||
def get_oai(url):
    """Fetch *url* with an HTTP GET and return the response body.

    A 200 response returns the body immediately. A 429, 503 or 504
    response is retried until MAX_ATTEMPTS requests have been made in
    total; before each retry the function sleeps for the number of seconds
    given in the Retry-After header, when that header is present and is an
    integer. Any other status, or running out of attempts, writes an error
    to stderr and terminates the process with exit status 1.

    The module-level ATTEMPTS counter tracks the current attempt number
    and is reset to 1 after a successful response.
    """
    global ATTEMPTS
    headers = {'Accept': 'text/html', 'Accept-Encoding': 'compress, deflate',
               'User-Agent': USER_AGENT}
    while True:
        res, content = httplib2.Http().request(url, 'GET', headers=headers)
        if res.status == 200:
            ATTEMPTS = 1
            return content
        # Give up on non-retryable statuses, or once the request budget is
        # spent (checked before sleeping so we never wait for nothing).
        if res.status not in (429, 503, 504) or ATTEMPTS >= MAX_ATTEMPTS:
            break
        try:
            time.sleep(int(res['retry-after']))
        except (AttributeError, ValueError, IOError, KeyError):
            # Header absent or not a usable number of seconds (it may be
            # an HTTP date): retry straight away.
            pass
        ATTEMPTS += 1
    # Argument order matches the placeholders: status, attempt count, reason.
    sys.stderr.write("Error (%s) after %d attempts: %s\n"
                     % (res.status, ATTEMPTS, res.reason))
    sys.exit(1)
|
||
|
||
def run(respository, metadata_prefix, oai_set):
    """Harvest every record of a repository and print them to stdout.

    Requests the first ListRecords page, then keeps following the
    resumptionToken found in each response until a response carries no
    token or an empty one. Every <record> element encountered is written
    to 'tmp.xml' in the current directory, wrapped in a single
    <repository> root element; that file is then copied to stdout.

    respository     -- root URL of the repository (parameter spelling kept
                       as is so existing callers are unaffected)
    metadata_prefix -- metadataPrefix for the initial request
    oai_set         -- set to restrict the initial request to; falsy for
                       none
    """
    tempname = 'tmp.xml'
    regex = '<resumptionToken[^>]*>(.*)</resumptionToken>'

    # The context manager closes the temp file even if a request or the
    # XML parsing fails part-way through.
    with codecs.open(tempname, 'w+b', encoding='utf-8') as temp:
        temp.write('<repository>\n')
        data = get_oai(build_url(respository, metadata_prefix, oai_set))

        while True:
            events = xml.dom.pulldom.parseString(data)
            for (event, node) in events:
                if event == "START_ELEMENT" and node.tagName == 'record':
                    events.expandNode(node)
                    node.writexml(temp)
            # No token element, or an empty one, ends the listing.
            found = re.search(regex, data)
            token = found.group(1) if found else ''
            if not token:
                break
            data = get_oai(build_resumption_url(respository, token))
        temp.write('\n</repository>\n')

    with open(tempname, 'r') as temp_stream:
        for line in temp_stream:
            # Each line already ends with its newline; write it verbatim so
            # multi-line text inside records is not padded with blank lines.
            sys.stdout.write(line)
|
||
|
||
# Script entry point: parse the command line, then harvest the repository.
if __name__ == '__main__':
    opts = get_options(ARGS)
    run(opts.repository, opts.prefix, opts.set)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
import os | ||
import subprocess | ||
|
||
# Harvest settings, read from the environment at import time; a missing
# variable raises KeyError. run() passes the three OAI_* values to
# ./retrieve_oai_records as --repository, --metadata-prefix and --set.
OAI_REPOSITORY = os.environ['OAI_REPOSITORY']
OAI_METADATA_PREFIX = os.environ['OAI_METADATA_PREFIX']
OAI_SET = os.environ['OAI_SET']
# Path under /tmp that receives the standard output of the harvester.
FILENAME = '/tmp/' + os.environ['FILENAME']
|
||
|
||
def outstream(filename):
    """Open *filename* for writing in text mode and return the file object.

    An existing file is truncated; the caller is responsible for closing
    the returned object.
    """
    mode = "w"
    return open(filename, mode)
|
||
|
||
def instream(filename):
    """Open *filename* for reading in text mode and return the file object.

    The caller is responsible for closing the returned object.
    """
    mode = "r"
    return open(filename, mode)
|
||
|
||
def run():
    """Run ./retrieve_oai_records and capture its stdout in FILENAME.

    The harvester is started with the repository, metadata prefix and set
    taken from the module-level OAI_* settings. The output file is closed
    once the child process has finished.

    Returns the exit status of the child process.
    """
    cmd = ['./retrieve_oai_records',
           '--repository', OAI_REPOSITORY,
           '--metadata-prefix', OAI_METADATA_PREFIX,
           '--set', OAI_SET]
    with outstream(FILENAME) as output:
        return subprocess.call(cmd, stdout=output)


if __name__ == '__main__':
    # Propagate the harvester's exit status so that a failed harvest is
    # visible to whatever launched this script.
    raise SystemExit(run())