From 67e872a14f147cc0016998e3b1554decb173d0b5 Mon Sep 17 00:00:00 2001 From: Javier Arias Date: Tue, 27 Nov 2018 12:24:05 +0000 Subject: [PATCH] first commit --- .gitignore | 1 + .travis.yml | 7 +++ Dockerfile | 16 ++++++ LICENSE | 21 ++++++++ README.md | 8 +++ config/config.env.example | 14 +++++ config/requirements.txt | 3 ++ src/retrieve_oai_records | 111 ++++++++++++++++++++++++++++++++++++++ src/run | 29 ++++++++++ 9 files changed, 210 insertions(+) create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 config/config.env.example create mode 100644 config/requirements.txt create mode 100755 src/retrieve_oai_records create mode 100755 src/run diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2549b3d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +config.env diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..ac157e7 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,7 @@ +language: python +python: + - "2.7" +install: + - pip install -r ./config/requirements.txt +script: + - flake8 ./src/* diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..55976b3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:2 + +RUN apt-get update && apt-get upgrade -y && \ + pip install --upgrade pip + +WORKDIR /usr/src/app + +COPY ./config/requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt && \ + rm requirements.txt + +COPY ./src/* ./ + +RUN flake8 ./* + +CMD ["python", "run"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c0a4328 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018, Open Book Publishers. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..e7d9106 --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# OAI URI import + +Query an OAI repository and store found URIs in the identifier translation service + +## Run via crontab +``` +0 0 * * 1 docker run --rm --name "oai_harvester" --env-file /path/to/config.env openbookpublishers/oai_uri_import +``` diff --git a/config/config.env.example b/config/config.env.example new file mode 100644 index 0000000..3376d27 --- /dev/null +++ b/config/config.env.example @@ -0,0 +1,14 @@ +USER_AGENT="HIRMEOSCrossrefURIImporter/1.0; mailto:contact@obp.com)" +FILENAME=OAIRepository.xml +CSV_LOC_URL=https://www.openbookpublishers.com/doi_prod_id_mapping.csv +CSV_TITLE_COL=0 +CSV_DOI_COL=1 +CSV_URI_START_COL=1 +CSV_CANONICAL_END=4 +DEFAULT_TYPE=monograph +URI_API_ENDP=https://identifier.translation.service/translate +URI_API_WORKS=https://identifier.translation.service/works +URI_API_URIS=https://identifier.translation.service/uris +AUTH_API_ENDP=https://authentication.service/auth +URI_API_USER=admin_user@openbookpublishers.com +URI_API_PASS=some_secret_password diff --git a/config/requirements.txt b/config/requirements.txt new file mode 100644 index 0000000..3d75347 --- /dev/null +++ b/config/requirements.txt @@ -0,0 +1,3 @@ +flake8==3.6.0 +httplib2==0.10.3 +urllib3==1.20 diff --git a/src/retrieve_oai_records b/src/retrieve_oai_records new file mode 100755 index 0000000..b55be9b --- /dev/null +++ b/src/retrieve_oai_records @@ -0,0 +1,111 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import re +import os +import sys +import time +import codecs +import httplib2 +import xml.dom.pulldom +from optparse import OptionParser + + +USER_AGENT = os.environ['USER_AGENT'] +ATTEMPTS = 1 +MAX_ATTEMPTS = 3 +ARGS = [ + { + 'val': '--repository', + 'dest': 'repository', + 'action': 'store', + 'help': 'Root URL of the OAI repository' + }, { + 'val': '--metadata-prefix', + 'dest': 'prefix', + 'action': 'store', + 'help': 
def get_options(args, argv=None):
    """Parse the command line and return the optparse options object.

    ``args`` is a list of dicts, each providing the keys ``val`` (the flag
    string), ``dest``, ``action`` and ``help``; one option is registered
    per dict.  ``argv`` is the argument list to parse; when it is None,
    optparse falls back to ``sys.argv[1:]``.

    Exits through ``parser.error`` (usage message on stderr) when stray
    positional arguments are present or when the repository or the
    metadata prefix is missing.
    """
    parser = OptionParser()
    for arg in args:
        parser.add_option(arg['val'], dest=arg['dest'],
                          action=arg['action'], help=arg['help'])
    options, rest = parser.parse_args(argv)

    # Explicit checks rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if rest:
        parser.error('unexpected arguments: %s' % ' '.join(rest))
    if not (options.repository and options.prefix):
        parser.error('--repository and --metadata-prefix are required')
    return options


def build_url(repository, prefix, oai_set=''):
    """Return the URL of the first ListRecords request.

    ``prefix`` is sent as ``metadataPrefix``; ``set`` is appended only when
    ``oai_set`` is truthy.  Both values are inserted unchanged.
    """
    url = '%s?verb=ListRecords&metadataPrefix=%s' % (repository, prefix)
    if oai_set:
        url += '&set=%s' % oai_set
    return url


def _quote(value):
    """Percent-encode ``value`` so it is safe as a query-string argument."""
    # Imported here so the helper works on Python 2 and 3 without adding a
    # module-level import in the middle of the file.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3
    return quote(value, safe='')


def build_resumption_url(repository, token):
    """Return the ListRecords URL that continues a partial response.

    The token is issued by the server and may contain characters that are
    special in a query string (``+``, ``/``, ``=``, ``&`` ...), so it is
    percent-encoded before being placed in the URL.
    """
    return '%s?verb=ListRecords&resumptionToken=%s' % (repository,
                                                       _quote(token))


def get_oai(url):
    """GET ``url`` and return the response body.

    A 200 response returns the content.  Status 429, 503 and 504 are
    retried until MAX_ATTEMPTS requests have been made in total, sleeping
    for the number of seconds given by an integer ``retry-after`` header
    when one is present.  Any other outcome writes an error line to stderr
    and terminates the process with exit status 1.
    """
    headers = {'Accept': 'text/html', 'Accept-Encoding': 'compress, deflate',
               'User-Agent': USER_AGENT}
    attempts = 0
    while True:
        attempts += 1
        res, content = httplib2.Http().request(url, 'GET', headers=headers)
        if res.status == 200:
            return content
        if res.status not in (429, 503, 504) or attempts >= MAX_ATTEMPTS:
            break
        try:
            time.sleep(int(res['retry-after']))
        except (AttributeError, ValueError, IOError, KeyError):
            # Header missing or not a usable number of seconds: retry
            # straight away (best effort, as before).
            pass
    # Argument order matches the placeholders: status, attempt count, reason.
    sys.stderr.write("Error (%s) after %d attempts: %s\n"
                     % (res.status, attempts, res.reason))
    sys.exit(1)


def run(respository, metadata_prefix, oai_set):
    """Harvest all records of a repository and print them to stdout.

    Every ``record`` element of each ListRecords response is appended to
    the scratch file ``tmp.xml``; further pages are requested for as long
    as a response carries a non-empty resumption token.  The scratch file
    is then copied to stdout line by line.
    """
    tempname = 'tmp.xml'
    # The element name is required for the pattern to find the token at all.
    regex = '<resumptionToken[^>]*>(.*)</resumptionToken>'
    temp = codecs.open(tempname, 'w+b', encoding='utf-8')
    try:
        # NOTE(review): the two wrapper literals written below look as if
        # markup was stripped from them (only newlines are left) - confirm
        # against the original script before relying on the output format.
        temp.write('\n')
        data = get_oai(build_url(respository, metadata_prefix, oai_set))
        while True:
            events = xml.dom.pulldom.parseString(data)
            for (event, node) in events:
                if event == "START_ELEMENT" and node.tagName == 'record':
                    events.expandNode(node)
                    node.writexml(temp)
            match = re.search(regex, data)
            token = match.group(1) if match else ''
            if not token:
                # No token, or an empty one: this was the last page.
                break
            data = get_oai(build_resumption_url(respository, token))
        temp.write('\n\n')
    finally:
        # Also reached when get_oai() exits the process on a failed request.
        temp.close()
    with open(tempname, 'r') as stream:
        for line in stream:
            # Same bytes as the former ``print line``: the line (with its
            # own terminator) followed by one more newline.
            sys.stdout.write(line + '\n')


if __name__ == '__main__':
    options = get_options(ARGS)
    run(options.repository, options.prefix, options.set)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Run ./retrieve_oai_records and store its standard output in FILENAME."""
import os
import subprocess

# Settings come from the environment; a missing mandatory variable raises
# KeyError as soon as the script starts.
OAI_REPOSITORY = os.environ['OAI_REPOSITORY']
OAI_METADATA_PREFIX = os.environ['OAI_METADATA_PREFIX']
# The harvester documents --set as optional, so the variable is optional too.
OAI_SET = os.environ.get('OAI_SET', '')
FILENAME = '/tmp/' + os.environ['FILENAME']


def outstream(filename):
    """Open ``filename`` for writing (truncating it) and return the file."""
    return open(filename, "w")


def instream(filename):
    """Open ``filename`` for reading and return the file."""
    return open(filename, "r")


def run():
    """Harvest the configured repository into FILENAME.

    Returns the exit status of ./retrieve_oai_records so that the caller
    can tell a failed harvest from a successful one.
    """
    cmd = ['./retrieve_oai_records',
           '--repository', OAI_REPOSITORY,
           '--metadata-prefix', OAI_METADATA_PREFIX]
    if OAI_SET:
        cmd += ['--set', OAI_SET]
    # The context manager closes (and flushes) the output file even when
    # the child process cannot be started.
    with outstream(FILENAME) as out:
        return subprocess.call(cmd, stdout=out)


if __name__ == '__main__':
    # Propagate the harvester's exit status to whoever scheduled this job.
    raise SystemExit(run())