Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ja573 committed Nov 27, 2018
1 parent 069fccc commit 67e872a
Show file tree
Hide file tree
Showing 9 changed files with 210 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
config.env
7 changes: 7 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
language: python
python:
- "2.7"
install:
- pip install -r ./config/requirements.txt
script:
- flake8 ./src/*
16 changes: 16 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
FROM python:2

RUN apt-get update && apt-get upgrade -y && \
pip install --upgrade pip

WORKDIR /usr/src/app

COPY ./config/requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt && \
rm requirements.txt

COPY ./src/* ./

RUN flake8 ./*

CMD ["python", "run"]
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018, Open Book Publishers.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# OAI URI import

Query an OAI repository and store the URIs found in the identifier translation service.

## Run via crontab
```
0 0 * * 1 docker run --rm --name "oai_harvester" --env-file /path/to/config.env openbookpublishers/oai_uri_import
```
14 changes: 14 additions & 0 deletions config/config.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
USER_AGENT="HIRMEOSCrossrefURIImporter/1.0; mailto:[email protected]"
FILENAME=OAIRepository.xml
CSV_LOC_URL=https://www.openbookpublishers.com/doi_prod_id_mapping.csv
CSV_TITLE_COL=0
CSV_DOI_COL=1
CSV_URI_START_COL=1
CSV_CANONICAL_END=4
DEFAULT_TYPE=monograph
URI_API_ENDP=https://identifier.translation.service/translate
URI_API_WORKS=https://identifier.translation.service/works
URI_API_URIS=https://identifier.translation.service/uris
AUTH_API_ENDP=https://authentication.service/auth
[email protected]
URI_API_PASS=some_secret_password
3 changes: 3 additions & 0 deletions config/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
flake8==3.6.0
httplib2==0.10.3
urllib3==1.20
111 changes: 111 additions & 0 deletions src/retrieve_oai_records
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import os
import sys
import time
import codecs
import httplib2
import xml.dom.pulldom
from optparse import OptionParser


# Value sent as the User-Agent header by get_oai(). It is read from the
# environment at import time, so a missing variable raises KeyError.
USER_AGENT = os.environ['USER_AGENT']

# Request-attempt counter (mutated by get_oai() through ``global``) and the
# limit it is compared against.
ATTEMPTS = 1
MAX_ATTEMPTS = 3

# Command-line option specifications consumed by get_options(): 'val' is the
# option flag, the remaining keys are forwarded to OptionParser.add_option().
ARGS = [
    dict(val='--repository',
         dest='repository',
         action='store',
         help='Root URL of the OAI repository'),
    dict(val='--metadata-prefix',
         dest='prefix',
         action='store',
         help='metadataPrefix used by the repository'),
    dict(val='--set',
         dest='set',
         action='store',
         help='A particular set to query (optional)'),
]


def get_options(args):
    """Parse the command line (sys.argv) according to the given specs.

    ``args`` is an iterable of dicts with the keys 'val' (the option flag),
    'dest', 'action' and 'help'; each one is registered on an OptionParser.

    Returns the parsed options object. Stray positional arguments, or a
    missing/empty ``repository`` or ``prefix`` option, are reported through
    ``parser.error``, which prints the usage message to stderr and exits
    with status 2 (raises SystemExit).
    """
    parser = OptionParser()
    for arg in args:
        parser.add_option(arg['val'], dest=arg['dest'],
                          action=arg['action'], help=arg['help'])
    options, rest = parser.parse_args()

    # Explicit checks instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if rest:
        parser.error('unexpected arguments: %s' % ' '.join(rest))
    if not (options.repository and options.prefix):
        parser.error('--repository and --metadata-prefix are required')
    return options


def build_url(repository, prefix, oai_set=''):
    """Return the initial ListRecords URL for ``repository``.

    The ``set`` query parameter is appended only when ``oai_set`` is truthy.
    """
    base = '%s?verb=ListRecords&metadataPrefix=%s' % (repository, prefix)
    if not oai_set:
        return base
    return base + '&set=%s' % (oai_set)


def build_resumption_url(repository, token):
    """Return the ListRecords URL that continues a listing with ``token``."""
    template = '%s?verb=ListRecords&resumptionToken=%s'
    return template % (repository, token)


def get_oai(url):
    """GET ``url`` and return the response body.

    A 200 response resets the module-level ATTEMPTS counter and returns the
    content. Responses with status 429, 503 or 504 are retried until
    MAX_ATTEMPTS requests have been made, sleeping for the number of seconds
    given in the Retry-After header when it is present and numeric. Any
    other status, or running out of attempts, writes an error line to stderr
    and terminates the process with exit status 1.
    """
    global ATTEMPTS
    headers = {'Accept': 'text/html', 'Accept-Encoding': 'compress, deflate',
               'User-Agent': USER_AGENT}
    retry_statuses = [429, 503, 504]

    while True:
        res, content = httplib2.Http().request(url, 'GET', headers=headers)
        if res.status == 200:
            ATTEMPTS = 1
            return content
        # ATTEMPTS counts requests already made (it starts at 1), so a strict
        # comparison keeps the total at MAX_ATTEMPTS rather than one more.
        if res.status not in retry_statuses or ATTEMPTS >= MAX_ATTEMPTS:
            break
        try:
            time.sleep(int(res['retry-after']))
        except (AttributeError, ValueError, IOError, KeyError):
            # Header missing or not a usable number of seconds: retry
            # immediately.
            pass
        ATTEMPTS += 1

    # Argument order matches the placeholders: status, attempt count, reason.
    sys.stderr.write("Error (%s) after %d attempts: %s\n"
                     % (res.status, ATTEMPTS, res.reason))
    sys.exit(1)


def run(respository, metadata_prefix, oai_set):
    """Harvest every <record> from an OAI repository and print them as XML.

    Fetches the initial ListRecords response and then follows each
    resumptionToken found in the response text until a response carries no
    non-empty token. Every ``record`` element is serialised into the
    temporary file 'tmp.xml' (in the current working directory), wrapped in
    a single <repository> root element, and the finished file is copied to
    standard output.

    ``respository`` is the repository root URL, ``metadata_prefix`` the
    metadataPrefix to request and ``oai_set`` an optional set name.
    """
    tempname = 'tmp.xml'
    regex = '<resumptionToken[^>]*>(.*)</resumptionToken>'

    # The context manager closes the temp file even when get_oai() exits or
    # the XML parser raises part-way through the harvest.
    with codecs.open(tempname, 'w+b', encoding='utf-8') as temp:
        temp.write('<repository>\n')
        data = get_oai(build_url(respository, metadata_prefix, oai_set))

        while True:
            events = xml.dom.pulldom.parseString(data)
            for (event, node) in events:
                if event == "START_ELEMENT" and node.tagName == 'record':
                    events.expandNode(node)
                    node.writexml(temp)

            # A missing or empty token ends the listing. This is checked
            # explicitly instead of via ``assert`` so it still works when
            # Python runs with -O.
            match = re.search(regex, data)
            token = match.group(1) if match else ''
            if not token:
                break
            data = get_oai(build_resumption_url(respository, token))

        temp.write('\n</repository>\n')

    with open(tempname, 'r') as temp_stream:
        for line in temp_stream:
            # Each line already ends with its newline; writing it verbatim
            # avoids the doubled line breaks that ``print line`` produced,
            # including inside multi-line record text.
            sys.stdout.write(line)


if __name__ == '__main__':
    # Script entry point: parse the command line, then harvest the repository.
    opts = get_options(ARGS)
    run(opts.repository, opts.prefix, opts.set)
29 changes: 29 additions & 0 deletions src/run
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import subprocess

# Harvest configuration, read from the environment at import time.
# OAI_REPOSITORY, OAI_METADATA_PREFIX and FILENAME are mandatory: a missing
# variable raises KeyError.
OAI_REPOSITORY = os.environ['OAI_REPOSITORY']
OAI_METADATA_PREFIX = os.environ['OAI_METADATA_PREFIX']
# Optional: an unset variable falls back to the empty string.
OAI_SET = os.environ.get('OAI_SET', '')
# Output file path, always placed under /tmp.
FILENAME = '/tmp/' + os.environ['FILENAME']


def outstream(filename):
    """Open ``filename`` for writing and return the file object.

    The caller is responsible for closing the returned file.
    """
    handle = open(filename, "w")
    return handle


def instream(filename):
    """Open ``filename`` for reading and return the file object.

    The caller is responsible for closing the returned file.
    """
    handle = open(filename, "r")
    return handle


def run():
    """Run ./retrieve_oai_records and capture its stdout in FILENAME.

    The harvester is invoked with the repository, metadata prefix and set
    taken from the module-level OAI_* constants. The output file is closed
    as soon as the child process finishes.

    Returns the child's exit status as reported by ``subprocess.call``.
    """
    cmd = ['./retrieve_oai_records',
           '--repository', OAI_REPOSITORY,
           '--metadata-prefix', OAI_METADATA_PREFIX,
           '--set', OAI_SET]
    # ``with`` guarantees the output handle is flushed and closed instead of
    # being left to the garbage collector.
    with outstream(FILENAME) as out:
        return subprocess.call(cmd, stdout=out)


if __name__ == '__main__':
    # Propagate the harvester's exit status so a failed harvest is visible
    # to whatever launched this script.
    raise SystemExit(run())

0 comments on commit 67e872a

Please sign in to comment.