# Patch summary (free text before the first "diff --git" header).
#
# NOTE(review): this patch was recovered from a copy in which every newline had
# been collapsed to a space. Line breaks and blank lines were restored from the
# hunk headers (the old/new line counts of every hunk below add up), but the
# leading indentation of the Python inside the hunks was reconstructed from
# block structure and must be confirmed against the target files before
# applying (or apply with whitespace-insensitive matching).
#
# What the patch changes:
#   * rorapi/common/csv_create.py - names that share a value are merged only
#     when they also share a lang code, and the merged entry keeps that lang
#     code; the removed code merged on value alone and set "lang" to None.
#   * rorapi/common/csv_update.py - error message wording: "names" -> "name".
#   * rorapi/management/commands/getrordump.py - new management command that
#     looks up the dump's blob sha in the repo contents listing, fetches the
#     blob, base64-decodes it and writes it to "<filename>.zip". The
#     Authorization header is used only when ROR_DUMP['GITHUB_TOKEN'] is truthy.
#   * rorapi/management/commands/indexrordump.py - the download helpers are
#     removed; handle() now expects "<filename>.zip" to exist already and loads
#     each JSON file only inside the schema branch that indexes it.
#   * rorapi/management/commands/setup.py - runs GetRorDumpCommand before the
#     delete/create/index commands.
#
# Review notes on the added code (not changed here, so the hunks and blob
# hashes still describe the original change):
#   * setup.py: the added "HEADERS = {'Authorization': ...}" rebinds the name
#     assigned two lines above it, so the Accept-only dict is never used and an
#     Authorization header is built even when the token is empty. getrordump.py
#     in this same patch guards that case with "if ROR_DUMP['GITHUB_TOKEN']".
#   * getrordump.py: both helpers use a bare "except:" and return None for any
#     failure, including KeyboardInterrupt/SystemExit raised in the try body.
#   * getrordump.py: Command defines no add_arguments() in this patch, while
#     handle() reads options['filename'] and options['testdata'].

diff --git a/rorapi/common/csv_create.py b/rorapi/common/csv_create.py
index cf1260d..cc8799f 100644
--- a/rorapi/common/csv_create.py
+++ b/rorapi/common/csv_create.py
@@ -71,27 +71,29 @@ def new_record_from_csv(csv_data, version):
                 temp_names.append(name_obj)
     print("temp names 1:")
     print(temp_names)
-    name_values = [n['value'] for n in temp_names]
+    name_vals = [n['value'] for n in temp_names]
     dup_names = []
-    for n in name_values:
-        if name_values.count(n) > 1:
+    for n in name_vals:
+        if name_vals.count(n) > 1:
             if n not in dup_names:
                 dup_names.append(n)
-    if dup_names:
-        dup_names_objs = []
-        for d in dup_names:
-            types = []
-            for t in temp_names:
-                if t['value'] == d:
-                    types.extend(t['types'])
-            name_obj = {
-                "types": types,
-                "value": d,
-                "lang": None
-            }
-            dup_names_objs.append(name_obj)
-        temp_names = [t for t in temp_names if t['value'] not in dup_names]
-        temp_names.extend(dup_names_objs)
+    for d in dup_names:
+        dup_names_objs = [t for t in temp_names if t['value'] == d]
+        lang_codes = [dno['lang'] for dno in dup_names_objs]
+        for lang_code in lang_codes:
+            if lang_codes.count(lang_code) > 1:
+                name_lang_dups = [dno for dno in dup_names_objs if dno['lang'] == lang_code]
+                types = []
+                for n in name_lang_dups:
+                    types.extend(n['types'])
+                name_obj = {
+                    "types": types,
+                    "value": d,
+                    "lang": lang_code
+                }
+                if name_obj not in temp_names:
+                    temp_names = [t for t in temp_names if t not in name_lang_dups]
+                    temp_names.append(name_obj)
     print("temp names 2:")
     print(temp_names)
     v2_data['names'] = temp_names
diff --git a/rorapi/common/csv_update.py b/rorapi/common/csv_update.py
index d8b44a6..51e8d5f 100644
--- a/rorapi/common/csv_update.py
+++ b/rorapi/common/csv_update.py
@@ -290,7 +290,7 @@ def update_record_from_csv(csv_data, version):
                         # check if value, lang and type already exist
                         if temp_names_match or temp_names_null_lang_match:
                             if temp_names_match:
-                                errors.append("Attempting to add names that already exists: {}".format(a))
+                                errors.append("Attempting to add name that already exists: {}".format(a))
                             if temp_names_null_lang_match:
                                 errors.append("Attempting to add name with lang code that already exists with no lang code: {}".format(a))
                         else:
diff --git a/rorapi/management/commands/getrordump.py b/rorapi/management/commands/getrordump.py
new file mode 100644
index 0000000..4bee42d
--- /dev/null
+++ b/rorapi/management/commands/getrordump.py
@@ -0,0 +1,64 @@
+import json
+import os
+import re
+import requests
+import zipfile
+import base64
+from io import BytesIO
+from rorapi.settings import ES7, ES_VARS, ROR_DUMP, DATA
+from django.core.management.base import BaseCommand
+
+HEADERS = {'Accept': 'application/vnd.github.v3+json'}
+AUTH_HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}
+
+def get_ror_dump_sha(filename, use_test_data, github_headers):
+    sha = ''
+    if use_test_data:
+        contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents'
+    else:
+        contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents'
+    try:
+        response = requests.get(contents_url, headers=github_headers)
+    except requests.exceptions.RequestException as e:
+        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
+    try:
+        repo_contents = response.json()
+        for file in repo_contents:
+            if filename in file['name']:
+                sha = file['sha']
+        return sha
+    except:
+        return None
+
+def get_ror_dump_zip(filename, use_test_data, github_headers):
+    sha = get_ror_dump_sha(filename, use_test_data, github_headers)
+    if sha:
+        if use_test_data:
+            blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha
+        else:
+            blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha
+        try:
+            response = requests.get(blob_url, headers=github_headers)
+        except requests.exceptions.RequestException as e:
+            raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
+        try:
+            response_json = response.json()
+            file_decoded = base64.b64decode(response_json['content'])
+            with open(filename + '.zip', 'wb') as zip_file:
+                zip_file.write(file_decoded)
+            return zip_file.name
+        except:
+            return None
+
+class Command(BaseCommand):
+    help = 'Downloads a specified ROR data dump from Github'
+
+    def handle(self, *args, **options):
+        filename = options['filename']
+        use_test_data = options['testdata']
+        self.stdout.write('Getting ROR dump')
+        if ROR_DUMP['GITHUB_TOKEN']:
+            github_headers = AUTH_HEADERS
+        else:
+            github_headers = HEADERS
+        ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers)
diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py
index 10b6704..f1ce85a 100644
--- a/rorapi/management/commands/indexrordump.py
+++ b/rorapi/management/commands/indexrordump.py
@@ -96,44 +96,6 @@ def index_dump(self, filename, index, dataset):
         ES7.indices.delete(backup_index)
     self.stdout.write('ROR dataset ' + filename + ' indexed')
 
-def get_ror_dump_sha(filename, use_test_data):
-    sha = ''
-    if use_test_data:
-        contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents'
-    else:
-        contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents'
-    try:
-        response = requests.get(contents_url, headers=HEADERS)
-    except requests.exceptions.RequestException as e:
-        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
-    try:
-        repo_contents = response.json()
-        for file in repo_contents:
-            if filename in file['name']:
-                sha = file['sha']
-        return sha
-    except:
-        return None
-
-def get_ror_dump_zip(filename, use_test_data):
-    sha = get_ror_dump_sha(filename, use_test_data)
-    if sha:
-        if use_test_data:
-            blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha
-        else:
-            blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha
-        try:
-            response = requests.get(blob_url, headers=HEADERS)
-        except requests.exceptions.RequestException as e:
-            raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
-        try:
-            response_json = response.json()
-            file_decoded = base64.b64decode(response_json['content'])
-            with open(filename + '.zip', 'wb') as zip_file:
-                zip_file.write(file_decoded)
-            return zip_file.name
-        except:
-            return None
 
 class Command(BaseCommand):
     help = 'Indexes ROR dataset from a full dump file in ror-data repo'
@@ -141,11 +103,11 @@ class Command(BaseCommand):
     def handle(self, *args, **options):
         json_files = []
         filename = options['filename']
-        use_test_data = options['testdata']
-        ror_dump_zip = get_ror_dump_zip(filename, use_test_data)
-        if ror_dump_zip:
+        ror_dump_zip = filename + '.zip'
+        if os.path.exists(ror_dump_zip):
             if not os.path.exists(DATA['WORKING_DIR']):
                 os.makedirs(DATA['WORKING_DIR'])
+            self.stdout.write('Extracting ROR dump')
             with zipfile.ZipFile(ror_dump_zip, 'r') as zip_ref:
                 zip_ref.extractall(DATA['WORKING_DIR'] + filename)
             unzipped_files = os.listdir(DATA['WORKING_DIR'] + filename)
@@ -155,13 +117,17 @@ def handle(self, *args, **options):
             for json_file in json_files:
                 index = None
                 json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
-                with open(json_path, 'r') as it:
-                    dataset = json.load(it)
                 if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
+                    self.stdout.write('Loading JSON')
+                    with open(json_path, 'r') as it:
+                        dataset = json.load(it)
                     self.stdout.write('Indexing ROR dataset ' + json_file)
                     index = ES_VARS['INDEX_V2']
                     index_dump(self, json_file, index, dataset)
                 if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
+                    self.stdout.write('Loading JSON')
+                    with open(json_path, 'r') as it:
+                        dataset = json.load(it)
                     self.stdout.write('Indexing ROR dataset ' + json_file)
                     index = ES_VARS['INDEX_V1']
                     index_dump(self, json_file, index, dataset)
diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py
index bf7e5fe..3505e72 100644
--- a/rorapi/management/commands/setup.py
+++ b/rorapi/management/commands/setup.py
@@ -1,14 +1,17 @@
 import requests
 import zipfile
-
+import base64
 from django.core.management.base import BaseCommand
 from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand
 from rorapi.management.commands.createindex import Command as CreateIndexCommand
 from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand
+from rorapi.management.commands.getrordump import Command as GetRorDumpCommand
 from rorapi.settings import ROR_DUMP
 
 HEADERS = {'Accept': 'application/vnd.github.v3+json'}
 
+HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}
+
 def get_ror_dump_sha(filename, use_test_data):
     sha = ''
     if use_test_data:
@@ -49,6 +52,7 @@ def handle(self, *args, **options):
         sha = get_ror_dump_sha(filename, use_test_data)
 
         if sha:
+            GetRorDumpCommand().handle(*args, **options)
             DeleteIndexCommand().handle(*args, **options)
             CreateIndexCommand().handle(*args, **options)
             IndexRorDumpCommand().handle(*args, **options)