Skip to content

Commit

Permalink
Merge pull request #395 from ror-community/staging
Browse files Browse the repository at this point in the history
Merge staging to prod: Update index ror dump commands
  • Loading branch information
lizkrznarich authored Apr 18, 2024
2 parents 943060b + cd6ac78 commit 94bad80
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 63 deletions.
38 changes: 20 additions & 18 deletions rorapi/common/csv_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,29 @@ def new_record_from_csv(csv_data, version):
temp_names.append(name_obj)
print("temp names 1:")
print(temp_names)
name_values = [n['value'] for n in temp_names]
name_vals = [n['value'] for n in temp_names]
dup_names = []
for n in name_values:
if name_values.count(n) > 1:
for n in name_vals:
if name_vals.count(n) > 1:
if n not in dup_names:
dup_names.append(n)
if dup_names:
dup_names_objs = []
for d in dup_names:
types = []
for t in temp_names:
if t['value'] == d:
types.extend(t['types'])
name_obj = {
"types": types,
"value": d,
"lang": None
}
dup_names_objs.append(name_obj)
temp_names = [t for t in temp_names if t['value'] not in dup_names]
temp_names.extend(dup_names_objs)
for d in dup_names:
dup_names_objs = [t for t in temp_names if t['value'] == d]
lang_codes = [dno['lang'] for dno in dup_names_objs]
for lang_code in lang_codes:
if lang_codes.count(lang_code) > 1:
name_lang_dups = [dno for dno in dup_names_objs if dno['lang'] == lang_code]
types = []
for n in name_lang_dups:
types.extend(n['types'])
name_obj = {
"types": types,
"value": d,
"lang": lang_code
}
if name_obj not in temp_names:
temp_names = [t for t in temp_names if t not in name_lang_dups]
temp_names.append(name_obj)
print("temp names 2:")
print(temp_names)
v2_data['names'] = temp_names
Expand Down
2 changes: 1 addition & 1 deletion rorapi/common/csv_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def update_record_from_csv(csv_data, version):
# check if value, lang and type already exist
if temp_names_match or temp_names_null_lang_match:
if temp_names_match:
errors.append("Attempting to add names that already exists: {}".format(a))
errors.append("Attempting to add name that already exists: {}".format(a))
if temp_names_null_lang_match:
errors.append("Attempting to add name with lang code that already exists with no lang code: {}".format(a))
else:
Expand Down
64 changes: 64 additions & 0 deletions rorapi/management/commands/getrordump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import json
import os
import re
import requests
import zipfile
import base64
from io import BytesIO
from rorapi.settings import ES7, ES_VARS, ROR_DUMP, DATA
from django.core.management.base import BaseCommand

# Headers for GitHub API requests made without credentials.
HEADERS = {
    'Accept': 'application/vnd.github.v3+json',
}

# Headers for GitHub API requests that carry the token read from
# ROR_DUMP['GITHUB_TOKEN'] in an Authorization header.
AUTH_HEADERS = {
    'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']),
    'Accept': 'application/vnd.github.v3+json',
}

def get_ror_dump_sha(filename, use_test_data, github_headers):
    """Look up the blob SHA of a ROR dump file in the data repository.

    Requests the ``/contents`` listing of the test or production repository
    and scans it for entries whose ``name`` contains ``filename``.

    Args:
        filename: Text searched for inside each listed file name
            (substring match, not an exact comparison).
        use_test_data: When truthy, ``ROR_DUMP['TEST_REPO_URL']`` is queried;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.
        github_headers: HTTP headers sent with the request.

    Returns:
        The ``sha`` of the last matching entry, ``''`` when no entry matches,
        or ``None`` when the response body is not a usable listing.

    Raises:
        SystemExit: If the HTTP request itself fails.
    """
    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    contents_url = ROR_DUMP[repo_key] + '/contents'
    try:
        response = requests.get(contents_url, headers=github_headers)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}") from e

    sha = ''
    try:
        repo_contents = response.json()
        # Later matches overwrite earlier ones, so the last match wins.
        for entry in repo_contents:
            if filename in entry['name']:
                sha = entry['sha']
    except (ValueError, TypeError, KeyError):
        # ValueError: body is not JSON. TypeError/KeyError: the JSON is not a
        # list of objects with 'name'/'sha' (e.g. an API error payload).
        # Narrower than a bare except so Ctrl-C and SystemExit still propagate.
        return None
    return sha

def get_ror_dump_zip(filename, use_test_data, github_headers):
    """Download a ROR dump blob and write it to ``filename + '.zip'``.

    Resolves the blob SHA with ``get_ror_dump_sha``, requests the blob from
    the ``/git/blobs/<sha>`` endpoint of the selected repository, base64
    decodes its ``content`` field and writes the bytes to disk.

    Args:
        filename: Dump name used both for the SHA lookup and as the stem of
            the output path.
        use_test_data: When truthy, ``ROR_DUMP['TEST_REPO_URL']`` is used;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.
        github_headers: HTTP headers sent with each request.

    Returns:
        The path of the written zip file, or ``None`` when no SHA was found
        or the blob could not be decoded or written.

    Raises:
        SystemExit: If an HTTP request fails.
    """
    sha = get_ror_dump_sha(filename, use_test_data, github_headers)
    if not sha:
        return None

    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    blob_url = ROR_DUMP[repo_key] + '/git/blobs/' + sha
    try:
        response = requests.get(blob_url, headers=github_headers)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"Github blob is Not reachable \nErr: {e}") from e

    try:
        response_json = response.json()
        # Decode before opening the file so a bad payload leaves nothing on disk.
        file_decoded = base64.b64decode(response_json['content'])
        with open(filename + '.zip', 'wb') as zip_file:
            zip_file.write(file_decoded)
        return zip_file.name
    except (ValueError, TypeError, KeyError, OSError):
        # ValueError covers invalid JSON and invalid base64 (binascii.Error);
        # TypeError/KeyError cover an unexpected payload shape; OSError covers
        # file-system failures. Narrower than a bare except so Ctrl-C and
        # SystemExit still propagate.
        return None

class Command(BaseCommand):
    """Management command that fetches a ROR data dump zip from GitHub."""

    help = 'Downloads a specified ROR data dump from Github'

    def handle(self, *args, **options):
        """Download the dump named by ``options['filename']``.

        ``options['testdata']`` selects the test repository instead of the
        production one. The result of the download is not inspected here.
        """
        filename = options['filename']
        use_test_data = options['testdata']
        self.stdout.write('Getting ROR dump')
        # Send the token header only when a token is configured.
        github_headers = AUTH_HEADERS if ROR_DUMP['GITHUB_TOKEN'] else HEADERS
        get_ror_dump_zip(filename, use_test_data, github_headers)
52 changes: 9 additions & 43 deletions rorapi/management/commands/indexrordump.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,56 +96,18 @@ def index_dump(self, filename, index, dataset):
ES7.indices.delete(backup_index)
self.stdout.write('ROR dataset ' + filename + ' indexed')

def get_ror_dump_sha(filename, use_test_data):
    """Look up the blob SHA of a ROR dump file in the data repository.

    Requests the ``/contents`` listing of the test or production repository
    using the module-level ``HEADERS`` and scans it for entries whose
    ``name`` contains ``filename``.

    Args:
        filename: Text searched for inside each listed file name
            (substring match, not an exact comparison).
        use_test_data: When truthy, ``ROR_DUMP['TEST_REPO_URL']`` is queried;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.

    Returns:
        The ``sha`` of the last matching entry, ``''`` when no entry matches,
        or ``None`` when the response body is not a usable listing.

    Raises:
        SystemExit: If the HTTP request itself fails.
    """
    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    contents_url = ROR_DUMP[repo_key] + '/contents'
    try:
        response = requests.get(contents_url, headers=HEADERS)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}") from e

    sha = ''
    try:
        repo_contents = response.json()
        # Later matches overwrite earlier ones, so the last match wins.
        for entry in repo_contents:
            if filename in entry['name']:
                sha = entry['sha']
    except (ValueError, TypeError, KeyError):
        # ValueError: body is not JSON. TypeError/KeyError: the JSON is not a
        # list of objects with 'name'/'sha' (e.g. an API error payload).
        # Narrower than a bare except so Ctrl-C and SystemExit still propagate.
        return None
    return sha

def get_ror_dump_zip(filename, use_test_data):
    """Download a ROR dump blob and write it to ``filename + '.zip'``.

    Resolves the blob SHA with ``get_ror_dump_sha``, requests the blob from
    the ``/git/blobs/<sha>`` endpoint of the selected repository using the
    module-level ``HEADERS``, base64 decodes its ``content`` field and
    writes the bytes to disk.

    Args:
        filename: Dump name used both for the SHA lookup and as the stem of
            the output path.
        use_test_data: When truthy, ``ROR_DUMP['TEST_REPO_URL']`` is used;
            otherwise ``ROR_DUMP['PROD_REPO_URL']``.

    Returns:
        The path of the written zip file, or ``None`` when no SHA was found
        or the blob could not be decoded or written.

    Raises:
        SystemExit: If an HTTP request fails.
    """
    sha = get_ror_dump_sha(filename, use_test_data)
    if not sha:
        return None

    repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL'
    blob_url = ROR_DUMP[repo_key] + '/git/blobs/' + sha
    try:
        response = requests.get(blob_url, headers=HEADERS)
    except requests.exceptions.RequestException as e:
        raise SystemExit(f"Github blob is Not reachable \nErr: {e}") from e

    try:
        response_json = response.json()
        # Decode before opening the file so a bad payload leaves nothing on disk.
        file_decoded = base64.b64decode(response_json['content'])
        with open(filename + '.zip', 'wb') as zip_file:
            zip_file.write(file_decoded)
        return zip_file.name
    except (ValueError, TypeError, KeyError, OSError):
        # ValueError covers invalid JSON and invalid base64 (binascii.Error);
        # TypeError/KeyError cover an unexpected payload shape; OSError covers
        # file-system failures. Narrower than a bare except so Ctrl-C and
        # SystemExit still propagate.
        return None

class Command(BaseCommand):
help = 'Indexes ROR dataset from a full dump file in ror-data repo'

def handle(self, *args, **options):
json_files = []
filename = options['filename']
use_test_data = options['testdata']
ror_dump_zip = get_ror_dump_zip(filename, use_test_data)
if ror_dump_zip:
ror_dump_zip = filename + '.zip'
if os.path.exists(ror_dump_zip):
if not os.path.exists(DATA['WORKING_DIR']):
os.makedirs(DATA['WORKING_DIR'])
self.stdout.write('Extracting ROR dump')
with zipfile.ZipFile(ror_dump_zip, 'r') as zip_ref:
zip_ref.extractall(DATA['WORKING_DIR'] + filename)
unzipped_files = os.listdir(DATA['WORKING_DIR'] + filename)
Expand All @@ -155,13 +117,17 @@ def handle(self, *args, **options):
for json_file in json_files:
index = None
json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
with open(json_path, 'r') as it:
dataset = json.load(it)
if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
self.stdout.write('Loading JSON')
with open(json_path, 'r') as it:
dataset = json.load(it)
self.stdout.write('Indexing ROR dataset ' + json_file)
index = ES_VARS['INDEX_V2']
index_dump(self, json_file, index, dataset)
if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
self.stdout.write('Loading JSON')
with open(json_path, 'r') as it:
dataset = json.load(it)
self.stdout.write('Indexing ROR dataset ' + json_file)
index = ES_VARS['INDEX_V1']
index_dump(self, json_file, index, dataset)
Expand Down
6 changes: 5 additions & 1 deletion rorapi/management/commands/setup.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import requests
import zipfile

import base64
from django.core.management.base import BaseCommand
from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand
from rorapi.management.commands.createindex import Command as CreateIndexCommand
from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand
from rorapi.management.commands.getrordump import Command as GetRorDumpCommand
from rorapi.settings import ROR_DUMP

HEADERS = {'Accept': 'application/vnd.github.v3+json'}

HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}

def get_ror_dump_sha(filename, use_test_data):
sha = ''
if use_test_data:
Expand Down Expand Up @@ -49,6 +52,7 @@ def handle(self, *args, **options):
sha = get_ror_dump_sha(filename, use_test_data)

if sha:
GetRorDumpCommand().handle(*args, **options)
DeleteIndexCommand().handle(*args, **options)
CreateIndexCommand().handle(*args, **options)
IndexRorDumpCommand().handle(*args, **options)
Expand Down

0 comments on commit 94bad80

Please sign in to comment.