Merge pull request #395 from ror-community/staging

Merge staging to prod: Update index ror dump commands
ror-community · Apr 18, 2024 · commit 94bad80
2 parents: 943060b + cd6ac78
Show file tree

Hide file tree

Showing 5 changed files with 99 additions and 63 deletions.
diff --git a/rorapi/common/csv_create.py b/rorapi/common/csv_create.py
@@ -71,27 +71,29 @@ def new_record_from_csv(csv_data, version):
                 temp_names.append(name_obj)
     print("temp names 1:")
     print(temp_names)
-    name_values = [n['value'] for n in temp_names]
+    name_vals = [n['value'] for n in temp_names]
     dup_names = []
-    for n in name_values:
-        if name_values.count(n) > 1:
+    for n in name_vals:
+        if name_vals.count(n) > 1:
             if n not in dup_names:
                 dup_names.append(n)
-    if dup_names:
-        dup_names_objs = []
-        for d in dup_names:
-            types = []
-            for t in temp_names:
-                if t['value'] == d:
-                    types.extend(t['types'])
-            name_obj = {
-                "types": types,
-                "value": d,
-                "lang": None
-            }
-            dup_names_objs.append(name_obj)
-        temp_names = [t for t in temp_names if t['value'] not in dup_names]
-        temp_names.extend(dup_names_objs)
+    for d in dup_names:
+        dup_names_objs = [t for t in temp_names if t['value'] == d]
+        lang_codes = [dno['lang'] for dno in dup_names_objs]
+        for lang_code in lang_codes:
+            if lang_codes.count(lang_code) > 1:
+                name_lang_dups = [dno for dno in dup_names_objs if dno['lang'] == lang_code]
+                types = []
+                for n in name_lang_dups:
+                    types.extend(n['types'])
+                name_obj = {
+                    "types": types,
+                    "value": d,
+                    "lang": lang_code
+                }
+                if name_obj not in temp_names:
+                    temp_names = [t for t in temp_names if t not in name_lang_dups]
+                    temp_names.append(name_obj)
     print("temp names 2:")
     print(temp_names)
     v2_data['names'] = temp_names

diff --git a/rorapi/common/csv_update.py b/rorapi/common/csv_update.py
@@ -290,7 +290,7 @@ def update_record_from_csv(csv_data, version):
                                 # check if value, lang and type already exist
                                 if temp_names_match or temp_names_null_lang_match:
                                     if temp_names_match:
-                                        errors.append("Attempting to add names that already exists: {}".format(a))
+                                        errors.append("Attempting to add name that already exists: {}".format(a))
                                     if temp_names_null_lang_match:
                                         errors.append("Attempting to add name with lang code that already exists with no lang code: {}".format(a))
                                 else:

diff --git a/rorapi/management/commands/getrordump.py b/rorapi/management/commands/getrordump.py
@@ -0,0 +1,64 @@
+import json
+import os
+import re
+import requests
+import zipfile
+import base64
+from io import BytesIO
+from rorapi.settings import ES7, ES_VARS, ROR_DUMP, DATA
+from django.core.management.base import BaseCommand
+
+HEADERS = {'Accept': 'application/vnd.github.v3+json'}
+AUTH_HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}
+
+def get_ror_dump_sha(filename, use_test_data, github_headers):
+    sha = ''
+    if use_test_data:
+        contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents'
+    else:
+        contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents'
+    try:
+        response = requests.get(contents_url, headers=github_headers)
+    except requests.exceptions.RequestException as e:
+        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
+    try:
+        repo_contents = response.json()
+        for file in repo_contents:
+            if filename in file['name']:
+                sha = file['sha']
+        return sha
+    except:
+        return None
+
+def get_ror_dump_zip(filename, use_test_data, github_headers):
+    sha = get_ror_dump_sha(filename, use_test_data, github_headers)
+    if sha:
+        if use_test_data:
+            blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha
+        else:
+            blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha
+        try:
+            response = requests.get(blob_url, headers=github_headers)
+        except requests.exceptions.RequestException as e:
+            raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
+        try:
+            response_json = response.json()
+            file_decoded = base64.b64decode(response_json['content'])
+            with open(filename + '.zip', 'wb') as zip_file:
+                zip_file.write(file_decoded)
+            return zip_file.name
+        except:
+            return None
+
+class Command(BaseCommand):
+    help = 'Downloads a specified ROR data dump from Github'
+
+    def handle(self, *args, **options):
+        filename = options['filename']
+        use_test_data = options['testdata']
+        self.stdout.write('Getting ROR dump')
+        if ROR_DUMP['GITHUB_TOKEN']:
+            github_headers = AUTH_HEADERS
+        else:
+            github_headers = HEADERS
+        ror_dump_zip = get_ror_dump_zip(filename, use_test_data, github_headers)
diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py
@@ -96,56 +96,18 @@ def index_dump(self, filename, index, dataset):
         ES7.indices.delete(backup_index)
     self.stdout.write('ROR dataset ' + filename + ' indexed')
 
-def get_ror_dump_sha(filename, use_test_data):
-    sha = ''
-    if use_test_data:
-        contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents'
-    else:
-        contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents'
-    try:
-        response = requests.get(contents_url, headers=HEADERS)
-    except requests.exceptions.RequestException as e:
-        raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}")
-    try:
-        repo_contents = response.json()
-        for file in repo_contents:
-            if filename in file['name']:
-                sha = file['sha']
-        return sha
-    except:
-        return None
-
-def get_ror_dump_zip(filename, use_test_data):
-    sha = get_ror_dump_sha(filename, use_test_data)
-    if sha:
-        if use_test_data:
-            blob_url = ROR_DUMP['TEST_REPO_URL'] + '/git/blobs/' + sha
-        else:
-            blob_url = ROR_DUMP['PROD_REPO_URL'] + '/git/blobs/' + sha
-        try:
-            response = requests.get(blob_url, headers=HEADERS)
-        except requests.exceptions.RequestException as e:
-            raise SystemExit(f"Github blob is Not reachable \nErr: {e}")
-        try:
-            response_json = response.json()
-            file_decoded = base64.b64decode(response_json['content'])
-            with open(filename + '.zip', 'wb') as zip_file:
-                zip_file.write(file_decoded)
-            return zip_file.name
-        except:
-            return None
 
 class Command(BaseCommand):
     help = 'Indexes ROR dataset from a full dump file in ror-data repo'
 
     def handle(self, *args, **options):
         json_files = []
         filename = options['filename']
-        use_test_data = options['testdata']
-        ror_dump_zip = get_ror_dump_zip(filename, use_test_data)
-        if ror_dump_zip:
+        ror_dump_zip = filename + '.zip'
+        if os.path.exists(ror_dump_zip):
             if not os.path.exists(DATA['WORKING_DIR']):
                 os.makedirs(DATA['WORKING_DIR'])
+            self.stdout.write('Extracting ROR dump')
             with zipfile.ZipFile(ror_dump_zip, 'r') as zip_ref:
                 zip_ref.extractall(DATA['WORKING_DIR'] + filename)
             unzipped_files = os.listdir(DATA['WORKING_DIR'] + filename)
@@ -155,13 +117,17 @@ def handle(self, *args, **options):
             for json_file in json_files:
                 index = None
                 json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
-                with open(json_path, 'r') as it:
-                    dataset = json.load(it)
                 if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
+                    self.stdout.write('Loading JSON')
+                    with open(json_path, 'r') as it:
+                        dataset = json.load(it)
                     self.stdout.write('Indexing ROR dataset ' + json_file)
                     index = ES_VARS['INDEX_V2']
                     index_dump(self, json_file, index, dataset)
                 if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
+                    self.stdout.write('Loading JSON')
+                    with open(json_path, 'r') as it:
+                        dataset = json.load(it)
                     self.stdout.write('Indexing ROR dataset ' + json_file)
                     index = ES_VARS['INDEX_V1']
                     index_dump(self, json_file, index, dataset)

diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py
@@ -1,14 +1,17 @@
 import requests
 import zipfile
-
+import base64
 from django.core.management.base import BaseCommand
 from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand
 from rorapi.management.commands.createindex import Command as CreateIndexCommand
 from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand
+from rorapi.management.commands.getrordump import Command as GetRorDumpCommand
 from rorapi.settings import ROR_DUMP
 
 HEADERS = {'Accept': 'application/vnd.github.v3+json'}
 
+HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'}
+
 def get_ror_dump_sha(filename, use_test_data):
     sha = ''
     if use_test_data:
@@ -49,6 +52,7 @@ def handle(self, *args, **options):
         sha = get_ror_dump_sha(filename, use_test_data)
 
         if sha:
+            GetRorDumpCommand().handle(*args, **options)
             DeleteIndexCommand().handle(*args, **options)
             CreateIndexCommand().handle(*args, **options)
             IndexRorDumpCommand().handle(*args, **options)