From f5fa6d156a6c813eea7ac925cd31ea95d530076e Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Wed, 20 Dec 2023 09:17:14 -0600
Subject: [PATCH 01/38] start v2 create/update functionality

---
 rorapi/common/parsers.py    | 23 ++++++++++++
 rorapi/common/urls.py       |  1 +
 rorapi/common/validation.py | 75 +++++++++++++++++++++++++++++++++++++
 rorapi/common/views.py      | 70 ++++++++++++++++++++++++++++++++--
 4 files changed, 166 insertions(+), 3 deletions(-)
 create mode 100644 rorapi/common/parsers.py
 create mode 100644 rorapi/common/validation.py

diff --git a/rorapi/common/parsers.py b/rorapi/common/parsers.py
new file mode 100644
index 0000000..687cc4a
--- /dev/null
+++ b/rorapi/common/parsers.py
@@ -0,0 +1,23 @@
+import jsonschema
+import requests
+from rest_framework.exceptions import ParseError
+from rest_framework.parsers import JSONParser
+
+
+class JSONSchemaParser(JSONParser):
+
+    def get_file_from_url(self, url):
+        rsp = requests.get(url)
+        rsp.raise_for_status()
+        return rsp.json()
+
+    def parse(self, stream, media_type=None, parser_context=None):
+        schema = self.get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json")
+        data = super(JSONSchemaParser, self).parse(stream, media_type,
+                                                   parser_context)
+        try:
+            jsonschema.validate(data, schema)
+        except jsonschema.ValidationError as error:
+            raise ParseError(detail=error.message)
+        else:
+            return data
\ No newline at end of file
diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py
index 3c67117..224d5d3 100644
--- a/rorapi/common/urls.py
+++ b/rorapi/common/urls.py
@@ -10,6 +10,7 @@
     url(r"^(?P<version>(v1|v2))\/heartbeat$", HeartbeatView.as_view()),
     url(r"^heartbeat$", HeartbeatView.as_view()),
     # Using REST API
+    url(r"^(?P<version>(v1|v2))\/generateaddress\/(?P<geonamesid>[0-9]+)", GenerateAddress.as_view()),
     path('generateaddress/<int:geonamesid>', GenerateAddress.as_view()),
     url(r"^generateid$", GenerateId.as_view()),
     path('indexdata/<str:branch>', IndexData.as_view()),
diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py
new file mode 100644
index 0000000..25d01a4
--- /dev/null
+++ b/rorapi/common/validation.py
@@ -0,0 +1,75 @@
+import jsonschema
+import requests
+import copy
+from datetime import datetime
+from rest_framework.exceptions import ParseError
+from rest_framework.parsers import JSONParser
+from rorapi.common.models import Errors
+
+NOW = datetime.now()
+
+ADMIN = {
+    "created": {
+        "date": NOW.strftime("%Y-%m-%d"),
+        "schema_version": "2.0"
+    },
+    "last_modified": {
+        "date": NOW.strftime("%Y-%m-%d"),
+        "schema_version": "2.0"
+    }
+}
+
+LAST_MOD = {
+    "date": NOW.strftime("%Y-%m-%d"),
+    "schema_version": "2.0"
+}
+
+OPTIONAL_FIELD_DEFAULTS = {
+    "domains": [],
+    "established": None,
+    "external_ids": [],
+    "links": [],
+    "relationships": []
+}
+
+def update_record(json_input, existing_record):
+    record = copy.deepcopy(existing_record)
+    for k, v in json_input.items():
+        record[k] = copy.deepcopy(v)
+    return record
+
+def update_last_mod(record):
+    record['admin']['last_modified'] = copy.deepcopy(LAST_MOD)
+    return record
+
+def check_optional_fields(record):
+    for k in OPTIONAL_FIELD_DEFAULTS:
+        if k not in record:
+            return True
+    return False
+
+def add_missing_optional_fields(record):
+    for k, v in OPTIONAL_FIELD_DEFAULTS.items():
+        if k not in record:
+            record[k] = v
+    return record
+
+def add_created_last_mod(record):
+    record['admin'] = copy.deepcopy(ADMIN)
+    return record
+
+def get_file_from_url(url):
+    rsp = requests.get(url)
+    rsp.raise_for_status()
+    return rsp.json()
+
+def validate_v2(data):
+    errors = []
+    schema = get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json")
+    try:
+        jsonschema.validate(data, schema)
+    except jsonschema.ValidationError as error:
+        errors.append(error)
+        return Errors(errors), None
+    else:
+        return None, data
\ No newline at end of file
diff --git a/rorapi/common/views.py b/rorapi/common/views.py
index baa9863..5861c20 100644
--- a/rorapi/common/views.py
+++ b/rorapi/common/views.py
@@ -1,4 +1,5 @@
 from rest_framework import viewsets, routers, status
+from rest_framework.exceptions import ParseError
 from rest_framework.response import Response
 from django.http import HttpResponse
 from django.views import View
@@ -7,7 +8,9 @@
 from rest_framework.permissions import BasePermission
 from rest_framework.views import APIView
 import json
+import copy
 
+from rorapi.common import validation
 from rorapi.settings import REST_FRAMEWORK
 from rorapi.common.matching import match_organizations
 from rorapi.common.models import (
@@ -83,6 +86,64 @@ def retrieve(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
         serializer = OrganizationSerializerV1(organization)
         return Response(serializer.data)
 
+    def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
+        errors = None
+        if version == "v2":
+            json = request.data
+            if 'id' in json and (json['id'] is not None and json['id'] != ""):
+                errors = Errors(["Value {} found in ID field. New records cannot contain a value in the ID field".format(json['id'])])
+            else:
+                new_record = copy.deepcopy(json)
+                if validation.check_optional_fields(new_record):
+                    new_record = validation.add_missing_optional_fields(new_record)
+                new_record = validation.add_created_last_mod(new_record)
+                new_ror_id = check_ror_id(version)
+                new_record['id'] = new_ror_id
+                # handle admin
+                errors, valid_data = validation.validate_v2(new_record)
+        else:
+            errors = Errors(["Version {} does not support creating records".format(version)])
+        if errors is not None:
+            print(errors)
+            return Response(
+                ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST
+            )
+        serializer = OrganizationSerializerV2(valid_data)
+        return Response(serializer.data)
+
+    def update(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
+        errors = None
+        if version == "v2":
+            ror_id = get_ror_id(pk)
+            if ror_id is None:
+                errors = Errors(["'{}' is not a valid ROR ID".format(pk)])
+                return Response(
+                    ErrorsSerializer(errors).data, status=status.HTTP_404_NOT_FOUND
+                )
+            errors, organization = retrieve_organization(ror_id, version)
+            if organization is None:
+                return Response(
+                    ErrorsSerializer(errors).data, status=status.HTTP_404_NOT_FOUND
+                )
+            json = request.data
+            if 'id' not in json:
+                errors = Errors(["No value found in ID field. 
Updated records must include a value in the ID field"]) + elif get_ror_id(json['id']) != ror_id: + errors = Errors(["Value {} in IDs field does not match resource ID specified in request URL {}".format(json['id'], pk)]) + else: + serializer = OrganizationSerializerV2(organization) + existing_record = serializer.data + updated_record = validation.update_record(json, existing_record) + errors, valid_data = validation.validate_v2(updated_record) + else: + errors = Errors(["Version {} does not support creating records".format(version)]) + if errors is not None: + return Response( + ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST + ) + serializer = OrganizationSerializerV2(valid_data) + return Response(serializer.data) + organizations_router = routers.DefaultRouter(trailing_slash=False) organizations_router.register( @@ -116,10 +177,13 @@ def has_permission(self, request, view): class GenerateAddress(APIView): - permission_classes = [OurTokenPermission] + #permission_classes = [OurTokenPermission] - def get(self, request, geonamesid): - address = ua.new_geonames(geonamesid) + def get(self, request, geonamesid, version=REST_FRAMEWORK["DEFAULT_VERSION"]): + if version == 'v2': + address = ua.new_geonames_v2(geonamesid) + else: + address = ua.new_geonames(geonamesid) return Response(address) From aa9ea215df46689674ff56bb820054379f12a303 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 18 Jan 2024 20:12:01 -0600 Subject: [PATCH 02/38] start v2 create API --- Dockerfile | 2 +- requirements.txt | 6 +- rorapi/common/urls.py | 6 +- rorapi/common/validation.py | 235 +++++++++++++++++- rorapi/common/views.py | 138 +++++++++- rorapi/settings.py | 3 +- .../data/test_data_create_valid.json | 68 +++++ .../tests_unit/data/test_update_valid.json | 68 +++++ rorapi/v2/record_template.json | 13 + 9 files changed, 519 insertions(+), 20 deletions(-) create mode 100644 rorapi/tests/tests_unit/data/test_data_create_valid.json create mode 100644 rorapi/tests/tests_unit/data/test_update_valid.json create mode 100644 rorapi/v2/record_template.json diff --git a/Dockerfile b/Dockerfile index 26f8583..9ebc84e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,7 +16,7 @@ RUN mv /etc/apt/sources.list.d /etc/apt/sources.list.d.bak && \ mv /etc/apt/sources.list.d.bak /etc/apt/sources.list.d && \ apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \ apt-get clean && \ - apt-get install ntp wget unzip tzdata python3-pip -y && \ + apt-get install ntp wget unzip tzdata python3-pip libmagic1 -y && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* # Enable Passenger and Nginx and remove the default site diff --git a/requirements.txt b/requirements.txt index ab14247..fd678cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,5 +20,7 @@ boto3 pandas==1.4.1 numpy==1.22 titlecase==2.3 -update_address @ git+https://github.com/ror-community/update_address.git -launchdarkly-server-sdk \ No newline at end of file +update_address @ git+https://github.com/ror-community/update_address.git@v2-locations +launchdarkly-server-sdk +jsonschema==3.2.0 +python-magic \ No newline at end of file diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py index 224d5d3..cdcb523 100644 --- a/rorapi/common/urls.py +++ b/rorapi/common/urls.py @@ -1,9 +1,9 @@ from django.conf.urls import url, include -from django.urls import path +from django.urls import path, re_path from rest_framework.documentation import include_docs_urls from . 
import views -from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData +from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,FileUploadView urlpatterns = [ # Health check @@ -19,4 +19,6 @@ url(r"^docs/", include_docs_urls(title="Research Organization Registry")), # Prometheus url("", include("django_prometheus.urls")), + re_path(r"^(?P(v1|v2))\/upload$", FileUploadView.as_view()), + path('upload/', FileUploadView.as_view()) ] diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 25d01a4..fe604b2 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -1,10 +1,16 @@ import jsonschema import requests import copy +import csv +import json +import io from datetime import datetime from rest_framework.exceptions import ParseError from rest_framework.parsers import JSONParser from rorapi.common.models import Errors +import update_address as ua + +from rorapi.management.commands.generaterorid import check_ror_id NOW = datetime.now() @@ -32,6 +38,62 @@ "relationships": [] } +CSV_REQUIRED_FIELDS = ( + "id", + "domains", + "established", + "external_ids.type.fundref.all", + "external_ids.type.fundref.preferred", + "external_ids.type.grid.all", + "external_ids.type.grid.preferred", + "external_ids.type.isni.all", + "external_ids.type.isni.preferred", + "external_ids.type.wikidata.all", + "external_ids.type.wikidata.preferred", + "links.type.website", + "links.type.wikipedia", + "locations.geonames_id", + "names.types.acronym", + "names.types.alias", + "names.types.label", + "names.types.ror_display", + "status", + "types" +) + +V2_TEMPLATE = { + "locations": [], + "established": None, + "external_ids": [], + "id": "", + "domains": [], + "links": [], + "names": [], + "relationships": [], + "status": "", + "types": [], + "admin": {} +} + +V2_EXTERNAL_ID_TYPES = { + "FUNDREF" : "fundref", + "GRID" : "grid", + "ISNI" : "isni", + "WIKIDATA" : "wikidata" + } + +V2_LINK_TYPES = { + "WEBSITE" : "website", + "WIKIPEDIA" : "wikipedia" + } + +V2_NAME_TYPES = { + "ACRONYM" : "acronym", + "ALIAS" : "alias", + "LABEL" : "label", + "ROR_DISPLAY" : "ror_display" + } + def update_record(json_input, existing_record): record = copy.deepcopy(existing_record) for k, v in json_input.items(): @@ -58,6 +120,19 @@ def add_created_last_mod(record): record['admin'] = copy.deepcopy(ADMIN) return record +def update_locations(locations): + errors = [] + updated_locations = [] + for location in locations: + if 'geonames_id' in location: + try: + print(location['geonames_id']) + updated_location = ua.new_geonames_v2(str(location['geonames_id'])) + updated_locations.append(updated_location['location']) + except: + errors.append("Error retrieving Geonames data for ID {}. 
Please check that this is a valid Geonames ID".format(location['geonames_id'])) + return errors, updated_locations + def get_file_from_url(url): rsp = requests.get(url) rsp.raise_for_status() @@ -67,9 +142,167 @@ def validate_v2(data): errors = [] schema = get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json") try: + print("validating data:") + print(data) jsonschema.validate(data, schema) except jsonschema.ValidationError as error: errors.append(error) + print(errors) return Errors(errors), None else: - return None, data \ No newline at end of file + return None, data + +def validate_csv(csv_file): + errors = [] + try: + read_file = csv_file.read().decode('utf-8') + reader = csv.DictReader(io.StringIO(read_file)) + rowcount = 0 + for row in reader: + rowcount += 1 + if rowcount > 0: + csv_fields = reader.fieldnames + missing_fields = [] + for field in CSV_REQUIRED_FIELDS: + if field not in csv_fields: + missing_fields.append(field) + print(missing_fields) + if len(missing_fields) > 0: + errors.append(f'CSV file is missing columns: {", ".join(missing_fields)}') + else: + errors.append("CSV file contains no data rows") + except IOError as e: + errors.append(f"Error parsing CSV file: {e}") + print(errors) + return errors + +def new_record_from_json(json_input, version): + errors = None + valid_data = None + new_record = copy.deepcopy(json_input) + if check_optional_fields(new_record): + new_record = add_missing_optional_fields(new_record) + location_errors, updated_locations = update_locations(new_record['locations']) + if len(location_errors) > 0: + errors = Errors(location_errors) + else: + new_record['locations'] = updated_locations + new_record = add_created_last_mod(new_record) + new_ror_id = check_ror_id(version) + new_record['id'] = new_ror_id + # handle admin + errors, valid_data = validate_v2(new_record) + return errors, valid_data + + +def new_record_from_csv(csv_data, version): + v2_data = copy.deepcopy(V2_TEMPLATE) + if csv_data['domains']: + v2_data['domains'] = [d.strip() for d in csv_data['domains'].split(';')] + + if csv_data['established']: + v2_data['established'] = int(csv_data['established'].strip()) + + for k,v in V2_EXTERNAL_ID_TYPES.items(): + if csv_data['external_ids.type.' + v + '.all']: + ext_id_obj = { + "type": v, + "all": [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')], + "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else None + } + v2_data['external_ids'].append(ext_id_obj) + + for k,v in V2_LINK_TYPES.items(): + if csv_data['links.type.' + v]: + link_obj = { + "type": v, + "value": csv_data['links.type.' + v].strip() + } + v2_data['links'].append(link_obj) + + if csv_data['locations.geonames_id']: + geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].split(';')] + for geonames_id in geonames_ids: + location_obj = { + "geonames_id": geonames_id, + "geonames_details": {} + } + v2_data['locations'].append(location_obj) + + temp_names = [] + for k,v in V2_NAME_TYPES.items(): + if csv_data['names.types.' + v]: + name_obj = { + "types": v, + "value": csv_data['names.types.' 
+ v].strip() + } + temp_names.append(name_obj) + print("temp names 1:") + print(temp_names) + name_values = [n['value'] for n in temp_names] + dup_names = [] + for n in name_values: + if name_values.count(n) > 1: + dup_names.append(n) + dup_names_types = [] + for d in dup_names: + types = [] + for t in temp_names: + if t['value'] == d: + types.append(t['types']) + name_obj = { + "types": types, + "value": d + } + dup_names_types.append(name_obj) + temp_names = [t for t in temp_names if t['value'] not in dup_names] + temp_names.append(name_obj) + print("temp names 2:") + print(temp_names) + v2_data['names'] = temp_names + if csv_data['status']: + v2_data['status'] = csv_data['status'].strip() + + if csv_data['types']: + v2_data['types'] = [t.strip() for t in csv_data['types'].split(';')] + errors, new_record = new_record_from_json(v2_data, version) + return errors, new_record + +def process_csv(csv_file, version): + print("Processing CSV") + errors = None + row_errors = {} + skipped_count = 0 + updated_count = 0 + new_count = 0 + read_file = csv_file.read().decode('utf-8') + print(read_file) + reader = csv.DictReader(io.StringIO(read_file)) + row_num = 1 + for row in reader: + print("Row data") + print(row) + errors, v2_record = new_record_from_csv(row, version) + print(errors) + print(v2_record) + ''' + if row['ror_id']: + row_error, updated_record = update_from_csv(row) + if row_error: + row_errors[row_num] = ror_error + skipped_count += 1 + else: + updated_count += 1 + else: + row_error, new_record = new_record_from_csv(row) + if row_error: + row_errors[row_num] = ror_error + skipped_count += 1 + else: + new_count +=1 + row_num += 1 + if len(ror_errors): + #create row errors csv + if updated_count > 0 or updated_count > 0 or skipped_count > 0: + # created zip + ''' \ No newline at end of file diff --git a/rorapi/common/views.py b/rorapi/common/views.py index 5861c20..c9722ef 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -7,8 +7,14 @@ from rest_framework.authentication import BasicAuthentication from rest_framework.permissions import BasePermission from rest_framework.views import APIView +from rest_framework.parsers import FormParser, MultiPartParser +from rorapi.settings import DATA import json import copy +import csv +import io +import mimetypes +import magic from rorapi.common import validation from rorapi.settings import REST_FRAMEWORK @@ -89,18 +95,11 @@ def retrieve(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]): def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): errors = None if version == "v2": - json = request.data - if 'id' in json and (json['id'] is not None and json['id'] != ""): - errors = Errors(["Value {} found in ID field. New records cannot contain a value in the ID field".format(json['id'])]) + json_input = request.data + if 'id' in json and (json_input['id'] is not None and json_input['id'] != ""): + errors = Errors(["Value {} found in ID field. 
New records cannot contain a value in the ID field".format(json_input['id'])])
         else:
-            new_record = copy.deepcopy(json)
-            if validation.check_optional_fields(new_record):
-                new_record = validation.add_missing_optional_fields(new_record)
-            new_record = validation.add_created_last_mod(new_record)
-            new_ror_id = check_ror_id(version)
-            new_record['id'] = new_ror_id
-            # handle admin
-            errors, valid_data = validation.validate_v2(new_record)
+            errors, valid_data = validation.new_record_from_json(json_input, version)
     else:
         errors = Errors(["Version {} does not support creating records".format(version)])
     if errors is not None:
@@ -134,7 +133,12 @@ def update(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
             serializer = OrganizationSerializerV2(organization)
             existing_record = serializer.data
             updated_record = validation.update_record(json, existing_record)
-            errors, valid_data = validation.validate_v2(updated_record)
+            location_errors, updated_locations = validation.update_locations(updated_record['locations'])
+            if len(location_errors) > 0:
+                errors = Errors(location_errors)
+            else:
+                updated_record['locations'] = updated_locations
+                errors, valid_data = validation.validate_v2(updated_record)
     else:
         errors = Errors(["Version {} does not support creating records".format(version)])
     if errors is not None:
@@ -177,7 +181,7 @@


 class GenerateAddress(APIView):
-    #permission_classes = [OurTokenPermission]
+    permission_classes = [OurTokenPermission]

     def get(self, request, geonamesid, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
         if version == 'v2':
@@ -204,3 +208,111 @@ def get(self, request, branch):
         if msg["status"] == "ERROR":
             st = 400
         return Response({"status": msg["status"], "msg": msg["msg"]}, status=st)
+
+def save_file(file, full_path):
+    with open(full_path, 'wb+') as f:
+        for chunk in file.chunks():
+            f.write(chunk)
+
+class FileUploadView(APIView):
+    parser_classes = (MultiPartParser, FormParser)
+    #serializer_class = FileUploadSerializer
+
+    def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
+        errors = None
+        #serializer = self.serializer_class(data=request.data)
+        #if serializer.is_valid():
+        # you can access the file like this from serializer
+        # uploaded_file = serializer.validated_data["file"]
+        #serializer.save()
+        if version == 'v2':
+            if request.data:
+                file_object = request.data['file']
+                mime_type = magic.from_buffer(file_object.read(2048))
+                if "ASCII text" in mime_type:
+                    file_object.seek(0)
+                    csv_validation_errors = validation.validate_csv(file_object)
+                    if len(csv_validation_errors) == 0:
+                        file_object.seek(0)
+                        #full_path = os.path.join(DATA['DIR'], file_object.name)
+                        #save_file(file_object, full_path)
+                        errors = validation.process_csv(file_object, version)
+                    else:
+                        errors = Errors(csv_validation_errors)
+                else:
+                    errors = Errors(["File upload must be CSV. File type '{}' is not supported".format(mime_type)])
+            else:
+                errors = Errors(["Could not process request. 
No data included in request."]) + else: + errors = Errors(["Version {} does not support creating records".format(version)]) + if errors is not None: + print(errors) + return Response( + ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST + ) + + return Response( + request.data, + status=status.HTTP_201_CREATED + ) + + def get(self, request, filename, **kwargs): + filepath = os.path.join(DATA['DIR'], filename) + + if os.path.exists(filepath): + with open(filepath, 'r') as fh: + mime_type, _ = mimetypes.guess_type(filepath) + response = HttpResponse(fh, content_type=mime_type) + response['Content-Disposition'] = "attachment; filename=%s" % filename + return response + + +class BulkCreateUpdate(APIView): + #permission_classes = [OurTokenPermission] + + ''' + def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): + errors = None + row_errors = {} + skipped_count = 0 + updated_count = 0 + new_count = 0 + if version == 'v2': + if request.data: + csv_errors = validate_csv(request.data) + if csv_errors: + errors = csv_errors + else: + with open(request.data, 'r') as csv: + row_num = 1 + for row in csv: + if row['ror_id']: + row_error, updated_record = update_from_csv(row) + if row_error: + row_errors[row_num] = ror_error + skipped_count += 1 + else: + updated_count += 1 + else: + row_error, new_record = create_from_csv(row) + if row_error: + row_errors[row_num] = ror_error + skipped_count += 1 + else: + new_count +=1 + row_num += 1 + if len(ror_errors): + #create row errors csv + if updated_count > 0 or updated_count > 0 or skipped_count > 0: + # created zip + else: + errors = Errors(["Could not processs request. No CSV file included in request."]) + else: + errors = Errors(["Version {} does not support creating records".format(version)]) + if errors is not None: + print(errors) + return Response( + ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST + ) + return Response(zippedfile) + ''' diff --git a/rorapi/settings.py b/rorapi/settings.py index 81476d7..c39bd57 100644 --- a/rorapi/settings.py +++ b/rorapi/settings.py @@ -39,7 +39,8 @@ 'SECRET_KEY', '0y0zn=hnz99$+c6lejml@chch54s2y2@-z##i$pstn62doft_g') # SECURITY WARNING: don't run with debug turned on in production! 
-DEBUG = os.environ.get('PASSENGER_APP_ENV', 'development') == 'development' +#DEBUG = os.environ.get('PASSENGER_APP_ENV', 'development') == 'development' +DEBUG = False ALLOWED_HOSTS = ['*'] diff --git a/rorapi/tests/tests_unit/data/test_data_create_valid.json b/rorapi/tests/tests_unit/data/test_data_create_valid.json new file mode 100644 index 0000000..533d944 --- /dev/null +++ b/rorapi/tests/tests_unit/data/test_data_create_valid.json @@ -0,0 +1,68 @@ +{ + "locations": [ + { + "geonames_id": 2661552, + "geonames_details": { + "country_code": "CH", + "country_name": "Switzerland", + "lat": 46.94809, + "lng": 7.44744, + "name": "Bern" + } + } + ], + "established": null, + "external_ids": [ + { + "type": "grid", + "all": [ + "grid.426225.5" + ], + "preferred": "grid.426225.5" + } + ], + "id": "https://ror.org/00wz65j53", + "domains": ["wisc.edu"], + "links": [ + { + "type": "website", + "value": "https://www.jdsu.com" + } + ], + "names": [ + { + "value": "JDSU (Switzerland)", + "types": [ + "ror_display", + "label" + ], + "lang": null + } + ], + "relationships": [ + { + "label": "JDSU (United States)", + "type": "parent", + "id": "https://ror.org/01a5v8x09" + }, + { + "label": "Viavi Solutions (United States)", + "type": "successor", + "id": "https://ror.org/059a9e323" + } + ], + "status": "inactive", + "types": [ + "company" + ], + "admin": { + "created": { + "date": "2023-07-28", + "schema_version": "1.0" + }, + "last_modified": { + "date": "2023-07-28", + "schema_version": "2.0" + } + } +} \ No newline at end of file diff --git a/rorapi/tests/tests_unit/data/test_update_valid.json b/rorapi/tests/tests_unit/data/test_update_valid.json new file mode 100644 index 0000000..533d944 --- /dev/null +++ b/rorapi/tests/tests_unit/data/test_update_valid.json @@ -0,0 +1,68 @@ +{ + "locations": [ + { + "geonames_id": 2661552, + "geonames_details": { + "country_code": "CH", + "country_name": "Switzerland", + "lat": 46.94809, + "lng": 7.44744, + "name": "Bern" + } + } + ], + "established": null, + "external_ids": [ + { + "type": "grid", + "all": [ + "grid.426225.5" + ], + "preferred": "grid.426225.5" + } + ], + "id": "https://ror.org/00wz65j53", + "domains": ["wisc.edu"], + "links": [ + { + "type": "website", + "value": "https://www.jdsu.com" + } + ], + "names": [ + { + "value": "JDSU (Switzerland)", + "types": [ + "ror_display", + "label" + ], + "lang": null + } + ], + "relationships": [ + { + "label": "JDSU (United States)", + "type": "parent", + "id": "https://ror.org/01a5v8x09" + }, + { + "label": "Viavi Solutions (United States)", + "type": "successor", + "id": "https://ror.org/059a9e323" + } + ], + "status": "inactive", + "types": [ + "company" + ], + "admin": { + "created": { + "date": "2023-07-28", + "schema_version": "1.0" + }, + "last_modified": { + "date": "2023-07-28", + "schema_version": "2.0" + } + } +} \ No newline at end of file diff --git a/rorapi/v2/record_template.json b/rorapi/v2/record_template.json new file mode 100644 index 0000000..906a5b7 --- /dev/null +++ b/rorapi/v2/record_template.json @@ -0,0 +1,13 @@ +{ + "locations": [], + "established": null, + "external_ids": [], + "id": "", + "domains": [], + "links": [], + "names": [], + "relationships": [], + "status": "", + "types": [], + "admin": {} +} \ No newline at end of file From 8680647c6ee3daa688fb395431da6c5e9251afed Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 5 Feb 2024 19:54:07 -0600 Subject: [PATCH 03/38] add bulk update endpoint --- rorapi/common/features.py | 2 +- rorapi/common/validation.py | 38 
+++++++++++++++++++++++++++++++++++-- rorapi/common/views.py | 19 +++---------------- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/rorapi/common/features.py b/rorapi/common/features.py index f3a9644..680402f 100644 --- a/rorapi/common/features.py +++ b/rorapi/common/features.py @@ -3,4 +3,4 @@ from rorapi.settings import LAUNCH_DARKLY_KEY ldclient.set_config(Config(LAUNCH_DARKLY_KEY)) -launch_darkly_client = ldclient.get() +launch_darkly_client = ldclient.get() \ No newline at end of file diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index fe604b2..393d311 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -4,11 +4,18 @@ import csv import json import io +import os from datetime import datetime from rest_framework.exceptions import ParseError from rest_framework.parsers import JSONParser +from rest_framework.renderers import JSONRenderer from rorapi.common.models import Errors import update_address as ua +from rorapi.settings import DATA +from rorapi.v2.serializers import ( + OrganizationSerializer as OrganizationSerializerV2 +) +from rorapi.common.queries import get_ror_id from rorapi.management.commands.generaterorid import check_ror_id @@ -176,6 +183,7 @@ def validate_csv(csv_file): print(errors) return errors + def new_record_from_json(json_input, version): errors = None valid_data = None @@ -189,12 +197,30 @@ def new_record_from_json(json_input, version): new_record['locations'] = updated_locations new_record = add_created_last_mod(new_record) new_ror_id = check_ror_id(version) + print("new ror id: " + new_ror_id) new_record['id'] = new_ror_id # handle admin errors, valid_data = validate_v2(new_record) return errors, valid_data +def update_record_from_json(new_json, existing_org): + errors = None + valid_data = None + serializer = OrganizationSerializerV2(existing_org) + existing_record = serializer.data + updated_record = update_record(new_json, existing_record) + location_errors, updated_locations = update_locations(updated_record['locations']) + if len(location_errors) > 0: + errors = Errors(location_errors) + else: + updated_record['locations'] = updated_locations + errors, valid_data = validate_v2(updated_record) + return errors, valid_data + +def update_record_from_csv(): + #todo + def new_record_from_csv(csv_data, version): v2_data = copy.deepcopy(V2_TEMPLATE) if csv_data['domains']: @@ -234,7 +260,8 @@ def new_record_from_csv(csv_data, version): if csv_data['names.types.' + v]: name_obj = { "types": v, - "value": csv_data['names.types.' + v].strip() + "value": csv_data['names.types.' 
+ v].strip(), + "lang": None } temp_names.append(name_obj) print("temp names 1:") @@ -252,7 +279,8 @@ def new_record_from_csv(csv_data, version): types.append(t['types']) name_obj = { "types": types, - "value": d + "value": d, + "lang": None } dup_names_types.append(name_obj) temp_names = [t for t in temp_names if t['value'] not in dup_names] @@ -285,6 +313,12 @@ def process_csv(csv_file, version): errors, v2_record = new_record_from_csv(row, version) print(errors) print(v2_record) + ror_id = v2_record['id'] + full_path = os.path.join(DATA['DIR'], ror_id.split('https://ror.org/')[1] + '.json') + serializer = OrganizationSerializerV2(v2_record) + json_obj = json.loads(JSONRenderer().render(serializer.data)) + with open(full_path, "w") as outfile: + json.dump(json_obj, outfile, ensure_ascii=False, indent=2) ''' if row['ror_id']: row_error, updated_record = update_from_csv(row) diff --git a/rorapi/common/views.py b/rorapi/common/views.py index c9722ef..5829c0e 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -96,7 +96,7 @@ def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): errors = None if version == "v2": json_input = request.data - if 'id' in json and (json_input['id'] is not None and json_input['id'] != ""): + if 'id' in json_input and (json_input['id'] is not None and json_input['id'] != ""): errors = Errors(["Value {} found in ID field. New records cannot contain a value in the ID field".format(json_inputjson['id'])]) else: errors, valid_data = validation.new_record_from_json(json_input, version) @@ -124,21 +124,7 @@ def update(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]): return Response( ErrorsSerializer(errors).data, status=status.HTTP_404_NOT_FOUND ) - json = request.data - if 'id' not in json: - errors = Errors(["No value found in ID field. 
Updated records must include a value in the ID field"]) - elif get_ror_id(json['id']) != ror_id: - errors = Errors(["Value {} in IDs field does not match resource ID specified in request URL {}".format(json['id'], pk)]) - else: - serializer = OrganizationSerializerV2(organization) - existing_record = serializer.data - updated_record = validation.update_record(json, existing_record) - location_errors, updated_locations = validation.update_locations(updated_record['locations']) - if len(location_errors) > 0: - errors = Errors(location_errors) - else: - updated_record['locations'] = updated_locations - errors, valid_data = validation.validate_v2(updated_record) + errors, valid_data = validation.update_record_from_json(request.data) else: errors = Errors(["Version {} does not support creating records".format(version)]) if errors is not None: @@ -237,6 +223,7 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): #full_path = os.path.join(DATA['DIR'], file_object.name) #save_file(file_object, full_path) errors = validation.process_csv(file_object, version) + else: errors=Errors(csv_validation_errors) else: From 5a348617c0939d7e76180a99dbcfbcff8b4cc44b Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 5 Feb 2024 19:54:30 -0600 Subject: [PATCH 04/38] specify version for launch darkly sdk --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fd678cd..0ca299c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,6 @@ pandas==1.4.1 numpy==1.22 titlecase==2.3 update_address @ git+https://github.com/ror-community/update_address.git@v2-locations -launchdarkly-server-sdk +launchdarkly-server-sdk==7.6.1 jsonschema==3.2.0 python-magic \ No newline at end of file From 922398da1966a07f0bfb13bcb81fd9e9d203a07d Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Fri, 9 Feb 2024 18:30:40 -0600 Subject: [PATCH 05/38] csv batch create/update WIP --- rorapi/common/validation.py | 366 ++++++++++++++++++++++++++++++++---- rorapi/common/views.py | 10 +- 2 files changed, 340 insertions(+), 36 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 393d311..1b20332 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -5,6 +5,7 @@ import json import io import os +import re from datetime import datetime from rest_framework.exceptions import ParseError from rest_framework.parsers import JSONParser @@ -15,7 +16,7 @@ from rorapi.v2.serializers import ( OrganizationSerializer as OrganizationSerializerV2 ) -from rorapi.common.queries import get_ror_id +from rorapi.common.queries import get_ror_id, retrieve_organization from rorapi.management.commands.generaterorid import check_ror_id @@ -45,28 +46,40 @@ "relationships": [] } -CSV_REQUIRED_FIELDS = ( - "id", - "domains", - "established", - "external_ids.type.fundref.all", - "external_ids.type.fundref.preferred", - "external_ids.type.grid.all", - "external_ids.type.grid.preferred", - "external_ids.type.isni.all", - "external_ids.type.isni.preferred", - "external_ids.type.wikidata.all", - "external_ids.type.wikidata.preferred", - "links.type.website", - "links.type.wikipedia", - "locations.geonames_id", - "names.types.acronym", - "names.types.alias", - "names.types.label", - "names.types.ror_display", - "status", - "types" -) +UPDATE_ACTIONS = { + "ADD": "add", + "DELETE": "delete", + "REPLACE": "replace" +} + +UPDATE_ACTIONS_MULTI = [UPDATE_ACTIONS["ADD"], UPDATE_ACTIONS["DELETE"], UPDATE_ACTIONS["REPLACE"]] + 
+UPDATE_ACTIONS_SINGLE = [UPDATE_ACTIONS["DELETE"], UPDATE_ACTIONS["REPLACE"]] + +NO_DELETE_FIELDS = ["id", "locations.geonames_id", "names.types.ror_display", "status", "types"] + +CSV_REQUIRED_FIELDS_ACTIONS = { + "id": None, + "domains": UPDATE_ACTIONS_MULTI, + "established": UPDATE_ACTIONS_SINGLE, + "external_ids.type.fundref.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.fundref.preferred": UPDATE_ACTIONS_SINGLE, + "external_ids.type.grid.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.grid.preferred": UPDATE_ACTIONS_SINGLE, + "external_ids.type.isni.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.isni.preferred": UPDATE_ACTIONS_SINGLE, + "external_ids.type.wikidata.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.wikidata.preferred": UPDATE_ACTIONS_SINGLE, + "links.type.website": UPDATE_ACTIONS_MULTI, + "links.type.wikipedia": UPDATE_ACTIONS_MULTI, + "locations.geonames_id": UPDATE_ACTIONS_MULTI, + "names.types.acronym": UPDATE_ACTIONS_MULTI, + "names.types.alias": UPDATE_ACTIONS_MULTI, + "names.types.label": UPDATE_ACTIONS_MULTI, + "names.types.ror_display": [UPDATE_ACTIONS["REPLACE"]], + "status": [UPDATE_ACTIONS["REPLACE"]], + "types": UPDATE_ACTIONS_MULTI +} V2_TEMPLATE = { "locations": [], @@ -101,6 +114,10 @@ "ROR_DISPLAY" : "ror_display" } +LANG_DELIMITER = "*" + +UPDATE_DELIMITER = "==" + def update_record(json_input, existing_record): record = copy.deepcopy(existing_record) for k, v in json_input.items(): @@ -170,7 +187,7 @@ def validate_csv(csv_file): if rowcount > 0: csv_fields = reader.fieldnames missing_fields = [] - for field in CSV_REQUIRED_FIELDS: + for field in CSV_REQUIRED_FIELDS_ACTIONS.keys(): if field not in csv_fields: missing_fields.append(field) print(missing_fields) @@ -199,7 +216,6 @@ def new_record_from_json(json_input, version): new_ror_id = check_ror_id(version) print("new ror id: " + new_ror_id) new_record['id'] = new_ror_id - # handle admin errors, valid_data = validate_v2(new_record) return errors, valid_data @@ -218,26 +234,291 @@ def update_record_from_json(new_json, existing_org): errors, valid_data = validate_v2(updated_record) return errors, valid_data -def update_record_from_csv(): - #todo + +def get_action_value(csv_field): + action = None + value = None + if csv_field.lower() == "delete": + action = "delete" + value = None + elif UPDATE_DELIMITER in csv_field: + action = csv_field.split(UPDATE_DELIMITER)[0] + value = csv_field.split(UPDATE_DELIMITER)[1] + else: + action = "replace" + value = csv_field + return action, value + +def get_actions_values(csv_field): + print("getting actions values:") + actions_values = {} + if csv_field.lower() == UPDATE_ACTIONS["DELETE"]: + actions_values[UPDATE_ACTIONS["DELETE"]] = None + elif UPDATE_DELIMITER in csv_field: + for ua in list(UPDATE_ACTIONS.values()): + print(ua) + if ua + UPDATE_DELIMITER in csv_field: + print("doing regex:") + result = re.search(r"{0}(.*?)(?=$|(add|delete|replace)==)".format(ua + UPDATE_DELIMITER), csv_field) + print(result[0]) + #add==foo;bar;delete==fizz;buzz; + #^(add|delete|replace)==(.*?)($|(?=add|delete|replace==)) + temp_val = result[0].replace(ua + UPDATE_DELIMITER, '') + print("temp val:") + print(temp_val) + actions_values[ua] = temp_val + csv_field.replace(result[0], '') + else: + actions_values[UPDATE_ACTIONS["REPLACE"]] = csv_field + return actions_values + +def validate_csv_row_update_syntax(csv_data): + print("validating row") + errors = [] + for k, v in csv_data.items(): + if UPDATE_DELIMITER in v: + print("field:") + print(k) + print("value:") + print(v) + 
actions_values = get_actions_values(v) + print("actions values:") + print(actions_values) + update_actions = list(actions_values.keys()) + if len(update_actions) > 2: + errors.append("{} update actions '{}' found in '{}' field but only 2 are allowed".format(str(len(update_actions)), ", ".join(update_actions), k)) + if len(update_actions) == 2: + if not (UPDATE_ACTIONS['ADD'] and UPDATE_ACTIONS['delete']) in update_actions: + errors.append("Invalid combination of update actions '{}' found in '{}' field.".format(", ".join(update_actions), k)) + disallowed_actions = [ua for ua in update_actions if ua not in CSV_REQUIRED_FIELDS_ACTIONS[k]] + print("allowed actions:") + print(CSV_REQUIRED_FIELDS_ACTIONS[k]) + print("disallowed actions:") + print(disallowed_actions) + if len(disallowed_actions) > 0: + errors.append("Invalid update action(s) '{}' found in {} field. Allowed actions for this field are '{}'".format(", ".join(disallowed_actions), k, ", ".join(CSV_REQUIRED_FIELDS_ACTIONS[k]))) + if v.strip() == UPDATE_ACTIONS['DELETE'].lower() and k in NO_DELETE_FIELDS: + errors.append("Invalid update action '{}' in {} field. Cannot remove all values from a required field.".format(UPDATE_ACTIONS['DELETE'], k)) + return errors + +def update_record_from_csv(csv_data, version): + errors = None + updated_record = None + print("updating record from csvs") + errors, existing_record = retrieve_organization(csv_data['id'], version) + print(existing_record) + if existing_record is None: + errors = Errors(["No existing record found for ROR ID '{}'".format(csv_data['id'])]) + else: + row_validation_errors = validate_csv_row_update_syntax(csv_data) + if len(row_validation_errors) > 0: + errors = row_validation_errors + print("row validation errors:") + print(errors) + + else: + update_data = {} + ''' + #domains + if csv_data['domains']: + actions_values = get_actions_values(csv_data['domains']) + temp_domains = copy.deepcopy(existing_record['domains']) + if UPDATE_ACTIONS['DELETE'] in actions_values: + delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + if delete_values is None: + temp_domains = [] + else: + #should we check if values to delete exist? + temp_domains = [d for d in temp_domains if d not in delete_values.split(';')] + if UPDATE_ACTIONS['ADD'] in actions_values: + add_values = actions_values[UPDATE_ACTIONS['ADD']] + temp_domains = temp_domains.append([a.strip() for a in add_values.split(';')]) + if UPDATE_ACTIONS['REPLACE'] in actions_values: + replace_values = actions_values[UPDATE_ACTIONS['REPLACE']] + temp_domains = [r.strip() for r in replace_values.split(';')] + update_data['domains'] = temp_domains + + #established + if csv_data['established']: + actions_values = get_actions_values(csv_data['established']) + if UPDATE_ACTIONS['DELETE'] in actions_values: + update_data['established'] = None + if UPDATE_ACTIONS['REPLACE'] in actions_values: + update_data['established'] = int(actions_values[UPDATE_ACTIONS['REPLACE']].strip()) + + #external ids + updated_ext_id_types = [] + for k,v in V2_EXTERNAL_ID_TYPES.items(): + if csv_data['external_ids.type.' + v + '.all'] or csv_data['external_ids.type.' + v + '.preferred']: + updated_ext_id_types.append(v) + if len(updated_ext_id_types) > 0: + existing_ext_ids = copy.deepcopy(existing_record['external_ids']) + for t in updated_ext_id_types: + new_ext_id_obj = {} + if csv_data['external_ids.type.' + t + '.all']: + action, csv_field_value = get_action_value(csv_data['external_ids.type.' 
+ t + '.all']) + existing_ext_id_obj = [i for i in existing_ext_ids if i['type'] == t] + # all + if action == "add": + new_ext_id_obj = { + "type": t, + "all": existing_ext_id_obj[0]['all'].append([c.strip() for c in csv_field_value.split(';')]), + "preferred": existing_ext_id_obj[0]['preferred'] + } + elif action == "delete": + new_ext_id_obj = { + "type": t, + "all": [e for e in existing_ext_id_obj[0]['all'] if e not in csv_field_value.split(';')], + "preferred": existing_ext_id_obj[0]['preferred'] + } + elif action == "replace": + new_ext_id_obj = { + "type": t, + "all": [c.strip() for c in csv_field_value.split(';')], + "preferred": existing_ext_id_obj[0]['preferred'] + } + # preferred + if csv_data['external_ids.type.' + t + '.preferred']: + if action == "add": + new_ext_id_obj = { + "type": t, + "all": existing_ext_id_obj[0]['all'].append([c.strip() for c in csv_field_value.split(';')]), + "preferred": existing_ext_id_obj[0]['preferred'] + } + elif action == "delete": + new_ext_id_obj = { + "type": t, + "all": [e for e in existing_ext_id_obj[0]['all'] if e not in csv_field_value.split(';')], + "preferred": existing_ext_id_obj[0]['preferred'] + } + elif action == "replace": + new_ext_id_obj = { + "type": t, + "all": [c.strip() for c in csv_field_value.split(';')], + "preferred": existing_ext_id_obj[0]['preferred'] + } + + + + + all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')] + ext_id_obj = { + "type": v, + "all": [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')], + "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0] + } + v2_data['external_ids'].append(ext_id_obj) + + #links + updated_link_types = [] + for k,v in V2_LINK_TYPES.items(): + if csv_data['links.type.' + v]: + updated_link_types.append(v) + if len(updated_link_types) > 0: + temp_links = copy.deepcopy(existing_record['links']) + for t in updated_link_types + if csv_data['links.type.' + t]: + action, csv_field_value = get_action_value(csv_data['links.type.' + t]) + if action == "add": + new_links = [c.strip() for c in csv_field_value.split(';')] + for link in new_links: + link_obj = { + "type": t, + "value": link + } + temp_links.append(link_obj) + elif action == "delete": + # remove all links of current type + if csv_field_value is None: + temp_links = [tl for tl in temp_links if tl['type'] != t] + else: + deleted_links = [c.strip() for c in csv_field_value.split(';')] + temp_links = [tl for tl in temp_links if tl['value'] not in deleted_links] + elif action == "replace": + temp_links = [] + new_links = [c.strip() for c in csv_field_value.split(';')] + for link in new_links: + link_obj = { + "type": t, + "value": csv_data['links.type.' 
+ t].strip() + } + temp_links.append(link_obj) + update_data['links'] = temp_links + + #locations + if csv_data['locations.geonames_id']: + temp_locations = copy.deepcopy(existing_record['locations']) + action, csv_field_value = get_action_value(csv_data['locations.geonames_id']) + if action == "add": + new_locations = [c.strip() for c in csv_field_value.split(';')] + for nl in new_locations: + location_obj = { + "geonames_id": nl, + "geonames_details": {} + } + temp_locations.append(location_obj) + elif action == "delete": + deleted_locations = [c.strip() for c in csv_field_value.split(';')] + temp_locations = [tl for tl in temp_locations if tl['geonames_id'] not in deleted_locations] + elif action == "replace": + temp_locations = [] + new_locations = [c.strip() for c in csv_field_value.split(';')] + for nl in new_locations: + location_obj = { + "geonames_id": nl, + "geonames_details": {} + } + temp_locations.append(location_obj) + + update_data['locations'] = temp_locations + + #names + + #status + if csv_data['status']: + action, csv_field_value = get_action_value(csv_data['established']) + update_data['status'] = csv_field_value.strip() + + #types + if csv_data['types']: + action, csv_field_value = get_action_value(csv_data['types']) + if action == "add": + update_data['types'] = existing_record['types'].append([c.strip() for c in csv_field_value.split(';')]) + elif action == "delete": + update_data['types'] = [t for t in existing_record['types'] if t not in csv_field_value.split(';')] + elif action == "replace": + update_data['types'] = [c.strip() for c in csv_field_value.split(';')] + + ''' + errors, updated_record = update_record_from_json(update_data, existing_record) + + return errors, updated_record + + #return None, None def new_record_from_csv(csv_data, version): v2_data = copy.deepcopy(V2_TEMPLATE) + + #domains if csv_data['domains']: v2_data['domains'] = [d.strip() for d in csv_data['domains'].split(';')] + #established if csv_data['established']: v2_data['established'] = int(csv_data['established'].strip()) + #external ids for k,v in V2_EXTERNAL_ID_TYPES.items(): if csv_data['external_ids.type.' + v + '.all']: + all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')] ext_id_obj = { "type": v, - "all": [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')], - "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else None + "all": all_ids, + "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0] } v2_data['external_ids'].append(ext_id_obj) + #links for k,v in V2_LINK_TYPES.items(): if csv_data['links.type.' + v]: link_obj = { @@ -246,6 +527,7 @@ def new_record_from_csv(csv_data, version): } v2_data['links'].append(link_obj) + #locations if csv_data['locations.geonames_id']: geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].split(';')] for geonames_id in geonames_ids: @@ -255,13 +537,15 @@ def new_record_from_csv(csv_data, version): } v2_data['locations'].append(location_obj) + #names temp_names = [] for k,v in V2_NAME_TYPES.items(): if csv_data['names.types.' + v]: + name_lang = csv_data['names.types.' + v].split(LANG_DELIMITER) name_obj = { "types": v, - "value": csv_data['names.types.' 
+ v].strip(), - "lang": None + "value": name_lang[0].strip(), + "lang": name_lang[1].strip() if name_lang[1] else None } temp_names.append(name_obj) print("temp names 1:") @@ -288,11 +572,15 @@ def new_record_from_csv(csv_data, version): print("temp names 2:") print(temp_names) v2_data['names'] = temp_names + + #status if csv_data['status']: v2_data['status'] = csv_data['status'].strip() + #types if csv_data['types']: v2_data['types'] = [t.strip() for t in csv_data['types'].split(';')] + errors, new_record = new_record_from_json(v2_data, version) return errors, new_record @@ -310,15 +598,25 @@ def process_csv(csv_file, version): for row in reader: print("Row data") print(row) - errors, v2_record = new_record_from_csv(row, version) - print(errors) - print(v2_record) + if row['id']: + errors, v2_record = update_record_from_csv(row, version) + else: + errors, v2_record = new_record_from_csv(row, version) + if errors is None: + serializer = OrganizationSerializerV2(v2_record) + json_obj = json.loads(JSONRenderer().render(serializer.data)) + print(json_obj) + else: + print(errors) + + ''' ror_id = v2_record['id'] full_path = os.path.join(DATA['DIR'], ror_id.split('https://ror.org/')[1] + '.json') serializer = OrganizationSerializerV2(v2_record) json_obj = json.loads(JSONRenderer().render(serializer.data)) with open(full_path, "w") as outfile: json.dump(json_obj, outfile, ensure_ascii=False, indent=2) + ''' ''' if row['ror_id']: row_error, updated_record = update_from_csv(row) diff --git a/rorapi/common/views.py b/rorapi/common/views.py index 5829c0e..3ec3367 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -124,7 +124,13 @@ def update(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]): return Response( ErrorsSerializer(errors).data, status=status.HTTP_404_NOT_FOUND ) - errors, valid_data = validation.update_record_from_json(request.data) + json = request.data + if 'id' not in json: + errors = Errors(["No value found in ID field. 
Updated records must include a value in the ID field"]) + elif get_ror_id(json['id']) != ror_id: + errors = Errors(["Value {} in IDs field does not match resource ID specified in request URL {}".format(json['id'], pk)]) + else: + errors, valid_data = validation.update_record_from_json(json, organization) else: errors = Errors(["Version {} does not support creating records".format(version)]) if errors is not None: @@ -178,7 +184,7 @@ def get(self, request, geonamesid, version=REST_FRAMEWORK["DEFAULT_VERSION"]): class GenerateId(APIView): - permission_classes = [OurTokenPermission] + spermission_classes = [OurTokenPermission] def get(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): id = check_ror_id(version) From cf70a2171b822d2298ba65f3791519f32d07aac6 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Tue, 13 Feb 2024 19:42:04 -0600 Subject: [PATCH 06/38] CSV create/update WIP --- rorapi/common/validation.py | 143 +++++++++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 36 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 1b20332..0035177 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -259,17 +259,19 @@ def get_actions_values(csv_field): print(ua) if ua + UPDATE_DELIMITER in csv_field: print("doing regex:") - result = re.search(r"{0}(.*?)(?=$|(add|delete|replace)==)".format(ua + UPDATE_DELIMITER), csv_field) + regex = r"(" + re.escape( + ua + UPDATE_DELIMITER) + r")(.*?)(?=$|(add|delete|replace)==)" + result = re.search(regex, csv_field) print(result[0]) - #add==foo;bar;delete==fizz;buzz; - #^(add|delete|replace)==(.*?)($|(?=add|delete|replace==)) temp_val = result[0].replace(ua + UPDATE_DELIMITER, '') print("temp val:") print(temp_val) - actions_values[ua] = temp_val - csv_field.replace(result[0], '') + actions_values[ua] = [v.strip() for v in temp_val.split(';') if v] + #csv_field.replace(result[0], '') + else: - actions_values[UPDATE_ACTIONS["REPLACE"]] = csv_field + actions_values[UPDATE_ACTIONS["REPLACE"]] = [v.strip() for v in csv_field.split(';') if v] + print(actions_values) return actions_values def validate_csv_row_update_syntax(csv_data): @@ -285,6 +287,8 @@ def validate_csv_row_update_syntax(csv_data): print("actions values:") print(actions_values) update_actions = list(actions_values.keys()) + if len(update_actions)==0: + errors.append("Update delimiter '{}' found in '{}' field but no valid update action found in value {}".format(UPDATE_DELIMITER, k, v)) if len(update_actions) > 2: errors.append("{} update actions '{}' found in '{}' field but only 2 are allowed".format(str(len(update_actions)), ", ".join(update_actions), k)) if len(update_actions) == 2: @@ -302,50 +306,65 @@ def validate_csv_row_update_syntax(csv_data): return errors def update_record_from_csv(csv_data, version): - errors = None + errors = [] updated_record = None - print("updating record from csvs") - errors, existing_record = retrieve_organization(csv_data['id'], version) - print(existing_record) - if existing_record is None: - errors = Errors(["No existing record found for ROR ID '{}'".format(csv_data['id'])]) + print("updating record from csv") + existing_org_errors, existing_org = retrieve_organization(csv_data['id'], version) + print(existing_org) + if existing_org is None: + errors.append("No existing record found for ROR ID '{}'".format(csv_data['id'])) else: row_validation_errors = validate_csv_row_update_syntax(csv_data) if len(row_validation_errors) > 0: - errors = row_validation_errors + 
errors.extend(row_validation_errors) print("row validation errors:") print(errors) - else: + serializer = OrganizationSerializerV2(existing_org) + existing_record = serializer.data + print(existing_record) update_data = {} - ''' #domains if csv_data['domains']: actions_values = get_actions_values(csv_data['domains']) temp_domains = copy.deepcopy(existing_record['domains']) + print("initial temp domains:") + print(temp_domains) if UPDATE_ACTIONS['DELETE'] in actions_values: delete_values = actions_values[UPDATE_ACTIONS['DELETE']] if delete_values is None: temp_domains = [] else: #should we check if values to delete exist? - temp_domains = [d for d in temp_domains if d not in delete_values.split(';')] + for d in delete_values: + if d not in temp_domains: + errors.append("Attempting to delete dommain(s) that don't exist: {}".format(d)) + temp_domains = [d for d in temp_domains if d not in delete_values] + print("temp domains delete") + print(temp_domains) if UPDATE_ACTIONS['ADD'] in actions_values: add_values = actions_values[UPDATE_ACTIONS['ADD']] - temp_domains = temp_domains.append([a.strip() for a in add_values.split(';')]) + for a in add_values: + if a in temp_domains: + errors.append("Attempting to add dommain(s) that already exist: {}".format(a)) + print(add_values) + temp_domains.extend(add_values) + print("temp domains add") + print(temp_domains) if UPDATE_ACTIONS['REPLACE'] in actions_values: - replace_values = actions_values[UPDATE_ACTIONS['REPLACE']] - temp_domains = [r.strip() for r in replace_values.split(';')] + temp_domains = actions_values[UPDATE_ACTIONS['REPLACE']] + print("temp domains replace") + print(temp_domains) + print("final temp domains:") + print(temp_domains) update_data['domains'] = temp_domains #established if csv_data['established']: actions_values = get_actions_values(csv_data['established']) - if UPDATE_ACTIONS['DELETE'] in actions_values: - update_data['established'] = None if UPDATE_ACTIONS['REPLACE'] in actions_values: - update_data['established'] = int(actions_values[UPDATE_ACTIONS['REPLACE']].strip()) - + update_data['established'] = int(actions_values[UPDATE_ACTIONS['REPLACE']][0]) + ''' #external ids updated_ext_id_types = [] for k,v in V2_EXTERNAL_ID_TYPES.items(): @@ -445,7 +464,7 @@ def update_record_from_csv(csv_data, version): temp_links.append(link_obj) update_data['links'] = temp_links - #locations + if csv_data['locations.geonames_id']: temp_locations = copy.deepcopy(existing_record['locations']) action, csv_field_value = get_action_value(csv_data['locations.geonames_id']) @@ -471,27 +490,79 @@ def update_record_from_csv(csv_data, version): temp_locations.append(location_obj) update_data['locations'] = temp_locations + ''' + #locations + if csv_data['locations.geonames_id']: + actions_values = get_actions_values(csv_data['locations.geonames_id']) + temp_locations = copy.deepcopy(existing_record['locations']) + print("initial temp locations:") + print(temp_locations) + if UPDATE_ACTIONS['DELETE'] in actions_values: + delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + for d in delete_values: + if d not in temp_locations: + errors.append("Attempting to delete type(s) that don't exist: {}".format(d)) + temp_locations = [tl for tl in temp_locations if tl['geonames_id'] not in delete_values] + if UPDATE_ACTIONS['ADD'] in actions_values: + add_values = actions_values[UPDATE_ACTIONS['ADD']] + for a in add_values: + if a in temp_locations: + errors.append("Attempting to add type(s) that already exist: {}".format(a)) + for a in add_values: + 
location_obj = { + "geonames_id": a, + "geonames_details": {} + } + temp_locations.append(location_obj) + if UPDATE_ACTIONS['REPLACE'] in actions_values: + temp_locations = [] + for r in UPDATE_ACTIONS['REPLACE']: + location_obj = { + "geonames_id": r, + "geonames_details": {} + } + temp_locations.append(location_obj) + print("final temp locations:") + print(temp_locations) + update_data['locations'] = temp_locations #names + #TODO #status if csv_data['status']: - action, csv_field_value = get_action_value(csv_data['established']) - update_data['status'] = csv_field_value.strip() + actions_values = get_actions_values(csv_data['status']) + if UPDATE_ACTIONS['DELETE'] in actions_values: + errors.append("Cannot delete required field 'status'") + if UPDATE_ACTIONS['REPLACE'] in actions_values: + update_data['status'] = actions_values[UPDATE_ACTIONS['REPLACE']][0] #types if csv_data['types']: - action, csv_field_value = get_action_value(csv_data['types']) - if action == "add": - update_data['types'] = existing_record['types'].append([c.strip() for c in csv_field_value.split(';')]) - elif action == "delete": - update_data['types'] = [t for t in existing_record['types'] if t not in csv_field_value.split(';')] - elif action == "replace": - update_data['types'] = [c.strip() for c in csv_field_value.split(';')] - - ''' - errors, updated_record = update_record_from_json(update_data, existing_record) + actions_values = get_actions_values(csv_data['types']) + temp_types = copy.deepcopy(existing_record['types']) + print("initial temp types:") + print(temp_types) + if UPDATE_ACTIONS['DELETE'] in actions_values: + delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + for d in delete_values: + if d not in temp_types: + errors.append("Attempting to delete type(s) that don't exist: {}".format(d)) + temp_types = [t for t in temp_types if t not in delete_values] + if UPDATE_ACTIONS['ADD'] in actions_values: + add_values = actions_values[UPDATE_ACTIONS['ADD']] + for a in add_values: + if a in temp_types: + errors.append("Attempting to add type(s) that already exist: {}".format(a)) + temp_types.extend(add_values) + if UPDATE_ACTIONS['REPLACE'] in actions_values: + temp_types = actions_values[UPDATE_ACTIONS['REPLACE']] + print("final temp types:") + print(temp_types) + update_data['types'] = temp_types + if len(errors) == 0: + errors, updated_record = update_record_from_json(update_data, existing_record) return errors, updated_record #return None, None From ccbdc5ea42bda6be3dbae3ce5e2523c0dc8a57db Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 15 Feb 2024 18:08:14 -0600 Subject: [PATCH 07/38] CSV create/update WIP --- rorapi/common/validation.py | 213 +++++++++++++++++------------------- rorapi/common/views.py | 3 +- 2 files changed, 105 insertions(+), 111 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 0035177..b8535b2 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -292,7 +292,7 @@ def validate_csv_row_update_syntax(csv_data): if len(update_actions) > 2: errors.append("{} update actions '{}' found in '{}' field but only 2 are allowed".format(str(len(update_actions)), ", ".join(update_actions), k)) if len(update_actions) == 2: - if not (UPDATE_ACTIONS['ADD'] and UPDATE_ACTIONS['delete']) in update_actions: + if not (UPDATE_ACTIONS['ADD'] and UPDATE_ACTIONS['DELETE']) in update_actions: errors.append("Invalid combination of update actions '{}' found in '{}' field.".format(", ".join(update_actions), k)) disallowed_actions = [ua 
for ua in update_actions if ua not in CSV_REQUIRED_FIELDS_ACTIONS[k]] print("allowed actions:") @@ -335,7 +335,6 @@ def update_record_from_csv(csv_data, version): if delete_values is None: temp_domains = [] else: - #should we check if values to delete exist? for d in delete_values: if d not in temp_domains: errors.append("Attempting to delete dommain(s) that don't exist: {}".format(d)) @@ -362,71 +361,79 @@ def update_record_from_csv(csv_data, version): #established if csv_data['established']: actions_values = get_actions_values(csv_data['established']) + if UPDATE_ACTIONS['DELETE'] in actions_values: + update_data['established'] = None if UPDATE_ACTIONS['REPLACE'] in actions_values: update_data['established'] = int(actions_values[UPDATE_ACTIONS['REPLACE']][0]) - ''' + #external ids updated_ext_id_types = [] for k,v in V2_EXTERNAL_ID_TYPES.items(): if csv_data['external_ids.type.' + v + '.all'] or csv_data['external_ids.type.' + v + '.preferred']: updated_ext_id_types.append(v) if len(updated_ext_id_types) > 0: - existing_ext_ids = copy.deepcopy(existing_record['external_ids']) + temp_ext_ids = copy.deepcopy(existing_record['external_ids']) for t in updated_ext_id_types: - new_ext_id_obj = {} + temp_all = [] + temp_preferred = None + existing_ext_id_obj = None + existing_ext_ids_type = [i for i in temp_ext_ids if i['type'] == t] + + if len(existing_ext_ids_type) == 1: + existing_ext_id_obj = existing_ext_ids_type[0] + temp_all = existing_ext_id_obj['all'] + temp_preferred = existing_ext_id_obj['preferred'] + if len(existing_ext_ids_type) > 1: + errors.append("Something is wrong. Multiple external ID objects with type ".format(t)) + + # external_ids.all if csv_data['external_ids.type.' + t + '.all']: - action, csv_field_value = get_action_value(csv_data['external_ids.type.' + t + '.all']) - existing_ext_id_obj = [i for i in existing_ext_ids if i['type'] == t] - # all - if action == "add": - new_ext_id_obj = { - "type": t, - "all": existing_ext_id_obj[0]['all'].append([c.strip() for c in csv_field_value.split(';')]), - "preferred": existing_ext_id_obj[0]['preferred'] - } - elif action == "delete": - new_ext_id_obj = { - "type": t, - "all": [e for e in existing_ext_id_obj[0]['all'] if e not in csv_field_value.split(';')], - "preferred": existing_ext_id_obj[0]['preferred'] - } - elif action == "replace": - new_ext_id_obj = { - "type": t, - "all": [c.strip() for c in csv_field_value.split(';')], - "preferred": existing_ext_id_obj[0]['preferred'] - } - # preferred + actions_values = get_actions_values(csv_data['external_ids.type.' + t + '.all']) + if UPDATE_ACTIONS['DELETE'] in actions_values: + delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + if delete_values is None: + temp_all = [] + else: + for d in delete_values: + if d not in temp_all: + errors.append("Attempting to delete external ID(s) from {}.all that don't exist: {}".format(t, d)) + temp_all = [i for i in temp_all if i not in delete_values] + if UPDATE_ACTIONS['ADD'] in actions_values: + add_values = [a for a in actions_values[UPDATE_ACTIONS['ADD']]] + for a in add_values: + if a in temp_all: + errors.append("Attempting to add external ID(s) to {}.all that already exist: {}".format(t, a)) + temp_all.extend(add_values) + if UPDATE_ACTIONS['REPLACE'] in actions_values: + temp_all = actions_values[UPDATE_ACTIONS['REPLACE']] + + # external_ids.preferred if csv_data['external_ids.type.' 
+ t + '.preferred']: - if action == "add": - new_ext_id_obj = { - "type": t, - "all": existing_ext_id_obj[0]['all'].append([c.strip() for c in csv_field_value.split(';')]), - "preferred": existing_ext_id_obj[0]['preferred'] - } - elif action == "delete": - new_ext_id_obj = { - "type": t, - "all": [e for e in existing_ext_id_obj[0]['all'] if e not in csv_field_value.split(';')], - "preferred": existing_ext_id_obj[0]['preferred'] - } - elif action == "replace": - new_ext_id_obj = { - "type": t, - "all": [c.strip() for c in csv_field_value.split(';')], - "preferred": existing_ext_id_obj[0]['preferred'] - } - - - - - all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')] - ext_id_obj = { - "type": v, - "all": [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')], - "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0] - } - v2_data['external_ids'].append(ext_id_obj) + actions_values = get_actions_values(csv_data['external_ids.type.' + t + '.preferred']) + if UPDATE_ACTIONS['DELETE'] in actions_values: + temp_preferred = None + if UPDATE_ACTIONS['REPLACE'] in actions_values: + temp_preferred = actions_values[UPDATE_ACTIONS['REPLACE']][0] + + + if len(temp_all) == 0 and temp_preferred is None: + # remove all of type + if not existing_ext_id_obj: + errors.append("Attempting to delete external ID object with type {} that doesn't exist.".format(t)) + temp_ext_ids = [i for i in temp_ext_ids if i['type'] != t] + + else: + # remove all of type and replace with new obj + new_ext_id_obj = { + "type": t, + "all": temp_all, + "preferred": temp_preferred + } + if existing_ext_id_obj: + temp_ext_ids = [i for i in temp_ext_ids if i['type'] != t] + temp_ext_ids.append(new_ext_id_obj) + + update_data['external_ids'] = temp_ext_ids #links updated_link_types = [] @@ -435,90 +442,74 @@ def update_record_from_csv(csv_data, version): updated_link_types.append(v) if len(updated_link_types) > 0: temp_links = copy.deepcopy(existing_record['links']) - for t in updated_link_types + for t in updated_link_types: if csv_data['links.type.' + t]: - action, csv_field_value = get_action_value(csv_data['links.type.' + t]) - if action == "add": - new_links = [c.strip() for c in csv_field_value.split(';')] - for link in new_links: + actions_values = get_actions_values(csv_data['links.type.' 
+ t]) + existing_links = [tl['value'] for tl in temp_links] + if UPDATE_ACTIONS['DELETE'] in actions_values: + delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + if delete_values is None: + temp_links = [tl for tl in temp_links if tl['type'] != t] + else: + for d in delete_values: + if d not in existing_links: + errors.append("Attempting to delete link(s) that don't exist: {}".format(d)) + temp_links = [tl for tl in temp_links if tl['value'] not in delete_values] + if UPDATE_ACTIONS['ADD'] in actions_values: + add_values = [a for a in actions_values[UPDATE_ACTIONS['ADD']]] + for a in add_values: + if a in existing_links: + errors.append("Attempting to add link(s) that already exist: {}".format(a)) + for a in add_values: link_obj = { "type": t, - "value": link + "value": a } temp_links.append(link_obj) - elif action == "delete": - # remove all links of current type - if csv_field_value is None: - temp_links = [tl for tl in temp_links if tl['type'] != t] - else: - deleted_links = [c.strip() for c in csv_field_value.split(';')] - temp_links = [tl for tl in temp_links if tl['value'] not in deleted_links] - elif action == "replace": + if UPDATE_ACTIONS['REPLACE'] in actions_values: temp_links = [] - new_links = [c.strip() for c in csv_field_value.split(';')] - for link in new_links: + for r in actions_values[UPDATE_ACTIONS['REPLACE']]: link_obj = { "type": t, - "value": csv_data['links.type.' + t].strip() + "value": r } - temp_links.append(link_obj) + temp_links.append(link_obj) + print("final temp links:") + print(temp_links) update_data['links'] = temp_links - - if csv_data['locations.geonames_id']: - temp_locations = copy.deepcopy(existing_record['locations']) - action, csv_field_value = get_action_value(csv_data['locations.geonames_id']) - if action == "add": - new_locations = [c.strip() for c in csv_field_value.split(';')] - for nl in new_locations: - location_obj = { - "geonames_id": nl, - "geonames_details": {} - } - temp_locations.append(location_obj) - elif action == "delete": - deleted_locations = [c.strip() for c in csv_field_value.split(';')] - temp_locations = [tl for tl in temp_locations if tl['geonames_id'] not in deleted_locations] - elif action == "replace": - temp_locations = [] - new_locations = [c.strip() for c in csv_field_value.split(';')] - for nl in new_locations: - location_obj = { - "geonames_id": nl, - "geonames_details": {} - } - temp_locations.append(location_obj) - - update_data['locations'] = temp_locations - ''' #locations if csv_data['locations.geonames_id']: actions_values = get_actions_values(csv_data['locations.geonames_id']) temp_locations = copy.deepcopy(existing_record['locations']) print("initial temp locations:") print(temp_locations) + existing_geonames_ids = [tl['geonames_id'] for tl in temp_locations] + print(existing_geonames_ids) if UPDATE_ACTIONS['DELETE'] in actions_values: - delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + delete_values = [int(d) for d in actions_values[UPDATE_ACTIONS['DELETE']]] for d in delete_values: - if d not in temp_locations: - errors.append("Attempting to delete type(s) that don't exist: {}".format(d)) + if d not in existing_geonames_ids: + errors.append("Attempting to delete locations(s) that don't exist: {}".format(d)) + if len(existing_geonames_ids) == len(delete_values): + errors.append("Cannot remove all values from required field 'locations'") temp_locations = [tl for tl in temp_locations if tl['geonames_id'] not in delete_values] if UPDATE_ACTIONS['ADD'] in actions_values: - add_values = 
actions_values[UPDATE_ACTIONS['ADD']] + add_values = [int(a) for a in actions_values[UPDATE_ACTIONS['ADD']]] for a in add_values: - if a in temp_locations: - errors.append("Attempting to add type(s) that already exist: {}".format(a)) + if int(a) in existing_geonames_ids: + errors.append("Attempting to add locations(s) that already exist: {}".format(a)) for a in add_values: location_obj = { - "geonames_id": a, + "geonames_id": int(a), "geonames_details": {} } temp_locations.append(location_obj) if UPDATE_ACTIONS['REPLACE'] in actions_values: temp_locations = [] - for r in UPDATE_ACTIONS['REPLACE']: + for r in actions_values[UPDATE_ACTIONS['REPLACE']]: location_obj = { - "geonames_id": r, + "geonames_id": int(r), "geonames_details": {} } temp_locations.append(location_obj) @@ -548,6 +539,8 @@ def update_record_from_csv(csv_data, version): for d in delete_values: if d not in temp_types: errors.append("Attempting to delete type(s) that don't exist: {}".format(d)) + if len(temp_types) == len(delete_values): + errors.append("Cannot remove all values from required field 'types'") temp_types = [t for t in temp_types if t not in delete_values] if UPDATE_ACTIONS['ADD'] in actions_values: add_values = actions_values[UPDATE_ACTIONS['ADD']] diff --git a/rorapi/common/views.py b/rorapi/common/views.py index 3ec3367..c197047 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -221,7 +221,8 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): if request.data: file_object = request.data['file'] mime_type = magic.from_buffer(file_object.read(2048)) - if "ASCII text" in mime_type: + print(mime_type) + if "ASCII text" in mime_type or "CSV text" in mime_type: file_object.seek(0) csv_validation_errors = validation.validate_csv(file_object) if len(csv_validation_errors) == 0: From f14b36a5cd93e270d4c8b671ff756dbba5f869c3 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Sat, 17 Feb 2024 06:52:48 -0600 Subject: [PATCH 08/38] CSV create/update WIP --- rorapi/common/validation.py | 42 ++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index b8535b2..8d59670 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -518,7 +518,47 @@ def update_record_from_csv(csv_data, version): update_data['locations'] = temp_locations #names - #TODO + updated_name_types = [] + for k,v in V2_NAME_TYPES.items(): + if csv_data['names.types.' 
+ v]: + updated_name_types.append(v) + if len(updated_name_types) > 0: + temp_names = copy.deepcopy(existing_record['names']) + for t in updated_name_types: + if csv_data['names.types' + t]: + actions_values = get_actions_values(csv_data['names.types' + t]) + existing_names = [n for n in temp_names if t in n['types']] + if UPDATE_ACTIONS['DELETE'] in actions_values: + delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + if delete_values is None: + temp_links = [tl for tl in temp_links if tl['type'] != t] + else: + for d in delete_values: + if d not in existing_links: + errors.append("Attempting to delete link(s) that don't exist: {}".format(d)) + temp_links = [tl for tl in temp_links if tl['value'] not in delete_values] + if UPDATE_ACTIONS['ADD'] in actions_values: + add_values = [a for a in actions_values[UPDATE_ACTIONS['ADD']]] + for a in add_values: + if a in existing_links: + errors.append("Attempting to add link(s) that already exist: {}".format(a)) + for a in add_values: + link_obj = { + "type": t, + "value": a + } + temp_links.append(link_obj) + if UPDATE_ACTIONS['REPLACE'] in actions_values: + temp_links = [] + for r in actions_values[UPDATE_ACTIONS['REPLACE']]: + link_obj = { + "type": t, + "value": r + } + temp_links.append(link_obj) + print("final temp links:") + print(temp_links) + update_data['links'] = temp_links #status if csv_data['status']: From f44ba39218d93f9eed44211413368e55db9c6266 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Wed, 21 Feb 2024 19:18:50 -0600 Subject: [PATCH 09/38] CSV create/update WIP --- rorapi/common/validation.py | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 8d59670..6c3466c 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -378,14 +378,12 @@ def update_record_from_csv(csv_data, version): temp_preferred = None existing_ext_id_obj = None existing_ext_ids_type = [i for i in temp_ext_ids if i['type'] == t] - if len(existing_ext_ids_type) == 1: existing_ext_id_obj = existing_ext_ids_type[0] temp_all = existing_ext_id_obj['all'] temp_preferred = existing_ext_id_obj['preferred'] if len(existing_ext_ids_type) > 1: errors.append("Something is wrong. Multiple external ID objects with type ".format(t)) - # external_ids.all if csv_data['external_ids.type.' + t + '.all']: actions_values = get_actions_values(csv_data['external_ids.type.' + t + '.all']) @@ -441,7 +439,7 @@ def update_record_from_csv(csv_data, version): if csv_data['links.type.' + v]: updated_link_types.append(v) if len(updated_link_types) > 0: - temp_links = copy.deepcopy(existing_record['links']) + temp_names = copy.deepcopy(existing_record['links']) for t in updated_link_types: if csv_data['links.type.' + t]: actions_values = get_actions_values(csv_data['links.type.' 
+ t]) @@ -527,16 +525,40 @@ def update_record_from_csv(csv_data, version): for t in updated_name_types: if csv_data['names.types' + t]: actions_values = get_actions_values(csv_data['names.types' + t]) - existing_names = [n for n in temp_names if t in n['types']] + for k, v in actions_values.items(): + if v: + vals_obj_list = [] + for val in v: + vals_obj = { + "value": None, + "lang_code": None + } + if "*" in v: + name_val, lang_code = val.split("*") + vals_obj["value"] = name_val.strip() + vals_obj["lang_code"] = lang_code.strip() + else: + vals_obj["value"] = val.strip() + vals_obj_list.append(vals_obj) + actions_values[k] = vals_obj_list = [] if UPDATE_ACTIONS['DELETE'] in actions_values: delete_values = actions_values[UPDATE_ACTIONS['DELETE']] if delete_values is None: - temp_links = [tl for tl in temp_links if tl['type'] != t] + temp_names = [tn for tn in temp_names if t not in tn['types']] else: for d in delete_values: - if d not in existing_links: + temp_names_match = [tn for tn in temp_names if (t in tn['types'] and tn['value'] == d['value'] and tn['lang_code'] == d['lang_code'])] + if len(temp_names_match) == 0: errors.append("Attempting to delete link(s) that don't exist: {}".format(d)) - temp_links = [tl for tl in temp_links if tl['value'] not in delete_values] + #if name has multiple types, delete type only + else: + for tnm in temp_names_match: + if len(tnm['types'] > 1): + temp_types = [tnm_type for tnm_type in tnm['types'] if tnm_type != t] + tnm['types'] = temp_types + #if name has only current type, delete obj + + if UPDATE_ACTIONS['ADD'] in actions_values: add_values = [a for a in actions_values[UPDATE_ACTIONS['ADD']]] for a in add_values: From 60402cde902146c2ee9d9acecca1f8a3850ee0a8 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 29 Feb 2024 09:21:59 -0600 Subject: [PATCH 10/38] CSV create/update WIP --- rorapi/common/validation.py | 132 ++++++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 43 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 6c3466c..1aaa138 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -191,7 +191,7 @@ def validate_csv(csv_file): if field not in csv_fields: missing_fields.append(field) print(missing_fields) - if len(missing_fields) > 0: + if missing_fields: errors.append(f'CSV file is missing columns: {", ".join(missing_fields)}') else: errors.append("CSV file contains no data rows") @@ -208,7 +208,7 @@ def new_record_from_json(json_input, version): if check_optional_fields(new_record): new_record = add_missing_optional_fields(new_record) location_errors, updated_locations = update_locations(new_record['locations']) - if len(location_errors) > 0: + if location_errors: errors = Errors(location_errors) else: new_record['locations'] = updated_locations @@ -227,7 +227,7 @@ def update_record_from_json(new_json, existing_org): existing_record = serializer.data updated_record = update_record(new_json, existing_record) location_errors, updated_locations = update_locations(updated_record['locations']) - if len(location_errors) > 0: + if location_errors: errors = Errors(location_errors) else: updated_record['locations'] = updated_locations @@ -287,7 +287,7 @@ def validate_csv_row_update_syntax(csv_data): print("actions values:") print(actions_values) update_actions = list(actions_values.keys()) - if len(update_actions)==0: + if not update_actions: errors.append("Update delimiter '{}' found in '{}' field but no valid update action found in value 
{}".format(UPDATE_DELIMITER, k, v)) if len(update_actions) > 2: errors.append("{} update actions '{}' found in '{}' field but only 2 are allowed".format(str(len(update_actions)), ", ".join(update_actions), k)) @@ -299,7 +299,7 @@ def validate_csv_row_update_syntax(csv_data): print(CSV_REQUIRED_FIELDS_ACTIONS[k]) print("disallowed actions:") print(disallowed_actions) - if len(disallowed_actions) > 0: + if disallowed_actions: errors.append("Invalid update action(s) '{}' found in {} field. Allowed actions for this field are '{}'".format(", ".join(disallowed_actions), k, ", ".join(CSV_REQUIRED_FIELDS_ACTIONS[k]))) if v.strip() == UPDATE_ACTIONS['DELETE'].lower() and k in NO_DELETE_FIELDS: errors.append("Invalid update action '{}' in {} field. Cannot remove all values from a required field.".format(UPDATE_ACTIONS['DELETE'], k)) @@ -315,7 +315,7 @@ def update_record_from_csv(csv_data, version): errors.append("No existing record found for ROR ID '{}'".format(csv_data['id'])) else: row_validation_errors = validate_csv_row_update_syntax(csv_data) - if len(row_validation_errors) > 0: + if row_validation_errors: errors.extend(row_validation_errors) print("row validation errors:") print(errors) @@ -324,6 +324,7 @@ def update_record_from_csv(csv_data, version): existing_record = serializer.data print(existing_record) update_data = {} + #domains if csv_data['domains']: actions_values = get_actions_values(csv_data['domains']) @@ -371,7 +372,7 @@ def update_record_from_csv(csv_data, version): for k,v in V2_EXTERNAL_ID_TYPES.items(): if csv_data['external_ids.type.' + v + '.all'] or csv_data['external_ids.type.' + v + '.preferred']: updated_ext_id_types.append(v) - if len(updated_ext_id_types) > 0: + if updated_ext_id_types: temp_ext_ids = copy.deepcopy(existing_record['external_ids']) for t in updated_ext_id_types: temp_all = [] @@ -414,7 +415,7 @@ def update_record_from_csv(csv_data, version): temp_preferred = actions_values[UPDATE_ACTIONS['REPLACE']][0] - if len(temp_all) == 0 and temp_preferred is None: + if (not temp_all) and temp_preferred is None: # remove all of type if not existing_ext_id_obj: errors.append("Attempting to delete external ID object with type {} that doesn't exist.".format(t)) @@ -438,7 +439,7 @@ def update_record_from_csv(csv_data, version): for k,v in V2_LINK_TYPES.items(): if csv_data['links.type.' + v]: updated_link_types.append(v) - if len(updated_link_types) > 0: + if updated_link_types: temp_names = copy.deepcopy(existing_record['links']) for t in updated_link_types: if csv_data['links.type.' + t]: @@ -520,67 +521,112 @@ def update_record_from_csv(csv_data, version): for k,v in V2_NAME_TYPES.items(): if csv_data['names.types.' + v]: updated_name_types.append(v) - if len(updated_name_types) > 0: + print("updated name types") + print(updated_name_types) + if updated_name_types: temp_names = copy.deepcopy(existing_record['names']) for t in updated_name_types: - if csv_data['names.types' + t]: - actions_values = get_actions_values(csv_data['names.types' + t]) + print("updating name type " + t) + if csv_data['names.types.' + t]: + actions_values = get_actions_values(csv_data['names.types.' 
+ t]) for k, v in actions_values.items(): if v: vals_obj_list = [] for val in v: vals_obj = { "value": None, - "lang_code": None + "lang": None } if "*" in v: - name_val, lang_code = val.split("*") + name_val, lang = val.split("*") vals_obj["value"] = name_val.strip() - vals_obj["lang_code"] = lang_code.strip() + vals_obj["lang"] = lang.strip() else: vals_obj["value"] = val.strip() vals_obj_list.append(vals_obj) - actions_values[k] = vals_obj_list = [] + actions_values[k] = vals_obj_list + print("updated actions values") + print(actions_values) if UPDATE_ACTIONS['DELETE'] in actions_values: + print("delete in actions") delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + print(delete_values) if delete_values is None: temp_names = [tn for tn in temp_names if t not in tn['types']] else: for d in delete_values: - temp_names_match = [tn for tn in temp_names if (t in tn['types'] and tn['value'] == d['value'] and tn['lang_code'] == d['lang_code'])] - if len(temp_names_match) == 0: - errors.append("Attempting to delete link(s) that don't exist: {}".format(d)) - #if name has multiple types, delete type only + temp_names_match = [tn for tn in temp_names if (t in tn['types'] and tn['value'] == d['value'] and tn['lang'] == d['lang'])] + if not temp_names_match: + errors.append("Attempting to delete name(s) that don't exist: {}".format(d)) else: for tnm in temp_names_match: - if len(tnm['types'] > 1): + temp_names.remove(tnm) + #if name has multiple types, delete type only + if len(tnm['types']) > 1: temp_types = [tnm_type for tnm_type in tnm['types'] if tnm_type != t] tnm['types'] = temp_types - #if name has only current type, delete obj - + temp_names.append(tnm) if UPDATE_ACTIONS['ADD'] in actions_values: - add_values = [a for a in actions_values[UPDATE_ACTIONS['ADD']]] - for a in add_values: - if a in existing_links: - errors.append("Attempting to add link(s) that already exist: {}".format(a)) + add_values = actions_values[UPDATE_ACTIONS['ADD']] for a in add_values: - link_obj = { - "type": t, - "value": a - } - temp_links.append(link_obj) + temp_names_match = [tn for tn in temp_names if (t in tn['types'] and tn['value'] == a['value'] and tn['lang'] == a['lang'])] + # check if value, lang and type already exist + if temp_names_match: + errors.append("Attempting to add names(s) that already exist: {}".format(a)) + else: + name_vals_match = [tn for tn in temp_names if (tn['value'] == a['value'] and tn['lang'] == a['lang'])] + if name_vals_match: + print("name vals match") + print(name_vals_match) + for nvm in name_vals_match: + # if value and lang exist but not type, add type only + if len(nvm['types']) > 0: + temp_names.remove(nvm) + nvm['types'].append(t) + temp_names.append(nvm) + else: + # if value and lang don't exist add new name obj + name_obj = { + "types": [t], + "value": a['value'], + "lang": a['lang'] + } + temp_names.append(name_obj) if UPDATE_ACTIONS['REPLACE'] in actions_values: - temp_links = [] - for r in actions_values[UPDATE_ACTIONS['REPLACE']]: - link_obj = { - "type": t, - "value": r - } - temp_links.append(link_obj) - print("final temp links:") - print(temp_links) - update_data['links'] = temp_links + temp_names_match = [tn for tn in temp_names if t in tn['types']] + # remove all names of current type from temp names using same rules as delete + if temp_names_match: + for tnm in temp_names_match: + temp_names.remove(tnm) + #if name has multiple types, delete type only + if len(tnm['types']) > 1: + temp_types = [tnm_type for tnm_type in tnm['types'] if tnm_type != t] + 
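# [Editor's note] The delete and replace branches here share one rule for v2
# name objects, which can carry several types at once (e.g. ["label",
# "ror_display"]): removing a name "for a type" only sheds that type unless it
# was the object's last one. A self-contained sketch (remove_name_type is an
# assumed name, not the module's code):
#
#   def remove_name_type(name_obj, name_type):
#       remaining = [t for t in name_obj["types"] if t != name_type]
#       if remaining:
#           return {**name_obj, "types": remaining}  # keep the name, shed one type
#       return None  # last type removed -> drop the whole name object
#
#   remove_name_type({"types": ["label", "alias"], "value": "X", "lang": None}, "alias")
#   # -> {"types": ["label"], "value": "X", "lang": None}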
tnm['types'] = temp_types + temp_names.append(tnm) + replace_values = actions_values[UPDATE_ACTIONS['REPLACE']] + for r in replace_values: + name_vals_match = [tn for tn in temp_names if (tn['value'] == r['value'] and tn['lang'] == r['lang'])] + # add new names of current type to temp names using same rules as add + if name_vals_match: + for nvm in name_vals_match: + # if value and lang exist but not type, add type only + if len(nvm['types'] > 0): + temp_names.remove(nvm) + nvm['types'].append(t) + temp_names.append(nvm) + else: + # if value and lang don't exist add new name obj + name_obj = { + "types": [t], + "value": r['value'], + "lang": r['lang'] + } + temp_names.append(name_obj) + + print("final temp names:") + print(temp_names) + update_data['names'] = temp_names #status if csv_data['status']: @@ -616,7 +662,7 @@ def update_record_from_csv(csv_data, version): print(temp_types) update_data['types'] = temp_types - if len(errors) == 0: + if not errors: errors, updated_record = update_record_from_json(update_data, existing_record) return errors, updated_record From 3b227d2fad0d1ffd99600b2457da250d6a68148d Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Fri, 1 Mar 2024 19:07:44 -0600 Subject: [PATCH 11/38] CSV create/update WIP --- requirements.txt | 3 +- rorapi/common/validation.py | 93 ++++++++++++++++++++++++++++--------- 2 files changed, 73 insertions(+), 23 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0ca299c..84b7ffe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ titlecase==2.3 update_address @ git+https://github.com/ror-community/update_address.git@v2-locations launchdarkly-server-sdk==7.6.1 jsonschema==3.2.0 -python-magic \ No newline at end of file +python-magic +iso639-lang \ No newline at end of file diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 1aaa138..648baa8 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -7,6 +7,7 @@ import os import re from datetime import datetime +from iso639 import Lang from rest_framework.exceptions import ParseError from rest_framework.parsers import JSONParser from rest_framework.renderers import JSONRenderer @@ -17,6 +18,7 @@ OrganizationSerializer as OrganizationSerializerV2 ) from rorapi.common.queries import get_ror_id, retrieve_organization +from rorapi.common.serializers import ErrorsSerializer from rorapi.management.commands.generaterorid import check_ror_id @@ -157,6 +159,20 @@ def update_locations(locations): errors.append("Error retrieving Geonames data for ID {}. 
Please check that this is a valid Geonames ID".format(location['geonames_id'])) return errors, updated_locations +def get_lang_code(lang_string): + lang_code = None + error = None + if len(lang_string) == 2: + lang_string = lang_string.lower() + else: + lang_string = lang_string.title() + try: + lg = Lang(lang_string) + lang_code = lg.pt1 + except Exception as e: + error = e.msg + return error, lang_code + def get_file_from_url(url): rsp = requests.get(url) rsp.raise_for_status() @@ -533,14 +549,22 @@ def update_record_from_csv(csv_data, version): if v: vals_obj_list = [] for val in v: + print("val is") + print(val) vals_obj = { "value": None, "lang": None } - if "*" in v: + if LANG_DELIMITER in val: + print("has lang delim") name_val, lang = val.split("*") vals_obj["value"] = name_val.strip() - vals_obj["lang"] = lang.strip() + if lang: + lang_errors, lang_code = get_lang_code(lang.strip()) + if lang_errors: + errors.append("Could not convert language value to ISO code: {}".format(lang)) + else: + vals_obj["lang"] = lang_code else: vals_obj["value"] = val.strip() vals_obj_list.append(vals_obj) @@ -611,7 +635,7 @@ def update_record_from_csv(csv_data, version): if name_vals_match: for nvm in name_vals_match: # if value and lang exist but not type, add type only - if len(nvm['types'] > 0): + if len(nvm['types']) > 0: temp_names.remove(nvm) nvm['types'].append(t) temp_names.append(nvm) @@ -663,10 +687,11 @@ def update_record_from_csv(csv_data, version): update_data['types'] = temp_types if not errors: - errors, updated_record = update_record_from_json(update_data, existing_record) + validation_errors, updated_record = update_record_from_json(update_data, existing_record) + if validation_errors: + errors = ErrorsSerializer(validation_errors).data return errors, updated_record - #return None, None def new_record_from_csv(csv_data, version): v2_data = copy.deepcopy(V2_TEMPLATE) @@ -693,11 +718,12 @@ def new_record_from_csv(csv_data, version): #links for k,v in V2_LINK_TYPES.items(): if csv_data['links.type.' + v]: - link_obj = { - "type": v, - "value": csv_data['links.type.' + v].strip() - } - v2_data['links'].append(link_obj) + for l in csv_data['links.type.' + v].split(';'): + link_obj = { + "type": v, + "value": l.strip() + } + v2_data['links'].append(link_obj) #locations if csv_data['locations.geonames_id']: @@ -713,13 +739,23 @@ def new_record_from_csv(csv_data, version): temp_names = [] for k,v in V2_NAME_TYPES.items(): if csv_data['names.types.' + v]: - name_lang = csv_data['names.types.' + v].split(LANG_DELIMITER) - name_obj = { - "types": v, - "value": name_lang[0].strip(), - "lang": name_lang[1].strip() if name_lang[1] else None - } - temp_names.append(name_obj) + for n in csv_data['names.types.' 
+ v].split(';'): + if LANG_DELIMITER in n: + name_val, lang_code = n.split("*") + if lang: + lang_errors, lang_code = get_lang_code(lang.strip()) + if lang_errors: + errors.append("Could not convert language value to ISO code: {}".format(lang)) + else: + name_val = n + lang_code = None + + name_obj = { + "types": [v], + "value": name_val.strip(), + "lang": lang_code + } + temp_names.append(name_obj) print("temp names 1:") print(temp_names) name_values = [n['value'] for n in temp_names] @@ -732,7 +768,7 @@ def new_record_from_csv(csv_data, version): types = [] for t in temp_names: if t['value'] == d: - types.append(t['types']) + types.extend(t['types']) name_obj = { "types": types, "value": d, @@ -753,36 +789,49 @@ def new_record_from_csv(csv_data, version): if csv_data['types']: v2_data['types'] = [t.strip() for t in csv_data['types'].split(';')] - errors, new_record = new_record_from_json(v2_data, version) + validation_errors, new_record = new_record_from_json(v2_data, version) + if validation_errors: + errors = ErrorsSerializer(validation_errors).data return errors, new_record def process_csv(csv_file, version): print("Processing CSV") errors = None - row_errors = {} + report = [] + report_fields = ['row', 'ror_id', 'action', 'errors'] skipped_count = 0 updated_count = 0 new_count = 0 read_file = csv_file.read().decode('utf-8') print(read_file) reader = csv.DictReader(io.StringIO(read_file)) - row_num = 1 + row_num = 2 for row in reader: + ror_id = None print("Row data") print(row) if row['id']: + action = 'updated' + ror_id = row['id'] errors, v2_record = update_record_from_csv(row, version) else: + action = 'updated' errors, v2_record = new_record_from_csv(row, version) if errors is None: + ror_id = v2_record['id'] serializer = OrganizationSerializerV2(v2_record) json_obj = json.loads(JSONRenderer().render(serializer.data)) print(json_obj) else: + action = 'skipped' print(errors) + report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": errors}) + row_num += 1 + print(report) + ''' - ror_id = v2_record['id'] + full_path = os.path.join(DATA['DIR'], ror_id.split('https://ror.org/')[1] + '.json') serializer = OrganizationSerializerV2(v2_record) json_obj = json.loads(JSONRenderer().render(serializer.data)) From 6efa1e48314e4ac7bf03add445c19fe0c415b7d3 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 4 Mar 2024 09:45:38 -0600 Subject: [PATCH 12/38] CSV create/update WIP --- rorapi/common/validation.py | 97 ++++++++++++++++++++++--------------- rorapi/common/views.py | 4 +- 2 files changed, 61 insertions(+), 40 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 648baa8..2093a74 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -6,6 +6,7 @@ import io import os import re +import shutil from datetime import datetime from iso639 import Lang from rest_framework.exceptions import ParseError @@ -24,6 +25,8 @@ NOW = datetime.now() +DIR_NAME = NOW.strftime("%Y-%m-%d-%H-%M-%S") + "-ror-records" + ADMIN = { "created": { "date": NOW.strftime("%Y-%m-%d"), @@ -354,7 +357,8 @@ def update_record_from_csv(csv_data, version): else: for d in delete_values: if d not in temp_domains: - errors.append("Attempting to delete dommain(s) that don't exist: {}".format(d)) + errors.append("Attempting to delete domain(s) that don't exist: {}".format(d)) + temp_domains = [d for d in temp_domains if d not in delete_values] print("temp domains delete") print(temp_domains) @@ -362,7 +366,7 @@ def 
update_record_from_csv(csv_data, version): add_values = actions_values[UPDATE_ACTIONS['ADD']] for a in add_values: if a in temp_domains: - errors.append("Attempting to add dommain(s) that already exist: {}".format(a)) + errors.append("Attempting to add domain(s) that already exist: {}".format(a)) print(add_values) temp_domains.extend(add_values) print("temp domains add") @@ -401,6 +405,7 @@ def update_record_from_csv(csv_data, version): temp_preferred = existing_ext_id_obj['preferred'] if len(existing_ext_ids_type) > 1: errors.append("Something is wrong. Multiple external ID objects with type ".format(t)) + # external_ids.all if csv_data['external_ids.type.' + t + '.all']: actions_values = get_actions_values(csv_data['external_ids.type.' + t + '.all']) @@ -430,7 +435,6 @@ def update_record_from_csv(csv_data, version): if UPDATE_ACTIONS['REPLACE'] in actions_values: temp_preferred = actions_values[UPDATE_ACTIONS['REPLACE']][0] - if (not temp_all) and temp_preferred is None: # remove all of type if not existing_ext_id_obj: @@ -438,6 +442,8 @@ def update_record_from_csv(csv_data, version): temp_ext_ids = [i for i in temp_ext_ids if i['type'] != t] else: + if not temp_preferred in temp_all: + errors.append("Changes to external ID object with type {} result in preferred value '{}' not in all values '{}'".format(t, temp_preferred, ", ".join(temp_all))) # remove all of type and replace with new obj new_ext_id_obj = { "type": t, @@ -695,7 +701,7 @@ def update_record_from_csv(csv_data, version): def new_record_from_csv(csv_data, version): v2_data = copy.deepcopy(V2_TEMPLATE) - + errors = None #domains if csv_data['domains']: v2_data['domains'] = [d.strip() for d in csv_data['domains'].split(';')] @@ -794,9 +800,37 @@ def new_record_from_csv(csv_data, version): errors = ErrorsSerializer(validation_errors).data return errors, new_record +def save_record_file(ror_id, updated, json_obj): + dir_path = os.path.join(DATA['DIR'],DIR_NAME) + if not os.path.exists(dir_path): + os.mkdir(dir_path) + subdir = 'updates' if updated else 'new' + if not os.path.exists(os.path.join(dir_path, subdir)): + os.mkdir(os.path.join(dir_path, subdir)) + full_path = os.path.join(dir_path, subdir, ror_id.split('https://ror.org/')[1] + '.json') + with open(full_path, "w") as outfile: + json.dump(json_obj, outfile, ensure_ascii=False, indent=2) + +def save_report_file(report, report_fields, csv_file): + dir_path = os.path.join(DATA['DIR'],DIR_NAME) + if not os.path.exists(dir_path): + os.mkdir(dir_path) + filepath = os.path.join(dir_path, 'report.csv') + with open(filepath, 'w') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=report_fields) + writer.writeheader() + writer.writerows(report) + # save copy of input file + filepath = os.path.join(dir_path, 'input.csv') + csv_file.seek(0) + with open(filepath, 'wb+') as f: + for chunk in csv_file.chunks(): + f.write(chunk) + def process_csv(csv_file, version): print("Processing CSV") errors = None + success_msg = None report = [] report_fields = ['row', 'ror_id', 'action', 'errors'] skipped_count = 0 @@ -808,54 +842,41 @@ def process_csv(csv_file, version): row_num = 2 for row in reader: ror_id = None + updated = False print("Row data") print(row) if row['id']: - action = 'updated' ror_id = row['id'] + updated = True errors, v2_record = update_record_from_csv(row, version) else: - action = 'updated' errors, v2_record = new_record_from_csv(row, version) if errors is None: + if updated: + action = 'updated' + updated_count += 1 + else: + action = 'created' + new_count += 1 
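# [Editor's note] Each pass through the loop below appends one entry per CSV
# row to the report that process_csv() writes out at the end. A minimal sketch
# of that entry's shape (report_row is an assumed helper name; the real loop
# builds the dict inline):
#
#   def report_row(row_num, ror_id, action, row_errors):
#       return {
#           "row": row_num,
#           "ror_id": ror_id or "",
#           "action": action,  # "created", "updated" or "skipped"
#           "errors": "; ".join(row_errors) if row_errors else "",
#       }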
ror_id = v2_record['id'] serializer = OrganizationSerializerV2(v2_record) json_obj = json.loads(JSONRenderer().render(serializer.data)) print(json_obj) + #create file + file = save_record_file(ror_id, updated, json_obj) else: action = 'skipped' + skipped_count += 1 print(errors) - - report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": errors}) + report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": "; ".join(errors)}) row_num += 1 print(report) - - ''' - - full_path = os.path.join(DATA['DIR'], ror_id.split('https://ror.org/')[1] + '.json') - serializer = OrganizationSerializerV2(v2_record) - json_obj = json.loads(JSONRenderer().render(serializer.data)) - with open(full_path, "w") as outfile: - json.dump(json_obj, outfile, ensure_ascii=False, indent=2) - ''' - ''' - if row['ror_id']: - row_error, updated_record = update_from_csv(row) - if row_error: - row_errors[row_num] = ror_error - skipped_count += 1 - else: - updated_count += 1 - else: - row_error, new_record = new_record_from_csv(row) - if row_error: - row_errors[row_num] = ror_error - skipped_count += 1 - else: - new_count +=1 - row_num += 1 - if len(ror_errors): - #create row errors csv - if updated_count > 0 or updated_count > 0 or skipped_count > 0: - # created zip - ''' \ No newline at end of file + if new_count > 0 or updated_count > 0 or skipped_count > 0: + #create report file + save_report_file(report, report_fields, csv_file) + # create zip file + zipfile = shutil.make_archive(os.path.join(DATA['DIR'],DIR_NAME), 'zip', DATA['DIR'], DIR_NAME) + + success_msg = {"zipfile": zipfile, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} + print(success_msg) + return errors, success_msg \ No newline at end of file diff --git a/rorapi/common/views.py b/rorapi/common/views.py index c197047..e1ec149 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -229,7 +229,7 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): file_object.seek(0) #full_path = os.path.join(DATA['DIR'], file_object.name) #save_file(file_object, full_path) - errors = validation.process_csv(file_object, version) + errors, msg = validation.process_csv(file_object, version) else: errors=Errors(csv_validation_errors) @@ -246,7 +246,7 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): ) return Response( - request.data, + msg, status=status.HTTP_201_CREATED ) From f7a0f35a9589b40c4358082981cf8c2b446ab9d5 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 7 Mar 2024 20:19:56 -0600 Subject: [PATCH 13/38] CSV create/update WIP --- rorapi/common/urls.py | 2 +- rorapi/common/validation.py | 112 ++++++++++++++++++++---------------- rorapi/common/views.py | 109 +++++++++-------------------------- 3 files changed, 91 insertions(+), 132 deletions(-) diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py index cdcb523..2d24274 100644 --- a/rorapi/common/urls.py +++ b/rorapi/common/urls.py @@ -13,7 +13,7 @@ url(r"^(?P(v1|v2))\/generateaddress\/(?P[0-9]+)", GenerateAddress.as_view()), path('generateaddress/', GenerateAddress.as_view()), url(r"^generateid$", GenerateId.as_view()), - path('indexdata/', IndexData.as_view()), + path(r"^(?P(v1|v2))\/indexdata/(?P.*)", IndexData.as_view()), url(r"^(?P(v1|v2))\/", include(views.organizations_router.urls)), url(r"^", include(views.organizations_router.urls)), url(r"^docs/", include_docs_urls(title="Research 
Organization Registry")), diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 2093a74..e77e27c 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -7,6 +7,7 @@ import os import re import shutil +import urllib from datetime import datetime from iso639 import Lang from rest_framework.exceptions import ParseError @@ -23,23 +24,19 @@ from rorapi.management.commands.generaterorid import check_ror_id -NOW = datetime.now() - -DIR_NAME = NOW.strftime("%Y-%m-%d-%H-%M-%S") + "-ror-records" - ADMIN = { "created": { - "date": NOW.strftime("%Y-%m-%d"), + "date": "", "schema_version": "2.0" }, "last_modified": { - "date": NOW.strftime("%Y-%m-%d"), + "date": "", "schema_version": "2.0" } } LAST_MOD = { - "date": NOW.strftime("%Y-%m-%d"), + "date": "", "schema_version": "2.0" } @@ -131,6 +128,7 @@ def update_record(json_input, existing_record): def update_last_mod(record): record['admin']['last_modified'] = copy.deepcopy(LAST_MOD) + record['admin']['last_modified']['date'] = datetime.now().strftime("%Y-%m-%d") return record def check_optional_fields(record): @@ -146,7 +144,10 @@ def add_missing_optional_fields(record): return record def add_created_last_mod(record): + today = datetime.now().strftime("%Y-%m-%d") record['admin'] = copy.deepcopy(ADMIN) + record['admin']['created']['date'] = today + record['admin']['last_modified']['date'] = today return record def update_locations(locations): @@ -462,7 +463,7 @@ def update_record_from_csv(csv_data, version): if csv_data['links.type.' + v]: updated_link_types.append(v) if updated_link_types: - temp_names = copy.deepcopy(existing_record['links']) + temp_links = copy.deepcopy(existing_record['links']) for t in updated_link_types: if csv_data['links.type.' + t]: actions_values = get_actions_values(csv_data['links.type.' 
+ t]) @@ -664,7 +665,7 @@ def update_record_from_csv(csv_data, version): if UPDATE_ACTIONS['DELETE'] in actions_values: errors.append("Cannot delete required field 'status'") if UPDATE_ACTIONS['REPLACE'] in actions_values: - update_data['status'] = actions_values[UPDATE_ACTIONS['REPLACE']][0] + update_data['status'] = actions_values[UPDATE_ACTIONS['REPLACE']][0].lower() #types if csv_data['types']: @@ -673,7 +674,7 @@ def update_record_from_csv(csv_data, version): print("initial temp types:") print(temp_types) if UPDATE_ACTIONS['DELETE'] in actions_values: - delete_values = actions_values[UPDATE_ACTIONS['DELETE']] + delete_values = [av.lower() for av in actions_values[UPDATE_ACTIONS['DELETE']]] for d in delete_values: if d not in temp_types: errors.append("Attempting to delete type(s) that don't exist: {}".format(d)) @@ -681,13 +682,13 @@ def update_record_from_csv(csv_data, version): errors.append("Cannot remove all values from required field 'types'") temp_types = [t for t in temp_types if t not in delete_values] if UPDATE_ACTIONS['ADD'] in actions_values: - add_values = actions_values[UPDATE_ACTIONS['ADD']] + add_values = [av.lower() for av in actions_values[UPDATE_ACTIONS['ADD']]] for a in add_values: if a in temp_types: errors.append("Attempting to add type(s) that already exist: {}".format(a)) temp_types.extend(add_values) if UPDATE_ACTIONS['REPLACE'] in actions_values: - temp_types = actions_values[UPDATE_ACTIONS['REPLACE']] + temp_types = [av.lower() for av in actions_values[UPDATE_ACTIONS['REPLACE']]] print("final temp types:") print(temp_types) update_data['types'] = temp_types @@ -701,10 +702,10 @@ def update_record_from_csv(csv_data, version): def new_record_from_csv(csv_data, version): v2_data = copy.deepcopy(V2_TEMPLATE) - errors = None + errors = [] #domains if csv_data['domains']: - v2_data['domains'] = [d.strip() for d in csv_data['domains'].split(';')] + v2_data['domains'] = [d.strip() for d in csv_data['domains'].strip(';').split(';')] #established if csv_data['established']: @@ -713,7 +714,7 @@ def new_record_from_csv(csv_data, version): #external ids for k,v in V2_EXTERNAL_ID_TYPES.items(): if csv_data['external_ids.type.' + v + '.all']: - all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].split(';')] + all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].strip(';').split(';')] ext_id_obj = { "type": v, "all": all_ids, @@ -724,7 +725,7 @@ def new_record_from_csv(csv_data, version): #links for k,v in V2_LINK_TYPES.items(): if csv_data['links.type.' + v]: - for l in csv_data['links.type.' + v].split(';'): + for l in csv_data['links.type.' + v].strip(';').split(';'): link_obj = { "type": v, "value": l.strip() @@ -733,7 +734,7 @@ def new_record_from_csv(csv_data, version): #locations if csv_data['locations.geonames_id']: - geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].split(';')] + geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].strip(';').split(';')] for geonames_id in geonames_ids: location_obj = { "geonames_id": geonames_id, @@ -745,9 +746,9 @@ def new_record_from_csv(csv_data, version): temp_names = [] for k,v in V2_NAME_TYPES.items(): if csv_data['names.types.' + v]: - for n in csv_data['names.types.' + v].split(';'): + for n in csv_data['names.types.' 
+ v].strip(';').split(';'): if LANG_DELIMITER in n: - name_val, lang_code = n.split("*") + name_val, lang = n.split("*") if lang: lang_errors, lang_code = get_lang_code(lang.strip()) if lang_errors: @@ -768,40 +769,42 @@ def new_record_from_csv(csv_data, version): dup_names = [] for n in name_values: if name_values.count(n) > 1: - dup_names.append(n) - dup_names_types = [] - for d in dup_names: - types = [] - for t in temp_names: - if t['value'] == d: - types.extend(t['types']) - name_obj = { - "types": types, - "value": d, - "lang": None - } - dup_names_types.append(name_obj) - temp_names = [t for t in temp_names if t['value'] not in dup_names] - temp_names.append(name_obj) + if n not in dup_names: + dup_names.append(n) + if dup_names: + dup_names_objs = [] + for d in dup_names: + types = [] + for t in temp_names: + if t['value'] == d: + types.extend(t['types']) + name_obj = { + "types": types, + "value": d, + "lang": None + } + dup_names_objs.append(name_obj) + temp_names = [t for t in temp_names if t['value'] not in dup_names] + temp_names.extend(dup_names_objs) print("temp names 2:") print(temp_names) v2_data['names'] = temp_names #status if csv_data['status']: - v2_data['status'] = csv_data['status'].strip() + v2_data['status'] = csv_data['status'].strip().lower() #types if csv_data['types']: - v2_data['types'] = [t.strip() for t in csv_data['types'].split(';')] + v2_data['types'] = [t.strip().lower() for t in csv_data['types'].strip(';').split(';')] validation_errors, new_record = new_record_from_json(v2_data, version) if validation_errors: errors = ErrorsSerializer(validation_errors).data return errors, new_record -def save_record_file(ror_id, updated, json_obj): - dir_path = os.path.join(DATA['DIR'],DIR_NAME) +def save_record_file(ror_id, updated, json_obj, dir_name): + dir_path = os.path.join(DATA['DIR'],dir_name) if not os.path.exists(dir_path): os.mkdir(dir_path) subdir = 'updates' if updated else 'new' @@ -811,8 +814,8 @@ def save_record_file(ror_id, updated, json_obj): with open(full_path, "w") as outfile: json.dump(json_obj, outfile, ensure_ascii=False, indent=2) -def save_report_file(report, report_fields, csv_file): - dir_path = os.path.join(DATA['DIR'],DIR_NAME) +def save_report_file(report, report_fields, csv_file, dir_name): + dir_path = os.path.join(DATA['DIR'],dir_name) if not os.path.exists(dir_path): os.mkdir(dir_path) filepath = os.path.join(dir_path, 'report.csv') @@ -829,6 +832,7 @@ def save_report_file(report, report_fields, csv_file): def process_csv(csv_file, version): print("Processing CSV") + dir_name = datetime.now().strftime("%Y-%m-%d-%H:%M:%S") + "-ror-records" errors = None success_msg = None report = [] @@ -848,10 +852,10 @@ def process_csv(csv_file, version): if row['id']: ror_id = row['id'] updated = True - errors, v2_record = update_record_from_csv(row, version) + row_errors, v2_record = update_record_from_csv(row, version) else: - errors, v2_record = new_record_from_csv(row, version) - if errors is None: + row_errors, v2_record = new_record_from_csv(row, version) + if not row_errors: if updated: action = 'updated' updated_count += 1 @@ -863,20 +867,28 @@ def process_csv(csv_file, version): json_obj = json.loads(JSONRenderer().render(serializer.data)) print(json_obj) #create file - file = save_record_file(ror_id, updated, json_obj) + file = save_record_file(ror_id, updated, json_obj, dir_name) else: action = 'skipped' skipped_count += 1 - print(errors) - report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": "; 
".join(errors)}) + #print(errors) + report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": row_errors if row_errors else ''}) row_num += 1 print(report) if new_count > 0 or updated_count > 0 or skipped_count > 0: #create report file - save_report_file(report, report_fields, csv_file) + save_report_file(report, report_fields, csv_file, dir_name) # create zip file - zipfile = shutil.make_archive(os.path.join(DATA['DIR'],DIR_NAME), 'zip', DATA['DIR'], DIR_NAME) + zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name) + # upload to S3 + try: + DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip') + s3_file = f"http://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip" + except Exception as e: + errors = e + print(errors) + - success_msg = {"zipfile": zipfile, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} + success_msg = {"file": s3_file, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} print(success_msg) - return errors, success_msg \ No newline at end of file + return success_msg \ No newline at end of file diff --git a/rorapi/common/views.py b/rorapi/common/views.py index e1ec149..50cc435 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -42,8 +42,29 @@ from rorapi.management.commands.generaterorid import check_ror_id from rorapi.management.commands.indexror import process_files +class OurTokenPermission(BasePermission): + """ + Allows access only to using our token and user name. + """ + + def has_permission(self, request, view): + has_permission = False + if request.method == 'GET': + has_permission = True + else: + header_token = request.headers.get("Token", None) + header_user = request.headers.get("Route-User", None) + user = os.environ.get("ROUTE_USER") + token = os.environ.get("TOKEN") + if header_token == token and header_user == user: + has_permission = True + + return has_permission + class OrganizationViewSet(viewsets.ViewSet): + #permission_classes = [OurTokenPermission] + lookup_value_regex = r"((https?(:\/\/|%3A%2F%2F))?ror\.org(\/|%2F))?.*" def list(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): @@ -92,6 +113,7 @@ def retrieve(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]): serializer = OrganizationSerializerV1(organization) return Response(serializer.data) + permission_classes = [OurTokenPermission] def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): errors = None if version == "v2": @@ -159,19 +181,6 @@ def get(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): return HttpResponse(status=500) -class OurTokenPermission(BasePermission): - """ - Allows access only to using our token and user name. 
- """ - - def has_permission(self, request, view): - header_token = request.headers.get("Token", None) - header_user = request.headers.get("Route-User", None) - user = os.environ.get("ROUTE_USER") - token = os.environ.get("TOKEN") - return header_token == token and header_user == user - - class GenerateAddress(APIView): permission_classes = [OurTokenPermission] @@ -184,7 +193,7 @@ def get(self, request, geonamesid, version=REST_FRAMEWORK["DEFAULT_VERSION"]): class GenerateId(APIView): - spermission_classes = [OurTokenPermission] + permission_classes = [OurTokenPermission] def get(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): id = check_ror_id(version) @@ -194,42 +203,31 @@ def get(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): class IndexData(APIView): permission_classes = [OurTokenPermission] - def get(self, request, branch): + def get(self, request, branch, version=REST_FRAMEWORK["DEFAULT_VERSION"]): st = 200 - msg = process_files(branch) + msg = process_files(branch, version) if msg["status"] == "ERROR": st = 400 return Response({"status": msg["status"], "msg": msg["msg"]}, status=st) -def save_file(file, full_path): - with open(full_path, 'wb+') as f: - for chunk in file.chunks(): - f.write(chunk) class FileUploadView(APIView): + #permission_classes = [OurTokenPermission] parser_classes = (MultiPartParser, FormParser) - #serializer_class = FileUploadSerializer def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): errors = None - #serializer = self.serializer_class(data=request.data) - #if serializer.is_valid(): - # you can access the file like this from serializer - # uploaded_file = serializer.validated_data["file"] - #serializer.save() if version == 'v2': if request.data: file_object = request.data['file'] mime_type = magic.from_buffer(file_object.read(2048)) print(mime_type) - if "ASCII text" in mime_type or "CSV text" in mime_type: + if "ASCII text" in mime_type or "UTF-8 Unicode text" in mime_type or "CSV text" in mime_type: file_object.seek(0) csv_validation_errors = validation.validate_csv(file_object) if len(csv_validation_errors) == 0: file_object.seek(0) - #full_path = os.path.join(DATA['DIR'], file_object.name) - #save_file(file_object, full_path) - errors, msg = validation.process_csv(file_object, version) + msg = validation.process_csv(file_object, version) else: errors=Errors(csv_validation_errors) @@ -259,54 +257,3 @@ def get(self, request, filename, **kwargs): response = HttpResponse(fh, content_type=mime_type) response['Content-Disposition'] = "attachment; filename=%s" % filename return response - - -class BulkCreateUpdate(APIView): - #permission_classes = [OurTokenPermission] - - ''' - def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): - errors = None - row_errors = {} - skipped_count = 0 - updated_count = 0 - new_count = 0 - if version == 'v2': - if request.data: - csv_errors = validate_csv(request.data) - if csv_errors: - errors = csv_errors - else: - with open(request.data, 'r') as csv: - row_num = 1 - for row in csv: - if row['ror_id']: - row_error, updated_record = update_from_csv(row) - if row_error: - row_errors[row_num] = ror_error - skipped_count += 1 - else: - updated_count += 1 - else: - row_error, new_record = create_from_csv(row) - if row_error: - row_errors[row_num] = ror_error - skipped_count += 1 - else: - new_count +=1 - row_num += 1 - if len(ror_errors): - #create row errors csv - if updated_count > 0 or updated_count > 0 or skipped_count > 0: - # created zip - else: - errors = Errors(["Could 
not processs request. No CSV file included in request."]) - else: - errors = Errors(["Version {} does not support creating records".format(version)]) - if errors is not None: - print(errors) - return Response( - ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST - ) - return Response(zippedfile) - ''' From b89800be4dc83c4e7a34315b95ef35d699e8af27 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Thu, 7 Mar 2024 20:20:24 -0600 Subject: [PATCH 14/38] add support for v2 in incremental indexing --- rorapi/management/commands/indexror.py | 17 +++++++++++------ rorapi/settings.py | 1 + 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/rorapi/management/commands/indexror.py b/rorapi/management/commands/indexror.py index 59c43b4..91796ad 100644 --- a/rorapi/management/commands/indexror.py +++ b/rorapi/management/commands/indexror.py @@ -88,7 +88,7 @@ def get_data(): return contents, err -def process_files(dir): +def process_files(dir, version): err = [] if dir: path = os.path.join(DATA['WORKING_DIR'], dir) @@ -104,7 +104,7 @@ def process_files(dir): if path and file and not(e): data, e = prepare_files(path, file) if not(e): - index_error = index(data) + index_error = index(data, version) err.append(index_error) else: err.append(e) @@ -116,14 +116,17 @@ def process_files(dir): if err: msg = {"status": "ERROR", "msg": err} else: - msg = {"status": "OK", "msg": f"{dir} indexed"} + msg = {"status": "OK", "msg": f"{dir} indexed using version {version}"} return msg -def index(dataset): +def index(dataset, version): err = {} - index = ES_VARS['INDEX_V1'] + if version == 'v2': + index = ES_VARS['INDEX_V2'] + else: + index = ES_VARS['INDEX_V1'] backup_index = '{}-tmp'.format(index) ES7.reindex(body={ 'source': { @@ -171,9 +174,11 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('dir', type=str, help='add directory name for S3 bucket to be processed') + parser.add_argument('version', type=str, help='schema version of files to be processed') def handle(self,*args, **options): dir = options['dir'] - process_files(dir) + version = options['version'] + process_files(dir, version) diff --git a/rorapi/settings.py b/rorapi/settings.py index c39bd57..a8f3504 100644 --- a/rorapi/settings.py +++ b/rorapi/settings.py @@ -254,6 +254,7 @@ DATA = {} DATA['DATA_STORE'] = os.environ.get('DATA_STORE', None) +DATA['PUBLIC_STORE'] = os.environ.get('PUBLIC_STORE', None) DATA['WORKING_DIR'] = os.path.join(BASE_DIR, 'rorapi', 'data', '') if DATA['DATA_STORE']: From 68975f67f1d77bfc11f9610707c2d055537a1733 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Fri, 8 Mar 2024 12:21:20 -0600 Subject: [PATCH 15/38] add creds for put/post requests --- rorapi/common/validation.py | 9 +++++---- rorapi/common/views.py | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index e77e27c..2dc2d85 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -489,7 +489,7 @@ def update_record_from_csv(csv_data, version): } temp_links.append(link_obj) if UPDATE_ACTIONS['REPLACE'] in actions_values: - temp_links = [] + temp_links = [l for l in temp_links if l['type'] != t] for r in actions_values[UPDATE_ACTIONS['REPLACE']]: link_obj = { "type": t, @@ -881,14 +881,15 @@ def process_csv(csv_file, version): # create zip file zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name) # upload to S3 + ''' try: DATA['CLIENT'].upload_file(zipfile, 
DATA['PUBLIC_STORE'], dir_name + '.zip') - s3_file = f"http://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip" + zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip" except Exception as e: errors = e print(errors) + ''' - - success_msg = {"file": s3_file, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} + success_msg = {"file": zipfile, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} print(success_msg) return success_msg \ No newline at end of file diff --git a/rorapi/common/views.py b/rorapi/common/views.py index 50cc435..5404fc7 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -63,7 +63,7 @@ def has_permission(self, request, view): class OrganizationViewSet(viewsets.ViewSet): - #permission_classes = [OurTokenPermission] + permission_classes = [OurTokenPermission] lookup_value_regex = r"((https?(:\/\/|%3A%2F%2F))?ror\.org(\/|%2F))?.*" @@ -212,7 +212,7 @@ def get(self, request, branch, version=REST_FRAMEWORK["DEFAULT_VERSION"]): class FileUploadView(APIView): - #permission_classes = [OurTokenPermission] + permission_classes = [OurTokenPermission] parser_classes = (MultiPartParser, FormParser) def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): From 7a4cde6c123a9c057bd729ad39a4ac8aa6b60491 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Fri, 8 Mar 2024 14:18:27 -0600 Subject: [PATCH 16/38] sort list fields in v2 --- rorapi/common/validation.py | 29 ++++++++++++++++++++++++++++- rorapi/v2/models.py | 17 ++++++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/rorapi/common/validation.py b/rorapi/common/validation.py index 2dc2d85..bec5af7 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/validation.py @@ -16,6 +16,10 @@ from rorapi.common.models import Errors import update_address as ua from rorapi.settings import DATA +from rorapi.v2.models import ( + Organization as OrganizationV2, + ListResult as ListResultV2 +) from rorapi.v2.serializers import ( OrganizationSerializer as OrganizationSerializerV2 ) @@ -120,6 +124,16 @@ UPDATE_DELIMITER = "==" +SORT_KEYS = { + "domains": None, + "external_ids": "type", + "links": "type", + "locations": "geonames_id", + "names": "value", + "relationships": "type", + "types": None +} + def update_record(json_input, existing_record): record = copy.deepcopy(existing_record) for k, v in json_input.items(): @@ -177,6 +191,18 @@ def get_lang_code(lang_string): error = e.msg return error, lang_code +def sort_list_fields(v2_record): + for field in v2_record: + if field in SORT_KEYS: + if SORT_KEYS[field] is not None: + sort_key = SORT_KEYS[field] + sorted_vals = sorted(v2_record[field], key=lambda x: x[sort_key]) + else: + sorted_vals = v2_record[field].sort() + v2_record[field] = sorted_vals + return v2_record + + def get_file_from_url(url): rsp = requests.get(url) rsp.raise_for_status() @@ -863,7 +889,8 @@ def process_csv(csv_file, version): action = 'created' new_count += 1 ror_id = v2_record['id'] - serializer = OrganizationSerializerV2(v2_record) + sorted_record = sort_list_fields(v2_record) + serializer = OrganizationSerializerV2(sorted_record) json_obj = json.loads(JSONRenderer().render(serializer.data)) print(json_obj) #create file diff --git a/rorapi/v2/models.py b/rorapi/v2/models.py index fd13fce..2cacbfb 100644 --- 
a/rorapi/v2/models.py +++ b/rorapi/v2/models.py @@ -48,18 +48,25 @@ def __init__(self, data): if "_source" in data: data = data["_source"] super(Organization, self).__init__( - data, ["domains", "established", "id", "types", "status"] + data, ["established", "id", "status"] ) self.admin = Admin(data.admin) + self.domains = data.domains.sort() + sorted_ext_ids = sorted(data.external_ids, key=lambda x: x['type']) self.external_ids = [ - Entity(e, ["type", "preferred", "all"]) for e in data.external_ids + Entity(e, ["type", "preferred", "all"]) for e in sorted_ext_ids ] - self.links = [Entity(l, ["value", "type"]) for l in data.links] - self.locations = [Location(l) for l in data.locations] - self.names = [Entity(n, ["value", "lang", "types"]) for n in data.names] + sorted_links = sorted(data.links, key=lambda x: x['type']) + self.links = [Entity(l, ["value", "type"]) for l in sorted_links] + sorted_locations = sorted(data.locations, key=lambda x: x['geonames_id']) + self.locations = [Location(l) for l in sorted_locations] + sorted_names = sorted(data.names, key=lambda x: x['value']) + self.names = [Entity(n, ["value", "lang", "types"]) for n in sorted_names] + sorted_rels = sorted(data.relationships, key=lambda x: x['type']) self.relationships = [ Entity(r, ["type", "label", "id"]) for r in data.relationships ] + self.types = data.types.sort() class ListResult: From f6d21dfc5820ca490ee2558ea4c66d7a0312a05f Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Fri, 8 Mar 2024 17:26:18 -0600 Subject: [PATCH 17/38] refactor code --- rorapi/common/create_update.py | 101 ++++ rorapi/common/csv_bulk.py | 106 ++++ rorapi/common/csv_create.py | 110 ++++ .../common/{validation.py => csv_update.py} | 550 +----------------- rorapi/common/csv_utils.py | 122 ++++ rorapi/common/record_utils.py | 38 ++ rorapi/common/views.py | 19 +- rorapi/v2/models.py | 6 +- rorapi/v2/record_constants.py | 66 +++ 9 files changed, 559 insertions(+), 559 deletions(-) create mode 100644 rorapi/common/create_update.py create mode 100644 rorapi/common/csv_bulk.py create mode 100644 rorapi/common/csv_create.py rename rorapi/common/{validation.py => csv_update.py} (53%) create mode 100644 rorapi/common/csv_utils.py create mode 100644 rorapi/common/record_utils.py create mode 100644 rorapi/v2/record_constants.py diff --git a/rorapi/common/create_update.py b/rorapi/common/create_update.py new file mode 100644 index 0000000..aefbc8d --- /dev/null +++ b/rorapi/common/create_update.py @@ -0,0 +1,101 @@ +import copy +from datetime import datetime +from rorapi.common.models import Errors +from rorapi.common.record_utils import * +import update_address as ua +from rorapi.v2.record_constants import * +from rorapi.v2.serializers import ( + OrganizationSerializer as OrganizationSerializerV2 +) +from rorapi.management.commands.generaterorid import check_ror_id + +V2_SCHEMA = get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json") + + +def update_record(json_input, existing_record): + record = copy.deepcopy(existing_record) + for k, v in json_input.items(): + record[k] = copy.deepcopy(v) + return update_last_mod(record) + +def update_last_mod(record): + record['admin']['last_modified'] = copy.deepcopy(V2_LAST_MOD) + record['admin']['last_modified']['date'] = datetime.now().strftime("%Y-%m-%d") + return record + +def check_optional_fields(record): + for k in V2_OPTIONAL_FIELD_DEFAULTS: + if k not in record: + return True + return False + +def add_missing_optional_fields(record): + for k, v in 
V2_OPTIONAL_FIELD_DEFAULTS.items(): + if k not in record: + record[k] = v + return record + +def add_created_last_mod(record): + today = datetime.now().strftime("%Y-%m-%d") + record['admin'] = copy.deepcopy(V2_ADMIN) + record['admin']['created']['date'] = today + record['admin']['last_modified']['date'] = today + return record + +def update_locations(locations): + errors = [] + updated_locations = [] + for location in locations: + if 'geonames_id' in location: + try: + print(location['geonames_id']) + updated_location = ua.new_geonames_v2(str(location['geonames_id'])) + updated_locations.append(updated_location['location']) + except: + errors.append("Error retrieving Geonames data for ID {}. Please check that this is a valid Geonames ID".format(location['geonames_id'])) + return errors, updated_locations + +def sort_list_fields(v2_record): + for field in v2_record: + if field in V2_SORT_KEYS: + if V2_SORT_KEYS[field] is not None: + sort_key = V2_SORT_KEYS[field] + sorted_vals = sorted(v2_record[field], key=lambda x: x[sort_key]) + else: + sorted_vals = sorted(v2_record[field]) + v2_record[field] = sorted_vals + return v2_record + + +def new_record_from_json(json_input, version): + errors = None + valid_data = None + new_record = copy.deepcopy(json_input) + if check_optional_fields(new_record): + new_record = add_missing_optional_fields(new_record) + location_errors, updated_locations = update_locations(new_record['locations']) + if location_errors: + errors = Errors(location_errors) + else: + new_record['locations'] = updated_locations + new_record = add_created_last_mod(new_record) + new_ror_id = check_ror_id(version) + print("new ror id: " + new_ror_id) + new_record['id'] = new_ror_id + errors, valid_data = validate_record(sort_list_fields(new_record), V2_SCHEMA) + return errors, valid_data + + +def update_record_from_json(new_json, existing_org): + errors = None + valid_data = None + serializer = OrganizationSerializerV2(existing_org) + existing_record = serializer.data + updated_record = update_record(new_json, existing_record) + location_errors, updated_locations = update_locations(updated_record['locations']) + if location_errors: + errors = Errors(location_errors) + else: + updated_record['locations'] = updated_locations + errors, valid_data = validate_record(sort_list_fields(updated_record), V2_SCHEMA) + return errors, valid_data diff --git a/rorapi/common/csv_bulk.py b/rorapi/common/csv_bulk.py new file mode 100644 index 0000000..22bb2b3 --- /dev/null +++ b/rorapi/common/csv_bulk.py @@ -0,0 +1,106 @@ +import csv +import json +import io +import os +import shutil +import urllib +from datetime import datetime +from rest_framework.renderers import JSONRenderer +from rorapi.settings import DATA +from rorapi.v2.serializers import ( + OrganizationSerializer as OrganizationSerializerV2 +) +from rorapi.common.csv_update import update_record_from_csv +from rorapi.common.csv_create import new_record_from_csv + + +def save_record_file(ror_id, updated, json_obj, dir_name): + dir_path = os.path.join(DATA['DIR'],dir_name) + if not os.path.exists(dir_path): + os.mkdir(dir_path) + subdir = 'updates' if updated else 'new' + if not os.path.exists(os.path.join(dir_path, subdir)): + os.mkdir(os.path.join(dir_path, subdir)) + full_path = os.path.join(dir_path, subdir, ror_id.split('https://ror.org/')[1] + '.json') + with open(full_path, "w") as outfile: + json.dump(json_obj, outfile, ensure_ascii=False, indent=2) + +def save_report_file(report, report_fields, csv_file, dir_name): + dir_path = 
os.path.join(DATA['DIR'],dir_name) + if not os.path.exists(dir_path): + os.mkdir(dir_path) + filepath = os.path.join(dir_path, 'report.csv') + with open(filepath, 'w') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=report_fields) + writer.writeheader() + writer.writerows(report) + # save copy of input file + filepath = os.path.join(dir_path, 'input.csv') + csv_file.seek(0) + with open(filepath, 'wb+') as f: + for chunk in csv_file.chunks(): + f.write(chunk) + +def process_csv(csv_file, version): + print("Processing CSV") + dir_name = datetime.now().strftime("%Y-%m-%d-%H:%M:%S") + "-ror-records" + errors = None + success_msg = None + report = [] + report_fields = ['row', 'ror_id', 'action', 'errors'] + skipped_count = 0 + updated_count = 0 + new_count = 0 + read_file = csv_file.read().decode('utf-8') + print(read_file) + reader = csv.DictReader(io.StringIO(read_file)) + row_num = 2 + for row in reader: + ror_id = None + updated = False + print("Row data") + print(row) + if row['id']: + ror_id = row['id'] + updated = True + row_errors, v2_record = update_record_from_csv(row, version) + else: + row_errors, v2_record = new_record_from_csv(row, version) + if not row_errors: + if updated: + action = 'updated' + updated_count += 1 + else: + action = 'created' + new_count += 1 + ror_id = v2_record['id'] + serializer = OrganizationSerializerV2(v2_record) + json_obj = json.loads(JSONRenderer().render(serializer.data)) + print(json_obj) + #create file + file = save_record_file(ror_id, updated, json_obj, dir_name) + else: + action = 'skipped' + skipped_count += 1 + #print(errors) + report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": row_errors if row_errors else ''}) + row_num += 1 + print(report) + if new_count > 0 or updated_count > 0 or skipped_count > 0: + #create report file + save_report_file(report, report_fields, csv_file, dir_name) + # create zip file + zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name) + # upload to S3 + ''' + try: + DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip') + zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip" + except Exception as e: + errors = e + print(errors) + ''' + + success_msg = {"file": zipfile, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} + print(success_msg) + return success_msg \ No newline at end of file diff --git a/rorapi/common/csv_create.py b/rorapi/common/csv_create.py new file mode 100644 index 0000000..a0cd121 --- /dev/null +++ b/rorapi/common/csv_create.py @@ -0,0 +1,110 @@ +import copy +from rorapi.common.record_utils import * +from rorapi.common.csv_utils import * +from rorapi.v2.record_constants import * +from rorapi.common.serializers import ErrorsSerializer +from rorapi.common.create_update import new_record_from_json + + +def new_record_from_csv(csv_data, version): + v2_data = copy.deepcopy(V2_TEMPLATE) + errors = [] + #domains + if csv_data['domains']: + v2_data['domains'] = [d.strip() for d in csv_data['domains'].strip(';').split(';')] + + #established + if csv_data['established']: + v2_data['established'] = int(csv_data['established'].strip()) + + #external ids + for k,v in V2_EXTERNAL_ID_TYPES.items(): + if csv_data['external_ids.type.' + v + '.all']: + all_ids = [i.strip() for i in csv_data['external_ids.type.' 
+ v + '.all'].strip(';').split(';')] + ext_id_obj = { + "type": v, + "all": all_ids, + "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0] + } + v2_data['external_ids'].append(ext_id_obj) + + #links + for k,v in V2_LINK_TYPES.items(): + if csv_data['links.type.' + v]: + for l in csv_data['links.type.' + v].strip(';').split(';'): + link_obj = { + "type": v, + "value": l.strip() + } + v2_data['links'].append(link_obj) + + #locations + if csv_data['locations.geonames_id']: + geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].strip(';').split(';')] + for geonames_id in geonames_ids: + location_obj = { + "geonames_id": geonames_id, + "geonames_details": {} + } + v2_data['locations'].append(location_obj) + + #names + temp_names = [] + for k,v in V2_NAME_TYPES.items(): + if csv_data['names.types.' + v]: + for n in csv_data['names.types.' + v].strip(';').split(';'): + if LANG_DELIMITER in n: + name_val, lang = n.split("*") + if lang: + lang_errors, lang_code = get_lang_code(lang.strip()) + if lang_errors: + errors.append("Could not convert language value to ISO code: {}".format(lang)) + else: + name_val = n + lang_code = None + + name_obj = { + "types": [v], + "value": name_val.strip(), + "lang": lang_code + } + temp_names.append(name_obj) + print("temp names 1:") + print(temp_names) + name_values = [n['value'] for n in temp_names] + dup_names = [] + for n in name_values: + if name_values.count(n) > 1: + if n not in dup_names: + dup_names.append(n) + if dup_names: + dup_names_objs = [] + for d in dup_names: + types = [] + for t in temp_names: + if t['value'] == d: + types.extend(t['types']) + name_obj = { + "types": types, + "value": d, + "lang": None + } + dup_names_objs.append(name_obj) + temp_names = [t for t in temp_names if t['value'] not in dup_names] + temp_names.extend(dup_names_objs) + print("temp names 2:") + print(temp_names) + v2_data['names'] = temp_names + + #status + if csv_data['status']: + v2_data['status'] = csv_data['status'].strip().lower() + + #types + if csv_data['types']: + v2_data['types'] = [t.strip().lower() for t in csv_data['types'].strip(';').split(';')] + + validation_errors, new_record = new_record_from_json(v2_data, version) + if validation_errors: + errors = ErrorsSerializer(validation_errors).data + return errors, new_record \ No newline at end of file diff --git a/rorapi/common/validation.py b/rorapi/common/csv_update.py similarity index 53% rename from rorapi/common/validation.py rename to rorapi/common/csv_update.py index bec5af7..c457e0f 100644 --- a/rorapi/common/validation.py +++ b/rorapi/common/csv_update.py @@ -1,355 +1,13 @@ -import jsonschema -import requests import copy -import csv -import json -import io -import os -import re -import shutil -import urllib -from datetime import datetime -from iso639 import Lang -from rest_framework.exceptions import ParseError -from rest_framework.parsers import JSONParser -from rest_framework.renderers import JSONRenderer -from rorapi.common.models import Errors -import update_address as ua -from rorapi.settings import DATA -from rorapi.v2.models import ( - Organization as OrganizationV2, - ListResult as ListResultV2 -) +from rorapi.common.record_utils import * +from rorapi.v2.record_constants import * +from rorapi.common.csv_utils import * from rorapi.v2.serializers import ( OrganizationSerializer as OrganizationSerializerV2 ) -from rorapi.common.queries import get_ror_id, retrieve_organization +from 
rorapi.common.queries import retrieve_organization from rorapi.common.serializers import ErrorsSerializer - -from rorapi.management.commands.generaterorid import check_ror_id - -ADMIN = { - "created": { - "date": "", - "schema_version": "2.0" - }, - "last_modified": { - "date": "", - "schema_version": "2.0" - } -} - -LAST_MOD = { - "date": "", - "schema_version": "2.0" -} - -OPTIONAL_FIELD_DEFAULTS = { - "domains": [], - "established": None, - "external_ids": [], - "links": [], - "relationships": [] -} - -UPDATE_ACTIONS = { - "ADD": "add", - "DELETE": "delete", - "REPLACE": "replace" -} - -UPDATE_ACTIONS_MULTI = [UPDATE_ACTIONS["ADD"], UPDATE_ACTIONS["DELETE"], UPDATE_ACTIONS["REPLACE"]] - -UPDATE_ACTIONS_SINGLE = [UPDATE_ACTIONS["DELETE"], UPDATE_ACTIONS["REPLACE"]] - -NO_DELETE_FIELDS = ["id", "locations.geonames_id", "names.types.ror_display", "status", "types"] - -CSV_REQUIRED_FIELDS_ACTIONS = { - "id": None, - "domains": UPDATE_ACTIONS_MULTI, - "established": UPDATE_ACTIONS_SINGLE, - "external_ids.type.fundref.all": UPDATE_ACTIONS_MULTI, - "external_ids.type.fundref.preferred": UPDATE_ACTIONS_SINGLE, - "external_ids.type.grid.all": UPDATE_ACTIONS_MULTI, - "external_ids.type.grid.preferred": UPDATE_ACTIONS_SINGLE, - "external_ids.type.isni.all": UPDATE_ACTIONS_MULTI, - "external_ids.type.isni.preferred": UPDATE_ACTIONS_SINGLE, - "external_ids.type.wikidata.all": UPDATE_ACTIONS_MULTI, - "external_ids.type.wikidata.preferred": UPDATE_ACTIONS_SINGLE, - "links.type.website": UPDATE_ACTIONS_MULTI, - "links.type.wikipedia": UPDATE_ACTIONS_MULTI, - "locations.geonames_id": UPDATE_ACTIONS_MULTI, - "names.types.acronym": UPDATE_ACTIONS_MULTI, - "names.types.alias": UPDATE_ACTIONS_MULTI, - "names.types.label": UPDATE_ACTIONS_MULTI, - "names.types.ror_display": [UPDATE_ACTIONS["REPLACE"]], - "status": [UPDATE_ACTIONS["REPLACE"]], - "types": UPDATE_ACTIONS_MULTI -} - -V2_TEMPLATE = { - "locations": [], - "established": None, - "external_ids": [], - "id": "", - "domains": [], - "links": [], - "names": [], - "relationships": [], - "status": "", - "types": [], - "admin": {} -} - -V2_EXTERNAL_ID_TYPES = { - "FUNDREF" : "fundref", - "GRID" : "grid", - "ISNI" : "isni", - "WIKIDATA" : "wikidata" - } - -V2_LINK_TYPES = { - "WEBSITE" : "website", - "WIKIPEDIA" : "wikipedia" - } - -V2_NAME_TYPES = { - "ACRONYM" : "acronym", - "ALIAS" : "alias", - "LABEL" : "label", - "ROR_DISPLAY" : "ror_display" - } - -LANG_DELIMITER = "*" - -UPDATE_DELIMITER = "==" - -SORT_KEYS = { - "domains": None, - "external_ids": "type", - "links": "type", - "locations": "geonames_id", - "names": "value", - "relationships": "type", - "types": None -} - -def update_record(json_input, existing_record): - record = copy.deepcopy(existing_record) - for k, v in json_input.items(): - record[k] = copy.deepcopy(v) - return record - -def update_last_mod(record): - record['admin']['last_modified'] = copy.deepcopy(LAST_MOD) - record['admin']['last_modified']['date'] = datetime.now().strftime("%Y-%m-%d") - return record - -def check_optional_fields(record): - for k in OPTIONAL_FIELD_DEFAULTS: - if k not in record: - return True - return False - -def add_missing_optional_fields(record): - for k, v in OPTIONAL_FIELD_DEFAULTS.items(): - if k not in record: - record[k] = v - return record - -def add_created_last_mod(record): - today = datetime.now().strftime("%Y-%m-%d") - record['admin'] = copy.deepcopy(ADMIN) - record['admin']['created']['date'] = today - record['admin']['last_modified']['date'] = today - return record - -def 
update_locations(locations): - errors = [] - updated_locations = [] - for location in locations: - if 'geonames_id' in location: - try: - print(location['geonames_id']) - updated_location = ua.new_geonames_v2(str(location['geonames_id'])) - updated_locations.append(updated_location['location']) - except: - errors.append("Error retrieving Geonames data for ID {}. Please check that this is a valid Geonames ID".format(location['geonames_id'])) - return errors, updated_locations - -def get_lang_code(lang_string): - lang_code = None - error = None - if len(lang_string) == 2: - lang_string = lang_string.lower() - else: - lang_string = lang_string.title() - try: - lg = Lang(lang_string) - lang_code = lg.pt1 - except Exception as e: - error = e.msg - return error, lang_code - -def sort_list_fields(v2_record): - for field in v2_record: - if field in SORT_KEYS: - if SORT_KEYS[field] is not None: - sort_key = SORT_KEYS[field] - sorted_vals = sorted(v2_record[field], key=lambda x: x[sort_key]) - else: - sorted_vals = v2_record[field].sort() - v2_record[field] = sorted_vals - return v2_record - - -def get_file_from_url(url): - rsp = requests.get(url) - rsp.raise_for_status() - return rsp.json() - -def validate_v2(data): - errors = [] - schema = get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json") - try: - print("validating data:") - print(data) - jsonschema.validate(data, schema) - except jsonschema.ValidationError as error: - errors.append(error) - print(errors) - return Errors(errors), None - else: - return None, data - -def validate_csv(csv_file): - errors = [] - try: - read_file = csv_file.read().decode('utf-8') - reader = csv.DictReader(io.StringIO(read_file)) - rowcount = 0 - for row in reader: - rowcount += 1 - if rowcount > 0: - csv_fields = reader.fieldnames - missing_fields = [] - for field in CSV_REQUIRED_FIELDS_ACTIONS.keys(): - if field not in csv_fields: - missing_fields.append(field) - print(missing_fields) - if missing_fields: - errors.append(f'CSV file is missing columns: {", ".join(missing_fields)}') - else: - errors.append("CSV file contains no data rows") - except IOError as e: - errors.append(f"Error parsing CSV file: {e}") - print(errors) - return errors - - -def new_record_from_json(json_input, version): - errors = None - valid_data = None - new_record = copy.deepcopy(json_input) - if check_optional_fields(new_record): - new_record = add_missing_optional_fields(new_record) - location_errors, updated_locations = update_locations(new_record['locations']) - if location_errors: - errors = Errors(location_errors) - else: - new_record['locations'] = updated_locations - new_record = add_created_last_mod(new_record) - new_ror_id = check_ror_id(version) - print("new ror id: " + new_ror_id) - new_record['id'] = new_ror_id - errors, valid_data = validate_v2(new_record) - return errors, valid_data - - -def update_record_from_json(new_json, existing_org): - errors = None - valid_data = None - serializer = OrganizationSerializerV2(existing_org) - existing_record = serializer.data - updated_record = update_record(new_json, existing_record) - location_errors, updated_locations = update_locations(updated_record['locations']) - if location_errors: - errors = Errors(location_errors) - else: - updated_record['locations'] = updated_locations - errors, valid_data = validate_v2(updated_record) - return errors, valid_data - - -def get_action_value(csv_field): - action = None - value = None - if csv_field.lower() == "delete": - action = "delete" - 
value = None - elif UPDATE_DELIMITER in csv_field: - action = csv_field.split(UPDATE_DELIMITER)[0] - value = csv_field.split(UPDATE_DELIMITER)[1] - else: - action = "replace" - value = csv_field - return action, value - -def get_actions_values(csv_field): - print("getting actions values:") - actions_values = {} - if csv_field.lower() == UPDATE_ACTIONS["DELETE"]: - actions_values[UPDATE_ACTIONS["DELETE"]] = None - elif UPDATE_DELIMITER in csv_field: - for ua in list(UPDATE_ACTIONS.values()): - print(ua) - if ua + UPDATE_DELIMITER in csv_field: - print("doing regex:") - regex = r"(" + re.escape( - ua + UPDATE_DELIMITER) + r")(.*?)(?=$|(add|delete|replace)==)" - result = re.search(regex, csv_field) - print(result[0]) - temp_val = result[0].replace(ua + UPDATE_DELIMITER, '') - print("temp val:") - print(temp_val) - actions_values[ua] = [v.strip() for v in temp_val.split(';') if v] - #csv_field.replace(result[0], '') - - else: - actions_values[UPDATE_ACTIONS["REPLACE"]] = [v.strip() for v in csv_field.split(';') if v] - print(actions_values) - return actions_values - -def validate_csv_row_update_syntax(csv_data): - print("validating row") - errors = [] - for k, v in csv_data.items(): - if UPDATE_DELIMITER in v: - print("field:") - print(k) - print("value:") - print(v) - actions_values = get_actions_values(v) - print("actions values:") - print(actions_values) - update_actions = list(actions_values.keys()) - if not update_actions: - errors.append("Update delimiter '{}' found in '{}' field but no valid update action found in value {}".format(UPDATE_DELIMITER, k, v)) - if len(update_actions) > 2: - errors.append("{} update actions '{}' found in '{}' field but only 2 are allowed".format(str(len(update_actions)), ", ".join(update_actions), k)) - if len(update_actions) == 2: - if not (UPDATE_ACTIONS['ADD'] and UPDATE_ACTIONS['DELETE']) in update_actions: - errors.append("Invalid combination of update actions '{}' found in '{}' field.".format(", ".join(update_actions), k)) - disallowed_actions = [ua for ua in update_actions if ua not in CSV_REQUIRED_FIELDS_ACTIONS[k]] - print("allowed actions:") - print(CSV_REQUIRED_FIELDS_ACTIONS[k]) - print("disallowed actions:") - print(disallowed_actions) - if disallowed_actions: - errors.append("Invalid update action(s) '{}' found in {} field. Allowed actions for this field are '{}'".format(", ".join(disallowed_actions), k, ", ".join(CSV_REQUIRED_FIELDS_ACTIONS[k]))) - if v.strip() == UPDATE_ACTIONS['DELETE'].lower() and k in NO_DELETE_FIELDS: - errors.append("Invalid update action '{}' in {} field. Cannot remove all values from a required field.".format(UPDATE_ACTIONS['DELETE'], k)) - return errors +from rorapi.common.create_update import update_record_from_json def update_record_from_csv(csv_data, version): errors = [] @@ -723,200 +381,4 @@ def update_record_from_csv(csv_data, version): validation_errors, updated_record = update_record_from_json(update_data, existing_record) if validation_errors: errors = ErrorsSerializer(validation_errors).data - return errors, updated_record - - -def new_record_from_csv(csv_data, version): - v2_data = copy.deepcopy(V2_TEMPLATE) - errors = [] - #domains - if csv_data['domains']: - v2_data['domains'] = [d.strip() for d in csv_data['domains'].strip(';').split(';')] - - #established - if csv_data['established']: - v2_data['established'] = int(csv_data['established'].strip()) - - #external ids - for k,v in V2_EXTERNAL_ID_TYPES.items(): - if csv_data['external_ids.type.' 
+ v + '.all']: - all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].strip(';').split(';')] - ext_id_obj = { - "type": v, - "all": all_ids, - "preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0] - } - v2_data['external_ids'].append(ext_id_obj) - - #links - for k,v in V2_LINK_TYPES.items(): - if csv_data['links.type.' + v]: - for l in csv_data['links.type.' + v].strip(';').split(';'): - link_obj = { - "type": v, - "value": l.strip() - } - v2_data['links'].append(link_obj) - - #locations - if csv_data['locations.geonames_id']: - geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].strip(';').split(';')] - for geonames_id in geonames_ids: - location_obj = { - "geonames_id": geonames_id, - "geonames_details": {} - } - v2_data['locations'].append(location_obj) - - #names - temp_names = [] - for k,v in V2_NAME_TYPES.items(): - if csv_data['names.types.' + v]: - for n in csv_data['names.types.' + v].strip(';').split(';'): - if LANG_DELIMITER in n: - name_val, lang = n.split("*") - if lang: - lang_errors, lang_code = get_lang_code(lang.strip()) - if lang_errors: - errors.append("Could not convert language value to ISO code: {}".format(lang)) - else: - name_val = n - lang_code = None - - name_obj = { - "types": [v], - "value": name_val.strip(), - "lang": lang_code - } - temp_names.append(name_obj) - print("temp names 1:") - print(temp_names) - name_values = [n['value'] for n in temp_names] - dup_names = [] - for n in name_values: - if name_values.count(n) > 1: - if n not in dup_names: - dup_names.append(n) - if dup_names: - dup_names_objs = [] - for d in dup_names: - types = [] - for t in temp_names: - if t['value'] == d: - types.extend(t['types']) - name_obj = { - "types": types, - "value": d, - "lang": None - } - dup_names_objs.append(name_obj) - temp_names = [t for t in temp_names if t['value'] not in dup_names] - temp_names.extend(dup_names_objs) - print("temp names 2:") - print(temp_names) - v2_data['names'] = temp_names - - #status - if csv_data['status']: - v2_data['status'] = csv_data['status'].strip().lower() - - #types - if csv_data['types']: - v2_data['types'] = [t.strip().lower() for t in csv_data['types'].strip(';').split(';')] - - validation_errors, new_record = new_record_from_json(v2_data, version) - if validation_errors: - errors = ErrorsSerializer(validation_errors).data - return errors, new_record - -def save_record_file(ror_id, updated, json_obj, dir_name): - dir_path = os.path.join(DATA['DIR'],dir_name) - if not os.path.exists(dir_path): - os.mkdir(dir_path) - subdir = 'updates' if updated else 'new' - if not os.path.exists(os.path.join(dir_path, subdir)): - os.mkdir(os.path.join(dir_path, subdir)) - full_path = os.path.join(dir_path, subdir, ror_id.split('https://ror.org/')[1] + '.json') - with open(full_path, "w") as outfile: - json.dump(json_obj, outfile, ensure_ascii=False, indent=2) - -def save_report_file(report, report_fields, csv_file, dir_name): - dir_path = os.path.join(DATA['DIR'],dir_name) - if not os.path.exists(dir_path): - os.mkdir(dir_path) - filepath = os.path.join(dir_path, 'report.csv') - with open(filepath, 'w') as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=report_fields) - writer.writeheader() - writer.writerows(report) - # save copy of input file - filepath = os.path.join(dir_path, 'input.csv') - csv_file.seek(0) - with open(filepath, 'wb+') as f: - for chunk in csv_file.chunks(): - f.write(chunk) - -def 
process_csv(csv_file, version): - print("Processing CSV") - dir_name = datetime.now().strftime("%Y-%m-%d-%H:%M:%S") + "-ror-records" - errors = None - success_msg = None - report = [] - report_fields = ['row', 'ror_id', 'action', 'errors'] - skipped_count = 0 - updated_count = 0 - new_count = 0 - read_file = csv_file.read().decode('utf-8') - print(read_file) - reader = csv.DictReader(io.StringIO(read_file)) - row_num = 2 - for row in reader: - ror_id = None - updated = False - print("Row data") - print(row) - if row['id']: - ror_id = row['id'] - updated = True - row_errors, v2_record = update_record_from_csv(row, version) - else: - row_errors, v2_record = new_record_from_csv(row, version) - if not row_errors: - if updated: - action = 'updated' - updated_count += 1 - else: - action = 'created' - new_count += 1 - ror_id = v2_record['id'] - sorted_record = sort_list_fields(v2_record) - serializer = OrganizationSerializerV2(sorted_record) - json_obj = json.loads(JSONRenderer().render(serializer.data)) - print(json_obj) - #create file - file = save_record_file(ror_id, updated, json_obj, dir_name) - else: - action = 'skipped' - skipped_count += 1 - #print(errors) - report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": row_errors if row_errors else ''}) - row_num += 1 - print(report) - if new_count > 0 or updated_count > 0 or skipped_count > 0: - #create report file - save_report_file(report, report_fields, csv_file, dir_name) - # create zip file - zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name) - # upload to S3 - ''' - try: - DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip') - zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip" - except Exception as e: - errors = e - print(errors) - ''' - - success_msg = {"file": zipfile, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} - print(success_msg) - return success_msg \ No newline at end of file + return errors, updated_record \ No newline at end of file diff --git a/rorapi/common/csv_utils.py b/rorapi/common/csv_utils.py new file mode 100644 index 0000000..8c4d706 --- /dev/null +++ b/rorapi/common/csv_utils.py @@ -0,0 +1,122 @@ +import csv +import io +import re + +UPDATE_ACTIONS = { + "ADD": "add", + "DELETE": "delete", + "REPLACE": "replace" +} + +UPDATE_ACTIONS_MULTI = [UPDATE_ACTIONS["ADD"], UPDATE_ACTIONS["DELETE"], UPDATE_ACTIONS["REPLACE"]] + +UPDATE_ACTIONS_SINGLE = [UPDATE_ACTIONS["DELETE"], UPDATE_ACTIONS["REPLACE"]] + +NO_DELETE_FIELDS = ["id", "locations.geonames_id", "names.types.ror_display", "status", "types"] + +CSV_REQUIRED_FIELDS_ACTIONS = { + "id": None, + "domains": UPDATE_ACTIONS_MULTI, + "established": UPDATE_ACTIONS_SINGLE, + "external_ids.type.fundref.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.fundref.preferred": UPDATE_ACTIONS_SINGLE, + "external_ids.type.grid.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.grid.preferred": UPDATE_ACTIONS_SINGLE, + "external_ids.type.isni.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.isni.preferred": UPDATE_ACTIONS_SINGLE, + "external_ids.type.wikidata.all": UPDATE_ACTIONS_MULTI, + "external_ids.type.wikidata.preferred": UPDATE_ACTIONS_SINGLE, + "links.type.website": UPDATE_ACTIONS_MULTI, + "links.type.wikipedia": UPDATE_ACTIONS_MULTI, + "locations.geonames_id": UPDATE_ACTIONS_MULTI, + "names.types.acronym": UPDATE_ACTIONS_MULTI, + 
"names.types.alias": UPDATE_ACTIONS_MULTI, + "names.types.label": UPDATE_ACTIONS_MULTI, + "names.types.ror_display": [UPDATE_ACTIONS["REPLACE"]], + "status": [UPDATE_ACTIONS["REPLACE"]], + "types": UPDATE_ACTIONS_MULTI +} + +LANG_DELIMITER = "*" + +UPDATE_DELIMITER = "==" + + +def get_actions_values(csv_field): + print("getting actions values:") + actions_values = {} + if csv_field.lower() == UPDATE_ACTIONS["DELETE"]: + actions_values[UPDATE_ACTIONS["DELETE"]] = None + elif UPDATE_DELIMITER in csv_field: + for ua in list(UPDATE_ACTIONS.values()): + print(ua) + if ua + UPDATE_DELIMITER in csv_field: + print("doing regex:") + regex = r"(" + re.escape( + ua + UPDATE_DELIMITER) + r")(.*?)(?=$|(add|delete|replace)==)" + result = re.search(regex, csv_field) + print(result[0]) + temp_val = result[0].replace(ua + UPDATE_DELIMITER, '') + print("temp val:") + print(temp_val) + actions_values[ua] = [v.strip() for v in temp_val.split(';') if v] + + else: + actions_values[UPDATE_ACTIONS["REPLACE"]] = [v.strip() for v in csv_field.split(';') if v] + print(actions_values) + return actions_values + +def validate_csv(csv_file): + errors = [] + try: + read_file = csv_file.read().decode('utf-8') + reader = csv.DictReader(io.StringIO(read_file)) + rowcount = 0 + for row in reader: + rowcount += 1 + if rowcount > 0: + csv_fields = reader.fieldnames + missing_fields = [] + for field in CSV_REQUIRED_FIELDS_ACTIONS.keys(): + if field not in csv_fields: + missing_fields.append(field) + print(missing_fields) + if missing_fields: + errors.append(f'CSV file is missing columns: {", ".join(missing_fields)}') + else: + errors.append("CSV file contains no data rows") + except IOError as e: + errors.append(f"Error parsing CSV file: {e}") + print(errors) + return errors + +def validate_csv_row_update_syntax(csv_data): + print("validating row") + errors = [] + for k, v in csv_data.items(): + if UPDATE_DELIMITER in v: + print("field:") + print(k) + print("value:") + print(v) + actions_values = get_actions_values(v) + print("actions values:") + print(actions_values) + update_actions = list(actions_values.keys()) + if not update_actions: + errors.append("Update delimiter '{}' found in '{}' field but no valid update action found in value {}".format(UPDATE_DELIMITER, k, v)) + if len(update_actions) > 2: + errors.append("{} update actions '{}' found in '{}' field but only 2 are allowed".format(str(len(update_actions)), ", ".join(update_actions), k)) + if len(update_actions) == 2: + if not (UPDATE_ACTIONS['ADD'] and UPDATE_ACTIONS['DELETE']) in update_actions: + errors.append("Invalid combination of update actions '{}' found in '{}' field.".format(", ".join(update_actions), k)) + disallowed_actions = [ua for ua in update_actions if ua not in CSV_REQUIRED_FIELDS_ACTIONS[k]] + print("allowed actions:") + print(CSV_REQUIRED_FIELDS_ACTIONS[k]) + print("disallowed actions:") + print(disallowed_actions) + if disallowed_actions: + errors.append("Invalid update action(s) '{}' found in {} field. Allowed actions for this field are '{}'".format(", ".join(disallowed_actions), k, ", ".join(CSV_REQUIRED_FIELDS_ACTIONS[k]))) + if v.strip() == UPDATE_ACTIONS['DELETE'].lower() and k in NO_DELETE_FIELDS: + errors.append("Invalid update action '{}' in {} field. 
Cannot remove all values from a required field.".format(UPDATE_ACTIONS['DELETE'], k)) + return errors \ No newline at end of file diff --git a/rorapi/common/record_utils.py b/rorapi/common/record_utils.py new file mode 100644 index 0000000..39107ff --- /dev/null +++ b/rorapi/common/record_utils.py @@ -0,0 +1,38 @@ +import jsonschema +import requests +from iso639 import Lang +from rorapi.common.models import Errors + + +def get_lang_code(lang_string): + lang_code = None + error = None + if len(lang_string) == 2: + lang_string = lang_string.lower() + else: + lang_string = lang_string.title() + try: + lg = Lang(lang_string) + lang_code = lg.pt1 + except Exception as e: + error = e.msg + return error, lang_code + +def get_file_from_url(url): + rsp = requests.get(url) + rsp.raise_for_status() + return rsp.json() + +def validate_record(data, schema): + errors = [] + try: + print("validating data:") + print(data) + jsonschema.validate(data, schema) + except jsonschema.ValidationError as error: + errors.append(error) + print(errors) + return Errors(errors), None + else: + return None, data + diff --git a/rorapi/common/views.py b/rorapi/common/views.py index 5404fc7..b5cd59e 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -1,22 +1,18 @@ from rest_framework import viewsets, routers, status -from rest_framework.exceptions import ParseError from rest_framework.response import Response from django.http import HttpResponse from django.views import View from django.shortcuts import redirect -from rest_framework.authentication import BasicAuthentication from rest_framework.permissions import BasePermission from rest_framework.views import APIView from rest_framework.parsers import FormParser, MultiPartParser from rorapi.settings import DATA -import json -import copy -import csv -import io import mimetypes import magic -from rorapi.common import validation +from rorapi.common.create_update import new_record_from_json, update_record_from_json +from rorapi.common.csv_bulk import process_csv +from rorapi.common.csv_utils import validate_csv from rorapi.settings import REST_FRAMEWORK from rorapi.common.matching import match_organizations from rorapi.common.models import ( @@ -113,7 +109,6 @@ def retrieve(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]): serializer = OrganizationSerializerV1(organization) return Response(serializer.data) - permission_classes = [OurTokenPermission] def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): errors = None if version == "v2": @@ -121,7 +116,7 @@ def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): if 'id' in json_input and (json_input['id'] is not None and json_input['id'] != ""): errors = Errors(["Value {} found in ID field. 
New records cannot contain a value in the ID field".format(json_input['id'])])
             else:
-                errors, valid_data = validation.new_record_from_json(json_input, version)
+                errors, valid_data = new_record_from_json(json_input, version)
         else:
             errors = Errors(["Version {} does not support creating records".format(version)])
         if errors is not None:
@@ -152,7 +147,7 @@ def update(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
             elif get_ror_id(json['id']) != ror_id:
                 errors = Errors(["Value {} in IDs field does not match resource ID specified in request URL {}".format(json['id'], pk)])
             else:
-                errors, valid_data = validation.update_record_from_json(json, organization)
+                errors, valid_data = update_record_from_json(json, organization)
         else:
             errors = Errors(["Version {} does not support creating records".format(version)])
         if errors is not None:
@@ -224,10 +219,10 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
             print(mime_type)
             if "ASCII text" in mime_type or "UTF-8 Unicode text" in mime_type or "CSV text" in mime_type:
                 file_object.seek(0)
-                csv_validation_errors = validation.validate_csv(file_object)
+                csv_validation_errors = validate_csv(file_object)
                 if len(csv_validation_errors) == 0:
                     file_object.seek(0)
-                    msg = validation.process_csv(file_object, version)
+                    msg = process_csv(file_object, version)
                 else:
                     errors=Errors(csv_validation_errors)
 
diff --git a/rorapi/v2/models.py b/rorapi/v2/models.py
index 2cacbfb..3a13e19 100644
--- a/rorapi/v2/models.py
+++ b/rorapi/v2/models.py
@@ -51,7 +51,7 @@ def __init__(self, data):
             data, ["established", "id", "status"]
         )
         self.admin = Admin(data.admin)
-        self.domains = data.domains.sort()
+        self.domains = sorted(data.domains)
         sorted_ext_ids = sorted(data.external_ids, key=lambda x: x['type'])
         self.external_ids = [
             Entity(e, ["type", "preferred", "all"]) for e in sorted_ext_ids
@@ -64,9 +64,9 @@ def __init__(self, data):
         self.names = [Entity(n, ["value", "lang", "types"]) for n in sorted_names]
         sorted_rels = sorted(data.relationships, key=lambda x: x['type'])
         self.relationships = [
-            Entity(r, ["type", "label", "id"]) for r in data.relationships
+            Entity(r, ["type", "label", "id"]) for r in sorted_rels
         ]
-        self.types = data.types.sort()
+        self.types = sorted(data.types)
 
 
 class ListResult:
diff --git a/rorapi/v2/record_constants.py b/rorapi/v2/record_constants.py
new file mode 100644
index 0000000..1f49572
--- /dev/null
+++ b/rorapi/v2/record_constants.py
@@ -0,0 +1,66 @@
+V2_ADMIN = {
+    "created": {
+        "date": "",
+        "schema_version": "2.0"
+    },
+    "last_modified": {
+        "date": "",
+        "schema_version": "2.0"
+    }
+}
+
+V2_LAST_MOD = {
+    "date": "",
+    "schema_version": "2.0"
+}
+
+V2_OPTIONAL_FIELD_DEFAULTS = {
+    "domains": [],
+    "established": None,
+    "external_ids": [],
+    "links": [],
+    "relationships": []
+}
+
+V2_TEMPLATE = {
+    "locations": [],
+    "established": None,
+    "external_ids": [],
+    "id": "",
+    "domains": [],
+    "links": [],
+    "names": [],
+    "relationships": [],
+    "status": "",
+    "types": [],
+    "admin": {}
+}
+
+V2_EXTERNAL_ID_TYPES = {
+    "FUNDREF" : "fundref",
+    "GRID" : "grid",
+    "ISNI" : "isni",
+    "WIKIDATA" : "wikidata"
+}
+
+V2_LINK_TYPES = {
+    "WEBSITE" : "website",
+    "WIKIPEDIA" : "wikipedia"
+}
+
+V2_NAME_TYPES = {
+    "ACRONYM" : "acronym",
+    "ALIAS" : "alias",
+    "LABEL" : "label",
+    "ROR_DISPLAY" : "ror_display"
+}
+
+V2_SORT_KEYS = {
+    "domains": None,
+    "external_ids": "type",
+    "links": "type",
+    "locations": "geonames_id",
+    "names": "value",
+    "relationships": "type",
+    "types": None
+}
\ No newline at end of file
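A side note on the v2/models.py hunk in PATCH 17 above: list.sort() sorts in place and returns None, so the old assignments silently set domains and types to None, which is the bug the switch to sorted() fixes. A minimal sketch of the corrected idiom, reusing the V2_SORT_KEYS shape from record_constants.py above; the sample record is invented for illustration:

# Illustrative only: a tiny record shaped like the v2 fields above.
V2_SORT_KEYS = {"domains": None, "links": "type", "names": "value"}

record = {
    "domains": ["z.example.edu", "a.example.edu"],
    "links": [{"type": "wikipedia", "value": "https://en.wikipedia.org/wiki/Example"},
              {"type": "website", "value": "https://example.org"}],
    "names": [{"value": "Beta Institute", "types": ["alias"]},
              {"value": "Alpha Institute", "types": ["ror_display"]}],
}

# The bug PATCH 17 fixes: list.sort() mutates in place and returns None,
# so `self.types = data.types.sort()` assigned None.
assert ["b", "a"].sort() is None

# The corrected idiom: sorted() returns a new list; dict-valued fields
# are ordered by the sub-key named in V2_SORT_KEYS.
for field, sort_key in V2_SORT_KEYS.items():
    if sort_key is None:
        record[field] = sorted(record[field])
    else:
        record[field] = sorted(record[field], key=lambda item: item[sort_key])

print(record["domains"])                      # ['a.example.edu', 'z.example.edu']
print([l["type"] for l in record["links"]])   # ['website', 'wikipedia']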
From cdd6f9ea8fc289ae5e25e7814a218296feb93e7f Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Fri, 8 Mar 2024 22:07:55 -0600
Subject: [PATCH 18/38] fix existing tests and add new tests for file upload view

---
 rorapi/common/urls.py                         |  2 +-
 .../tests/tests_unit/data/test_upload_csv.csv |  3 +
 rorapi/tests/tests_unit/tests_models_v2.py    | 40 +++++-------
 rorapi/tests/tests_unit/tests_queries_v2.py   | 22 +++----
 rorapi/tests/tests_unit/tests_views_v1.py     |  6 +-
 rorapi/tests/tests_unit/tests_views_v2.py     | 62 +++++++++++++++++--
 rorapi/v2/record_constants.py                 |  3 +-
 7 files changed, 90 insertions(+), 48 deletions(-)
 create mode 100644 rorapi/tests/tests_unit/data/test_upload_csv.csv

diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py
index 2d24274..0c97f55 100644
--- a/rorapi/common/urls.py
+++ b/rorapi/common/urls.py
@@ -13,7 +13,7 @@
     url(r"^(?P<version>(v1|v2))\/generateaddress\/(?P<geonamesid>[0-9]+)", GenerateAddress.as_view()),
     path('generateaddress/', GenerateAddress.as_view()),
     url(r"^generateid$", GenerateId.as_view()),
-    path(r"^(?P<version>(v1|v2))\/indexdata/(?P<branch>.*)", IndexData.as_view()),
+    url(r"^(?P<version>(v1|v2))\/indexdata/(?P<branch>.*)", IndexData.as_view()),
     url(r"^(?P<version>(v1|v2))\/", include(views.organizations_router.urls)),
     url(r"^", include(views.organizations_router.urls)),
     url(r"^docs/", include_docs_urls(title="Research Organization Registry")),
diff --git a/rorapi/tests/tests_unit/data/test_upload_csv.csv b/rorapi/tests/tests_unit/data/test_upload_csv.csv
new file mode 100644
index 0000000..494420f
--- /dev/null
+++ b/rorapi/tests/tests_unit/data/test_upload_csv.csv
@@ -0,0 +1,3 @@
+html_url,id,names.types.ror_display,status,types,names.types.alias,names.types.label,names.types.acronym,links.type.website,links.type.wikipedia,domains,established,external_ids.type.fundref.all,external_ids.type.fundref.preferred,external_ids.type.grid.all,external_ids.type.grid.preferred,external_ids.type.isni.all,external_ids.type.isni.preferred,external_ids.type.wikidata.all,external_ids.type.wikidata.preferred,city,country,locations.geonames_id
+https://github.com/ror-community/ror-updates/issues/9185,,Jizzakh branch of the National University of Uzbekistan named after Mirzo Ulugbek,active,Education,Jizzakh branch of the National University of Uzbekistan; Mirzo Ulug`bek nomidagi O`zbekiston milliy universiteti Jizzax filiali; Джизакский филиал Национального университета Узбекистана имени Мирзо Улугбека,Mirzo Ulug`bek nomidagi O`zbekiston milliy universiteti Jizzax filiali*Uzbek,,https://jbnuu.uz,https://uz.wikipedia.org/wiki/O%CA%BBzbekiston_milliy_universitetining_Jizzax_filiali,,2019,,,,,,,Q72342707,Q72342707,Jizzakh,Uzbekistan,1513886
+https://github.com/ror-community/ror-updates/issues/9389,,Znanstveno-raziskovalno središče Koper,active,Facility; Government,SRC Koper; ZRS Koper;,Science and Research Centre of Koper*English; Centro di ricerche scientifiche Capodistria*Italian,,https://www.zrs-kp.si;,,,,,,,,0000 0004 0398 0403,0000 0004 0398 0403,Q49569044,Q49569044,Koper,Slovenia,3197753
\ No newline at end of file
diff --git a/rorapi/tests/tests_unit/tests_models_v2.py b/rorapi/tests/tests_unit/tests_models_v2.py
index b798074..b0f9958 100644
--- a/rorapi/tests/tests_unit/tests_models_v2.py
+++ b/rorapi/tests/tests_unit/tests_models_v2.py
@@ -67,7 +67,8 @@ def test_attributes_exist(self):
         organization = Organization(AttrDict(data))
 
         self.assertEqual(organization.id, data["id"])
-        self.assertEqual(organization.types, data["types"])
+        for i, type in enumerate(organization.types):
+            self.assertIn(organization.types[i], 
data["types"]) self.assertEqual(organization.established, data["established"]) self.assertEqual( organization.locations[0].geonames_details.lat, @@ -99,21 +100,18 @@ def test_attributes_exist(self): self.assertEqual(len(organization.names), 6) for i, name in enumerate(organization.names): - self.assertEqual(organization.names[i].value, data["names"][i]["value"]) - self.assertEqual(organization.names[i].types, data["names"][i]["types"]) - self.assertEqual(organization.names[i].lang, data["names"][i]["lang"]) + matched_names = [n for n in data["names"] if \ + n['value']==organization.names[i].value and \ + n['types']==organization.names[i].types and \ + n['lang']==organization.names[i].lang] + self.assertTrue(len(matched_names) == 1) for i, ext_id in enumerate(organization.external_ids): - self.assertEqual( - organization.external_ids[i].all, data["external_ids"][i]["all"] - ) - self.assertEqual( - organization.external_ids[i].preferred, - data["external_ids"][i]["preferred"], - ) - self.assertEqual( - organization.external_ids[i].type, data["external_ids"][i]["type"] - ) + matched_ids = [e for e in data["external_ids"] if \ + e['all']==organization.external_ids[i].all and \ + e['preferred']==organization.external_ids[i].preferred and \ + e['type']==organization.external_ids[i].type] + self.assertTrue(len(matched_ids) == 1) class MatchedOrganizationTestCase(SimpleTestCase): @@ -153,15 +151,5 @@ def test_attributes_exist(self): self.assertEqual(organization.chosen, data["chosen"]) self.assertEqual(organization.organization.id, data["organization"]["id"]) for i, name in enumerate(organization.organization.names): - self.assertEqual( - organization.organization.names[i].value, - data["organization"]["names"][i]["value"], - ) - self.assertEqual( - organization.organization.names[i].types, - data["organization"]["names"][i]["types"], - ) - self.assertEqual( - organization.organization.names[i].lang, - data["organization"]["names"][i]["lang"], - ) + matched_names = [n for n in data["organization"]["names"] if n['value']==organization.organization.names[i].value and n['types']==organization.organization.names[i].types and organization.organization.names[i].lang] + self.assertTrue(len(matched_names) == 1) diff --git a/rorapi/tests/tests_unit/tests_queries_v2.py b/rorapi/tests/tests_unit/tests_queries_v2.py index 9c600e0..b3bca45 100644 --- a/rorapi/tests/tests_unit/tests_queries_v2.py +++ b/rorapi/tests/tests_unit/tests_queries_v2.py @@ -537,12 +537,11 @@ def test_search_organizations(self, search_mock): self.test_data['hits']['hits']): self.assertEquals(ret.id, exp['_source']['id']) for i, name in enumerate(ret.names): - self.assertEqual(ret.names[i].value, - exp['_source']['names'][i]['value']) - self.assertEqual(ret.names[i].types, - exp['_source']['names'][i]['types']) - self.assertEqual(ret.names[i].lang, - exp['_source']['names'][i]['lang']) + matched_names = [n for n in exp['_source']['names'] if \ + n['value']==ret.names[i].value and \ + n['types']==ret.names[i].types and \ + n['lang']==ret.names[i].lang] + self.assertTrue(len(matched_names) == 1) self.assertEquals( len(organizations.meta.types), len(self.test_data['aggregations']['types']['buckets'])) @@ -612,12 +611,11 @@ def test_retrieve_organization(self, search_mock): expected = self.test_data['hits']['hits'][0]['_source'] self.assertEquals(organization.id, expected['id']) for i, name in enumerate(organization.names): - self.assertEqual(organization.names[i].value, - expected['names'][i]['value']) - 
self.assertEqual(organization.names[i].types, - expected['names'][i]['types']) - self.assertEqual(organization.names[i].lang, - expected['names'][i]['lang']) + matched_names = [n for n in expected["names"] if \ + n['value']==organization.names[i].value and \ + n['types']==organization.names[i].types and \ + n['lang']==organization.names[i].lang] + self.assertTrue(len(matched_names) == 1) @mock.patch('elasticsearch_dsl.Search.execute') def test_retrieve_non_existing_organization(self, search_mock): diff --git a/rorapi/tests/tests_unit/tests_views_v1.py b/rorapi/tests/tests_unit/tests_views_v1.py index dd2494c..cf17efe 100644 --- a/rorapi/tests/tests_unit/tests_views_v1.py +++ b/rorapi/tests/tests_unit/tests_views_v1.py @@ -218,7 +218,7 @@ def setUp(self): def test_index_ror_success(self, index_mock, permission_mock): index_mock.return_value = self.success_msg permission_mock.return_value = True - response = self.client.get('/indexdata/foo') + response = self.client.get('/v1/indexdata/foo') self.assertEquals(response.status_code, 200) @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') @@ -226,13 +226,13 @@ def test_index_ror_success(self, index_mock, permission_mock): def test_index_ror_fail_error(self, index_mock, permission_mock): index_mock.return_value = self.error_msg permission_mock.return_value = True - response = self.client.get('/indexdata/foo') + response = self.client.get('/v1/indexdata/foo') self.assertEquals(response.status_code, 400) @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') def test_index_ror_fail_no_permission(self, permission_mock): permission_mock.return_value = False - response = self.client.get('/indexdata/foo') + response = self.client.get('/v1/indexdata/foo') self.assertEquals(response.status_code, 403) class HeartbeatViewTestCase(SimpleTestCase): diff --git a/rorapi/tests/tests_unit/tests_views_v2.py b/rorapi/tests/tests_unit/tests_views_v2.py index 01205b5..e0d4825 100644 --- a/rorapi/tests/tests_unit/tests_views_v2.py +++ b/rorapi/tests/tests_unit/tests_views_v2.py @@ -6,6 +6,7 @@ from rest_framework.test import APIRequestFactory from rorapi.common import views +from rorapi.v2.models import Organization as OrganizationV2 from .utils import IterableAttrDict @@ -107,7 +108,25 @@ def test_retrieve_organization(self, search_mock): organization = json.loads(response.content.decode('utf-8')) # go through every attribute and check to see that they are equal self.assertEquals(response.status_code, 200) - self.assertEquals(organization, self.test_data['hits']['hits'][0]['_source']) + self.assertEquals(organization['admin'], self.test_data['hits']['hits'][0]['_source']['admin']) + for d in organization['domains']: + self.assertIn(d, self.test_data['hits']['hits'][0]['_source']['domains']) + self.assertEquals(organization['established'], self.test_data['hits']['hits'][0]['_source']['established']) + for e in organization['external_ids']: + self.assertIn(e, self.test_data['hits']['hits'][0]['_source']['external_ids']) + self.assertEquals(organization['id'], self.test_data['hits']['hits'][0]['_source']['id']) + for l in organization['links']: + self.assertIn(l, self.test_data['hits']['hits'][0]['_source']['links']) + for l in organization['locations']: + self.assertIn(l, self.test_data['hits']['hits'][0]['_source']['locations']) + for n in organization['names']: + self.assertIn(n, self.test_data['hits']['hits'][0]['_source']['names']) + for r in organization['relationships']: + self.assertIn(r, 
self.test_data['hits']['hits'][0]['_source']['relationships']) + self.assertEquals(organization['status'], self.test_data['hits']['hits'][0]['_source']['status']) + for t in organization['types']: + self.assertIn(t, self.test_data['hits']['hits'][0]['_source']['types']) + @mock.patch('elasticsearch_dsl.Search.execute') def test_retrieve_non_existing_organization(self, search_mock): @@ -214,7 +233,7 @@ def setUp(self): def test_index_ror_success(self, index_mock, permission_mock): index_mock.return_value = self.success_msg permission_mock.return_value = True - response = self.client.get('/indexdata/foo') + response = self.client.get('/v2/indexdata/foo') self.assertEquals(response.status_code, 200) @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') @@ -222,13 +241,13 @@ def test_index_ror_success(self, index_mock, permission_mock): def test_index_ror_fail_error(self, index_mock, permission_mock): index_mock.return_value = self.error_msg permission_mock.return_value = True - response = self.client.get('/indexdata/foo') + response = self.client.get('/v2/indexdata/foo') self.assertEquals(response.status_code, 400) @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') def test_index_ror_fail_no_permission(self, permission_mock): permission_mock.return_value = False - response = self.client.get('/indexdata/foo') + response = self.client.get('/v2/indexdata/foo') self.assertEquals(response.status_code, 403) class HeartbeatViewTestCase(SimpleTestCase): @@ -243,4 +262,37 @@ def test_heartbeat_success(self, search_mock): search_mock.return_value = \ IterableAttrDict(self.test_data, self.test_data['hits']['hits']) response = self.client.get('/v2/heartbeat') - self.assertEquals(response.status_code, 200) \ No newline at end of file + self.assertEquals(response.status_code, 200) + +class FileUploadViewTestCase(SimpleTestCase): + def setUp(self): + self.csv_errors_empty = [] + self.csv_errors_error = ['error'] + self.process_csv_msg = {"filename":"filename.zip", "rows processed":1,"created":0,"udpated":0,"skipped":1} + self.maxDiff = None + + @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') + @mock.patch('rorapi.common.views.validate_csv') + @mock.patch('rorapi.common.views.process_csv') + def test_file_upload_success(self, permission_mock, validate_csv_mock, process_csv_msg_mock): + permission_mock.return_value = True + validate_csv_mock.return_value = self.csv_errors_empty + process_csv_msg_mock.return_value = self.process_csv_msg + with open(os.path.join(os.path.dirname(__file__), + 'data/test_upload_csv.csv'), 'rb') as f: + response = self.client.post('/v2/upload', {"file":f}) + self.assertEquals(response.status_code, 201) + + @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') + @mock.patch('rorapi.common.views.validate_csv') + def test_file_upload_fail_error(self, permission_mock, validate_csv_mock): + permission_mock.return_value = True + validate_csv_mock.return_value = self.csv_errors_error + response = self.client.post('/v2/upload') + self.assertEquals(response.status_code, 400) + + @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') + def test_file_upload_fail_no_permission(self, permission_mock): + permission_mock.return_value = False + response = self.client.post('/v2/upload') + self.assertEquals(response.status_code, 403) \ No newline at end of file diff --git a/rorapi/v2/record_constants.py b/rorapi/v2/record_constants.py index 1f49572..e962615 100644 --- a/rorapi/v2/record_constants.py +++ 
b/rorapi/v2/record_constants.py
@@ -63,4 +63,5 @@
     "names": "value",
     "relationships": "type",
     "types": None
-}
\ No newline at end of file
+}
+

From 4c78ebd05c994a06a624a03dd12f2b28520131b8 Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Sat, 9 Mar 2024 09:08:42 -0600
Subject: [PATCH 19/38] update tests

---
 .../data/test_data_new_record_invalid_v2.json | 29 +++++++++++++
 .../data/test_data_new_record_valid_v2.json   | 28 ++++++++++++
 rorapi/tests/tests_unit/tests_views_v2.py     | 17 +++++++++
 3 files changed, 74 insertions(+)
 create mode 100644 rorapi/tests/tests_unit/data/test_data_new_record_invalid_v2.json
 create mode 100644 rorapi/tests/tests_unit/data/test_data_new_record_valid_v2.json

diff --git a/rorapi/tests/tests_unit/data/test_data_new_record_invalid_v2.json b/rorapi/tests/tests_unit/data/test_data_new_record_invalid_v2.json
new file mode 100644
index 0000000..61f3f2a
--- /dev/null
+++ b/rorapi/tests/tests_unit/data/test_data_new_record_invalid_v2.json
@@ -0,0 +1,29 @@
+{
+    "locations": [
+        {
+            "geonames_id": 2661552,
+            "geonames_details": {
+                "country_code": "CH",
+                "country_name": "Switzerland",
+                "lat": 46.94809,
+                "lng": 7.44744,
+                "name": "Bern"
+            }
+        }
+    ],
+    "names": [
+        {
+            "value": "JDSU (Switzerland)",
+            "types": [
+                "ror_display",
+                "label"
+            ],
+            "lang": null
+        }
+    ],
+    "status": "active",
+    "types": [
+        "company"
+    ],
+    "foo": "bar"
+}
\ No newline at end of file
diff --git a/rorapi/tests/tests_unit/data/test_data_new_record_valid_v2.json b/rorapi/tests/tests_unit/data/test_data_new_record_valid_v2.json
new file mode 100644
index 0000000..a349173
--- /dev/null
+++ b/rorapi/tests/tests_unit/data/test_data_new_record_valid_v2.json
@@ -0,0 +1,28 @@
+{
+    "locations": [
+        {
+            "geonames_id": 2661552,
+            "geonames_details": {
+                "country_code": "CH",
+                "country_name": "Switzerland",
+                "lat": 46.94809,
+                "lng": 7.44744,
+                "name": "Bern"
+            }
+        }
+    ],
+    "names": [
+        {
+            "value": "JDSU (Switzerland)",
+            "types": [
+                "ror_display",
+                "label"
+            ],
+            "lang": null
+        }
+    ],
+    "status": "active",
+    "types": [
+        "company"
+    ]
+}
\ No newline at end of file
diff --git a/rorapi/tests/tests_unit/tests_views_v2.py b/rorapi/tests/tests_unit/tests_views_v2.py
index e0d4825..c58c76e 100644
--- a/rorapi/tests/tests_unit/tests_views_v2.py
+++ b/rorapi/tests/tests_unit/tests_views_v2.py
@@ -162,6 +162,7 @@ def test_retrieve_invalid_id(self, search_mock):
         self.assertEquals(len(organization['errors']), 1)
         self.assertTrue(any(['not a valid' in e for e in organization['errors']]))
 
+
 class GenerateIdViewTestCase(SimpleTestCase):
     def setUp(self):
         with open(
@@ -295,4 +296,20 @@ def test_file_upload_fail_error(self, permission_mock, validate_csv_mock):
     def test_file_upload_fail_no_permission(self, permission_mock):
         permission_mock.return_value = False
         response = self.client.post('/v2/upload')
+        self.assertEquals(response.status_code, 403)
+
+class CreateOrganizationViewTestCase(SimpleTestCase):
+    # TODO: complete tests. For now just test that endpoint can't be accessed without creds.
+    @mock.patch('rorapi.common.views.OurTokenPermission.has_permission')
+    def test_create_record_fail_no_permission(self, permission_mock):
+        permission_mock.return_value = False
+        response = self.client.post('/v2/organizations')
+        self.assertEquals(response.status_code, 403)
+
+class UpdateOrganizationViewTestCase(SimpleTestCase):
+    # TODO: complete tests. For now just test that endpoint can't be accessed without creds.
+    @mock.patch('rorapi.common.views.OurTokenPermission.has_permission')
+    def test_create_record_fail_no_permission(self, permission_mock):
+        permission_mock.return_value = False
+        response = self.client.put('/v2/organizations/foo')
     self.assertEquals(response.status_code, 403)
\ No newline at end of file

From d67e320c004563b788b39e0183ca7553af46b9bc Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Sat, 9 Mar 2024 09:48:12 -0600
Subject: [PATCH 20/38] tidy up error handling

---
 rorapi/common/create_update.py | 31 +++++++++++---------------
 rorapi/common/csv_bulk.py      | 40 ++++++++++++++++++----------------
 rorapi/common/csv_create.py    |  6 ++---
 rorapi/common/csv_update.py    |  6 ++---
 rorapi/common/record_utils.py  |  8 ++-----
 rorapi/common/views.py         | 15 ++++++++-----
 6 files changed, 52 insertions(+), 54 deletions(-)

diff --git a/rorapi/common/create_update.py b/rorapi/common/create_update.py
index aefbc8d..cafdb18 100644
--- a/rorapi/common/create_update.py
+++ b/rorapi/common/create_update.py
@@ -1,6 +1,5 @@
 import copy
 from datetime import datetime
-from rorapi.common.models import Errors
 from rorapi.common.record_utils import *
 import update_address as ua
 from rorapi.v2.record_constants import *
@@ -43,7 +42,7 @@ def add_created_last_mod(record):
     return record
 
 def update_locations(locations):
-    errors = []
+    error = None
    updated_locations = []
    for location in locations:
        if 'geonames_id' in location:
@@ -52,8 +51,8 @@ def update_locations(locations):
                 updated_location = ua.new_geonames_v2(str(location['geonames_id']))
                 updated_locations.append(updated_location['location'])
             except:
-                errors.append("Error retrieving Geonames data for ID {}. Please check that this is a valid Geonames ID".format(location['geonames_id']))
-    return errors, updated_locations
+                error = "Error retrieving Geonames data for ID {}. 
Please check that this is a valid Geonames ID".format(location['geonames_id']) + return error, updated_locations def sort_list_fields(v2_record): for field in v2_record: @@ -68,34 +67,30 @@ def sort_list_fields(v2_record): def new_record_from_json(json_input, version): - errors = None + error = None valid_data = None new_record = copy.deepcopy(json_input) if check_optional_fields(new_record): new_record = add_missing_optional_fields(new_record) - location_errors, updated_locations = update_locations(new_record['locations']) - if location_errors: - errors = Errors(location_errors) - else: + error, updated_locations = update_locations(new_record['locations']) + if not error: new_record['locations'] = updated_locations new_record = add_created_last_mod(new_record) new_ror_id = check_ror_id(version) print("new ror id: " + new_ror_id) new_record['id'] = new_ror_id - errors, valid_data = validate_record(sort_list_fields(new_record), V2_SCHEMA) - return errors, valid_data + error, valid_data = validate_record(sort_list_fields(new_record), V2_SCHEMA) + return error, valid_data def update_record_from_json(new_json, existing_org): - errors = None + error = None valid_data = None serializer = OrganizationSerializerV2(existing_org) existing_record = serializer.data updated_record = update_record(new_json, existing_record) - location_errors, updated_locations = update_locations(updated_record['locations']) - if location_errors: - errors = Errors(location_errors) - else: + error, updated_locations = update_locations(updated_record['locations']) + if not error: updated_record['locations'] = updated_locations - errors, valid_data = validate_record(sort_list_fields(updated_record), V2_SCHEMA) - return errors, valid_data + error, valid_data = validate_record(sort_list_fields(updated_record), V2_SCHEMA) + return error, valid_data diff --git a/rorapi/common/csv_bulk.py b/rorapi/common/csv_bulk.py index 22bb2b3..3819b67 100644 --- a/rorapi/common/csv_bulk.py +++ b/rorapi/common/csv_bulk.py @@ -44,8 +44,8 @@ def save_report_file(report, report_fields, csv_file, dir_name): def process_csv(csv_file, version): print("Processing CSV") dir_name = datetime.now().strftime("%Y-%m-%d-%H:%M:%S") + "-ror-records" - errors = None success_msg = None + error = None report = [] report_fields = ['row', 'ror_id', 'action', 'errors'] skipped_count = 0 @@ -82,25 +82,27 @@ def process_csv(csv_file, version): else: action = 'skipped' skipped_count += 1 - #print(errors) - report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": row_errors if row_errors else ''}) + report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": "; ".join(row_errors) if row_errors else ''}) row_num += 1 - print(report) if new_count > 0 or updated_count > 0 or skipped_count > 0: - #create report file - save_report_file(report, report_fields, csv_file, dir_name) - # create zip file - zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name) - # upload to S3 - ''' try: - DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip') - zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip" + #create report file + save_report_file(report, report_fields, csv_file, dir_name) + # create zip file + zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name) + # upload to S3 + ''' + try: + DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name 
+ '.zip') + zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip" + except Exception as e: + error = f"Error uploading zipfile to S3: {e}" + ''' except Exception as e: - errors = e - print(errors) - ''' - - success_msg = {"file": zipfile, "rows processed": new_count + updated_count + skipped_count, "created": new_count, "udpated": updated_count, "skipped": skipped_count} - print(success_msg) - return success_msg \ No newline at end of file + error = f"Unexpected error generating records: {e}" + success_msg = {"file": zipfile, + "rows processed": new_count + updated_count + skipped_count, + "created": new_count, + "udpated": updated_count, + "skipped": skipped_count} + return error, success_msg \ No newline at end of file diff --git a/rorapi/common/csv_create.py b/rorapi/common/csv_create.py index a0cd121..cf1260d 100644 --- a/rorapi/common/csv_create.py +++ b/rorapi/common/csv_create.py @@ -104,7 +104,7 @@ def new_record_from_csv(csv_data, version): if csv_data['types']: v2_data['types'] = [t.strip().lower() for t in csv_data['types'].strip(';').split(';')] - validation_errors, new_record = new_record_from_json(v2_data, version) - if validation_errors: - errors = ErrorsSerializer(validation_errors).data + validation_error, new_record = new_record_from_json(v2_data, version) + if validation_error: + errors.append(validation_error) return errors, new_record \ No newline at end of file diff --git a/rorapi/common/csv_update.py b/rorapi/common/csv_update.py index c457e0f..d38ce2c 100644 --- a/rorapi/common/csv_update.py +++ b/rorapi/common/csv_update.py @@ -378,7 +378,7 @@ def update_record_from_csv(csv_data, version): update_data['types'] = temp_types if not errors: - validation_errors, updated_record = update_record_from_json(update_data, existing_record) - if validation_errors: - errors = ErrorsSerializer(validation_errors).data + validation_error, updated_record = update_record_from_json(update_data, existing_record) + if validation_error: + errors.append(validation_error) return errors, updated_record \ No newline at end of file diff --git a/rorapi/common/record_utils.py b/rorapi/common/record_utils.py index 39107ff..2332785 100644 --- a/rorapi/common/record_utils.py +++ b/rorapi/common/record_utils.py @@ -1,7 +1,6 @@ import jsonschema import requests from iso639 import Lang -from rorapi.common.models import Errors def get_lang_code(lang_string): @@ -24,15 +23,12 @@ def get_file_from_url(url): return rsp.json() def validate_record(data, schema): - errors = [] try: print("validating data:") print(data) jsonschema.validate(data, schema) - except jsonschema.ValidationError as error: - errors.append(error) - print(errors) - return Errors(errors), None + except jsonschema.ValidationError as e: + return "Validation error: " + e.message, None else: return None, data diff --git a/rorapi/common/views.py b/rorapi/common/views.py index b5cd59e..fabf39e 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -114,9 +114,11 @@ def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): if version == "v2": json_input = request.data if 'id' in json_input and (json_input['id'] is not None and json_input['id'] != ""): - errors = Errors(["Value {} found in ID field. New records cannot contain a value in the ID field".format(json_inputjson['id'])]) + errors = Errors(["Value {} found in ID field. 
New records cannot contain a value in the ID field".format(json_input['id'])]) else: - errors, valid_data = new_record_from_json(json_input, version) + create_error, valid_data = new_record_from_json(json_input, version) + if create_error: + errors = Errors([create_error]) else: errors = Errors(["Version {} does not support creating records".format(version)]) if errors is not None: @@ -147,7 +149,9 @@ def update(self, request, pk=None, version=REST_FRAMEWORK["DEFAULT_VERSION"]): elif get_ror_id(json['id']) != ror_id: errors = Errors(["Value {} in IDs field does not match resource ID specified in request URL {}".format(json['id'], pk)]) else: - errors, valid_data = update_record_from_json(json, organization) + update_error, valid_data = update_record_from_json(json, organization) + if update_error: + errors = Errors([update_error]) else: errors = Errors(["Version {} does not support creating records".format(version)]) if errors is not None: @@ -222,8 +226,9 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): csv_validation_errors = validate_csv(file_object) if len(csv_validation_errors) == 0: file_object.seek(0) - msg = process_csv(file_object, version) - + csv_process_error, msg = process_csv(file_object, version) + if csv_process_error: + errors = Errors([csv_process_error]) else: errors=Errors(csv_validation_errors) else: From 951c5c134e883491455f9f0658537a084e818708 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 15:00:30 -0500 Subject: [PATCH 21/38] first iteration of v2 curation endpoint --- .github/workflows/dev.yml | 14 +- README.md | 157 +++++++++++++++++++++- rorapi/common/csv_bulk.py | 2 - rorapi/common/urls.py | 6 +- rorapi/common/views.py | 12 +- rorapi/tests/tests_unit/tests_views_v2.py | 16 +-- 6 files changed, 187 insertions(+), 20 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 1ae103e..e7da136 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -35,6 +35,18 @@ jobs: uses: actions/checkout@v2 with: path: ror-api + - name: Checkout ror-data-test + uses: actions/checkout@v2 + with: + repository: ror-community/ror-data-test + token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} + path: ror-data-test + - name: Get last data dump name + working-directory: ./ror-data-test + run: | + FILE="$(ls -Art *.zip | tail -n 1)" + echo ${FILE%.*} + echo "LATEST_DUMP_FILE=${FILE%.*}" >> $GITHUB_ENV - name: Cache dependency uses: actions/cache@v2 with: @@ -57,7 +69,7 @@ jobs: - name: Setup working-directory: ./ror-api run: | - python manage.py setup v1.35-2023-10-26-ror-data -t + python manage.py setup LATEST_DUMP_FILE -t # Dump file temp hard coded for v2 beta # Pulled from ror-data-test per settings.py config - name: Test diff --git a/README.md b/README.md index eefd23a..7d6fb83 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[![Build Status](https://travis-ci.com/ror-community/ror-api.svg?branch=master)](https://travis-ci.com/ror-community/ror-api) - # Research Organization Registry (ROR) API The ROR API allows retrieving, searching and filtering the organizations indexed in ROR. The results are returned in JSON. See https://ror.readme.io for documentation. @@ -152,3 +150,158 @@ ROR dataset ZIP archive created This will create a new `data/ror-2020-03-15` folder, containing a `ror.json` and `ror.zip`. To finish the process, add the new folder to git and push to the GitHub repo. To install the updated ROR data, run `./manage.py setup`. 
+
+## Create new record file (v2 only)
+
+Making a POST request `/organizations` performs the following actions:
+- Populates fields with supplied values
+- Adds default values for optional fields
+- Populates Geonames details fields with values from the Geonames API, based on the Geonames ID provided
+- Validates submitted metadata against the ROR schema. Note that only schema validation is performed - additional tests included in [validation-suite](https://github.com/ror-community/validation-suite), such as checking relationship pairs, are not performed.
+- Orders fields and values within fields alphabetically (consistent with API behavior)
+- Returns JSON that can be saved to a file and used during the [ROR data release creation & deployment process](https://github.com/ror-community/ror-records?tab=readme-ov-file#ror-data-release-creation--deployment-steps)
+
+**A POST request to this route DOES NOT immediately add a new record to the ROR API.**
+
+### Usage
+
+1. Prepare a JSON file formatted according to the [ROR v2 JSON schema](https://github.com/ror-community/ror-schema/blob/schema-v2/ror_schema_v2_0.json). Ensure that all required fields EXCEPT `id` contain values. DO NOT include a value in the `id` field or in geonames_details fields. These values will be generated. Optional fields and the `id` field may be omitted.
+
+2. Make a POST request to `/organizations` with the JSON file as the data payload. Credentials are required for POST requests.
+
+    curl -X POST -H "Route-User: [API USER]" -H "Token: [API TOKEN]" "http://api.dev.ror.org/v2/organizations" -d @[PATH TO JSON FILE].json -H "Content-Type: application/json"
+
+3. The response is a schema-valid JSON object populated with the submitted metadata as well as a ROR ID and Geonames details retrieved from Geonames. Fields and values will be ordered as in the ROR API and optional fields will be populated with empty or null values. Redirect the response to a file for use in the ROR data deployment process. **The resulting record is NOT added to the ROR index.**
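+
+For illustration, the same request can be made with Python's `requests` library. This is a sketch only - the file names and credential values are placeholders, and the endpoint is the one shown in the curl example above:
+
+    import json
+    import requests
+
+    headers = {"Route-User": "[API USER]", "Token": "[API TOKEN]"}
+    with open("new_record.json") as f:  # placeholder path to your v2 JSON file
+        record = json.load(f)
+    rsp = requests.post("https://api.dev.ror.org/v2/organizations", json=record, headers=headers)
+    rsp.raise_for_status()
+    # Save the generated record to a file for the data release process
+    with open("generated_record.json", "w") as out:
+        json.dump(rsp.json(), out, ensure_ascii=False, indent=2)
+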
+## Update existing record file (v2 only)
+
+Making a PUT request `/organizations/[ROR ID]` performs the following actions:
+
+- Overwrites fields with supplied values
+- Populates Geonames details fields with values from the Geonames API, based on the Geonames ID provided
+- Validates submitted metadata against the ROR schema. Note that only schema validation is performed - additional tests included in [validation-suite](https://github.com/ror-community/validation-suite), such as checking relationship pairs, are not performed.
+- Orders fields and values within fields alphabetically (consistent with API behavior)
+- Returns JSON that can be saved to a file and used during the [ROR data release creation & deployment process](https://github.com/ror-community/ror-records?tab=readme-ov-file#ror-data-release-creation--deployment-steps)
+
+**A PUT request to this route DOES NOT immediately update a record in the ROR API.**
+
+### Usage
+
+1. Prepare a JSON file formatted according to the [ROR v2 JSON schema](https://github.com/ror-community/ror-schema/blob/schema-v2/ror_schema_v2_0.json). It is only necessary to include the `id` field and any fields that you wish to update. Existing field values will be overwritten by values included in the file. If you wish to delete all existing values from a field, include the field in the JSON file with value `[]` (multi-value fields) or `null` (single-value fields). Geonames details will be updated during record generation regardless of which fields are included in the JSON.
+
+2. Make a PUT request to `/organizations/[ROR ID]` with the JSON file as the data payload. Credentials are required for PUT requests. The ROR ID specified in the request path must match the ROR ID in the `id` field of the JSON data.
+
+    curl -X PUT -H "Route-User: [API USER]" -H "Token: [API TOKEN]" "http://api.dev.ror.org/v2/organizations/[ROR ID]" -d @[PATH TO JSON FILE].json -H "Content-Type: application/json"
+
+3. The response is a schema-valid JSON object populated with the updates in the submitted metadata as well as updated Geonames details retrieved from Geonames. Fields and values will be ordered as in the ROR API and optional fields will be populated with empty or null values. Redirect the response to a file for use in the ROR data deployment process. **The resulting record is NOT updated in the ROR index.**
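+
+As a sketch of the delete-field behavior described above (the ROR ID and values here are illustrative only), a minimal update payload that overwrites `established` and clears all `links` could be sent like this:
+
+    import requests
+
+    headers = {"Route-User": "[API USER]", "Token": "[API TOKEN]"}
+    payload = {
+        "id": "https://ror.org/01an7q238",  # must match the ROR ID in the URL
+        "established": 1973,                # single-value field: overwritten
+        "links": [],                        # multi-value field: [] deletes all values
+    }
+    rsp = requests.put("https://api.dev.ror.org/v2/organizations/01an7q238", json=payload, headers=headers)
+    rsp.raise_for_status()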
+
+## Create/update multiple record files from a CSV
+
+Making a POST request `/organizations/bulkupdate` performs the following actions:
+
+- Validates the CSV file to ensure that it contains all required columns
+- Loops through each row and performs the following actions:
+  - If no value is included in `ror_id` column, attempt to create a new record file with values specified in the CSV
+  - If a value is included in `ror_id`, attempt to retrieve the existing record and create an updated record file with changes specified in the CSV
+  - If validation or other errors occur during record creation, the row is skipped and error(s) are recorded in the report.csv file
+- Generates a zipped file containing files for all new/updated records, as well as a report.csv file with a row for each row in the input CSV and a copy of the input CSV file
+- Uploads the zipped file to AWS S3
+- Returns a message with the URL for the zipped file and a summary message with counts of records created/updated/skipped
+- Records can be downloaded from S3 and used during the [ROR data release creation & deployment process](https://github.com/ror-community/ror-records?tab=readme-ov-file#ror-data-release-creation--deployment-steps)
+
+**A POST request to this route DOES NOT immediately add new/updated records to the ROR API.**
+
+### Usage
+
+1. Prepare a CSV file as specified below with 1 row for each new or updated record. New and updated records can be included in the same file.
+
+2. Make a POST request to `/bulkupdate` with the filepath specified in the file field of a multi-part form payload. Credentials are required for POST requests.
+
+    curl -X POST -H "Route-User: [API USER]" -H "Token: [API TOKEN]" 'https://api.dev.ror.org/v2/bulkupdate' --form 'file=@"[PATH TO CSV FILE].csv"'
+
+3. The response is a summary with counts of records created/updated/skipped and a link to download the generated files from AWS S3.
+
+    {"file":"https://s3.eu-west-1.amazonaws.com/2024-03-09-15:56:26-ror-records.zip","rows processed":208,"created":207,"updated":0,"skipped":1}
+
+The zipped file contains the following items:
+- **input.csv:** Copy of the CSV submitted to the API
+- **report.csv:** CSV with a row for each processed row in the input CSV, with indication of whether it was created, updated or skipped. If a record was created, its new ROR ID is listed in the `ror_id` column. If a record was skipped, the reason(s) are listed in the `errors` column.
+- **new:** Directory containing JSON files for records that were successfully created (omitted if no records were created)
+- **updates:** A directory containing JSON files for records that were successfully updated (omitted if no records were updated)
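+
+For illustration, the multipart upload from step 2 above can also be made with Python's `requests` library (file name and credentials are placeholders; the form field must be named `file`, as in the curl example):
+
+    import requests
+
+    headers = {"Route-User": "[API USER]", "Token": "[API TOKEN]"}
+    with open("bulk_changes.csv", "rb") as f:  # placeholder path to your CSV
+        rsp = requests.post("https://api.dev.ror.org/v2/bulkupdate", files={"file": f}, headers=headers)
+    print(rsp.json())  # summary message with the S3 URL and row counts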
+
+### CSV formatting
+
+#### Column headings & values
+
+- All column headings below must be included, but they are not required to contain values
+- Columns can be in any order
+- Additional columns can be included, at any position
+- For new records, `ror_id` column value must be empty
+- For updated records, `ror_id` column must contain the ROR ID for the existing production record you would like to update
+- For list fields, multiple values should be separated with `;` (with or without a trailing space). The last value in a list can be followed by a trailing `;` (or not - behavior is the same in both cases).
+- For values with language codes, specify the language by adding `*` followed by the ISO-639 reference name or 2-char code, ex `*French` or `*FR`. Use reference names from the [Python library iso639](https://github.com/LBeaudoux/iso639/blob/master/iso639/data/ISO-639-2_utf-8.txt)
+- Values in `status` and `types` fields can be specified using any casing, but will be converted to lowercase
+
+| Column name | Value format | Example | Notes |
+| ----------- | ------------ | ------- | ----- |
+| id | Single | [https://ror.org/01an7q238](https://ror.org/01an7q238) | ROR ID as full url; include for updated records only |
+| domains | Single or Multiple, separated by ; | foo.org<br>foo.org;bar.org | |
+| established | Single | 1973 | |
+| external_ids.type.fundref.all | Single or Multiple, separated by ; | 100000015<br>100000015;100006157 | |
+| external_ids.type.fundref.preferred | Single | 100000015 | Preferred value must exist in all |
+| external_ids.type.grid.all | Single or Multiple, separated by ; | grid.85084.31<br>grid.85084.31;grid.85084.58 | |
+| external_ids.type.grid.preferred | Single | grid.85084.31 | Preferred value must exist in all |
+| external_ids.type.isni.all | Single or Multiple, separated by ; | 0000 0001 2342 3717<br>0000 0001 2342 3717;0000 0001 2342 3525 | |
+| external_ids.type.isni.preferred | Single | 0000 0001 2342 3717 | Preferred value must exist in all |
+| external_ids.type.wikidata.all | Single or Multiple, separated by ; | Q217810<br>Q217810;Q6422983 | |
+| external_ids.type.wikidata.preferred | Single | Q217810 | Preferred value must exist in all |
+| links.type.website | Single or Multiple, separated by ; | https://foo.org<br>https://foo.org;https://foo.bar.org | |
+| links.type.wikipedia | Single or Multiple, separated by ; | http://en.wikipedia.org/wiki/foo<br>http://en.wikipedia.org/wiki/foo;http://en.wikipedia.org/wiki/bar | |
+| locations.geonames_id | Single or Multiple, separated by ; | 6252001<br>6252001;6252002 | |
+| names.types.acronym | Single or Multiple, separated by ; | US<br>US;UoS | |
+| names.types.alias | Single or Multiple, separated by ; | Stuff University<br>Stuff University;U Stuff | |
+| names.types.label | Single or Multiple, separated by ; | Universidad de Stuff\*Spanish<br>Universidad de Stuff\*Spanish;Université de Stuff\*French | Language can be specified for any name type using its full ISO 639-2 reference name or 2-char code, ex \*French or \*FR. Python iso639 is used for language code conversion, and it has some quirks. See mapping of language names to codes https://github.com/LBeaudoux/iso639/blob/master/iso639/data/ISO-639-2_utf-8.txt |
+| names.types.ror_display | Single | University of Stuff | |
+| status | Single | active | Any casing allowed; will be converted to lowercase |
+| types | Single or Multiple, separated by ; | government<br>government;education | Any casing allowed; will be converted to lowercase |
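+
+For illustration, the `*`-suffix language convention above can be resolved with the iso639 library that the codebase imports (a sketch; the cell value is hypothetical):
+
+    from iso639 import Lang
+
+    cell = "Universidad de Stuff*Spanish"
+    value, _, lang = cell.partition("*")
+    code = Lang(lang).pt1 if lang else None  # value == "Universidad de Stuff", code == "es"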

+
+#### Update syntax
+
+- For new records, specify just the desired field values in the CSV (no actions)
+- For updated records, use the syntax `add==`, `delete==`, `delete` or `replace==` to specify the action to be taken on specified values, ex `add==Value to be added` or `add==Value to be added;Another value to be added`
+- Add and delete actions can be combined, ex `add==Value to be added;Another value to be added;delete==Value to be deleted`. Add or delete cannot be combined with replace, because replace would overwrite anything specified by add/delete actions
+- Some actions are not allowed for certain fields (see below); invalid actions or invalid combinations of actions will result in the row being skipped. Errors are recorded in report.csv.
+- When processing a given field, delete actions are processed first, followed by add actions, regardless of how they are ordered in the submitted CSV
+
+| Action | Behavior | Allowed fields | Notes |
+| ------ | -------- | -------------- | ----- |
+| add== | Add specified item(s) to multi-item field | domains, external_ids.type.fundref.all, external_ids.type.grid.all, external_ids.type.isni.all, external_ids.type.wikidata.all, links.type.website, links.type.wikipedia, locations, names.types.acronym, names.types.alias, names.types.label, types | Values to be added are validated to ensure they don't already exist in field, however, only exact matches are checked. Variants with different leading/trailing characters and/or diacritics are not matched.<br><br>add== has special behavior for external_ids.[type].all and names fields - see below. |
+| delete== | Remove specified item(s) from multi-item field | domains, external_ids.type.fundref.all, external_ids.type.grid.all, external_ids.type.isni.all, external_ids.type.wikidata.all, links.type.website, links.type.wikipedia, locations, names.types.acronym, names.types.alias, names.types.label, types | Values to be deleted are validated to ensure they exist in field, however, only exact matches are checked. Variants with different leading/trailing characters and/or diacritics are not matched.<br><br>delete== has special behavior for external_ids.[type].all and names fields - see below |
+| delete | Remove all values from field (single or multi-item field) | All optional fields. Not allowed for required fields: locations, names.types.ror_display, status, types | |
+| replace== | Replace all value(s) with specified value(s) (single or multi-item field) | All fields | replace== has special behavior for external_ids.[type].all and names fields - see below |
+| no action (only value supplied) | Replace existing value or add value to currently empty field (single-item fields) | established, external_ids preferred, status, names.types.ror_display | Same action as replace |
+#### External IDs
+
+| Action | external_ids.[TYPE].all | external.[TYPE].preferred |
+| ------ | ----------------------- | ------------------------- |
+| add== | If an external_ids object with the type exists, value(s) are added to external_ids.[TYPE].all. If an external_ids object with the type does not exist, a new object is added with the value(s) in external_ids.[TYPE].all. A preferred ID is NOT automatically added - it must be explicitly specified in external.[TYPE].preferred. | Not allowed. Add== action is only allowed for multi-value fields |
+| delete== | Value(s) are removed from external_ids.[TYPE].all. After all changes to external_ids.[TYPE].all and external.[TYPE].preferred are calculated, if the result is that BOTH fields are empty the entire external_ids object is deleted. Preferred ID is NOT automatically removed if the value is removed from external_ids.[TYPE].all - it must be explicitly deleted from external.[TYPE].preferred | Not allowed. Delete== action is only allowed for multi-value fields |
+| replace== | Replaces any existing value(s) in external_ids.[TYPE].all or populates field if no value(s) exist. Preferred ID is NOT automatically removed if the value is removed from external_ids.[TYPE].all - it must be explicitly deleted from external.[TYPE].preferred | Replaces any existing value from external.[TYPE].preferred or populates field if no value exists. Value is NOT automatically added to external_ids.[TYPE].all - it must be explicitly added to external.[TYPE].all |
+| delete | Deletes all existing values from external_ids.[TYPE].all. Preferred ID is NOT automatically removed from external_ids.[TYPE].all - it must be explicitly deleted from external.[TYPE].all. After all changes to external_ids.[TYPE].all and external.[TYPE].preferred are calculated, if the result is that BOTH fields are empty the entire external_ids object is deleted. | Deletes any existing value in external.[TYPE].preferred.<br>Value is NOT automatically removed from external_ids.[TYPE].all - it must be explicitly deleted from external.[TYPE].all |
+| no action (only value supplied) | Same as replace== | Same as replace== |
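+
+For example, applying a hypothetical `add==100006157` to the `external_ids.type.fundref.all` column of a record whose existing fundref object is `{"type": "fundref", "all": ["100000015"], "preferred": "100000015"}` would yield `{"type": "fundref", "all": ["100000015", "100006157"], "preferred": "100000015"}` - the preferred ID is not changed unless it is edited explicitly.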
+
+#### Names
+
+| Action | names.[TYPE] |
+| ------ | ------------ |
+| add== | If a names object with the exact same value AND language exists, the type is added to types field. If not, a new names object is added with the specified value, language and type. If no language is specified, the lang field is null. NOTE: because matching is based on the combination of value AND lang, a case like "value": "University of Foo", "lang": null does not match "value": "University of Foo", "lang": "en" |
+| delete== | If the name to be removed has multiple types in its types field, the specified type is removed from the types field, but the names object remains. If the result of all changes is a names object with no types, the entire names object is removed. |
+| replace== | Names of the specified type are removed according to the delete== rules above, then added according to the add== rules above. Depending on the existing values on the record and the values specified in replace==, that can result in some names objects added, some removed and/or some with changes to their types field. |
+| delete | Removes the specified type from all names objects that currently have that type in their types field. If the result of all changes is a names object with no types, the entire names object is removed. |
+| no action (only value supplied) | Same as replace== |
+
+
+
+
diff --git a/rorapi/common/csv_bulk.py b/rorapi/common/csv_bulk.py
index 3819b67..3bd7288 100644
--- a/rorapi/common/csv_bulk.py
+++ b/rorapi/common/csv_bulk.py
@@ -91,13 +91,11 @@ def process_csv(csv_file, version):
             # create zip file
             zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name)
             # upload to S3
-            '''
             try:
                 DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip')
                 zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip"
             except Exception as e:
                 error = f"Error uploading zipfile to S3: {e}"
-            '''
         except Exception as e:
             error = f"Unexpected error generating records: {e}"
diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py
index 0c97f55..4cc7539 100644
--- a/rorapi/common/urls.py
+++ b/rorapi/common/urls.py
@@ -3,7 +3,7 @@
 from rest_framework.documentation import include_docs_urls
 
 from . 
import views -from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,FileUploadView +from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,BulkUpdate urlpatterns = [ # Health check @@ -19,6 +19,6 @@ url(r"^docs/", include_docs_urls(title="Research Organization Registry")), # Prometheus url("", include("django_prometheus.urls")), - re_path(r"^(?P(v1|v2))\/upload$", FileUploadView.as_view()), - path('upload/', FileUploadView.as_view()) + re_path(r"^(?P(v1|v2))\/bulkupdate$", BulkUpdate.as_view()), + ] diff --git a/rorapi/common/views.py b/rorapi/common/views.py index fabf39e..f39c26f 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -210,7 +210,7 @@ def get(self, request, branch, version=REST_FRAMEWORK["DEFAULT_VERSION"]): return Response({"status": msg["status"], "msg": msg["msg"]}, status=st) -class FileUploadView(APIView): +class BulkUpdate(APIView): permission_classes = [OurTokenPermission] parser_classes = (MultiPartParser, FormParser) @@ -226,9 +226,13 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): csv_validation_errors = validate_csv(file_object) if len(csv_validation_errors) == 0: file_object.seek(0) - csv_process_error, msg = process_csv(file_object, version) - if csv_process_error: - errors = Errors([csv_process_error]) + process_csv_error, msg = process_csv(file_object, version) + print("views msg") + print(msg) + print("views type msg") + print(type(msg)) + if process_csv_error: + errors = Errors([process_csv_error]) else: errors=Errors(csv_validation_errors) else: diff --git a/rorapi/tests/tests_unit/tests_views_v2.py b/rorapi/tests/tests_unit/tests_views_v2.py index c58c76e..f9d8d8c 100644 --- a/rorapi/tests/tests_unit/tests_views_v2.py +++ b/rorapi/tests/tests_unit/tests_views_v2.py @@ -291,7 +291,7 @@ def test_heartbeat_success(self, search_mock): response = self.client.get('/v2/heartbeat') self.assertEquals(response.status_code, 200) -class FileUploadViewTestCase(SimpleTestCase): +class BulkUpdateViewTestCase(SimpleTestCase): def setUp(self): self.csv_errors_empty = [] self.csv_errors_error = ['error'] @@ -301,27 +301,27 @@ def setUp(self): @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') @mock.patch('rorapi.common.views.validate_csv') @mock.patch('rorapi.common.views.process_csv') - def test_file_upload_success(self, permission_mock, validate_csv_mock, process_csv_msg_mock): + def test_bulkupdate_success(self, process_csv_mock, validate_csv_mock, permission_mock): permission_mock.return_value = True validate_csv_mock.return_value = self.csv_errors_empty - process_csv_msg_mock.return_value = self.process_csv_msg + process_csv_mock.return_value = None, self.process_csv_msg with open(os.path.join(os.path.dirname(__file__), 'data/test_upload_csv.csv'), 'rb') as f: - response = self.client.post('/v2/upload', {"file":f}) + response = self.client.post('/v2/bulkupdate', {"file":f}) self.assertEquals(response.status_code, 201) @mock.patch('rorapi.common.views.OurTokenPermission.has_permission') @mock.patch('rorapi.common.views.validate_csv') - def test_file_upload_fail_error(self, permission_mock, validate_csv_mock): + def test_bulkupdate_fail_error(self, validate_csv_mock, permission_mock): permission_mock.return_value = True validate_csv_mock.return_value = self.csv_errors_error - response = self.client.post('/v2/upload') + response = self.client.post('/v2/bulkupdate') self.assertEquals(response.status_code, 400) 
@mock.patch('rorapi.common.views.OurTokenPermission.has_permission') - def test_file_upload_fail_no_permission(self, permission_mock): + def test_bulkupdate_fail_no_permission(self, permission_mock): permission_mock.return_value = False - response = self.client.post('/v2/upload') + response = self.client.post('/v2/bulkupdate') self.assertEquals(response.status_code, 403) class CreateOrganizationViewTestCase(SimpleTestCase): From fc09deaaf17f6c5e5faca79977fa3ab3da102d0a Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 16:25:11 -0500 Subject: [PATCH 22/38] resolve merge conflicts --- rorapi/common/urls.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py index a85730f..3589ab9 100644 --- a/rorapi/common/urls.py +++ b/rorapi/common/urls.py @@ -2,12 +2,8 @@ from django.urls import path, re_path from rest_framework.documentation import include_docs_urls -from . import views -<<<<<<< HEAD -from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,BulkUpdate -======= -from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump ->>>>>>> dev +from . import viewss +from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump,BulkUpdate urlpatterns = [ # Health check From 881e19302fa08f87eb7adb40334fa2b06c94b084 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 16:33:22 -0500 Subject: [PATCH 23/38] resolve merge conflicts --- rorapi/common/urls.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py index 3589ab9..8ec3a87 100644 --- a/rorapi/common/urls.py +++ b/rorapi/common/urls.py @@ -1,8 +1,6 @@ from django.conf.urls import url, include from django.urls import path, re_path from rest_framework.documentation import include_docs_urls - -from . import viewss from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump,BulkUpdate urlpatterns = [ From 42bdd8410e6594bd36d1e6d9c68a1c44a354c77e Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 16:38:26 -0500 Subject: [PATCH 24/38] resolve merge conflicts --- rorapi/common/urls.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py index 8ec3a87..0660255 100644 --- a/rorapi/common/urls.py +++ b/rorapi/common/urls.py @@ -1,6 +1,7 @@ from django.conf.urls import url, include from django.urls import path, re_path from rest_framework.documentation import include_docs_urls +from . 
import views from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump,BulkUpdate urlpatterns = [ From 2551b215bad9aeca808597322211fdfa7f0dca4f Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 16:49:33 -0500 Subject: [PATCH 25/38] fix dump env var in workflow --- .github/workflows/dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index e7da136..3e57a5d 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -69,7 +69,7 @@ jobs: - name: Setup working-directory: ./ror-api run: | - python manage.py setup LATEST_DUMP_FILE -t + python manage.py setup $LATEST_DUMP_FILE -t # Dump file temp hard coded for v2 beta # Pulled from ror-data-test per settings.py config - name: Test From e8f19f3eddf0661f52cc0ad1f8f35d071a922e61 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 17:00:52 -0500 Subject: [PATCH 26/38] revert to hard coding dump file for tests --- .github/workflows/dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 3e57a5d..789a240 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -69,7 +69,7 @@ jobs: - name: Setup working-directory: ./ror-api run: | - python manage.py setup $LATEST_DUMP_FILE -t + python manage.py setup v1.42-2024-02-21-ror-data -t # Dump file temp hard coded for v2 beta # Pulled from ror-data-test per settings.py config - name: Test From 710c662ba11eabf7aef5bac867e9ca757c0c5bd1 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 17:51:20 -0500 Subject: [PATCH 27/38] log out error for testing --- rorapi/common/views.py | 2 +- rorapi/tests/tests_unit/tests_views_v2.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/rorapi/common/views.py b/rorapi/common/views.py index 7a87a2e..b0b5b24 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -262,7 +262,7 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): else: errors = Errors(["Version {} does not support creating records".format(version)]) if errors is not None: - print(errors) + print(errors.__dict__) return Response( ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST ) diff --git a/rorapi/tests/tests_unit/tests_views_v2.py b/rorapi/tests/tests_unit/tests_views_v2.py index 6cd7331..a36325e 100644 --- a/rorapi/tests/tests_unit/tests_views_v2.py +++ b/rorapi/tests/tests_unit/tests_views_v2.py @@ -302,6 +302,7 @@ def setUp(self): @mock.patch('rorapi.common.views.validate_csv') @mock.patch('rorapi.common.views.process_csv') def test_bulkupdate_success(self, process_csv_mock, validate_csv_mock, permission_mock): + permission_mock.return_value = True validate_csv_mock.return_value = self.csv_errors_empty process_csv_mock.return_value = None, self.process_csv_msg @@ -338,6 +339,8 @@ class UpdateOrganizationViewTestCase(SimpleTestCase): def test_create_record_fail_no_permission(self, permission_mock): permission_mock.return_value = False response = self.client.put('/v2/organizations/foo') + self.assertEquals(response.status_code, 403) + class IndexRorDumpViewTestCase(SimpleTestCase): def setUp(self): self.success_msg = "SUCCESS: ROR dataset vX.XX-XXXX-XX-XX-ror-data indexed in version X. 
Using test repo: X" From ea8bcfa3b902f4c540546a910bd39acc465c3043 Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 18:04:42 -0500 Subject: [PATCH 28/38] update mimetypes so tests will pass --- rorapi/common/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rorapi/common/views.py b/rorapi/common/views.py index b0b5b24..591ec7a 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -241,7 +241,7 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): file_object = request.data['file'] mime_type = magic.from_buffer(file_object.read(2048)) print(mime_type) - if "ASCII text" in mime_type or "UTF-8 Unicode text" in mime_type or "CSV text" in mime_type: + if "ASCII text" in mime_type or "UTF-8 text" in mime_type or "UTF-8 Unicode text" in mime_type or "CSV text" in mime_type: file_object.seek(0) csv_validation_errors = validate_csv(file_object) if len(csv_validation_errors) == 0: From 8b1c5eb88d6bc4edf07a4f10c082c40f7809ac7c Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 18:31:10 -0500 Subject: [PATCH 29/38] use makedirs for case where data dir doesn't exist in ecs --- rorapi/common/csv_bulk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rorapi/common/csv_bulk.py b/rorapi/common/csv_bulk.py index 3bd7288..f822d93 100644 --- a/rorapi/common/csv_bulk.py +++ b/rorapi/common/csv_bulk.py @@ -17,7 +17,7 @@ def save_record_file(ror_id, updated, json_obj, dir_name): dir_path = os.path.join(DATA['DIR'],dir_name) if not os.path.exists(dir_path): - os.mkdir(dir_path) + os.makedirs(dir_path) subdir = 'updates' if updated else 'new' if not os.path.exists(os.path.join(dir_path, subdir)): os.mkdir(os.path.join(dir_path, subdir)) From 3d998953ff7971bc1317319f139c7d37644c3a0b Mon Sep 17 00:00:00 2001 From: lizkrznarich Date: Mon, 11 Mar 2024 19:10:57 -0500 Subject: [PATCH 30/38] change zipfile name and update readme --- README.md | 15 +++++++++------ rorapi/common/csv_bulk.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7d6fb83..e3e9ea1 100644 --- a/README.md +++ b/README.md @@ -169,7 +169,7 @@ Making a POST request `/organizations` performs the following actions: 2. Make a POST request to `/organizations` with the JSON file as the data payload. Credentials are required for POST requests. - curl -X POST -H "Route-User: [API USER]" -H "Token: [API TOKEN]" "http://api.dev.ror.org/v2/organizations" -d @[PATH TO JSON FILE].json -H "Content-Type: application/json" + curl -X POST -H "Route-User: [API USER]" -H "Token: [API TOKEN]" "http://api.dev.ror.org/v2/organizations" -d @[PATH TO JSON FILE].json -H "Content-Type: application/json" 3. The response is a schema-valid JSON object populated with the submitted metadata as well as a ROR ID and Geonames details retrieved from Geonames. Fields and values will be ordered as in the ROR API and optional fields will be populated with empty or null values. Redirect the response to a file for use in the ROR data deployment process. **The resulting record is NOT added to the the ROR index.** @@ -191,7 +191,7 @@ Making a PUT request `/organizations/[ROR ID]` performs the following actions: 2. Make a PUT request to `/organizations/[ROR ID]` with the JSON file as the data payload. Credentials are required for PUT requests. The ROR ID specified in the request path must match the ROR ID in the `id` field of the JSON data. 
curl -X PUT -H "Route-User: [API USER]" -H "Token: [API TOKEN]" "http://api.dev.ror.org/v2/organizations/[ROR ID]" -d @[PATH TO JSON FILE].json -H "Content-Type: application/json"
+    curl -X PUT -H "Route-User: [API USER]" -H "Token: [API TOKEN]" "http://api.dev.ror.org/v2/organizations/[ROR ID]" -d @[PATH TO JSON FILE].json -H "Content-Type: application/json"
 
@@ -217,11 +217,11 @@ Making a POST request `/organizations/bulkupdate` performs the following actions
 
 2. Make a POST request to `/bulkupdate` with the filepath specified in the file field of a multi-part form payload. Credentials are required for POST requests.
 
-    curl -X POST -H "Route-User: [API USER]" -H "Token: [API TOKEN]" 'https://api.dev.ror.org/v2/bulkupdate' --form 'file=@"[PATH TO CSV FILE].csv"'
+    curl -X POST -H "Route-User: [API USER]" -H "Token: [API TOKEN]" 'https://api.dev.ror.org/v2/bulkupdate' --form 'file=@"[PATH TO CSV FILE].csv"'
 
 3. The response is a summary with counts of records created/updated/skipped and a link to download the generated files from AWS S3.
 
-    {"file":"https://s3.eu-west-1.amazonaws.com/2024-03-09-15:56:26-ror-records.zip","rows processed":208,"created":207,"updated":0,"skipped":1}
+    {"file":"https://s3.eu-west-1.amazonaws.com/2024-03-09_15_56_26-ror-records.zip","rows processed":208,"created":207,"updated":0,"skipped":1}
 
 The zipped file contains the following items:
 - **input.csv:** Copy of the CSV submitted to the API
@@ -282,7 +282,10 @@ The zipped file contains the following items:
 | delete | Remove all values from field (single or multi-item field) | All optional fields. Not allowed for required fields: locations, names.types.ror_display, status, types | |
 | replace== | Replace all value(s) with specified value(s) (single or multi-item field) | All fields | replace== has special behavior for external_ids.[type].all and names fields - see below |
 | no action (only value supplied) | Replace existing value or add value to currently empty field (single-item fields) | established, external_ids preferred, status, names.types.ror_display | Same action as replace |
-#### External IDs
+#### Fields with special behaviors
+For some fields that contain a list of dictionaries as their value, update actions have special behaviors.
+
+##### External IDs
 
 | Action | external_ids.[TYPE].all | external.[TYPE].preferred |
@@ -292,7 +295,7 @@ The zipped file contains the following items:
 | replace== | Replaces any existing value(s) in external_ids.[TYPE].all or populates field if no value(s) exist. Preferred ID is NOT automatically removed if the value is removed from external_ids.[TYPE].all - it must be explicitly deleted from external.[TYPE].preferred | Replaces any existing value from external.[TYPE].preferred or populates field if no value exists. Value is NOT automatically added to external_ids.[TYPE].all - it must be explicitly added to external.[TYPE].all |
 | delete | Deletes all existing values from external_ids.[TYPE].all. Preferred ID is NOT automatically removed from external_ids.[TYPE].all - it must be explicitly deleted from external.[TYPE].all. After all changes to external_ids.[TYPE].all and external.[TYPE].preferred are calculated, if the result is that BOTH fields are empty the entire external_ids object is deleted. | Deletes any existing value in external.[TYPE].preferred.<br>Value is NOT automatically removed from external_ids.[TYPE].all - it must be explicitly deleted from external.[TYPE].all |
From cbd0eedfbfa51cca0be3f6515aa2f71665505ed0 Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Thu, 14 Mar 2024 17:10:23 -0500
Subject: [PATCH 31/38] add validate only param

---
 rorapi/common/csv_bulk.py | 60 +++++++++++++++++++++++----------------
 rorapi/common/urls.py     |  3 +-
 rorapi/common/views.py    | 17 +++++++----
 3 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/rorapi/common/csv_bulk.py b/rorapi/common/csv_bulk.py
index 9f5a02e..c2c42ab 100644
--- a/rorapi/common/csv_bulk.py
+++ b/rorapi/common/csv_bulk.py
@@ -25,7 +25,7 @@ def save_record_file(ror_id, updated, json_obj, dir_name):
     with open(full_path, "w") as outfile:
         json.dump(json_obj, outfile, ensure_ascii=False, indent=2)
 
-def save_report_file(report, report_fields, csv_file, dir_name):
+def save_report_file(report, report_fields, csv_file, dir_name, validate_only):
     dir_path = os.path.join(DATA['DIR'],dir_name)
     if not os.path.exists(dir_path):
         os.mkdir(dir_path)
@@ -34,14 +34,15 @@ def save_report_file(report, report_fields, csv_file, dir_name, validate_only):
         writer = csv.DictWriter(csvfile, fieldnames=report_fields)
         writer.writeheader()
         writer.writerows(report)
-    # save copy of input file
-    filepath = os.path.join(dir_path, 'input.csv')
-    csv_file.seek(0)
-    with open(filepath, 'wb+') as f:
-        for chunk in csv_file.chunks():
-            f.write(chunk)
+    if not validate_only:
+        # save copy of input file
+        filepath = os.path.join(dir_path, 'input.csv')
+        csv_file.seek(0)
+        with open(filepath, 'wb+') as f:
+            for chunk in csv_file.chunks():
+                f.write(chunk)
 
-def process_csv(csv_file, version):
+def process_csv(csv_file, version, validate_only):
     print("Processing CSV")
     dir_name = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") + "-ror-records"
     success_msg = None
@@ -77,8 +78,9 @@ def process_csv(csv_file, version, validate_only):
                 serializer = OrganizationSerializerV2(v2_record)
                 json_obj = json.loads(JSONRenderer().render(serializer.data))
                 print(json_obj)
-                #create file
-                file = save_record_file(ror_id, updated, json_obj, dir_name)
+                if not validate_only:
+                    #create file
+                    file = save_record_file(ror_id, updated, json_obj, dir_name)
         else:
             action = 'skipped'
             skipped_count += 1
@@ -86,21 +88,29 @@ def process_csv(csv_file, version, validate_only):
         row_num += 1
     if new_count > 0 or updated_count > 0 or skipped_count > 0:
         try:
-            #create report file
-            save_report_file(report, report_fields, csv_file, dir_name)
-            # create zip file
-            zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name)
-            # upload to S3
-            try:
-                DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip')
-                zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip"
-            except Exception as e:
-                error = f"Error uploading zipfile to S3: {e}"
+            if validate_only:
+                try:
+                    save_report_file(report, report_fields, csv_file, dir_name, validate_only)
+                    success_msg = os.path.join(DATA['DIR'], dir_name, 'report.csv')
+                except Exception as e:
+                    error = f"Error creating validation report: {e}"
+            else:
+                #create report file
+                save_report_file(report, report_fields, csv_file, dir_name, validate_only)
+                # create zip file
+                zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name)
+                # upload to S3
+                try:
+                    DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip')
+                    zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip"
+                    success_msg = {"file": zipfile,
+                                   "rows processed": new_count + updated_count + skipped_count,
+                                   "created": new_count,
+                                   "updated": updated_count,
+                                   "skipped": skipped_count}
+                except Exception as e:
+                    error = f"Error uploading zipfile to S3: {e}"
         except Exception as e:
             error = f"Unexpected error generating records: {e}"
-        success_msg = {"file": zipfile,
-                       "rows processed": new_count + updated_count + skipped_count,
-                       "created": new_count,
-                       "udpated": updated_count,
-                       "skipped": skipped_count}
+
     return error, success_msg
\ No newline at end of file
diff --git a/rorapi/common/urls.py b/rorapi/common/urls.py
index 0660255..b9aa57c 100644
--- a/rorapi/common/urls.py
+++ b/rorapi/common/urls.py
@@ -2,7 +2,8 @@
 from django.urls import path, re_path
 from rest_framework.documentation import include_docs_urls
 from . import views
-from rorapi.common.views import HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump,BulkUpdate
+from rorapi.common.views import (
+    HeartbeatView,GenerateAddress,GenerateId,IndexData,IndexDataDump,BulkUpdate)
 
 urlpatterns = [
     # Health check
diff --git a/rorapi/common/views.py b/rorapi/common/views.py
index 591ec7a..e163012 100644
--- a/rorapi/common/views.py
+++ b/rorapi/common/views.py
@@ -1,3 +1,4 @@
+import csv
 from rest_framework import viewsets, routers, status
 from rest_framework.response import Response
 from django.http import HttpResponse
@@ -125,7 +126,6 @@ def create(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
         else:
             errors = Errors(["Version {} does not support creating records".format(version)])
         if errors is not None:
-            print(errors)
             return Response(
                 ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST
             )
@@ -235,6 +235,7 @@ class BulkUpdate(APIView):
     parser_classes = (MultiPartParser, FormParser)
 
     def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
+        validate_only = False
         errors = None
         if version == 'v2':
             if request.data:
@@ -246,11 +247,10 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
                     csv_validation_errors = validate_csv(file_object)
                     if len(csv_validation_errors) == 0:
                         file_object.seek(0)
-                        process_csv_error, msg = process_csv(file_object, version)
-                        print("views msg")
-                        print(msg)
-                        print("views type msg")
-                        print(type(msg))
+                        params = request.GET.dict()
+                        if "validate" in params:
+                            validate_only = True
+                        process_csv_error, msg = process_csv(file_object, version, validate_only)
                         if process_csv_error:
                             errors = Errors([process_csv_error])
                         else:
@@ -266,6 +266,11 @@ def post(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
             return Response(
                 ErrorsSerializer(errors).data, status=status.HTTP_400_BAD_REQUEST
             )
+        if validate_only:
+            with open(msg) as file:
+                response = HttpResponse(file, content_type='text/csv')
+                response['Content-Disposition'] = 'attachment; filename=report.csv'
+            return response
 
         return Response(
             msg,
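After patch 31, `process_csv` returns one of two success shapes depending on the mode. A hedged sketch of both call sites (the values shown are illustrative, echoing the README example):

    # Validate-only: success_msg is the local path of the generated report.
    error, msg = process_csv(file_object, "v2", True)
    # msg == os.path.join(DATA['DIR'], dir_name, 'report.csv')

    # Full run: success_msg is a summary dict pointing at the uploaded zip.
    error, msg = process_csv(file_object, "v2", False)
    # msg == {"file": "https://s3.eu-west-1.amazonaws.com/...-ror-records.zip",
    #         "rows processed": 208, "created": 207, "updated": 0, "skipped": 1}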
From b0dd7ef17873c623658d73fda820897be74c63be Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Thu, 14 Mar 2024 17:14:29 -0500
Subject: [PATCH 32/38] update readme

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index e3e9ea1..953426d 100644
--- a/README.md
+++ b/README.md
@@ -229,6 +229,14 @@ The zipped file contains the following items:
 - **new:** Directory containing JSON files for records that were successfully created (omitted if no records were created)
 - **updates:** A directory containing JSON files for records that were successfully updated (omitted if no records were updated)
 
+#### Validate only
+Use the `?validate` parameter to simulate running the bulkupdate request without actually generating files. The response is the same CSV report described above.
+
+1. Make a POST request to `/bulkupdate?validate` with the filepath specified in the file field of a multi-part form payload. Credentials are required for POST requests. Make sure to redirect the output to a CSV file on your machine.
+
+        curl -X POST -H "Route-User: [API USER]" -H "Token: [API TOKEN]" 'https://api.dev.ror.org/v2/bulkupdate?validate' --form 'file=@"[PATH TO CSV FILE].csv"' > report.csv
+
+
 ### CSV formatting
 
 #### Column headings & values
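For readers scripting against the endpoint, here is an equivalent of the documented curl call using Python requests (the file names are placeholders and this client is not part of the repository; the host, headers, and `?validate` parameter are those documented above):

    import requests

    with open("records.csv", "rb") as f:
        rsp = requests.post(
            "https://api.dev.ror.org/v2/bulkupdate",
            params={"validate": ""},  # serializes to ?validate=
            headers={"Route-User": "[API USER]", "Token": "[API TOKEN]"},
            files={"file": f},
        )
    with open("report.csv", "wb") as out:
        out.write(rsp.content)  # the CSV validation report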
From d280ab674891226a7ea9cc5ff51445a416b5d30a Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Fri, 15 Mar 2024 13:52:06 -0500
Subject: [PATCH 33/38] fix v2 indexing for nested names_ids doc

---
 rorapi/management/commands/indexror.py | 35 ++++++++++++++++++++------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/rorapi/management/commands/indexror.py b/rorapi/management/commands/indexror.py
index 91796ad..b6d643e 100644
--- a/rorapi/management/commands/indexror.py
+++ b/rorapi/management/commands/indexror.py
@@ -13,7 +13,7 @@
 from django.core.management.base import BaseCommand
 from elasticsearch import TransportError
 
-def get_nested_names(org):
+def get_nested_names_v1(org):
     yield org['name']
     for label in org['labels']:
         yield label['label']
@@ -22,8 +22,11 @@ def get_nested_names_v1(org):
     for acronym in org['acronyms']:
         yield acronym
 
+def get_nested_names_v2(org):
+    for name in org['names']:
+        yield name['value']
 
-def get_nested_ids(org):
+def get_nested_ids_v1(org):
     yield org['id']
     yield re.sub('https://', '', org['id'])
     yield re.sub('https://ror.org/', '', org['id'])
@@ -34,6 +37,14 @@ def get_nested_ids_v1(org):
         for eid in ext_id['all']:
             yield eid
 
+def get_nested_ids_v2(org):
+    yield org['id']
+    yield re.sub('https://', '', org['id'])
+    yield re.sub('https://ror.org/', '', org['id'])
+    for ext_id in org['external_ids']:
+        for eid in ext_id['all']:
+            yield eid
+
 def prepare_files(path, local_file):
     data = []
     err = {}
@@ -147,12 +158,20 @@ def index(dataset, version):
                         '_id': org['id']
                     }
                 })
-                org['names_ids'] = [{
-                    'name': n
-                } for n in get_nested_names(org)]
-                org['names_ids'] += [{
-                    'id': n
-                } for n in get_nested_ids(org)]
+                if 'v2' in index:
+                    org['names_ids'] = [{
+                        'name': n
+                    } for n in get_nested_names_v2(org)]
+                    org['names_ids'] += [{
+                        'id': n
+                    } for n in get_nested_ids_v2(org)]
+                else:
+                    org['names_ids'] = [{
+                        'name': n
+                    } for n in get_nested_names_v1(org)]
+                    org['names_ids'] += [{
+                        'id': n
+                    } for n in get_nested_ids_v1(org)]
                 body.append(org)
             ES7.bulk(body)
         except TransportError:
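What patch 33 produces for a v2 record, sketched with a made-up organization (v1 records keep the old name/labels/aliases/acronyms traversal):

    org = {
        "id": "https://ror.org/02baj6743",  # example value only
        "names": [{"value": "Example University", "types": ["ror_display"], "lang": "en"}],
        "external_ids": [{"type": "isni", "all": ["0000 0001 2096 9829"], "preferred": None}],
    }
    names_ids = [{"name": n} for n in get_nested_names_v2(org)]
    names_ids += [{"id": i} for i in get_nested_ids_v2(org)]
    # The id entries cover "https://ror.org/02baj6743", "ror.org/02baj6743",
    # and "02baj6743", plus every entry in external_ids[*].all.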
From 85bf36d75b8d80322127544c72265a955ca1ed5d Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Fri, 15 Mar 2024 17:29:43 -0500
Subject: [PATCH 34/38] create all dirs in path for report file

---
 rorapi/common/csv_bulk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rorapi/common/csv_bulk.py b/rorapi/common/csv_bulk.py
index c2c42ab..3fc0587 100644
--- a/rorapi/common/csv_bulk.py
+++ b/rorapi/common/csv_bulk.py
@@ -28,7 +28,7 @@ def save_report_file(report, report_fields, csv_file, dir_name, validate_only):
     dir_path = os.path.join(DATA['DIR'],dir_name)
     if not os.path.exists(dir_path):
-        os.mkdir(dir_path)
+        os.makedirs(dir_path)
     filepath = os.path.join(dir_path, 'report.csv')
     with open(filepath, 'w') as csvfile:
         writer = csv.DictWriter(csvfile, fieldnames=report_fields)

From 9e61f9c1e7e5c6d141d129bed609adbb674eaf28 Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Fri, 22 Mar 2024 16:40:14 -0500
Subject: [PATCH 35/38] don't include IDs in report if validate only

---
 rorapi/common/csv_bulk.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rorapi/common/csv_bulk.py b/rorapi/common/csv_bulk.py
index 3fc0587..ff42d70 100644
--- a/rorapi/common/csv_bulk.py
+++ b/rorapi/common/csv_bulk.py
@@ -84,6 +84,8 @@ def process_csv(csv_file, version, validate_only):
         else:
             action = 'skipped'
             skipped_count += 1
+        if validate_only and action == 'created':
+            ror_id = None
         report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": "; ".join(row_errors) if row_errors else ''})
         row_num += 1
     if new_count > 0 or updated_count > 0 or skipped_count > 0:
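The two-line guard in patch 35 blanks the ror_id column for created rows in a dry run, presumably because no record file is written and no ID is actually reserved, so reporting one would be misleading. Each report row has the shape below (example values are illustrative):

    # A report.csv row for a "created" record during a validate-only run:
    {"row": 2, "ror_id": "", "action": "created", "errors": ""}
    # The same row in a real run carries the newly minted ID:
    {"row": 2, "ror_id": "https://ror.org/02baj6743", "action": "created", "errors": ""}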
From d2cdb757ce4fa8f0cdaddbe3f7c257160a97a8e4 Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Mon, 1 Apr 2024 10:49:24 -0500
Subject: [PATCH 36/38] add validation error for name with lang code if name
 exists with no lang code

---
 rorapi/common/csv_update.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/rorapi/common/csv_update.py b/rorapi/common/csv_update.py
index d38ce2c..3b47fa5 100644
--- a/rorapi/common/csv_update.py
+++ b/rorapi/common/csv_update.py
@@ -286,9 +286,13 @@ def update_record_from_csv(csv_data, version):
         add_values = actions_values[UPDATE_ACTIONS['ADD']]
         for a in add_values:
             temp_names_match = [tn for tn in temp_names if (t in tn['types'] and tn['value'] == a['value'] and tn['lang'] == a['lang'])]
+            temp_names_null_lang_match = [tn for tn in temp_names if (tn['value'] == a['value'] and (tn['lang'] is None and a['lang'] is not None))]
             # check if value, lang and type already exist
-            if temp_names_match:
-                errors.append("Attempting to add names(s) that already exist: {}".format(a))
+            if temp_names_match or temp_names_null_lang_match:
+                if temp_names_match:
+                    errors.append("Attempting to add name(s) that already exist: {}".format(a))
+                if temp_names_null_lang_match:
+                    errors.append("Attempting to add name with lang code that already exist with no lang code: {}".format(a))
             else:
                 name_vals_match = [tn for tn in temp_names if (tn['value'] == a['value'] and tn['lang'] == a['lang'])]
                 if name_vals_match:

From 3425608014f7abc10f4ca0a01888b215416171c0 Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Mon, 1 Apr 2024 11:03:52 -0500
Subject: [PATCH 37/38] fix error msg typo

---
 rorapi/common/csv_update.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/rorapi/common/csv_update.py b/rorapi/common/csv_update.py
index 3b47fa5..363862f 100644
--- a/rorapi/common/csv_update.py
+++ b/rorapi/common/csv_update.py
@@ -285,14 +285,20 @@ def update_record_from_csv(csv_data, version):
     if UPDATE_ACTIONS['ADD'] in actions_values:
         add_values = actions_values[UPDATE_ACTIONS['ADD']]
         for a in add_values:
+            print('temp names:')
+            print(temp_names)
             temp_names_match = [tn for tn in temp_names if (t in tn['types'] and tn['value'] == a['value'] and tn['lang'] == a['lang'])]
+            print("temp names match:")
+            print(temp_names_match)
             temp_names_null_lang_match = [tn for tn in temp_names if (tn['value'] == a['value'] and (tn['lang'] is None and a['lang'] is not None))]
+            print("null lang match:")
+            print(temp_names_null_lang_match)
             # check if value, lang and type already exist
             if temp_names_match or temp_names_null_lang_match:
                 if temp_names_match:
                     errors.append("Attempting to add name(s) that already exist: {}".format(a))
                 if temp_names_null_lang_match:
-                    errors.append("Attempting to add name with lang code that already exist with no lang code: {}".format(a))
+                    errors.append("Attempting to add name with lang code that already exists with no lang code: {}".format(a))
             else:
                 name_vals_match = [tn for tn in temp_names if (tn['value'] == a['value'] and tn['lang'] == a['lang'])]
                 if name_vals_match:
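A worked example of the check introduced in patch 36, using hypothetical rows: an incoming name with a lang code is rejected when the record already holds the same value with no lang code, so the conflict has to be resolved explicitly rather than silently duplicated.

    temp_names = [{"value": "University of Example", "types": ["label"], "lang": None}]
    a = {"value": "University of Example", "types": ["alias"], "lang": "en"}

    temp_names_null_lang_match = [
        tn for tn in temp_names
        if tn["value"] == a["value"] and tn["lang"] is None and a["lang"] is not None
    ]
    # Non-empty, so "Attempting to add name with lang code that already exists
    # with no lang code: ..." is appended to errors instead of applying the add.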
From 647d5b69f3b1e32295e1ba373a0bee804cbae7ee Mon Sep 17 00:00:00 2001
From: lizkrznarich
Date: Mon, 1 Apr 2024 11:05:09 -0500
Subject: [PATCH 38/38] remove print statements

---
 rorapi/common/csv_update.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/rorapi/common/csv_update.py b/rorapi/common/csv_update.py
index 363862f..d8b44a6 100644
--- a/rorapi/common/csv_update.py
+++ b/rorapi/common/csv_update.py
@@ -285,14 +285,8 @@ def update_record_from_csv(csv_data, version):
     if UPDATE_ACTIONS['ADD'] in actions_values:
         add_values = actions_values[UPDATE_ACTIONS['ADD']]
         for a in add_values:
-            print('temp names:')
-            print(temp_names)
             temp_names_match = [tn for tn in temp_names if (t in tn['types'] and tn['value'] == a['value'] and tn['lang'] == a['lang'])]
-            print("temp names match:")
-            print(temp_names_match)
             temp_names_null_lang_match = [tn for tn in temp_names if (tn['value'] == a['value'] and (tn['lang'] is None and a['lang'] is not None))]
-            print("null lang match:")
-            print(temp_names_null_lang_match)
             # check if value, lang and type already exist
             if temp_names_match or temp_names_null_lang_match:
                 if temp_names_match: