Add a `clean` flag to bulk entity loading.

bulk_write() takes a new `clean` argument (default True) and passes
`cleaned=(not clean)` to model.get_proxy(), so clean=False skips value
validation. The `_bulk` API reads it from the `clean` query flag (forced to
True for non-admins), and the load_entities CLI command exposes it as
--clean/--unclean (default: --clean). The API docs now name the existing
query flag `safe` instead of `unsafe`, matching get_flag("safe").

diff --git a/aleph/logic/processing.py b/aleph/logic/processing.py
index 50054c39cb..ed2000e1d3 100644
--- a/aleph/logic/processing.py
+++ b/aleph/logic/processing.py
@@ -27,13 +27,13 @@ def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE)
     refresh_collection(collection.id)
 
 
-def bulk_write(collection, entities, safe=False, role_id=None, mutable=True):
+def bulk_write(collection, entities, safe=False, role_id=None, mutable=True, clean=True):
     """Write a set of entities - given as dicts - to the index."""
     # This is called mainly by the /api/2/collections/X/_bulk API.
     aggregator = get_aggregator(collection)
     writer = aggregator.bulk()
     for data in entities:
-        entity = model.get_proxy(data, cleaned=False)
+        entity = model.get_proxy(data, cleaned=(not clean))
         entity = collection.ns.apply(entity)
         if entity.id is None:
             raise InvalidData("No ID for entity", errors=entity.to_dict())
diff --git a/aleph/manage.py b/aleph/manage.py
index 0b35aaec94..406c6a2353 100644
--- a/aleph/manage.py
+++ b/aleph/manage.py
@@ -257,7 +257,12 @@ def xref(foreign_id):
     default=False,
     help="Mark entities mutable.",
 )
-def load_entities(foreign_id, infile, safe=False, mutable=False):
+@click.option(
+    "--clean/--unclean",
+    default=True,
+    help="Allow to disable (if --unclean) server-side values validation for all types.",
+)
+def load_entities(foreign_id, infile, safe=True, mutable=False, clean=True):
     """Load FtM entities from the specified iJSON file."""
     collection = ensure_collection(foreign_id, foreign_id)
 
@@ -274,7 +279,7 @@ def read_entities():
 
     role = Role.load_cli_user()
     for _ in bulk_write(
-        collection, read_entities(), safe=safe, mutable=mutable, role_id=role.id
+        collection, read_entities(), safe=safe, mutable=mutable, clean=clean, role_id=role.id
     ):
         pass
     reindex_collection(collection)
diff --git a/aleph/tests/test_collections_api.py b/aleph/tests/test_collections_api.py
index 45f58afcac..6992f03dc2 100644
--- a/aleph/tests/test_collections_api.py
+++ b/aleph/tests/test_collections_api.py
@@ -208,6 +208,39 @@ def test_bulk_api(self):
         res = self.client.post(url, headers=headers, data=json.dumps(data))
         assert res.status_code == 400, res
 
+    def test_bulk_api_flags(self):
+        _, headers = self.login(is_admin=True)
+        data = json.dumps(
+            [
+                {
+                    "id": "4345800498380953840",
+                    "schema": "LegalEntity",
+                    "properties": {"name": "Barbra W. Vaughn", "phone": "+19046426847"},
+                },
+                {
+                    "id": "7598743983789743598",
+                    "schema": "LegalEntity",
+                    "properties": {"name": "Marion C. Bostic", "phone": "123456"},
+                },
+            ]
+        )
+        url = "/api/2/collections/%s/_bulk?clean=False" % self.col.id
+        res = self.client.post(url, headers=headers, data=data)
+        assert res.status_code == 204, res
+        query = "/api/2/entities?filter:schemata=LegalEntity&filter:collection_id=%s"
+        query = query % self.col.id
+        res = self.client.get(query, headers=headers)
+        assert "phone" in res.json["results"][0]["properties"], res.json
+        assert "phone" in res.json["results"][1]["properties"], res.json
+        url = "/api/2/collections/%s/_bulk" % self.col.id
+        res = self.client.post(url, headers=headers, data=data)
+        assert res.status_code == 204, res
+        query = "/api/2/entities?filter:schemata=LegalEntity&filter:collection_id=%s"
+        query = query % self.col.id
+        res = self.client.get(query, headers=headers)
+        assert "phone" in res.json["results"][0]["properties"], res.json
+        assert "phone" not in res.json["results"][1]["properties"], res.json
+
     def test_bulk_entitysets_api(self):
         role, headers = self.login(is_admin=True)
         authz = Authz.from_role(role)
diff --git a/aleph/views/collections_api.py b/aleph/views/collections_api.py
index b1d3a77ec1..99f81eaeb8 100644
--- a/aleph/views/collections_api.py
+++ b/aleph/views/collections_api.py
@@ -235,10 +235,17 @@ def bulk(collection_id):
           minimum: 1
           type: integer
       - description: >-
-          This will disable checksum security measures in order to allow bulk
-          loading of document data.
+          safe=True means that the data cannot be trusted and that file checksums should be removed.
+          Flag is only available for admins. Default True.
         in: query
-        name: unsafe
+        name: safe
+        schema:
+          type: boolean
+      - description: >-
+          clean=True means that the data cannot be trusted and that the data should be cleaned from invalid values.
+          Flag is only available for admins. Default True.
+        in: query
+        name: clean
         schema:
           type: boolean
       requestBody:
@@ -262,19 +269,26 @@ def bulk(collection_id):
     if entityset is not None:
         entityset = get_entityset(entityset, request.authz.WRITE)
 
-    # This will disable checksum security measures in order to allow bulk
+    # This will disable (if False) checksum security measures in order to allow bulk
     # loading of document data:
     safe = get_flag("safe", default=True)
     # Flag is only available for admins:
     if not request.authz.is_admin:
         safe = True
 
+    # This will disable (if False) values validation for all types of all entities / properties
+    # (will pass cleaned=True to the model.get_proxy() in the aleph/logic/processing.py)
+    clean = get_flag("clean", default=True)
+    # Flag is only available for admins:
+    if not request.authz.is_admin:
+        clean = True
+
     # Let UI tools change the entities created by this:
     mutable = get_flag("mutable", default=False)
     entities = ensure_list(request.get_json(force=True))
     entity_ids = list()
     for entity_id in bulk_write(
-        collection, entities, safe=safe, mutable=mutable, role_id=request.authz.id
+        collection, entities, safe=safe, mutable=mutable, clean=clean, role_id=request.authz.id
     ):
         entity_ids.append(entity_id)
     if entityset is not None: