Skip to content

Commit

Permalink
Add cleaned option (alephdata#3234)
Browse files Browse the repository at this point in the history
* Add cleaned option

* Change cleaned=True to clean=False; add clean flag tests
  • Loading branch information
ksotik authored and simonwoerpel committed Apr 22, 2024
1 parent 64ec8ee commit b6f9d3a
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 9 deletions.
4 changes: 2 additions & 2 deletions aleph/logic/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE)
refresh_collection(collection.id)


def bulk_write(collection, entities, safe=False, role_id=None, mutable=True):
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True, clean=True):
"""Write a set of entities - given as dicts - to the index."""
# This is called mainly by the /api/2/collections/X/_bulk API.
aggregator = get_aggregator(collection)
writer = aggregator.bulk()
for data in entities:
entity = model.get_proxy(data, cleaned=False)
entity = model.get_proxy(data, cleaned=(not clean))
entity = collection.ns.apply(entity)
if entity.id is None:
raise InvalidData("No ID for entity", errors=entity.to_dict())
Expand Down
9 changes: 7 additions & 2 deletions aleph/manage.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,12 @@ def xref(foreign_id):
default=False,
help="Mark entities mutable.",
)
def load_entities(foreign_id, infile, safe=False, mutable=False):
@click.option(
"--clean/--unclean",
default=False,
help="Allow to disable (if --clean) server-side values validation for all types.",
)
def load_entities(foreign_id, infile, safe=True, mutable=False, clean=True):
"""Load FtM entities from the specified iJSON file."""
collection = ensure_collection(foreign_id, foreign_id)

Expand All @@ -274,7 +279,7 @@ def read_entities():

role = Role.load_cli_user()
for _ in bulk_write(
collection, read_entities(), safe=safe, mutable=mutable, role_id=role.id
collection, read_entities(), safe=safe, mutable=mutable, clean=clean, role_id=role.id
):
pass
reindex_collection(collection)
Expand Down
33 changes: 33 additions & 0 deletions aleph/tests/test_collections_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,39 @@ def test_bulk_api(self):
res = self.client.post(url, headers=headers, data=json.dumps(data))
assert res.status_code == 400, res

def test_bulk_api_flags(self):
    """Check how the ``clean`` query flag of the ``_bulk`` endpoint behaves.

    The same two LegalEntity records are posted twice as an admin: first
    with ``clean=False`` in the query string, then without the flag. After
    each write the LegalEntity results of the collection are fetched and
    the presence of the ``phone`` property is asserted.
    """
    _, headers = self.login(is_admin=True)
    first_entity = {
        "id": "4345800498380953840",
        "schema": "LegalEntity",
        "properties": {"name": "Barbra W. Vaughn", "phone": "+19046426847"},
    }
    # This record's phone value ("123456") is the one expected to be
    # missing once the write happens without the clean=False flag.
    second_entity = {
        "id": "7598743983789743598",
        "schema": "LegalEntity",
        "properties": {"name": "Marion C. Bostic", "phone": "123456"},
    }
    data = json.dumps([first_entity, second_entity])

    # Pass 1: write with clean=False; both phone values must be kept.
    url = "/api/2/collections/%s/_bulk?clean=False" % self.col.id
    res = self.client.post(url, headers=headers, data=data)
    assert res.status_code == 204, res
    query = "/api/2/entities?filter:schemata=LegalEntity&filter:collection_id=%s"
    query = query % self.col.id
    res = self.client.get(query, headers=headers)
    for position in (0, 1):
        assert "phone" in res.json["results"][position]["properties"], res.json

    # Pass 2: write the same payload with the flag omitted; only the first
    # result is expected to still carry a phone property.
    url = "/api/2/collections/%s/_bulk" % self.col.id
    res = self.client.post(url, headers=headers, data=data)
    assert res.status_code == 204, res
    query = "/api/2/entities?filter:schemata=LegalEntity&filter:collection_id=%s"
    query = query % self.col.id
    res = self.client.get(query, headers=headers)
    assert "phone" in res.json["results"][0]["properties"], res.json
    assert "phone" not in res.json["results"][1]["properties"], res.json

def test_bulk_entitysets_api(self):
role, headers = self.login(is_admin=True)
authz = Authz.from_role(role)
Expand Down
24 changes: 19 additions & 5 deletions aleph/views/collections_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,10 +235,17 @@ def bulk(collection_id):
minimum: 1
type: integer
- description: >-
This will disable checksum security measures in order to allow bulk
loading of document data.
safe=True means that the data cannot be trusted and that file checksums should be removed.
Flag is only available for admins. Default True.
in: query
name: unsafe
name: safe
schema:
type: boolean
- description: >-
clean=True means that the data cannot be trusted and that the data should be cleaned from invalid values.
Flag is only available for admins. Default True.
in: query
name: clean
schema:
type: boolean
requestBody:
Expand All @@ -262,19 +269,26 @@ def bulk(collection_id):
if entityset is not None:
entityset = get_entityset(entityset, request.authz.WRITE)

# This will disable checksum security measures in order to allow bulk
# When False, checksum security measures are disabled in order to allow bulk
# loading of document data:
safe = get_flag("safe", default=True)
# Flag is only available for admins:
if not request.authz.is_admin:
safe = True

# When False, value validation is skipped for all types of all entities / properties
# (bulk_write() in aleph/logic/processing.py then passes cleaned=True to model.get_proxy())
clean = get_flag("clean", default=True)
# Flag is only available for admins:
if not request.authz.is_admin:
clean = True

# Let UI tools change the entities created by this:
mutable = get_flag("mutable", default=False)
entities = ensure_list(request.get_json(force=True))
entity_ids = list()
for entity_id in bulk_write(
collection, entities, safe=safe, mutable=mutable, role_id=request.authz.id
collection, entities, safe=safe, mutable=mutable, clean=clean, role_id=request.authz.id
):
entity_ids.append(entity_id)
if entityset is not None:
Expand Down

0 comments on commit b6f9d3a

Please sign in to comment.