Skip to content

Commit

Permalink
fix: LEAP-1404: Support multiple import storages per provider (#6216)
Browse files Browse the repository at this point in the history
Co-authored-by: Sergei Ivashchenko <[email protected]>
  • Loading branch information
jpantzlaff and triklozoid authored Oct 24, 2024
1 parent 1eff3ec commit 52507c2
Show file tree
Hide file tree
Showing 10 changed files with 153 additions and 41 deletions.
5 changes: 5 additions & 0 deletions label_studio/io_storages/azure_blob/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import re
from datetime import datetime, timedelta
from typing import Union
from urllib.parse import urlparse

from azure.core.exceptions import ResourceNotFoundError
Expand All @@ -22,6 +23,7 @@
ImportStorageLink,
ProjectStorageMixin,
)
from io_storages.utils import storage_can_resolve_bucket_url
from tasks.models import Annotation

from label_studio.io_storages.azure_blob.utils import AZURE
Expand Down Expand Up @@ -156,6 +158,9 @@ def generate_http_url(self, url):
'https://' + self.get_account_name() + '.blob.core.windows.net/' + container + '/' + blob + '?' + sas_token
)

def can_resolve_url(self, url: Union[str, None]) -> bool:
return storage_can_resolve_bucket_url(self, url)

def get_blob_metadata(self, key):
return AZURE.get_blob_metadata(
key, self.container, account_name=self.account_name, account_key=self.account_key
Expand Down
15 changes: 10 additions & 5 deletions label_studio/io_storages/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
import traceback as tb
from datetime import datetime
from typing import Union
from urllib.parse import urljoin

import django_rq
Expand Down Expand Up @@ -231,9 +232,13 @@ def get_data(self, key):
def generate_http_url(self, url):
raise NotImplementedError

def can_resolve_url(self, url):
# TODO: later check to the full prefix like "url.startswith(self.path_full)"
# Search of occurrences inside string, e.g. for cases like "gs://bucket/file.pdf" or "<embed src='gs://bucket/file.pdf'/>"
def can_resolve_url(self, url: Union[str, None]) -> bool:
return self.can_resolve_scheme(url)

def can_resolve_scheme(self, url: Union[str, None]) -> bool:
if not url:
return False
# TODO: Search for occurrences inside string, e.g. for cases like "gs://bucket/file.pdf" or "<embed src='gs://bucket/file.pdf'/>"
_, prefix = get_uri_via_regex(url, prefixes=(self.url_scheme,))
if prefix == self.url_scheme:
return True
Expand Down Expand Up @@ -261,8 +266,8 @@ def resolve_uri(self, uri, task=None):
elif isinstance(uri, str):
try:
# extract uri first from task data
extracted_uri, extracted_storage = get_uri_via_regex(uri, prefixes=(self.url_scheme,))
if not extracted_storage:
extracted_uri, _ = get_uri_via_regex(uri, prefixes=(self.url_scheme,))
if not self.can_resolve_url(extracted_uri):
logger.debug(f'No storage info found for URI={uri}')
return

Expand Down
5 changes: 5 additions & 0 deletions label_studio/io_storages/gcs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
import json
import logging
from typing import Union

from core.redis import start_job_async_or_sync
from django.conf import settings
Expand All @@ -17,6 +18,7 @@
ProjectStorageMixin,
)
from io_storages.gcs.utils import GCS
from io_storages.utils import storage_can_resolve_bucket_url
from tasks.models import Annotation

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -93,6 +95,9 @@ def generate_http_url(self, url):
presign_ttl=self.presign_ttl,
)

def can_resolve_url(self, url: Union[str, None]) -> bool:
return storage_can_resolve_bucket_url(self, url)

def scan_and_create_links(self):
return self._scan_and_create_links(GCSImportStorageLink)

Expand Down
5 changes: 5 additions & 0 deletions label_studio/io_storages/s3/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import logging
import re
from typing import Union

import boto3
from core.feature_flags import flag_set
Expand All @@ -20,6 +21,7 @@
ProjectStorageMixin,
)
from io_storages.s3.utils import get_client_and_resource, resolve_s3_url
from io_storages.utils import storage_can_resolve_bucket_url
from tasks.models import Annotation
from tasks.validation import ValidationError as TaskValidationError

Expand Down Expand Up @@ -175,6 +177,9 @@ def get_data(self, key):
def generate_http_url(self, url):
return resolve_s3_url(url, self.get_client(), self.presign, expires_in=self.presign_ttl * 60)

def can_resolve_url(self, url: Union[str, None]) -> bool:
return storage_can_resolve_bucket_url(self, url)

def get_blob_metadata(self, key):
return AWS.get_blob_metadata(
key,
Expand Down
46 changes: 45 additions & 1 deletion label_studio/io_storages/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,23 @@
"""
import logging
import re
from dataclasses import dataclass
from typing import Union

logger = logging.getLogger(__name__)

# Put storage prefixes here
uri_regex = r"([\"'])(?P<uri>(?P<storage>{})://[^\1=]*)\1"


def get_uri_via_regex(data, prefixes=('s3', 'gs')):
@dataclass
class BucketURI:
bucket: str
path: str
scheme: str


def get_uri_via_regex(data, prefixes=('s3', 'gs')) -> tuple[Union[str, None], Union[str, None]]:
data = str(data).strip()
middle_check = False

Expand Down Expand Up @@ -38,3 +47,38 @@ def get_uri_via_regex(data, prefixes=('s3', 'gs')):
logger.warning("Can't parse task.data to match URI. Reason: Match is not found.")
return None, None
return r_match.group('uri'), r_match.group('storage')


def parse_bucket_uri(value: object, storage) -> Union[BucketURI, None]:
if not value:
return None

uri, _ = get_uri_via_regex(value, prefixes=(storage.url_scheme,))
if not uri:
return None

try:
scheme, rest = uri.split('://', 1)
bucket, path = rest.split('/', 1)
except ValueError:
return None

return BucketURI(bucket=bucket, path=path, scheme=scheme)


def storage_can_resolve_bucket_url(storage, url) -> bool:
if not storage.can_resolve_scheme(url):
return False

uri = parse_bucket_uri(url, storage)
if not uri:
return False

storage_bucket: str | None = getattr(storage, 'bucket', None) or getattr(storage, 'container', None)
if storage_bucket != uri.bucket:
return False

if storage.prefix and not uri.path.startswith(storage.prefix):
return False

return True
10 changes: 8 additions & 2 deletions label_studio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,9 @@ def s3_with_hypertext_s3_links(s3):
s3.put_object(
Bucket=bucket_name,
Key='test.json',
Body=json.dumps({'text': '<a href="s3://hypertext-bucket/file with /spaces and\' / \' / quotes.jpg"/>'}),
Body=json.dumps(
{'text': '<a href="s3://pytest-s3-jsons-hypertext/file with /spaces and\' / \' / quotes.jpg"/>'}
),
)
yield s3

Expand All @@ -157,7 +159,11 @@ def s3_with_partially_encoded_s3_links(s3):
s3.put_object(
Bucket=bucket_name,
Key='test.json',
Body=json.dumps({'text': '<a href="s3://hypertext-bucket/file with /spaces and\' / \' / %2Bquotes%3D.jpg"/>'}),
Body=json.dumps(
{
'text': '<a href="s3://pytest-s3-json-partially-encoded/file with /spaces and\' / \' / %2Bquotes%3D.jpg"/>'
}
),
)
yield s3

Expand Down
62 changes: 49 additions & 13 deletions label_studio/tests/io_storages.tavern.yml
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,36 @@ stages:
response:
status_code: 400

- id: import_task_invalid_url
name: Import task from wrong bucket
request:
url: "{django_live_url}/api/projects/{project_pk}/tasks"
json:
data:
image_url: "s3:/"
method: POST
headers:
content-type: application/json
response:
status_code: 201
save:
json:
task_pk: id

# check, that image_url is not resolved and api is not broken
- id: get_task
name: Get task and check, that image_url is not resolved
request:
url: "{django_live_url}/api/tasks/{task_pk}"
method: GET
headers:
content-type: application/json
response:
status_code: 200
json:
data:
image_url: "s3:/"

---
test_name: test_import_from_s3_storage_recursive_scan
strict: false
Expand Down Expand Up @@ -575,16 +605,19 @@ stages:
content-type: application/json
json:
data:
image: gs://whatever-bucket-with/manual.link.jpg
image: gs://test-gs-bucket_JSON/manual.link.jpg
dict:
key1: gs://whatever-bucket-with/manual.link.jpg
key1: gs://test-gs-bucket_JSON/manual.link.jpg
array:
- gs://whatever-bucket-with/manual.link.jpg
- gs://whatever-bucket-with/manual.link.jpg
- gs://test-gs-bucket_JSON/manual.link.jpg
- gs://test-gs-bucket_JSON/manual.link.jpg
array:
- item1: gs://whatever-bucket-with/manual.link.jpg
- item1: gs://test-gs-bucket_JSON/manual.link.jpg
some: 'some text'
- item2: gs://whatever-bucket-with/manual.link.jpg
- item2: gs://test-gs-bucket_JSON/manual.link.jpg
some: 'some text'
# This link should not be resolved - no matching bucket
- item3: gs://bad-bucket/manual.link.jpg
some: 'some text'
method: POST
url: '{django_live_url}/api/projects/{project_pk}/import'
Expand Down Expand Up @@ -617,16 +650,19 @@ stages:
response:
json:
data:
image: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
image: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
dict:
key1: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
key1: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
array:
- !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
- !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
- !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
- !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
array:
- item1: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
- item1: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
some: 'some text'
- item2: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
some: 'some text'
- item2: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
# This link should remain unresolved - no matching bucket
- item3: !re_match "gs://bad-bucket/manual.link.+"
some: 'some text'
status_code: 200

Expand Down Expand Up @@ -1569,7 +1605,7 @@ stages:
response:
json:
data:
text: !re_match "<a href=\"https://hypertext-bucket.s3.amazonaws.com/file%20with%20/spaces%20and%27%20/%20%27%20/%20quotes.jpg.+X-Amz-Security-Token=testing"
text: !re_match "<a href=\"https://pytest-s3-jsons-hypertext.s3.amazonaws.com/file%20with%20/spaces%20and%27%20/%20%27%20/%20quotes.jpg.+X-Amz-Security-Token=testing"
status_code: 200

---
Expand Down
8 changes: 4 additions & 4 deletions label_studio/tests/io_storages_presign_endpoints.tavern.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ stages:

- name: get_presigned_url
request:
url: "{django_live_url}/projects/{project_pk}/presign?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
url: "{django_live_url}/projects/{project_pk}/presign?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldC9tYW51YWwubGluay5qcGc="
response:
status_code: 303
headers:
location: "https://storage.googleapis.com/whatever-bucket-with/manual.link.jpg"
location: "https://storage.googleapis.com/test-gs-bucket/manual.link.jpg"


---
Expand Down Expand Up @@ -139,8 +139,8 @@ stages:

- name: get_presigned_url
request:
url: "{django_live_url}/tasks/{task_pk}/presign?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
url: "{django_live_url}/tasks/{task_pk}/presign?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldC9tYW51YWwubGluay5qcGc="
response:
status_code: 303
headers:
location: "https://storage.googleapis.com/whatever-bucket-with/manual.link.jpg"
location: "https://storage.googleapis.com/test-gs-bucket/manual.link.jpg"
34 changes: 20 additions & 14 deletions label_studio/tests/io_storages_presign_proxy.tavern.yml
Original file line number Diff line number Diff line change
Expand Up @@ -431,16 +431,19 @@ stages:
content-type: application/json
json:
data:
image: gs://whatever-bucket-with/manual.link.jpg
image: gs://test-gs-bucket_JSON/manual.link.jpg
dict:
key1: gs://whatever-bucket-with/manual.link.jpg
key1: gs://test-gs-bucket_JSON/manual.link.jpg
array:
- gs://whatever-bucket-with/manual.link.jpg
- gs://whatever-bucket-with/manual.link.jpg
- gs://test-gs-bucket_JSON/manual.link.jpg
- gs://test-gs-bucket_JSON/manual.link.jpg
array:
- item1: gs://whatever-bucket-with/manual.link.jpg
- item1: gs://test-gs-bucket_JSON/manual.link.jpg
some: "some text"
- item2: gs://whatever-bucket-with/manual.link.jpg
- item2: gs://test-gs-bucket_JSON/manual.link.jpg
some: "some text"
# This link should not be resolved - no matching bucket
- item3: gs://bad-bucket/manual.link.jpg
some: "some text"
method: POST
url: "{django_live_url}/api/projects/{project_pk}/import"
Expand Down Expand Up @@ -473,16 +476,19 @@ stages:
response:
json:
data:
image: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
image: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
dict:
key1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
key1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
array:
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
array:
- item1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
- item1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
some: "some text"
- item2: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
some: "some text"
- item2: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
# This link should remain unresolved - no matching bucket
- item3: !re_match "gs://bad-bucket/manual.link.+"
some: "some text"
status_code: 200
---
Expand Down Expand Up @@ -1152,7 +1158,7 @@ stages:
response:
json:
data:
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9oeXBlcnRleHQtYnVja2V0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gcXVvdGVzLmpwZw=="
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtanNvbnMtaHlwZXJ0ZXh0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gcXVvdGVzLmpwZw=="
status_code: 200
---
# - Check that json blobs containing partially encoded contents resolve correctly from the bucket,
Expand Down Expand Up @@ -1216,7 +1222,7 @@ stages:
response:
json:
data:
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9oeXBlcnRleHQtYnVja2V0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gJTJCcXVvdGVzJTNELmpwZw=="
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtanNvbi1wYXJ0aWFsbHktZW5jb2RlZC9maWxlIHdpdGggL3NwYWNlcyBhbmQnIC8gJyAvICUyQnF1b3RlcyUzRC5qcGc="
status_code: 200
---
# we don't fail when unexisted s3:// links occur in the list
Expand Down
Loading

0 comments on commit 52507c2

Please sign in to comment.