Skip to content

Commit

Permalink
Preemptively filter out Rekognition tags (#4667)
Browse files: browse the repository at this point in the history
* Preemptively filter out Rekognition tags

* Rename variable
  • Loading branch information
AetherUnbound authored Aug 12, 2024
1 parent 6dbbe5e commit 5ccf7ff
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
13 changes: 9 additions & 4 deletions ingestion_server/ingestion_server/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@
# Filter out low-confidence tags, which indicate that the machine-generated tag
# may be inaccurate.
TAG_MIN_CONFIDENCE = 0.90
# Filter out tags that match the following providers (either because they haven't
# been vetted or because they are known to be low-quality).
FILTERED_TAG_PROVIDERS = {"rekognition"}

# We know that flickr and wikimedia support TLS, so we can add them here
TLS_CACHE = {
Expand Down Expand Up @@ -123,7 +126,7 @@ def cleanup_url(url, tls_support):
@staticmethod
def cleanup_tags(tags):
"""
Delete denylisted and low-accuracy tags.
Filter denylisted, low-accuracy, and unverified provider tags.
:return: an SQL fragment if an update is needed, ``None`` otherwise
"""
Expand All @@ -133,12 +136,14 @@ def cleanup_tags(tags):
if not tags:
return None
for tag in tags:
below_threshold = False
alt_filtered = False
if "accuracy" in tag and float(tag["accuracy"]) < TAG_MIN_CONFIDENCE:
below_threshold = True
alt_filtered = True
if "provider" in tag and tag["provider"] in FILTERED_TAG_PROVIDERS:
alt_filtered = True
if "name" in tag and isinstance(tag["name"], str):
lower_tag = tag["name"].lower()
should_filter = _tag_denylisted(lower_tag) or below_threshold
should_filter = _tag_denylisted(lower_tag) or alt_filtered
else:
log.warning(f'Filtering malformed tag "{tag}" in "{tags}"')
should_filter = True
Expand Down
17 changes: 15 additions & 2 deletions ingestion_server/test/unit_tests/test_cleanup.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pook
from psycopg2._json import Json

from ingestion_server.cleanup import CleanupFunctions
from ingestion_server.cleanup import FILTERED_TAG_PROVIDERS, CleanupFunctions
from test.unit_tests.conftest import create_mock_image


class TestCleanup:
@staticmethod
def test_tag_blacklist():
def test_tag_denylisted():
tags = [
{"name": "cc0"},
{"name": " cc0"},
Expand Down Expand Up @@ -40,6 +40,19 @@ def test_accuracy_filter():
expected = str(Json([{"name": "accurate", "accuracy": 0.999}]))
assert result == expected

@staticmethod
def test_provider_filter():
    """Check that tags from every filtered provider are removed.

    Builds one tag with a provider outside ``FILTERED_TAG_PROVIDERS`` plus
    one tag per filtered provider, and asserts that only the former is
    present in the cleaned-up output.
    """
    kept_tag = {"name": "valid", "provider": "provider1"}
    rejected_tags = [
        {"name": "invalid", "provider": filtered_provider}
        for filtered_provider in FILTERED_TAG_PROVIDERS
    ]
    cleaned = CleanupFunctions.cleanup_tags([kept_tag, *rejected_tags])
    # Compare string renderings, with the expectation built from a fresh
    # literal so it does not depend on the input objects.
    wanted = Json([{"name": "valid", "provider": "provider1"}])
    assert str(cleaned) == str(wanted)

@staticmethod
@pook.on
def test_url_protocol_fix():
Expand Down

0 comments on commit 5ccf7ff

Please sign in to comment.