From 5ccf7ff950cd45f8c022458de5f3ecb93c477cf5 Mon Sep 17 00:00:00 2001
From: Madison Swain-Bowden
Date: Mon, 12 Aug 2024 08:10:27 -0700
Subject: [PATCH] Preemptively filter out Rekognition tags (#4667)

* Preemptively filter out Rekognition tags

* Rename variable
---
 ingestion_server/ingestion_server/cleanup.py | 13 +++++++++----
 .../test/unit_tests/test_cleanup.py          | 17 +++++++++++++++--
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/ingestion_server/ingestion_server/cleanup.py b/ingestion_server/ingestion_server/cleanup.py
index bb673169803..b5bdea2f13d 100644
--- a/ingestion_server/ingestion_server/cleanup.py
+++ b/ingestion_server/ingestion_server/cleanup.py
@@ -55,6 +55,9 @@
 # Filter out low-confidence tags, which indicate that the machine-generated tag
 # may be inaccurate.
 TAG_MIN_CONFIDENCE = 0.90
+# Filter out tags that match the following providers (either because they haven't
+# been vetted or because they are known to be low-quality).
+FILTERED_TAG_PROVIDERS = {"rekognition"}
 
 # We know that flickr and wikimedia support TLS, so we can add them here
 TLS_CACHE = {
@@ -123,7 +126,7 @@ def cleanup_url(url, tls_support):
     @staticmethod
     def cleanup_tags(tags):
         """
-        Delete denylisted and low-accuracy tags.
+        Filter denylisted, low-accuracy, and unverified provider tags.
 
         :return: an SQL fragment if an update is needed, ``None`` otherwise
         """
@@ -133,12 +136,14 @@ def cleanup_tags(tags):
         if not tags:
             return None
         for tag in tags:
-            below_threshold = False
+            alt_filtered = False
             if "accuracy" in tag and float(tag["accuracy"]) < TAG_MIN_CONFIDENCE:
-                below_threshold = True
+                alt_filtered = True
+            if "provider" in tag and tag["provider"] in FILTERED_TAG_PROVIDERS:
+                alt_filtered = True
             if "name" in tag and isinstance(tag["name"], str):
                 lower_tag = tag["name"].lower()
-                should_filter = _tag_denylisted(lower_tag) or below_threshold
+                should_filter = _tag_denylisted(lower_tag) or alt_filtered
             else:
                 log.warning(f'Filtering malformed tag "{tag}" in "{tags}"')
                 should_filter = True
diff --git a/ingestion_server/test/unit_tests/test_cleanup.py b/ingestion_server/test/unit_tests/test_cleanup.py
index 02cc6d047e2..f09f6608857 100644
--- a/ingestion_server/test/unit_tests/test_cleanup.py
+++ b/ingestion_server/test/unit_tests/test_cleanup.py
@@ -1,13 +1,13 @@
 import pook
 from psycopg2._json import Json
 
-from ingestion_server.cleanup import CleanupFunctions
+from ingestion_server.cleanup import FILTERED_TAG_PROVIDERS, CleanupFunctions
 from test.unit_tests.conftest import create_mock_image
 
 
 class TestCleanup:
     @staticmethod
-    def test_tag_blacklist():
+    def test_tag_denylisted():
         tags = [
             {"name": "cc0"},
             {"name": " cc0"},
@@ -40,6 +40,19 @@ def test_accuracy_filter():
         expected = str(Json([{"name": "accurate", "accuracy": 0.999}]))
         assert result == expected
 
+    @staticmethod
+    def test_provider_filter():
+        tags = [
+            {"name": "valid", "provider": "provider1"},
+            *[
+                {"name": "invalid", "provider": provider}
+                for provider in FILTERED_TAG_PROVIDERS
+            ],
+        ]
+        result = str(CleanupFunctions.cleanup_tags(tags))
+        expected = str(Json([{"name": "valid", "provider": "provider1"}]))
+        assert result == expected
+
     @staticmethod
     @pook.on
     def test_url_protocol_fix():