Skip to content

Commit

Permalink
fix: improve brand exclusion for 'taxonomy' predictor
Browse files Browse the repository at this point in the history
- only exclude for 'taxonomy' predictor and not other predictors
- improve documentation in brand_taxonomy_blacklist.txt
- remove excluded brands once a day in scheduler
  • Loading branch information
raphael0202 committed Aug 25, 2023
1 parent 69b5f1e commit 7705823
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 21 deletions.
5 changes: 3 additions & 2 deletions data/ocr/brand_taxonomy_blacklist.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// These brands are excluded from Robotoff's detection. So these brands won't be suggested by Robotoff to users.
// Also Hunger Game won't propose suggestions for these brands.
// These brands are excluded from Robotoff's detection using the 'taxonomy' predictor.
// If you find false positives among brand insights that were generated using the 'taxonomy' predictor, you should add them here.
// Insight removal of excluded brands is performed once a day.
//
// Why add brands here?
// * avoid brands that are common words in some languages ("plus", "king", "everyday",...)
Expand Down
2 changes: 2 additions & 0 deletions robotoff/brands.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def get_brand_prefix() -> set[tuple[str, str]]:

@functools.cache
def get_brand_blacklist() -> set[str]:
    """Return the set of brands excluded from automatic detection.

    The exclusion only concerns brands detected through the 'taxonomy'
    predictor. Entries are read from the text file located at
    `settings.OCR_TAXONOMY_BRANDS_BLACKLIST_PATH`; the result is memoized by
    `functools.cache`, so the file is only read on the first call.
    """
    logger.info("Loading brand blacklist...")
    blacklist_path = settings.OCR_TAXONOMY_BRANDS_BLACKLIST_PATH
    excluded_brands = set(text_file_iter(blacklist_path))
    return excluded_brands

Expand Down
25 changes: 15 additions & 10 deletions robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -985,21 +985,26 @@ def is_in_barcode_range(barcode: str, tag: str) -> bool:
return True

@staticmethod
def is_prediction_valid(prediction: Prediction) -> bool:
brand_blacklist = get_brand_blacklist()
if prediction.value_tag in brand_blacklist:
return False
def is_prediction_valid(item: Prediction | ProductInsight) -> bool:
"""Return True if the Prediction or ProductInsight is valid:
if (
prediction.predictor == "universal-logo-detector"
and "username" in prediction.data
):
# Check barcode range for all predictors except logos detected
- we check for 'taxonomy' predictor whether the brand is excluded
- we check that the brand is compatible with the barcode
range
:param item: a Prediction or a ProductInsight
"""
if item.predictor == "universal-logo-detector" and "username" in item.data:
# Don't perform the barcode range check for logos detected
# using universal-logo-detector model and annotated manually
return True

brand_blacklist = get_brand_blacklist()
if item.predictor == "taxonomy" and item.value_tag in brand_blacklist:
return False

return BrandInsightImporter.is_in_barcode_range(
prediction.barcode, prediction.value_tag # type: ignore
item.barcode, item.value_tag # type: ignore
)

@classmethod
Expand Down
6 changes: 3 additions & 3 deletions robotoff/products.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,11 +493,11 @@ def load_from_path(cls, path: Path, projection: Optional[list[str]] = None):
return cls(store)

@classmethod
def load_min(cls, projection: Optional[list[str]] = None):
def load_min(cls, projection: Optional[list[str]] = None) -> "MemoryProductStore":
return cls.load_from_path(settings.JSONL_MIN_DATASET_PATH, projection)

@classmethod
def load_full(cls):
def load_full(cls) -> "MemoryProductStore":
return cls.load_from_path(settings.JSONL_DATASET_PATH)

def __getitem__(self, item) -> Optional[Product]:
Expand Down Expand Up @@ -543,7 +543,7 @@ def iter_product(self, projection: Optional[list[str]] = None):
yield from (Product(p) for p in self.collection.find(projection=projection))


def get_min_product_store(projection: Optional[list[str]] = None) -> ProductStore:
def get_min_product_store(projection: Optional[list[str]] = None) -> MemoryProductStore:
logger.info("Loading product store in memory...")
ps = MemoryProductStore.load_min(projection)
logger.info("product store loaded (%s items)", len(ps))
Expand Down
34 changes: 28 additions & 6 deletions robotoff/scheduler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@

from robotoff import settings, slack
from robotoff.insights.annotate import UPDATED_ANNOTATION_RESULT, annotate
from robotoff.insights.importer import import_insights, is_valid_insight_image
from robotoff.insights.importer import (
BrandInsightImporter,
import_insights,
is_valid_insight_image,
)
from robotoff.metrics import (
ensure_influx_database,
save_facet_metrics,
Expand All @@ -28,7 +32,7 @@
get_min_product_store,
has_dataset_changed,
)
from robotoff.types import ServerType
from robotoff.types import PredictionType, ServerType
from robotoff.utils import get_logger

from .latent import generate_quality_facets
Expand Down Expand Up @@ -181,6 +185,8 @@ def refresh_insights(with_deletion: bool = True) -> None:
logger.info("%s deleted, deleting insight %s", product_id, insight)
insight_deleted += 1
insight.delete_instance()
continue

elif not is_valid_insight_image(product.image_ids, insight.source_image):
if with_deletion:
# insight source image is not referenced in DB
Expand All @@ -192,11 +198,27 @@ def refresh_insights(with_deletion: bool = True) -> None:
)
insight_deleted += 1
insight.delete_instance()
else:
was_updated = update_insight_attributes(product, insight)
continue
# We remove insights with excluded brands; this can happen if the
# brand was added to the exclusion list after insight creation
elif (
insight.type == PredictionType.brand.value
and not BrandInsightImporter.is_prediction_valid(insight)
):
if with_deletion:
logger.info(
"Brand insight with excluded brand %s, deleting insight %s",
insight.value_tag,
insight,
)
insight_deleted += 1
insight.delete_instance()
continue

was_updated = update_insight_attributes(product, insight)

if was_updated:
insight_updated += 1
if was_updated:
insight_updated += 1

logger.info("%s prediction deleted", prediction_deleted)
logger.info("%s insight deleted", insight_deleted)
Expand Down

0 comments on commit 7705823

Please sign in to comment.