Skip to content

Commit

Permalink
feat: store fingerprint of all images
Browse files Browse the repository at this point in the history
used for near-duplicate image detection
  • Loading branch information
raphael0202 committed Oct 25, 2023
1 parent dd6d81e commit ed8fd38
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 55 deletions.
146 changes: 93 additions & 53 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ h5py = "~3.8.0"
opencv-contrib-python = "~4.7.0.72"
toml = "~0.10.2"
openfoodfacts = "0.1.10"
imagehash = "~4.3.1"

[tool.poetry.dependencies.sentry-sdk]
version = "~1.14.0"
Expand Down
39 changes: 39 additions & 0 deletions robotoff/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
from pathlib import Path
from typing import Optional

import imagehash
import numpy as np
from PIL import Image

from robotoff.models import ImageModel
from robotoff.off import generate_image_path, generate_image_url
from robotoff.types import JSONType, ProductIdentifier
Expand Down Expand Up @@ -128,3 +132,38 @@ def refresh_images_in_db(product_id: ProductIdentifier, images: JSONType):
image_url = generate_image_url(product_id, missing_image_id)
logger.debug("Creating missing image %s in DB", source_image)
save_image(product_id, source_image, image_url, images)


def add_image_fingerprint(image_model: ImageModel):
    """Compute the fingerprint of an image and store it in DB.

    The image is fetched from the URL returned by
    `image_model.get_image_url()`. When no image could be fetched, a message
    is logged and the DB row is left untouched.

    :param image_model: the image model to update
    """
    url = image_model.get_image_url()
    fetched = get_image_from_url(url, error_raise=False, session=http_session)

    if fetched is not None:
        image_model.fingerprint = generate_image_fingerprint(fetched)
        # Only the `fingerprint` column is written back to the DB
        ImageModel.bulk_update([image_model], fields=["fingerprint"])
    else:
        logger.info(
            "could not fetch image from %s, aborting image fingerprinting", url
        )


def generate_image_fingerprint(image: Image.Image) -> int:
    """Generate a fingerprint from an image, used for near-duplicate
    detection.

    We use a perceptual hashing algorithm (`imagehash.phash`), which produces
    a 64-bit hash.

    :param image: the input image
    :return: the fingerprint, as a signed 64-bit Python integer (the first
        bit of the flattened hash is the most significant bit, interpreted in
        two's complement), so that it fits in a signed `bigint` DB column
    """
    # `bits` is a flattened boolean array of dim 64
    bits = imagehash.phash(image).hash.flatten().astype(bool)
    # Pack the 64 bits into 8 bytes (first bit = most significant) and decode
    # them as a signed big-endian integer. This gives the same value as the
    # previous int64 dot product with powers of two, but without relying on
    # silent int64 overflow or on the platform-dependent width of NumPy's
    # default integer, and it returns a plain Python `int` as annotated
    # instead of a `numpy.int64` scalar.
    packed = np.packbits(bits).tobytes()
    return int.from_bytes(packed, byteorder="big", signed=True)
13 changes: 13 additions & 0 deletions robotoff/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
import datetime
import functools
import uuid
from pathlib import Path
from typing import Iterable

import peewee
from playhouse.postgres_ext import BinaryJSONField, PostgresqlExtDatabase
from playhouse.shortcuts import model_to_dict

from robotoff import settings
from robotoff.off import generate_image_url
from robotoff.types import ProductIdentifier, ServerType

db = PostgresqlExtDatabase(
Expand Down Expand Up @@ -245,13 +247,24 @@ class ImageModel(BaseModel):
height = peewee.IntegerField(null=False, index=True)
deleted = peewee.BooleanField(null=False, index=True, default=False)
server_type = peewee.CharField(null=True, max_length=10, index=True)
# Perceptual hash of the image, used to find near-duplicates
# It's a 64-bit bitmap, so it can be stored as a bigint (8 bytes)
fingerprint = peewee.BigIntegerField(null=True, index=True)

class Meta:
table_name = "image"

def get_product_id(self) -> ProductIdentifier:
    """Build the product identifier from the `barcode` and `server_type`
    fields.

    :return: the product identifier
    """
    barcode = self.barcode
    # `server_type` is stored as a string: look up the enum member by name
    server_type = ServerType[self.server_type]
    return ProductIdentifier(barcode, server_type)

def get_image_url(self) -> str:
    """Return the full URL of the image.

    The URL is generated from the product identifier (see `get_product_id`)
    and from the stem of the `source_image` path.

    :return: the image URL
    """
    product_id = self.get_product_id()
    # Keep only the file name of `source_image`, without its extension
    image_id = Path(self.source_image).stem
    return generate_image_url(product_id, image_id)


class ImagePrediction(BaseModel):
"""Table to store computer vision predictions (object detection,
Expand Down
31 changes: 29 additions & 2 deletions robotoff/workers/tasks/import_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from robotoff import settings
from robotoff.elasticsearch import get_es_client
from robotoff.images import save_image
from robotoff.images import add_image_fingerprint, save_image
from robotoff.insights.extraction import (
DEFAULT_OCR_PREDICTION_TYPES,
extract_ocr_predictions,
Expand Down Expand Up @@ -45,7 +45,7 @@
)
from robotoff.utils import get_image_from_url, get_logger, http_session
from robotoff.utils.image import convert_image_to_array
from robotoff.workers.queues import enqueue_job, get_high_queue
from robotoff.workers.queues import enqueue_job, get_high_queue, low_queue

logger = get_logger(__name__)

Expand Down Expand Up @@ -92,6 +92,14 @@ def run_import_image_job(product_id: ProductIdentifier, image_url: str, ocr_url:
ImageModel.bulk_update([image_model], fields=["deleted"])
return

# Compute image fingerprint, this job is low priority
enqueue_job(
add_image_fingerprint_job,
low_queue,
job_kwargs={"result_ttl": 0},
image_model_id=image_model.id,
)

if product_id.server_type.is_food():
# Currently we don't support insight generation for projects other
# than OFF (OBF, OPF,...)
Expand Down Expand Up @@ -495,3 +503,22 @@ def process_created_logos(image_prediction_id: int, server_type: ServerType):
logos = [embedding.logo for embedding in logo_embeddings]
thresholds = get_logo_confidence_thresholds()
import_logo_insights(logos, thresholds=thresholds, server_type=server_type)


@with_db
def add_image_fingerprint_job(image_model_id: int):
    """Job that computes and saves the fingerprint of an image stored in DB.

    If no image with this ID exists, a warning is logged and nothing else is
    done.

    :param image_model_id: the DB ID of the image
    """
    logger.info("Computing fingerprint for image ID %s", image_model_id)

    image_model = ImageModel.get_or_none(id=image_model_id)
    if image_model is not None:
        add_image_fingerprint(image_model)
        return

    logger.warning(
        "image ID %s not found in DB, skipping fingerprint generation",
        image_model_id,
    )
28 changes: 28 additions & 0 deletions tests/unit/test_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from pathlib import Path

from PIL import Image

from robotoff.images import generate_image_fingerprint

IMAGE_DATA_DIR = Path(__file__).parent / "data/upc_image"


def load_test_image(file_name: str) -> Image.Image:
    """Open a test image stored in `IMAGE_DATA_DIR`.

    :param file_name: name of the image file, relative to `IMAGE_DATA_DIR`
    :return: the opened image
    """
    return Image.open(IMAGE_DATA_DIR / file_name)


def test_generate_image_fingerprint():
    """Check that fingerprints differ between distinct images and are
    unchanged when the same image is downscaled."""
    original = load_test_image("no_upc1.jpg")
    other = load_test_image("no_upc2.jpg")
    # Downscale a copy so that `original` keeps its initial size
    downscaled = original.copy()
    downscaled.thumbnail((400, 400))

    original_fp, other_fp, downscaled_fp = [
        generate_image_fingerprint(img) for img in (original, other, downscaled)
    ]

    # two different images should have different fingerprints
    assert original_fp != other_fp
    # fingerprints should be invariant to rescaling
    assert original_fp == downscaled_fp

0 comments on commit ed8fd38

Please sign in to comment.