Skip to content

Commit

Permalink
feat: save brand prefix as gzipped file
Browse files Browse the repository at this point in the history
and update brand prefix file
  • Loading branch information
raphael0202 committed Dec 9, 2022
1 parent 15ef7c3 commit c7c8322
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 29 deletions.
1 change: 0 additions & 1 deletion data/brand_prefix.json

This file was deleted.

Binary file added data/brand_prefix.json.gz
Binary file not shown.
34 changes: 15 additions & 19 deletions robotoff/brands.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
import json
import operator
from typing import Dict, List, Optional, Set, Tuple

import orjson
import cachetools
import requests

from robotoff import settings
from robotoff.products import ProductDataset
from robotoff.taxonomy import TaxonomyType
from robotoff.utils import dump_text, text_file_iter
from robotoff.taxonomy import TaxonomyType, get_taxonomy
from robotoff.utils import dump_json, dump_text, load_json, text_file_iter
from robotoff.utils.cache import CachedStore


def get_brand_prefix() -> Set:
with settings.BRAND_PREFIX_PATH.open("rb") as f:
return set(tuple(x) for x in orjson.loads(f.read()))
@cachetools.cached(cachetools.LRUCache(maxsize=1))
def get_brand_prefix() -> set[tuple[str, str]]:
    """Return the set of known (brand_tag, barcode_prefix) pairs.

    The pairs are loaded from the gzipped JSON file located at
    `settings.BRAND_PREFIX_PATH` and were found in Open Food Facts
    databases.

    Each tuple has the format `(brand_tag, prefix)`, where `prefix` is a
    13-character string built from an EAN-13 barcode: the leading digits of
    the barcode followed by `x` placeholders (e.g. `0000000xxxxxx`).

    The function takes no argument, so the single-slot cache keeps the
    result of the first call and the file is not read again afterwards.
    Every caller receives the same set object: treat it as read-only.
    """
    # JSON has no tuple type: each pair is stored as a 2-item list and must
    # be converted to a tuple to be hashable (and thus storable in a set).
    brand_prefixes = load_json(settings.BRAND_PREFIX_PATH, compressed=True)
    return {tuple(x) for x in brand_prefixes}  # type: ignore


def get_brand_blacklist() -> Set[str]:
Expand All @@ -26,9 +30,7 @@ def generate_barcode_prefix(barcode: str) -> str:
prefix = 7
return barcode[:prefix] + "x" * (len(barcode) - prefix)

raise ValueError(
"barcode prefix only works on EAN-13 barcode " "(here: {})".format(barcode)
)
raise ValueError(f"barcode prefix only works on EAN-13 barcode (here: {barcode})")


def compute_brand_prefix(
Expand Down Expand Up @@ -60,14 +62,11 @@ def compute_brand_prefix(
return count


def save_brand_prefix(count_threshold: int):
def save_brand_prefix(count_threshold: int = 5):
product_dataset = ProductDataset(settings.JSONL_DATASET_PATH)
counts = compute_brand_prefix(product_dataset, threshold=count_threshold)

brand_prefixes = list(counts.keys())

with settings.BRAND_PREFIX_PATH.open("w") as f:
json.dump(brand_prefixes, f)
dump_json(settings.BRAND_PREFIX_PATH, brand_prefixes, compressed=True)


def keep_brand_from_taxonomy(
Expand All @@ -94,9 +93,7 @@ def generate_brand_list(
blacklisted_brands: Optional[Set[str]] = None,
) -> List[Tuple[str, str]]:
min_length = min_length or 0
brand_taxonomy = requests.get(
settings.TAXONOMY_URLS[TaxonomyType.brand.name]
).json()
brand_taxonomy = get_taxonomy(TaxonomyType.brand.name)
brand_count_list = requests.get(settings.OFF_BRANDS_URL).json()["tags"]

brand_count = {tag["id"]: tag for tag in brand_count_list}
Expand Down Expand Up @@ -144,7 +141,6 @@ def in_barcode_range(
return True


BRAND_PREFIX_STORE = CachedStore(fetch_func=get_brand_prefix, expiration_interval=None)
BRAND_BLACKLIST_STORE = CachedStore(
fetch_func=get_brand_blacklist, expiration_interval=None
)
Expand Down
4 changes: 2 additions & 2 deletions robotoff/insights/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from playhouse.shortcuts import model_to_dict

from robotoff import settings
from robotoff.brands import BRAND_PREFIX_STORE, in_barcode_range
from robotoff.brands import get_brand_prefix, in_barcode_range
from robotoff.insights.dataclass import InsightType
from robotoff.insights.normalize import normalize_emb_code
from robotoff.models import Prediction as PredictionModel
Expand Down Expand Up @@ -860,7 +860,7 @@ def is_conflicting_insight(

@staticmethod
def is_in_barcode_range(barcode: str, tag: str) -> bool:
brand_prefix: Set[Tuple[str, str]] = BRAND_PREFIX_STORE.get()
brand_prefix = get_brand_prefix()

if not in_barcode_range(brand_prefix, tag, barcode):
logger.info(f"Barcode {barcode} of brand {tag} not in barcode range")
Expand Down
2 changes: 1 addition & 1 deletion robotoff/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def init_sentry(integrations: Optional[List[Integration]] = None):
# Try to detect postal codes in France
OCR_CITIES_FR_PATH = OCR_DATA_DIR / "cities_laposte_hexasmal.json.gz"

BRAND_PREFIX_PATH = DATA_DIR / "brand_prefix.json"
BRAND_PREFIX_PATH = DATA_DIR / "brand_prefix.json.gz"

# When we're making queries to the API, so that we're not blocked by error
ROBOTOFF_USER_AGENT = "Robotoff Live Analysis"
Expand Down
7 changes: 3 additions & 4 deletions tests/integration/test_annotate_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import robotoff.insights.importer
import robotoff.taxonomy
from robotoff.app.api import api
from robotoff.brands import BRAND_PREFIX_STORE
from robotoff.models import LogoAnnotation, Prediction, ProductInsight
from robotoff.products import Product

Expand Down Expand Up @@ -154,16 +153,16 @@ def test_logo_annotation_incorrect_value_label_type(client, peewee_db):
}


def test_logo_annotation_brand(client, peewee_db, monkeypatch, fake_taxonomy):
def test_logo_annotation_brand(client, peewee_db, monkeypatch, mocker, fake_taxonomy):
with peewee_db:
ann = LogoAnnotationFactory(
image_prediction__image__source_image="/images/2.jpg",
annotation_type="brand",
)
barcode = ann.image_prediction.image.barcode
_fake_store(monkeypatch, barcode)
monkeypatch.setattr(
BRAND_PREFIX_STORE, "get", lambda: {("Etorki", "0000000xxxxxx")}
mocker.patch(
"robotoff.brands.get_brand_prefix", return_value={("Etorki", "0000000xxxxxx")}
)
start = datetime.utcnow()
result = client.simulate_post(
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_brands.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from robotoff.brands import BRAND_PREFIX_STORE, in_barcode_range
from robotoff.brands import get_brand_prefix, in_barcode_range


@pytest.mark.parametrize(
Expand All @@ -16,5 +16,5 @@
],
)
def test_in_barcode_range(barcode, brand_tag, is_valid):
brand_prefix = BRAND_PREFIX_STORE.get()
brand_prefix = get_brand_prefix()
assert in_barcode_range(brand_prefix, brand_tag, barcode) is is_valid

0 comments on commit c7c8322

Please sign in to comment.