Remove all time inefficient loops from mlbf
KevinMind committed Nov 18, 2024
1 parent c9ba62b commit fe76d06
Showing 6 changed files with 251 additions and 155 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/_test.yml
@@ -60,6 +60,11 @@ jobs:
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_es_tests
-
name: MLBF
services: ''
compose_file: docker-compose.yml:docker-compose.ci.yml
run: make test_mlbf
-
name: Codestyle
services: web
7 changes: 7 additions & 0 deletions Makefile-docker
@@ -163,6 +163,13 @@ test_es: ## run the ES tests
-m es_tests \
$(ARGS)

.PHONY: test_mlbf
test_mlbf: ## run the MLBF tests
pytest \
$(PYTEST_SRC) \
-m mlbf_tests \
$(ARGS)

.PHONY: test_no_es
test_no_es: ## run all but the ES tests
pytest \
140 changes: 87 additions & 53 deletions src/olympia/blocklist/mlbf.py
Expand Up @@ -2,7 +2,8 @@
import os
import secrets
from enum import Enum
from typing import Dict, List, Optional, Tuple
from functools import lru_cache
from typing import List, Optional, Tuple

from django.utils.functional import cached_property

@@ -21,15 +22,47 @@


def ordered_diff_lists(
previous: List[str], current: List[str]
current: List[str],
previous: List[str],
) -> Tuple[List[str], List[str], int]:
current_set = set(current)
previous_set = set(previous)
# Use lists instead of sets to maintain order
extras = [x for x in current if x not in previous_set]
deletes = [x for x in previous if x not in current_set]
changed_count = len(extras) + len(deletes)
return extras, deletes, changed_count
return [x for x in current if x not in previous_set]
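
For orientation (not part of the diff): the helper now computes a single direction per call, so the added/removed pair is built with two calls, mirroring generate_diffs further down. A minimal sketch with made-up entries:

# Illustrative only: one direction per call.
previous = ['addon1:1.0', 'addon2:1.0']
current = ['addon2:1.0', 'addon3:1.0']
added = ordered_diff_lists(current, previous)    # ['addon3:1.0']
removed = ordered_diff_lists(previous, current)  # ['addon1:1.0']
changed_count = len(added) + len(removed)        # 2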


def get_not_blocked_items(all_blocks_ids: List[int]) -> List[Tuple[str, str]]:
"""
Returns a list of tuples containing the guid, version of all not blocked
versions. We use distinct to avoid duplicates, order by ID to ensure
cache.json is always sorted consistently, and return the values as a tuple
to make it easier to mock in tests.
"""
return (
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
)


def get_all_blocked_items() -> List[Tuple[str, str]]:
"""
Returns a list of tuples containing the guid, version, version_id, and
block_type of all blocked items. We use distinct to avoid duplicates,
order by ID to ensure cache.json is always sorted consistently,
and return the values as a tuple to make it easier to mock in tests.
"""
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
)
)


def generate_mlbf(stats, blocked, not_blocked):
@@ -136,50 +169,46 @@ def __init__(self, *args, **kwargs):

@cached_property
def _all_blocks(self):
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
named=True,
)
)
_blocked_version_ids = []
_blocked = {
BlockType.BLOCKED: [],
BlockType.SOFT_BLOCKED: [],
}

def _format_blocks(self, block_type: BlockType) -> List[str]:
return MLBF.hash_filter_inputs(
[
(version.block__guid, version.version__version)
for version in self._all_blocks
if version.block_type == block_type
]
)
# We define get_all_blocked_items as a separate function to allow
# mocking the database query in tests to simulate large data sets.
for guid, version, id, block_type in get_all_blocked_items():
_blocked_version_ids.append(id)
_blocked[block_type].append((guid, version))

return _blocked, _blocked_version_ids

@cached_property
def blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.BLOCKED)
def blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.BLOCKED])

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.SOFT_BLOCKED)
def soft_blocked_items(self):
blocks_dict, _ = self._all_blocks
return MLBF.hash_filter_inputs(blocks_dict[BlockType.SOFT_BLOCKED])

@cached_property
def not_blocked_items(self) -> List[str]:
all_blocks_ids = [version.version_id for version in self._all_blocks]
def not_blocked_items(self):
_, all_blocks_ids = self._all_blocks
# We define get_not_blocked_items as a separate function to allow
# tests to simulate large data sets.
not_blocked_items = MLBF.hash_filter_inputs(
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
get_not_blocked_items(all_blocks_ids)
)
blocked_items = set(self.blocked_items + self.soft_blocked_items)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return [item for item in not_blocked_items if item not in blocked_items]
return ordered_diff_lists(
not_blocked_items,
blocked_items,
)
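
To make the comment above concrete, a small illustration (assumed data, assuming KEY_FORMAT renders entries as 'guid:version'): an add-on can have two Version rows carrying the same version string, one blocked and one not, so the same formatted key can come back from both queries and must be dropped from not_blocked_items:

# Hypothetical: '@foo' has two Version rows both labelled '1.0'; only one is blocked.
blocked_items = ['@foo:1.0']
not_blocked_query = ['@foo:1.0', '@bar:2.0']
ordered_diff_lists(not_blocked_query, blocked_items)  # ['@bar:2.0']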


class MLBF:
@@ -200,8 +229,8 @@ def __init__(
self.data: BaseMLBFLoader = data_class(storage=self.storage)

@classmethod
def hash_filter_inputs(cls, input_list):
"""Returns a list"""
def hash_filter_inputs(cls, input_list: List[Tuple[str, str]]) -> List[str]:
"""Returns a list of hashed strings"""
return [
cls.KEY_FORMAT.format(guid=guid, version=version)
for (guid, version) in input_list
@@ -233,16 +262,18 @@ def generate_and_write_filter(self):

log.info(json.dumps(stats))

@lru_cache(maxsize=128) # noqa: B019
def generate_diffs(
self, previous_mlbf: 'MLBF' = None
) -> Dict[BlockType, Tuple[List[str], List[str], int]]:
return {
block_type: ordered_diff_lists(
[] if previous_mlbf is None else previous_mlbf.data[block_type],
self.data[block_type],
)
for block_type in BlockType
}
self,
block_type: BlockType,
previous_mlbf: 'MLBF' = None,
):
current = self.data[block_type]
previous = [] if previous_mlbf is None else previous_mlbf.data[block_type]
added = ordered_diff_lists(current, previous)
removed = ordered_diff_lists(previous, current)
changed_count = len(added) + len(removed)
return added, removed, changed_count
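
A note on the caching (based on the hunk above, not part of the diff): lru_cache keys on (self, block_type, previous_mlbf), so a diff computed once per block type is reused by later callers such as blocks_changed_since_previous; the noqa: B019 acknowledges the usual caveat that a method-level cache keeps a reference to self alive. Sketch:

# Repeat calls with the same arguments are served from the cache:
mlbf.generate_diffs(BlockType.BLOCKED, previous_mlbf)       # computed
mlbf.generate_diffs(BlockType.BLOCKED, previous_mlbf)       # returned from lru_cache
mlbf.generate_diffs(BlockType.SOFT_BLOCKED, previous_mlbf)  # separate cache entry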

def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
"""
@@ -269,15 +300,18 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
in the "unblocked" stash (like for hard blocked items) as this would
result in the version being in both blocked and unblocked stashes.
"""
diffs = self.generate_diffs(previous_mlbf)
blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED]
blocked_added, blocked_removed, _ = self.generate_diffs(
BlockType.BLOCKED, previous_mlbf
)
stash_json = {
'blocked': blocked_added,
'unblocked': blocked_removed,
}

if waffle.switch_is_active('enable-soft-blocking'):
soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED]
soft_blocked_added, soft_blocked_removed, _ = self.generate_diffs(
BlockType.SOFT_BLOCKED, previous_mlbf
)
stash_json['softblocked'] = soft_blocked_added
stash_json['unblocked'] = [
unblocked
@@ -295,7 +329,7 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
def blocks_changed_since_previous(
self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None
):
_, _, changed_count = self.generate_diffs(previous_mlbf)[block_type]
_, _, changed_count = self.generate_diffs(block_type, previous_mlbf)
return changed_count

@classmethod
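
For reference, the stash written by generate_and_write_stash is shaped roughly as below (keys taken from the hunks above; values are illustrative, and 'softblocked' is only written when the enable-soft-blocking switch is active):

stash_json = {
    'blocked': ['newguid:1.0'],    # hard blocks added since the previous MLBF
    'unblocked': ['oldguid:2.0'],  # blocks removed since the previous MLBF
    'softblocked': ['softguid:3.0'],
}
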
41 changes: 41 additions & 0 deletions src/olympia/blocklist/tests/test_cron.py
@@ -34,6 +34,7 @@
STATSD_PREFIX = 'blocklist.cron.upload_mlbf_to_remote_settings.'


@pytest.mark.mlbf_tests
@freeze_time('2020-01-01 12:34:56')
@override_switch('blocklist_mlbf_submit', active=True)
class TestUploadToRemoteSettings(TestCase):
@@ -379,6 +380,46 @@ def test_invalid_cache_results_in_diff(self):
in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list
)

@mock.patch('olympia.blocklist.mlbf.get_not_blocked_items')
@mock.patch('olympia.blocklist.mlbf.get_all_blocked_items')
def test_large_data_set(
self,
mock_get_all_blocked_items,
mock_get_not_blocked_items,
):
"""
Test that the cron can handle a large data set
We can safely mock the filter cascade initialize and verify methods
because they massively impact performance of the test and we don't
need to test them here.
"""
two_million_blocked = [
(
f'{x}@blocked',
f'{x}@version',
x,
# even numbers are blocked, odd numbers are soft blocked
BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED,
)
for x in range(0, 2_000_000)
]
one_million_not_blocked = [
(f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000)
]

mock_get_all_blocked_items.return_value = two_million_blocked
mock_get_not_blocked_items.return_value = one_million_not_blocked

upload_mlbf_to_remote_settings()

mlbf = MLBF.load_from_storage(self.current_time)
assert len(mlbf.data.blocked_items) == 1_000_000
assert len(mlbf.data.soft_blocked_items) == 1_000_000
assert len(mlbf.data.not_blocked_items) == 1_000_000

with override_switch('enable-soft-blocking', active=True):
upload_mlbf_to_remote_settings()


class TestTimeMethods(TestCase):
@freeze_time('2024-10-10 12:34:56')