diff --git a/packagedb/management/commands/populate_search_vectors.py b/packagedb/management/commands/populate_search_vectors.py
new file mode 100644
index 00000000..f6622b8e
--- /dev/null
+++ b/packagedb/management/commands/populate_search_vectors.py
@@ -0,0 +1,85 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import copy
+import logging
+import sys
+
+from django.contrib.postgres.search import SearchVector
+from django.db import transaction
+
+from minecode.management.commands import VerboseCommand
+from packagedb.models import Package
+
+TRACE = False
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout)
+logger.setLevel(logging.INFO)
+
+
+# This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
+class MemorySavingQuerysetIterator:
+
+    def __init__(self, queryset, max_obj_num=1000):
+        self._base_queryset = queryset
+        self._generator = self._setup()
+        self.max_obj_num = max_obj_num
+
+    def _setup(self):
+        for i in range(0, self._base_queryset.count(), self.max_obj_num):
+            # By making a copy of the queryset and using that to actually access
+            # the objects we ensure that there are only `max_obj_num` objects in
+            # memory at any given time
+            smaller_queryset = copy.deepcopy(self._base_queryset)[i:i + self.max_obj_num]
+            logger.debug('Grabbing next %s objects from DB' % self.max_obj_num)
+            for obj in smaller_queryset.iterator():
+                yield obj
+
+    def __iter__(self):
+        return self._generator
+
+    def __next__(self):
+        return next(self._generator)
+
+
+class Command(VerboseCommand):
+    def handle(self, *args, **options):
+        packages_without_search_vectors = Package.objects.using('default').filter(search_vector__isnull=True)
+        packages_without_search_vectors_count = packages_without_search_vectors.count()
+        updated = []
+        print(f"Populating the `search_vector` field for {packages_without_search_vectors_count:,} Packages from the `default` database")
+        i = 0
+        for package in MemorySavingQuerysetIterator(packages_without_search_vectors):
+            # Flush queued updates in batches of 2,000 to bound memory use
+            if not i % 2000 and updated:
+                with transaction.atomic():
+                    Package.objects.using('default').bulk_update(
+                        objs=updated,
+                        fields=[
+                            'search_vector',
+                        ]
+                    )
+                updated = []
+                print(f"  {i:,} / {packages_without_search_vectors_count:,} Package `search_vector`s populated")
+            # Queue a SearchVector expression built from the package fields; the
+            # database computes the tsvector when the row is saved by bulk_update
+            package.search_vector = SearchVector('namespace', 'name', 'version', 'download_url')
+            updated.append(package)
+            i += 1
+        # Flush any remaining queued updates and report the total
+        if updated:
+            with transaction.atomic():
+                Package.objects.using('default').bulk_update(
+                    objs=updated,
+                    fields=[
+                        'search_vector',
+                    ]
+                )
+        print(f"{i:,} Package `search_vector`s populated")
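
Design note: the batched loop above keeps each transaction short, which matters on a very large packagedb table. For comparison, Django's documented full-text search pattern populates such a column with a single set-based UPDATE. A minimal sketch of that alternative over the same queryset (not a drop-in replacement for the command above, since it runs as one long transaction):

    from django.contrib.postgres.search import SearchVector
    from packagedb.models import Package

    # Single UPDATE statement: Postgres computes every tsvector in one pass,
    # at the cost of locking all matched rows for the statement's duration.
    Package.objects.using('default').filter(search_vector__isnull=True).update(
        search_vector=SearchVector('namespace', 'name', 'version', 'download_url')
    )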
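
For reference, a minimal sketch of invoking the new command programmatically via Django's standard call_command API; the command name is derived from the module filename added above, and the CLI equivalent would be python manage.py populate_search_vectors:

    from django.core.management import call_command

    # Runs Command.handle() defined in the new module.
    call_command("populate_search_vectors")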