Skip to content

Commit

Permalink
Create command to populate Package.search_vector
Browse files Browse the repository at this point in the history
Signed-off-by: Jono Yang <[email protected]>
  • Loading branch information
JonoYang committed Oct 26, 2023
1 parent ad37ccc commit 92af9d1
Showing 1 changed file with 82 additions and 0 deletions.
82 changes: 82 additions & 0 deletions packagedb/management/commands/populate_search_vectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import copy
import logging
import sys

from django.contrib.postgres.search import SearchVector
from django.db import transaction

from minecode.management.commands import VerboseCommand
from packagedb.models import Package

# Debug-tracing switch; it is not referenced by any code visible in this file.
TRACE = False

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configure the root logger to write to stdout. `basicConfig` does nothing when
# the root logger already has handlers.
logging.basicConfig(stream=sys.stdout)
# This module's logger emits INFO and above, so `logger.debug(...)` calls are
# suppressed unless the level is lowered.
logger.setLevel(logging.INFO)


# This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
class MemorySavingQuerysetIterator(object):
    """
    Iterate over a queryset in fixed-size offset slices to bound memory use.

    The wrapped queryset is walked in consecutive slices of at most
    ``max_obj_num`` rows, and each slice is streamed with ``.iterator()``.

    Caveats that follow from paging by offset:

    - The total row count is read once, when iteration starts.
    - Rows must keep their position for the whole iteration. If rows enter or
      leave the queryset while it is being iterated (for instance because the
      caller updates the very field the queryset filters on), the slices
      shift and rows are skipped or repeated.

    An instance is a one-shot iterator: it can be consumed only once.
    """

    def __init__(self, queryset, max_obj_num=1000):
        """
        Wrap ``queryset`` so it is read ``max_obj_num`` rows at a time.

        ``queryset`` must support ``count()``, slicing and ``iterator()``.
        Raise a ValueError if ``max_obj_num`` is lower than 1.
        """
        if max_obj_num < 1:
            # A zero step would only fail once iteration starts, and a
            # negative one would silently yield nothing.
            raise ValueError('max_obj_num must be a positive integer')
        self._base_queryset = queryset
        self.max_obj_num = max_obj_num
        # Generators are lazy: no query runs until the first item is requested.
        self._generator = self._setup()

    def _setup(self):
        """
        Yield every object of the base queryset, one slice after the other.
        """
        total = self._base_queryset.count()
        for start in range(0, total, self.max_obj_num):
            # Slice a copy of the base queryset and read the objects through
            # that copy, so that only the current slice of at most
            # `max_obj_num` objects is being loaded at any given time.
            smaller_queryset = copy.deepcopy(self._base_queryset)[start:start + self.max_obj_num]
            logger.debug('Grabbing next %s objects from DB', self.max_obj_num)
            yield from smaller_queryset.iterator()

    def __iter__(self):
        return self._generator

    def __next__(self):
        return next(self._generator)

    # Python 2 spelling of `__next__`, kept so that explicit `.next()` calls
    # keep working.
    next = __next__


class Command(VerboseCommand):
    """
    Populate ``Package.search_vector`` for every Package of the 'default'
    database whose ``search_vector`` is still NULL.
    """

    help = 'Populate the `search_vector` field of the Packages that do not have one yet.'

    # Number of Packages updated by each UPDATE statement.
    batch_size = 2000

    def handle(self, *args, **options):
        """
        Compute the search vector of each Package lacking one from its
        `namespace`, `name`, `version` and `download_url` fields, working
        through the Packages in batches of ``batch_size`` rows.
        """
        packages = Package.objects.using('default')
        packages_without_search_vectors = packages.filter(search_vector__isnull=True)
        packages_without_search_vectors_count = packages_without_search_vectors.count()
        print(f"Populating the `search_vector` field for {packages_without_search_vectors_count:,} Packages from the 'default` database")

        populated = 0
        last_pk = None
        while True:
            # Page by primary key rather than by offset: every update below
            # removes rows from the `search_vector IS NULL` filter, so offset
            # slices of that queryset would shift and skip Packages.
            batch = packages_without_search_vectors.order_by('pk')
            if last_pk is not None:
                # Always move forward, so the loop terminates even if some
                # row were to stay NULL after being updated.
                batch = batch.filter(pk__gt=last_pk)
            batch_pks = list(batch.values_list('pk', flat=True)[:self.batch_size])
            if not batch_pks:
                break

            # The vector is computed by the database from columns of the same
            # row, so one UPDATE statement handles the whole batch. A single
            # statement is atomic on its own.
            populated += packages.filter(pk__in=batch_pks).update(
                search_vector=SearchVector('namespace', 'name', 'version', 'download_url')
            )
            last_pk = batch_pks[-1]
            print(f" {populated:,} / {packages_without_search_vectors_count:,} Package `search_vector`s populated")

        print(f"{populated:,} Package `search_vector`s populated")

0 comments on commit 92af9d1

Please sign in to comment.