Create command to populate Package.search_vector

Signed-off-by: Jono Yang <[email protected]>
aboutcode-org · Oct 26, 2023 · 92af9d1 · 92af9d1
1 parent ad37ccc
commit 92af9d1
Showing 1 changed file with 82 additions and 0 deletions.
diff --git a/packagedb/management/commands/populate_search_vectors.py b/packagedb/management/commands/populate_search_vectors.py
@@ -0,0 +1,82 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import copy
+import logging
+import sys
+
+from django.contrib.postgres.search import SearchVector
+from django.db import transaction
+
+from minecode.management.commands import VerboseCommand
+from packagedb.models import Package
+
+# Tracing toggle; nothing in the visible part of this module reads it.
+TRACE = False
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Configure the root logger to write to stdout. This runs at import time.
+logging.basicConfig(stream=sys.stdout)
+# Only INFO and above are emitted by this module's logger, so the
+# `logger.debug(...)` call in MemorySavingQuerysetIterator is suppressed.
+logger.setLevel(logging.INFO)
+
+
+# This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
+class MemorySavingQuerysetIterator(object):
+    """
+    Iterate over the objects of a queryset in fixed-size chunks.
+
+    The base queryset is sliced into consecutive windows of `max_obj_num`
+    rows and each window is streamed with `.iterator()`, so only one window
+    is requested from the database at a time.
+
+    The number of windows comes from a single `count()` taken when iteration
+    starts, and windows are addressed by offset. If rows enter or leave the
+    queryset's result set while it is being iterated (for example because
+    the caller updates the very field the queryset filters on), the offsets
+    shift and objects can be skipped or returned twice.
+    NOTE(review): an explicit ordering on the queryset is presumably also
+    needed for the windows to be stable -- confirm with callers.
+    """
+
+    def __init__(self, queryset, max_obj_num=1000):
+        """
+        `queryset` is the queryset to iterate over and `max_obj_num` is the
+        number of objects requested per chunk.
+        """
+        self._base_queryset = queryset
+        # Set before creating the generator so that `_setup` never depends on
+        # the generator being lazy to find this attribute.
+        self.max_obj_num = max_obj_num
+        self._generator = self._setup()
+
+    def _setup(self):
+        """
+        Yield every object of the base queryset, one chunk at a time.
+        """
+        total = self._base_queryset.count()
+        for start in range(0, total, self.max_obj_num):
+            # By making a copy of the queryset and using that to actually
+            # access the objects we ensure that there are only `max_obj_num`
+            # objects in memory at any given time
+            smaller_queryset = copy.deepcopy(self._base_queryset)[start:start + self.max_obj_num]
+            logger.debug('Grabbing next %s objects from DB', self.max_obj_num)
+            yield from smaller_queryset.iterator()
+
+    def __iter__(self):
+        return self._generator
+
+    def __next__(self):
+        """
+        Return the next object, raising StopIteration when exhausted.
+        """
+        return next(self._generator)
+
+    # Python 2 spelling of the iterator protocol, kept so that existing
+    # `obj.next()` callers keep working.
+    next = __next__
+
+
+class Command(VerboseCommand):
+    """
+    Populate the `search_vector` field of every Package in the 'default'
+    database whose `search_vector` is NULL.
+    """
+
+    def handle(self, *args, **options):
+        """
+        Compute and store the search vector of the Packages missing one, in
+        batches of 2000, printing progress after each batch.
+        """
+        batch_size = 2000
+        packages = Package.objects.using('default')
+        packages_without_search_vectors = packages.filter(search_vector__isnull=True)
+        packages_without_search_vectors_count = packages_without_search_vectors.count()
+        print(f"Populating the `search_vector` field for {packages_without_search_vectors_count:,} Packages from the 'default` database")
+
+        search_vector = SearchVector('namespace', 'name', 'version', 'download_url')
+        pending_pks = packages_without_search_vectors.order_by('pk').values_list('pk', flat=True)
+
+        i = 0
+        last_pk = None
+        while True:
+            # Page by primary key rather than by offset: every update removes
+            # rows from the `search_vector IS NULL` set, so offset windows
+            # over that set would skip Packages that still need a vector.
+            # Moving strictly past `last_pk` also guarantees termination.
+            batch = pending_pks if last_pk is None else pending_pks.filter(pk__gt=last_pk)
+            batch_pks = list(batch[:batch_size])
+            if not batch_pks:
+                break
+            with transaction.atomic():
+                # The vector is computed by the database from each row's own
+                # columns, so no Package instance has to be loaded.
+                i += packages.filter(pk__in=batch_pks).update(search_vector=search_vector)
+            last_pk = batch_pks[-1]
+            print(f"  {i:,} / {packages_without_search_vectors_count:,} Package `search_vector`s populated")
+        print(f"{i:,} Package `search_vector`s populated")