Skip to content

Commit

Permalink
Relocate MemorySavingQuerysetIterator
Browse files Browse the repository at this point in the history
Signed-off-by: Jono Yang <[email protected]>
  • Loading branch information
JonoYang committed Jul 11, 2024
1 parent 3cede75 commit 4aa19f2
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 61 deletions.
31 changes: 3 additions & 28 deletions minecode/management/commands/update_maven_package_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,19 @@
#
from dateutil.parser import parse as dateutil_parse
from os.path import basename
import copy
import logging
import sys
import traceback

from django.db import transaction
from django.db.utils import DataError
from django.utils import timezone
from packageurl import normalize_qualifiers

from minecode.models import ProcessingError
from minecode.collectors.maven import MavenNexusCollector
from minecode.management.commands import VerboseCommand
from minecode.models import ProcessingError
from packagedb.models import Package
from packageurl import normalize_qualifiers
from minecode.collectors.maven import MavenNexusCollector

DEFAULT_TIMEOUT = 30

Expand All @@ -32,30 +31,6 @@
logger.setLevel(logging.INFO)


# This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
class MemorySavingQuerysetIterator(object):
    """
    Iterate over the objects of `queryset` by fetching them in successive
    slices of at most `max_obj_num` objects, instead of evaluating the whole
    queryset at once.

    The instance is a single-use iterator: `__iter__` always returns the same
    underlying generator, so it cannot be restarted once consumed.

    The total is read with `queryset.count()` once, when iteration starts.
    NOTE(review): slices are taken by offset, so this presumably needs a
    queryset with a stable ordering that is not modified while iterating --
    confirm against callers.

    Raise a ValueError if `max_obj_num` is lower than 1.
    """

    def __init__(self, queryset, max_obj_num=1000):
        if max_obj_num < 1:
            # A zero step makes range() fail on first use and a negative step
            # silently yields nothing: reject both up front.
            raise ValueError(
                'max_obj_num must be a positive integer, not %r' % (max_obj_num,)
            )
        self._base_queryset = queryset
        self.max_obj_num = max_obj_num
        # Calling a generator function runs no code yet: nothing is queried
        # until the first object is requested.
        self._generator = self._setup()

    def _setup(self):
        """
        Yield every object of the base queryset, one slice at a time.
        """
        for i in range(0, self._base_queryset.count(), self.max_obj_num):
            # By making a copy of the queryset and using that to actually
            # access the objects we ensure that there are only `max_obj_num`
            # objects in memory at any given time
            smaller_queryset = copy.deepcopy(self._base_queryset)[i:i + self.max_obj_num]
            logger.debug('Grabbing next %s objects from DB', self.max_obj_num)
            yield from smaller_queryset.iterator()

    def __iter__(self):
        return self._generator

    def __next__(self):
        """
        Return the next object or raise StopIteration when exhausted.
        """
        return next(self._generator)

    # Python 2 style spelling, kept so that existing `.next()` callers work.
    next = __next__


def update_packages(packages, fields_to_update):
try:
with transaction.atomic():
Expand Down
26 changes: 25 additions & 1 deletion minecode/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#


import copy
import hashlib
import logging
import os
Expand Down Expand Up @@ -391,3 +391,27 @@ def validate_uuid(uuid_string):
except ValueError:
return False
return str(val).lower() == uuid_string.lower()


# This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
class MemorySavingQuerysetIterator(object):
    """
    Iterate over the objects of `queryset` by fetching them in successive
    slices of at most `max_obj_num` objects, instead of evaluating the whole
    queryset at once.

    The instance is a single-use iterator: `__iter__` always returns the same
    underlying generator, so it cannot be restarted once consumed.

    The total is read with `queryset.count()` once, when iteration starts.
    NOTE(review): slices are taken by offset, so this presumably needs a
    queryset with a stable ordering that is not modified while iterating --
    confirm against callers.

    Raise a ValueError if `max_obj_num` is lower than 1.
    """

    def __init__(self, queryset, max_obj_num=1000):
        if max_obj_num < 1:
            # A zero step makes range() fail on first use and a negative step
            # silently yields nothing: reject both up front.
            raise ValueError(
                'max_obj_num must be a positive integer, not %r' % (max_obj_num,)
            )
        self._base_queryset = queryset
        self.max_obj_num = max_obj_num
        # Calling a generator function runs no code yet: nothing is queried
        # until the first object is requested.
        self._generator = self._setup()

    def _setup(self):
        """
        Yield every object of the base queryset, one slice at a time.
        """
        for i in range(0, self._base_queryset.count(), self.max_obj_num):
            # By making a copy of the queryset and using that to actually
            # access the objects we ensure that there are only `max_obj_num`
            # objects in memory at any given time
            smaller_queryset = copy.deepcopy(self._base_queryset)[i:i + self.max_obj_num]
            logger.debug('Grabbing next %s objects from DB', self.max_obj_num)
            yield from smaller_queryset.iterator()

    def __iter__(self):
        return self._generator

    def __next__(self):
        """
        Return the next object or raise StopIteration when exhausted.
        """
        return next(self._generator)

    # Python 2 style spelling, kept so that existing `.next()` callers work.
    next = __next__
38 changes: 6 additions & 32 deletions packagedb/management/commands/fix_purl_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,18 @@
import logging
import sys

from django.db import transaction

from urllib3.util import Retry
from packageurl import PackageURL
from packagedcode.maven import get_urls, build_filename
from requests import Session
from requests.adapters import HTTPAdapter
import requests

from minecode.management.commands import VerboseCommand
from minecode.utils import MemorySavingQuerysetIterator
from minecode.visitors.maven import collect_links_from_text
from minecode.visitors.maven import filter_for_artifacts
from minecode.management.commands import VerboseCommand
from packagedb.models import Package
from packagedcode.maven import get_urls, build_filename
from packageurl import PackageURL

DEFAULT_TIMEOUT = 30

Expand Down Expand Up @@ -113,30 +112,6 @@ def _populate_related_artifacts(cls, namespace, name, version, ec):
)


# This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
class MemorySavingQuerysetIterator(object):
    """
    Iterate over the objects of `queryset` by fetching them in successive
    slices of at most `max_obj_num` objects, instead of evaluating the whole
    queryset at once.

    The instance is a single-use iterator: `__iter__` always returns the same
    underlying generator, so it cannot be restarted once consumed.

    The total is read with `queryset.count()` once, when iteration starts.
    NOTE(review): slices are taken by offset, so this presumably needs a
    queryset with a stable ordering that is not modified while iterating --
    confirm against callers.

    Raise a ValueError if `max_obj_num` is lower than 1.
    """

    def __init__(self, queryset, max_obj_num=1000):
        if max_obj_num < 1:
            # A zero step makes range() fail on first use and a negative step
            # silently yields nothing: reject both up front.
            raise ValueError(
                'max_obj_num must be a positive integer, not %r' % (max_obj_num,)
            )
        self._base_queryset = queryset
        self.max_obj_num = max_obj_num
        # Calling a generator function runs no code yet: nothing is queried
        # until the first object is requested.
        self._generator = self._setup()

    def _setup(self):
        """
        Yield every object of the base queryset, one slice at a time.
        """
        for i in range(0, self._base_queryset.count(), self.max_obj_num):
            # By making a copy of the queryset and using that to actually
            # access the objects we ensure that there are only `max_obj_num`
            # objects in memory at any given time
            smaller_queryset = copy.deepcopy(self._base_queryset)[i:i + self.max_obj_num]
            logger.debug('Grabbing next %s objects from DB', self.max_obj_num)
            yield from smaller_queryset.iterator()

    def __iter__(self):
        return self._generator

    def __next__(self):
        """
        Return the next object or raise StopIteration when exhausted.
        """
        return next(self._generator)

    # Python 2 style spelling, kept so that existing `.next()` callers work.
    next = __next__


def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT):
maven_api_search_url = f'https://search.maven.org/solrsearch/select?q=1:{sha1}'
try:
Expand Down Expand Up @@ -177,10 +152,8 @@ def handle(self, *args, **options):
maven_packages_count = maven_packages.count()
logger.info(f'Checking {maven_packages_count:,} Maven Package PackageURL values')
packages_to_delete = []
unsaved_packages = []

processed_packages_count = 0
for i, package in enumerate(MemorySavingQuerysetIterator(maven_packages)):
for package in MemorySavingQuerysetIterator(maven_packages):
matched_artifacts = query_sha1_on_maven(package.sha1)
if not matched_artifacts:
# Remove this package from the database because it's not on maven
Expand Down Expand Up @@ -223,4 +196,5 @@ def handle(self, *args, **options):
package_different_case.repository_homepage_url = artifact.repository_homepage_url
package_different_case.repository_download_url = artifact.repository_download_url
package_different_case.api_data_url = artifact.api_data_url
package_different_case.sha1 = package.sha1
package_different_case.save()

0 comments on commit 4aa19f2

Please sign in to comment.