Add utility to slice a queryset into chunks.

HammadAhmadWaqas · HammadAhmadWaqas · commit 3b774e8ae205 · 2021-03-05T16:08:12.000+05:00
The function slices a queryset into smaller QuerySets containing at most chunk_size objects each and then yields
them. It is used to avoid memory errors when processing huge querysets, and also to avoid database errors caused by
the database pulling the whole table at once. Additionally, without using a chunked queryset, concurrent database
modification while processing a large table might repeat or skip some entries.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -14,6 +14,11 @@ Change Log
 Unreleased
 ----------
 
+[3.15.0] - 2021-03-02
+_____________________
+
+* Added chunked_queryset utility.
+
 [3.14.0] - 2020-12-15
 _____________________
 
diff --git a/edx_django_utils/__init__.py b/edx_django_utils/__init__.py
@@ -2,7 +2,7 @@
 EdX utilities for Django Application development..
 """
 
-__version__ = "3.14.0"
+__version__ = "3.15.0"
 
 default_app_config = (
     "edx_django_utils.apps.EdxDjangoUtilsConfig"
diff --git a/edx_django_utils/db/__init__.py b/edx_django_utils/db/__init__.py
@@ -2,8 +2,10 @@
 Utilities for working effectively with databases in django.
 
 read_replica: Tools for making queries from the read-replica.
+queryset_utils: Utils to use with Django QuerySets.
 """
 
+from .queryset_utils import chunked_queryset
 from .read_replica import (
     ReadReplicaRouter,
     read_queries_only,
diff --git a/edx_django_utils/db/queryset_utils.py b/edx_django_utils/db/queryset_utils.py
@@ -0,0 +1,67 @@
+"""
+Utils related to QuerySets.
+"""
+
+
+def chunked_queryset(queryset, chunk_size=2000):
+    """
+    Slice a queryset into chunks.
+
+    The function slices a queryset into smaller QuerySets containing at most chunk_size objects and then yields
+    them. It is used to avoid memory errors when processing huge querysets, and also to avoid database errors due
+    to the database pulling the whole table at once. Additionally, without using a chunked queryset, concurrent
+    database modification while processing a large table might repeat or skip some entries.
+
+    Chunk boundaries are computed lazily, one chunk at a time, so rows inserted with a larger `pk` while iterating
+    are picked up by later chunks.
+
+    Warning: It throws away your sorting and sorts the queryset based on `pk`. Only recommended for large QuerySets
+    where order does not matter.
+    (e.g: Can be used in management commands to back-fill data based on Queryset having millions of objects.)
+
+    Source: https://www.djangosnippets.org/snippets/10599/
+
+    Example Usage:
+        queryset = User.objects.all()
+        for chunk in chunked_queryset(queryset):
+            print(chunk.count())
+
+    Arguments:
+        queryset (QuerySet): QuerySet to slice. Its existing ordering is replaced by ordering on `pk`.
+        chunk_size (int): Maximum number of objects in each chunk. Must be at least 1.
+
+    Yields:
+        QuerySet: The next chunk, ordered by `pk` and holding at most chunk_size objects.
+
+    Raises:
+        ValueError: If chunk_size is smaller than 1. As this is a generator, the error is raised when the first
+            chunk is requested.
+    """
+    if chunk_size < 1:
+        raise ValueError('chunk_size must be a positive integer, got {!r}.'.format(chunk_size))
+
+    queryset = queryset.order_by('pk')
+    start_pk = None
+
+    while True:
+        # The first chunk has no lower bound, so zero, negative and non-integer primary keys are not skipped.
+        remaining = queryset if start_pk is None else queryset.filter(pk__gt=start_pk)
+
+        # No entry left
+        if not remaining.exists():
+            return
+
+        remaining_pks = remaining.values_list('pk', flat=True)
+        try:
+            # Fetch chunk_size entries if possible
+            end_pk = remaining_pks[chunk_size - 1]
+        except IndexError:
+            # Fetch rest entries if less than chunk_size left
+            end_pk = remaining_pks.last()
+            if end_pk is None:
+                # The remaining rows were deleted after the exists() check above.
+                return
+
+        yield remaining.filter(pk__lte=end_pk)
+
+        start_pk = end_pk
diff --git a/edx_django_utils/db/tests/test_queryset_utils.py b/edx_django_utils/db/tests/test_queryset_utils.py
@@ -0,0 +1,73 @@
+"""
+Tests of edx_django_utils.db.queryset_utils.
+"""
+from ddt import data, ddt, unpack
+from django.contrib import auth
+from django.test import TestCase
+
+from edx_django_utils.db.queryset_utils import chunked_queryset
+
+User = auth.get_user_model()
+
+
+@ddt
+class TestQuerysetUtils(TestCase):
+    """
+    Tests of edx_django_utils.db.queryset_utils.
+    """
+    @unpack
+    @data(
+        (30, 10, [10, 10, 10]),
+        (31, 10, [10, 10, 10, 1]),
+        (10, 10, [10]),
+        (7, 10, [7]),
+        (0, 10, []),
+    )
+    def test_chunked_queryset(self, query_size, chunk_size, expected_batches):
+        """
+        Test the queryset is split into exactly the expected batches.
+        """
+        User.objects.all().delete()
+
+        # create objects size of query_size
+        for number in range(query_size):
+            User.objects.create(username="username_{number}".format(number=number))
+
+        queryset = User.objects.all()
+
+        self.assertEqual(queryset.count(), query_size)
+
+        # Compare the complete list of batch sizes so missing or extra batches fail the test as well.
+        batch_sizes = [chunk.count() for chunk in chunked_queryset(queryset, chunk_size)]
+        self.assertEqual(batch_sizes, expected_batches)
+
+    def test_concurrent_update(self):
+        """
+        Test concurrent database modification wouldn't skip records.
+        """
+        User.objects.all().delete()
+
+        # Create 14 objects.
+        for number in range(14):
+            User.objects.create(username="username_{number}".format(number=number))
+
+        queryset = User.objects.all()
+
+        # Now create chunks of size 10.
+        chunked_query = chunked_queryset(queryset, chunk_size=10)
+
+        # As there are a total of 14 objects and chunk size is 10, the first chunk should contain 10 objects.
+        first_chunk = next(chunked_query)
+        self.assertEqual(first_chunk.count(), 10)
+
+        # Let's create a new object while iterating over the chunked_queryset.
+        User.objects.create(username="one-more-user")
+
+        # As now there are total 15 objects, the second chunk should contain 5 objects instead of 4.
+        # that implies concurrent database modification won't skip records in this process.
+        second_chunk = next(chunked_query)
+        self.assertEqual(second_chunk.count(), 5)
+
+        # All objects have been covered, so the generator must be exhausted now.
+        with self.assertRaises(StopIteration):
+            next(chunked_query)