Merge pull request #102 from edx/hammad/ENT-4202

HammadAhmadWaqas · web-flow · commit 1500c18d08c0 · 2021-03-05T16:14:26.000+05:00
ENT-4236 | added chunked_queryset utility.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -14,6 +14,11 @@ Change Log
 Unreleased
 ----------
 
+[3.15.0] - 2021-03-02
+_____________________
+
+* Added chunked_queryset utility.
+
 [3.14.0] - 2020-12-15
 _____________________
 
diff --git a/edx_django_utils/__init__.py b/edx_django_utils/__init__.py
@@ -2,7 +2,7 @@
 EdX utilities for Django Application development..
 """
 
-__version__ = "3.14.0"
+__version__ = "3.15.0"
 
 default_app_config = (
     "edx_django_utils.apps.EdxDjangoUtilsConfig"
diff --git a/edx_django_utils/db/__init__.py b/edx_django_utils/db/__init__.py
@@ -2,8 +2,10 @@
 Utilities for working effectively with databases in django.
 
 read_replica: Tools for making queries from the read-replica.
+queryset_utils: Utils to use with Django QuerySets.
 """
 
+from .queryset_utils import chunked_queryset
 from .read_replica import (
     ReadReplicaRouter,
     read_queries_only,
diff --git a/edx_django_utils/db/queryset_utils.py b/edx_django_utils/db/queryset_utils.py
@@ -0,0 +1,50 @@
+"""
+Utils related to QuerySets.
+"""
+
+
+def chunked_queryset(queryset, chunk_size=2000):
+    """
+    Slice a queryset into chunks.
+
+    The function slices a queryset into smaller QuerySets of at most chunk_size objects each and yields them one
+    at a time. It is used to avoid memory errors when processing huge querysets, and also to avoid database errors
+    caused by pulling the whole table at once. Additionally, without a chunked queryset, concurrent database
+    modification while processing a large table might repeat or skip some entries.
+
+    Warning: It discards any existing ordering and sorts the queryset by `pk`. Only recommended for large QuerySets
+    where order does not matter.
+    (e.g. it can be used in management commands to back-fill data based on a QuerySet having millions of objects.)
+
+    Source: https://www.djangosnippets.org/snippets/10599/
+
+    Example Usage:
+        queryset = User.objects.all()
+        for chunk in chunked_queryset(queryset):
+            print(chunk.count())
+
+    Argument:
+        chunk_size (int): Size of desired batch.
+
+    Return:
+        Iterator: Generator yielding QuerySets, each containing at most chunk_size objects.
+    """
+    start_pk = 0
+    queryset = queryset.order_by('pk')
+
+    while True:
+        # No entry left
+        if not queryset.filter(pk__gt=start_pk).exists():
+            return
+
+        try:
+            # Fetch chunk_size entries if possible
+            end_pk = queryset.filter(pk__gt=start_pk).values_list('pk', flat=True)[chunk_size - 1]
+
+            # Fetch the remaining entries if fewer than chunk_size are left
+        except IndexError:
+            end_pk = queryset.values_list('pk', flat=True).last()
+
+        yield queryset.filter(pk__gt=start_pk).filter(pk__lte=end_pk)
+
+        start_pk = end_pk
diff --git a/edx_django_utils/db/tests/test_queryset_utils.py b/edx_django_utils/db/tests/test_queryset_utils.py
@@ -0,0 +1,64 @@
+"""
+Tests of edx_django_utils.db.queryset_utils.
+"""
+from ddt import data, ddt, unpack
+from django.contrib import auth
+from django.test import TestCase
+
+from edx_django_utils.db.queryset_utils import chunked_queryset
+
+User = auth.get_user_model()
+
+
+@ddt
+class TestQuerysetUtils(TestCase):
+    """
+    Tests of edx_django_utils.db.queryset_utils.
+    """
+    @unpack
+    @data(
+        (30, 10, [10, 10, 10]),
+        (31, 10, [10, 10, 10, 1]),
+        (10, 10, [10]),
+        (7, 10, [7]),
+        (0, 10, [0]),
+    )
+    def test_chunked_queryset(self, query_size, chunk_size, expected_batches):
+        User.objects.all().delete()
+
+        # Create query_size objects.
+        for number in range(query_size):
+            User.objects.create(username="username_{number}".format(number=number))
+
+        queryset = User.objects.all()
+
+        self.assertEqual(queryset.count(), query_size)
+        for (batch_num, chunked_query) in enumerate(chunked_queryset(queryset, chunk_size)):
+            self.assertEqual(chunked_query.count(), expected_batches[batch_num])
+
+    def test_concurrent_update(self):
+        """
+        Test that concurrent database modification does not cause records to be skipped.
+        """
+        User.objects.all().delete()
+
+        # Create 14 objects.
+        for number in range(14):
+            User.objects.create(username="username_{number}".format(number=number))
+
+        queryset = User.objects.all()
+
+        # Now create chunks of size 10.
+        chunked_query = chunked_queryset(queryset, chunk_size=10)
+
+        # As there are a total of 14 objects and the chunk size is 10, the first chunk should contain 10 objects.
+        first_chunk = next(chunked_query)
+        self.assertEqual(first_chunk.count(), 10)
+
+        # Let's create a new object while iterating over the chunked_queryset.
+        User.objects.create(username="one-more-user")
+
+        # As there are now 15 objects in total, the second chunk should contain 5 objects instead of 4,
+        # which shows that concurrent database modification won't cause records to be skipped.
+        second_chunk = next(chunked_query)
+        self.assertEqual(second_chunk.count(), 5)