Skip to content

Commit 1500c18

Browse files
Merge pull request #102 from edx/hammad/ENT-4202
ENT-4236 | added chunked_queryset utility.
2 parents 341a111 + 3b774e8 commit 1500c18

File tree

5 files changed

+122
-1
lines changed

5 files changed

+122
-1
lines changed

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ Change Log
1414
Unreleased
1515
----------
1616

17+
[3.15.0] - 2021-03-02
18+
_____________________
19+
20+
* Added chunked_queryset utility.
21+
1722
[3.14.0] - 2020-12-15
1823
_____________________
1924

edx_django_utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
EdX utilities for Django Application development.
33
"""
44

5-
__version__ = "3.14.0"
5+
__version__ = "3.15.0"
66

77
default_app_config = (
88
"edx_django_utils.apps.EdxDjangoUtilsConfig"

edx_django_utils/db/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Utilities for working effectively with databases in django.
33
44
read_replica: Tools for making queries from the read-replica.
5+
queryset_utils: Utils to use with Django QuerySets.
56
"""
67

8+
from .queryset_utils import chunked_queryset
79
from .read_replica import (
810
ReadReplicaRouter,
911
read_queries_only,
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
Utils related to QuerySets.
3+
"""
4+
5+
6+
def chunked_queryset(queryset, chunk_size=2000):
    """
    Slice a queryset into chunks.

    The function walks ``queryset`` in primary-key order and yields smaller QuerySets, each covering at most
    ``chunk_size`` objects. It is used to avoid memory errors when processing huge querysets, and also to avoid
    database errors due to the database pulling the whole table at once. Additionally, without using a chunked
    queryset, concurrent database modification while processing a large table might repeat or skip some entries.

    Each chunk is bounded by primary-key values rather than by offsets, and the bounds are computed lazily, right
    before the chunk is yielded.

    Warning: It throws away your sorting and sorts the queryset by `pk`. Only recommended for large QuerySets where
    order does not matter.
    (e.g: Can be used in management commands to back-fill data based on a QuerySet having millions of objects.)

    Source: https://www.djangosnippets.org/snippets/10599/

    Example Usage:
        queryset = User.objects.all()
        for chunk in chunked_queryset(queryset):
            print(chunk.count())

    Arguments:
        queryset (QuerySet): QuerySet to slice. Its ordering is replaced by ordering on `pk`.
        chunk_size (int): Maximum number of objects per chunk. Must be at least 1.

    Yields:
        QuerySet: The next slice of ``queryset``, filtered to a contiguous primary-key range.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Because this is a generator, the error surfaces when
            iteration starts, not when the function is called.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1, got {!r}'.format(chunk_size))

    queryset = queryset.order_by('pk')

    # ``None`` means "no chunk yielded yet". Using a sentinel instead of ``0`` keeps rows whose primary key is zero,
    # negative or not an integer at all from being skipped by the ``pk__gt`` filter.
    start_pk = None

    while True:
        remaining = queryset if start_pk is None else queryset.filter(pk__gt=start_pk)
        remaining_pks = remaining.values_list('pk', flat=True)

        try:
            # Primary key of the last row of a full chunk, if that many rows are left.
            end_pk = remaining_pks[chunk_size - 1]
        except IndexError:
            # Fewer than chunk_size rows left: the chunk ends at the last remaining row.
            end_pk = remaining_pks.last()
            if end_pk is None:
                # No entry left.
                return

        yield remaining.filter(pk__lte=end_pk)

        start_pk = end_pk
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
2+
Tests of edx_django_utils.db.queryset_utils.
3+
"""
4+
from ddt import data, ddt, unpack
5+
from django.contrib import auth
6+
from django.test import TestCase
7+
8+
from edx_django_utils.db.queryset_utils import chunked_queryset
9+
10+
User = auth.get_user_model()
11+
12+
13+
@ddt
class TestQuerysetUtils(TestCase):
    """
    Tests of edx_django_utils.db.queryset_utils.
    """
    @unpack
    @data(
        (30, 10, [10, 10, 10]),
        (31, 10, [10, 10, 10, 1]),
        (10, 10, [10]),
        (7, 10, [7]),
        # An empty queryset yields no chunks at all.
        (0, 10, []),
    )
    def test_chunked_queryset(self, query_size, chunk_size, expected_batches):
        """
        Test that a queryset of query_size objects is split into chunks of the expected sizes.
        """
        User.objects.all().delete()

        # create objects size of query_size
        for number in range(query_size):
            User.objects.create(username="username_{number}".format(number=number))

        queryset = User.objects.all()

        self.assertEqual(queryset.count(), query_size)

        # Compare the complete list of chunk sizes so that missing or extra chunks fail the test, instead of only
        # checking the chunks that happen to be yielded.
        batch_sizes = [chunked_query.count() for chunked_query in chunked_queryset(queryset, chunk_size)]
        self.assertEqual(batch_sizes, expected_batches)

    def test_concurrent_update(self):
        """
        Test concurrent database modification wouldn't skip records.
        """
        User.objects.all().delete()

        # Create 14 objects.
        for number in range(14):
            User.objects.create(username="username_{number}".format(number=number))

        queryset = User.objects.all()

        # Now create chunks of size 10.
        chunked_query = chunked_queryset(queryset, chunk_size=10)

        # As there are a total of 14 objects and chunk size is 10, the first chunk should contain 10 objects.
        first_chunk = next(chunked_query)
        self.assertEqual(first_chunk.count(), 10)

        # Lets create a new object while iterating over the chunked_queryset.
        User.objects.create(username="one-more-user")

        # As now there are total 15 objects, the second chunk should contain 5 objects instead of 4.
        # that implies concurrent database modification won't skip records in this process.
        second_chunk = next(chunked_query)
        self.assertEqual(second_chunk.count(), 5)

        # All 15 objects are covered by the two chunks, so the generator must now be exhausted.
        with self.assertRaises(StopIteration):
            next(chunked_query)

0 commit comments

Comments
 (0)