Skip to content

Commit 3b774e8

Browse files
add utility to Slice a queryset into chunks.
The function slices a queryset into smaller QuerySets of at most chunk_size objects each and yields them one at a time. It is used to avoid memory errors when processing huge querysets, and also to avoid database errors caused by the database pulling the whole table at once. Additionally, without a chunked queryset, concurrent database modification while processing a large table might cause some entries to be repeated or skipped.
1 parent 341a111 commit 3b774e8

File tree

5 files changed

+122
-1
lines changed

5 files changed

+122
-1
lines changed

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ Change Log
1414
Unreleased
1515
----------
1616

17+
[3.15.0] - 2021-03-02
18+
_____________________
19+
20+
* Added chunked_queryset utility.
21+
1722
[3.14.0] - 2020-12-15
1823
_____________________
1924

edx_django_utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
EdX utilities for Django Application development..
33
"""
44

5-
__version__ = "3.14.0"
5+
__version__ = "3.15.0"
66

77
default_app_config = (
88
"edx_django_utils.apps.EdxDjangoUtilsConfig"

edx_django_utils/db/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
Utilities for working effectively with databases in django.
33
44
read_replica: Tools for making queries from the read-replica.
5+
queryset_utils: Utils to use with Django QuerySets.
56
"""
67

8+
from .queryset_utils import chunked_queryset
79
from .read_replica import (
810
ReadReplicaRouter,
911
read_queries_only,
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
Utils related to QuerySets.
3+
"""
4+
5+
6+
def chunked_queryset(queryset, chunk_size=2000):
    """
    Slice a queryset into chunks.

    The function slices a queryset into smaller QuerySets containing at most chunk_size objects each and yields
    them one at a time. It is used to avoid memory errors when processing huge querysets, and also to avoid
    database errors due to the database pulling the whole table at once. Additionally, without using a chunked
    queryset, concurrent database modification while processing a large table might repeat or skip some entries.

    Warning: It throws away your sorting and sorts the queryset by `pk`. Only recommended for large QuerySets
    where order does not matter.
    (e.g: Can be used in management commands to back-fill data based on a QuerySet having millions of objects.)

    Chunks are delimited by primary-key ranges, so no lower bound is assumed for the first chunk: rows with zero
    or negative primary keys, and models whose primary key is not an integer, are included as long as the
    primary key can be ordered by the database.

    Source: https://www.djangosnippets.org/snippets/10599/

    Example Usage:
        queryset = User.objects.all()
        for chunk in chunked_queryset(queryset):
            print(chunk.count())

    Arguments:
        queryset (QuerySet): QuerySet to slice. Any existing ordering is replaced by ordering on `pk`.
        chunk_size (int): Maximum number of objects per chunk. Must be at least 1.

    Yields:
        QuerySet: The next slice of `queryset`, restricted to a contiguous range of primary keys.

    Raises:
        ValueError: If chunk_size is smaller than 1. Because this is a generator, the error surfaces when
            iteration starts, not when the function is called.
    """
    if chunk_size < 1:
        raise ValueError(
            "chunk_size must be a positive integer, got {chunk_size!r}".format(chunk_size=chunk_size)
        )

    queryset = queryset.order_by('pk')

    # None means "no lower bound yet". Starting from a literal 0 would skip rows whose pk is <= 0 and would
    # not work for non-integer primary keys.
    start_pk = None

    while True:
        remaining = queryset if start_pk is None else queryset.filter(pk__gt=start_pk)
        remaining_pks = remaining.values_list('pk', flat=True)

        try:
            # Fetch chunk_size entries if possible
            end_pk = remaining_pks[chunk_size - 1]
        except IndexError:
            # Fewer than chunk_size entries left: take whatever remains.
            end_pk = remaining_pks.last()

            # No entry left (also covers rows deleted concurrently since the previous chunk).
            if end_pk is None:
                return

        yield remaining.filter(pk__lte=end_pk)

        start_pk = end_pk
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
2+
Tests of edx_django_utils.db.queryset_utils.
3+
"""
4+
from ddt import data, ddt, unpack
5+
from django.contrib import auth
6+
from django.test import TestCase
7+
8+
from edx_django_utils.db.queryset_utils import chunked_queryset
9+
10+
# Look the user model up through django.contrib.auth instead of importing a concrete class;
# the tests below create and delete rows of this model.
User = auth.get_user_model()
11+
12+
13+
@ddt
class TestQuerysetUtils(TestCase):
    """
    Tests of edx_django_utils.db.queryset_utils.
    """
    @unpack
    @data(
        (30, 10, [10, 10, 10]),
        (31, 10, [10, 10, 10, 1]),
        (10, 10, [10]),
        (7, 10, [7]),
        # An empty table yields no chunks at all.
        (0, 10, []),
    )
    def test_chunked_queryset(self, query_size, chunk_size, expected_batches):
        """
        Test that the queryset is split into exactly the expected number of chunks of the expected sizes.
        """
        User.objects.all().delete()

        # create objects size of query_size
        for number in range(query_size):
            User.objects.create(username="username_{number}".format(number=number))

        queryset = User.objects.all()

        self.assertEqual(queryset.count(), query_size)

        # Compare the complete list of chunk sizes so that missing or extra chunks fail the test too,
        # rather than only checking the chunks that happen to be yielded.
        actual_batches = [chunk.count() for chunk in chunked_queryset(queryset, chunk_size)]
        self.assertEqual(actual_batches, expected_batches)

    def test_concurrent_update(self):
        """
        Test concurrent database modification wouldn't skip records.
        """
        User.objects.all().delete()

        # Create 14 objects.
        for number in range(14):
            User.objects.create(username="username_{number}".format(number=number))

        queryset = User.objects.all()

        # Now create chunks of size 10.
        chunked_query = chunked_queryset(queryset, chunk_size=10)

        # As there are a total of 14 objects and the chunk size is 10, the first chunk should contain 10 objects.
        first_chunk = next(chunked_query)
        self.assertEqual(first_chunk.count(), 10)

        # Let's create a new object while iterating over the chunked_queryset.
        User.objects.create(username="one-more-user")

        # As there are now 15 objects in total, the second chunk should contain 5 objects instead of 4.
        # That implies concurrent database modification won't skip records in this process.
        second_chunk = next(chunked_query)
        self.assertEqual(second_chunk.count(), 5)

        # Every object has been handed out, so the generator must now be exhausted.
        self.assertIsNone(next(chunked_query, None))

0 commit comments

Comments
 (0)