Skip to content

Commit

Permalink
[FIX] Make explore page faster (hedyorg#4502)
Browse files Browse the repository at this point in the history
The `explore` page was (effectively) downloading the entire content of multiple indexes (at least `lang` and `public`, if no other criteria were given), to do a linear filtering operation.

The `lang` index is just not discriminatory enough, but the good news is that the `public` index is a lot better. Just use the `public` index, and filter all programs by the criteria that are provided.

We've added the feature `FilterExpression`: this allows DynamoDB to do server-side filtering, so we don't have to do it client-side. This is still not arbitrary filtering: we will still get $$$ charged for the data that is read off of disk, the data is just not sent over the network... but that still saves time, and it saves client-side filtering. We've added the `public-date-index` which has just the fields necessary to do this server-side filtering in (`lang`, `adventure_name`, `level`), and no others to save disk space.

Gets the load time of the `explore` page down to ~1s. This should also save a chunk of memory on the server.

**How to test**

Run against the prod database and visit the explore page locally; observe that everything still works.
  • Loading branch information
rix0rrr authored Sep 22, 2023
1 parent 1be2c75 commit 92c82e9
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 68 deletions.
15 changes: 12 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1669,12 +1669,16 @@ def explore():
achievement = ACHIEVEMENTS.add_single_achievement(
current_user()['username'], "indiana_jones")

programs = normalize_public_programs(DATABASE.get_public_programs(
programs = DATABASE.get_public_programs(
limit=40,
level_filter=level,
language_filter=language,
adventure_filter=adventure))
favourite_programs = normalize_public_programs(DATABASE.get_hedy_choices())
adventure_filter=adventure)
favourite_programs = DATABASE.get_hedy_choices()

# Do 'normalize_public_programs' on both sets at once, to save database calls
normalized = normalize_public_programs(list(programs) + list(favourite_programs.records))
programs, favourite_programs = split_at(len(programs), normalized)

adventures_names = hedy_content.Adventures(session['lang']).get_adventure_names()

Expand Down Expand Up @@ -2272,6 +2276,11 @@ def analyze_memory_snapshot(start_snapshot, end_snapshot):
print("Total allocated size: %.1f KiB" % (total / 1024))


def split_at(n, xs):
"""Split a collection at an index."""
return xs[:n], xs[n:]


if __name__ == '__main__':
# Start the server on a developer machine. Flask is initialized in DEBUG mode, so it
# hot-reloads files. We also flip our own internal "debug mode" flag to True, so our
Expand Down
28 changes: 25 additions & 3 deletions tests/test_dynamo.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,16 @@ def test_query(self):
{'id': 'key', 'sort': 2, 'm': 'another'}
])

def test_query_with_filter(self):
self.table.create({'id': 'key', 'sort': 1, 'm': 'val'})
self.table.create({'id': 'key', 'sort': 2, 'm': 'another'})

ret = list(self.table.get_many({'id': 'key'}, filter={'m': 'another'}))

self.assertEqual(ret, [
{'id': 'key', 'sort': 2, 'm': 'another'}
])

def test_between_query(self):
self.table.create({'id': 'key', 'sort': 1, 'x': 'x'})
self.table.create({'id': 'key', 'sort': 2, 'x': 'y'})
Expand All @@ -216,6 +226,18 @@ def test_query_index(self):
{'id': 'key', 'sort': 1, 'm': 'val'}
])

def test_query_index_with_filter(self):
self.table.create({'id': 'key', 'sort': 1, 'm': 'val', 'p': 'p'})
self.table.create({'id': 'key', 'sort': 3, 'm': 'val', 'p': 'p'})
self.table.create({'id': 'key', 'sort': 2, 'm': 'another', 'q': 'q'})

ret = list(self.table.get_many({'m': 'val'}, filter={'p': 'p'}))

self.assertEqual(ret, [
{'id': 'key', 'sort': 1, 'm': 'val', 'p': 'p'},
{'id': 'key', 'sort': 3, 'm': 'val', 'p': 'p'},
])

def test_query_index_with_partition_key(self):
self.table.create({'id': 'key', 'sort': 1, 'x': 'val', 'y': 0})
self.table.create({'id': 'key', 'sort': 2, 'x': 'val', 'y': 1})
Expand Down Expand Up @@ -359,13 +381,13 @@ def test_get_many_iterator(self):
self.assertEqual(list(dynamo.GetManyIterator(self.table, dict(id='key'))), expected)

# Query paginated, using the Python iterator protocol
self.assertEqual(list(dynamo.GetManyIterator(self.table, dict(id='key'), limit=3)), expected)
self.assertEqual(list(dynamo.GetManyIterator(self.table, dict(id='key'), batch_size=3)), expected)

# Reuse the client-side pager every time, using the Python iterator protocol
ret = []
token = None
while True:
many = dynamo.GetManyIterator(self.table, dict(id='key'), limit=3, pagination_token=token)
many = dynamo.GetManyIterator(self.table, dict(id='key'), batch_size=3, pagination_token=token)
ret.append(next(iter(many)))
token = many.next_page_token
if not token:
Expand All @@ -374,7 +396,7 @@ def test_get_many_iterator(self):

# Also test using the eof/current/advance protocol
ret = []
many = dynamo.GetManyIterator(self.table, dict(id='key'), limit=3, pagination_token=token)
many = dynamo.GetManyIterator(self.table, dict(id='key'), batch_size=3, pagination_token=token)
while many:
ret.append(many.current)
many.advance()
Expand Down
77 changes: 39 additions & 38 deletions website/database.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import functools
import operator
import itertools
from datetime import date, timedelta

from utils import timems, times

from . import dynamo
from . import querylog

storage = dynamo.AwsDynamoStorage.from_env() or dynamo.MemoryStorage("dev_database.json")

Expand All @@ -18,8 +20,9 @@
])
PROGRAMS = dynamo.Table(storage, "programs", "id", indexes=[
dynamo.Index('username', sort_key='date', index_name='username-index'),
dynamo.Index('public', sort_key='date', index_name='public-index'),
dynamo.Index('hedy_choice', sort_key='date', index_name='hedy_choice-index'),
# For the explore page, this index has 'level', 'lang' and 'adventure_name'
dynamo.Index('public', sort_key='date'),

# For the filtered view of the 'explore' page (keys_only so we don't duplicate other attributes unnecessarily)
dynamo.Index('lang', sort_key='date', keys_only=True),
Expand Down Expand Up @@ -202,7 +205,7 @@ def filtered_programs_for_user(self, username, level=None, adventure=None, submi
# FIXME: Query by index, the current behavior is slow for many programs
# (See https://github.com/hedyorg/hedy/issues/4121)
programs = dynamo.GetManyIterator(PROGRAMS, {"username": username},
reverse=True, limit=limit, pagination_token=pagination_token)
reverse=True, batch_size=limit, pagination_token=pagination_token)
for program in programs:
if level and program.get('level') != int(level):
continue
Expand All @@ -225,7 +228,7 @@ def filtered_programs_for_user(self, username, level=None, adventure=None, submi
def public_programs_for_user(self, username, limit=None, pagination_token=None):
# Only return programs that are public but not submitted
programs = dynamo.GetManyIterator(PROGRAMS, {"username": username},
reverse=True, limit=limit, pagination_token=pagination_token)
reverse=True, batch_size=limit, pagination_token=pagination_token)
ret = []
for program in programs:
if program.get("public") != 1 or program.get("submitted", False):
Expand Down Expand Up @@ -412,58 +415,46 @@ def get_all_public_programs(self):
programs = PROGRAMS.get_many({"public": 1}, reverse=True)
return [x for x in programs if not x.get("submitted", False)]

@querylog.timed_as("get_public_programs")
def get_public_programs(self, level_filter=None, language_filter=None, adventure_filter=None, limit=40):
"""Return the most recent N public programs, optionally filtered by attributes.
Walk down three key-only indexes at the same time until we have accumulated enough programs.
The 'public' index is the most discriminatory: fetch public programs, then evaluate them against
the filters (on the server). The index we're using here has the 'lang', 'adventure_name' and
'level' columns.
"""
filters = []
filter = {}
if level_filter:
filters.append(PROGRAMS.get_all({'level': int(level_filter)}, reverse=True))
filter['level'] = int(level_filter)
if language_filter:
filters.append(PROGRAMS.get_all({'lang': language_filter}, reverse=True))
filter['lang'] = language_filter
if adventure_filter:
filters.append(PROGRAMS.get_all({'adventure_name': adventure_filter}, reverse=True))
filter['adventure_name'] = adventure_filter

programs = dynamo.GetManyIterator(PROGRAMS, {'public': 1}, reverse=True)

# Iterate down programs, filtering down by the filters in 'filters' as we go to make sure
# the programs match the filter. This works because they all have a 'matching' date field
# we can use to sync up the streams.
#
# Use a cancellation token to timeout if this takes too long. For example, if there are 0
# programs to find this might take a long while to iterate through everything before it
# concludes we didn't want any of it.
#
# Intersecting the filters beforehand and then fetching whatever matches
# is more efficient if we are likely to match very little.
timeout = dynamo.Cancel.after_timeout(timedelta(seconds=3))

found_programs = []
for program in programs:
if len(found_programs) >= limit or timeout.is_cancelled():
id_batch_size = 200

found_program_ids = []
pagination_token = None
while len(found_program_ids) < limit and not timeout.is_cancelled():
page = PROGRAMS.get_many({'public': 1}, reverse=True, limit=id_batch_size,
pagination_token=pagination_token, filter=filter)
found_program_ids.extend([{'id': r['id']} for r in page])
pagination_token = page.next_page_token
if pagination_token is None:
break
del found_program_ids[limit:] # Cap off in case we found too much

# Advance every filter to match the date that the current program has
# FIXME: This is not guaranteed to catch 2 programs that have the same
# timestamp, but for the purposes of showing a sampling of public programs
# I don't really care.
for flt in filters:
while flt and flt.current['date'] > program['date']:
flt.advance()

# Include the current program in the result set if it is now the front item in each filter.
if all((flt and flt.current['id'] == program['id']) for flt in filters):
found_programs.append(program)

# Now retrieve all programs we found with a batch-get
found_programs = PROGRAMS.batch_get(found_program_ids)
return found_programs

def add_public_profile_information(self, programs):
"""Add the information to a list of programs on whether the author has a public profile or not.
"""For each program in a list, note whether the author has a public profile or not.
For each program, add 'public_user': True or 'public_user': None.
Modifies the list in-place.
Modifies the records in the list in-place.
"""
queries = {p['id']: {'username': p['username'].strip().lower()} for p in programs}
profiles = PUBLIC_PROFILES.batch_get(queries)
Expand Down Expand Up @@ -852,3 +843,13 @@ def to_year_week(self, d):
def get_username_role(self, username):
role = "teacher" if USERS.get({"username": username}).get("teacher_request") is True else "student"
return role


def batched(iterable, n):
"Batch data into tuples of length n. The last batch may be shorter."
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError('n must be at least one')
it = iter(iterable)
while batch := tuple(itertools.islice(it, n)):
yield batch
Loading

0 comments on commit 92c82e9

Please sign in to comment.