[FIX] Make explore page faster (hedyorg#4502)

The `explore` page was (effectively) downloading the entire content of multiple indexes (at least `lang` and `public`, if no other criteria were given), to do a linear filtering operation. The `lang` index is just not discriminatory enough, but the good news is that the `public` index is a lot better. Just use the `public` index, and filter all programs by the criteria that are provided. We've added the feature `FilterExpression`: this allows DynamoDB to do server-side filtering, so we don't have to do it client-side. This is still not arbitrary filtering: we will still get $$$ charged for the data that is read off of disk, the data is just not sent over the network... but that still saves time, and it saves client-side filtering. We've added the `public-date-index` which has just the fields necessary to do this server-side filtering in (`lang`, `adventure_name`, `level`), and no others to save disk space. Gets the load time of the `explore` page down to ~1s. This should also save a chunk of memory on the server. **How to test** Run against the prod database and visit the explore page locally; observe that everything still works.
jtwaleson · Sep 22, 2023 · 92c82e9 · 92c82e9
1 parent 1be2c75
commit 92c82e9
Show file tree

Hide file tree

Showing 4 changed files with 150 additions and 68 deletions.
diff --git a/app.py b/app.py
@@ -1669,12 +1669,16 @@ def explore():
         achievement = ACHIEVEMENTS.add_single_achievement(
             current_user()['username'], "indiana_jones")
 
-    programs = normalize_public_programs(DATABASE.get_public_programs(
+    programs = DATABASE.get_public_programs(
         limit=40,
         level_filter=level,
         language_filter=language,
-        adventure_filter=adventure))
-    favourite_programs = normalize_public_programs(DATABASE.get_hedy_choices())
+        adventure_filter=adventure)
+    favourite_programs = DATABASE.get_hedy_choices()
+
+    # Do 'normalize_public_programs' on both sets at once, to save database calls
+    normalized = normalize_public_programs(list(programs) + list(favourite_programs.records))
+    programs, favourite_programs = split_at(len(programs), normalized)
 
     adventures_names = hedy_content.Adventures(session['lang']).get_adventure_names()
 
@@ -2272,6 +2276,11 @@ def analyze_memory_snapshot(start_snapshot, end_snapshot):
     print("Total allocated size: %.1f KiB" % (total / 1024))
 
 
+def split_at(n, xs):
+    """Split a collection at an index."""
+    return xs[:n], xs[n:]
+
+
 if __name__ == '__main__':
     # Start the server on a developer machine. Flask is initialized in DEBUG mode, so it
     # hot-reloads files. We also flip our own internal "debug mode" flag to True, so our

diff --git a/tests/test_dynamo.py b/tests/test_dynamo.py
@@ -192,6 +192,16 @@ def test_query(self):
             {'id': 'key', 'sort': 2, 'm': 'another'}
         ])
 
+    def test_query_with_filter(self):
+        self.table.create({'id': 'key', 'sort': 1, 'm': 'val'})
+        self.table.create({'id': 'key', 'sort': 2, 'm': 'another'})
+
+        ret = list(self.table.get_many({'id': 'key'}, filter={'m': 'another'}))
+
+        self.assertEqual(ret, [
+            {'id': 'key', 'sort': 2, 'm': 'another'}
+        ])
+
     def test_between_query(self):
         self.table.create({'id': 'key', 'sort': 1, 'x': 'x'})
         self.table.create({'id': 'key', 'sort': 2, 'x': 'y'})
@@ -216,6 +226,18 @@ def test_query_index(self):
             {'id': 'key', 'sort': 1, 'm': 'val'}
         ])
 
+    def test_query_index_with_filter(self):
+        self.table.create({'id': 'key', 'sort': 1, 'm': 'val', 'p': 'p'})
+        self.table.create({'id': 'key', 'sort': 3, 'm': 'val', 'p': 'p'})
+        self.table.create({'id': 'key', 'sort': 2, 'm': 'another', 'q': 'q'})
+
+        ret = list(self.table.get_many({'m': 'val'}, filter={'p': 'p'}))
+
+        self.assertEqual(ret, [
+            {'id': 'key', 'sort': 1, 'm': 'val', 'p': 'p'},
+            {'id': 'key', 'sort': 3, 'm': 'val', 'p': 'p'},
+        ])
+
     def test_query_index_with_partition_key(self):
         self.table.create({'id': 'key', 'sort': 1, 'x': 'val', 'y': 0})
         self.table.create({'id': 'key', 'sort': 2, 'x': 'val', 'y': 1})
@@ -359,13 +381,13 @@ def test_get_many_iterator(self):
         self.assertEqual(list(dynamo.GetManyIterator(self.table, dict(id='key'))), expected)
 
         # Query paginated, using the Python iterator protocol
-        self.assertEqual(list(dynamo.GetManyIterator(self.table, dict(id='key'), limit=3)), expected)
+        self.assertEqual(list(dynamo.GetManyIterator(self.table, dict(id='key'), batch_size=3)), expected)
 
         # Reuse the client-side pager every time, using the Python iterator protocol
         ret = []
         token = None
         while True:
-            many = dynamo.GetManyIterator(self.table, dict(id='key'), limit=3, pagination_token=token)
+            many = dynamo.GetManyIterator(self.table, dict(id='key'), batch_size=3, pagination_token=token)
             ret.append(next(iter(many)))
             token = many.next_page_token
             if not token:
@@ -374,7 +396,7 @@ def test_get_many_iterator(self):
 
         # Also test using the eof/current/advance protocol
         ret = []
-        many = dynamo.GetManyIterator(self.table, dict(id='key'), limit=3, pagination_token=token)
+        many = dynamo.GetManyIterator(self.table, dict(id='key'), batch_size=3, pagination_token=token)
         while many:
             ret.append(many.current)
             many.advance()

diff --git a/website/database.py b/website/database.py
@@ -1,10 +1,12 @@
 import functools
 import operator
+import itertools
 from datetime import date, timedelta
 
 from utils import timems, times
 
 from . import dynamo
+from . import querylog
 
 storage = dynamo.AwsDynamoStorage.from_env() or dynamo.MemoryStorage("dev_database.json")
 
@@ -18,8 +20,9 @@
 ])
 PROGRAMS = dynamo.Table(storage, "programs", "id", indexes=[
     dynamo.Index('username', sort_key='date', index_name='username-index'),
-    dynamo.Index('public', sort_key='date', index_name='public-index'),
     dynamo.Index('hedy_choice', sort_key='date', index_name='hedy_choice-index'),
+    # For the explore page, this index has 'level', 'lang' and 'adventure_name'
+    dynamo.Index('public', sort_key='date'),
 
     # For the filtered view of the 'explore' page (keys_only so we don't duplicate other attributes unnecessarily)
     dynamo.Index('lang', sort_key='date', keys_only=True),
@@ -202,7 +205,7 @@ def filtered_programs_for_user(self, username, level=None, adventure=None, submi
         # FIXME: Query by index, the current behavior is slow for many programs
         # (See https://github.com/hedyorg/hedy/issues/4121)
         programs = dynamo.GetManyIterator(PROGRAMS, {"username": username},
-                                          reverse=True, limit=limit, pagination_token=pagination_token)
+                                          reverse=True, batch_size=limit, pagination_token=pagination_token)
         for program in programs:
             if level and program.get('level') != int(level):
                 continue
@@ -225,7 +228,7 @@ def filtered_programs_for_user(self, username, level=None, adventure=None, submi
     def public_programs_for_user(self, username, limit=None, pagination_token=None):
         # Only return programs that are public but not submitted
         programs = dynamo.GetManyIterator(PROGRAMS, {"username": username},
-                                          reverse=True, limit=limit, pagination_token=pagination_token)
+                                          reverse=True, batch_size=limit, pagination_token=pagination_token)
         ret = []
         for program in programs:
             if program.get("public") != 1 or program.get("submitted", False):
@@ -412,58 +415,46 @@ def get_all_public_programs(self):
         programs = PROGRAMS.get_many({"public": 1}, reverse=True)
         return [x for x in programs if not x.get("submitted", False)]
 
+    @querylog.timed_as("get_public_programs")
     def get_public_programs(self, level_filter=None, language_filter=None, adventure_filter=None, limit=40):
         """Return the most recent N public programs, optionally filtered by attributes.
 
-        Walk down three key-only indexes at the same time until we have accumulated enough programs.
+        The 'public' index is the most discriminatory: fetch public programs, then evaluate them against
+        the filters (on the server). The index we're using here has the 'lang', 'adventure_name' and
+        'level' columns.
         """
-        filters = []
+        filter = {}
         if level_filter:
-            filters.append(PROGRAMS.get_all({'level': int(level_filter)}, reverse=True))
+            filter['level'] = int(level_filter)
         if language_filter:
-            filters.append(PROGRAMS.get_all({'lang': language_filter}, reverse=True))
+            filter['lang'] = language_filter
         if adventure_filter:
-            filters.append(PROGRAMS.get_all({'adventure_name': adventure_filter}, reverse=True))
+            filter['adventure_name'] = adventure_filter
 
-        programs = dynamo.GetManyIterator(PROGRAMS, {'public': 1}, reverse=True)
-
-        # Iterate down programs, filtering down by the filters in 'filters' as we go to make sure
-        # the programs match the filter. This works because they all have a 'matching' date field
-        # we can use to sync up the streams.
-        #
-        # Use a cancellation token to timeout if this takes too long. For example, if there are 0
-        # programs to find this might take a long while to iterate through everything before it
-        # concludes we didn't want any of it.
-        #
-        # Intersecting the filters beforehand and then fetching whatever matches
-        # is more efficient if we are likely to match very little.
         timeout = dynamo.Cancel.after_timeout(timedelta(seconds=3))
-
-        found_programs = []
-        for program in programs:
-            if len(found_programs) >= limit or timeout.is_cancelled():
+        id_batch_size = 200
+
+        found_program_ids = []
+        pagination_token = None
+        while len(found_program_ids) < limit and not timeout.is_cancelled():
+            page = PROGRAMS.get_many({'public': 1}, reverse=True, limit=id_batch_size,
+                                     pagination_token=pagination_token, filter=filter)
+            found_program_ids.extend([{'id': r['id']} for r in page])
+            pagination_token = page.next_page_token
+            if pagination_token is None:
                 break
+        del found_program_ids[limit:]  # Cap off in case we found too much
 
-            # Advance every filter to match the date that the current program has
-            # FIXME: This is not guaranteed to catch 2 programs that have the same
-            # timestamp, but for the purposes of showing a sampling of public programs
-            # I don't really care.
-            for flt in filters:
-                while flt and flt.current['date'] > program['date']:
-                    flt.advance()
-
-            # Include the current program in the result set if it is now the front item in each filter.
-            if all((flt and flt.current['id'] == program['id']) for flt in filters):
-                found_programs.append(program)
-
+        # Now retrieve all programs we found with a batch-get
+        found_programs = PROGRAMS.batch_get(found_program_ids)
         return found_programs
 
     def add_public_profile_information(self, programs):
-        """Add the information to a list of programs on whether the author has a public profile or not.
+        """For each program in a list, note whether the author has a public profile or not.
 
         For each program, add 'public_user': True or 'public_user': None.
 
-        Modifies the list in-place.
+        Modifies the records in the list in-place.
         """
         queries = {p['id']: {'username': p['username'].strip().lower()} for p in programs}
         profiles = PUBLIC_PROFILES.batch_get(queries)
@@ -852,3 +843,13 @@ def to_year_week(self, d):
     def get_username_role(self, username):
         role = "teacher" if USERS.get({"username": username}).get("teacher_request") is True else "student"
         return role
+
+
+def batched(iterable, n):
+    "Batch data into tuples of length n. The last batch may be shorter."
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError('n must be at least one')
+    it = iter(iterable)
+    while batch := tuple(itertools.islice(it, n)):
+        yield batch