From 926cd7c5bb22dd1670ee221de473c071e4aad1d8 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 17 Feb 2022 14:41:58 -0500 Subject: [PATCH 01/17] Enables alternate Celery daily task for Figures This commit adds ENV_TOKENS['FIGURES']['DAILY_TASK'] override to define which Celery task is called by CeleryBeat to run the daily data update --- figures/settings/lms_production.py | 6 +++- tests/test_settings.py | 47 ++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/figures/settings/lms_production.py b/figures/settings/lms_production.py index b85002b0..fd7c3b5d 100644 --- a/figures/settings/lms_production.py +++ b/figures/settings/lms_production.py @@ -5,6 +5,8 @@ import os from celery.schedules import crontab +FIGURES_DEFAULT_DAILY_TASK = 'figures.tasks.populate_daily_metrics' + class FiguresRouter(object): @@ -64,8 +66,10 @@ def update_celerybeat_schedule( https://stackoverflow.com/questions/51631455/how-to-route-tasks-to-different-queues-with-celery-and-django """ if figures_env_tokens.get('ENABLE_DAILY_METRICS_IMPORT', True): + figures_daily_task = figures_env_tokens.get('DAILY_TASK', + FIGURES_DEFAULT_DAILY_TASK) celerybeat_schedule_settings['figures-populate-daily-metrics'] = { - 'task': 'figures.tasks.populate_daily_metrics', + 'task': figures_daily_task, 'schedule': crontab( hour=figures_env_tokens.get('DAILY_METRICS_IMPORT_HOUR', 2), minute=figures_env_tokens.get('DAILY_METRICS_IMPORT_MINUTE', 0), diff --git a/tests/test_settings.py b/tests/test_settings.py index e9192377..8b106974 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -12,7 +12,10 @@ from figures import helpers as figures_helpers -from figures.settings.lms_production import plugin_settings +from figures.settings.lms_production import ( + FIGURES_DEFAULT_DAILY_TASK, + plugin_settings +) @pytest.mark.parametrize('features, expected', [ @@ -90,6 +93,44 @@ def test_update_settings(self, figures_env_tokens, run_celery): assert settings.ENV_TOKENS['FIGURES'] == figures_env_tokens +class TestDailyTaskFunction(object): + """Tests setting for Figures top level Celery daily job task + + The top level Celery daily job task is what the scheduler calls to populate + Figures enrollment data snapshots and Figures daily aggregate metrics + + We added this test case so that we can improve the daily Figures jobs, while + retaining the ability for deployments to run the previously implemented + daily jobs by default. This is a risk reduction strategy + """ + ALTERNATE_TASK = 'figures.tasks.populate_daily_metrics_v2' + + @pytest.fixture(autouse=True) + def setup(self, db): + self.settings = mock.Mock( + WEBPACK_LOADER={}, + CELERYBEAT_SCHEDULE={}, + FEATURES={}, + ENV_TOKENS={}, + CELERY_IMPORTS=[], + ) + + def test_uses_default_daily_task(self): + """Do the default settings define the default Celery task in CeleryBeat? + """ + plugin_settings(self.settings) + task_found = self.settings.CELERYBEAT_SCHEDULE['figures-populate-daily-metrics']['task'] + assert task_found == FIGURES_DEFAULT_DAILY_TASK + + def test_uses_env_extra_daily_task(self): + """Does overriding the Celery task function setting in CeleryBeat? + """ + self.settings.ENV_TOKENS['FIGURES'] = {'DAILY_TASK': self.ALTERNATE_TASK} + plugin_settings(self.settings) + task_found = self.settings.CELERYBEAT_SCHEDULE['figures-populate-daily-metrics']['task'] + assert task_found == self.ALTERNATE_TASK + + class TestDailyMauPipelineSettings(object): """Tests MAU pipeline settings @@ -115,14 +156,14 @@ def setup(self, db): ) def test_daily_mau_pipeline_flag_enabled(self): - self.settings.ENV_TOKENS['FIGURES'] = { 'ENABLE_DAILY_MAU_IMPORT': True } + self.settings.ENV_TOKENS['FIGURES'] = {'ENABLE_DAILY_MAU_IMPORT': True} plugin_settings(self.settings) assert self.TASK_NAME in self.settings.CELERYBEAT_SCHEDULE assert set(['task', 'schedule', 'options']) == set( self.settings.CELERYBEAT_SCHEDULE[self.TASK_NAME].keys()) def test_daily_mau_pipeline_flag_disabled(self): - self.settings.ENV_TOKENS['FIGURES'] = { 'ENABLE_DAILY_MAU_IMPORT': False } + self.settings.ENV_TOKENS['FIGURES'] = {'ENABLE_DAILY_MAU_IMPORT': False} plugin_settings(self.settings) assert self.TASK_NAME not in self.settings.CELERYBEAT_SCHEDULE From 83687ca54b77d90dfaafaeb2a2301ea352ece5f6 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 17 Feb 2022 15:05:04 -0500 Subject: [PATCH 02/17] One-liner update to DRY up default site handling figures.sites.default_site() is returning the Site object matching `settings.SITE_ID` if we are running in single site mode and that is exactly what is called in single site mode for `get_site_for_course(course_id)`, so just making it that the source of truth is the same code --- figures/sites.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/figures/sites.py b/figures/sites.py index 9f987438..e8be5344 100644 --- a/figures/sites.py +++ b/figures/sites.py @@ -134,7 +134,7 @@ def get_site_for_course(course_id): site = None else: # Operating in single site / standalone mode, return the default site - site = Site.objects.get(id=settings.SITE_ID) + site = default_site() return site From 42ce02d575ba9cc6004d4dde91699252daf9f7f0 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Sat, 19 Feb 2022 00:39:32 -0500 Subject: [PATCH 03/17] Add utc_yesterday helper to figures.helpers This is a convenience function to get the previous calendar day from th ecurrent UTC time. This is the date for which the pipeline uses to collect, save data and aggregate metrics for the daily jobs --- figures/helpers.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/figures/helpers.py b/figures/helpers.py index c57bb172..053c5d36 100644 --- a/figures/helpers.py +++ b/figures/helpers.py @@ -1,6 +1,7 @@ """Helper functions to make data handling and conversions easier # Figures 0.3.13 - Defining scope of this module +# Figures 0.4.x - Yep, this module is still handy and the scope hasn't exploded The purpose of this module is to provide conveniece methods around commonly executed statements. These convenience methods should serve as shorthand for @@ -34,6 +35,8 @@ ## What does not belong here? +* Most importantly, if you have to import from another figures module, it does + not belong here! * "Feature" functionality does not belong here * Long functions do not belong here * Code that communicates outside of Figures does not belong here. No database, @@ -42,7 +45,10 @@ This is not an exhaustive list. We'll grow it as needed. An important point is that we should not expect this module to be a permanent -home for functionality. +home for the functionality included. As we evolve Figures, we may find functions +here that have a stronger context with another module. For example, we've got +a decent set of date oriented functions that are candidatdes for a datetime and +date handling module. """ from __future__ import absolute_import @@ -188,6 +194,15 @@ def prev_day(val): return days_from(val, -1) +def utc_yesterday(): + """Get "yesterday" form the utc datetime + + We primarily use this for the daily metrics collection. However, it proves + handy as a convenience function for exploring data in the Django shell + """ + return prev_day(datetime.datetime.utcnow().date()) + + def days_in_month(month_for): _, num_days_in_month = calendar.monthrange(month_for.year, month_for.month) return num_days_in_month From 5a5ff31e03ed6a6ca18e9502effcfb589cd78b13 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 24 Feb 2022 12:32:37 -0500 Subject: [PATCH 04/17] Fix and update devsite settings Fixes: * Apparently devsite/settings.py broke with the Celery routing addition. This didn't impact `test_settings` but would fail to run devsite `manage.py` which relies on `devsite/settings.py`. This commit fixes that * Added `update_celery_routes` to `devsite/settings.py` as part of the Celery fix to make it to prepare for a better devsite Celery story to make Figures Celery task development a better experience Improvements: * Updated devsite's default Open edX version to Juniper --- devsite/.env.example | 4 ++-- devsite/devsite/settings.py | 13 +++++++++++-- devsite/devsite/test_settings.py | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/devsite/.env.example b/devsite/.env.example index 9a381bfc..58ed6fa0 100644 --- a/devsite/.env.example +++ b/devsite/.env.example @@ -15,8 +15,8 @@ DATABASE_URL=mysql://figures_user:drowssap-ekaf@127.0.0.1:3306/figures-db # Set which expected Open edX release mocks for devsite to use. # Valid options are: "GINKGO", "HAWTHORN", "JUNIPER" # -# If not specified here, then "HAWTHORN" is used -OPENEDX_RELEASE=JUNIPER +# If not specified here, then "JUNIPER" is used +# OPENEDX_RELEASE=JUNIPER # Enable/disable Figures multisite mode in devsite # This also requires diff --git a/devsite/devsite/settings.py b/devsite/devsite/settings.py index 4fc535f0..227e0895 100644 --- a/devsite/devsite/settings.py +++ b/devsite/devsite/settings.py @@ -14,6 +14,7 @@ from figures.settings.lms_production import ( update_webpack_loader, update_celerybeat_schedule, + update_celery_routes, ) DEVSITE_BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -22,7 +23,7 @@ env = environ.Env( DEBUG=(bool, True), ALLOWED_HOSTS=(list, []), - OPENEDX_RELEASE=(str, 'HAWTHORN'), + OPENEDX_RELEASE=(str, 'JUNIPER'), FIGURES_IS_MULTISITE=(bool, False), ENABLE_DEVSITE_CELERY=(bool, True), ENABLE_OPENAPI_DOCS=(bool, False), @@ -240,8 +241,16 @@ # We have an empty dict here to replicate behavior in the LMS ENV_TOKENS = {} +PRJ_SETTINGS = { + 'CELERY_ROUTES': "app.celery.routes" +} + +FIGURES_PIPELINE_TASKS_ROUTING_KEY = "" + update_webpack_loader(WEBPACK_LOADER, ENV_TOKENS) -update_celerybeat_schedule(CELERYBEAT_SCHEDULE, ENV_TOKENS) +update_celerybeat_schedule(CELERYBEAT_SCHEDULE, ENV_TOKENS, FIGURES_PIPELINE_TASKS_ROUTING_KEY) +update_celery_routes(PRJ_SETTINGS, ENV_TOKENS, FIGURES_PIPELINE_TASKS_ROUTING_KEY) + # Used by Django Debug Toolbar INTERNAL_IPS = [ diff --git a/devsite/devsite/test_settings.py b/devsite/devsite/test_settings.py index 94305565..cb567ef1 100644 --- a/devsite/devsite/test_settings.py +++ b/devsite/devsite/test_settings.py @@ -28,7 +28,7 @@ def root(*args): env = environ.Env( - OPENEDX_RELEASE=(str, 'HAWTHORN'), + OPENEDX_RELEASE=(str, 'JUNIPER'), ) environ.Env.read_env(join(dirname(dirname(__file__)), '.env')) From c6693fe12d7873dd94017c438c5ac32cd427f953 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 24 Feb 2022 13:38:02 -0500 Subject: [PATCH 05/17] Enrollment model updates + tests + support code This commit contains changes to prepare for adding new daily metrics workflow to speed up daily job execution * Added new EnrollmentData model manager method to handle and encapsulate EnrollmentData/LCGM update logic given a `CourseEnrollment` record as parameter. Added unit tests for this new method. * Adds `collect_elapsed` field to EnrollmentData and LCGM to track how long it takes to collect the progress data for that enrollment. The big cost here is currently to `CourseGradeFactory()` * Added convenience method to StudentModuleFactory to create one form a `CourseEnrollment` record instead of having to pass in two variables (user and course_id) to construct a test StudentModule record to match an already constructed CourseEnrollment record --- ...0016_add_collect_elapsed_to_ed_and_lcgm.py | 24 +++ figures/models.py | 82 ++++++- tests/factories.py | 14 ++ .../test_enrollment_data_update_metrics.py | 201 ++++++++++++++++++ 4 files changed, 316 insertions(+), 5 deletions(-) create mode 100644 figures/migrations/0016_add_collect_elapsed_to_ed_and_lcgm.py create mode 100644 tests/models/test_enrollment_data_update_metrics.py diff --git a/figures/migrations/0016_add_collect_elapsed_to_ed_and_lcgm.py b/figures/migrations/0016_add_collect_elapsed_to_ed_and_lcgm.py new file mode 100644 index 00000000..26de21a2 --- /dev/null +++ b/figures/migrations/0016_add_collect_elapsed_to_ed_and_lcgm.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('figures', '0015_add_enrollment_data_model'), + ] + + operations = [ + migrations.AddField( + model_name='enrollmentdata', + name='collect_elapsed', + field=models.FloatField(null=True), + ), + migrations.AddField( + model_name='learnercoursegrademetrics', + name='collect_elapsed', + field=models.FloatField(null=True), + ), + ] diff --git a/figures/models.py b/figures/models.py index 8ae1482e..81fe1175 100644 --- a/figures/models.py +++ b/figures/models.py @@ -5,6 +5,7 @@ from __future__ import absolute_import from datetime import date +import timeit from django.conf import settings from django.contrib.sites.models import Site from django.core.validators import MaxValueValidator, MinValueValidator @@ -17,7 +18,7 @@ from model_utils.models import TimeStampedModel from figures.compat import CourseEnrollment -from figures.helpers import as_course_key +from figures.helpers import as_course_key, utc_yesterday from figures.progress import EnrollmentProgress @@ -203,7 +204,7 @@ class EnrollmentDataManager(models.Manager): EnrollmentData instances. """ - def set_enrollment_data(self, site, user, course_id, course_enrollment=False): + def set_enrollment_data(self, site, user, course_id, course_enrollment=None): """ This is an expensive call as it needs to call CourseGradeFactory if there is not already a LearnerCourseGradeMetrics record for the learner @@ -259,6 +260,71 @@ def set_enrollment_data(self, site, user, course_id, course_enrollment=False): defaults=defaults) return obj, created + def update_metrics(self, site, course_enrollment, force_update=False): + """ + This is an expensive call as it needs to call CourseGradeFactory if + there is not already a LearnerCourseGradeMetrics record for the learner + + """ + date_for = utc_yesterday() + + # check if we already have a record for the date for EnrollmentData + + # Alternately, we could use a try/except on a 'get' call, however, this + # would be much slower for a bunch of new enrollments + + # Should only be one even if we don't include the site in the query + # because course_id should be globally unique + # If course id is ever NOT globally unique, then we need to add site + # to the query + ed_recs = EnrollmentData.objects.filter( + user_id=course_enrollment.user_id, + course_id=str(course_enrollment.course_id)) + + if not ed_recs or ed_recs[0].date_for < date_for or force_update: + # We do the update + start_time = timeit.default_timer() + # get the progress data + ep = EnrollmentProgress(user=course_enrollment.user, + course_id=str(course_enrollment.course_id)) + defaults = dict( + date_for=date_for, + is_completed=ep.is_completed(), + progress_percent=ep.progress_percent(), + points_possible=ep.progress.get('points_possible', 0), + points_earned=ep.progress.get('points_earned', 0), + sections_possible=ep.progress.get('sections_possible', 0), + sections_worked=ep.progress.get('sections_worked', 0), + is_enrolled=course_enrollment.is_active, + date_enrolled=course_enrollment.created, + ) + elapsed = timeit.default_timer() - start_time + defaults['collect_elapsed'] = elapsed + + ed_rec, created = self.update_or_create( + site=site, + user=course_enrollment.user, + course_id=str(course_enrollment.course_id), + defaults=defaults) + # create a new LCGM record + # if it already exists for the day + LearnerCourseGradeMetrics.objects.update_or_create( + site=ed_rec.site, + user=ed_rec.user, + course_id=ed_rec.course_id, + date_for=date_for, + defaults=dict( + points_possible=ed_rec.points_possible, + points_earned=ed_rec.points_earned, + sections_worked=ed_rec.sections_worked, + sections_possible=ed_rec.sections_possible, + collect_elapsed=elapsed + ) + ) + return ed_rec, created + else: + return ed_recs[0], False + @python_2_unicode_compatible class EnrollmentData(TimeStampedModel): @@ -301,6 +367,9 @@ class EnrollmentData(TimeStampedModel): sections_worked = models.IntegerField() sections_possible = models.IntegerField() + # seconds it took to collect progress data + collect_elapsed = models.FloatField(null=True) + objects = EnrollmentDataManager() class Meta: @@ -324,7 +393,7 @@ def progress_details(self): class LearnerCourseGradeMetricsManager(models.Manager): - """Custom model manager for LearnerCourseGrades model + """Custom model manager for LearnerCourseGradeMetrics model """ def latest_lcgm(self, user, course_id): """Gets the most recent record for the given user and course @@ -414,9 +483,9 @@ class LearnerCourseGradeMetrics(TimeStampedModel): Purpose is primarliy to improve performance for the front end. In addition, data collected can be used for course progress over time - We're capturing data from figures.metrics.LearnerCourseGrades + We're capturing data from figures.progress.EnrollmentProgress - Note: We're probably going to move ``LearnerCourseGrades`` to figures.pipeline + Note: We're probably going to move ``EnrollmentProgress`` to figures.pipeline since that class will only be needed by the pipeline Even though this is for a course enrollment, we're mapping to the user @@ -452,6 +521,9 @@ class LearnerCourseGradeMetrics(TimeStampedModel): sections_worked = models.IntegerField() sections_possible = models.IntegerField() + # seconds it took to collect progress data + collect_elapsed = models.FloatField(null=True) + objects = LearnerCourseGradeMetricsManager() class Meta: diff --git a/tests/factories.py b/tests/factories.py index 66a646ac..90fddda5 100644 --- a/tests/factories.py +++ b/tests/factories.py @@ -221,6 +221,20 @@ class Meta: 2018,2,2, tzinfo=factory.compat.UTC)) + @classmethod + def from_course_enrollment(cls, course_enrollment, **kwargs): + """Contruct a StudentModule for the given CourseEnrollment + + kwargs provides for additional optional parameters if you need to + override the default factory assignment + """ + kwargs.update({ + 'student': course_enrollment.user, + 'course_id': course_enrollment.course_id, + }) + return cls(**kwargs) + + if OPENEDX_RELEASE == GINKGO: class CourseEnrollmentFactory(DjangoModelFactory): class Meta: diff --git a/tests/models/test_enrollment_data_update_metrics.py b/tests/models/test_enrollment_data_update_metrics.py new file mode 100644 index 00000000..e4ff46d9 --- /dev/null +++ b/tests/models/test_enrollment_data_update_metrics.py @@ -0,0 +1,201 @@ +""" +Basic tests for figures.models.EnrollmentData model + +Test scenarios we need to help verify the data model + +1. Learner doesn't have a CourseEnrollment (CE) +2. Learner just signed up. Has a CE, no LearnerCourseGradeMetric (LCGM) +3. Learner has CE and one LCGM +4. Learner has CE and multiple LCGM +5. Learner completed the course +6. Learner is no longer active in the course (Note: this is only handled in + our data by the fact that the EnrollmentData.is_active field would be False) +7+ The test scenrios we haven't identified yet + + +Tests fail in Ginkgo due to + "TypeError: 'course' is an invalid keyword argument for this function" + +Which is interesting because other tests use CourseEnrollmentFactory(course=X) +and they do not fail in Ginkgo. For now, skipping test in Ginkgo +""" + +import pytest + +from mock import patch + +from django.contrib.sites.models import Site + +from figures.helpers import ( + as_datetime, days_from, utc_yesterday +) +from figures.models import EnrollmentData, LearnerCourseGradeMetrics + +from tests.factories import ( + CourseEnrollmentFactory, + CourseOverviewFactory, + EnrollmentDataFactory, + LearnerCourseGradeMetricsFactory, + OrganizationFactory, + OrganizationCourseFactory, + StudentModuleFactory) +from tests.helpers import ( + organizations_support_sites +) + +if organizations_support_sites(): + from tests.factories import UserOrganizationMappingFactory + + def map_users_to_org(org, users): + """Convenience method to simplify test code + """ + [UserOrganizationMappingFactory(user=user, + organization=org) for user in users] + + +# @pytest.mark.skipif(OPENEDX_RELEASE == GINKGO, reason='Breaks on CourseEnrollmentFactory') +@pytest.mark.django_db +class TestUpdateMetrics(object): + """Test EnrollmentDataManager.update_metrics method + + """ + @pytest.fixture(autouse=True) + def setup(self, db, settings): + + # Set up data that's the same for standalone or multisite + self.date_for = utc_yesterday() + self.site = Site.objects.first() + self.courses = [CourseOverviewFactory(), CourseOverviewFactory()] + + # Two for "our" course, one for another course in the same site + self.enrollments = [ + CourseEnrollmentFactory(course_id=self.courses[0].id), + CourseEnrollmentFactory(course_id=self.courses[0].id), + CourseEnrollmentFactory(course_id=self.courses[1].id), + ] + + self.ce0_sm = StudentModuleFactory.from_course_enrollment( + self.enrollments[0], + created=as_datetime(self.date_for), + modified=as_datetime(self.date_for)) + + # Handle site mode specifices + if organizations_support_sites(): + settings.FEATURES['FIGURES_IS_MULTISITE'] = True + self.org = OrganizationFactory(sites=[self.site]) + for course in self.courses: + OrganizationCourseFactory(organization=self.org, + course_id=str(course.id)) + map_users_to_org(self.org, [ce.user for ce in self.enrollments]) + + # For our tests, we focus on a single enrollment. We should not + # need to stand up other site data, but if we find we do need to, + # then here's the place to do it + else: + self.org = OrganizationFactory() + + def setup_data_for_force_checks(self): + pass + + def test_new_records_yesterday(self): + """Test new enrollment with first activity in the course yesterday + """ + ce = self.enrollments[0] + the_enrollment = { + 'site': self.site, + 'user': self.enrollments[0].user, + 'course_id': str(self.enrollments[0].course_id) + } + assert not EnrollmentData.objects.filter(**the_enrollment) + + ed, created = EnrollmentData.objects.update_metrics(self.site, ce) + + check_ed = EnrollmentData.objects.get(**the_enrollment) + lcgm = LearnerCourseGradeMetrics.objects.latest_lcgm(ce.user, ce.course_id) + assert check_ed == ed + assert created + assert check_ed.date_for == self.date_for + assert lcgm.date_for == self.date_for + + def test_edrec_exists_older_lcgm(self): + ce = self.enrollments[0] + older_date = days_from(self.date_for, -2) + + # Create existing Figures records + EnrollmentDataFactory(site=self.site, + user=ce.user, + course_id=str(ce.course_id), + date_for=older_date) + older_lcgm = LearnerCourseGradeMetricsFactory(site=self.site, + user=ce.user, + course_id=str(ce.course_id), + date_for=older_date) + + # Make sure that the LCGM we created is the most recent one + assert LearnerCourseGradeMetrics.objects.latest_lcgm(ce.user, + ce.course_id) == older_lcgm + # assert lcgm1 == older_lcgm + # run our code under test + ed, created = EnrollmentData.objects.update_metrics(self.site, ce) + # verify our Figures records are updated + after_lcgm = LearnerCourseGradeMetrics.objects.latest_lcgm(ce.user, ce.course_id) + after_ed = EnrollmentData.objects.get(site=self.site, + user=ce.user, + course_id=str(ce.course_id)) + assert after_lcgm.date_for == self.date_for + assert after_ed.date_for == self.date_for + + def test_exists_no_force(self): + ce = self.enrollments[0] + construct_kwargs = dict( + site=self.site, + user=ce.user, + course_id=str(ce.course_id), + date_for=self.date_for) + before_ed = EnrollmentDataFactory(**construct_kwargs) + LearnerCourseGradeMetricsFactory(**construct_kwargs) + with patch('figures.models.EnrollmentProgress._get_progress') as get_prog: + ed, created = EnrollmentData.objects.update_metrics(self.site, ce) + assert not get_prog.called + assert ed == before_ed + + def test_force_update(self): + ce = self.enrollments[0] + + # Create existing Figures records + # We only need to assign one progress value but we assign the possible + # and earned for one to make sure that the earned is not more than the + # possible. We arbitrarily choose points. We could have also chosen + # sections or assigned both + construct_kwargs = dict( + site=self.site, + user=ce.user, + course_id=str(ce.course_id), + date_for=self.date_for, + points_earned=5, + points_possible=10) + EnrollmentDataFactory(**construct_kwargs) + before_lcgm = LearnerCourseGradeMetricsFactory(**construct_kwargs) + + fake_progress = dict(points_possible=50, + points_earned=25, + sections_possible=10, + sections_worked=5) + + with patch('figures.models.EnrollmentProgress._get_progress', return_value=fake_progress): + ed, created = EnrollmentData.objects.update_metrics(self.site, ce, force_update=True) + + # verify our Figures records are updated + lcgm = LearnerCourseGradeMetrics.objects.latest_lcgm(ce.user, ce.course_id) + check_ed = EnrollmentData.objects.get(site=self.site, + user=ce.user, + course_id=str(ce.course_id)) + assert check_ed == ed + assert not created + assert check_ed.date_for == self.date_for + assert check_ed.points_earned == fake_progress['points_earned'] + assert lcgm.date_for == self.date_for + assert lcgm.id == before_lcgm.id + # We only need to check one of the progress fields to know it was updated + assert lcgm.points_earned == fake_progress['points_earned'] + # import pdb; pdb.set_trace() From 839b3226933215c9e4b4e6233ab941707498bc00 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 25 Feb 2022 00:03:59 -0500 Subject: [PATCH 06/17] figures.course is a new module to simplify Figures This PR adds `figures.course`, a new module that has the `Course` class The goal of this class is to simplify Figures code This class provides data specific to and associated with a course. Our initial version provides only the essential data needed for our pipeline performance improvement --- figures/course.py | 115 +++++++++++++++++++++++++++++++++++++++++++ tests/test_course.py | 110 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 figures/course.py create mode 100644 tests/test_course.py diff --git a/figures/course.py b/figures/course.py new file mode 100644 index 00000000..207921fd --- /dev/null +++ b/figures/course.py @@ -0,0 +1,115 @@ +"""Course specific module for Figures + +## This module defined a `Course` class for data retrieval + +Initialy created to do the following: + +1. Reduce duplication in Figures "pipeline" +2. Build stronger course context to make Figures programming easier + +## Background summary + +A course id is globally unique as it has the identify of the organization and +organizations are globally unique. + +## Design to think about - Enrollment class + +Build on Django's lazy eval for querysets to create also an `Enrollment` class +that provides interfaces for `enrollment.date_for` and abstracts this bit of +mess that enrollments and student modules do NOT associate and instead we need +to query back and forth with `user_id` and `course_id`. +""" +from __future__ import absolute_import +from django.db.models import Q +from figures.compat import CourseEnrollment, StudentModule +from figures.helpers import ( + as_course_key, + as_date, +) +from figures.sites import ( + get_site_for_course, +) + + +class Course(object): + """Representation of a Course. + + The impetus for this class was dealing with querying for course enrollment + and student module records for a specific course and for dates and date + ranges for the course + + ## Architecture goal + + **Start simple and don't build the kitchen sink into here right away just + because this class exists** + + ## Data under consideration to have this class handle + + * enrollments created on a date, before, after, between. However this would + just be a convenience as the `.enrollments` property returns a queryset that + can be filtered on `.created` + """ + def __init__(self, course_id): + """ + Initial version, we pass in a course ID and cast to a course key as an + instance attribute. Later on, add `CourseLike` to abstract course identity + so we can stop worrying about "Is it a string repretentation of a course or + is it a CourseKey?" + """ + self.course_key = as_course_key(course_id) + + # Improvement: Consider doing lazy evaluation + self.site = get_site_for_course(self.course_id) + + def __repr__(self): + return '{}.{} <{}>'.format(self.__module__, + self.__class__.__name__, + str(self.course_key)) + + def __str__(self): + return self.__repr__() + + @property + def course_id(self): + """Returns string representation of the course id + """ + return str(self.course_key) + + @property + def enrollments(self): + """Returns CourseEnrollment queryset for the course + """ + return CourseEnrollment.objects.filter(course_id=self.course_key) + + @property + def student_modules(self): + """Returns StudentModule queryset for enrollments in the course + """ + return StudentModule.objects.filter(course_id=self.course_key) + + def student_modules_active_on_date(self, date_for): + """Returns StudentModule queryset active on the date + Active is if there was a `created` or `modified` field for the given date + + NOTE: We need to do this instead of simplly `modified__date=date_for` + because we still have to support Django 1.8/Ginkgo + """ + date_for = as_date(date_for) + q_created = Q(created__year=date_for.year, + created__month=date_for.month, + created__day=date_for.day) + q_modified = Q(modified__year=date_for.year, + modified__month=date_for.month, + modified__day=date_for.day) + return self.student_modules.filter(q_created | q_modified) + + def enrollments_active_on_date(self, date_for): + """Return CourseEnrollment queryset for enrollments active on the date + + Looks for student modules modified on the specified date and returns + matching CourseEnrollment records + """ + sm = self.student_modules_active_on_date(date_for) + user_ids = sm.values('student_id').distinct() + return CourseEnrollment.objects.filter(course_id=self.course_key, + user_id__in=user_ids) diff --git a/tests/test_course.py b/tests/test_course.py new file mode 100644 index 00000000..7415dfee --- /dev/null +++ b/tests/test_course.py @@ -0,0 +1,110 @@ +"""Test `figures.course` module + +""" +from __future__ import absolute_import +import pytest +from faker import Faker +from opaque_keys.edx.keys import CourseKey + +from figures.course import Course +from figures.helpers import days_from +from figures.sites import get_site_for_course + +from tests.factories import ( + CourseEnrollmentFactory, + CourseOverviewFactory, + StudentModuleFactory, +) + +fake = Faker() + + +@pytest.mark.django_db +class TestCourse: + """ + Starting with just basic tests + """ + @pytest.fixture(autouse=True) + def setup(self, db): + self.course_overview = CourseOverviewFactory() + # Little sanity check making sure our Factory has the right class type + # for the course key + assert isinstance(self.course_overview.id, CourseKey) + + def assert_construction(self, course): + assert course.course_key == self.course_overview.id + assert course.course_id == str(self.course_overview.id) + assert course.site == get_site_for_course(self.course_overview.id) + + def simple_test_property_course_id_association(self, factory_class, property_name): + """Helper method to DRY tests for property methods on simple associations + + This method can be used to test the Course class property methods return + expected results for property methods that return objects that can be + associated with a course with just the course id. Initially implemented + to test the enrollments and student modules property methods. Could also + be used to test for other models like CourseAccessGroup, CourseAccessRole, + Figures CourseDailyMetrics, + EnrollmentData, or LearnerCourseGradeMetrics if we wanted to implement + property methods for those + + If we did use this for models that use a string course id instead of a + CourseKey, then we'll need to make sure the handled course id form is + used. We could do this with a method parameter that either casts to the + expected form or defines which one to use and we can explicitly cast in + this method + """ + our_objects = [factory_class(course_id=self.course_overview.id) + for _ in range(2)] + other_object = factory_class() + assert not other_object.course_id == self.course_overview.id + course = Course(self.course_overview.id) + assert set(our_objects) == set(getattr(course, property_name)) + + def test_str_constructor(self): + self.assert_construction(Course(str(self.course_overview.id))) + + def test_course_key_constructor(self): + self.assert_construction(Course(self.course_overview.id)) + + def test_enrollments(self): + self.simple_test_property_course_id_association(CourseEnrollmentFactory, + 'enrollments') + + def test_student_modules(self): + self.simple_test_property_course_id_association(StudentModuleFactory, + 'student_modules') + + def test_student_modules_active_on_date(self): + our_date_for = fake.date_this_year() + our_created_sm = [StudentModuleFactory(course_id=self.course_overview.id, + created=our_date_for) for _ in range(2)] + our_modified_sm = [StudentModuleFactory(course_id=self.course_overview.id, + modified=our_date_for) for _ in range(2)] + # Create record with a different date + StudentModuleFactory(course_id=self.course_overview.id, + created=days_from(our_date_for, -2), + modified=days_from(our_date_for, -1)) + course = Course(self.course_overview.id) + found_sm = course.student_modules_active_on_date(our_date_for) + assert set(our_created_sm + our_modified_sm) == set(found_sm) + + def test_enrollments_active_on_date(self): + our_date_for = fake.date_this_year() + other_date_for = days_from(our_date_for, -1) + our_ce = [CourseEnrollmentFactory(course_id=self.course_overview.id) + for _ in range(2)] + our_sm = [] + for ce in our_ce: + our_sm.extend([ + StudentModuleFactory.from_course_enrollment(ce, modified=our_date_for), + StudentModuleFactory.from_course_enrollment(ce, created=our_date_for) + ]) + # Create enrollment we should not get in our results + other_ce = CourseEnrollmentFactory(course_id=self.course_overview.id) + StudentModuleFactory.from_course_enrollment(other_ce, + created=other_date_for, + modified=other_date_for) + course = Course(self.course_overview.id) + found_ce = course.enrollments_active_on_date(our_date_for) + assert set(found_ce) == set(our_ce) From f6a177d15644d21da284e14f069e6005b57580a3 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 25 Feb 2022 14:17:48 -0500 Subject: [PATCH 07/17] Add next iteration to collect enrollment data This PR provides alternate execution to create the same data as before * See the module docstring in `figures.pipeline.enrollment_metrics_next` for details * This PR contains a new module and tests to support the new module --- figures/pipeline/enrollment_metrics_next.py | 68 ++++++++++ .../pipeline/test_enrollment_metrics_next.py | 118 ++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 figures/pipeline/enrollment_metrics_next.py create mode 100644 tests/pipeline/test_enrollment_metrics_next.py diff --git a/figures/pipeline/enrollment_metrics_next.py b/figures/pipeline/enrollment_metrics_next.py new file mode 100644 index 00000000..7b00fa07 --- /dev/null +++ b/figures/pipeline/enrollment_metrics_next.py @@ -0,0 +1,68 @@ +"""This module updates Figures enrollment data and calculates aggregate progress + +* It updates `EnrollmentData` and `LearnerCourseGradeMetrics` records +* It calculate course progress from EnrollmentData records + +This generates the same metrics as the original enrollment_metrics modules, +but does it differently. + +## How it differs from the previous version + +This module improves on the existing enrollment metrics collection module, +`figures.pipeline.enrollment_metrics` + +* It separates the activities to create and update Figures per-enrollment data + collected +* This separation lets Figures run metrics in distinct stages + * First, collect per-enrollment data + * Second, aggregate metrics based on collected data +* This provides a workflow that is easier to resume if interrupted +* This provides workflow that is simpler to debug +* This simplifies and speeds up aggregate progress metrics, collapsing complex + code into a single Django queryset aggregation +* This update lays groundwork for further metrics improvements and enhancements + such as metrics on subsets of learners in a course or progress of subsets of + learners across courses + +# Developer Notes + +This module provides + +""" +from django.db.models import Avg +from figures.course import Course +from figures.helpers import utc_yesterday +from figures.models import EnrollmentData +from figures.sites import UnlinkedCourseError + + +def update_enrollment_data_for_course(course_id): + """Updates Figures per-enrollment data for enrollments in the course + Checks for and creates new `LearnerCourseGradeMetrics` records and updates + `EnrollmentData` records + + Return results are a list of the results returned by `update_enrollment_data` + """ + date_for = utc_yesterday() + the_course = Course(course_id) + if not the_course.site: + raise UnlinkedCourseError('No site found for course "{}"'.format(course_id)) + + # Any updated student module records? if so, then get the unique enrollments + # for each enrollment, check if LGCM is out of date or up to date + active_enrollments = the_course.enrollments_active_on_date(date_for) + return [EnrollmentData.objects.update_metrics(the_course.site, ce) + for ce in active_enrollments] + + +def calculate_course_progress(course_id): + """Return average progress percentage for all enrollments in the course + """ + results = EnrollmentData.objects.filter(course_id=str(course_id)).aggregate( + average_progress=Avg('progress_percent')) + + # This is a bit of a hack. When we overhaul progress data, we should really + # have None for progress if there's no data. But check how SQL AVG performs + if results['average_progress'] is None: + results['average_progress'] = 0.0 + return results diff --git a/tests/pipeline/test_enrollment_metrics_next.py b/tests/pipeline/test_enrollment_metrics_next.py new file mode 100644 index 00000000..b48fe44a --- /dev/null +++ b/tests/pipeline/test_enrollment_metrics_next.py @@ -0,0 +1,118 @@ +"""Test next iteration to collect enrollment metrics + +These tests exercies the module, `figures.pipeline.enrollment_metrics_next`. + +See the module docstring for details. +""" +import pytest +from mock import patch + +from django.contrib.sites.models import Site + +from figures.compat import CourseEnrollment +from figures.pipeline.enrollment_metrics_next import ( + update_enrollment_data_for_course, + calculate_course_progress, +) +from figures.sites import UnlinkedCourseError + +from tests.factories import ( + CourseEnrollmentFactory, + CourseOverviewFactory, + EnrollmentDataFactory, +) + + +@pytest.mark.django_db +class TestUpdateMetrics(object): + """Tests `update_enrollment_data_for_course` + + Since `figures.models.EnrollmentDataManager.update_metrics` is tested in + `tests/models/test_enrollment_data_update_metrics.py`, we can mock this + method in our tests here. + """ + + @pytest.fixture(autouse=True) + def setup(self, db, settings): + self.course_overview = CourseOverviewFactory() + self.site = Site.objects.first() + + def test_course_has_no_enrollments(self, monkeypatch): + """We have a new course with no enrollments + """ + monkeypatch.setattr('figures.course.get_site_for_course', lambda val: self.site) + result = update_enrollment_data_for_course(self.course_overview.id) + assert result == [] + + def test_course_has_enrollments_but_no_active_for_yesterday(self, monkeypatch): + """We have enrollments, but none were active yesterday + """ + monkeypatch.setattr('figures.course.get_site_for_course', lambda val: self.site) + [CourseEnrollmentFactory(course_id=self.course_overview.id) + for _ in range(2)] + result = update_enrollment_data_for_course(self.course_overview.id) + assert result == [] + + def test_course_has_active_enrollments_for_yesterday(self): + """We have enrollments who were active yesterday + """ + expected_ce = [CourseEnrollmentFactory(course_id=self.course_overview.id) + for _ in range(2)] + ce = CourseEnrollment.objects.filter(course_id=self.course_overview.id) + + def mock_update_metrics(site, ce): + return ce + + with patch('figures.pipeline.enrollment_metrics_next.Course') as course_class: + with patch('figures.pipeline.enrollment_metrics_next.EnrollmentData.objects') as edm: + course_class.return_value.enrollments_active_on_date.return_value = ce + course_class.return_value.site = self.site + edm.update_metrics = mock_update_metrics + result = update_enrollment_data_for_course(self.course_overview.id) + assert set(result) == set(expected_ce) + + def test_course_is_unlinked(self, monkeypatch): + """Function should raise `UnlinkedCourseError` if there's not a site match + + For Tahoe's multisite implementation, this can happen if the course's + organization is not linked to a site + For standalone sites, this should never happen + + To learn more, see the `figures.sites.get_site_for_course` function. + """ + monkeypatch.setattr('figures.course.get_site_for_course', lambda val: None) + with pytest.raises(UnlinkedCourseError) as excinfo: + update_enrollment_data_for_course(self.course_overview.id) + # with patch('figures.pipeline.enrollment_metrics_next.Course') as course_class: + # course_class.return_value.site = None + # with pytest.raises(UnlinkedCourseError) as excinfo: + # update_enrollment_data_for_course(self.course_overview.id) + expected_msg = 'No site found for course "{}"'.format(str(self.course_overview.id)) + assert str(excinfo.value) == expected_msg + + +@pytest.mark.django_db +class TestCalculateProgress(object): + """Tests `calculate_course_progress` + """ + @pytest.fixture(autouse=True) + def setup(self, db, settings): + self.course_overview = CourseOverviewFactory() + + def test_calc_course_progress_empty(self): + """The course doesn't have any EnrollmentData records + """ + results = calculate_course_progress(self.course_overview.id) + assert results['average_progress'] == pytest.approx(0.0) + + def test_calc_course_progress(self): + """The course has EnrollmentData records + """ + some_percentages = [0.0, 25.0, 50.0] + expected_average = sum(some_percentages)/len(some_percentages) + [ + EnrollmentDataFactory(course_id=str(self.course_overview.id), progress_percent=pp) + for pp in some_percentages + ] + results = calculate_course_progress(self.course_overview.id) + assert results['average_progress'] == pytest.approx(expected_average) From 76aea1280500fd2da68dc4ba1a578b4b89611c2c Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 25 Feb 2022 17:56:50 -0500 Subject: [PATCH 08/17] Update CDM pipeline to swich on progress workflows * Default workflow is the old progress calculator, enrollment data collector (in `figures.pipeline.enrollment_metrics`) * New workflow calls `figures.pipeline.enrollment_metrics_next'` * Updates tests to handle both conditions and default when `ed_next` is not set --- figures/pipeline/course_daily_metrics.py | 59 ++++++++++++++++----- tests/pipeline/test_course_daily_metrics.py | 46 ++++++++++++---- 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/figures/pipeline/course_daily_metrics.py b/figures/pipeline/course_daily_metrics.py index 08ae36d0..bd2e5a8b 100644 --- a/figures/pipeline/course_daily_metrics.py +++ b/figures/pipeline/course_daily_metrics.py @@ -30,6 +30,10 @@ from figures.pipeline.logger import log_error import figures.pipeline.loaders from figures.pipeline.enrollment_metrics import bulk_calculate_course_progress_data +from figures.pipeline.enrollment_metrics_next import ( + calculate_course_progress as calculate_course_progress_next +) + from figures.serializers import CourseIndexSerializer import figures.sites from figures.pipeline.helpers import pipeline_date_for_rule @@ -235,16 +239,32 @@ class CourseDailyMetricsExtractor(object): BUT, we will then need to find a transform """ - def extract(self, course_id, date_for, **_kwargs): - """ - defaults = dict( + def extract(self, course_id, date_for, ed_next=False, **_kwargs): + """Extracts (collects) aggregated course level data + + Args: + course_id (:obj:`str` or :obj:`CourseKey`): The course for which we collect data + date_for (str or date): Deprecated. Was to backfill data. + Specialized TBD backfill data will be called instead. + ed_next (bool, optional): "Enrollment Data Next" flag. If set to `True` + then we collect metrics with our updated workflow. See here: + https://github.com/appsembler/figures/issues/428 + + Returns: + dict with aggregate course level metrics + + ``` + dict( enrollment_count=data['enrollment_count'], active_learners_today=data['active_learners_today'], average_progress=data.get('average_progress', None), average_days_to_complete=data.get('average_days_to_complete, None'), num_learners_completed=data['num_learners_completed'], ) - TODO: refactor this class + ``` + + TODO: refactor this class. It doesn't need to be a class. Can be a + standalone function Add lazy loading method to load course enrollments - Create a method for each metric field """ @@ -290,17 +310,31 @@ def extract(self, course_id, date_for, **_kwargs): logger.debug(msg.format(date_for=date_for, course_id=course_id)) else: try: - progress_data = bulk_calculate_course_progress_data(course_id=course_id, - date_for=date_for) + # This conditional check is an interim solution until we make + # the progress function configurable and able to run Figures + # plugins + if ed_next: + progress_data = calculate_course_progress_next(course_id=course_id) + else: + progress_data = bulk_calculate_course_progress_data(course_id=course_id, + date_for=date_for) data['average_progress'] = progress_data['average_progress'] except Exception: # pylint: disable=broad-except # Broad exception for starters. Refine as we see what gets caught # Make sure we set the average_progres to None so that upstream # does not think things are normal data['average_progress'] = None - msg = ('FIGURES:FAIL bulk_calculate_course_progress_data' + + if ed_next: + prog_func = 'calculate_course_progress_next' + else: + prog_func = 'bulk_calculate_course_progress_data' + + msg = ('FIGURES:FAIL {prog_func}' ' date_for={date_for}, course_id="{course_id}"') - logger.exception(msg.format(date_for=date_for, course_id=course_id)) + logger.exception(msg.format(prog_func=prog_func, + date_for=date_for, + course_id=course_id)) data['average_days_to_complete'] = get_average_days_to_complete( course_id, date_for,) @@ -319,10 +353,11 @@ def __init__(self, course_id): self.extractor = CourseDailyMetricsExtractor() self.site = figures.sites.get_site_for_course(self.course_id) - def get_data(self, date_for): + def get_data(self, date_for, ed_next=False): return self.extractor.extract( course_id=self.course_id, - date_for=date_for) + date_for=date_for, + ed_next=ed_next) @transaction.atomic def save_metrics(self, date_for, data): @@ -350,7 +385,7 @@ def save_metrics(self, date_for, data): cdm.clean_fields() return (cdm, created,) - def load(self, date_for=None, force_update=False, **_kwargs): + def load(self, date_for=None, ed_next=False, force_update=False, **_kwargs): """ TODO: clean up how we do this. We want to be able to call the loader with an existing data set (not having to call the extractor) but we @@ -376,5 +411,5 @@ def load(self, date_for=None, force_update=False, **_kwargs): # record not found, move on to creating pass - data = self.get_data(date_for=date_for) + data = self.get_data(date_for=date_for, ed_next=ed_next) return self.save_metrics(date_for=date_for, data=data) diff --git a/tests/pipeline/test_course_daily_metrics.py b/tests/pipeline/test_course_daily_metrics.py index b51833ab..5e2058e0 100644 --- a/tests/pipeline/test_course_daily_metrics.py +++ b/tests/pipeline/test_course_daily_metrics.py @@ -239,34 +239,58 @@ def setup(self, db): self.student_module = StudentModuleFactory() self.date_for = datetime.datetime.utcnow().date() - def test_extract(self, monkeypatch): + def test_extract_default(self, monkeypatch): + """Default progress calculator is called when `ed_next` param not set + """ + expected_avg_prog = 'fake-average-progress' course_id = self.course_enrollments[0].course_id monkeypatch.setattr(figures.pipeline.course_daily_metrics, 'bulk_calculate_course_progress_data', - lambda **_kwargs: dict(average_progress=0.5)) + lambda **_kwargs: dict(average_progress=expected_avg_prog)) results = pipeline_cdm.CourseDailyMetricsExtractor().extract( course_id, self.date_for) + assert results['average_progress'] == expected_avg_prog + + @pytest.mark.parametrize('prog_func, ed_next', [ + ('bulk_calculate_course_progress_data', False), + ('calculate_course_progress_next', True) + ]) + def test_extract_ed_next(self, monkeypatch, prog_func, ed_next): + """Tests default and alternate progress calculators + """ + course_id = self.course_enrollments[0].course_id + prog_func_str = 'figures.pipeline.course_daily_metrics.{}'.format(prog_func) + with mock.patch(prog_func_str) as prog_mock: + results = pipeline_cdm.CourseDailyMetricsExtractor().extract( + course_id, self.date_for, ed_next=ed_next) + assert prog_mock.called assert results - def test_when_bulk_calculate_course_progress_data_fails(self, - monkeypatch, - caplog): + @pytest.mark.parametrize('prog_func, ed_next', [ + ('bulk_calculate_course_progress_data', False), + ('calculate_course_progress_next', True) + ]) + def test_when_calculate_course_progress_data_fails(self, + monkeypatch, + caplog, + prog_func, + ed_next): course_id = self.course_enrollments[0].course_id - def mock_bulk(**_kwargs): + def prog_func_mock(**_kwargs): raise Exception('fake exception') monkeypatch.setattr(figures.pipeline.course_daily_metrics, - 'bulk_calculate_course_progress_data', - mock_bulk) + prog_func, + prog_func_mock) results = pipeline_cdm.CourseDailyMetricsExtractor().extract( - course_id, self.date_for) + course_id, self.date_for, ed_next) last_log = caplog.records[-1] assert last_log.message.startswith( - 'FIGURES:FAIL bulk_calculate_course_progress_data') + 'FIGURES:FAIL {}'.format(prog_func)) assert not results['average_progress'] @@ -296,7 +320,7 @@ def test_load(self, monkeypatch): else: course_id = self.course_enrollments[0].course.id - def get_data(self, date_for): + def get_data(self, date_for, ed_next=False): return { 'average_progress': 1.0, 'num_learners_completed': 2, From 6e8b83d274ace27ad7094cc82b0fbb1c6a1193ee Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Feb 2022 02:16:55 +0000 Subject: [PATCH 09/17] Bump urijs from 1.19.6 to 1.19.8 in /frontend Bumps [urijs](https://github.com/medialize/URI.js) from 1.19.6 to 1.19.8. - [Release notes](https://github.com/medialize/URI.js/releases) - [Changelog](https://github.com/medialize/URI.js/blob/gh-pages/CHANGELOG.md) - [Commits](https://github.com/medialize/URI.js/compare/v1.19.6...v1.19.8) --- updated-dependencies: - dependency-name: urijs dependency-type: indirect ... Signed-off-by: dependabot[bot] --- frontend/package-lock.json | 6 +++--- frontend/yarn.lock | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index a0d28653..f89e6380 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -15760,9 +15760,9 @@ } }, "urijs": { - "version": "1.19.6", - "resolved": "https://registry.npmjs.org/urijs/-/urijs-1.19.6.tgz", - "integrity": "sha512-eSXsXZ2jLvGWeLYlQA3Gh36BcjF+0amo92+wHPyN1mdR8Nxf75fuEuYTd9c0a+m/vhCjRK0ESlE9YNLW+E1VEw==" + "version": "1.19.8", + "resolved": "https://registry.npmjs.org/urijs/-/urijs-1.19.8.tgz", + "integrity": "sha512-iIXHrjomQ0ZCuDRy44wRbyTZVnfVNLVo3Ksz1yxNyE5wV1IDZW2S5Jszy45DTlw/UdsnRT7DyDhIz7Gy+vJumw==" }, "urix": { "version": "0.1.0", diff --git a/frontend/yarn.lock b/frontend/yarn.lock index 2b37b1d2..46eedf0f 100644 --- a/frontend/yarn.lock +++ b/frontend/yarn.lock @@ -9162,9 +9162,9 @@ uri-js@^4.2.2: punycode "^2.1.0" urijs@^1.16.1: - version "1.19.6" - resolved "https://registry.yarnpkg.com/urijs/-/urijs-1.19.6.tgz#51f8cb17ca16faefb20b9a31ac60f84aa2b7c870" - integrity sha512-eSXsXZ2jLvGWeLYlQA3Gh36BcjF+0amo92+wHPyN1mdR8Nxf75fuEuYTd9c0a+m/vhCjRK0ESlE9YNLW+E1VEw== + version "1.19.8" + resolved "https://registry.yarnpkg.com/urijs/-/urijs-1.19.8.tgz#ee0407a18528934d3c383e691912f47043a58feb" + integrity sha512-iIXHrjomQ0ZCuDRy44wRbyTZVnfVNLVo3Ksz1yxNyE5wV1IDZW2S5Jszy45DTlw/UdsnRT7DyDhIz7Gy+vJumw== urix@^0.1.0: version "0.1.0" From 2f087967d5cb5d2246a8bcd6637a1614d8e1aebf Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Sat, 26 Feb 2022 01:19:20 -0500 Subject: [PATCH 10/17] Update figures.tasks for new pipeline workflow * Updated daily metrics tests to exercise new code * Added new helper function, `fake_course_key` so we don't have to create a CourseOverview to get a couse id when we just need a valid course key * Also lint fixed figures/management/commands/backfill_figures_daily_metrics.py --- .../backfill_figures_daily_metrics.py | 5 - .../backfill_figures_enrollment_data.py | 6 +- figures/tasks.py | 95 +++++++++++++++---- tests/helpers.py | 5 + tests/tasks/test_daily_tasks.py | 79 ++++++++++----- 5 files changed, 141 insertions(+), 49 deletions(-) diff --git a/figures/management/commands/backfill_figures_daily_metrics.py b/figures/management/commands/backfill_figures_daily_metrics.py index b30f9d91..450453a2 100644 --- a/figures/management/commands/backfill_figures_daily_metrics.py +++ b/figures/management/commands/backfill_figures_daily_metrics.py @@ -75,11 +75,6 @@ def handle(self, *args, **options): metrics_func(**kwargs) else: metrics_func.delay(**kwargs) # pragma: no cover - # except Exception as e: # pylint: disable=bare-except - # if options['ignore_exceptions']: - # self.print_exc("daily", dt, e.message) - # else: - # raise print('END: Backfill Figures daily metrics metrics for: {}'.format(dt)) diff --git a/figures/management/commands/backfill_figures_enrollment_data.py b/figures/management/commands/backfill_figures_enrollment_data.py index e22ad166..7363219e 100644 --- a/figures/management/commands/backfill_figures_enrollment_data.py +++ b/figures/management/commands/backfill_figures_enrollment_data.py @@ -9,7 +9,7 @@ from textwrap import dedent from figures.management.base import BaseBackfillCommand -from figures.tasks import update_enrollment_data +from figures.tasks import update_enrollment_data_for_site class Command(BaseBackfillCommand): @@ -23,8 +23,8 @@ def handle(self, *args, **options): for site_id in self.get_site_ids(options['site']): print('Updating EnrollmentData for site {}'.format(site_id)) if options['no_delay']: - update_enrollment_data(site_id=site_id) + update_enrollment_data_for_site(site_id=site_id) else: - update_enrollment_data.delay(site_id=site_id) # pragma: no cover + update_enrollment_data_for_site.delay(site_id=site_id) # pragma: no cover print('DONE: Backfill Figures EnrollmentData') diff --git a/figures/tasks.py b/figures/tasks.py index 5cb537e8..4cb59b4d 100644 --- a/figures/tasks.py +++ b/figures/tasks.py @@ -28,6 +28,7 @@ from figures.pipeline.mau_pipeline import collect_course_mau from figures.pipeline.helpers import DateForCannotBeFutureError from figures.pipeline.site_monthly_metrics import fill_last_month as fill_last_smm_month +from figures.pipeline.enrollment_metrics_next import update_enrollment_data_for_course logger = get_task_logger(__name__) @@ -47,7 +48,7 @@ @shared_task -def populate_single_cdm(course_id, date_for=None, force_update=False): +def populate_single_cdm(course_id, date_for=None, ed_next=False, force_update=False): """Populates a CourseDailyMetrics record for the given date and course The calling function is responsible for error handling calls to this @@ -66,7 +67,7 @@ def populate_single_cdm(course_id, date_for=None, force_update=False): start_time = time.time() cdm_obj, _created = CourseDailyMetricsLoader( - course_id).load(date_for=date_for, force_update=force_update) + course_id).load(date_for=date_for, ed_next=ed_next, force_update=force_update) elapsed_time = time.time() - start_time logger.debug('done. Elapsed time (seconds)={}. cdm_obj={}'.format( elapsed_time, cdm_obj)) @@ -90,7 +91,7 @@ def populate_single_sdm(site_id, date_for, force_update=False): @shared_task -def populate_daily_metrics_for_site(site_id, date_for, force_update=False): +def populate_daily_metrics_for_site(site_id, date_for, ed_next=False, force_update=False): """Collect metrics for the given site and date """ try: @@ -103,8 +104,12 @@ def populate_daily_metrics_for_site(site_id, date_for, force_update=False): for course_id in site_course_ids(site): try: + if ed_next: + update_enrollment_data_for_course(course_id) + populate_single_cdm(course_id=course_id, date_for=date_for, + ed_next=ed_next, force_update=force_update) except Exception as e: # pylint: disable=broad-except msg = ('{prefix}:SITE:COURSE:FAIL:populate_daily_metrics_for_site.' @@ -121,14 +126,17 @@ def populate_daily_metrics_for_site(site_id, date_for, force_update=False): @shared_task -def update_enrollment_data(site_id, **_kwargs): - """ - This can be an expensive task as it iterates over all the enrollments in a - site +def update_enrollment_data_for_site(site_id, **_kwargs): + """Original task to collect `EnrollmentData` records + + This tasks collects `EnrollmentData` records at the site level with the + context of 'backfill'. This means it goes through all enrollments on the + site and checks if the `EnrollmentData` record needs to be updated + This can be an expensive task as it iterates over all the enrollments in a + site. We can reduce the records for which we need to iterate if we filter on CourseEnrollment.objects.filter(is_actie=True) - However, we have to ensure that we don't exclude learners who have just completed a course and are awaiting post course activities, like being awarded a certificate @@ -138,20 +146,17 @@ def update_enrollment_data(site_id, **_kwargs): results = backfill_enrollment_data_for_site(site) if results.get('errors'): for rec in results['errors']: - logger.error('figures.tasks.update_enrollment_data. Error:{}'.format(rec)) + logger.error('figures.tasks.update_enrollment_data_for_site. Error:{}'.format(rec)) except Site.DoesNotExist: logger.error( - 'figurs.tasks.update_enrollment_data: site_id={} does not exist'.format( + 'figurs.tasks.update_enrollment_data_for_site: site_id={} does not exist'.format( site_id)) except Exception: # pylint: disable=broad-except - msg = ('FIGURES:DAILYLFAIL daily metrics:update_enrollment_data' + msg = ('FIGURES:DAILYLFAIL daily metrics:update_enrollment_data_for_site' ' for site_id={}'.format(site_id)) logger.exception(msg) -# TODO: Sites iterator with entry and exit logging - - @shared_task def populate_daily_metrics(site_id=None, date_for=None, force_update=False): """Runs Figures daily metrics collection @@ -237,9 +242,9 @@ def populate_daily_metrics(site_id=None, date_for=None, force_update=False): # Until we implement signal triggers if do_update_enrollment_data: try: - update_enrollment_data(site_id=site.id) + update_enrollment_data_for_site(site_id=site.id) except Exception: # pylint: disable=broad-except - msg = ('{prefix}:FAIL figures.tasks update_enrollment_data ' + msg = ('{prefix}:FAIL figures.tasks update_enrollment_data_for_site ' ' unhandled exception. site[{site_id}]:{domain}') logger.exception(msg.format(prefix=FPD_LOG_PREFIX, site_id=site.id, @@ -258,6 +263,64 @@ def populate_daily_metrics(site_id=None, date_for=None, force_update=False): site_count=sites_count)) +@shared_task +def populate_daily_metrics_next(site_id=None, force_update=False): + """Next iteration to collect daily metrics for all sites in a deployment + + This is a top level Celery task run every 24 hours to update Figures data. + + * It updates Figures per-enrollment data and collect daily aggregate metrics + * It's purpose is to collect new metrics on an ongoing basis and not serve + dual purpose of collecting ongoing data AND backfilling data. + * The driver for this change is to improve performance of the daily Celery jobs + + What's different? + + * Figures collects the enrollment data first, then aggregates daily data. + + TODO: Draft up public architecture docs and reference them here + """ + if waffle.switch_is_active(WAFFLE_DISABLE_PIPELINE): + logger.warning('Figures pipeline is disabled due to %s being active.', + WAFFLE_DISABLE_PIPELINE) + return + + date_for = datetime.datetime.utcnow().date() + if site_id is not None: + sites = get_sites_by_id((site_id, )) + else: + sites = get_sites() + sites_count = sites.count() + # This is our task entry log message + msg = '{prefix}:START:date_for={date_for}, site_count={site_count}' + logger.info(msg.format(prefix=FPD_LOG_PREFIX, + date_for=date_for, + site_count=sites_count)) + for i, site in enumerate(sites): + msg = '{prefix}:SITE:START:{id}:{domain} - Site {i:04d} of {n:04d}' + logger.info(msg.format(prefix=FPD_LOG_PREFIX, + id=site.id, + domain=site.domain, + i=i, + n=sites_count)) + try: + populate_daily_metrics_for_site(site_id=site.id, + date_for=date_for, + ed_next=True, + force_update=force_update) + except Exception: # pylint: disable=broad-except + msg = ('{prefix}:FAIL populate_daily_metrics unhandled site level' + ' exception for site[{site_id}]={domain}') + logger.exception(msg.format(prefix=FPD_LOG_PREFIX, + site_id=site.id, + domain=site.domain)) + + msg = '{prefix}:END:date_for={date_for}, site_count={site_count}' + logger.info(msg.format(prefix=FPD_LOG_PREFIX, + date_for=date_for, + site_count=sites_count)) + + # # Daily Metrics Experimental Tasks # diff --git a/tests/helpers.py b/tests/helpers.py index 31d0c2ee..1a028f03 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -9,6 +9,7 @@ from django.utils.timezone import utc +from opaque_keys.edx.keys import CourseKey from organizations.models import Organization @@ -37,6 +38,10 @@ def make_course_key_str(org, number, run='test-run'): return 'course-v1:{}+{}+{}'.format(org, number, run) +def fake_course_key(num): + return CourseKey.from_string('course-v1:TestOrg+TO+{}'.format(num)) + + def create_metrics_model_timeseries(factory, first_day, last_day): """ Convenience method to create a set of time series factory objects diff --git a/tests/tasks/test_daily_tasks.py b/tests/tasks/test_daily_tasks.py index 904b30d1..41b29347 100644 --- a/tests/tasks/test_daily_tasks.py +++ b/tests/tasks/test_daily_tasks.py @@ -58,7 +58,6 @@ """ from __future__ import absolute_import from datetime import date -import logging import pytest from six.moves import range from django.contrib.sites.models import Site @@ -72,15 +71,17 @@ populate_single_cdm, populate_single_sdm, populate_daily_metrics_for_site, - populate_daily_metrics) + populate_daily_metrics, + populate_daily_metrics_next) from tests.factories import (CourseDailyMetricsFactory, CourseOverviewFactory, SiteDailyMetricsFactory, SiteFactory) -from tests.helpers import OPENEDX_RELEASE, GINKGO, FakeException +from tests.helpers import OPENEDX_RELEASE, GINKGO, FakeException, fake_course_key -def test_populate_single_cdm(transactional_db, monkeypatch): +@pytest.mark.parametrize('extra_params', [{}, {'ed_next': True}]) +def test_populate_single_cdm(transactional_db, monkeypatch, extra_params): """Test figures.tasks.populate_single_cdm nominal case This tests the normal execution to popluate a single CourseDailyMetrics @@ -100,31 +101,33 @@ def mock_cdm_load(self, date_for, **kwargs): 'figures.pipeline.course_daily_metrics.CourseDailyMetricsLoader.load', mock_cdm_load) - populate_single_cdm(course_id, date_for) + populate_single_cdm(course_id, date_for, **extra_params) assert CourseDailyMetrics.objects.count() == 1 assert as_date(CourseDailyMetrics.objects.first().date_for) == as_date(date_for) @pytest.mark.django_db -def test_disable_populate_daily_metrics(caplog): +@pytest.mark.parametrize('func', [populate_daily_metrics, populate_daily_metrics_next]) +def test_disable_populate_daily_metrics(caplog, func): """Test figures.tasks.populate_daily_metrics Tests that when WAFFLE_DISABLE_PIPELINE is active, the disabled warning msg is logged """ with override_switch('figures.disable_pipeline', active=True): - populate_daily_metrics() + func() assert 'disabled' in caplog.text @pytest.mark.django_db -def test_enable_populate_daily_metrics(caplog): +@pytest.mark.parametrize('func', [populate_daily_metrics, populate_daily_metrics_next]) +def test_enable_populate_daily_metrics(caplog, func): """Test figures.tasks.populate_daily_metrics Tests that when WAFFLE_DISABLE_PIPELINE is not active, the disabled warning msg is not logged """ with override_switch('figures.disable_pipeline', active=False): - populate_daily_metrics() + func() assert 'disabled' not in caplog.text @@ -156,11 +159,13 @@ def mock_sdm_load(self, site, date_for, **kwargs): as_date('2020-12-12'), as_datetime('2020-12-12') ]) +@pytest.mark.parametrize('extra_params', [{}, {'ed_next': True}]) def test_populate_daily_metrics_for_site_basic(transactional_db, monkeypatch, - date_for): + date_for, + extra_params): site = SiteFactory() - course_ids = ['fake-course-1', 'fake-course-2'] + course_ids = [fake_course_key(i) for i in range(2)] collected_course_ids = [] def fake_populate_single_cdm(course_id, **_kwargs): @@ -169,37 +174,49 @@ def fake_populate_single_cdm(course_id, **_kwargs): def fake_populate_single_sdm(site_id, **_kwargs): assert site_id == site.id + def fake_update_enrollment_data_for_course(course_id): + assert extra_params['ed_next'] + monkeypatch.setattr('figures.tasks.site_course_ids', lambda site: course_ids) monkeypatch.setattr('figures.tasks.populate_single_cdm', fake_populate_single_cdm) monkeypatch.setattr('figures.tasks.populate_single_sdm', fake_populate_single_sdm) + monkeypatch.setattr('figures.tasks.update_enrollment_data_for_course', + fake_update_enrollment_data_for_course) - populate_daily_metrics_for_site(site_id=site.id, date_for=date_for) + populate_daily_metrics_for_site(site_id=site.id, date_for=date_for, **extra_params) assert set(collected_course_ids) == set(course_ids) @pytest.mark.skipif(OPENEDX_RELEASE == GINKGO, reason='Apparent Django 1.8 incompatibility') +@pytest.mark.parametrize('extra_params', [{}, {'ed_next': True}]) def test_populate_daily_metrics_for_site_error_on_cdm(transactional_db, monkeypatch, - caplog): + caplog, + extra_params): date_for = date.today() site = SiteFactory() - fake_course_ids = ['fake-course-id-1'] + fake_course_ids = [fake_course_key(1)] def fake_pop_single_cdm_fails(**kwargs): # TODO: test with different exceptions # At least one with and without `message_dict` raise FakeException('Hey!') + def fake_update_enrollment_data_for_course(course_id): + assert extra_params['ed_next'] + monkeypatch.setattr('figures.tasks.site_course_ids', lambda site: fake_course_ids) monkeypatch.setattr('figures.tasks.populate_single_cdm', fake_pop_single_cdm_fails) + monkeypatch.setattr('figures.tasks.update_enrollment_data_for_course', + fake_update_enrollment_data_for_course) - populate_daily_metrics_for_site(site_id=site.id, date_for=date_for) + populate_daily_metrics_for_site(site_id=site.id, date_for=date_for, **extra_params) last_log = caplog.records[-1] expected_msg = ('{prefix}:SITE:COURSE:FAIL:populate_daily_metrics_for_site. ' @@ -215,9 +232,11 @@ def fake_pop_single_cdm_fails(**kwargs): @pytest.mark.skipif(OPENEDX_RELEASE == GINKGO, reason='Apparent Django 1.8 incompatibility') +@pytest.mark.parametrize('extra_params', [{}, {'ed_next': True}]) def test_populate_daily_metrics_for_site_site_dne(transactional_db, monkeypatch, - caplog): + caplog, + extra_params): """ If there is an invalid site id, logs error and raises it """ @@ -226,7 +245,9 @@ def test_populate_daily_metrics_for_site_site_dne(transactional_db, assert not Site.objects.filter(id=bad_site_id).exists() with pytest.raises(Exception) as e: - populate_daily_metrics_for_site(site_id=bad_site_id, date_for=date_for) + populate_daily_metrics_for_site(site_id=bad_site_id, + date_for=date_for, + **extra_params) assert str(e.value) == 'Site matching query does not exist.' last_log = caplog.records[-1] @@ -237,9 +258,13 @@ def test_populate_daily_metrics_for_site_site_dne(transactional_db, @pytest.mark.skipif(OPENEDX_RELEASE == GINKGO, reason='Apparent Django 1.8 incompatibility') +@pytest.mark.parametrize('func', [ + populate_daily_metrics, populate_daily_metrics_next +]) def test_populate_daily_metrics_site_level_error(transactional_db, monkeypatch, - caplog): + caplog, + func): """ Generic test that the first site fails but we can process the second site """ @@ -249,7 +274,6 @@ def test_populate_daily_metrics_site_level_error(transactional_db, bad_site = SiteFactory() populated_site_ids = [] failed_site_ids = [] - date_for = date.today() def fake_populate_daily_metrics_for_site(site_id, **_kwargs): """ @@ -263,7 +287,8 @@ def fake_populate_daily_metrics_for_site(site_id, **_kwargs): monkeypatch.setattr('figures.tasks.populate_daily_metrics_for_site', fake_populate_daily_metrics_for_site) - populate_daily_metrics(date_for=date_for) + func() + assert set(populated_site_ids) == set([good_site.id]) assert set(failed_site_ids) == set([bad_site.id]) @@ -297,13 +322,13 @@ def fake_update_enrollment_data_fails(**kwargs): monkeypatch.setattr('figures.tasks.populate_daily_metrics_for_site', fake_populate_daily_metrics_for_site) - monkeypatch.setattr('figures.tasks.update_enrollment_data', + monkeypatch.setattr('figures.tasks.update_enrollment_data_for_site', fake_update_enrollment_data_fails) populate_daily_metrics(date_for=date_for) last_log = caplog.records[-1] - expected_msg = ('{prefix}:FAIL figures.tasks update_enrollment_data ' + expected_msg = ('{prefix}:FAIL figures.tasks update_enrollment_data_for_site ' ' unhandled exception. site[{site_id}]:{domain}').format( prefix=FPD_LOG_PREFIX, site_id=site.id, @@ -313,9 +338,11 @@ def fake_update_enrollment_data_fails(**kwargs): @pytest.mark.skipif(OPENEDX_RELEASE == GINKGO, reason='Broken test. Apparent Django 1.8 incompatibility') -def test_populate_daily_metrics_multisite(transactional_db, monkeypatch): +@pytest.mark.parametrize('func', [ + populate_daily_metrics, populate_daily_metrics_next +]) +def test_populate_daily_metrics_multisite(transactional_db, monkeypatch, caplog, func): # Stand up test data - date_for = '2019-01-02' site_links = [] for domain in ['alpha.domain', 'bravo.domain']: site_links.append(dict( @@ -323,4 +350,6 @@ def test_populate_daily_metrics_multisite(transactional_db, monkeypatch): courses=[CourseOverviewFactory() for i in range(2)], )) - populate_daily_metrics(date_for=date_for) + func() + + assert not caplog.records From 82b4c2870033813feb7b4f259a94b9b04299cfef Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Mon, 28 Feb 2022 14:27:14 -0500 Subject: [PATCH 11/17] Address PR comment for #427 * Replaced `timeit` with `time` to capture EnrollmentData collection timing * Added basic test for `figures.helpers.utc_yesterday` --- figures/models.py | 6 +++--- tests/test_helpers.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/figures/models.py b/figures/models.py index 81fe1175..64192be6 100644 --- a/figures/models.py +++ b/figures/models.py @@ -5,7 +5,7 @@ from __future__ import absolute_import from datetime import date -import timeit +from time import time from django.conf import settings from django.contrib.sites.models import Site from django.core.validators import MaxValueValidator, MinValueValidator @@ -283,7 +283,7 @@ def update_metrics(self, site, course_enrollment, force_update=False): if not ed_recs or ed_recs[0].date_for < date_for or force_update: # We do the update - start_time = timeit.default_timer() + start_time = time() # get the progress data ep = EnrollmentProgress(user=course_enrollment.user, course_id=str(course_enrollment.course_id)) @@ -298,7 +298,7 @@ def update_metrics(self, site, course_enrollment, force_update=False): is_enrolled=course_enrollment.is_active, date_enrolled=course_enrollment.created, ) - elapsed = timeit.default_timer() - start_time + elapsed = time() - start_time defaults['collect_elapsed'] = elapsed ed_rec, created = self.update_or_create( diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 21c1f588..9846fff8 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -22,6 +22,7 @@ previous_months_iterator, first_last_days_for_month, import_from_path, + utc_yesterday, ) from tests.factories import COURSE_ID_STR_TEMPLATE @@ -206,6 +207,7 @@ def test_previous_months_iterator(self, month_for, months_back): vals = list(previous_months_iterator(month_for, months_back)) assert vals == expected_vals + def test_first_last_days_for_month(): month_for = '2/2020' month = 2 @@ -244,3 +246,15 @@ def test_import_from_bad_syntax(): with pytest.raises(ValueError): # Malformed path import_from_path(utc_tz_path) + + +def test_utc_yesterday(): + """Basic sanity check and test coverage using live time + + If we find a need or wante to get elaborate, we test a specific time for + any time by with monkeypatch/mock of `utcnow()` call in `figures.helpers` + If it is for a need because the expected and actual do not match up, then + document why. + """ + expected = datetime.datetime.utcnow().date() - datetime.timedelta(days=1) + assert utc_yesterday() == expected From ffac06e1ed954f30f5b5f19d094edebc9402fba4 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 1 Mar 2022 15:11:43 -0500 Subject: [PATCH 12/17] Moved 'figures.backfill' to 'figures.pipeline.backfill' Moving this because it is a pipeline module. Should have beeing in 'figures.pipeline' when first created --- figures/{ => pipeline}/backfill.py | 1 + tests/{ => pipeline}/test_backfill.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) rename figures/{ => pipeline}/backfill.py (97%) rename tests/{ => pipeline}/test_backfill.py (98%) diff --git a/figures/backfill.py b/figures/pipeline/backfill.py similarity index 97% rename from figures/backfill.py rename to figures/pipeline/backfill.py index 1848fc2a..8fc6cb3f 100644 --- a/figures/backfill.py +++ b/figures/pipeline/backfill.py @@ -19,6 +19,7 @@ from figures.models import EnrollmentData +# This function called just by mgmt command, backfill_figures_monthly_metrics.py def backfill_monthly_metrics_for_site(site, overwrite=False, use_raw_sql=False): """Backfill specified months' historical site metrics for the specified site """ diff --git a/tests/test_backfill.py b/tests/pipeline/test_backfill.py similarity index 98% rename from tests/test_backfill.py rename to tests/pipeline/test_backfill.py index f486abeb..506357ce 100644 --- a/tests/test_backfill.py +++ b/tests/pipeline/test_backfill.py @@ -11,7 +11,7 @@ from django.db import connection from django.utils.timezone import utc -from figures.backfill import backfill_monthly_metrics_for_site +from figures.pipeline.backfill import backfill_monthly_metrics_for_site from figures.models import SiteMonthlyMetrics from tests.factories import ( CourseOverviewFactory, From 17fda93290fa8bcf280693430f723fc735480096 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 1 Mar 2022 15:28:06 -0500 Subject: [PATCH 13/17] Add Celery task, 'backfill_enrollment_data_for_course' This commit simply adds a celery task wrapper to a pipeline function * Provides a Celery task to call the new pipeline function, `update_enrollment_data_for_course(course_id)` * Added unit test for this task --- figures/tasks.py | 18 ++++++++++++++++-- tests/tasks/test_backfill_tasks.py | 20 ++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 tests/tasks/test_backfill_tasks.py diff --git a/figures/tasks.py b/figures/tasks.py index 4cb59b4d..b731d29c 100644 --- a/figures/tasks.py +++ b/figures/tasks.py @@ -18,13 +18,14 @@ from celery.app import shared_task from celery.utils.log import get_task_logger -from figures.backfill import backfill_enrollment_data_for_site from figures.compat import CourseEnrollment, CourseOverview from figures.helpers import as_course_key, as_date, is_past_date from figures.log import log_exec_time +from figures.sites import get_sites, get_sites_by_id, site_course_ids + +from figures.pipeline.backfill import backfill_enrollment_data_for_site from figures.pipeline.course_daily_metrics import CourseDailyMetricsLoader from figures.pipeline.site_daily_metrics import SiteDailyMetricsLoader -from figures.sites import get_sites, get_sites_by_id, site_course_ids from figures.pipeline.mau_pipeline import collect_course_mau from figures.pipeline.helpers import DateForCannotBeFutureError from figures.pipeline.site_monthly_metrics import fill_last_month as fill_last_smm_month @@ -321,6 +322,19 @@ def populate_daily_metrics_next(site_id=None, force_update=False): site_count=sites_count)) +@shared_task +def backfill_enrollment_data_for_course(course_id): + """Create or update EnrollmentData records for the course + + Simple wrapper to run the enrollment update as a Celery task + + We usually run this task through the Figures Django management command, + `backfill_figures_enrollment_data` + """ + ed_objects = update_enrollment_data_for_course(course_id) + return [obj.id for obj in ed_objects] + + # # Daily Metrics Experimental Tasks # diff --git a/tests/tasks/test_backfill_tasks.py b/tests/tasks/test_backfill_tasks.py new file mode 100644 index 00000000..7c5420fe --- /dev/null +++ b/tests/tasks/test_backfill_tasks.py @@ -0,0 +1,20 @@ +"""Test Figures backfill Celery tasks +""" +from __future__ import absolute_import + +from figures.tasks import backfill_enrollment_data_for_course + +from tests.factories import EnrollmentDataFactory + + +def test_backfill_enrollment_data_for_course(transactional_db, monkeypatch): + """ + The Celery task is a simple wrapper around the pipeline function + """ + course_id = 'course-v1:SomeOrg+SomeNum+SomeRun' + ed_recs = [EnrollmentDataFactory() for _ in range(2)] + + func_path = 'figures.tasks.update_enrollment_data_for_course' + monkeypatch.setattr(func_path, lambda course_id: ed_recs) + ed_ids = backfill_enrollment_data_for_course(course_id) + assert set(ed_ids) == set([obj.id for obj in ed_recs]) From 6c4e043f6653ffd910a6a57a406122ab915d1bf7 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 1 Mar 2022 15:39:16 -0500 Subject: [PATCH 14/17] Address PR #429 comment * Removed dead code --- figures/management/commands/backfill_figures_daily_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/figures/management/commands/backfill_figures_daily_metrics.py b/figures/management/commands/backfill_figures_daily_metrics.py index 450453a2..cdfd9766 100644 --- a/figures/management/commands/backfill_figures_daily_metrics.py +++ b/figures/management/commands/backfill_figures_daily_metrics.py @@ -70,7 +70,7 @@ def handle(self, *args, **options): del kwargs['site_id'] # not implemented for experimental else: metrics_func = populate_daily_metrics - # try: + if options['no_delay']: metrics_func(**kwargs) else: From 66dacc2d7aa9caceffc63bc316d50e518f7d12db Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 1 Mar 2022 16:53:02 -0500 Subject: [PATCH 15/17] Fix management command from moved 'backfill' module --- .../management/commands/backfill_figures_monthly_metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/figures/management/commands/backfill_figures_monthly_metrics.py b/figures/management/commands/backfill_figures_monthly_metrics.py index 2e5bef03..fa033b66 100644 --- a/figures/management/commands/backfill_figures_monthly_metrics.py +++ b/figures/management/commands/backfill_figures_monthly_metrics.py @@ -9,9 +9,11 @@ from django.contrib.sites.models import Site -from figures.backfill import backfill_monthly_metrics_for_site +from figures.pipeline.backfill import backfill_monthly_metrics_for_site from figures.management.base import BaseBackfillCommand +assert 0, "This command needs to be reworked both for performance and correctness" + def backfill_site(site, overwrite, use_raw_sql): From 951692addb29e94475890262cf086b9bf800f6eb Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 1 Mar 2022 18:45:08 -0500 Subject: [PATCH 16/17] Revert devsite 'courseware' app namepace back to originial Please read this issue: https://github.com/appsembler/figures/issues/433 --- devsite/devsite/settings.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/devsite/devsite/settings.py b/devsite/devsite/settings.py index e60b02ee..b10167fd 100644 --- a/devsite/devsite/settings.py +++ b/devsite/devsite/settings.py @@ -90,11 +90,15 @@ if ENABLE_DEVSITE_CELERY: INSTALLED_APPS.append('djcelery') -# certificates app + if OPENEDX_RELEASE == 'GINKGO': + # certificates and courseware do NOT use the `lms.djangoapps.` namespace + # prefix on Ginkgo. See here: https://github.com/appsembler/figures/issues/433 INSTALLED_APPS.append('certificates') - INSTALLED_APPS.append('lms.djangoapps.courseware') + INSTALLED_APPS.append('courseware') elif OPENEDX_RELEASE == 'HAWTHORN': + # Yes, this is correct, certificates was updated to uses the full namespace + # and courseware has not yet been updated INSTALLED_APPS.append('lms.djangoapps.certificates') INSTALLED_APPS.append('courseware') else: From 61d82556827e2abda937d69d9d04163c5dcac197 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 3 Mar 2022 00:50:05 -0500 Subject: [PATCH 17/17] Updates the enrollment data backfill management command * Significant rework uses the new 'workflow next' * The command module docstring and test code should have adequate documentation. Please read those --- .../backfill_figures_enrollment_data.py | 143 ++++++++++++++++-- tests/commands/__init__.py | 0 ...ackfill_figures_enrollment_data_command.py | 106 +++++++++++++ 3 files changed, 239 insertions(+), 10 deletions(-) create mode 100644 tests/commands/__init__.py create mode 100644 tests/commands/test_backfill_figures_enrollment_data_command.py diff --git a/figures/management/commands/backfill_figures_enrollment_data.py b/figures/management/commands/backfill_figures_enrollment_data.py index 7363219e..15f4438e 100644 --- a/figures/management/commands/backfill_figures_enrollment_data.py +++ b/figures/management/commands/backfill_figures_enrollment_data.py @@ -1,30 +1,153 @@ """This Django management command updates Figures EnrollmentData records -Running this will trigger figures.tasks.update_enrollment_data for every site +Running this will trigger figures.tasks.backfill_enrollment_data_for_course + +Specific sites or courses need to be identified. Running without either of these +parameters will raise a CommandError stating that one of these parameters are +needed. + +This Django management command does not default to running update on every +enrollment because that could trigger a heavy and long load for larger deployments +that require many EnrollmentData records to be updated. + +To specify sites, use the `--sites` parameter, followed by a space delimited list +of domain names. Example: + +``` +backfill_figures_enrollment_data --sites heres-a-site.com another-site.com +``` + +To specify courses, use the `--courses` paramter, followed by a space delimited +list of course id string. Example: + + +### Important + +* The above do not include the full Django management command +* These are run for the LMS instance + +Either `--sites` or `--courses` parameter needs to be + unless the '--site' option is used. Then it will update just that site + +For potential improvements to this management command, see here: + +* https://github.com/appsembler/figures/issues/435 """ from __future__ import print_function from __future__ import absolute_import from textwrap import dedent +from django.contrib.sites.models import Site +from django.core.management.base import BaseCommand, CommandError -from figures.management.base import BaseBackfillCommand -from figures.tasks import update_enrollment_data_for_site +from figures.compat import CourseOverview +from figures.helpers import as_course_key +from figures.sites import site_course_ids +from figures.tasks import backfill_enrollment_data_for_course -class Command(BaseBackfillCommand): +class Command(BaseCommand): """Backfill Figures EnrollmentData model. + + This Django managmenet command provides a basic CLI to create and update + Figures `EnrollmentData` records. + + It filters on either sites or course ids. Because this is potentially a + resource intensive command, it requires identifying specific sites or + course ids. Yes, there's improvement for this. + + * If both sites and courses parameters are used, the sites are ignored + * If there are any invalid site names or primary keys, the command will fail + with the Django model's `DoesNotExist` exception + * If there are any invalid course ids the command will fail with + `DoesNotExist` if the course id has a valid structure but the course is + not found and with `InvalidKeyErro` if the course id does not have a valid + structure (CourseKey.from_string(course_id) fails) + + For potential improvements, read (and contribute if you want) here: + + * https://github.com/appsembler/figures/issues/435 """ help = dedent(__doc__).strip() + def get_sites(self, options): + """Returns a list of Site objects + + Raises `DoesNotExist` if a site in the list is missing + If no sites are specified, returns an empty list + """ + sites = [] + for site_identifier in options.get('sites', []): + sites.append(Site.objects.get(domain=site_identifier)) + return set(sites) + + def get_course_ids(self, options): + """Return a validated list of course id strings. + + If no courses are specified, returns an empty list + + Raises `DoesNotExist` or `InvalidKeyError` exceptions if a course id + is either malformed or is not malformed, but does not exist + """ + course_ids = [] + for cid_str in options.get('courses', []): + course_overview = CourseOverview.objects.get(id=as_course_key(cid_str)) + # we got a course overview object. Otherwise, we'd fail here with + # `InvalidKeyError` or `DoesNotExist` error + # While we could just use `cid_str`, this has us use the object + # returned. We do NOT want to check if the key exists because we + # want to let the user know there was a problem with a course id + course_ids.append(str(course_overview.id)) + return course_ids + + def update_enrollments(self, course_ids, use_celery=True): + for course_id in course_ids: + if use_celery: + # Call the Celery task with delay + backfill_enrollment_data_for_course.delay(str(course_id)) + else: + # Call the Celery task immediately + backfill_enrollment_data_for_course(str(course_id)) + + def add_arguments(self, parser): + """ + + If site domain or id or set of ides are provided, but no course or + courses provided, runs for all courses in the site + + If a course id or list of course ids provided, processes those courses + If a site + """ + parser.add_argument('--sites', + nargs='+', + default=[], + help='backfill a specific site. provide id or domain name') + parser.add_argument('--courses', + nargs='+', + default=[], + help='backfill a specific course. provide course id string') + parser.add_argument('--use-celery', + action='store_true', + default=True, + help='Run with Celery worker. Set to false to run in app space ') + def handle(self, *args, **options): print('BEGIN: Backfill Figures EnrollmentData') + sites = self.get_sites(options) + course_ids = self.get_course_ids(options) + use_celery = options['use_celery'] - for site_id in self.get_site_ids(options['site']): - print('Updating EnrollmentData for site {}'.format(site_id)) - if options['no_delay']: - update_enrollment_data_for_site(site_id=site_id) - else: - update_enrollment_data_for_site.delay(site_id=site_id) # pragma: no cover + # Would it benefit to implement a generator method to yield the course id? + if course_ids: + self.update_enrollments(course_ids, use_celery=use_celery) + + elif sites: + for site in sites: + course_ids = site_course_ids(site) + self.update_enrollments(course_ids, use_celery=use_celery) + + else: + raise CommandError('You need to provide at least one site or course to update') print('DONE: Backfill Figures EnrollmentData') diff --git a/tests/commands/__init__.py b/tests/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/commands/test_backfill_figures_enrollment_data_command.py b/tests/commands/test_backfill_figures_enrollment_data_command.py new file mode 100644 index 00000000..ece81d0e --- /dev/null +++ b/tests/commands/test_backfill_figures_enrollment_data_command.py @@ -0,0 +1,106 @@ +"""Test Figures Django management commands +""" +from __future__ import absolute_import + +import pytest +import mock + +from django.core.management import call_command + +from tests.factories import CourseOverviewFactory, SiteFactory + + +@pytest.mark.django_db +class TestBackfillEnrollmentDataCommand(object): + """Tests Django managmeent command, 'backfill_figures_enrollment_data' + + Alternate testing approach + + * Directly test the `Command.update_enrollments(...) method to make sure + this method is correct + * Mock `Command.update_enrollments(...)` for testing each of the following + cases: + * has one site domain, has two site domains + * has one course id, has two course ids + * has one or more site domains, and one or more courses + * expects to ignore the sites arg and only process the courses arg + """ + TASK = 'figures.tasks.backfill_enrollment_data_for_course' + MANAGEMENT_COMMAND = 'backfill_figures_enrollment_data' + + @pytest.mark.parametrize('domains', [ + ['alpha.test'], ['alpha.test', 'bravo.test'] + ]) + @pytest.mark.parametrize('delay_suffix, do_delay', [ + ('', False), ('.delay', True) + ]) + def test_backfill_with_sites_arg(self, domains, delay_suffix, do_delay): + sites = [SiteFactory(domain=domain) for domain in domains] + courses = [CourseOverviewFactory() for _ in range(2)] + course_ids = [str(obj.id) for obj in courses] + courses_for_sites = [course_ids for _ in range(len(domains))] + calls = [] + for _ in range(len(sites)): + calls += [mock.call(course_id) for course_id in course_ids] + kwargs = {'sites': domains, 'use_celery': do_delay} + + with mock.patch('figures.sites.site_course_ids') as mock_site_course_ids: + # Build thed list of the course_ids returned for each call to `site_course_ids` + mock_site_course_ids.return_value = courses_for_sites + with mock.patch(self.TASK + delay_suffix) as mock_task: + call_command(self.MANAGEMENT_COMMAND, **kwargs) + assert mock_task.has_calls(calls) + + @pytest.mark.parametrize('course_ids', [ + ['course-v1:TestOrg+T01+run'], + ['course-v1:TestOrg+T01+run', 'course-v1:TestOrg+T02+run'], + ]) + def test_backfill_with_courses_arg_immediate(self, course_ids): + """Test called with courses arg and not run in Celery worker + + This tests that the expected task function is called with specific + parameters and the `.delay(course_id)` is not called. + + This and the following function are almost identical. + They can be merged as one test method, but for now left as two + test methods initially for readability and the additional development + time to implement it. + + But, should we merge them, we add an additional parametrize decorator + like so: + ``` + @pytest.mark.parametrize('delay_suffix, delay_flag', [ + ('', False), ('.delay', True) + ]) + ``` + And then we abstract the nested mocks to swap the `.has_calls` and + `not called` checks (which might require a conditional) + """ + [CourseOverviewFactory(id=cid) for cid in course_ids] + kwargs = {'courses': course_ids, 'use_celery': False} + calls = [mock.call(course_id) for course_id in course_ids] + + with mock.patch(self.TASK) as mock_task: + with mock.patch(self.TASK + '.delay') as mock_task_delay: + call_command(self.MANAGEMENT_COMMAND, **kwargs) + assert mock_task.has_calls(calls) + assert not mock_task_delay.called + + @pytest.mark.parametrize('course_ids', [ + ['course-v1:TestOrg+T01+run'], + ['course-v1:TestOrg+T01+run', 'course-v1:TestOrg+T02+run'], + ]) + def test_backfill_with_courses_arg_delay(self, course_ids): + """Test called with courses arg and run in Celery worker + + See comment in the test method immediate above this one. + """ + [CourseOverviewFactory(id=cid) for cid in course_ids] + kwargs = {'courses': course_ids, 'use_celery': True} + calls = [mock.call(course_id) for course_id in course_ids] + + with mock.patch(self.TASK) as mock_task: + with mock.patch(self.TASK + '.delay') as mock_task_delay: + call_command(self.MANAGEMENT_COMMAND, **kwargs) + assert mock_task_delay.has_calls(calls) + assert not mock_task.called