Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stale Case Data Mail Report #35446

Merged
merged 17 commits into from
Dec 17, 2024
Merged
99 changes: 97 additions & 2 deletions corehq/apps/hqadmin/reports.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections import defaultdict
from datetime import datetime, timedelta

from django.urls import reverse
from django.utils.functional import cached_property
from django.utils.html import format_html
Expand All @@ -8,9 +11,12 @@
from dateutil.parser import parse
from memoized import memoized

from dimagi.utils.logging import notify_exception

from phonelog.models import DeviceReportEntry
from phonelog.reports import BaseDeviceLogReport

from corehq.apps.accounting.models import Subscription, SoftwarePlanEdition
from corehq.apps.auditcare.utils.export import navigation_events_by_user
from corehq.apps.reports.datatables import DataTablesColumn, DataTablesHeader
from corehq.apps.reports.dispatcher import AdminReportDispatcher
Expand All @@ -24,10 +30,12 @@
from corehq.const import SERVER_DATETIME_FORMAT
from corehq.apps.hqadmin.models import HqDeploy
from corehq.apps.es.cases import CaseES
from corehq.apps.es.case_search import CaseSearchES
from corehq.apps.es.forms import FormES
from corehq.toggles import USER_CONFIGURABLE_REPORTS, RESTRICT_DATA_SOURCE_REBUILD
from corehq.motech.repeaters.const import UCRRestrictionFFStatus
from corehq.apps.es.aggregations import TermsAggregation
from corehq.apps.es.exceptions import ESError


class AdminReport(GenericTabularReport):
Expand Down Expand Up @@ -402,13 +410,13 @@ def should_show_domain(self, domain, total_cases, total_forms):

@staticmethod
def _case_count_by_domain(domains):
return CaseES().domain(domains).aggregation(
return CaseES().domain(domains).size(0).aggregation(
TermsAggregation('domain', 'domain.exact')
).run().aggregations.domain.buckets_dict

@staticmethod
def _forms_count_by_domain(domains):
return FormES().domain(domains).aggregation(
return FormES().domain(domains).size(0).aggregation(
TermsAggregation('domain', 'domain.exact')
).run().aggregations.domain.buckets_dict

Expand Down Expand Up @@ -495,3 +503,90 @@ def headers(self):
@property
def rows(self):
return self.table_data.rows


class StaleCasesTable:
STOP_POINT_DAYS_AGO = 365 * 20
AGG_DATE_RANGE = 150
STALE_DATE_THRESHOLD_DAYS = 365

BACKOFF_AMOUNT_DAYS = 30
MAX_BACKOFF_COUNT = 2

def __init__(self):
self._rows = None
self.stop_date = datetime.now() - timedelta(days=self.STOP_POINT_DAYS_AGO)

@property
def headers(self):
return DataTablesHeader(
DataTablesColumn(gettext_lazy("Domain")),
DataTablesColumn(gettext_lazy("Case count"))
)

@property
def rows(self):
Copy link
Contributor

@kaapstorm kaapstorm Nov 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using table.rows twice will run the query twice. How about something like this?

    def __init__():
        self._rows = None

    @property
    def rows(self):
        if self._rows is None:
            self._rows = []
            # etc.
        return self._rows

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this idea! Currently the "caching" is done in the task itself by calling table_rows = StaleCasesTable().rows and then using the table_rows var locally, but the above suggestion will remove the need for storing the rows locally in the task.

if self._rows is None:
self._rows = []
case_count_by_domain = self._aggregate_case_count_data()
for domain, case_count in case_count_by_domain.items():
self._rows.append([domain, case_count])
return self._rows

def _aggregate_case_count_data(self):
end_date = datetime.now() - timedelta(days=self.STALE_DATE_THRESHOLD_DAYS)
agg_res = defaultdict(lambda: 0)
curr_backoff_count = 0
curr_agg_date_range = self.AGG_DATE_RANGE
domains = self._get_domains()
while (True):
start_date = end_date - timedelta(days=curr_agg_date_range)
try:
query_res = self._stale_case_count_in_date_range(domains, start_date, end_date)
except ESError as e:
curr_backoff_count += 1
if curr_backoff_count <= self.MAX_BACKOFF_COUNT:
curr_agg_date_range -= self.BACKOFF_AMOUNT_DAYS
else:
notify_exception(
None,
'ES query timed out while compiling stale case report email.',
details={
'error': str(e),
'start_date': start_date.strftime("%Y-%m-%d"),
'end_date': end_date.strftime("%Y-%m-%d")
}
)
raise ESError()
else:
curr_backoff_count = 0
curr_agg_date_range = self.AGG_DATE_RANGE
self._merge_agg_data(agg_res, query_res)
end_date = start_date
if end_date <= self.stop_date:
break
return agg_res

def _merge_agg_data(self, agg_res, query_res):
for domain, case_count in query_res.items():
agg_res[domain] += case_count

def _stale_case_count_in_date_range(self, domains, start_date, end_date):
return (
CaseSearchES()
.domain(domains)
.modified_range(gt=start_date, lt=end_date)
.is_closed(False)
.aggregation(
TermsAggregation('domain', 'domain.exact')
)
.size(0)
).run().aggregations.domain.counts_by_bucket()

def _get_domains(self):
return list(set(
Subscription.visible_objects
.exclude(plan_version__plan__edition=SoftwarePlanEdition.COMMUNITY)
.filter(is_active=True)
.values_list('subscriber__domain', flat=True)
))
42 changes: 41 additions & 1 deletion corehq/apps/hqadmin/tests/test_reports.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from datetime import datetime, timedelta
from uuid import uuid4

from django.test import TestCase
from unittest.mock import patch

from corehq.apps.hqadmin.reports import UCRRebuildRestrictionTable
from corehq.apps.es.case_search import case_search_adapter
from corehq.apps.es.tests.utils import es_test
from corehq.apps.hqadmin.reports import UCRRebuildRestrictionTable, StaleCasesTable
from corehq.form_processor.models import CommCareCase
from corehq.motech.repeaters.const import UCRRestrictionFFStatus


Expand Down Expand Up @@ -76,3 +82,37 @@ def test_should_show_domain_show_should_disable_ff_domains(self, restriction_ff_
self.assertFalse(table_data.should_show_domain(
domain='domain', total_cases=100_000_000, total_forms=0)
)


@es_test(requires=[case_search_adapter], setup_class=True)
class TestStaleCasesTable(TestCase):

@classmethod
def setUpClass(cls):
super().setUpClass()
cases = [
cls._get_case(days_back=0),
cls._get_case(days_back=365),
cls._get_case(days_back=380, is_closed=True),
cls._get_case(days_back=365 * 21),
]
case_search_adapter.bulk_index(cases, refresh=True)
cls.table = StaleCasesTable()

@classmethod
def _get_case(cls, days_back, is_closed=False):
modified_on = datetime.now() - timedelta(days=days_back)
return CommCareCase(
case_id=uuid4().hex,
domain='test',
modified_on=modified_on,
closed=is_closed
)

@patch.object(StaleCasesTable, '_get_domains')
def test_get_rows(self, _get_domains_mock):
_get_domains_mock.return_value = ['test']
self.assertEqual(
self.table.rows,
[['test', 1]]
)
57 changes: 57 additions & 0 deletions corehq/apps/hqwebapp/tasks.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import csv
from io import StringIO

from smtplib import SMTPDataError
from urllib.parse import urlencode, urljoin

Expand All @@ -6,6 +9,7 @@
from django.core.mail.message import EmailMessage
from django.core.management import call_command
from django.urls import reverse
from django.template.defaultfilters import linebreaksbr
from django.utils.translation import gettext as _

from celery.exceptions import MaxRetriesExceededError
Expand All @@ -20,6 +24,7 @@
from dimagi.utils.web import get_url_base

from corehq.apps.celery import periodic_task, task
from corehq.apps.es.exceptions import ESError
from corehq.motech.repeaters.const import UCRRestrictionFFStatus
from corehq.util.bounced_email_manager import BouncedEmailManager
from corehq.util.email_event_utils import get_bounced_system_emails
Expand Down Expand Up @@ -305,3 +310,55 @@ def send_domain_ucr_data_info_to_admins():
send_mail_async.delay(
subject, message, [settings.SOLUTIONS_AES_EMAIL]
)


@periodic_task(run_every=crontab(minute=0, hour=1, day_of_month=1))
def send_stale_case_data_info_to_admins():
from corehq.apps.hqadmin.reports import StaleCasesTable
from corehq.apps.hqwebapp.tasks import send_html_email_async

if not settings.SOLUTIONS_AES_EMAIL or settings.SERVER_ENVIRONMENT != 'production':
return

table = StaleCasesTable()
has_error = False
try:
num_domains = len(table.rows)
except ESError:
has_error = True
subject = (
f'Monthly report: {num_domains} domains containing stale '
f'case data (older than {table.STALE_DATE_THRESHOLD_DAYS} days)'
)
csv_file = None
if num_domains:
message = (
f'We have identified {num_domains} domains containing stale '
f'case data older than {table.STALE_DATE_THRESHOLD_DAYS} days.\n'
'Please see detailed CSV report attached to this email.'
)
if has_error:
message += (
'\nPlease note that an error occurred while compiling the report '
'and so the data given may only be partial.'
)
csv_file = StringIO()
writer = csv.writer(csv_file)
writer.writerow(table.headers)
writer.writerows(table.rows)
elif has_error:
message = (
'\nPlease note that an error occurred while compiling the report '
'and so there may be missing data that was not compiled.'
)
else:
message = (
'No domains were found containing case data older than '
f'{table.STALE_DATE_THRESHOLD_DAYS} days.'
)
send_html_email_async.delay(
subject,
recipient=settings.SOLUTIONS_AES_EMAIL,
html_content=linebreaksbr(message),
file_attachments=[csv_file]
)
Loading