diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index 28da6811f..ba6b309fe 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -45,6 +45,8 @@ from app.models import ( from app.notifications.process_notifications import send_notification_to_queue from app.v2.errors import JobIncompleteError +from app.service.utils import get_services_with_high_failure_rates + @notify_celery.task(name="run-scheduled-jobs") @statsd(namespace="tasks") @@ -253,3 +255,28 @@ def check_for_missing_rows_in_completed_jobs(): current_app.logger.info( "Processing missing row: {} for job: {}".format(row_to_process.missing_row, job.id)) process_row(row, template, job, job.service, sender_id=sender_id) + + +@notify_celery.task(name='check-for-services-with-high-failure-rates-or-sending-to-tv-numbers') +@statsd(namespace="tasks") +def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): + services_with_failures = get_services_with_high_failure_rates() + # services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers(number=100) + + if services_with_failures: + message = "{} service(s) have had high permanent-failure rates for sms messages in last 24 hours: ".format( + len(services_with_failures) + ) + for service in services_with_failures: + message += "service id: {} failure rate: {}, ".format(service["id"], service["permanent_failure_rate"]) + + current_app.logger.exception(message) + + if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: + zendesk_client.create_ticket( + subject="[{}] High failure rates for sms spotted for services".format( + current_app.config['NOTIFY_ENVIRONMENT'] + ), + message=message, + ticket_type=zendesk_client.TYPE_INCIDENT + ) diff --git a/app/config.py b/app/config.py index 8ed088651..33129e74a 100644 --- a/app/config.py +++ b/app/config.py @@ -276,6 +276,11 @@ class Config(object): 'schedule': crontab(day_of_week='mon-fri', hour='9,15', minute=0), 'options': {'queue': QueueNames.PERIODIC} }, + 'check-for-services-with-high-failure-rates-or-sending-to-tv-numbers': { + 'task': 'check-for-services-with-high-failure-rates-or-sending-to-tv-numbers', + 'schedule': crontab(day_of_week='mon-fri', hour=10, minute=30), + 'options': {'queue': QueueNames.PERIODIC} + }, 'raise-alert-if-letter-notifications-still-sending': { 'task': 'raise-alert-if-letter-notifications-still-sending', 'schedule': crontab(hour=16, minute=30), diff --git a/app/service/statistics.py b/app/service/statistics.py index 5c6accaa5..295be4fb1 100644 --- a/app/service/statistics.py +++ b/app/service/statistics.py @@ -21,6 +21,19 @@ def format_statistics(statistics): return counts +def get_rate_of_permanent_failures_for_service(statistics, threshold=100): + counts = {"permanent_failure": 0, "all_other_statuses": 0} + for row in statistics: + if row.notification_type == 'sms': + _count_if_status_is_permanent_failure_from_row(counts, row) + + if counts['permanent_failure'] + counts['all_other_statuses'] >= threshold: + rate = counts['permanent_failure'] / (counts['permanent_failure'] + counts['all_other_statuses']) + else: + rate = 0 + return rate + + def format_admin_stats(statistics): counts = create_stats_dict() @@ -94,6 +107,13 @@ def _update_statuses_from_row(update_dict, row): update_dict['failed'] += row.count +def _count_if_status_is_permanent_failure_from_row(update_dict, row): + if row.status == 'permanent-failure': + update_dict['permanent_failure'] += row.count + else: + update_dict['all_other_statuses'] += row.count + + def create_empty_monthly_notification_status_stats_dict(year): utc_month_starts = get_months_for_financial_year(year) # nested dicts - data[month][template type][status] = count diff --git a/app/service/utils.py b/app/service/utils.py index 16521fbbd..d69fff6b3 100644 --- a/app/service/utils.py +++ b/app/service/utils.py @@ -7,6 +7,11 @@ from app.models import ( MOBILE_TYPE, EMAIL_TYPE, KEY_TYPE_TEST, KEY_TYPE_TEAM, KEY_TYPE_NORMAL) +from app.service import statistics +from datetime import datetime, timedelta + +from app.dao.fact_notification_status_dao import fetch_stats_for_all_services_by_date_range + def get_recipients_from_request(request_json, key, type): return [(type, recipient) for recipient in request_json.get(key)] @@ -52,3 +57,26 @@ def service_allowed_to_send_to(recipient, service, key_type, allow_whitelisted_r whitelist_members ) ) + + +def get_services_with_high_failure_rates(rate=0.25, threshold=100): + start_date = (datetime.utcnow() - timedelta(days=1)).date() + end_date = datetime.utcnow().date() + + stats = fetch_stats_for_all_services_by_date_range( + start_date=start_date, + end_date=end_date, + include_from_test_key=False, + ) + results = [] + for service_id, rows in itertools.groupby(stats, lambda x: x.service_id): + rows = list(rows) + if not rows[0].restricted and not rows[0].research_mode and rows[0].active: + permanent_failure_rate = statistics.get_rate_of_permanent_failures_for_service(rows, threshold=threshold) + if permanent_failure_rate >= rate: + results.append({ + 'id': str(rows[0].service_id), + 'name': rows[0].name, + 'permanent_failure_rate': permanent_failure_rate + }) + return results diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index 71576ff0f..ea7625001 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -16,6 +16,7 @@ from app.celery.scheduled_tasks import ( check_precompiled_letter_state, check_templated_letter_state, check_for_missing_rows_in_completed_jobs, + check_for_services_with_high_failure_rates_or_sending_to_tv_numbers, switch_current_sms_provider_on_slow_delivery, ) from app.config import QueueNames, TaskNames @@ -494,3 +495,30 @@ def test_check_for_missing_rows_in_completed_jobs_uses_sender_id(mocker, sample_ mock_process_row.assert_called_once_with( mock.ANY, mock.ANY, job, job.service, sender_id=fake_uuid ) + + +@pytest.mark.parametrize("failure_rates, expected_message", [ + [ + [{"id": "123", "name": "Service 1", "permanent_failure_rate": 0.3}], + "1 service(s) have had high permanent-failure rates for sms messages in last " + "24 hours: service id: 123 failure rate: 0.3, " + ] +]) +def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers( + mocker, notify_db_session, failure_rates, expected_message +): + mock_logger = mocker.patch('app.celery.tasks.current_app.logger.exception') + mock_create_ticket = mocker.patch('app.celery.scheduled_tasks.zendesk_client.create_ticket') + mock_failure_rates = mocker.patch( + 'app.celery.scheduled_tasks.get_services_with_high_failure_rates', return_value=failure_rates + ) + + check_for_services_with_high_failure_rates_or_sending_to_tv_numbers() + + assert mock_failure_rates.called + mock_logger.assert_called_once_with(expected_message) + mock_create_ticket.assert_called_with( + message=expected_message, + subject="[test] High failure rates for sms spotted for services", + ticket_type='incident' + ) diff --git a/tests/app/service/test_statistics.py b/tests/app/service/test_statistics.py index 07a5d5c09..abf3369ec 100644 --- a/tests/app/service/test_statistics.py +++ b/tests/app/service/test_statistics.py @@ -6,12 +6,13 @@ from freezegun import freeze_time import pytest from app.service.statistics import ( - format_admin_stats, - format_statistics, + add_monthly_notification_status_stats, + create_empty_monthly_notification_status_stats_dict, create_stats_dict, create_zeroed_stats_dicts, - create_empty_monthly_notification_status_stats_dict, - add_monthly_notification_status_stats + format_admin_stats, + format_statistics, + get_rate_of_permanent_failures_for_service ) StatsRow = collections.namedtuple('row', ('notification_type', 'status', 'count')) @@ -73,6 +74,32 @@ def test_format_statistics(stats, email_counts, sms_counts, letter_counts): } +@pytest.mark.idparametrize("statistics, expected_result", { + 'counts_rate_for_sms': ([ + StatsRow('sms', 'permanent-failure', 100), + StatsRow('sms', 'delivered', 300), + ], 0.25), + 'ignores_other_notification_types': ([ + StatsRow('sms', 'permanent-failure', 100), + StatsRow('sms', 'delivered', 300), + StatsRow('email', 'delivered', 300), + StatsRow('letter', 'created', 300), + ], 0.25), + 'only_counts_permanent_failure_as_failed': ([ + StatsRow('sms', 'permanent-failure', 100), + StatsRow('sms', 'temporary-failure', 100), + StatsRow('sms', 'delivered', 300), + ], 0.2), + 'below_threshold': ([ + StatsRow('sms', 'permanent-failure', 5), + StatsRow('sms', 'delivered', 3), + ], 0), +}) +def test_get_rate_of_permanent_failures_for_service(statistics, expected_result): + rate = get_rate_of_permanent_failures_for_service(statistics) + assert rate == expected_result + + def test_create_zeroed_stats_dicts(): assert create_zeroed_stats_dicts() == { 'sms': {'requested': 0, 'delivered': 0, 'failed': 0}, diff --git a/tests/app/service/test_utils.py b/tests/app/service/test_utils.py index 71b2770bb..2ff9328ad 100644 --- a/tests/app/service/test_utils.py +++ b/tests/app/service/test_utils.py @@ -1,5 +1,7 @@ from app.dao.date_util import get_current_financial_year_start_year from freezegun import freeze_time +from tests.app.db import create_service, create_notification, create_template +from app.service.utils import get_services_with_high_failure_rates # see get_financial_year for conversion of financial years. @@ -13,3 +15,33 @@ def test_get_current_financial_year_start_year_before_march(): def test_get_current_financial_year_start_year_after_april(): current_fy = get_current_financial_year_start_year() assert current_fy == 2017 + + +@freeze_time("2019-12-02 12:00:00.000000") +def test_get_services_with_high_failure_rates(notify_db_session): + service_1 = create_service(service_name="Service 1") + service_3 = create_service(service_name="Service 3", restricted=True) # restricted + service_4 = create_service(service_name="Service 4", research_mode=True) # research mode + service_5 = create_service(service_name="Service 5", active=False) # not active + services = [service_1, service_3, service_4, service_5] + for service in services: + template = create_template(service) + create_notification(template, status="permanent-failure") + for x in range(0, 3): + create_notification(template, status="delivered") + + service_6 = create_service(service_name="Service 6") # notifications too old + with freeze_time("2019-11-30 15:00:00.000000"): + template_6 = create_template(service_6) + for x in range(0, 4): + create_notification(template_6, status="permanent-failure") + + service_2 = create_service(service_name="Service 2") # below threshold + template_2 = create_template(service_2) + create_notification(template_2, status="permanent-failure") + + assert get_services_with_high_failure_rates(threshold=3) == [{ + 'id': str(service_1.id), + 'name': service_1.name, + 'permanent_failure_rate': 0.25 + }]