From d72ab4f4a6491b156042d0695a0d74d8a90b66cb Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Fri, 29 Nov 2019 21:24:17 +0000 Subject: [PATCH 1/8] Send zendesk ticket when services found with high failure rates --- app/celery/scheduled_tasks.py | 27 ++++++++++++++++++ app/config.py | 5 ++++ app/service/statistics.py | 20 ++++++++++++++ app/service/utils.py | 28 +++++++++++++++++++ tests/app/celery/test_scheduled_tasks.py | 28 +++++++++++++++++++ tests/app/service/test_statistics.py | 35 +++++++++++++++++++++--- tests/app/service/test_utils.py | 32 ++++++++++++++++++++++ 7 files changed, 171 insertions(+), 4 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index 28da6811f..ba6b309fe 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -45,6 +45,8 @@ from app.models import ( from app.notifications.process_notifications import send_notification_to_queue from app.v2.errors import JobIncompleteError +from app.service.utils import get_services_with_high_failure_rates + @notify_celery.task(name="run-scheduled-jobs") @statsd(namespace="tasks") @@ -253,3 +255,28 @@ def check_for_missing_rows_in_completed_jobs(): current_app.logger.info( "Processing missing row: {} for job: {}".format(row_to_process.missing_row, job.id)) process_row(row, template, job, job.service, sender_id=sender_id) + + +@notify_celery.task(name='check-for-services-with-high-failure-rates-or-sending-to-tv-numbers') +@statsd(namespace="tasks") +def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): + services_with_failures = get_services_with_high_failure_rates() + # services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers(number=100) + + if services_with_failures: + message = "{} service(s) have had high permanent-failure rates for sms messages in last 24 hours: ".format( + len(services_with_failures) + ) + for service in services_with_failures: + message += "service id: {} failure rate: {}, ".format(service["id"], service["permanent_failure_rate"]) + + current_app.logger.exception(message) + + if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: + zendesk_client.create_ticket( + subject="[{}] High failure rates for sms spotted for services".format( + current_app.config['NOTIFY_ENVIRONMENT'] + ), + message=message, + ticket_type=zendesk_client.TYPE_INCIDENT + ) diff --git a/app/config.py b/app/config.py index 8ed088651..33129e74a 100644 --- a/app/config.py +++ b/app/config.py @@ -276,6 +276,11 @@ class Config(object): 'schedule': crontab(day_of_week='mon-fri', hour='9,15', minute=0), 'options': {'queue': QueueNames.PERIODIC} }, + 'check-for-services-with-high-failure-rates-or-sending-to-tv-numbers': { + 'task': 'check-for-services-with-high-failure-rates-or-sending-to-tv-numbers', + 'schedule': crontab(day_of_week='mon-fri', hour=10, minute=30), + 'options': {'queue': QueueNames.PERIODIC} + }, 'raise-alert-if-letter-notifications-still-sending': { 'task': 'raise-alert-if-letter-notifications-still-sending', 'schedule': crontab(hour=16, minute=30), diff --git a/app/service/statistics.py b/app/service/statistics.py index 5c6accaa5..295be4fb1 100644 --- a/app/service/statistics.py +++ b/app/service/statistics.py @@ -21,6 +21,19 @@ def format_statistics(statistics): return counts +def get_rate_of_permanent_failures_for_service(statistics, threshold=100): + counts = {"permanent_failure": 0, "all_other_statuses": 0} + for row in statistics: + if row.notification_type == 'sms': + _count_if_status_is_permanent_failure_from_row(counts, row) + + if counts['permanent_failure'] + counts['all_other_statuses'] >= threshold: + rate = counts['permanent_failure'] / (counts['permanent_failure'] + counts['all_other_statuses']) + else: + rate = 0 + return rate + + def format_admin_stats(statistics): counts = create_stats_dict() @@ -94,6 +107,13 @@ def _update_statuses_from_row(update_dict, row): update_dict['failed'] += row.count +def _count_if_status_is_permanent_failure_from_row(update_dict, row): + if row.status == 'permanent-failure': + update_dict['permanent_failure'] += row.count + else: + update_dict['all_other_statuses'] += row.count + + def create_empty_monthly_notification_status_stats_dict(year): utc_month_starts = get_months_for_financial_year(year) # nested dicts - data[month][template type][status] = count diff --git a/app/service/utils.py b/app/service/utils.py index 16521fbbd..d69fff6b3 100644 --- a/app/service/utils.py +++ b/app/service/utils.py @@ -7,6 +7,11 @@ from app.models import ( MOBILE_TYPE, EMAIL_TYPE, KEY_TYPE_TEST, KEY_TYPE_TEAM, KEY_TYPE_NORMAL) +from app.service import statistics +from datetime import datetime, timedelta + +from app.dao.fact_notification_status_dao import fetch_stats_for_all_services_by_date_range + def get_recipients_from_request(request_json, key, type): return [(type, recipient) for recipient in request_json.get(key)] @@ -52,3 +57,26 @@ def service_allowed_to_send_to(recipient, service, key_type, allow_whitelisted_r whitelist_members ) ) + + +def get_services_with_high_failure_rates(rate=0.25, threshold=100): + start_date = (datetime.utcnow() - timedelta(days=1)).date() + end_date = datetime.utcnow().date() + + stats = fetch_stats_for_all_services_by_date_range( + start_date=start_date, + end_date=end_date, + include_from_test_key=False, + ) + results = [] + for service_id, rows in itertools.groupby(stats, lambda x: x.service_id): + rows = list(rows) + if not rows[0].restricted and not rows[0].research_mode and rows[0].active: + permanent_failure_rate = statistics.get_rate_of_permanent_failures_for_service(rows, threshold=threshold) + if permanent_failure_rate >= rate: + results.append({ + 'id': str(rows[0].service_id), + 'name': rows[0].name, + 'permanent_failure_rate': permanent_failure_rate + }) + return results diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index 71576ff0f..ea7625001 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -16,6 +16,7 @@ from app.celery.scheduled_tasks import ( check_precompiled_letter_state, check_templated_letter_state, check_for_missing_rows_in_completed_jobs, + check_for_services_with_high_failure_rates_or_sending_to_tv_numbers, switch_current_sms_provider_on_slow_delivery, ) from app.config import QueueNames, TaskNames @@ -494,3 +495,30 @@ def test_check_for_missing_rows_in_completed_jobs_uses_sender_id(mocker, sample_ mock_process_row.assert_called_once_with( mock.ANY, mock.ANY, job, job.service, sender_id=fake_uuid ) + + +@pytest.mark.parametrize("failure_rates, expected_message", [ + [ + [{"id": "123", "name": "Service 1", "permanent_failure_rate": 0.3}], + "1 service(s) have had high permanent-failure rates for sms messages in last " + "24 hours: service id: 123 failure rate: 0.3, " + ] +]) +def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers( + mocker, notify_db_session, failure_rates, expected_message +): + mock_logger = mocker.patch('app.celery.tasks.current_app.logger.exception') + mock_create_ticket = mocker.patch('app.celery.scheduled_tasks.zendesk_client.create_ticket') + mock_failure_rates = mocker.patch( + 'app.celery.scheduled_tasks.get_services_with_high_failure_rates', return_value=failure_rates + ) + + check_for_services_with_high_failure_rates_or_sending_to_tv_numbers() + + assert mock_failure_rates.called + mock_logger.assert_called_once_with(expected_message) + mock_create_ticket.assert_called_with( + message=expected_message, + subject="[test] High failure rates for sms spotted for services", + ticket_type='incident' + ) diff --git a/tests/app/service/test_statistics.py b/tests/app/service/test_statistics.py index 07a5d5c09..abf3369ec 100644 --- a/tests/app/service/test_statistics.py +++ b/tests/app/service/test_statistics.py @@ -6,12 +6,13 @@ from freezegun import freeze_time import pytest from app.service.statistics import ( - format_admin_stats, - format_statistics, + add_monthly_notification_status_stats, + create_empty_monthly_notification_status_stats_dict, create_stats_dict, create_zeroed_stats_dicts, - create_empty_monthly_notification_status_stats_dict, - add_monthly_notification_status_stats + format_admin_stats, + format_statistics, + get_rate_of_permanent_failures_for_service ) StatsRow = collections.namedtuple('row', ('notification_type', 'status', 'count')) @@ -73,6 +74,32 @@ def test_format_statistics(stats, email_counts, sms_counts, letter_counts): } +@pytest.mark.idparametrize("statistics, expected_result", { + 'counts_rate_for_sms': ([ + StatsRow('sms', 'permanent-failure', 100), + StatsRow('sms', 'delivered', 300), + ], 0.25), + 'ignores_other_notification_types': ([ + StatsRow('sms', 'permanent-failure', 100), + StatsRow('sms', 'delivered', 300), + StatsRow('email', 'delivered', 300), + StatsRow('letter', 'created', 300), + ], 0.25), + 'only_counts_permanent_failure_as_failed': ([ + StatsRow('sms', 'permanent-failure', 100), + StatsRow('sms', 'temporary-failure', 100), + StatsRow('sms', 'delivered', 300), + ], 0.2), + 'below_threshold': ([ + StatsRow('sms', 'permanent-failure', 5), + StatsRow('sms', 'delivered', 3), + ], 0), +}) +def test_get_rate_of_permanent_failures_for_service(statistics, expected_result): + rate = get_rate_of_permanent_failures_for_service(statistics) + assert rate == expected_result + + def test_create_zeroed_stats_dicts(): assert create_zeroed_stats_dicts() == { 'sms': {'requested': 0, 'delivered': 0, 'failed': 0}, diff --git a/tests/app/service/test_utils.py b/tests/app/service/test_utils.py index 71b2770bb..2ff9328ad 100644 --- a/tests/app/service/test_utils.py +++ b/tests/app/service/test_utils.py @@ -1,5 +1,7 @@ from app.dao.date_util import get_current_financial_year_start_year from freezegun import freeze_time +from tests.app.db import create_service, create_notification, create_template +from app.service.utils import get_services_with_high_failure_rates # see get_financial_year for conversion of financial years. @@ -13,3 +15,33 @@ def test_get_current_financial_year_start_year_before_march(): def test_get_current_financial_year_start_year_after_april(): current_fy = get_current_financial_year_start_year() assert current_fy == 2017 + + +@freeze_time("2019-12-02 12:00:00.000000") +def test_get_services_with_high_failure_rates(notify_db_session): + service_1 = create_service(service_name="Service 1") + service_3 = create_service(service_name="Service 3", restricted=True) # restricted + service_4 = create_service(service_name="Service 4", research_mode=True) # research mode + service_5 = create_service(service_name="Service 5", active=False) # not active + services = [service_1, service_3, service_4, service_5] + for service in services: + template = create_template(service) + create_notification(template, status="permanent-failure") + for x in range(0, 3): + create_notification(template, status="delivered") + + service_6 = create_service(service_name="Service 6") # notifications too old + with freeze_time("2019-11-30 15:00:00.000000"): + template_6 = create_template(service_6) + for x in range(0, 4): + create_notification(template_6, status="permanent-failure") + + service_2 = create_service(service_name="Service 2") # below threshold + template_2 = create_template(service_2) + create_notification(template_2, status="permanent-failure") + + assert get_services_with_high_failure_rates(threshold=3) == [{ + 'id': str(service_1.id), + 'name': service_1.name, + 'permanent_failure_rate': 0.25 + }] From 53efd87e28bd8674227a46cca3e4a3dd6b2a0ad3 Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Tue, 3 Dec 2019 10:26:59 +0000 Subject: [PATCH 2/8] Check for services sending sms messages to tv numbers --- app/celery/scheduled_tasks.py | 26 ++++++++++-- app/dao/services_dao.py | 24 +++++++++++ app/service/utils.py | 10 ++--- tests/app/celery/test_scheduled_tasks.py | 28 +++++++++++-- .../dao/test_fact_notification_status_dao.py | 2 +- tests/app/dao/test_services_dao.py | 41 ++++++++++++++++++- tests/app/service/test_utils.py | 5 ++- 7 files changed, 119 insertions(+), 17 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index ba6b309fe..fba91b8c4 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -35,6 +35,7 @@ from app.dao.notifications_dao import ( ) from app.dao.provider_details_dao import dao_reduce_sms_provider_priority from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago +from app.dao.services_dao import dao_find_services_sending_to_tv_numbers from app.models import ( Job, JOB_STATUS_IN_PROGRESS, @@ -260,16 +261,33 @@ def check_for_missing_rows_in_completed_jobs(): @notify_celery.task(name='check-for-services-with-high-failure-rates-or-sending-to-tv-numbers') @statsd(namespace="tasks") def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): - services_with_failures = get_services_with_high_failure_rates() - # services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers(number=100) + start_date = (datetime.utcnow() - timedelta(days=1)) + end_date = datetime.utcnow() + message = "" + + services_with_failures = get_services_with_high_failure_rates(start_date=start_date, end_date=end_date) + services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers( + threshold=100, + start_date=start_date, + end_date=end_date + ) if services_with_failures: - message = "{} service(s) have had high permanent-failure rates for sms messages in last 24 hours: ".format( + message += "{} service(s) have had high permanent-failure rates for sms messages in last 24 hours:\n".format( len(services_with_failures) ) for service in services_with_failures: - message += "service id: {} failure rate: {}, ".format(service["id"], service["permanent_failure_rate"]) + message += "service id: {} failure rate: {},\n".format(service["id"], service["permanent_failure_rate"]) + elif services_sending_to_tv_numbers: + message += "{} service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n".format( + len(services_sending_to_tv_numbers) + ) + for service in services_sending_to_tv_numbers: + message += "service id: {}, count of sms to tv numbers: {},\n".format( + service.service_id, service.notification_count + ) + if services_with_failures or services_sending_to_tv_numbers: current_app.logger.exception(message) if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: diff --git a/app/dao/services_dao.py b/app/dao/services_dao.py index 666034ad1..3ec010093 100644 --- a/app/dao/services_dao.py +++ b/app/dao/services_dao.py @@ -521,3 +521,27 @@ def dao_fetch_active_users_for_service(service_id): ) return query.all() + + +def dao_find_services_sending_to_tv_numbers(start_date, end_date, threshold=100): + + return db.session.query( + Service.name.label('service_name'), + Notification.service_id.label('service_id'), + func.count(Notification.id).label('notification_count') + ).filter( + Notification.service_id == Service.id, + Notification.created_at >= start_date, + Notification.created_at <= end_date, + Notification.key_type != KEY_TYPE_TEST, + Notification.notification_type == SMS_TYPE, + func.substr(Notification.normalised_to, 3, 7) == '7700900', + Service.restricted == False, # noqa + Service.research_mode == False, + Service.active == True, + ).group_by( + Notification.service_id, + Service.name + ).having( + func.count(Notification.id) > threshold + ).all() diff --git a/app/service/utils.py b/app/service/utils.py index d69fff6b3..932df29e0 100644 --- a/app/service/utils.py +++ b/app/service/utils.py @@ -8,7 +8,6 @@ from app.models import ( KEY_TYPE_TEST, KEY_TYPE_TEAM, KEY_TYPE_NORMAL) from app.service import statistics -from datetime import datetime, timedelta from app.dao.fact_notification_status_dao import fetch_stats_for_all_services_by_date_range @@ -59,13 +58,10 @@ def service_allowed_to_send_to(recipient, service, key_type, allow_whitelisted_r ) -def get_services_with_high_failure_rates(rate=0.25, threshold=100): - start_date = (datetime.utcnow() - timedelta(days=1)).date() - end_date = datetime.utcnow().date() - +def get_services_with_high_failure_rates(start_date, end_date, rate=0.25, threshold=100): stats = fetch_stats_for_all_services_by_date_range( - start_date=start_date, - end_date=end_date, + start_date=start_date.date(), + end_date=end_date.date(), include_from_test_key=False, ) results = [] diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index ea7625001..ee4ec14ff 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -2,6 +2,7 @@ from datetime import datetime, timedelta from unittest.mock import call import pytest +from collections import namedtuple from freezegun import freeze_time from mock import mock @@ -497,25 +498,46 @@ def test_check_for_missing_rows_in_completed_jobs_uses_sender_id(mocker, sample_ ) -@pytest.mark.parametrize("failure_rates, expected_message", [ +MockServicesSendingToTVNumbers = namedtuple( + 'ServicesSendingToTVNumbers', + [ + 'service_id', + 'service_name', + 'notification_count', + ] +) + + +@pytest.mark.parametrize("failure_rates, sms_to_tv_numbers, expected_message", [ [ [{"id": "123", "name": "Service 1", "permanent_failure_rate": 0.3}], + [], "1 service(s) have had high permanent-failure rates for sms messages in last " - "24 hours: service id: 123 failure rate: 0.3, " + "24 hours:\nservice id: 123 failure rate: 0.3,\n" + ], + [ + [], + [MockServicesSendingToTVNumbers("123", "Service 1", 300)], + "1 service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n" + "service id: 123, count of sms to tv numbers: 300,\n" ] ]) def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers( - mocker, notify_db_session, failure_rates, expected_message + mocker, notify_db_session, failure_rates, sms_to_tv_numbers, expected_message ): mock_logger = mocker.patch('app.celery.tasks.current_app.logger.exception') mock_create_ticket = mocker.patch('app.celery.scheduled_tasks.zendesk_client.create_ticket') mock_failure_rates = mocker.patch( 'app.celery.scheduled_tasks.get_services_with_high_failure_rates', return_value=failure_rates ) + mock_sms_to_tv_numbers = mocker.patch( + 'app.celery.scheduled_tasks.dao_find_services_sending_to_tv_numbers', return_value=sms_to_tv_numbers + ) check_for_services_with_high_failure_rates_or_sending_to_tv_numbers() assert mock_failure_rates.called + assert mock_sms_to_tv_numbers.called mock_logger.assert_called_once_with(expected_message) mock_create_ticket.assert_called_with( message=expected_message, diff --git a/tests/app/dao/test_fact_notification_status_dao.py b/tests/app/dao/test_fact_notification_status_dao.py index 8e2ce9afd..65687f9d8 100644 --- a/tests/app/dao/test_fact_notification_status_dao.py +++ b/tests/app/dao/test_fact_notification_status_dao.py @@ -14,7 +14,7 @@ from app.dao.fact_notification_status_dao import ( fetch_notification_status_totals_for_all_services, fetch_notification_statuses_for_job, fetch_stats_for_all_services_by_date_range, fetch_monthly_template_usage_for_service, - get_total_sent_notifications_for_day_and_type + get_total_sent_notifications_for_day_and_type, ) from app.models import ( FactNotificationStatus, diff --git a/tests/app/dao/test_services_dao.py b/tests/app/dao/test_services_dao.py index 949c26996..4be4e1787 100644 --- a/tests/app/dao/test_services_dao.py +++ b/tests/app/dao/test_services_dao.py @@ -1,5 +1,5 @@ import uuid -from datetime import datetime +from datetime import datetime, timedelta from unittest import mock import pytest @@ -23,6 +23,7 @@ from app.dao.services_dao import ( dao_fetch_live_services_data, dao_fetch_service_by_id, dao_fetch_all_services_by_user, + dao_find_services_sending_to_tv_numbers, dao_update_service, delete_service_and_all_associated_db_objects, dao_fetch_stats_for_service, @@ -1099,3 +1100,41 @@ def create_email_sms_letter_template(): template_two = create_template(service=service, template_name='2', template_type='sms') template_three = create_template(service=service, template_name='3', template_type='letter') return template_one, template_three, template_two + + +@freeze_time("2019-12-02 12:00:00.000000") +def test_dao_find_services_sending_to_tv_numbers(notify_db_session): + service_1 = create_service(service_name="Service 1") + service_3 = create_service(service_name="Service 3", restricted=True) # restricted + service_4 = create_service(service_name="Service 4", research_mode=True) # research mode + service_5 = create_service(service_name="Service 5", active=False) # not active + services = [service_1, service_3, service_4, service_5] + + tv_number = "447700900001" + normal_number = "447711900001" + normal_number_resembling_tv_number = "447227700900" + + for service in services: + template = create_template(service) + for x in range(0, 5): + create_notification(template, normalised_to=tv_number, status="permanent-failure") + + service_6 = create_service(service_name="Service 6") # notifications too old + with freeze_time("2019-11-30 15:00:00.000000"): + template_6 = create_template(service_6) + for x in range(0, 5): + create_notification(template_6, normalised_to=tv_number, status="permanent-failure") + + service_2 = create_service(service_name="Service 2") # below threshold + template_2 = create_template(service_2) + create_notification(template_2, normalised_to=tv_number, status="permanent-failure") + for x in range(0, 5): + create_notification(template_2, normalised_to=normal_number, status="delivered") + create_notification(template_2, normalised_to=normal_number_resembling_tv_number, status="delivered") + + start_date = (datetime.utcnow() - timedelta(days=1)) + end_date = datetime.utcnow() + + result = dao_find_services_sending_to_tv_numbers(start_date, end_date, threshold=4) + assert len(result) == 1 + assert result[0].service_name == "Service 1" diff --git a/tests/app/service/test_utils.py b/tests/app/service/test_utils.py index 2ff9328ad..32a8877e0 100644 --- a/tests/app/service/test_utils.py +++ b/tests/app/service/test_utils.py @@ -2,6 +2,7 @@ from app.dao.date_util import get_current_financial_year_start_year from freezegun import freeze_time from tests.app.db import create_service, create_notification, create_template from app.service.utils import get_services_with_high_failure_rates +from datetime import datetime, timedelta # see get_financial_year for conversion of financial years. @@ -39,8 +40,10 @@ def test_get_services_with_high_failure_rates(notify_db_session): service_2 = create_service(service_name="Service 2") # below threshold template_2 = create_template(service_2) create_notification(template_2, status="permanent-failure") + start_date = (datetime.utcnow() - timedelta(days=1)) + end_date = datetime.utcnow() - assert get_services_with_high_failure_rates(threshold=3) == [{ + assert get_services_with_high_failure_rates(start_date, end_date, threshold=3) == [{ 'id': str(service_1.id), 'name': service_1.name, 'permanent_failure_rate': 0.25 From cfbb080f573f3a318d447fb5d735e6bf69138c92 Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Tue, 3 Dec 2019 16:18:07 +0000 Subject: [PATCH 3/8] Simplify failure rate by building separate query --- app/celery/scheduled_tasks.py | 7 +--- app/dao/services_dao.py | 24 ++++++++++-- app/service/statistics.py | 3 +- app/service/utils.py | 26 ++++++------- tests/app/celery/test_scheduled_tasks.py | 6 ++- .../dao/test_fact_notification_status_dao.py | 2 +- tests/app/dao/test_services_dao.py | 39 +++++++++++++++++-- tests/app/service/test_statistics.py | 6 --- tests/app/service/test_utils.py | 1 - 9 files changed, 77 insertions(+), 37 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index fba91b8c4..d6397754e 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -266,11 +266,7 @@ def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): message = "" services_with_failures = get_services_with_high_failure_rates(start_date=start_date, end_date=end_date) - services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers( - threshold=100, - start_date=start_date, - end_date=end_date - ) + services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers(start_date=start_date, end_date=end_date) if services_with_failures: message += "{} service(s) have had high permanent-failure rates for sms messages in last 24 hours:\n".format( @@ -291,6 +287,7 @@ def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): current_app.logger.exception(message) if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: + message += "\nThings to do: contact service? revoke their key?" zendesk_client.create_ticket( subject="[{}] High failure rates for sms spotted for services".format( current_app.config['NOTIFY_ENVIRONMENT'] diff --git a/app/dao/services_dao.py b/app/dao/services_dao.py index 3ec010093..03ddfd22a 100644 --- a/app/dao/services_dao.py +++ b/app/dao/services_dao.py @@ -524,9 +524,7 @@ def dao_fetch_active_users_for_service(service_id): def dao_find_services_sending_to_tv_numbers(start_date, end_date, threshold=100): - return db.session.query( - Service.name.label('service_name'), Notification.service_id.label('service_id'), func.count(Notification.id).label('notification_count') ).filter( @@ -541,7 +539,27 @@ def dao_find_services_sending_to_tv_numbers(start_date, end_date, threshold=100) Service.active == True, ).group_by( Notification.service_id, - Service.name ).having( func.count(Notification.id) > threshold ).all() + + +def dao_find_real_sms_notification_count_by_status_for_live_services(start_date, end_date): + # only works within services' retention period + return db.session.query( + Notification.service_id.label('service_id'), + Notification.status.label('status'), + func.count(Notification.id).label('count') + ).filter( + Notification.service_id == Service.id, + Notification.created_at >= start_date, + Notification.created_at <= end_date, + Notification.key_type != KEY_TYPE_TEST, + Notification.notification_type == SMS_TYPE, + Service.restricted == False, # noqa + Service.research_mode == False, + Service.active == True, + ).group_by( + Notification.service_id, + Notification.status + ).all() diff --git a/app/service/statistics.py b/app/service/statistics.py index 295be4fb1..3c51ee343 100644 --- a/app/service/statistics.py +++ b/app/service/statistics.py @@ -24,8 +24,7 @@ def format_statistics(statistics): def get_rate_of_permanent_failures_for_service(statistics, threshold=100): counts = {"permanent_failure": 0, "all_other_statuses": 0} for row in statistics: - if row.notification_type == 'sms': - _count_if_status_is_permanent_failure_from_row(counts, row) + _count_if_status_is_permanent_failure_from_row(counts, row) if counts['permanent_failure'] + counts['all_other_statuses'] >= threshold: rate = counts['permanent_failure'] / (counts['permanent_failure'] + counts['all_other_statuses']) diff --git a/app/service/utils.py b/app/service/utils.py index 932df29e0..0b7865928 100644 --- a/app/service/utils.py +++ b/app/service/utils.py @@ -2,6 +2,8 @@ import itertools from notifications_utils.recipients import allowed_to_send_to +from app.dao.services_dao import dao_find_real_sms_notification_count_by_status_for_live_services + from app.models import ( ServiceWhitelist, MOBILE_TYPE, EMAIL_TYPE, @@ -9,8 +11,6 @@ from app.models import ( from app.service import statistics -from app.dao.fact_notification_status_dao import fetch_stats_for_all_services_by_date_range - def get_recipients_from_request(request_json, key, type): return [(type, recipient) for recipient in request_json.get(key)] @@ -59,20 +59,18 @@ def service_allowed_to_send_to(recipient, service, key_type, allow_whitelisted_r def get_services_with_high_failure_rates(start_date, end_date, rate=0.25, threshold=100): - stats = fetch_stats_for_all_services_by_date_range( - start_date=start_date.date(), - end_date=end_date.date(), - include_from_test_key=False, + stats = dao_find_real_sms_notification_count_by_status_for_live_services( + start_date=start_date, + end_date=end_date, ) results = [] for service_id, rows in itertools.groupby(stats, lambda x: x.service_id): rows = list(rows) - if not rows[0].restricted and not rows[0].research_mode and rows[0].active: - permanent_failure_rate = statistics.get_rate_of_permanent_failures_for_service(rows, threshold=threshold) - if permanent_failure_rate >= rate: - results.append({ - 'id': str(rows[0].service_id), - 'name': rows[0].name, - 'permanent_failure_rate': permanent_failure_rate - }) + + permanent_failure_rate = statistics.get_rate_of_permanent_failures_for_service(rows, threshold=threshold) + if permanent_failure_rate >= rate: + results.append({ + 'id': str(rows[0].service_id), + 'permanent_failure_rate': permanent_failure_rate + }) return results diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index ee4ec14ff..79143e673 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -510,7 +510,7 @@ MockServicesSendingToTVNumbers = namedtuple( @pytest.mark.parametrize("failure_rates, sms_to_tv_numbers, expected_message", [ [ - [{"id": "123", "name": "Service 1", "permanent_failure_rate": 0.3}], + [{"id": "123", "permanent_failure_rate": 0.3}], [], "1 service(s) have had high permanent-failure rates for sms messages in last " "24 hours:\nservice id: 123 failure rate: 0.3,\n" @@ -534,13 +534,15 @@ def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers( 'app.celery.scheduled_tasks.dao_find_services_sending_to_tv_numbers', return_value=sms_to_tv_numbers ) + zendesk_actions = "\nThings to do: contact service? revoke their key?" + check_for_services_with_high_failure_rates_or_sending_to_tv_numbers() assert mock_failure_rates.called assert mock_sms_to_tv_numbers.called mock_logger.assert_called_once_with(expected_message) mock_create_ticket.assert_called_with( - message=expected_message, + message=expected_message + zendesk_actions, subject="[test] High failure rates for sms spotted for services", ticket_type='incident' ) diff --git a/tests/app/dao/test_fact_notification_status_dao.py b/tests/app/dao/test_fact_notification_status_dao.py index 65687f9d8..8e2ce9afd 100644 --- a/tests/app/dao/test_fact_notification_status_dao.py +++ b/tests/app/dao/test_fact_notification_status_dao.py @@ -14,7 +14,7 @@ from app.dao.fact_notification_status_dao import ( fetch_notification_status_totals_for_all_services, fetch_notification_statuses_for_job, fetch_stats_for_all_services_by_date_range, fetch_monthly_template_usage_for_service, - get_total_sent_notifications_for_day_and_type, + get_total_sent_notifications_for_day_and_type ) from app.models import ( FactNotificationStatus, diff --git a/tests/app/dao/test_services_dao.py b/tests/app/dao/test_services_dao.py index 4be4e1787..df03a7a6f 100644 --- a/tests/app/dao/test_services_dao.py +++ b/tests/app/dao/test_services_dao.py @@ -23,6 +23,7 @@ from app.dao.services_dao import ( dao_fetch_live_services_data, dao_fetch_service_by_id, dao_fetch_all_services_by_user, + dao_find_real_sms_notification_count_by_status_for_live_services, dao_find_services_sending_to_tv_numbers, dao_update_service, delete_service_and_all_associated_db_objects, @@ -1103,8 +1104,8 @@ def create_email_sms_letter_template(): @freeze_time("2019-12-02 12:00:00.000000") -def test_dao_find_services_sending_to_tv_numbers(notify_db_session): - service_1 = create_service(service_name="Service 1") +def test_dao_find_services_sending_to_tv_numbers(notify_db_session, fake_uuid): + service_1 = create_service(service_name="Service 1", service_id=fake_uuid) service_3 = create_service(service_name="Service 3", restricted=True) # restricted service_4 = create_service(service_name="Service 4", research_mode=True) # research mode service_5 = create_service(service_name="Service 5", active=False) # not active @@ -1137,4 +1138,36 @@ def test_dao_find_services_sending_to_tv_numbers(notify_db_session): result = dao_find_services_sending_to_tv_numbers(start_date, end_date, threshold=4) assert len(result) == 1 - assert result[0].service_name == "Service 1" + assert str(result[0].service_id) == fake_uuid + + +def test_dao_find_real_sms_notification_count_by_status_for_live_services(notify_db_session, fake_uuid): + service_1 = create_service(service_name="Service 1", service_id=fake_uuid) + service_3 = create_service(service_name="Service 3", restricted=True) # restricted + service_4 = create_service(service_name="Service 4", research_mode=True) # research mode + service_5 = create_service(service_name="Service 5", active=False) # not active + services = [service_1, service_3, service_4, service_5] + + for service in services: + template = create_template(service) + for x in range(0, 3): + create_notification(template, status="permanent-failure") + create_notification(template, status="delivered") + create_notification(template, status="temporary-failure") + + service_6 = create_service(service_name="Service 6") # notifications too old + with freeze_time("2019-11-30 15:00:00.000000"): + template_6 = create_template(service_6) + for x in range(0, 2): + create_notification(template_6, status="permanent-failure") + + service_2 = create_service(service_name="Service 2") # below threshold + template_2 = create_template(service_2) + create_notification(template_2, status="permanent-failure", key_type='test') # test key type + + start_date = (datetime.utcnow() - timedelta(days=1)) + end_date = datetime.utcnow() + + result = dao_find_real_sms_notification_count_by_status_for_live_services(start_date, end_date) + assert len(result) == 3 + assert str(result[0].service_id) == fake_uuid diff --git a/tests/app/service/test_statistics.py b/tests/app/service/test_statistics.py index abf3369ec..90947f1ae 100644 --- a/tests/app/service/test_statistics.py +++ b/tests/app/service/test_statistics.py @@ -79,12 +79,6 @@ def test_format_statistics(stats, email_counts, sms_counts, letter_counts): StatsRow('sms', 'permanent-failure', 100), StatsRow('sms', 'delivered', 300), ], 0.25), - 'ignores_other_notification_types': ([ - StatsRow('sms', 'permanent-failure', 100), - StatsRow('sms', 'delivered', 300), - StatsRow('email', 'delivered', 300), - StatsRow('letter', 'created', 300), - ], 0.25), 'only_counts_permanent_failure_as_failed': ([ StatsRow('sms', 'permanent-failure', 100), StatsRow('sms', 'temporary-failure', 100), diff --git a/tests/app/service/test_utils.py b/tests/app/service/test_utils.py index 32a8877e0..b8be1ad78 100644 --- a/tests/app/service/test_utils.py +++ b/tests/app/service/test_utils.py @@ -45,6 +45,5 @@ def test_get_services_with_high_failure_rates(notify_db_session): assert get_services_with_high_failure_rates(start_date, end_date, threshold=3) == [{ 'id': str(service_1.id), - 'name': service_1.name, 'permanent_failure_rate': 0.25 }] From 339b6c0ec7ae92d1e4d2355c1b118bb518d714c8 Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Wed, 4 Dec 2019 14:40:24 +0000 Subject: [PATCH 4/8] Refactor a test so it doesn't do query that's tested elsewhere --- tests/app/celery/test_scheduled_tasks.py | 3 +- tests/app/service/test_utils.py | 55 +++++++++++++----------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index 79143e673..3ebf89b47 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -502,7 +502,6 @@ MockServicesSendingToTVNumbers = namedtuple( 'ServicesSendingToTVNumbers', [ 'service_id', - 'service_name', 'notification_count', ] ) @@ -517,7 +516,7 @@ MockServicesSendingToTVNumbers = namedtuple( ], [ [], - [MockServicesSendingToTVNumbers("123", "Service 1", 300)], + [MockServicesSendingToTVNumbers("123", 300)], "1 service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n" "service id: 123, count of sms to tv numbers: 300,\n" ] diff --git a/tests/app/service/test_utils.py b/tests/app/service/test_utils.py index b8be1ad78..2e107dada 100644 --- a/tests/app/service/test_utils.py +++ b/tests/app/service/test_utils.py @@ -1,7 +1,7 @@ from app.dao.date_util import get_current_financial_year_start_year from freezegun import freeze_time -from tests.app.db import create_service, create_notification, create_template from app.service.utils import get_services_with_high_failure_rates +from collections import namedtuple from datetime import datetime, timedelta @@ -18,32 +18,35 @@ def test_get_current_financial_year_start_year_after_april(): assert current_fy == 2017 +MockServicesNotificationCounts = namedtuple( + 'ServicesSendingToTVNumbers', + [ + 'service_id', + 'status', + 'count', + ] +) + + @freeze_time("2019-12-02 12:00:00.000000") -def test_get_services_with_high_failure_rates(notify_db_session): - service_1 = create_service(service_name="Service 1") - service_3 = create_service(service_name="Service 3", restricted=True) # restricted - service_4 = create_service(service_name="Service 4", research_mode=True) # research mode - service_5 = create_service(service_name="Service 5", active=False) # not active - services = [service_1, service_3, service_4, service_5] - for service in services: - template = create_template(service) - create_notification(template, status="permanent-failure") - for x in range(0, 3): - create_notification(template, status="delivered") - - service_6 = create_service(service_name="Service 6") # notifications too old - with freeze_time("2019-11-30 15:00:00.000000"): - template_6 = create_template(service_6) - for x in range(0, 4): - create_notification(template_6, status="permanent-failure") - - service_2 = create_service(service_name="Service 2") # below threshold - template_2 = create_template(service_2) - create_notification(template_2, status="permanent-failure") +def test_get_services_with_high_failure_rates(mocker, notify_db_session): + mock_query_results = [ + MockServicesNotificationCounts('123', 'delivered', 150), + MockServicesNotificationCounts('123', 'permanent-failure', 50), # these will show up + MockServicesNotificationCounts('456', 'delivered', 150), + MockServicesNotificationCounts('456', 'permanent-failure', 5), # ratio too low + MockServicesNotificationCounts('789', 'permanent-failure', 5), # below threshold + MockServicesNotificationCounts('444', 'delivered', 100), + MockServicesNotificationCounts('444', 'permanent-failure', 100), # these will show up + ] + mocker.patch( + 'app.service.utils.dao_find_real_sms_notification_count_by_status_for_live_services', + return_value=mock_query_results + ) start_date = (datetime.utcnow() - timedelta(days=1)) end_date = datetime.utcnow() - assert get_services_with_high_failure_rates(start_date, end_date, threshold=3) == [{ - 'id': str(service_1.id), - 'permanent_failure_rate': 0.25 - }] + assert get_services_with_high_failure_rates(start_date, end_date) == [ + {'id': '123', 'permanent_failure_rate': 0.25}, + {'id': '444', 'permanent_failure_rate': 0.5} + ] From b8de67ae549fe4549541c5fb662c38ca7c3abccc Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Thu, 5 Dec 2019 11:46:10 +0000 Subject: [PATCH 5/8] Update error message to include a url to offending service --- app/celery/scheduled_tasks.py | 8 +++++--- tests/app/celery/test_scheduled_tasks.py | 10 +++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index d6397754e..7c2a88260 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -273,14 +273,16 @@ def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): len(services_with_failures) ) for service in services_with_failures: - message += "service id: {} failure rate: {},\n".format(service["id"], service["permanent_failure_rate"]) + service_dashboard = current_app.config['ADMIN_BASE_URL'] + "/services/" + service["id"] + message += "service: {} failure rate: {},\n".format(service_dashboard, service["permanent_failure_rate"]) elif services_sending_to_tv_numbers: message += "{} service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n".format( len(services_sending_to_tv_numbers) ) for service in services_sending_to_tv_numbers: - message += "service id: {}, count of sms to tv numbers: {},\n".format( - service.service_id, service.notification_count + service_dashboard = current_app.config['ADMIN_BASE_URL'] + "/services/" + service.service_id + message += "service: {} count of sms to tv numbers: {},\n".format( + service_dashboard, service.notification_count ) if services_with_failures or services_sending_to_tv_numbers: diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index 3ebf89b47..e6703e4d5 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -20,7 +20,7 @@ from app.celery.scheduled_tasks import ( check_for_services_with_high_failure_rates_or_sending_to_tv_numbers, switch_current_sms_provider_on_slow_delivery, ) -from app.config import QueueNames, TaskNames +from app.config import QueueNames, TaskNames, Config from app.dao.jobs_dao import dao_get_job_by_id from app.dao.notifications_dao import dao_get_scheduled_notifications from app.dao.provider_details_dao import get_provider_details_by_identifier @@ -512,13 +512,17 @@ MockServicesSendingToTVNumbers = namedtuple( [{"id": "123", "permanent_failure_rate": 0.3}], [], "1 service(s) have had high permanent-failure rates for sms messages in last " - "24 hours:\nservice id: 123 failure rate: 0.3,\n" + "24 hours:\nservice: {} failure rate: 0.3,\n".format( + Config.ADMIN_BASE_URL + "/services/" + "123" + ) ], [ [], [MockServicesSendingToTVNumbers("123", 300)], "1 service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n" - "service id: 123, count of sms to tv numbers: 300,\n" + "service: {} count of sms to tv numbers: 300,\n".format( + Config.ADMIN_BASE_URL + "/services/" + "123" + ) ] ]) def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers( From 1b7b26bf24136c9843fda664d71d237e31e450b5 Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Thu, 5 Dec 2019 16:07:06 +0000 Subject: [PATCH 6/8] Query directly for services with high failure rate --- app/celery/scheduled_tasks.py | 10 +- app/dao/services_dao.py | 46 +++++-- app/service/statistics.py | 19 --- app/service/utils.py | 22 ---- tests/app/celery/test_scheduled_tasks.py | 11 +- tests/app/dao/test_services_dao.py | 152 ++++++++++------------- tests/app/service/test_statistics.py | 23 +--- tests/app/service/test_utils.py | 37 ------ 8 files changed, 116 insertions(+), 204 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index 7c2a88260..225772537 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -35,7 +35,7 @@ from app.dao.notifications_dao import ( ) from app.dao.provider_details_dao import dao_reduce_sms_provider_priority from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago -from app.dao.services_dao import dao_find_services_sending_to_tv_numbers +from app.dao.services_dao import dao_find_services_sending_to_tv_numbers, dao_find_services_with_high_failure_rates from app.models import ( Job, JOB_STATUS_IN_PROGRESS, @@ -46,8 +46,6 @@ from app.models import ( from app.notifications.process_notifications import send_notification_to_queue from app.v2.errors import JobIncompleteError -from app.service.utils import get_services_with_high_failure_rates - @notify_celery.task(name="run-scheduled-jobs") @statsd(namespace="tasks") @@ -265,7 +263,7 @@ def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): end_date = datetime.utcnow() message = "" - services_with_failures = get_services_with_high_failure_rates(start_date=start_date, end_date=end_date) + services_with_failures = dao_find_services_with_high_failure_rates(start_date=start_date, end_date=end_date) services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers(start_date=start_date, end_date=end_date) if services_with_failures: @@ -273,8 +271,8 @@ def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): len(services_with_failures) ) for service in services_with_failures: - service_dashboard = current_app.config['ADMIN_BASE_URL'] + "/services/" + service["id"] - message += "service: {} failure rate: {},\n".format(service_dashboard, service["permanent_failure_rate"]) + service_dashboard = current_app.config['ADMIN_BASE_URL'] + "/services/" + service.service_id + message += "service: {} failure rate: {},\n".format(service_dashboard, service.permanent_failure_rate) elif services_sending_to_tv_numbers: message += "{} service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n".format( len(services_sending_to_tv_numbers) diff --git a/app/dao/services_dao.py b/app/dao/services_dao.py index 03ddfd22a..76e75f33a 100644 --- a/app/dao/services_dao.py +++ b/app/dao/services_dao.py @@ -4,6 +4,7 @@ from datetime import date, datetime, timedelta from notifications_utils.statsd_decorators import statsd from sqlalchemy.sql.expression import asc, case, and_, func from sqlalchemy.orm import joinedload +from sqlalchemy import cast, Float from flask import current_app from app import db @@ -44,6 +45,7 @@ from app.models import ( KEY_TYPE_TEST, NHS_ORGANISATION_TYPES, NON_CROWN_ORGANISATION_TYPES, + NOTIFICATION_PERMANENT_FAILURE, SMS_TYPE, LETTER_TYPE, ) @@ -544,12 +546,10 @@ def dao_find_services_sending_to_tv_numbers(start_date, end_date, threshold=100) ).all() -def dao_find_real_sms_notification_count_by_status_for_live_services(start_date, end_date): - # only works within services' retention period - return db.session.query( - Notification.service_id.label('service_id'), - Notification.status.label('status'), - func.count(Notification.id).label('count') +def dao_find_services_with_high_failure_rates(start_date, end_date, threshold=100): + subquery = db.session.query( + func.count(Notification.id).label('total_count'), + Notification.service_id.label('service_id') ).filter( Notification.service_id == Service.id, Notification.created_at >= start_date, @@ -561,5 +561,35 @@ def dao_find_real_sms_notification_count_by_status_for_live_services(start_date, Service.active == True, ).group_by( Notification.service_id, - Notification.status - ).all() + ).having( + func.count(Notification.id) >= threshold + ) + + subquery = subquery.subquery() + + query = db.session.query( + Notification.service_id.label('service_id'), + func.count(Notification.id).label('permanent_failure_count'), + subquery.c.total_count.label('total_count'), + (cast(func.count(Notification.id), Float) / cast(subquery.c.total_count, Float)).label('permanent_failure_rate') + ).join( + subquery, + subquery.c.service_id == Notification.service_id + ).filter( + Notification.service_id == Service.id, + Notification.created_at >= start_date, + Notification.created_at <= end_date, + Notification.key_type != KEY_TYPE_TEST, + Notification.notification_type == SMS_TYPE, + Notification.status == NOTIFICATION_PERMANENT_FAILURE, + Service.restricted == False, # noqa + Service.research_mode == False, + Service.active == True, + ).group_by( + Notification.service_id, + subquery.c.total_count + ).having( + cast(func.count(Notification.id), Float) / cast(subquery.c.total_count, Float) >= 0.25 + ) + + return query.all() diff --git a/app/service/statistics.py b/app/service/statistics.py index 3c51ee343..5c6accaa5 100644 --- a/app/service/statistics.py +++ b/app/service/statistics.py @@ -21,18 +21,6 @@ def format_statistics(statistics): return counts -def get_rate_of_permanent_failures_for_service(statistics, threshold=100): - counts = {"permanent_failure": 0, "all_other_statuses": 0} - for row in statistics: - _count_if_status_is_permanent_failure_from_row(counts, row) - - if counts['permanent_failure'] + counts['all_other_statuses'] >= threshold: - rate = counts['permanent_failure'] / (counts['permanent_failure'] + counts['all_other_statuses']) - else: - rate = 0 - return rate - - def format_admin_stats(statistics): counts = create_stats_dict() @@ -106,13 +94,6 @@ def _update_statuses_from_row(update_dict, row): update_dict['failed'] += row.count -def _count_if_status_is_permanent_failure_from_row(update_dict, row): - if row.status == 'permanent-failure': - update_dict['permanent_failure'] += row.count - else: - update_dict['all_other_statuses'] += row.count - - def create_empty_monthly_notification_status_stats_dict(year): utc_month_starts = get_months_for_financial_year(year) # nested dicts - data[month][template type][status] = count diff --git a/app/service/utils.py b/app/service/utils.py index 0b7865928..16521fbbd 100644 --- a/app/service/utils.py +++ b/app/service/utils.py @@ -2,15 +2,11 @@ import itertools from notifications_utils.recipients import allowed_to_send_to -from app.dao.services_dao import dao_find_real_sms_notification_count_by_status_for_live_services - from app.models import ( ServiceWhitelist, MOBILE_TYPE, EMAIL_TYPE, KEY_TYPE_TEST, KEY_TYPE_TEAM, KEY_TYPE_NORMAL) -from app.service import statistics - def get_recipients_from_request(request_json, key, type): return [(type, recipient) for recipient in request_json.get(key)] @@ -56,21 +52,3 @@ def service_allowed_to_send_to(recipient, service, key_type, allow_whitelisted_r whitelist_members ) ) - - -def get_services_with_high_failure_rates(start_date, end_date, rate=0.25, threshold=100): - stats = dao_find_real_sms_notification_count_by_status_for_live_services( - start_date=start_date, - end_date=end_date, - ) - results = [] - for service_id, rows in itertools.groupby(stats, lambda x: x.service_id): - rows = list(rows) - - permanent_failure_rate = statistics.get_rate_of_permanent_failures_for_service(rows, threshold=threshold) - if permanent_failure_rate >= rate: - results.append({ - 'id': str(rows[0].service_id), - 'permanent_failure_rate': permanent_failure_rate - }) - return results diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index e6703e4d5..cd4848c4d 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -505,11 +505,18 @@ MockServicesSendingToTVNumbers = namedtuple( 'notification_count', ] ) +MockServicesWithHighFailureRate = namedtuple( + 'ServicesWithHighFailureRate', + [ + 'service_id', + 'permanent_failure_rate', + ] +) @pytest.mark.parametrize("failure_rates, sms_to_tv_numbers, expected_message", [ [ - [{"id": "123", "permanent_failure_rate": 0.3}], + [MockServicesWithHighFailureRate("123", 0.3)], [], "1 service(s) have had high permanent-failure rates for sms messages in last " "24 hours:\nservice: {} failure rate: 0.3,\n".format( @@ -531,7 +538,7 @@ def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers( mock_logger = mocker.patch('app.celery.tasks.current_app.logger.exception') mock_create_ticket = mocker.patch('app.celery.scheduled_tasks.zendesk_client.create_ticket') mock_failure_rates = mocker.patch( - 'app.celery.scheduled_tasks.get_services_with_high_failure_rates', return_value=failure_rates + 'app.celery.scheduled_tasks.dao_find_services_with_high_failure_rates', return_value=failure_rates ) mock_sms_to_tv_numbers = mocker.patch( 'app.celery.scheduled_tasks.dao_find_services_sending_to_tv_numbers', return_value=sms_to_tv_numbers diff --git a/tests/app/dao/test_services_dao.py b/tests/app/dao/test_services_dao.py index df03a7a6f..5769f94f4 100644 --- a/tests/app/dao/test_services_dao.py +++ b/tests/app/dao/test_services_dao.py @@ -8,79 +8,49 @@ from sqlalchemy.exc import IntegrityError from sqlalchemy.orm.exc import NoResultFound from app import db -from app.dao.inbound_numbers_dao import ( - dao_set_inbound_number_to_service, - dao_get_available_inbound_numbers, - dao_set_inbound_number_active_flag -) +from app.dao.inbound_numbers_dao import (dao_get_available_inbound_numbers, + dao_set_inbound_number_active_flag, + dao_set_inbound_number_to_service) from app.dao.organisation_dao import dao_add_service_to_organisation -from app.dao.service_permissions_dao import dao_add_service_permission, dao_remove_service_permission -from app.dao.services_dao import ( - dao_create_service, - dao_add_user_to_service, - dao_remove_user_from_service, - dao_fetch_all_services, - dao_fetch_live_services_data, - dao_fetch_service_by_id, - dao_fetch_all_services_by_user, - dao_find_real_sms_notification_count_by_status_for_live_services, - dao_find_services_sending_to_tv_numbers, - dao_update_service, - delete_service_and_all_associated_db_objects, - dao_fetch_stats_for_service, - dao_fetch_todays_stats_for_service, - fetch_todays_total_message_count, - dao_fetch_todays_stats_for_all_services, - dao_suspend_service, - dao_resume_service, - dao_fetch_active_users_for_service, - dao_fetch_service_by_inbound_number, - get_services_by_partial_name, -) -from app.dao.service_user_dao import dao_get_service_user, dao_update_service_user -from app.dao.users_dao import save_model_user, create_user_code -from app.models import ( - VerifyCode, - ApiKey, - Template, - TemplateHistory, - Job, - Notification, - NotificationHistory, - Permission, - User, - InvitedUser, - Service, - ServicePermission, - ServiceUser, - KEY_TYPE_NORMAL, - KEY_TYPE_TEAM, - KEY_TYPE_TEST, - EMAIL_TYPE, - SMS_TYPE, - INTERNATIONAL_SMS_TYPE, - LETTER_TYPE, - user_folder_permissions, - Organisation -) -from tests.app.db import ( - create_ft_billing, - create_inbound_number, - create_organisation, - create_user, - create_service, - create_service_with_inbound_number, - create_service_with_defined_sms_sender, - create_template, - create_template_folder, - create_notification, - create_api_key, - create_invited_user, - create_email_branding, - create_letter_branding, - create_notification_history, - create_annual_billing, -) +from app.dao.service_permissions_dao import (dao_add_service_permission, + dao_remove_service_permission) +from app.dao.service_user_dao import (dao_get_service_user, + dao_update_service_user) +from app.dao.services_dao import (dao_add_user_to_service, dao_create_service, + dao_fetch_active_users_for_service, + dao_fetch_all_services, + dao_fetch_all_services_by_user, + dao_fetch_live_services_data, + dao_fetch_service_by_id, + dao_fetch_service_by_inbound_number, + dao_fetch_stats_for_service, + dao_fetch_todays_stats_for_all_services, + dao_fetch_todays_stats_for_service, + dao_find_services_sending_to_tv_numbers, + dao_find_services_with_high_failure_rates, + dao_remove_user_from_service, + dao_resume_service, dao_suspend_service, + dao_update_service, + delete_service_and_all_associated_db_objects, + fetch_todays_total_message_count, + get_services_by_partial_name) +from app.dao.users_dao import create_user_code, save_model_user +from app.models import (EMAIL_TYPE, INTERNATIONAL_SMS_TYPE, KEY_TYPE_NORMAL, + KEY_TYPE_TEAM, KEY_TYPE_TEST, LETTER_TYPE, SMS_TYPE, + ApiKey, InvitedUser, Job, Notification, + NotificationHistory, Organisation, Permission, Service, + ServicePermission, ServiceUser, Template, + TemplateHistory, User, VerifyCode, + user_folder_permissions) +from tests.app.db import (create_annual_billing, create_api_key, + create_email_branding, create_ft_billing, + create_inbound_number, create_invited_user, + create_letter_branding, create_notification, + create_notification_history, create_organisation, + create_service, + create_service_with_defined_sms_sender, + create_service_with_inbound_number, create_template, + create_template_folder, create_user) def test_should_have_decorated_services_dao_functions(): @@ -1106,9 +1076,9 @@ def create_email_sms_letter_template(): @freeze_time("2019-12-02 12:00:00.000000") def test_dao_find_services_sending_to_tv_numbers(notify_db_session, fake_uuid): service_1 = create_service(service_name="Service 1", service_id=fake_uuid) - service_3 = create_service(service_name="Service 3", restricted=True) # restricted - service_4 = create_service(service_name="Service 4", research_mode=True) # research mode - service_5 = create_service(service_name="Service 5", active=False) # not active + service_3 = create_service(service_name="Service 3", restricted=True) # restricted is excluded + service_4 = create_service(service_name="Service 4", research_mode=True) # research mode is excluded + service_5 = create_service(service_name="Service 5", active=False) # not active is excluded services = [service_1, service_3, service_4, service_5] tv_number = "447700900001" @@ -1120,13 +1090,13 @@ def test_dao_find_services_sending_to_tv_numbers(notify_db_session, fake_uuid): for x in range(0, 5): create_notification(template, normalised_to=tv_number, status="permanent-failure") - service_6 = create_service(service_name="Service 6") # notifications too old + service_6 = create_service(service_name="Service 6") # notifications too old are excluded with freeze_time("2019-11-30 15:00:00.000000"): template_6 = create_template(service_6) for x in range(0, 5): create_notification(template_6, normalised_to=tv_number, status="permanent-failure") - service_2 = create_service(service_name="Service 2") # below threshold + service_2 = create_service(service_name="Service 2") # below threshold is excluded template_2 = create_template(service_2) create_notification(template_2, normalised_to=tv_number, status="permanent-failure") for x in range(0, 5): @@ -1141,11 +1111,11 @@ def test_dao_find_services_sending_to_tv_numbers(notify_db_session, fake_uuid): assert str(result[0].service_id) == fake_uuid -def test_dao_find_real_sms_notification_count_by_status_for_live_services(notify_db_session, fake_uuid): +def test_dao_find_services_with_high_failure_rates(notify_db_session, fake_uuid): service_1 = create_service(service_name="Service 1", service_id=fake_uuid) - service_3 = create_service(service_name="Service 3", restricted=True) # restricted - service_4 = create_service(service_name="Service 4", research_mode=True) # research mode - service_5 = create_service(service_name="Service 5", active=False) # not active + service_3 = create_service(service_name="Service 3", restricted=True) # restricted is excluded + service_4 = create_service(service_name="Service 4", research_mode=True) # research mode is excluded + service_5 = create_service(service_name="Service 5", active=False) # not active is excluded services = [service_1, service_3, service_4, service_5] for service in services: @@ -1153,21 +1123,27 @@ def test_dao_find_real_sms_notification_count_by_status_for_live_services(notify for x in range(0, 3): create_notification(template, status="permanent-failure") create_notification(template, status="delivered") + create_notification(template, status="sending") create_notification(template, status="temporary-failure") - service_6 = create_service(service_name="Service 6") # notifications too old + service_6 = create_service(service_name="Service 6") with freeze_time("2019-11-30 15:00:00.000000"): template_6 = create_template(service_6) - for x in range(0, 2): - create_notification(template_6, status="permanent-failure") + for x in range(0, 4): + create_notification(template_6, status="permanent-failure") # notifications too old are excluded - service_2 = create_service(service_name="Service 2") # below threshold + service_2 = create_service(service_name="Service 2") template_2 = create_template(service_2) - create_notification(template_2, status="permanent-failure", key_type='test') # test key type + for x in range(0, 4): + create_notification(template_2, status="permanent-failure", key_type='test') # test key type is excluded + create_notification(template_2, status="permanent-failure") # below threshold is excluded start_date = (datetime.utcnow() - timedelta(days=1)) end_date = datetime.utcnow() - result = dao_find_real_sms_notification_count_by_status_for_live_services(start_date, end_date) - assert len(result) == 3 + result = dao_find_services_with_high_failure_rates(start_date, end_date, threshold=3) + # assert len(result) == 3 + # assert str(result[0].service_id) == fake_uuid + assert len(result) == 1 assert str(result[0].service_id) == fake_uuid + assert result[0].permanent_failure_rate == 0.25 diff --git a/tests/app/service/test_statistics.py b/tests/app/service/test_statistics.py index 90947f1ae..ba40dc796 100644 --- a/tests/app/service/test_statistics.py +++ b/tests/app/service/test_statistics.py @@ -11,8 +11,7 @@ from app.service.statistics import ( create_stats_dict, create_zeroed_stats_dicts, format_admin_stats, - format_statistics, - get_rate_of_permanent_failures_for_service + format_statistics ) StatsRow = collections.namedtuple('row', ('notification_type', 'status', 'count')) @@ -74,26 +73,6 @@ def test_format_statistics(stats, email_counts, sms_counts, letter_counts): } -@pytest.mark.idparametrize("statistics, expected_result", { - 'counts_rate_for_sms': ([ - StatsRow('sms', 'permanent-failure', 100), - StatsRow('sms', 'delivered', 300), - ], 0.25), - 'only_counts_permanent_failure_as_failed': ([ - StatsRow('sms', 'permanent-failure', 100), - StatsRow('sms', 'temporary-failure', 100), - StatsRow('sms', 'delivered', 300), - ], 0.2), - 'below_threshold': ([ - StatsRow('sms', 'permanent-failure', 5), - StatsRow('sms', 'delivered', 3), - ], 0), -}) -def test_get_rate_of_permanent_failures_for_service(statistics, expected_result): - rate = get_rate_of_permanent_failures_for_service(statistics) - assert rate == expected_result - - def test_create_zeroed_stats_dicts(): assert create_zeroed_stats_dicts() == { 'sms': {'requested': 0, 'delivered': 0, 'failed': 0}, diff --git a/tests/app/service/test_utils.py b/tests/app/service/test_utils.py index 2e107dada..71b2770bb 100644 --- a/tests/app/service/test_utils.py +++ b/tests/app/service/test_utils.py @@ -1,8 +1,5 @@ from app.dao.date_util import get_current_financial_year_start_year from freezegun import freeze_time -from app.service.utils import get_services_with_high_failure_rates -from collections import namedtuple -from datetime import datetime, timedelta # see get_financial_year for conversion of financial years. @@ -16,37 +13,3 @@ def test_get_current_financial_year_start_year_before_march(): def test_get_current_financial_year_start_year_after_april(): current_fy = get_current_financial_year_start_year() assert current_fy == 2017 - - -MockServicesNotificationCounts = namedtuple( - 'ServicesSendingToTVNumbers', - [ - 'service_id', - 'status', - 'count', - ] -) - - -@freeze_time("2019-12-02 12:00:00.000000") -def test_get_services_with_high_failure_rates(mocker, notify_db_session): - mock_query_results = [ - MockServicesNotificationCounts('123', 'delivered', 150), - MockServicesNotificationCounts('123', 'permanent-failure', 50), # these will show up - MockServicesNotificationCounts('456', 'delivered', 150), - MockServicesNotificationCounts('456', 'permanent-failure', 5), # ratio too low - MockServicesNotificationCounts('789', 'permanent-failure', 5), # below threshold - MockServicesNotificationCounts('444', 'delivered', 100), - MockServicesNotificationCounts('444', 'permanent-failure', 100), # these will show up - ] - mocker.patch( - 'app.service.utils.dao_find_real_sms_notification_count_by_status_for_live_services', - return_value=mock_query_results - ) - start_date = (datetime.utcnow() - timedelta(days=1)) - end_date = datetime.utcnow() - - assert get_services_with_high_failure_rates(start_date, end_date) == [ - {'id': '123', 'permanent_failure_rate': 0.25}, - {'id': '444', 'permanent_failure_rate': 0.5} - ] From 87bc86efa736641e7e96f8f8d1fe9a0837649052 Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Thu, 5 Dec 2019 17:31:31 +0000 Subject: [PATCH 7/8] Reference dev runbook for instructions in the zendesk ticket --- app/celery/scheduled_tasks.py | 3 ++- tests/app/celery/test_scheduled_tasks.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index 225772537..d8482924e 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -287,7 +287,8 @@ def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(): current_app.logger.exception(message) if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: - message += "\nThings to do: contact service? revoke their key?" + message += "\nYou can find instructions for this ticket in our manual:\n" + "https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-services-with-high-failure-rates-or-sending-sms-to-tv-numbers" # noqa zendesk_client.create_ticket( subject="[{}] High failure rates for sms spotted for services".format( current_app.config['NOTIFY_ENVIRONMENT'] diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index cd4848c4d..ed52a9f6a 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -544,7 +544,8 @@ def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers( 'app.celery.scheduled_tasks.dao_find_services_sending_to_tv_numbers', return_value=sms_to_tv_numbers ) - zendesk_actions = "\nThings to do: contact service? revoke their key?" + zendesk_actions = "\nYou can find instructions for this ticket in our manual:\n" + "https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-services-with-high-failure-rates-or-sending-sms-to-tv-numbers" # noqa check_for_services_with_high_failure_rates_or_sending_to_tv_numbers() From 08b12a644369ec302c7ec24347586f0303ae221e Mon Sep 17 00:00:00 2001 From: Pea Tyczynska Date: Fri, 6 Dec 2019 12:03:23 +0000 Subject: [PATCH 8/8] Test that test key notifications are excluded form tv numbers query --- tests/app/dao/test_services_dao.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/app/dao/test_services_dao.py b/tests/app/dao/test_services_dao.py index 5769f94f4..9f36c4b7c 100644 --- a/tests/app/dao/test_services_dao.py +++ b/tests/app/dao/test_services_dao.py @@ -1100,6 +1100,10 @@ def test_dao_find_services_sending_to_tv_numbers(notify_db_session, fake_uuid): template_2 = create_template(service_2) create_notification(template_2, normalised_to=tv_number, status="permanent-failure") for x in range(0, 5): + # test key type is excluded + create_notification(template_2, normalised_to=tv_number, status="permanent-failure", key_type='test') + for x in range(0, 5): + # normal numbers are not counted by the query create_notification(template_2, normalised_to=normal_number, status="delivered") create_notification(template_2, normalised_to=normal_number_resembling_tv_number, status="delivered")