Merge pull request #2667 from alphagov/warn-team-about-high-failure-rates

Warn team about high failure rates
This commit is contained in:
Pea M. Tyczynska
2019-12-09 11:28:25 +00:00
committed by GitHub
6 changed files with 311 additions and 76 deletions

View File

@@ -35,6 +35,7 @@ from app.dao.notifications_dao import (
)
from app.dao.provider_details_dao import dao_reduce_sms_provider_priority
from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago
from app.dao.services_dao import dao_find_services_sending_to_tv_numbers, dao_find_services_with_high_failure_rates
from app.models import (
Job,
JOB_STATUS_IN_PROGRESS,
@@ -253,3 +254,45 @@ def check_for_missing_rows_in_completed_jobs():
current_app.logger.info(
"Processing missing row: {} for job: {}".format(row_to_process.missing_row, job.id))
process_row(row, template, job, job.service, sender_id=sender_id)
@notify_celery.task(name='check-for-services-with-high-failure-rates-or-sending-to-tv-numbers')
@statsd(namespace="tasks")
def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers():
    """Report services with suspicious sms traffic over the last 24 hours.

    Asks the services DAO for services with a high permanent-failure rate and
    for services sending sms to TV numbers. When either list is non-empty the
    findings are logged and, in the live, production and test environments,
    a Zendesk incident ticket is raised. Both kinds of finding are reported
    in the same message when both are present.
    """
    # Take "now" once so the window is exactly 24 hours wide.
    end_date = datetime.utcnow()
    start_date = end_date - timedelta(days=1)

    services_with_failures = dao_find_services_with_high_failure_rates(start_date=start_date, end_date=end_date)
    services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers(start_date=start_date, end_date=end_date)

    if not (services_with_failures or services_sending_to_tv_numbers):
        return

    def service_dashboard_url(service_id):
        # format() stringifies the id, so this works for UUID objects as well as strings.
        return "{}/services/{}".format(current_app.config['ADMIN_BASE_URL'], service_id)

    message = ""

    if services_with_failures:
        message += "{} service(s) have had high permanent-failure rates for sms messages in last 24 hours:\n".format(
            len(services_with_failures)
        )
        for service in services_with_failures:
            message += "service: {} failure rate: {},\n".format(
                service_dashboard_url(service.service_id), service.permanent_failure_rate
            )

    # Deliberately a separate `if`: TV-number findings must be reported even
    # when high-failure services were also found.
    if services_sending_to_tv_numbers:
        message += "{} service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n".format(
            len(services_sending_to_tv_numbers)
        )
        for service in services_sending_to_tv_numbers:
            message += "service: {} count of sms to tv numbers: {},\n".format(
                service_dashboard_url(service.service_id), service.notification_count
            )

    # NOTE(review): logger.exception is called outside an `except` block, so
    # there is no active exception to attach. Left unchanged because callers
    # and tests rely on this method being used - confirm whether error() is wanted.
    current_app.logger.exception(message)

    if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']:
        # Parentheses are required: without them the URL literal is a separate
        # no-op statement and never reaches the ticket.
        message += (
            "\nYou can find instructions for this ticket in our manual:\n"
            "https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-services-with-high-failure-rates-or-sending-sms-to-tv-numbers"  # noqa
        )
        zendesk_client.create_ticket(
            subject="[{}] High failure rates for sms spotted for services".format(
                current_app.config['NOTIFY_ENVIRONMENT']
            ),
            message=message,
            ticket_type=zendesk_client.TYPE_INCIDENT
        )

View File

@@ -276,6 +276,11 @@ class Config(object):
'schedule': crontab(day_of_week='mon-fri', hour='9,15', minute=0),
'options': {'queue': QueueNames.PERIODIC}
},
# Scheduled for 10:30 on Monday to Friday (see the crontab below); runs on the periodic queue.
'check-for-services-with-high-failure-rates-or-sending-to-tv-numbers': {
'task': 'check-for-services-with-high-failure-rates-or-sending-to-tv-numbers',
'schedule': crontab(day_of_week='mon-fri', hour=10, minute=30),
'options': {'queue': QueueNames.PERIODIC}
},
'raise-alert-if-letter-notifications-still-sending': {
'task': 'raise-alert-if-letter-notifications-still-sending',
'schedule': crontab(hour=16, minute=30),

View File

@@ -4,6 +4,7 @@ from datetime import date, datetime, timedelta
from notifications_utils.statsd_decorators import statsd
from sqlalchemy.sql.expression import asc, case, and_, func
from sqlalchemy.orm import joinedload
from sqlalchemy import cast, Float
from flask import current_app
from app import db
@@ -44,6 +45,7 @@ from app.models import (
KEY_TYPE_TEST,
NHS_ORGANISATION_TYPES,
NON_CROWN_ORGANISATION_TYPES,
NOTIFICATION_PERMANENT_FAILURE,
SMS_TYPE,
LETTER_TYPE,
)
@@ -521,3 +523,73 @@ def dao_fetch_active_users_for_service(service_id):
)
return query.all()
def dao_find_services_sending_to_tv_numbers(start_date, end_date, threshold=100):
    """Return services that sent more than ``threshold`` sms to TV numbers.

    A notification counts when it is an sms created between ``start_date``
    and ``end_date`` (inclusive), was not sent with a test key, and the seven
    characters of ``normalised_to`` starting at position 3 are ``7700900``.
    Only services that are active, not restricted and not in research mode
    are considered.

    Each returned row exposes ``service_id`` and ``notification_count``.
    """
    sms_count = func.count(Notification.id)
    sent_to_tv_number = func.substr(Notification.normalised_to, 3, 7) == '7700900'

    query = db.session.query(
        Notification.service_id.label('service_id'),
        sms_count.label('notification_count'),
    )
    query = query.filter(
        Notification.service_id == Service.id,
        Notification.created_at >= start_date,
        Notification.created_at <= end_date,
        Notification.key_type != KEY_TYPE_TEST,
        Notification.notification_type == SMS_TYPE,
        sent_to_tv_number,
        Service.restricted == False,  # noqa
        Service.research_mode == False,  # noqa
        Service.active == True,  # noqa
    )
    query = query.group_by(Notification.service_id)
    # Strictly greater than: exactly `threshold` notifications is not reported.
    query = query.having(sms_count > threshold)
    return query.all()
def dao_find_services_with_high_failure_rates(start_date, end_date, threshold=100, failure_rate_threshold=0.25):
    """Return services whose sms permanent-failure rate is at or above a limit.

    Only sms notifications created between ``start_date`` and ``end_date``
    (inclusive) and not sent with a test key are counted, and only for
    services that are active, not restricted and not in research mode.

    :param start_date: inclusive lower bound on ``Notification.created_at``
    :param end_date: inclusive upper bound on ``Notification.created_at``
    :param threshold: minimum number of matching sms a service must have sent
        in the window to be considered at all
    :param failure_rate_threshold: minimum permanent-failure rate
        (failures / total, between 0 and 1) for a service to be returned
    :return: rows exposing ``service_id``, ``permanent_failure_count``,
        ``total_count`` and ``permanent_failure_rate``
    """
    # Conditions shared by the "total sent" subquery and the failure query.
    live_sms_in_window = [
        Notification.service_id == Service.id,
        Notification.created_at >= start_date,
        Notification.created_at <= end_date,
        Notification.key_type != KEY_TYPE_TEST,
        Notification.notification_type == SMS_TYPE,
        Service.restricted == False,  # noqa
        Service.research_mode == False,  # noqa
        Service.active == True,  # noqa
    ]

    # Total sms per service, keeping only services that sent at least `threshold`.
    totals = db.session.query(
        func.count(Notification.id).label('total_count'),
        Notification.service_id.label('service_id')
    ).filter(
        *live_sms_in_window
    ).group_by(
        Notification.service_id,
    ).having(
        func.count(Notification.id) >= threshold
    ).subquery()

    failure_count = func.count(Notification.id)
    # Cast both sides to Float so the division is not an integer division.
    failure_rate = cast(failure_count, Float) / cast(totals.c.total_count, Float)

    query = db.session.query(
        Notification.service_id.label('service_id'),
        failure_count.label('permanent_failure_count'),
        totals.c.total_count.label('total_count'),
        failure_rate.label('permanent_failure_rate')
    ).join(
        totals,
        totals.c.service_id == Notification.service_id
    ).filter(
        *live_sms_in_window,
        Notification.status == NOTIFICATION_PERMANENT_FAILURE
    ).group_by(
        Notification.service_id,
        totals.c.total_count
    ).having(
        failure_rate >= failure_rate_threshold
    )

    return query.all()

View File

@@ -2,6 +2,7 @@ from datetime import datetime, timedelta
from unittest.mock import call
import pytest
from collections import namedtuple
from freezegun import freeze_time
from mock import mock
@@ -16,9 +17,10 @@ from app.celery.scheduled_tasks import (
check_precompiled_letter_state,
check_templated_letter_state,
check_for_missing_rows_in_completed_jobs,
check_for_services_with_high_failure_rates_or_sending_to_tv_numbers,
switch_current_sms_provider_on_slow_delivery,
)
from app.config import QueueNames, TaskNames
from app.config import QueueNames, TaskNames, Config
from app.dao.jobs_dao import dao_get_job_by_id
from app.dao.notifications_dao import dao_get_scheduled_notifications
from app.dao.provider_details_dao import get_provider_details_by_identifier
@@ -494,3 +496,64 @@ def test_check_for_missing_rows_in_completed_jobs_uses_sender_id(mocker, sample_
mock_process_row.assert_called_once_with(
mock.ANY, mock.ANY, job, job.service, sender_id=fake_uuid
)
# Lightweight stand-ins for the rows the mocked services DAO functions return:
# only the attributes read by the task under test are provided.
MockServicesSendingToTVNumbers = namedtuple(
    'ServicesSendingToTVNumbers',
    ('service_id', 'notification_count'),
)
MockServicesWithHighFailureRate = namedtuple(
    'ServicesWithHighFailureRate',
    ('service_id', 'permanent_failure_rate'),
)
@pytest.mark.parametrize("failure_rates, sms_to_tv_numbers, expected_message", [
    [
        [MockServicesWithHighFailureRate("123", 0.3)],
        [],
        "1 service(s) have had high permanent-failure rates for sms messages in last "
        "24 hours:\nservice: {} failure rate: 0.3,\n".format(
            Config.ADMIN_BASE_URL + "/services/" + "123"
        )
    ],
    [
        [],
        [MockServicesSendingToTVNumbers("123", 300)],
        "1 service(s) have sent over 100 sms messages to tv numbers in last 24 hours:\n"
        "service: {} count of sms to tv numbers: 300,\n".format(
            Config.ADMIN_BASE_URL + "/services/" + "123"
        )
    ]
])
def test_check_for_services_with_high_failure_rates_or_sending_to_tv_numbers(
    mocker, notify_db_session, failure_rates, sms_to_tv_numbers, expected_message
):
    """The task logs the findings and raises a Zendesk incident that links to the runbook."""
    # Patch through the module under test rather than an unrelated tasks module.
    mock_logger = mocker.patch('app.celery.scheduled_tasks.current_app.logger.exception')
    mock_create_ticket = mocker.patch('app.celery.scheduled_tasks.zendesk_client.create_ticket')
    mock_failure_rates = mocker.patch(
        'app.celery.scheduled_tasks.dao_find_services_with_high_failure_rates', return_value=failure_rates
    )
    mock_sms_to_tv_numbers = mocker.patch(
        'app.celery.scheduled_tasks.dao_find_services_sending_to_tv_numbers', return_value=sms_to_tv_numbers
    )

    # Parenthesised so both literals are concatenated; without the parentheses
    # the URL line is a separate no-op statement and is not part of the value.
    zendesk_actions = (
        "\nYou can find instructions for this ticket in our manual:\n"
        "https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-services-with-high-failure-rates-or-sending-sms-to-tv-numbers"  # noqa
    )

    check_for_services_with_high_failure_rates_or_sending_to_tv_numbers()

    assert mock_failure_rates.called
    assert mock_sms_to_tv_numbers.called
    # The log line carries only the findings; the runbook link is ticket-only.
    mock_logger.assert_called_once_with(expected_message)
    mock_create_ticket.assert_called_with(
        message=expected_message + zendesk_actions,
        subject="[test] High failure rates for sms spotted for services",
        ticket_type='incident'
    )

View File

@@ -1,5 +1,5 @@
import uuid
from datetime import datetime
from datetime import datetime, timedelta
from unittest import mock
import pytest
@@ -8,77 +8,49 @@ from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm.exc import NoResultFound
from app import db
from app.dao.inbound_numbers_dao import (
dao_set_inbound_number_to_service,
dao_get_available_inbound_numbers,
dao_set_inbound_number_active_flag
)
from app.dao.inbound_numbers_dao import (dao_get_available_inbound_numbers,
dao_set_inbound_number_active_flag,
dao_set_inbound_number_to_service)
from app.dao.organisation_dao import dao_add_service_to_organisation
from app.dao.service_permissions_dao import dao_add_service_permission, dao_remove_service_permission
from app.dao.services_dao import (
dao_create_service,
dao_add_user_to_service,
dao_remove_user_from_service,
dao_fetch_all_services,
dao_fetch_live_services_data,
dao_fetch_service_by_id,
dao_fetch_all_services_by_user,
dao_update_service,
delete_service_and_all_associated_db_objects,
dao_fetch_stats_for_service,
dao_fetch_todays_stats_for_service,
fetch_todays_total_message_count,
dao_fetch_todays_stats_for_all_services,
dao_suspend_service,
dao_resume_service,
dao_fetch_active_users_for_service,
dao_fetch_service_by_inbound_number,
get_services_by_partial_name,
)
from app.dao.service_user_dao import dao_get_service_user, dao_update_service_user
from app.dao.users_dao import save_model_user, create_user_code
from app.models import (
VerifyCode,
ApiKey,
Template,
TemplateHistory,
Job,
Notification,
NotificationHistory,
Permission,
User,
InvitedUser,
Service,
ServicePermission,
ServiceUser,
KEY_TYPE_NORMAL,
KEY_TYPE_TEAM,
KEY_TYPE_TEST,
EMAIL_TYPE,
SMS_TYPE,
INTERNATIONAL_SMS_TYPE,
LETTER_TYPE,
user_folder_permissions,
Organisation
)
from tests.app.db import (
create_ft_billing,
create_inbound_number,
create_organisation,
create_user,
create_service,
create_service_with_inbound_number,
create_service_with_defined_sms_sender,
create_template,
create_template_folder,
create_notification,
create_api_key,
create_invited_user,
create_email_branding,
create_letter_branding,
create_notification_history,
create_annual_billing,
)
from app.dao.service_permissions_dao import (dao_add_service_permission,
dao_remove_service_permission)
from app.dao.service_user_dao import (dao_get_service_user,
dao_update_service_user)
from app.dao.services_dao import (dao_add_user_to_service, dao_create_service,
dao_fetch_active_users_for_service,
dao_fetch_all_services,
dao_fetch_all_services_by_user,
dao_fetch_live_services_data,
dao_fetch_service_by_id,
dao_fetch_service_by_inbound_number,
dao_fetch_stats_for_service,
dao_fetch_todays_stats_for_all_services,
dao_fetch_todays_stats_for_service,
dao_find_services_sending_to_tv_numbers,
dao_find_services_with_high_failure_rates,
dao_remove_user_from_service,
dao_resume_service, dao_suspend_service,
dao_update_service,
delete_service_and_all_associated_db_objects,
fetch_todays_total_message_count,
get_services_by_partial_name)
from app.dao.users_dao import create_user_code, save_model_user
from app.models import (EMAIL_TYPE, INTERNATIONAL_SMS_TYPE, KEY_TYPE_NORMAL,
KEY_TYPE_TEAM, KEY_TYPE_TEST, LETTER_TYPE, SMS_TYPE,
ApiKey, InvitedUser, Job, Notification,
NotificationHistory, Organisation, Permission, Service,
ServicePermission, ServiceUser, Template,
TemplateHistory, User, VerifyCode,
user_folder_permissions)
from tests.app.db import (create_annual_billing, create_api_key,
create_email_branding, create_ft_billing,
create_inbound_number, create_invited_user,
create_letter_branding, create_notification,
create_notification_history, create_organisation,
create_service,
create_service_with_defined_sms_sender,
create_service_with_inbound_number, create_template,
create_template_folder, create_user)
def test_should_have_decorated_services_dao_functions():
@@ -1101,3 +1073,83 @@ def create_email_sms_letter_template():
template_two = create_template(service=service, template_name='2', template_type='sms')
template_three = create_template(service=service, template_name='3', template_type='letter')
return template_one, template_three, template_two
@freeze_time("2019-12-02 12:00:00.000000")
def test_dao_find_services_sending_to_tv_numbers(notify_db_session, fake_uuid):
    """Only the live service that exceeded the threshold inside the window is returned."""
    tv_number = "447700900001"
    normal_number = "447711900001"
    normal_number_resembling_tv_number = "447227700900"

    # Service 1 is the only one expected back; the other three each break one service-level rule.
    service_1 = create_service(service_name="Service 1", service_id=fake_uuid)
    restricted_service = create_service(service_name="Service 3", restricted=True)  # restricted is excluded
    research_service = create_service(service_name="Service 4", research_mode=True)  # research mode is excluded
    inactive_service = create_service(service_name="Service 5", active=False)  # not active is excluded

    for current_service in (service_1, restricted_service, research_service, inactive_service):
        current_template = create_template(current_service)
        for _ in range(5):
            create_notification(current_template, normalised_to=tv_number, status="permanent-failure")

    # Notifications created more than a day before the frozen "now" are excluded.
    stale_service = create_service(service_name="Service 6")
    with freeze_time("2019-11-30 15:00:00.000000"):
        stale_template = create_template(stale_service)
        for _ in range(5):
            create_notification(stale_template, normalised_to=tv_number, status="permanent-failure")

    # Only one countable TV-number sms, so this service stays below the threshold.
    quiet_service = create_service(service_name="Service 2")
    quiet_template = create_template(quiet_service)
    create_notification(quiet_template, normalised_to=tv_number, status="permanent-failure")
    for _ in range(5):
        # test key type is excluded
        create_notification(quiet_template, normalised_to=tv_number, status="permanent-failure", key_type='test')
    for _ in range(5):
        # normal numbers are not counted by the query
        create_notification(quiet_template, normalised_to=normal_number, status="delivered")
        create_notification(quiet_template, normalised_to=normal_number_resembling_tv_number, status="delivered")

    window_start = datetime.utcnow() - timedelta(days=1)
    window_end = datetime.utcnow()

    result = dao_find_services_sending_to_tv_numbers(window_start, window_end, threshold=4)

    assert len(result) == 1
    assert str(result[0].service_id) == fake_uuid
@freeze_time("2019-12-02 12:00:00.000000")
def test_dao_find_services_with_high_failure_rates(notify_db_session, fake_uuid):
    """Only the live service with enough sms and a 25% permanent-failure rate is returned."""
    service_1 = create_service(service_name="Service 1", service_id=fake_uuid)
    service_3 = create_service(service_name="Service 3", restricted=True)  # restricted is excluded
    service_4 = create_service(service_name="Service 4", research_mode=True)  # research mode is excluded
    service_5 = create_service(service_name="Service 5", active=False)  # not active is excluded
    services = [service_1, service_3, service_4, service_5]

    # 3 rounds of 4 notifications each: 3 permanent failures out of 12 = 0.25.
    for service in services:
        template = create_template(service)
        for _ in range(3):
            create_notification(template, status="permanent-failure")
            create_notification(template, status="delivered")
            create_notification(template, status="sending")
            create_notification(template, status="temporary-failure")

    service_6 = create_service(service_name="Service 6")
    with freeze_time("2019-11-30 15:00:00.000000"):
        template_6 = create_template(service_6)
        for _ in range(4):
            create_notification(template_6, status="permanent-failure")  # notifications too old are excluded

    service_2 = create_service(service_name="Service 2")
    template_2 = create_template(service_2)
    for _ in range(4):
        create_notification(template_2, status="permanent-failure", key_type='test')  # test key type is excluded
    create_notification(template_2, status="permanent-failure")  # below threshold is excluded

    start_date = (datetime.utcnow() - timedelta(days=1))
    end_date = datetime.utcnow()

    result = dao_find_services_with_high_failure_rates(start_date, end_date, threshold=3)

    assert len(result) == 1
    assert str(result[0].service_id) == fake_uuid
    assert result[0].permanent_failure_rate == 0.25

View File

@@ -6,12 +6,12 @@ from freezegun import freeze_time
import pytest
from app.service.statistics import (
format_admin_stats,
format_statistics,
add_monthly_notification_status_stats,
create_empty_monthly_notification_status_stats_dict,
create_stats_dict,
create_zeroed_stats_dicts,
create_empty_monthly_notification_status_stats_dict,
add_monthly_notification_status_stats
format_admin_stats,
format_statistics
)
StatsRow = collections.namedtuple('row', ('notification_type', 'status', 'count'))