Downgrade exception to warning for provider API call

When we send an HTTP request to our SMS providers, there is a
chance we get a 5xx status code back from them. Currently we log this as
two different exception level logs.

If a provider has a funny few minutes, we could end up with
hundreds of exceptions thrown and pagerduty waking someone up in the
middle of the night. These problems tend to pretty quickly fix
themselves as we balance traffic from one SMS to the other SMS provider
within 5 minutes.

By downgrading both exceptions to warning in the case of a
`SmsClientResponseException`, we will reduce the change of waking us up
in the middle of the night for no reason.

If the error is not a `SmsClientResponseException`, then we will still
log at the exception level as before as this is more unexpected and we
may want to be alerted sooner.

What we still want to happen though is that let's say both SMS providers
went down at the same time for 1 hour. We don't want our tasks to just
sit there, retrying every 5 minutes for the whole time without us being
aware (so we can at least raise a statuspage update). Luckily we will
still be alerted because our smoke tests will fail after 10 minutes and
raise a p1:
https://github.com/alphagov/notifications-functional-tests/blob/master/tests/functional/staging_and_prod/notify_api/test_notify_api_sms.py#L21
This commit is contained in:
David McDonald
2021-01-18 15:43:50 +00:00
parent 1cfd19fb6a
commit ac6837cde5
4 changed files with 63 additions and 27 deletions

View File

@@ -6,6 +6,7 @@ from app import notify_celery
from app.config import QueueNames
from app.clients.email import EmailClientNonRetryableException
from app.clients.email.aws_ses import AwsSesClientThrottlingSendRateException
from app.clients.sms import SmsClientResponseException
from app.dao import notifications_dao
from app.dao.notifications_dao import update_notification_status_by_id
from app.delivery import send_to_providers
@@ -22,11 +23,17 @@ def deliver_sms(self, notification_id):
if not notification:
raise NoResultFound()
send_to_providers.send_sms_to_provider(notification)
except Exception:
try:
except Exception as e:
if isinstance(e, SmsClientResponseException):
current_app.logger.warning(
"SMS notification delivery for id: {} failed".format(notification_id)
)
else:
current_app.logger.exception(
"SMS notification delivery for id: {} failed".format(notification_id)
)
try:
if self.request.retries == 0:
self.retry(queue=QueueNames.RETRY, countdown=0)
else:

View File

@@ -88,7 +88,7 @@ class FiretextClient(SmsClient):
self.statsd_client.incr("clients.firetext.success")
else:
self.statsd_client.incr("clients.firetext.error")
self.current_app.logger.error(log_message)
self.current_app.logger.warning(log_message)
def send_sms(self, to, content, reference, sender=None):

View File

@@ -90,7 +90,7 @@ class MMGClient(SmsClient):
self.statsd_client.incr("clients.mmg.success")
else:
self.statsd_client.incr("clients.mmg.error")
self.current_app.logger.error(log_message)
self.current_app.logger.warning(log_message)
def get_name(self):
return self.name

View File

@@ -5,6 +5,7 @@ from celery.exceptions import MaxRetriesExceededError
import app
from app.celery import provider_tasks
from app.celery.provider_tasks import deliver_sms, deliver_email
from app.clients.sms import SmsClientResponseException
from app.clients.email import EmailClientNonRetryableException
from app.clients.email.aws_ses import AwsSesClientException, AwsSesClientThrottlingSendRateException
from app.exceptions import NotificationTechnicalFailureException
@@ -37,28 +38,6 @@ def test_should_add_to_retry_queue_if_notification_not_found_in_deliver_sms_task
app.celery.provider_tasks.deliver_sms.retry.assert_called_with(queue="retry-tasks", countdown=0)
def test_should_call_send_email_to_provider_from_deliver_email_task(
sample_notification,
mocker):
mocker.patch('app.delivery.send_to_providers.send_email_to_provider')
deliver_email(sample_notification.id)
app.delivery.send_to_providers.send_email_to_provider.assert_called_with(sample_notification)
def test_should_go_into_technical_error_if_exceeds_retries_on_deliver_sms_task(sample_notification, mocker):
mocker.patch('app.delivery.send_to_providers.send_sms_to_provider', side_effect=Exception("EXPECTED"))
mocker.patch('app.celery.provider_tasks.deliver_sms.retry', side_effect=MaxRetriesExceededError())
with pytest.raises(NotificationTechnicalFailureException) as e:
deliver_sms(sample_notification.id)
assert str(sample_notification.id) in str(e.value)
provider_tasks.deliver_sms.retry.assert_called_with(queue="retry-tasks", countdown=0)
assert sample_notification.status == 'technical-failure'
def test_send_sms_should_not_switch_providers_on_non_provider_failure(
sample_notification,
mocker
@@ -77,9 +56,59 @@ def test_send_sms_should_not_switch_providers_on_non_provider_failure(
assert mock_dao_reduce_sms_provider_priority.called is False
def test_should_retry_and_log_warning_if_SmsClientResponseException_for_deliver_sms_task(sample_notification, mocker):
mocker.patch('app.delivery.send_to_providers.send_sms_to_provider', side_effect=SmsClientResponseException("something went wrong"))
mocker.patch('app.celery.provider_tasks.deliver_sms.retry')
mock_logger_warning = mocker.patch('app.celery.tasks.current_app.logger.warning')
deliver_sms(sample_notification.id)
assert provider_tasks.deliver_sms.retry.called is True
assert sample_notification.status == 'created'
assert mock_logger_warning.called
def test_should_retry_and_log_exception_for_non_SmsClientResponseException_exceptions_for_deliver_sms_task(
sample_notification, mocker
):
mocker.patch('app.delivery.send_to_providers.send_sms_to_provider', side_effect=Exception("something went wrong"))
mocker.patch('app.celery.provider_tasks.deliver_sms.retry')
mock_logger_exception = mocker.patch('app.celery.tasks.current_app.logger.exception')
deliver_sms(sample_notification.id)
assert provider_tasks.deliver_sms.retry.called is True
assert sample_notification.status == 'created'
assert mock_logger_exception.called
def test_should_go_into_technical_error_if_exceeds_retries_on_deliver_sms_task(sample_notification, mocker):
mocker.patch('app.delivery.send_to_providers.send_sms_to_provider', side_effect=Exception("EXPECTED"))
mocker.patch('app.celery.provider_tasks.deliver_sms.retry', side_effect=MaxRetriesExceededError())
mock_logger_exception = mocker.patch('app.celery.tasks.current_app.logger.exception')
with pytest.raises(NotificationTechnicalFailureException) as e:
deliver_sms(sample_notification.id)
assert str(sample_notification.id) in str(e.value)
provider_tasks.deliver_sms.retry.assert_called_with(queue="retry-tasks", countdown=0)
assert sample_notification.status == 'technical-failure'
assert mock_logger_exception.called
# end of deliver_sms task tests, now deliver_email task tests
def test_should_call_send_email_to_provider_from_deliver_email_task(
sample_notification,
mocker):
mocker.patch('app.delivery.send_to_providers.send_email_to_provider')
deliver_email(sample_notification.id)
app.delivery.send_to_providers.send_email_to_provider.assert_called_with(sample_notification)
def test_should_add_to_retry_queue_if_notification_not_found_in_deliver_email_task(mocker):
mocker.patch('app.delivery.send_to_providers.send_email_to_provider')
mocker.patch('app.celery.provider_tasks.deliver_email.retry')
@@ -125,7 +154,7 @@ def test_should_technical_error_and_not_retry_if_EmailClientNonRetryableExceptio
assert sample_notification.status == 'technical-failure'
def test_should_retry_and_log_exception(sample_notification, mocker):
def test_should_retry_and_log_exception_for_deliver_email_task(sample_notification, mocker):
error_response = {
'Error': {
'Code': 'SomeError',