When we send an HTTP request to our SMS providers, there is a chance we get a 5xx status code back from them. Currently we log this in two different places at exception level. If a provider has a funny few minutes, we could end up with hundreds of exceptions thrown and PagerDuty waking someone up in the middle of the night. These problems tend to fix themselves pretty quickly, as we rebalance traffic from one SMS provider to the other within 5 minutes.

By downgrading both of these logs to warning level when the error is a `SmsClientResponseException`, we reduce the chance of being woken up in the middle of the night for no reason. If the error is not a `SmsClientResponseException`, we still log at exception level as before, since that is more unexpected and we may want to be alerted sooner.

What we still want, though, is to know if, say, both SMS providers went down at the same time for an hour. We don't want our tasks to just sit there, retrying every 5 minutes for the whole time, without us being aware (so we can at least raise a statuspage update). Luckily we will still be alerted, because our smoke tests will fail after 10 minutes and raise a P1: https://github.com/alphagov/notifications-functional-tests/blob/master/tests/functional/staging_and_prod/notify_api/test_notify_api_sms.py#L21
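As a rough illustration of the intended behaviour, here is a minimal pytest sketch (not part of this commit): the provider raises `SmsClientResponseException`, the task logs at warning rather than exception level, and still retries. It assumes the pytest-mock `mocker` fixture and that `SmsClientResponseException` accepts a message argument; the `deliver_sms` task it exercises is in the file below.

from app.celery import provider_tasks
from app.clients.sms import SmsClientResponseException


def test_deliver_sms_logs_warning_and_retries_on_provider_error(mocker):
    # Stub out the database lookup so this sketch does not need a real notification.
    mocker.patch(
        "app.celery.provider_tasks.notifications_dao.get_notification_by_id",
        return_value=mocker.Mock(id="some-notification-id"),
    )
    # Simulate the provider responding with a 5xx.
    mocker.patch(
        "app.delivery.send_to_providers.send_sms_to_provider",
        side_effect=SmsClientResponseException("provider returned a 5xx"),
    )
    mock_current_app = mocker.patch("app.celery.provider_tasks.current_app")
    mock_retry = mocker.patch("app.celery.provider_tasks.deliver_sms.retry")

    provider_tasks.deliver_sms("some-notification-id")

    # Downgraded to warning, no exception-level log, and the task still retries.
    mock_current_app.logger.warning.assert_called_once()
    mock_current_app.logger.exception.assert_not_called()
    mock_retry.assert_called_once()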
from flask import current_app
from notifications_utils.statsd_decorators import statsd
from sqlalchemy.orm.exc import NoResultFound

from app import notify_celery
from app.config import QueueNames
from app.clients.email import EmailClientNonRetryableException
from app.clients.email.aws_ses import AwsSesClientThrottlingSendRateException
from app.clients.sms import SmsClientResponseException
from app.dao import notifications_dao
from app.dao.notifications_dao import update_notification_status_by_id
from app.delivery import send_to_providers
from app.exceptions import NotificationTechnicalFailureException
from app.models import NOTIFICATION_TECHNICAL_FAILURE

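# deliver_sms retries up to 48 times with a default delay of 300 seconds, i.e.
# roughly every 5 minutes for up to 4 hours, before the notification is marked
# as a technical failure. Provider errors surface as SmsClientResponseException
# and are logged at warning level; anything else is unexpected and is logged at
# exception level.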
@notify_celery.task(bind=True, name="deliver_sms", max_retries=48, default_retry_delay=300)
|
|
@statsd(namespace="tasks")
|
|
def deliver_sms(self, notification_id):
|
|
try:
|
|
current_app.logger.info("Start sending SMS for notification id: {}".format(notification_id))
|
|
notification = notifications_dao.get_notification_by_id(notification_id)
|
|
if not notification:
|
|
raise NoResultFound()
|
|
send_to_providers.send_sms_to_provider(notification)
|
|
except Exception as e:
|
|
if isinstance(e, SmsClientResponseException):
|
|
current_app.logger.warning(
|
|
"SMS notification delivery for id: {} failed".format(notification_id)
|
|
)
|
|
else:
|
|
current_app.logger.exception(
|
|
"SMS notification delivery for id: {} failed".format(notification_id)
|
|
)
|
|
|
|
try:
|
|
if self.request.retries == 0:
|
|
self.retry(queue=QueueNames.RETRY, countdown=0)
|
|
else:
|
|
self.retry(queue=QueueNames.RETRY)
|
|
except self.MaxRetriesExceededError:
|
|
message = "RETRY FAILED: Max retries reached. The task send_sms_to_provider failed for notification {}. " \
|
|
"Notification has been updated to technical-failure".format(notification_id)
|
|
update_notification_status_by_id(notification_id, NOTIFICATION_TECHNICAL_FAILURE)
|
|
raise NotificationTechnicalFailureException(message)
|
|
|
|
|
|
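# deliver_email marks the notification as technical-failure straight away for
# non-retryable client errors. SES throttling is logged at warning level and
# retried; any other failure is logged at exception level and retried, using
# the same 48 x 300 second retry schedule as deliver_sms.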
@notify_celery.task(bind=True, name="deliver_email", max_retries=48, default_retry_delay=300)
|
|
@statsd(namespace="tasks")
|
|
def deliver_email(self, notification_id):
|
|
try:
|
|
current_app.logger.info("Start sending email for notification id: {}".format(notification_id))
|
|
notification = notifications_dao.get_notification_by_id(notification_id)
|
|
if not notification:
|
|
raise NoResultFound()
|
|
send_to_providers.send_email_to_provider(notification)
|
|
except EmailClientNonRetryableException as e:
|
|
current_app.logger.exception(
|
|
f"Email notification {notification_id} failed: {e}"
|
|
)
|
|
update_notification_status_by_id(notification_id, 'technical-failure')
|
|
except Exception as e:
|
|
try:
|
|
if isinstance(e, AwsSesClientThrottlingSendRateException):
|
|
current_app.logger.warning(
|
|
f"RETRY: Email notification {notification_id} was rate limited by SES"
|
|
)
|
|
else:
|
|
current_app.logger.exception(
|
|
f"RETRY: Email notification {notification_id} failed"
|
|
)
|
|
|
|
self.retry(queue=QueueNames.RETRY)
|
|
except self.MaxRetriesExceededError:
|
|
message = "RETRY FAILED: Max retries reached. " \
|
|
"The task send_email_to_provider failed for notification {}. " \
|
|
"Notification has been updated to technical-failure".format(notification_id)
|
|
update_notification_status_by_id(notification_id, NOTIFICATION_TECHNICAL_FAILURE)
|
|
raise NotificationTechnicalFailureException(message)
|