Add retries for SES callbacks for recent notifications

We've seen errors caused by what we suspect is a race condition when
SES callback processing tries to look up the notification before the
sender worker has saved notification reference from the SES POST
response to the database.

This adds a retry for SES callback task if the notification was not
found and the message is less than 10 minutes old and removes the
error log message for notifications older than 3 days (since they
might no longer exist in the notifications table and would've been
marked as failure by then either way).

In order to be able to call retry and silence the error log based on
notification time this change inlines `process_ses_response` and
`update_notification_by_reference` functions into the celery task.
It also removes a lot of defensive error-handling that doesn't appear
to have been triggered in the last few months (for things like missing
keys in SES callback data).
This commit is contained in:
Alexey Bezhan
2019-02-13 11:35:31 +00:00
parent ae3505d692
commit 2932b44eb8
4 changed files with 311 additions and 299 deletions

View File

@@ -1,19 +1,80 @@
from flask import current_app
from datetime import datetime, timedelta
import iso8601
from flask import current_app, json
from notifications_utils.statsd_decorators import statsd
from sqlalchemy.orm.exc import NoResultFound
from app import notify_celery
from app import notify_celery, statsd_client
from app.config import QueueNames
from app.clients.email.aws_ses import get_aws_responses
from app.dao import notifications_dao
from app.models import NOTIFICATION_SENDING, NOTIFICATION_PENDING
from app.notifications.notifications_ses_callback import process_ses_response
from app.notifications.notifications_ses_callback import (
determine_notification_bounce_type,
handle_complaint,
_check_and_queue_complaint_callback_task,
_check_and_queue_callback_task,
)
@notify_celery.task(bind=True, name="process-ses-result", max_retries=5, default_retry_delay=300)
@statsd(namespace="tasks")
def process_ses_results(self, response):
try:
errors = process_ses_response(response)
if errors:
current_app.logger.error(errors)
except Exception:
current_app.logger.exception('Error processing SES results')
ses_message = json.loads(response['Message'])
notification_type = ses_message['notificationType']
if notification_type == 'Bounce':
notification_type = determine_notification_bounce_type(notification_type, ses_message)
elif notification_type == 'Complaint':
_check_and_queue_complaint_callback_task(*handle_complaint(ses_message))
return True
aws_response_dict = get_aws_responses(notification_type)
notification_status = aws_response_dict['notification_status']
reference = ses_message['mail']['messageId']
try:
notification = notifications_dao.dao_get_notification_by_reference(reference)
except NoResultFound:
message_time = iso8601.parse_date(ses_message['mail']['timestamp']).replace(tzinfo=None)
if datetime.utcnow() - message_time < timedelta(minutes=10):
self.retry(queue=QueueNames.RETRY)
elif datetime.utcnow() - message_time < timedelta(days=3):
current_app.logger.error(
"notification not found for reference: {} (update to {})".format(reference, notification_status)
)
return
if notification.status not in {NOTIFICATION_SENDING, NOTIFICATION_PENDING}:
notifications_dao._duplicate_update_warning(notification, notification_status)
return
notifications_dao._update_notification_status(notification=notification, status=notification_status)
if not aws_response_dict['success']:
current_app.logger.info(
"SES delivery failed: notification id {} and reference {} has error found. Status {}".format(
notification.id, reference, aws_response_dict['message']
)
)
else:
current_app.logger.info('SES callback return status of {} for notification: {}'.format(
notification_status, notification.id
))
statsd_client.incr('callback.ses.{}'.format(notification_status))
if notification.sent_at:
statsd_client.timing_with_dates('callback.ses.elapsed-time', datetime.utcnow(), notification.sent_at)
_check_and_queue_callback_task(notification)
return True
except Exception as e:
current_app.logger.exception('Error processing SES results: {}'.format(type(e)))
self.retry(queue=QueueNames.RETRY)