mirror of
https://github.com/GSA/notifications-api.git
synced 2025-12-14 01:02:09 -05:00
Change retries policy.
Before we had a long back off, now we have more, but shorter backoffs. - PREVIOUS When we had an error talking to a provider we retried quickly and if we still got errors we backed off more and more. Maximum attempts was 5, max delay 4hours. This was to allow us time to ship a build if that was required. - NOW Backing off 48 times of 5 minutes each. This gives us the same total backoff, but many more tries in that period. - WHY Having the long back off meant messages could be delayed 4 hours. This was happening more and more, as PaaS deploys can place things into the "inflight" state in SQS. The inflight state MUST have an expiry time LONGER than the maximum retry back off. This meant that messages would be delayed 4 hours, even when there was no app error. By doing this we can reduce this delay to 5 minutes. Whilst still giving us time to fix issues.
This commit is contained in:
@@ -10,31 +10,7 @@ from app.statsd_decorators import statsd
|
||||
from app.delivery import send_to_providers
|
||||
|
||||
|
||||
def retry_iteration_to_delay(retry=0):
|
||||
"""
|
||||
:param retry times we have performed a retry
|
||||
Given current retry calculate some delay before retrying
|
||||
0: 10 seconds
|
||||
1: 60 seconds (1 minutes)
|
||||
2: 300 seconds (5 minutes)
|
||||
3: 3600 seconds (60 minutes)
|
||||
4: 14400 seconds (4 hours)
|
||||
:param retry (zero indexed):
|
||||
:return length to retry in seconds, default 10 seconds
|
||||
"""
|
||||
|
||||
delays = {
|
||||
0: 10,
|
||||
1: 60,
|
||||
2: 300,
|
||||
3: 3600,
|
||||
4: 14400
|
||||
}
|
||||
|
||||
return delays.get(retry, 10)
|
||||
|
||||
|
||||
@notify_celery.task(bind=True, name="deliver_sms", max_retries=5, default_retry_delay=5)
|
||||
@notify_celery.task(bind=True, name="deliver_sms", max_retries=48, default_retry_delay=300)
|
||||
@statsd(namespace="tasks")
|
||||
def deliver_sms(self, notification_id):
|
||||
try:
|
||||
@@ -47,7 +23,7 @@ def deliver_sms(self, notification_id):
|
||||
current_app.logger.exception(
|
||||
"SMS notification delivery for id: {} failed".format(notification_id)
|
||||
)
|
||||
self.retry(queue=QueueNames.RETRY, countdown=retry_iteration_to_delay(self.request.retries))
|
||||
self.retry(queue=QueueNames.RETRY)
|
||||
except self.MaxRetriesExceededError:
|
||||
current_app.logger.exception(
|
||||
"RETRY FAILED: task send_sms_to_provider failed for notification {}".format(notification_id),
|
||||
@@ -55,7 +31,7 @@ def deliver_sms(self, notification_id):
|
||||
update_notification_status_by_id(notification_id, 'technical-failure')
|
||||
|
||||
|
||||
@notify_celery.task(bind=True, name="deliver_email", max_retries=5, default_retry_delay=5)
|
||||
@notify_celery.task(bind=True, name="deliver_email", max_retries=48, default_retry_delay=300)
|
||||
@statsd(namespace="tasks")
|
||||
def deliver_email(self, notification_id):
|
||||
try:
|
||||
@@ -71,7 +47,7 @@ def deliver_email(self, notification_id):
|
||||
current_app.logger.exception(
|
||||
"RETRY: Email notification {} failed".format(notification_id)
|
||||
)
|
||||
self.retry(queue=QueueNames.RETRY, countdown=retry_iteration_to_delay(self.request.retries))
|
||||
self.retry(queue=QueueNames.RETRY)
|
||||
except self.MaxRetriesExceededError:
|
||||
current_app.logger.error(
|
||||
"RETRY FAILED: task send_email_to_provider failed for notification {}".format(notification_id)
|
||||
|
||||
@@ -11,34 +11,6 @@ def test_should_have_decorated_tasks_functions():
|
||||
assert deliver_email.__wrapped__.__name__ == 'deliver_email'
|
||||
|
||||
|
||||
def test_should_by_10_second_delay_as_default():
|
||||
assert provider_tasks.retry_iteration_to_delay() == 10
|
||||
|
||||
|
||||
def test_should_by_10_second_delay_on_unmapped_retry_iteration():
|
||||
assert provider_tasks.retry_iteration_to_delay(99) == 10
|
||||
|
||||
|
||||
def test_should_by_10_second_delay_on_retry_one():
|
||||
assert provider_tasks.retry_iteration_to_delay(0) == 10
|
||||
|
||||
|
||||
def test_should_by_1_minute_delay_on_retry_two():
|
||||
assert provider_tasks.retry_iteration_to_delay(1) == 60
|
||||
|
||||
|
||||
def test_should_by_5_minute_delay_on_retry_two():
|
||||
assert provider_tasks.retry_iteration_to_delay(2) == 300
|
||||
|
||||
|
||||
def test_should_by_60_minute_delay_on_retry_two():
|
||||
assert provider_tasks.retry_iteration_to_delay(3) == 3600
|
||||
|
||||
|
||||
def test_should_by_240_minute_delay_on_retry_two():
|
||||
assert provider_tasks.retry_iteration_to_delay(4) == 14400
|
||||
|
||||
|
||||
def test_should_call_send_sms_to_provider_from_deliver_sms_task(
|
||||
notify_db,
|
||||
notify_db_session,
|
||||
@@ -61,7 +33,7 @@ def test_should_add_to_retry_queue_if_notification_not_found_in_deliver_sms_task
|
||||
|
||||
deliver_sms(notification_id)
|
||||
app.delivery.send_to_providers.send_sms_to_provider.assert_not_called()
|
||||
app.celery.provider_tasks.deliver_sms.retry.assert_called_with(queue="retry-tasks", countdown=10)
|
||||
app.celery.provider_tasks.deliver_sms.retry.assert_called_with(queue="retry-tasks")
|
||||
|
||||
|
||||
def test_should_call_send_email_to_provider_from_deliver_email_task(
|
||||
@@ -83,7 +55,7 @@ def test_should_add_to_retry_queue_if_notification_not_found_in_deliver_email_ta
|
||||
|
||||
deliver_email(notification_id)
|
||||
app.delivery.send_to_providers.send_email_to_provider.assert_not_called()
|
||||
app.celery.provider_tasks.deliver_email.retry.assert_called_with(queue="retry-tasks", countdown=10)
|
||||
app.celery.provider_tasks.deliver_email.retry.assert_called_with(queue="retry-tasks")
|
||||
|
||||
|
||||
# DO THESE FOR THE 4 TYPES OF TASK
|
||||
@@ -94,7 +66,7 @@ def test_should_go_into_technical_error_if_exceeds_retries_on_deliver_sms_task(s
|
||||
|
||||
deliver_sms(sample_notification.id)
|
||||
|
||||
provider_tasks.deliver_sms.retry.assert_called_with(queue="retry-tasks", countdown=10)
|
||||
provider_tasks.deliver_sms.retry.assert_called_with(queue="retry-tasks")
|
||||
|
||||
assert sample_notification.status == 'technical-failure'
|
||||
|
||||
@@ -105,7 +77,7 @@ def test_should_go_into_technical_error_if_exceeds_retries_on_deliver_email_task
|
||||
|
||||
deliver_email(sample_notification.id)
|
||||
|
||||
provider_tasks.deliver_email.retry.assert_called_with(queue="retry-tasks", countdown=10)
|
||||
provider_tasks.deliver_email.retry.assert_called_with(queue="retry-tasks")
|
||||
assert sample_notification.status == 'technical-failure'
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user