make 500s change priorities quicker

it's not acceptable for a constantly failing provider to take 50 minutes
to drain (5x reducing priority by 10). But similarly, we need _some_
delay, or a handful of concurrent failures will completely turn off a
provider, rendering the whole excercise kinda pointless. Setting the
delay before it tries to reduce priority again to one minute is nice
because it means that if one request times out and returns 502, then any
other requests that are in flight at that time will time out before the
one minute is up and not switch, but any requests made after the switch
that take sixty seconds to time out will affect it.
This commit is contained in:
Leo Hemsted
2019-11-20 17:23:39 +00:00
parent 2d7bf664f5
commit f7fbd6de5b
6 changed files with 28 additions and 19 deletions

View File

@@ -122,7 +122,7 @@ def switch_current_sms_provider_on_slow_delivery():
for provider_name, is_slow in slow_delivery_notifications.items():
if is_slow:
current_app.logger.warning('Slow delivery notifications detected for provider {}'.format(provider_name))
dao_reduce_sms_provider_priority(provider_name)
dao_reduce_sms_provider_priority(provider_name, time_threshold=timedelta(minutes=10))
@notify_celery.task(name='check-job-status')

View File

@@ -1,4 +1,4 @@
from datetime import datetime, timedelta
from datetime import datetime
from notifications_utils.timezones import convert_utc_to_bst
from sqlalchemy import asc, desc, func
@@ -35,7 +35,13 @@ def dao_get_provider_versions(provider_id):
@transactional
def dao_reduce_sms_provider_priority(identifier):
def dao_reduce_sms_provider_priority(identifier, *, time_threshold):
"""
Will reduce a chosen sms provider's priority, and increase the other provider's priority by 10 points each.
If either provider has been updated in the last `time_threshold`, then it won't take any action.
"""
amount_to_reduce_by = 10
# get current priority of both providers
q = ProviderDetails.query.filter(
ProviderDetails.notification_type == 'sms',
@@ -46,8 +52,8 @@ def dao_reduce_sms_provider_priority(identifier):
other_identifier = get_alternative_sms_provider(identifier)
# if something updated recently, don't update again. If the updated_at is null, treat it as min time
if any((provider.updated_at or datetime.min) > datetime.utcnow() - timedelta(minutes=10) for provider in q):
current_app.logger.info("Not adjusting providers, providers updated less than 10 minutes ago.")
if any((provider.updated_at or datetime.min) > datetime.utcnow() - time_threshold for provider in q):
current_app.logger.info("Not adjusting providers, providers updated less than {} ago.".format(time_threshold))
return
reduced_provider = providers[identifier]
@@ -56,8 +62,8 @@ def dao_reduce_sms_provider_priority(identifier):
pre_increase_priority = increased_provider.priority
# always keep values between 0 and 100
reduced_provider.priority = max(0, reduced_provider.priority - 10)
increased_provider.priority = min(100, increased_provider.priority + 10)
reduced_provider.priority = max(0, reduced_provider.priority - amount_to_reduce_by)
increased_provider.priority = min(100, increased_provider.priority + amount_to_reduce_by)
current_app.logger.info('Adjusting provider priority - {} going from {} to {}'.format(
reduced_provider.identifier,

View File

@@ -1,6 +1,6 @@
import random
from urllib import parse
from datetime import datetime
from datetime import datetime, timedelta
from flask import current_app
from notifications_utils.recipients import (
@@ -66,7 +66,7 @@ def send_sms_to_provider(notification):
except Exception as e:
notification.billable_units = template.fragment_count
dao_update_notification(notification)
dao_reduce_sms_provider_priority(provider.get_name())
dao_reduce_sms_provider_priority(provider.get_name(), time_threshold=timedelta(minutes=1))
raise e
else:
notification.billable_units = template.fragment_count