From f7fbd6de5be387523187e53140ce24b53ac64cd3 Mon Sep 17 00:00:00 2001 From: Leo Hemsted Date: Wed, 20 Nov 2019 17:23:39 +0000 Subject: [PATCH] make 500s change priorities quicker it's not acceptable for a constantly failing provider to take 50 minutes to drain (5x reducing priority by 10). But similarly, we need _some_ delay, or a handful of concurrent failures will completely turn off a provider, rendering the whole excercise kinda pointless. Setting the delay before it tries to reduce priority again to one minute is nice because it means that if one request times out and returns 502, then any other requests that are in flight at that time will time out before the one minute is up and not switch, but any requests made after the switch that take sixty seconds to time out will affect it. --- app/celery/scheduled_tasks.py | 2 +- app/dao/provider_details_dao.py | 18 ++++++++++++------ app/delivery/send_to_providers.py | 4 ++-- tests/app/celery/test_scheduled_tasks.py | 4 ++-- tests/app/dao/test_provider_details_dao.py | 12 +++++++----- tests/app/delivery/test_send_to_providers.py | 7 ++++--- 6 files changed, 28 insertions(+), 19 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index 297713ebd..28da6811f 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -122,7 +122,7 @@ def switch_current_sms_provider_on_slow_delivery(): for provider_name, is_slow in slow_delivery_notifications.items(): if is_slow: current_app.logger.warning('Slow delivery notifications detected for provider {}'.format(provider_name)) - dao_reduce_sms_provider_priority(provider_name) + dao_reduce_sms_provider_priority(provider_name, time_threshold=timedelta(minutes=10)) @notify_celery.task(name='check-job-status') diff --git a/app/dao/provider_details_dao.py b/app/dao/provider_details_dao.py index 878e8c23d..2580d2a4b 100644 --- a/app/dao/provider_details_dao.py +++ b/app/dao/provider_details_dao.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime from notifications_utils.timezones import convert_utc_to_bst from sqlalchemy import asc, desc, func @@ -35,7 +35,13 @@ def dao_get_provider_versions(provider_id): @transactional -def dao_reduce_sms_provider_priority(identifier): +def dao_reduce_sms_provider_priority(identifier, *, time_threshold): + """ + Will reduce a chosen sms provider's priority, and increase the other provider's priority by 10 points each. + If either provider has been updated in the last `time_threshold`, then it won't take any action. + """ + amount_to_reduce_by = 10 + # get current priority of both providers q = ProviderDetails.query.filter( ProviderDetails.notification_type == 'sms', @@ -46,8 +52,8 @@ def dao_reduce_sms_provider_priority(identifier): other_identifier = get_alternative_sms_provider(identifier) # if something updated recently, don't update again. If the updated_at is null, treat it as min time - if any((provider.updated_at or datetime.min) > datetime.utcnow() - timedelta(minutes=10) for provider in q): - current_app.logger.info("Not adjusting providers, providers updated less than 10 minutes ago.") + if any((provider.updated_at or datetime.min) > datetime.utcnow() - time_threshold for provider in q): + current_app.logger.info("Not adjusting providers, providers updated less than {} ago.".format(time_threshold)) return reduced_provider = providers[identifier] @@ -56,8 +62,8 @@ def dao_reduce_sms_provider_priority(identifier): pre_increase_priority = increased_provider.priority # always keep values between 0 and 100 - reduced_provider.priority = max(0, reduced_provider.priority - 10) - increased_provider.priority = min(100, increased_provider.priority + 10) + reduced_provider.priority = max(0, reduced_provider.priority - amount_to_reduce_by) + increased_provider.priority = min(100, increased_provider.priority + amount_to_reduce_by) current_app.logger.info('Adjusting provider priority - {} going from {} to {}'.format( reduced_provider.identifier, diff --git a/app/delivery/send_to_providers.py b/app/delivery/send_to_providers.py index 416032471..883eb866a 100644 --- a/app/delivery/send_to_providers.py +++ b/app/delivery/send_to_providers.py @@ -1,6 +1,6 @@ import random from urllib import parse -from datetime import datetime +from datetime import datetime, timedelta from flask import current_app from notifications_utils.recipients import ( @@ -66,7 +66,7 @@ def send_sms_to_provider(notification): except Exception as e: notification.billable_units = template.fragment_count dao_update_notification(notification) - dao_reduce_sms_provider_priority(provider.get_name()) + dao_reduce_sms_provider_priority(provider.get_name(), time_threshold=timedelta(minutes=1)) raise e else: notification.billable_units = template.fragment_count diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index fe160a7ea..71576ff0f 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -58,7 +58,7 @@ def test_should_call_delete_codes_on_delete_verify_codes_task(notify_db_session, assert scheduled_tasks.delete_codes_older_created_more_than_a_day_ago.call_count == 1 -def test_should_call_delete_invotations_on_delete_invitations_task(notify_api, mocker): +def test_should_call_delete_invotations_on_delete_invitations_task(notify_db_session, mocker): mocker.patch('app.celery.scheduled_tasks.delete_invitations_created_more_than_two_days_ago') delete_invitations() assert scheduled_tasks.delete_invitations_created_more_than_two_days_ago.call_count == 1 @@ -119,7 +119,7 @@ def test_switch_current_sms_provider_on_slow_delivery_switches_when_one_provider created_at=datetime(2017, 5, 1, 13, 50), delivery_time=timedelta(minutes=4) ) - mock_reduce.assert_called_once_with('firetext') + mock_reduce.assert_called_once_with('firetext', time_threshold=timedelta(minutes=10)) @freeze_time('2017-05-01 14:00:00') diff --git a/tests/app/dao/test_provider_details_dao.py b/tests/app/dao/test_provider_details_dao.py index 33fc91709..ab229a868 100644 --- a/tests/app/dao/test_provider_details_dao.py +++ b/tests/app/dao/test_provider_details_dao.py @@ -1,6 +1,6 @@ import pytest -from datetime import datetime +from datetime import datetime, timedelta from freezegun import freeze_time from sqlalchemy.sql import desc @@ -153,7 +153,7 @@ def test_reduce_sms_provider_priority_switches_provider( ProviderDetails.query.filter(ProviderDetails.notification_type == 'sms').update({'updated_at': datetime.min}) # switch away from mmg. currently both 50/50 - dao_reduce_sms_provider_priority('mmg') + dao_reduce_sms_provider_priority('mmg', time_threshold=timedelta(minutes=10)) assert firetext.priority == expected_priorities['firetext'] assert mmg.priority == expected_priorities['mmg'] @@ -177,7 +177,7 @@ def test_reduce_sms_provider_priority_adds_rows_to_history_table( desc(ProviderDetailsHistory.version) ).all() - dao_reduce_sms_provider_priority(mmg.identifier) + dao_reduce_sms_provider_priority(mmg.identifier, time_threshold=timedelta(minutes=10)) updated_provider_history_rows = ProviderDetailsHistory.query.filter( ProviderDetailsHistory.id == mmg.id @@ -197,9 +197,11 @@ def test_reduce_sms_provider_priority_does_nothing_if_providers_have_recently_ch mock_is_slow = mocker.patch('app.celery.scheduled_tasks.is_delivery_slow_for_providers') mock_reduce = mocker.patch('app.celery.scheduled_tasks.dao_reduce_sms_provider_priority') ProviderDetails.query.filter(ProviderDetails.identifier == 'firetext').update({'updated_at': datetime.min}) - ProviderDetails.query.filter(ProviderDetails.identifier == 'mmg').update({'updated_at': datetime.utcnow()}) + ProviderDetails.query.filter(ProviderDetails.identifier == 'mmg').update( + {'updated_at': datetime.utcnow() - timedelta(minutes=4, seconds=59)} + ) - dao_reduce_sms_provider_priority('firetext') + dao_reduce_sms_provider_priority('firetext', time_threshold=timedelta(minutes=5)) assert mock_is_slow.called is False assert mock_reduce.called is False diff --git a/tests/app/delivery/test_send_to_providers.py b/tests/app/delivery/test_send_to_providers.py index a8564d332..8ef6ba872 100644 --- a/tests/app/delivery/test_send_to_providers.py +++ b/tests/app/delivery/test_send_to_providers.py @@ -1,6 +1,6 @@ import uuid from collections import namedtuple -from datetime import datetime +from datetime import datetime, timedelta from unittest.mock import ANY import pytest @@ -560,12 +560,12 @@ def test_should_update_billable_units_and_status_according_to_research_mode_and_ assert notification.status == expected_status -def test_should_set_notification_billable_units_if_sending_to_provider_fails( +def test_should_set_notification_billable_units_and_reduces_provider_priority_if_sending_to_provider_fails( sample_notification, mocker, ): mocker.patch('app.mmg_client.send_sms', side_effect=Exception()) - mocker.patch('app.delivery.send_to_providers.dao_reduce_sms_provider_priority') + mock_reduce = mocker.patch('app.delivery.send_to_providers.dao_reduce_sms_provider_priority') sample_notification.billable_units = 0 assert sample_notification.sent_by is None @@ -574,6 +574,7 @@ def test_should_set_notification_billable_units_if_sending_to_provider_fails( send_to_providers.send_sms_to_provider(sample_notification) assert sample_notification.billable_units == 1 + mock_reduce.assert_called_once_with('mmg', time_threshold=timedelta(minutes=1)) def test_should_send_sms_to_international_providers(