From a9617d4df6021c3080bf888c9f0e5cca4b72044d Mon Sep 17 00:00:00 2001 From: David McDonald Date: Wed, 24 Nov 2021 17:08:45 +0000 Subject: [PATCH 1/5] Bump utils to 49.1.0 --- app/notify_client/job_api_client.py | 2 +- requirements.in | 3 +-- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/app/notify_client/job_api_client.py b/app/notify_client/job_api_client.py index c9f84401c..115ece59e 100644 --- a/app/notify_client/job_api_client.py +++ b/app/notify_client/job_api_client.py @@ -103,7 +103,7 @@ class JobApiClient(NotifyAdminAPIClient): redis_client.set( 'has_jobs-{}'.format(service_id), b'true', - ex=cache.TTL, + ex=cache.DEFAULT_TTL, ) return job diff --git a/requirements.in b/requirements.in index 138d8d730..7343ca794 100644 --- a/requirements.in +++ b/requirements.in @@ -30,8 +30,7 @@ pyproj==3.2.1 # PaaS awscli-cwlogs>=1.4,<1.5 itsdangerous==1.1.0 # pyup: <2 - -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@49.0.0 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@49.1.0 govuk-frontend-jinja @ git+https://github.com/alphagov/govuk-frontend-jinja.git@v0.5.8-alpha # cryptography 3.4+ incorporates Rust code, which isn't supported on PaaS diff --git a/requirements.txt b/requirements.txt index 23c0b431d..feba75754 100644 --- a/requirements.txt +++ b/requirements.txt @@ -123,7 +123,7 @@ mistune==0.8.4 # via notifications-utils notifications-python-client==6.3.0 # via -r requirements.in -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@49.0.0 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@49.1.0 # via -r requirements.in openpyxl==3.0.7 # via pyexcel-xlsx From 2767328e79f9490145e17ec7969d7d72797013d4 Mon Sep 17 00:00:00 2001 From: David McDonald Date: Wed, 24 Nov 2021 17:29:32 +0000 Subject: [PATCH 2/5] Reduce impact of stale cache for performance page I came across a bug where the performance page might be visited on say the 22nd and we expect it to show data for yesterday and the past 7 days (so the 21st and back 7 days) but instead it only shows it for the 20th and then back 6 days. The cause is that we have nightly tasks that run to calculate the number of notifications sent per service (the ft_notification_status table) calculate the number of notifications sent within 10 seconds (the ft_processing_time table) Both of these tables get filled between the hours of midnight and 2:30am. The first seems to be filled by about 12:30 every night whereas the processing stats seems to be filled about 2am every night. The admin app currently caches the results of the call to the API (https://github.com/alphagov/notifications-api/blob/master/app/performance_dashboard/rest.py#L24) to get this data with the key performance-stats-{start_date}-to-{end_date}. Now the issue here is that if the performance page is visited at 00:01 on the 23rd, it will call the API for data on the 22nd and backwards 7 days. The data it gets at 00:01 will not yet have the data for the 22nd, it will only go as far as the 21st, because the overnight jobs to put data in the tables have not run yet. The admin app then caches this data, meaning that for the rest of the day the page will be missing data for the 22nd, even though it becomes available after approx 2am. We have seen it a few times in production where the performance page has indeed been visited before say 2 am, leaving the page with missing data. In terms of solutions, there are several potential options. 1. Remove the caching that we do in redis. The downside of this is that we will hit the API more (currently once a day for the first visitor to the performance page). We have had 255 requests to the performance page in the last week and when there is no value in redis, the call to the API takes approximately 1-2 seconds. Removing the caching would against why the caching was originally added which was to act as a basic protection against malicious DOS attempts. 2. Make the admin app only set the cache value if it is after 2:30am. This one feels a bit gross and snowflakey. 3. The API flushes the redis cache after it has finished its nightly tasks. We don't have that much precedent of the API interacting with redis, it is mostly the admin app that does but this is still a valid option. 4. Cache in redis the data for 2 days ago to 7 days ago and then get the data for yesterday freshly every time. This would mean some things are still cached but the smallest bit that we can't rely on can be gathered fresh. 5. Remove the caching we do in redis but then try and optimise the API endpoint to return results even faster and with less interaction with the DB. My hypothesis is that the slowest part is where the API has to calculate how many notifications Notify has ever sent (https://github.com/alphagov/notifications-api/blob/master/app/performance_dashboard/rest.py#L36) and so we could potentially try and store these totals in the DB every night and update them nightly rather than having to query all the data in the table to figure them out every time. 6. Reduce the expiry time of the cache key so although the bug will still exist, it will only exist for a much shorter time. In the end, we've gone for option 6. The user impact of this bug sticking around for an hour is minimal, in particular because that would be in the early hours of the morning. The solution is also quick to implement and keeps the protection against a DOS attack. The value of 1 hour for expiry was a fairly abitrary choice, it could have been as little as 1 minute or as much as 6 hours. --- app/notify_client/performance_dashboard_api_client.py | 2 +- tests/app/notify_client/test_performance_platform_api_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/notify_client/performance_dashboard_api_client.py b/app/notify_client/performance_dashboard_api_client.py index c08734aff..e8800b321 100644 --- a/app/notify_client/performance_dashboard_api_client.py +++ b/app/notify_client/performance_dashboard_api_client.py @@ -3,7 +3,7 @@ from app.notify_client import NotifyAdminAPIClient, cache class PerformanceDashboardAPIClient(NotifyAdminAPIClient): - @cache.set('performance-stats-{start_date}-to-{end_date}') + @cache.set('performance-stats-{start_date}-to-{end_date}', ttl_in_seconds=3600) def get_performance_dashboard_stats( self, *, diff --git a/tests/app/notify_client/test_performance_platform_api_client.py b/tests/app/notify_client/test_performance_platform_api_client.py index dbdfbd6a1..5a68d743f 100644 --- a/tests/app/notify_client/test_performance_platform_api_client.py +++ b/tests/app/notify_client/test_performance_platform_api_client.py @@ -54,7 +54,7 @@ def test_sets_value_in_cache(mocker): call( 'performance-stats-2021-01-01-to-2022-02-02', '{"data_from": "api"}', - ex=604800, + ex=3600, ), ] From 6acc7838e250512957392fe3577f24f97c712cd2 Mon Sep 17 00:00:00 2001 From: David McDonald Date: Wed, 24 Nov 2021 18:07:43 +0000 Subject: [PATCH 3/5] Add tests for existing status api client Essentially copies the tests found in the performance_platform api client. --- .../notify_client/test_status_api_client.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/app/notify_client/test_status_api_client.py diff --git a/tests/app/notify_client/test_status_api_client.py b/tests/app/notify_client/test_status_api_client.py new file mode 100644 index 000000000..f836056bb --- /dev/null +++ b/tests/app/notify_client/test_status_api_client.py @@ -0,0 +1,59 @@ +from app.notify_client.status_api_client import StatusApiClient + + +def test_get_count_of_live_services_and_organisations(mocker): + mocker.patch('app.extensions.RedisClient.get', return_value=None) + client = StatusApiClient() + mock = mocker.patch.object(client, 'get', return_value={}) + + client.get_count_of_live_services_and_organisations() + + mock.assert_called_once_with(url='/_status/live-service-and-organisation-counts') + + +def test_sets_value_in_cache(mocker): + client = StatusApiClient() + + mock_redis_get = mocker.patch( + 'app.extensions.RedisClient.get', + return_value=None + ) + mock_api_get = mocker.patch( + 'app.notify_client.NotifyAdminAPIClient.get', + return_value={'data_from': 'api'}, + ) + mock_redis_set = mocker.patch( + 'app.extensions.RedisClient.set', + ) + + assert client.get_count_of_live_services_and_organisations() == {'data_from': 'api'} + + mock_redis_get.assert_called_once_with('live-service-and-organisation-counts') + mock_api_get.assert_called_once_with(url='/_status/live-service-and-organisation-counts') + mock_redis_set.assert_called_once_with( + 'live-service-and-organisation-counts', + '{"data_from": "api"}', + ex=604800 + ) + + +def test_returns_value_from_cache(mocker): + client = StatusApiClient() + + mock_redis_get = mocker.patch( + 'app.extensions.RedisClient.get', + return_value=b'{"data_from": "cache"}', + ) + mock_api_get = mocker.patch( + 'app.notify_client.NotifyAdminAPIClient.get', + ) + mock_redis_set = mocker.patch( + 'app.extensions.RedisClient.set', + ) + + assert client.get_count_of_live_services_and_organisations() == {'data_from': 'cache'} + + mock_redis_get.assert_called_once_with('live-service-and-organisation-counts') + + assert mock_api_get.called is False + assert mock_redis_set.called is False From 20cc1e230fbbd95afa687185735d3ac818f1abe1 Mon Sep 17 00:00:00 2001 From: David McDonald Date: Wed, 24 Nov 2021 18:10:30 +0000 Subject: [PATCH 4/5] Reduce TTL to 1 hour for get_count_of_live_services_and_organisations This stat is shown on a few of our pages, such as our homepage, the performance page and also a platform admin page and is currently catched for the default TTL of 1 week. I think there is no reason we can't make this only cache once an hour and give slightly more up to date stats which will update more regularly. This mimics the approach and also the TTL choice of 1 hour that has been added for the performance page (although there is no particular bug here to fix, it is just nice to have slightly more up up to date data). Note, the API call only takes about 0.3 seconds at the moment so it is not particularly intensive on the DB to run this more regularly. --- app/notify_client/status_api_client.py | 2 +- tests/app/notify_client/test_status_api_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/notify_client/status_api_client.py b/app/notify_client/status_api_client.py index 7228621c6..f3224b41d 100644 --- a/app/notify_client/status_api_client.py +++ b/app/notify_client/status_api_client.py @@ -6,7 +6,7 @@ class StatusApiClient(NotifyAdminAPIClient): def get_status(self, *params): return self.get(url='/_status', *params) - @cache.set('live-service-and-organisation-counts') + @cache.set('live-service-and-organisation-counts', ttl_in_seconds=3600) def get_count_of_live_services_and_organisations(self): return self.get(url='/_status/live-service-and-organisation-counts') diff --git a/tests/app/notify_client/test_status_api_client.py b/tests/app/notify_client/test_status_api_client.py index f836056bb..e434bc9d1 100644 --- a/tests/app/notify_client/test_status_api_client.py +++ b/tests/app/notify_client/test_status_api_client.py @@ -33,7 +33,7 @@ def test_sets_value_in_cache(mocker): mock_redis_set.assert_called_once_with( 'live-service-and-organisation-counts', '{"data_from": "api"}', - ex=604800 + ex=3600 ) From e7d8918f7f6359138f3bf6a874d328615dd5bd2f Mon Sep 17 00:00:00 2001 From: David McDonald Date: Wed, 8 Dec 2021 15:45:50 +0000 Subject: [PATCH 5/5] Refactor test assertions Doing this to reduce the use of overly verbose `call()` to match assertions in test_status_api_client. --- .../test_performance_platform_api_client.py | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/tests/app/notify_client/test_performance_platform_api_client.py b/tests/app/notify_client/test_performance_platform_api_client.py index 5a68d743f..686a67fe7 100644 --- a/tests/app/notify_client/test_performance_platform_api_client.py +++ b/tests/app/notify_client/test_performance_platform_api_client.py @@ -1,5 +1,4 @@ from datetime import date -from unittest.mock import call from app.notify_client.performance_dashboard_api_client import ( PerformanceDashboardAPIClient, @@ -42,21 +41,15 @@ def test_sets_value_in_cache(mocker): end_date=date(2022, 2, 2), ) == {'data_from': 'api'} - assert mock_redis_get.call_args_list == [ - call('performance-stats-2021-01-01-to-2022-02-02'), - ] - assert mock_api_get.call_args_list == [ - call('/performance-dashboard', params={ + mock_redis_get.assert_called_once_with('performance-stats-2021-01-01-to-2022-02-02') + mock_api_get.assert_called_once_with('/performance-dashboard', params={ 'start_date': '2021-01-01', 'end_date': '2022-02-02' - }), - ] - assert mock_redis_set.call_args_list == [ - call( - 'performance-stats-2021-01-01-to-2022-02-02', - '{"data_from": "api"}', - ex=3600, - ), - ] + }) + mock_redis_set.assert_called_once_with( + 'performance-stats-2021-01-01-to-2022-02-02', + '{"data_from": "api"}', + ex=3600, + ) def test_returns_value_from_cache(mocker): @@ -78,8 +71,6 @@ def test_returns_value_from_cache(mocker): end_date=date(2022, 2, 2), ) == {'data_from': 'cache'} - assert mock_redis_get.call_args_list == [ - call('performance-stats-2021-01-01-to-2022-02-02'), - ] + mock_redis_get.assert_called_once_with('performance-stats-2021-01-01-to-2022-02-02') assert mock_api_get.called is False assert mock_redis_set.called is False