From f974108934dab3f6b5b356c59935018e1dbdc16d Mon Sep 17 00:00:00 2001 From: Ben Thorner Date: Wed, 20 Oct 2021 17:22:59 +0100 Subject: [PATCH] Include status in stats about delivery times Previously these metrics weren't very useful because they could be skewed by long timings for failed notifications, which can take up to 72 hours to deliver. I'm intentionally not trying to have a dual running period (with the old and new names) because: - We don't use the current stats for anything (checking Grafana). - The current stats get turned into a "bucket" metric in Prometheus [1][2], which isn't very useful because it can only tell us the mean time to deliver, but we're actually interested in percentiles. Switching to a new naming is an opportunity to fix the raw data and the way it's aggregated, using the same kind of "summary" metric that we now use for stats about our Celery tasks [3]. [1]: https://github.com/alphagov/notifications-aws/blob/c330a8ac8a3d3949d09cd4d1040a452d5bff0b50/paas/statsd/statsd-mapping.yml#L82 [2]: https://prometheus.io/docs/practices/histograms/#quantiles [3]: https://github.com/alphagov/notifications-aws/pull/890 --- app/celery/process_ses_receipts_tasks.py | 6 +++++- app/celery/process_sms_client_response_tasks.py | 2 +- tests/app/celery/test_process_ses_receipts_tasks.py | 2 +- tests/app/notifications/test_process_client_response.py | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/app/celery/process_ses_receipts_tasks.py b/app/celery/process_ses_receipts_tasks.py index 21968740a..fef0fb1f7 100644 --- a/app/celery/process_ses_receipts_tasks.py +++ b/app/celery/process_ses_receipts_tasks.py @@ -70,7 +70,11 @@ def process_ses_results(self, response): statsd_client.incr('callback.ses.{}'.format(notification_status)) if notification.sent_at: - statsd_client.timing_with_dates('callback.ses.elapsed-time', datetime.utcnow(), notification.sent_at) + statsd_client.timing_with_dates( + f'callback.ses.{notification_status}.elapsed-time', + datetime.utcnow(), + notification.sent_at + ) _check_and_queue_callback_task(notification) diff --git a/app/celery/process_sms_client_response_tasks.py b/app/celery/process_sms_client_response_tasks.py index 5c7c67ecb..6cc04d7fb 100644 --- a/app/celery/process_sms_client_response_tasks.py +++ b/app/celery/process_sms_client_response_tasks.py @@ -75,7 +75,7 @@ def _process_for_status(notification_status, client_name, provider_reference, de if notification.sent_at: statsd_client.timing_with_dates( - 'callback.{}.elapsed-time'.format(client_name.lower()), + f'callback.{client_name.lower()}.{notification_status}.elapsed-time', datetime.utcnow(), notification.sent_at ) diff --git a/tests/app/celery/test_process_ses_receipts_tasks.py b/tests/app/celery/test_process_ses_receipts_tasks.py index 693972acd..ee8c2081b 100644 --- a/tests/app/celery/test_process_ses_receipts_tasks.py +++ b/tests/app/celery/test_process_ses_receipts_tasks.py @@ -86,7 +86,7 @@ def test_ses_callback_should_update_notification_status( assert process_ses_results(ses_notification_callback(reference='ref')) assert get_notification_by_id(notification.id).status == 'delivered' statsd_client.timing_with_dates.assert_any_call( - "callback.ses.elapsed-time", datetime.utcnow(), notification.sent_at + "callback.ses.delivered.elapsed-time", datetime.utcnow(), notification.sent_at ) statsd_client.incr.assert_any_call("callback.ses.delivered") updated_notification = Notification.query.get(notification.id) diff --git a/tests/app/notifications/test_process_client_response.py b/tests/app/notifications/test_process_client_response.py index e4bc103e9..f31a9c9ed 100644 --- a/tests/app/notifications/test_process_client_response.py +++ b/tests/app/notifications/test_process_client_response.py @@ -144,7 +144,7 @@ def test_process_sms_client_response_records_statsd_metrics(sample_notification, statsd_client.incr.assert_any_call("callback.firetext.delivered") statsd_client.timing_with_dates.assert_any_call( - "callback.firetext.elapsed-time", datetime.utcnow(), sample_notification.sent_at + "callback.firetext.delivered.elapsed-time", datetime.utcnow(), sample_notification.sent_at )