From 11278c47f56f447e2879706b89406e4446752705 Mon Sep 17 00:00:00 2001 From: Ben Thorner Date: Tue, 14 Dec 2021 12:26:00 +0000 Subject: [PATCH 1/2] Replace log with StatsD gauge for slow delivery A gauge is more useful as we can visualise it and combine it with other stats - we already have other stats for the total number of notifications sent by provider, and we can extrapolate the number of slow notifications using this, if needed. We also still have logs to say the task is running, as well as a log in the calling code when we actually make a switch [1], so we're not losing anything by removing the log here. [1]: https://github.com/alphagov/notifications-api/blob/a9306c45573402495d68f6948a9eb3d6ffbac28c/app/celery/scheduled_tasks.py#L117 --- app/dao/notifications_dao.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/app/dao/notifications_dao.py b/app/dao/notifications_dao.py index da75f285a..39cac60cd 100644 --- a/app/dao/notifications_dao.py +++ b/app/dao/notifications_dao.py @@ -20,7 +20,7 @@ from sqlalchemy.sql import functions from sqlalchemy.sql.expression import case from werkzeug.datastructures import MultiDict -from app import create_uuid, db +from app import create_uuid, db, statsd_client from app.clients.sms.firetext import ( get_message_status_and_reason_from_firetext_code, ) @@ -565,10 +565,7 @@ def is_delivery_slow_for_providers( slow_notifications = sum(row.count for row in rows if row.slow) slow_providers[provider] = (slow_notifications / total_notifications >= threshold) - - current_app.logger.info("Slow delivery notifications count for provider {}: {} out of {}. Ratio {}".format( - provider, slow_notifications, total_notifications, slow_notifications / total_notifications - )) + statsd_client.gauge(f'slow-delivery.{provider}.ratio', slow_notifications / total_notifications) return slow_providers From c8cf057ebae4ad0bbe53683cef1d90e2dc45cbfc Mon Sep 17 00:00:00 2001 From: Ben Thorner Date: Tue, 14 Dec 2021 12:38:25 +0000 Subject: [PATCH 2/2] Record providers we time out notifications for This will help us monitor issues with delivery receipts and keep track of provider performance over time. I'm not concerned about performance here: - The number of notifications to time out is usually small. - This task only runs once a day. - Calls to StatsD are quick and cheap. --- app/celery/nightly_tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/celery/nightly_tasks.py b/app/celery/nightly_tasks.py index 15f96b4e3..5479baff3 100644 --- a/app/celery/nightly_tasks.py +++ b/app/celery/nightly_tasks.py @@ -8,7 +8,7 @@ from notifications_utils.clients.zendesk.zendesk_client import ( from sqlalchemy import func from sqlalchemy.exc import SQLAlchemyError -from app import notify_celery, zendesk_client +from app import notify_celery, statsd_client, zendesk_client from app.aws import s3 from app.config import QueueNames from app.cronitor import cronitor @@ -123,6 +123,7 @@ def timeout_notifications(): notifications = dao_timeout_notifications(cutoff_time) for notification in notifications: + statsd_client.incr(f'timeout-sending.{notification.sent_by}') check_and_queue_callback_task(notification) current_app.logger.info(