Merge pull request #637 from GSA/notify-api-634

Create internal alarms/notifications for 50, 75, 90% of 10K db max
Carlo Costino
2023-12-07 11:32:28 -05:00
committed by GitHub
5 changed files with 138 additions and 12 deletions
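Note on the numbers in the title: the alert levels are plain fractions of the 10K cap, and the new task also checks 25% and 100%. A minimal arithmetic sketch (standalone, not part of the diff; only the constant name matches the code):

```python
# Illustrative only: the alert thresholds as fractions of the 10K cap.
MAX_NOTIFICATION_FAILS = 10000

thresholds = {pct: int(MAX_NOTIFICATION_FAILS * pct) for pct in (0.25, 0.5, 0.75, 0.9)}
print(thresholds)  # {0.25: 2500, 0.5: 5000, 0.75: 7500, 0.9: 9000}
```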

View File

@@ -1,3 +1,4 @@
import os
from datetime import datetime, timedelta
from flask import current_app
@@ -5,7 +6,7 @@ from notifications_utils.clients.zendesk.zendesk_client import NotifySupportTick
from sqlalchemy import between
from sqlalchemy.exc import SQLAlchemyError
from app import notify_celery, zendesk_client
from app import notify_celery, redis_store, zendesk_client
from app.celery.tasks import (
    get_recipient_csv_and_template_and_sender_id,
    process_incomplete_jobs,
@@ -23,12 +24,16 @@ from app.dao.jobs_dao import (
    find_jobs_with_missing_rows,
    find_missing_row_for_job,
)
from app.dao.notifications_dao import (
    dao_get_failed_notification_count,
    notifications_not_yet_sent,
)
from app.dao.services_dao import (
    dao_find_services_sending_to_tv_numbers,
    dao_find_services_with_high_failure_rates,
)
from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago
from app.delivery.send_to_providers import provider_to_use
from app.models import (
    EMAIL_TYPE,
    JOB_STATUS_ERROR,
@@ -39,6 +44,8 @@ from app.models import (
)
from app.notifications.process_notifications import send_notification_to_queue
MAX_NOTIFICATION_FAILS = 10000
@notify_celery.task(name="run-scheduled-jobs")
def run_scheduled_jobs():
@@ -91,6 +98,78 @@ def expire_or_delete_invitations():
raise
@notify_celery.task(name="check-db-notification-fails")
def check_db_notification_fails():
"""
We are going to use redis to keep track of the previous fail count.
If the number of fails is more than 100% of the limit, we want to send an alert every time this
runs, because it is urgent to fix it.
If the number is more than 25%, 50% or 75% of the limit, we only want to send an alert
on a breach. I.e., if the last number was at 23% and the current number is 27%, send an email.
But if the last number was 26% and the current is 27%, don't.
"""
last_value = redis_store.get("LAST_DB_NOTIFICATION_COUNT")
if not last_value:
last_value = 0
failed_count = dao_get_failed_notification_count()
if failed_count > last_value:
redis_store.set("LAST_DB_NOTIFICATION_COUNT", failed_count)
message = ""
curr_env = os.getenv("ENVIRONMENT")
if failed_count >= MAX_NOTIFICATION_FAILS:
message = f"We are over 100% in the db for failed notifications on {curr_env}"
elif (
failed_count >= MAX_NOTIFICATION_FAILS * 0.9
and last_value < MAX_NOTIFICATION_FAILS * 0.9
):
message = (
"tts-notify-alerts@gsa.gov",
f"We crossed above 90% in the db for failed notifications on {curr_env}",
)
elif (
failed_count >= MAX_NOTIFICATION_FAILS * 0.75
and last_value < MAX_NOTIFICATION_FAILS * 0.75
):
message = (
"tts-notify-alerts@gsa.gov",
f"We crossed above 75% in the db for failed notifications on {curr_env}",
)
elif (
failed_count >= MAX_NOTIFICATION_FAILS * 0.5
and last_value < MAX_NOTIFICATION_FAILS * 0.5
):
message = (
"tts-notify-alerts@gsa.gov",
f"We crossed above 50% in the db for failed notifications on {curr_env}",
)
elif (
failed_count >= MAX_NOTIFICATION_FAILS * 0.25
and last_value < MAX_NOTIFICATION_FAILS * 0.25
):
message = (
"tts-notify-alerts@gsa.gov",
f"We crossed above 25% in the db for failed notifications on {curr_env}",
)
# suppress any spam coming from development tier
if message and curr_env != "development":
provider = provider_to_use(EMAIL_TYPE, False)
from_address = '"{}" <{}@{}>'.format(
"Failed Notification Count Alert",
"test_sender",
current_app.config["NOTIFY_EMAIL_DOMAIN"],
)
provider.send_email(
from_address,
"tts-notify-alerts@gsa.gov",
"DB Notification Failures Level Breached",
body=str(message),
)
@notify_celery.task(name="check-job-status")
def check_job_status():
"""

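As a reading aid for the new `check-db-notification-fails` task above: a minimal standalone sketch of the breach-crossing rule its docstring describes (the helper name and assertions are illustrative assumptions, not part of this PR):

```python
# Illustrative only: alert when a threshold is newly crossed, not on every run.
MAX_NOTIFICATION_FAILS = 10000
THRESHOLDS = (0.25, 0.5, 0.75, 0.9)

def crossed_threshold(last_value, failed_count):
    """Return the highest fraction newly crossed, 1.0 if over the cap, else None."""
    if failed_count >= MAX_NOTIFICATION_FAILS:
        return 1.0  # at or over 100%: alert on every run
    for pct in sorted(THRESHOLDS, reverse=True):
        limit = MAX_NOTIFICATION_FAILS * pct
        if failed_count >= limit and last_value < limit:
            return pct
    return None

assert crossed_threshold(2300, 2700) == 0.25   # 23% -> 27%: alert
assert crossed_threshold(2600, 2700) is None   # 26% -> 27%: no alert
assert crossed_threshold(9990, 10050) == 1.0   # over the cap: alerts every run
```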
View File

@@ -199,6 +199,11 @@ class Config(object):
"schedule": timedelta(minutes=66),
"options": {"queue": QueueNames.PERIODIC},
},
"check-db-notification-fails": {
"task": "check-db-notification-fails",
"schedule": crontab(minute="18, 48"),
"options": {"queue": QueueNames.PERIODIC},
},
"check-job-status": {
"task": "check-job-status",
"schedule": crontab(),

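On the schedule above: `crontab(minute="18, 48")` fires the task twice an hour, at minute 18 and minute 48, on the periodic queue. A minimal standalone sketch of the same schedule, assuming the stock Celery `crontab` helper (the variable name is illustrative):

```python
# Illustrative only: the beat schedule expressed on its own.
from celery.schedules import crontab

# Runs at HH:18 and HH:48, every hour of every day.
check_db_fails_schedule = crontab(minute="18,48")
```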
View File

@@ -20,6 +20,7 @@ from app.models import (
    EMAIL_TYPE,
    KEY_TYPE_TEST,
    NOTIFICATION_CREATED,
    NOTIFICATION_FAILED,
    NOTIFICATION_PENDING,
    NOTIFICATION_PENDING_VIRUS_CHECK,
    NOTIFICATION_PERMANENT_FAILURE,
@@ -202,6 +203,11 @@ def dao_get_notification_count_for_service(*, service_id):
    return notification_count


def dao_get_failed_notification_count():
    failed_count = Notification.query.filter_by(status=NOTIFICATION_FAILED).count()
    return failed_count


def get_notification_with_personalisation(service_id, notification_id, key_type):
    filter_dict = {"service_id": service_id, "id": notification_id}
    if key_type: