# notifications-api/app/celery/scheduled_tasks.py
import json
from datetime import datetime, timedelta
from flask import current_app
from sqlalchemy import between, select, union
from sqlalchemy.exc import SQLAlchemyError
from app import db, get_zendesk_client, notify_celery, redis_store
from app.celery.tasks import (
    get_recipient_csv_and_template_and_sender_id,
    process_incomplete_jobs,
    process_job,
    process_row,
)
from app.clients.cloudwatch.aws_cloudwatch import AwsCloudwatchClient
from app.config import QueueNames
from app.dao.invited_org_user_dao import (
    delete_org_invitations_created_more_than_two_days_ago,
)
from app.dao.invited_user_dao import expire_invitations_created_more_than_two_days_ago
from app.dao.jobs_dao import (
    dao_set_scheduled_jobs_to_pending,
    dao_update_job_status_to_error,
    find_jobs_with_missing_rows,
    find_missing_row_for_job,
)
from app.dao.notifications_dao import (
    dao_batch_insert_notifications,
    dao_close_out_delivery_receipts,
    dao_update_delivery_receipts,
    notifications_not_yet_sent,
)
from app.dao.services_dao import (
    dao_find_services_sending_to_tv_numbers,
    dao_find_services_with_high_failure_rates,
)
from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago
from app.enums import JobStatus, NotificationType
from app.models import Job, Notification
from app.notifications.process_notifications import send_notification_to_queue
from app.utils import utc_now
from notifications_utils import aware_utcnow
from notifications_utils.clients.zendesk.zendesk_client import NotifySupportTicket

MAX_NOTIFICATION_FAILS = 10000

zendesk_client = get_zendesk_client()


@notify_celery.task(name="run-scheduled-jobs")
def run_scheduled_jobs():
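    """Set scheduled jobs to pending and queue each one for processing."""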
    try:
        for job in dao_set_scheduled_jobs_to_pending():
            process_job.apply_async([str(job.id)], queue=QueueNames.JOBS)
            current_app.logger.info(
                "Job ID {} added to process job queue".format(job.id)
            )
    except SQLAlchemyError:
        current_app.logger.exception("Failed to run scheduled jobs")
        raise


@notify_celery.task(name="delete-verify-codes")
def delete_verify_codes():
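    """Delete verify codes created more than a day ago."""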
    try:
        start = utc_now()
        deleted = delete_codes_older_created_more_than_a_day_ago()
        current_app.logger.info(
            "Delete job started {} finished {} deleted {} verify codes".format(
                start, utc_now(), deleted
            )
        )
    except SQLAlchemyError:
        current_app.logger.exception("Failed to delete verify codes")
        raise


@notify_celery.task(name="expire-or-delete-invitations")
def expire_or_delete_invitations():
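    """Expire user invitations and delete org invitations created more than two days ago."""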
    try:
        start = utc_now()
        expired_invites = expire_invitations_created_more_than_two_days_ago()
        current_app.logger.info(
            f"Expire job started {start} finished {utc_now()} expired {expired_invites} invitations"
        )
    except SQLAlchemyError:
        current_app.logger.exception("Failed to expire invitations")
        raise

    try:
        start = utc_now()
        deleted_invites = delete_org_invitations_created_more_than_two_days_ago()
        current_app.logger.info(
            f"Delete job started {start} finished {utc_now()} deleted {deleted_invites} invitations"
        )
    except SQLAlchemyError:
        current_app.logger.exception("Failed to delete invitations")
        raise


@notify_celery.task(name="check-job-status")
def check_job_status():
    """
    Every few minutes, check for jobs that have stalled.

    Select jobs that are either:
      - 'in progress' and started processing a while ago, or
      - 'pending' with a scheduled_for timestamp a while ago.

    For any matches, set job_status to 'error' and hand the job ids to
    process_incomplete_jobs, which re-processes the missing csv rows; this
    task only performs the check.
    """
    START_MINUTES = 245
    END_MINUTES = 240
    end_minutes_ago = utc_now() - timedelta(minutes=END_MINUTES)
    start_minutes_ago = utc_now() - timedelta(minutes=START_MINUTES)

    incomplete_in_progress_jobs = select(Job).where(
        Job.job_status == JobStatus.IN_PROGRESS,
        between(Job.processing_started, start_minutes_ago, end_minutes_ago),
    )
    incomplete_pending_jobs = select(Job).where(
        Job.job_status == JobStatus.PENDING,
        Job.scheduled_for.isnot(None),
        between(Job.scheduled_for, start_minutes_ago, end_minutes_ago),
    )

    jobs_not_completed_after_allotted_time = union(
        incomplete_in_progress_jobs, incomplete_pending_jobs
    )
    jobs_not_completed_after_allotted_time = (
        jobs_not_completed_after_allotted_time.order_by(
            Job.processing_started, Job.scheduled_for
        )
    )

    jobs_not_complete_after_allotted_time = db.session.execute(
        jobs_not_completed_after_allotted_time
    ).all()

    # temporarily mark them as ERROR so that they don't get picked up by future
    # check_job_status tasks if they haven't been re-processed in time.
    job_ids = []
    for job in jobs_not_complete_after_allotted_time:
        dao_update_job_status_to_error(job)
        job_ids.append(str(job.id))
    if job_ids:
        current_app.logger.info("Job(s) {} have not completed.".format(job_ids))
        process_incomplete_jobs.apply_async([job_ids], queue=QueueNames.JOBS)


@notify_celery.task(name="replay-created-notifications")
def replay_created_notifications():
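    """Re-queue email and sms notifications that are still in the created state after an hour."""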
    # If a notification has not been sent an hour after it was created, try to send it again.
    resend_created_notifications_older_than = 60 * 60
    for notification_type in (NotificationType.EMAIL, NotificationType.SMS):
        notifications_to_resend = notifications_not_yet_sent(
            resend_created_notifications_older_than, notification_type
        )
        if len(notifications_to_resend) > 0:
            current_app.logger.info(
                "Sending {} {} notifications "
                "to the delivery queue because the notification "
                "status was created.".format(
                    len(notifications_to_resend), notification_type
                )
            )
            for n in notifications_to_resend:
                send_notification_to_queue(notification=n)


@notify_celery.task(name="check-for-missing-rows-in-completed-jobs")
def check_for_missing_rows_in_completed_jobs():
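    """For jobs that finished with rows missing, process the csv rows that never became notifications."""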
    jobs = find_jobs_with_missing_rows()
    for job in jobs:
        (
            recipient_csv,
            template,
            sender_id,
        ) = get_recipient_csv_and_template_and_sender_id(job)
        missing_rows = find_missing_row_for_job(job.id, job.notification_count)
        for row_to_process in missing_rows:
            row = recipient_csv[row_to_process.missing_row]
            current_app.logger.info(
                f"Processing missing row: {row_to_process.missing_row} for job: {job.id}"
            )
            process_row(row, template, job, job.service, sender_id=sender_id)


@notify_celery.task(
    name="check-for-services-with-high-failure-rates-or-sending-to-tv-numbers"
)
def check_for_services_with_high_failure_rates_or_sending_to_tv_numbers():
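    """
    Warn about services with high sms permanent-failure rates or heavy sending to tv
    numbers over the last 24 hours, raising a Zendesk ticket in live-like environments.
    """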
    start_date = utc_now() - timedelta(days=1)
    end_date = utc_now()
    message = ""

    services_with_failures = dao_find_services_with_high_failure_rates(
        start_date=start_date, end_date=end_date
    )
    services_sending_to_tv_numbers = dao_find_services_sending_to_tv_numbers(
        start_date=start_date, end_date=end_date
    )

    if services_with_failures:
        message += "{} service(s) have had high permanent-failure rates for sms messages in last 24 hours:\n".format(
            len(services_with_failures)
        )
        for service in services_with_failures:
            service_dashboard = "{}/services/{}".format(
                current_app.config["ADMIN_BASE_URL"],
                str(service.service_id),
            )
            message += "service: {} failure rate: {},\n".format(
                service_dashboard, service.permanent_failure_rate
            )
    elif services_sending_to_tv_numbers:
        message += "{} service(s) have sent over 500 sms messages to tv numbers in last 24 hours:\n".format(
            len(services_sending_to_tv_numbers)
        )
        for service in services_sending_to_tv_numbers:
            service_dashboard = "{}/services/{}".format(
                current_app.config["ADMIN_BASE_URL"],
                str(service.service_id),
            )
            message += "service: {} count of sms to tv numbers: {},\n".format(
                service_dashboard, service.notification_count
            )

    if services_with_failures or services_sending_to_tv_numbers:
        current_app.logger.warning(message)

        if current_app.config["NOTIFY_ENVIRONMENT"] in ["live", "production", "test"]:
            message += (
                "\nYou can find instructions for this ticket in our manual:\n"
                "https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-services-with-high-failure-rates-or-sending-sms-to-tv-numbers"  # noqa
            )
            ticket = NotifySupportTicket(
                subject=f"[{current_app.config['NOTIFY_ENVIRONMENT']}] High failure rates for sms spotted for services",
                message=message,
                ticket_type=NotifySupportTicket.TYPE_INCIDENT,
                technical_ticket=True,
            )
            zendesk_client.send_ticket_to_zendesk(ticket)


@notify_celery.task(
    bind=True, max_retries=7, default_retry_delay=3600, name="process-delivery-receipts"
)
def process_delivery_receipts(self):
    """
    Every eight minutes or so (see config.py) we run this task, which searches the last ten
    minutes of logs for delivery receipts and batch updates the db with the results. The overlap
    is intentional. We don't mind re-updating things, it is better than losing data.

    We also set this to retry with exponential backoff in the case of failure. The only way this would
    fail is if, for example, the db went down or redis filled, causing the app to stop processing. But if
    it does fail, we need to go back over at some point when things are running again and process those results.
    """
    # If we need to check db settings, do it here for convenience:
    # current_app.logger.info(f"POOL SIZE {app.db.engine.pool.size()}")
    try:
        batch_size = 1000  # in theory with postgresql this could be 10k to 20k?

        cloudwatch = AwsCloudwatchClient()
        cloudwatch.init_app(current_app)
        start_time = aware_utcnow() - timedelta(minutes=3)
        end_time = aware_utcnow()
        delivered_receipts, failed_receipts = cloudwatch.check_delivery_receipts(
            start_time, end_time
        )
        delivered_receipts = list(delivered_receipts)
        for i in range(0, len(delivered_receipts), batch_size):
            batch = delivered_receipts[i : i + batch_size]
            dao_update_delivery_receipts(batch, True)
        failed_receipts = list(failed_receipts)
        for i in range(0, len(failed_receipts), batch_size):
            batch = failed_receipts[i : i + batch_size]
            dao_update_delivery_receipts(batch, False)
    except Exception as ex:
        retry_count = self.request.retries
        wait_time = 3600 * 2**retry_count
        current_app.logger.exception(str(ex))
        try:
            raise self.retry(ex=ex, countdown=wait_time)
        except self.MaxRetriesExceededError:
            current_app.logger.error(
                "Failed to process delivery receipts after max retries"
            )


@notify_celery.task(
    bind=True, max_retries=2, default_retry_delay=3600, name="cleanup-delivery-receipts"
)
def cleanup_delivery_receipts(self):
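    """Close out any remaining delivery receipts via dao_close_out_delivery_receipts."""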
    dao_close_out_delivery_receipts()


@notify_celery.task(bind=True, name="batch-insert-notifications")
def batch_insert_notifications(self):
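    """Drain the redis message_queue list and batch insert the notifications into the db."""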
    batch = []
    current_len = redis_store.llen("message_queue")
    with redis_store.pipeline():
        # since this list is being fed by other processes, just grab what is available when
        # this call is made and process that.
        count = 0
        while count < current_len:
            count = count + 1
            notification_bytes = redis_store.lpop("message_queue")
            notification_dict = json.loads(notification_bytes.decode("utf-8"))
            notification_dict["status"] = notification_dict.pop("notification_status")
            if not notification_dict.get("created_at"):
                notification_dict["created_at"] = utc_now()
            elif isinstance(notification_dict["created_at"], list):
                notification_dict["created_at"] = notification_dict["created_at"][0]
            notification = Notification(**notification_dict)
            # notify-api-749 do not write to db
            # if we have a verify_code we know this is the authentication notification at login time
            # and not csv (containing PII) provided by the user, so allow verify_code to continue to exist
            if notification is None:
                continue
            if "verify_code" in str(notification.personalisation):
                pass
            else:
                batch.append(notification)

    try:
        dao_batch_insert_notifications(batch)
    except Exception:
        current_app.logger.exception("Notification batch insert failed")
        for n in batch:
            # Use 'created_at' as a TTL so we don't retry infinitely
            notification_time = n.created_at
            if isinstance(notification_time, str):
                notification_time = datetime.fromisoformat(n.created_at)
            if notification_time < utc_now() - timedelta(seconds=50):
                current_app.logger.warning(
                    f"Abandoning stale data, could not write to db: {n.serialize_for_redis(n)}"
                )
                continue
            else:
                redis_store.rpush("message_queue", json.dumps(n.serialize_for_redis(n)))