This commit is contained in:
Kenneth Kehl
2024-12-13 15:04:18 -08:00
parent 56ebed5445
commit 7529ae0153
3 changed files with 4 additions and 264 deletions

View File

@@ -1,107 +1,19 @@
import json
import os
from datetime import timedelta
from botocore.exceptions import ClientError
from flask import current_app
from sqlalchemy.orm.exc import NoResultFound
from app import aws_cloudwatch_client, notify_celery, redis_store
from app import notify_celery, redis_store
from app.clients.email import EmailClientNonRetryableException
from app.clients.email.aws_ses import AwsSesClientThrottlingSendRateException
from app.clients.sms import SmsClientResponseException
from app.config import Config, QueueNames
from app.dao import notifications_dao
from app.dao.notifications_dao import (
sanitize_successful_notification_by_id,
update_notification_status_by_id,
)
from app.dao.notifications_dao import update_notification_status_by_id
from app.delivery import send_to_providers
from app.enums import NotificationStatus
from app.exceptions import NotificationTechnicalFailureException
from app.utils import utc_now
# This is the amount of time to wait after sending an sms message before we check the aws logs and look for delivery
# receipts
DELIVERY_RECEIPT_DELAY_IN_SECONDS = 30
@notify_celery.task(
bind=True,
name="check_sms_delivery_receipt",
max_retries=48,
default_retry_delay=300,
)
def check_sms_delivery_receipt(self, message_id, notification_id, sent_at):
"""
This is called after deliver_sms to check the status of the message. This uses the same number of
retries and the same delay period as deliver_sms. In addition, this fires five minutes after
deliver_sms initially. So the idea is that most messages will succeed and show up in the logs quickly.
Other message will resolve successfully after a retry or to. A few will fail but it will take up to
4 hours to know for sure. The call to check_sms will raise an exception if neither a success nor a
failure appears in the cloudwatch logs, so this should keep retrying until the log appears, or until
we run out of retries.
"""
# TODO the localstack cloudwatch doesn't currently have our log groups. Possibly create them with awslocal?
if aws_cloudwatch_client.is_localstack():
status = "success"
provider_response = "this is a fake successful localstack sms message"
carrier = "unknown"
else:
try:
status, provider_response, carrier = aws_cloudwatch_client.check_sms(
message_id, notification_id, sent_at
)
except NotificationTechnicalFailureException as ntfe:
provider_response = "Unable to find carrier response -- still looking"
status = "pending"
carrier = ""
update_notification_status_by_id(
notification_id,
status,
carrier=carrier,
provider_response=provider_response,
)
raise self.retry(exc=ntfe)
except ClientError as err:
# Probably a ThrottlingException but could be something else
error_code = err.response["Error"]["Code"]
provider_response = (
f"{error_code} while checking sms receipt -- still looking"
)
status = "pending"
carrier = ""
update_notification_status_by_id(
notification_id,
status,
carrier=carrier,
provider_response=provider_response,
)
raise self.retry(exc=err)
if status == "success":
status = NotificationStatus.DELIVERED
elif status == "failure":
status = NotificationStatus.FAILED
# if status is not success or failure the client raised an exception and this method will retry
if status == NotificationStatus.DELIVERED:
sanitize_successful_notification_by_id(
notification_id, carrier=carrier, provider_response=provider_response
)
current_app.logger.info(
f"Sanitized notification {notification_id} that was successfully delivered"
)
else:
update_notification_status_by_id(
notification_id,
status,
carrier=carrier,
provider_response=provider_response,
)
current_app.logger.info(
f"Updated notification {notification_id} with response '{provider_response}'"
)
@notify_celery.task(
@@ -127,17 +39,8 @@ def deliver_sms(self, notification_id):
ansi_green + f"AUTHENTICATION CODE: {notification.content}" + ansi_reset
)
# Code branches off to send_to_providers.py
message_id = send_to_providers.send_sms_to_provider(notification)
send_to_providers.send_sms_to_provider(notification)
# DEPRECATED
# We have to put it in UTC. For other timezones, the delay
# will be ignored and it will fire immediately (although this probably only affects developer testing)
my_eta = utc_now() + timedelta(seconds=DELIVERY_RECEIPT_DELAY_IN_SECONDS)
check_sms_delivery_receipt.apply_async(
[message_id, notification_id, notification.created_at],
eta=my_eta,
queue=QueueNames.CHECK_SMS,
)
except Exception as e:
update_notification_status_by_id(
notification_id,

View File

@@ -110,65 +110,6 @@ class AwsCloudwatchClient(Client):
return logline
return None
# DEPRECATED
def check_sms(self, message_id, notification_id, created_at):
region = cloud_config.sns_region
# TODO this clumsy approach to getting the account number will be fixed as part of notify-api #258
account_number = self._extract_account_number(cloud_config.ses_domain_arn)
time_now = utc_now()
log_group_name = f"sns/{region}/{account_number[4]}/DirectPublishToPhoneNumber"
filter_pattern = '{$.notification.messageId="XXXXX"}'
filter_pattern = filter_pattern.replace("XXXXX", message_id)
all_log_events = self._get_log(filter_pattern, log_group_name, created_at)
if all_log_events and len(all_log_events) > 0:
event = all_log_events[0]
message = json.loads(event["message"])
self.warn_if_dev_is_opted_out(
message["delivery"]["providerResponse"], notification_id
)
# Here we map the answer from aws to the message_id.
# Previously, in send_to_providers, we mapped the job_id and row number
# to the message id. And on the admin side we mapped the csv filename
# to the job_id. So by tracing through all the logs we can go:
# filename->job_id->message_id->what really happened
current_app.logger.info(
hilite(f"DELIVERED: {message} for message_id {message_id}")
)
return (
"success",
message["delivery"]["providerResponse"],
message["delivery"].get("phoneCarrier", "Unknown Carrier"),
)
log_group_name = (
f"sns/{region}/{account_number[4]}/DirectPublishToPhoneNumber/Failure"
)
all_failed_events = self._get_log(filter_pattern, log_group_name, created_at)
if all_failed_events and len(all_failed_events) > 0:
event = all_failed_events[0]
message = json.loads(event["message"])
self.warn_if_dev_is_opted_out(
message["delivery"]["providerResponse"], notification_id
)
current_app.logger.info(
hilite(f"FAILED: {message} for message_id {message_id}")
)
return (
"failure",
message["delivery"]["providerResponse"],
message["delivery"].get("phoneCarrier", "Unknown Carrier"),
)
if time_now > (created_at + timedelta(hours=3)):
# see app/models.py Notification. This message corresponds to "permanent-failure",
# but we are copy/pasting here to avoid circular imports.
return "failure", "Unable to find carrier response."
raise NotificationTechnicalFailureException(
f"No event found for message_id {message_id} notification_id {notification_id}"
)
def do_log_insights(self):
region = cloud_config.sns_region