add debug tag adr

Kenneth Kehl
2025-02-27 13:24:32 -08:00
parent e9e69777de
commit 4cfb980ac6
10 changed files with 113 additions and 49 deletions

View File

@@ -118,7 +118,7 @@ def list_s3_objects():
                 break
     except Exception:
         current_app.logger.exception(
-            "An error occurred while regenerating cache #notify-admin-1200",
+            "An error occurred while regenerating cache #notify-debug-admin-1200",
         )
@@ -200,7 +200,7 @@ def read_s3_file(bucket_name, object_key, s3res):
     except LookupError:
         # perhaps our key is not formatted as we expected. If so skip it.
-        current_app.logger.exception("LookupError #notify-admin-1200")
+        current_app.logger.exception("LookupError #notify-debug-admin-1200")


 def get_s3_files():
@@ -213,7 +213,7 @@ def get_s3_files():
     s3res = get_s3_resource()
     current_app.logger.info(
-        f"job_cache length before regen: {len_job_cache()} #notify-admin-1200"
+        f"job_cache length before regen: {len_job_cache()} #notify-debug-admin-1200"
     )
     try:
         with ThreadPoolExecutor() as executor:
@@ -222,7 +222,7 @@ def get_s3_files():
         current_app.logger.exception("Connection pool issue")
     current_app.logger.info(
-        f"job_cache length after regen: {len_job_cache()} #notify-admin-1200"
+        f"job_cache length after regen: {len_job_cache()} #notify-debug-admin-1200"
     )
@@ -290,7 +290,7 @@ def file_exists(file_location):
 def get_job_location(service_id, job_id):
     current_app.logger.debug(
-        f"#s3-partitioning NEW JOB_LOCATION: {NEW_FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
+        f"#notify-debug-s3-partitioning NEW JOB_LOCATION: {NEW_FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
     )
     return (
         current_app.config["CSV_UPLOAD_BUCKET"]["bucket"],
@@ -308,7 +308,7 @@ def get_old_job_location(service_id, job_id):
     Remove this when everything works with the NEW_FILE_LOCATION_STRUCTURE.
     """
     current_app.logger.debug(
-        f"#s3-partitioning OLD JOB LOCATION: {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
+        f"#notify-debug-s3-partitioning OLD JOB LOCATION: {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
     )
     return (
         current_app.config["CSV_UPLOAD_BUCKET"]["bucket"],
@@ -507,7 +507,7 @@ def get_personalisation_from_s3(service_id, job_id, job_row_number):
 def get_job_metadata_from_s3(service_id, job_id):
     current_app.logger.debug(
-        f"#s3-partitioning CALLING GET_JOB_METADATA with {service_id}, {job_id}"
+        f"#notify-debug-s3-partitioning CALLING GET_JOB_METADATA with {service_id}, {job_id}"
    )
     obj = get_s3_object(*get_job_location(service_id, job_id))
     return obj.get()["Metadata"]

View File

@@ -39,7 +39,7 @@ class AwsPinpointClient(Client):
             current_app.logger.info(hilite(response))
         except ClientError:
             current_app.logger.exception(
-                "#validate-phone-number Could not validate with pinpoint"
+                "#notify-debug-validate-phone-number Could not validate with pinpoint"
             )

     # TODO This is the structure of the response. When the phone validation

View File

@@ -68,15 +68,16 @@ class AwsSnsClient(SmsClient):
             non_scrubbable = " ".join(sender)
             self.current_app.logger.info(
-                f"notify-api-1385 sender {non_scrubbable} is a {type(sender)} default is a {type(default_num)}"
+                f"notify-debug-api-1385 sender {non_scrubbable} is a {type(sender)} \
+                    default is a {type(default_num)}"
             )
         else:
             self.current_app.logger.warning(
-                f"notify-api-1385 sender is type {type(sender)}!! {sender}"
+                f"notify-debug-api-1385 sender is type {type(sender)}!! {sender}"
             )
         if self._valid_sender_number(sender):
             self.current_app.logger.info(
-                f"notify-api-1385 use valid sender {non_scrubbable} instead of default {default_num}"
+                f"notify-debug-api-1385 use valid sender {non_scrubbable} instead of default {default_num}"
             )
             attributes["AWS.MM.SMS.OriginationNumber"] = {
@@ -85,7 +86,7 @@ class AwsSnsClient(SmsClient):
             }
         else:
             self.current_app.logger.info(
-                f"notify-api-1385 use default {default_num} instead of invalid sender"
+                f"notify-debug-api-1385 use default {default_num} instead of invalid sender"
             )
             attributes["AWS.MM.SMS.OriginationNumber"] = {

View File

@@ -158,17 +158,17 @@ def dao_create_job(job):
         now_time = utc_now()
         diff_time = now_time - orig_time
         current_app.logger.info(
-            f"#notify-admin-1859 dao_create_job orig created at {orig_time} and now {now_time}"
+            f"#notify-debug-admin-1859 dao_create_job orig created at {orig_time} and now {now_time}"
         )
         if diff_time.total_seconds() > 300:  # It should be only a few seconds diff at most
             current_app.logger.error(
-                "#notify-admin-1859 Something is wrong with job.created_at!"
+                "#notify-debug-admin-1859 Something is wrong with job.created_at!"
             )
             if os.getenv("NOTIFY_ENVIRONMENT") not in ["test"]:
                 job.created_at = now_time
                 dao_update_job(job)
                 current_app.logger.error(
-                    f"#notify-admin-1859 Job created_at reset to {job.created_at}"
+                    f"#notify-debug-admin-1859 Job created_at reset to {job.created_at}"
                 )

View File

@@ -534,7 +534,9 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
     # Update to group by HOUR instead of DAY
     total_substmt = (
         select(
-            func.date_trunc("hour", NotificationAllTimeView.created_at).label("hour"),  # UPDATED
+            func.date_trunc("hour", NotificationAllTimeView.created_at).label(
+                "hour"
+            ),  # UPDATED
             Job.notification_count.label("notification_count"),
         )
         .join(Job, NotificationAllTimeView.job_id == Job.id)
@@ -556,11 +558,14 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
     total_stmt = select(
         total_substmt.c.hour,  # UPDATED
         func.sum(total_substmt.c.notification_count).label("total_notifications"),
-    ).group_by(total_substmt.c.hour)  # UPDATED
+    ).group_by(
+        total_substmt.c.hour
+    )  # UPDATED

     # Ensure we're using hourly timestamps in the response
     total_notifications = {
-        row.hour: row.total_notifications for row in db.session.execute(total_stmt).all()
+        row.hour: row.total_notifications
+        for row in db.session.execute(total_stmt).all()
     }

     # Update the second query to also use "hour"
@@ -568,7 +573,9 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
         select(
             NotificationAllTimeView.notification_type,
             NotificationAllTimeView.status,
-            func.date_trunc("hour", NotificationAllTimeView.created_at).label("hour"),  # UPDATED
+            func.date_trunc("hour", NotificationAllTimeView.created_at).label(
+                "hour"
+            ),  # UPDATED
             func.count(NotificationAllTimeView.id).label("count"),
         )
         .where(
@@ -895,7 +902,9 @@ def get_specific_days_stats(
     return stats


-def get_specific_hours_stats(data, start_date, hours=None, end_date=None, total_notifications=None):
+def get_specific_hours_stats(
+    data, start_date, hours=None, end_date=None, total_notifications=None
+):
     if hours is not None and end_date is not None:
         raise ValueError("Only set hours OR set end_date, not both.")
     elif hours is not None:
@@ -919,10 +928,10 @@ def get_specific_hours_stats(data, start_date, hours=None, end_date=None, total_notifications=None):
     # Format statistics, returning only hours with results
     stats = {
         hour.strftime("%Y-%m-%dT%H:00:00Z"): statistics.format_statistics(
-            rows,
-            total_notifications.get(hour, 0) if total_notifications else None
+            rows, total_notifications.get(hour, 0) if total_notifications else None
         )
-        for hour, rows in grouped_data.items() if rows
+        for hour, rows in grouped_data.items()
+        if rows
     }

     return stats

View File

@@ -107,7 +107,7 @@ def send_sms_to_provider(notification):
             sender_numbers = get_sender_numbers(notification)
             if notification.reply_to_text not in sender_numbers:
                 raise ValueError(
-                    f"{notification.reply_to_text} not in {sender_numbers} #notify-admin-1701"
+                    f"{notification.reply_to_text} not in {sender_numbers} #notify-debug-admin-1701"
                 )

             send_sms_kwargs = {
@@ -152,7 +152,7 @@ def _experimentally_validate_phone_numbers(recipient):
     if recipient_lookup in current_app.config["SIMULATED_SMS_NUMBERS"] and os.getenv(
         "NOTIFY_ENVIRONMENT"
     ) in ["development", "test"]:
-        current_app.logger.info(hilite("#validate-phone-number fired"))
+        current_app.logger.info(hilite("#notify-debug-validate-phone-number fired"))
         aws_pinpoint_client.validate_phone_number("01", recipient)

View File

@@ -242,7 +242,9 @@ def create_job(service_id):
     original_file_name = data.get("original_file_name")
     data.update({"service": service_id})
     try:
-        current_app.logger.info(f"#s3-partitioning DATA IN CREATE_JOB: {data}")
+        current_app.logger.info(
+            f"#notify-debug-s3-partitioning DATA IN CREATE_JOB: {data}"
+        )
         data.update(**get_job_metadata_from_s3(service_id, data["id"]))
     except KeyError:
         raise InvalidRequest(

View File

@@ -0,0 +1,38 @@
+# Use of debug search tags for CloudWatch logs
+
+Status: Accepted
+
+Date: 27 February 2025
+
+### Context
+
+We encountered recurring problems in specific parts of the code. These issues were hard to
+debug by searching through CloudWatch logs: a single issue tends to be exploded into multiple
+log lines, and those lines are not even time-sorted by default, so it can get quite hard on
+the eyes and test one's patience to debug this way.
+
+### Decision
+
+We decided to start using a unique tag for each individual issue so we can quickly search for
+the related group of log lines. This worked fairly well, and even though some of these issues
+have at long last been resolved, it might be worthwhile to continue this debugging pattern.
+
+### Consequences
+
+Here are the existing tags and what they are used for:
+
+- #notify-debug-admin-1200: job cache regeneration
+- #notify-debug-admin-1701: wrong sender phone number
+- #notify-debug-admin-1859: job creation time reset due to a strange SQLAlchemy constructor issue setting the wrong created_at time
+- #notify-debug-api-1385: a separate identifier used to resolve the wrong sender phone number issue
+- #notify-debug-s3-partitioning: modify S3 partitioning in line with AWS best practices
+- #notify-debug-validate-phone-number: experimental code to try to validate phone numbers with AWS Pinpoint
+
+### Author
+
+@kenkehl
+
+### Stakeholders
+
+@ccostino
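
A minimal sketch of the tagging pattern the ADR describes, assuming a Flask app like the one in this repo; the tag `#notify-debug-example-0000` and the `regenerate_cache` function are hypothetical placeholders, not part of this commit:

```python
# Sketch of the debug-tag pattern (hypothetical tag and function names).
from flask import Flask, current_app

app = Flask(__name__)

DEBUG_TAG = "#notify-debug-example-0000"  # one unique tag per issue under investigation


def regenerate_cache():
    try:
        raise ConnectionError("simulated intermittent failure")
    except Exception:
        # Every log line about this issue carries the same tag, so a single
        # search on the tag gathers all related lines across the codebase.
        current_app.logger.exception("cache regeneration failed %s", DEBUG_TAG)


with app.app_context():
    regenerate_cache()

# In CloudWatch Logs Insights, the related lines can then be pulled together
# and time-sorted with a query such as:
#   fields @timestamp, @message
#   | filter @message like /notify-debug-example-0000/
#   | sort @timestamp asc
```

Because the tag is a plain substring rather than structured metadata, it works with any log search tool, which is the main appeal of the pattern.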

View File

@@ -449,9 +449,9 @@ def test_get_s3_files_success(client, mocker):
     mock_read_s3_file.assert_has_calls(calls, any_order=True)

-    # mock_current_app.info.assert_any_call("job_cache length before regen: 0 #notify-admin-1200")
-    # mock_current_app.info.assert_any_call("job_cache length after regen: 0 #notify-admin-1200")
+    # mock_current_app.info.assert_any_call("job_cache length before regen: 0 #notify-debug-admin-1200")
+    # mock_current_app.info.assert_any_call("job_cache length after regen: 0 #notify-debug-admin-1200")


 @patch("app.aws.s3.s3_client", None)  # ensure it starts as None

View File

@@ -7,7 +7,9 @@ from app.dao.services_dao import get_specific_hours_stats
 from app.enums import StatisticsType
 from app.models import TemplateType

-NotificationRow = namedtuple("NotificationRow", ["notification_type", "status", "timestamp", "count"])
+NotificationRow = namedtuple(
+    "NotificationRow", ["notification_type", "status", "timestamp", "count"]
+)


 def generate_expected_hourly_output(requested_sms_hours):
@@ -38,27 +40,31 @@ def create_mock_notification(notification_type, status, timestamp, count=1):
         notification_type=notification_type,
         status=status,
         timestamp=timestamp.replace(minute=0, second=0, microsecond=0),
-        count=count
+        count=count,
     )


 test_cases = [
     (
-        [create_mock_notification(
+        [
+            create_mock_notification(
                 TemplateType.SMS,
                 StatisticsType.REQUESTED,
                 datetime(2025, 2, 18, 14, 15, 0),
-        )],
+            )
+        ],
         datetime(2025, 2, 18, 12, 0),
         6,
         generate_expected_hourly_output(["2025-02-18T14:00:00Z"]),
     ),
     (
-        [create_mock_notification(
+        [
+            create_mock_notification(
                 TemplateType.SMS,
                 StatisticsType.REQUESTED,
                 datetime(2025, 2, 18, 17, 59, 59),
-        )],
+            )
+        ],
         datetime(2025, 2, 18, 15, 0),
         3,
         generate_expected_hourly_output(["2025-02-18T17:00:00Z"]),
@@ -66,21 +72,29 @@ test_cases = [
     ([], datetime(2025, 2, 18, 10, 0), 4, {}),
     (
         [
-            create_mock_notification(TemplateType.SMS, StatisticsType.REQUESTED, datetime(2025, 2, 18, 9, 30, 0)),
-            create_mock_notification(TemplateType.SMS, StatisticsType.REQUESTED, datetime(2025, 2, 18, 11, 45, 0)),
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 9, 30, 0),
+            ),
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 11, 45, 0),
+            ),
         ],
         datetime(2025, 2, 18, 8, 0),
         5,
-        generate_expected_hourly_output(["2025-02-18T09:00:00Z", "2025-02-18T11:00:00Z"]),
+        generate_expected_hourly_output(
+            ["2025-02-18T09:00:00Z", "2025-02-18T11:00:00Z"]
+        ),
     ),
 ]


-@pytest.mark.parametrize("mocked_notifications, start_date, hours, expected_output", test_cases)
+@pytest.mark.parametrize(
+    "mocked_notifications, start_date, hours, expected_output", test_cases
+)
 def test_get_specific_hours(mocked_notifications, start_date, hours, expected_output):
-    results = get_specific_hours_stats(
-        mocked_notifications,
-        start_date,
-        hours=hours
-    )
+    results = get_specific_hours_stats(mocked_notifications, start_date, hours=hours)
     assert results == expected_output, f"Expected {expected_output}, but got {results}"