From 4cfb980ac66f23f49dae4694cca1be9061a98aa9 Mon Sep 17 00:00:00 2001
From: Kenneth Kehl <@kkehl@flexion.us>
Date: Thu, 27 Feb 2025 13:24:32 -0800
Subject: [PATCH] add debug tag adr

---
 app/aws/s3.py                                | 14 ++---
 app/clients/pinpoint/aws_pinpoint.py         |  2 +-
 app/clients/sms/aws_sns.py                   |  9 +--
 app/dao/jobs_dao.py                          |  6 +-
 app/dao/services_dao.py                      | 25 ++++++---
 app/delivery/send_to_providers.py            |  4 +-
 app/job/rest.py                              |  4 +-
 docs/adrs/0013-log-debug-tags.md             | 38 +++++++++++++
 tests/app/aws/test_s3.py                     |  4 +-
 .../dao/test_services_get_specific_hours.py  | 56 ++++++++++++-------
 10 files changed, 113 insertions(+), 49 deletions(-)
 create mode 100644 docs/adrs/0013-log-debug-tags.md

diff --git a/app/aws/s3.py b/app/aws/s3.py
index e8841b20c..bc1728541 100644
--- a/app/aws/s3.py
+++ b/app/aws/s3.py
@@ -118,7 +118,7 @@ def list_s3_objects():
                 break
     except Exception:
         current_app.logger.exception(
-            "An error occurred while regenerating cache #notify-admin-1200",
+            "An error occurred while regenerating cache #notify-debug-admin-1200",
         )
@@ -200,7 +200,7 @@ def read_s3_file(bucket_name, object_key, s3res):
     except LookupError:
         # perhaps our key is not formatted as we expected. If so skip it.
-        current_app.logger.exception("LookupError #notify-admin-1200")
+        current_app.logger.exception("LookupError #notify-debug-admin-1200")


 def get_s3_files():
@@ -213,7 +213,7 @@
     s3res = get_s3_resource()
     current_app.logger.info(
-        f"job_cache length before regen: {len_job_cache()} #notify-admin-1200"
+        f"job_cache length before regen: {len_job_cache()} #notify-debug-admin-1200"
     )
     try:
         with ThreadPoolExecutor() as executor:
@@ -222,7 +222,7 @@
         current_app.logger.exception("Connection pool issue")
     current_app.logger.info(
-        f"job_cache length after regen: {len_job_cache()} #notify-admin-1200"
+        f"job_cache length after regen: {len_job_cache()} #notify-debug-admin-1200"
     )
@@ -290,7 +290,7 @@ def file_exists(file_location):

 def get_job_location(service_id, job_id):
     current_app.logger.debug(
-        f"#s3-partitioning NEW JOB_LOCATION: {NEW_FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
+        f"#notify-debug-s3-partitioning NEW JOB_LOCATION: {NEW_FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
     )
     return (
         current_app.config["CSV_UPLOAD_BUCKET"]["bucket"],
@@ -308,7 +308,7 @@ def get_old_job_location(service_id, job_id):
     Remove this when everything works with the NEW_FILE_LOCATION_STRUCTURE.
""" current_app.logger.debug( - f"#s3-partitioning OLD JOB LOCATION: {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}" + f"#notify-debug-s3-partitioning OLD JOB LOCATION: {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}" ) return ( current_app.config["CSV_UPLOAD_BUCKET"]["bucket"], @@ -507,7 +507,7 @@ def get_personalisation_from_s3(service_id, job_id, job_row_number): def get_job_metadata_from_s3(service_id, job_id): current_app.logger.debug( - f"#s3-partitioning CALLING GET_JOB_METADATA with {service_id}, {job_id}" + f"#notify-debug-s3-partitioning CALLING GET_JOB_METADATA with {service_id}, {job_id}" ) obj = get_s3_object(*get_job_location(service_id, job_id)) return obj.get()["Metadata"] diff --git a/app/clients/pinpoint/aws_pinpoint.py b/app/clients/pinpoint/aws_pinpoint.py index d15d94601..3b4de7682 100644 --- a/app/clients/pinpoint/aws_pinpoint.py +++ b/app/clients/pinpoint/aws_pinpoint.py @@ -39,7 +39,7 @@ class AwsPinpointClient(Client): current_app.logger.info(hilite(response)) except ClientError: current_app.logger.exception( - "#validate-phone-number Could not validate with pinpoint" + "#notify-debug-validate-phone-number Could not validate with pinpoint" ) # TODO This is the structure of the response. When the phone validation diff --git a/app/clients/sms/aws_sns.py b/app/clients/sms/aws_sns.py index d36af600c..3c2282752 100644 --- a/app/clients/sms/aws_sns.py +++ b/app/clients/sms/aws_sns.py @@ -68,15 +68,16 @@ class AwsSnsClient(SmsClient): non_scrubbable = " ".join(sender) self.current_app.logger.info( - f"notify-api-1385 sender {non_scrubbable} is a {type(sender)} default is a {type(default_num)}" + f"notify-debug-api-1385 sender {non_scrubbable} is a {type(sender)} \ + default is a {type(default_num)}" ) else: self.current_app.logger.warning( - f"notify-api-1385 sender is type {type(sender)}!! {sender}" + f"notify-debug-api-1385 sender is type {type(sender)}!! {sender}" ) if self._valid_sender_number(sender): self.current_app.logger.info( - f"notify-api-1385 use valid sender {non_scrubbable} instead of default {default_num}" + f"notify-debug-api-1385 use valid sender {non_scrubbable} instead of default {default_num}" ) attributes["AWS.MM.SMS.OriginationNumber"] = { @@ -85,7 +86,7 @@ class AwsSnsClient(SmsClient): } else: self.current_app.logger.info( - f"notify-api-1385 use default {default_num} instead of invalid sender" + f"notify-debug-api-1385 use default {default_num} instead of invalid sender" ) attributes["AWS.MM.SMS.OriginationNumber"] = { diff --git a/app/dao/jobs_dao.py b/app/dao/jobs_dao.py index feec601a4..2e479a77d 100644 --- a/app/dao/jobs_dao.py +++ b/app/dao/jobs_dao.py @@ -158,17 +158,17 @@ def dao_create_job(job): now_time = utc_now() diff_time = now_time - orig_time current_app.logger.info( - f"#notify-admin-1859 dao_create_job orig created at {orig_time} and now {now_time}" + f"#notify-debug-admin-1859 dao_create_job orig created at {orig_time} and now {now_time}" ) if diff_time.total_seconds() > 300: # It should be only a few seconds diff at most current_app.logger.error( - "#notify-admin-1859 Something is wrong with job.created_at!" + "#notify-debug-admin-1859 Something is wrong with job.created_at!" 
            )
             if os.getenv("NOTIFY_ENVIRONMENT") not in ["test"]:
                 job.created_at = now_time
                 dao_update_job(job)
                 current_app.logger.error(
-                    f"#notify-admin-1859 Job created_at reset to {job.created_at}"
+                    f"#notify-debug-admin-1859 Job created_at reset to {job.created_at}"
                 )
diff --git a/app/dao/services_dao.py b/app/dao/services_dao.py
index 1c8a5e157..a7db2415d 100644
--- a/app/dao/services_dao.py
+++ b/app/dao/services_dao.py
@@ -534,7 +534,9 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
     # Update to group by HOUR instead of DAY
     total_substmt = (
         select(
-            func.date_trunc("hour", NotificationAllTimeView.created_at).label("hour"),  # UPDATED
+            func.date_trunc("hour", NotificationAllTimeView.created_at).label(
+                "hour"
+            ),  # UPDATED
             Job.notification_count.label("notification_count"),
         )
         .join(Job, NotificationAllTimeView.job_id == Job.id)
@@ -556,11 +558,14 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
     total_stmt = select(
         total_substmt.c.hour,  # UPDATED
         func.sum(total_substmt.c.notification_count).label("total_notifications"),
-    ).group_by(total_substmt.c.hour)  # UPDATED
+    ).group_by(
+        total_substmt.c.hour
+    )  # UPDATED

     # Ensure we're using hourly timestamps in the response
     total_notifications = {
-        row.hour: row.total_notifications for row in db.session.execute(total_stmt).all()
+        row.hour: row.total_notifications
+        for row in db.session.execute(total_stmt).all()
     }

     # Update the second query to also use "hour"
@@ -568,7 +573,9 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
         select(
             NotificationAllTimeView.notification_type,
             NotificationAllTimeView.status,
-            func.date_trunc("hour", NotificationAllTimeView.created_at).label("hour"),  # UPDATED
+            func.date_trunc("hour", NotificationAllTimeView.created_at).label(
+                "hour"
+            ),  # UPDATED
             func.count(NotificationAllTimeView.id).label("count"),
         )
         .where(
@@ -895,7 +902,9 @@ def get_specific_days_stats(
     return stats


-def get_specific_hours_stats(data, start_date, hours=None, end_date=None, total_notifications=None):
+def get_specific_hours_stats(
+    data, start_date, hours=None, end_date=None, total_notifications=None
+):
     if hours is not None and end_date is not None:
         raise ValueError("Only set hours OR set end_date, not both.")
     elif hours is not None:
@@ -919,10 +928,10 @@ def get_specific_hours_stats(data, start_date, hours=None, end_date=None, total_
     # Format statistics, returning only hours with results
     stats = {
         hour.strftime("%Y-%m-%dT%H:00:00Z"): statistics.format_statistics(
-            rows,
-            total_notifications.get(hour, 0) if total_notifications else None
+            rows, total_notifications.get(hour, 0) if total_notifications else None
         )
-        for hour, rows in grouped_data.items() if rows
+        for hour, rows in grouped_data.items()
+        if rows
     }

     return stats
diff --git a/app/delivery/send_to_providers.py b/app/delivery/send_to_providers.py
index 515d418e7..ef42aecc5 100644
--- a/app/delivery/send_to_providers.py
+++ b/app/delivery/send_to_providers.py
@@ -107,7 +107,7 @@ def send_sms_to_provider(notification):
         sender_numbers = get_sender_numbers(notification)
         if notification.reply_to_text not in sender_numbers:
             raise ValueError(
-                f"{notification.reply_to_text} not in {sender_numbers} #notify-admin-1701"
+                f"{notification.reply_to_text} not in {sender_numbers} #notify-debug-admin-1701"
             )

         send_sms_kwargs = {
@@ -152,7 +152,7 @@ def _experimentally_validate_phone_numbers(recipient):
     if recipient_lookup in current_app.config["SIMULATED_SMS_NUMBERS"] and os.getenv(
"NOTIFY_ENVIRONMENT" ) in ["development", "test"]: - current_app.logger.info(hilite("#validate-phone-number fired")) + current_app.logger.info(hilite("#notify-debug-validate-phone-number fired")) aws_pinpoint_client.validate_phone_number("01", recipient) diff --git a/app/job/rest.py b/app/job/rest.py index ed506a5aa..571c1e342 100644 --- a/app/job/rest.py +++ b/app/job/rest.py @@ -242,7 +242,9 @@ def create_job(service_id): original_file_name = data.get("original_file_name") data.update({"service": service_id}) try: - current_app.logger.info(f"#s3-partitioning DATA IN CREATE_JOB: {data}") + current_app.logger.info( + f"#notify-debug-s3-partitioning DATA IN CREATE_JOB: {data}" + ) data.update(**get_job_metadata_from_s3(service_id, data["id"])) except KeyError: raise InvalidRequest( diff --git a/docs/adrs/0013-log-debug-tags.md b/docs/adrs/0013-log-debug-tags.md new file mode 100644 index 000000000..83a88107b --- /dev/null +++ b/docs/adrs/0013-log-debug-tags.md @@ -0,0 +1,38 @@ +# Use of debug search tags for cloudwatch logs + +Status: Accepted +Date: 27 February 2025 + +### Context + +We encountered some recurrent problems in specific parts of the code that re-occurred regularly. + +We found it hard to debug these issues searching through cloudwatch logs as a particular issue +tends to be exploded into multiple log lines, and these lines are not even time sorted by default +so it can get quite hard on the eyes and test one's patience to debug this way. + +### Decision + +We decided to start using unique tags for individual issues so we could quickly search for related +groups of log lines. This worked fairly well and even though some of these issues at long last +have been resolved, it might be worthwhile to continue this debugging pattern. + +### Consequences + +Here are the existing tags and what they are used for: + +#notify-debug-admin-1200: job cache regeneration +#notify-debug-admin-1701: wrong sender phone number +#notify-debug-admin-1859: job creation time reset due to strange SqlAlchemy constructor issue setting wrong created_at time +#notify-debug-api-1385: a separate identify to resolve the wrong sender phone number issue +#notify-debug-s3-partitioning: modify s3 partitioning in line with best aws practices +#notify-debug-validate-phone-number: experimental code to try to validate phone numbers with aws pinpoint + + + + +### Author +@kenkehl + +### Stakeholders +@ccostino diff --git a/tests/app/aws/test_s3.py b/tests/app/aws/test_s3.py index de26f5760..ec5cea861 100644 --- a/tests/app/aws/test_s3.py +++ b/tests/app/aws/test_s3.py @@ -449,9 +449,9 @@ def test_get_s3_files_success(client, mocker): mock_read_s3_file.assert_has_calls(calls, any_order=True) - # mock_current_app.info.assert_any_call("job_cache length before regen: 0 #notify-admin-1200") + # mock_current_app.info.assert_any_call("job_cache length before regen: 0 #notify-debug-admin-1200") - # mock_current_app.info.assert_any_call("job_cache length after regen: 0 #notify-admin-1200") + # mock_current_app.info.assert_any_call("job_cache length after regen: 0 #notify-debug-admin-1200") @patch("app.aws.s3.s3_client", None) # ensure it starts as None diff --git a/tests/app/dao/test_services_get_specific_hours.py b/tests/app/dao/test_services_get_specific_hours.py index b96d442c0..a97dbd150 100644 --- a/tests/app/dao/test_services_get_specific_hours.py +++ b/tests/app/dao/test_services_get_specific_hours.py @@ -7,7 +7,9 @@ from app.dao.services_dao import get_specific_hours_stats from app.enums import StatisticsType from 

-NotificationRow = namedtuple("NotificationRow", ["notification_type", "status", "timestamp", "count"])
+NotificationRow = namedtuple(
+    "NotificationRow", ["notification_type", "status", "timestamp", "count"]
+)


 def generate_expected_hourly_output(requested_sms_hours):
@@ -38,27 +40,31 @@ def create_mock_notification(notification_type, status, timestamp, count=1):
         notification_type=notification_type,
         status=status,
         timestamp=timestamp.replace(minute=0, second=0, microsecond=0),
-        count=count
+        count=count,
     )


 test_cases = [
     (
-        [create_mock_notification(
-            TemplateType.SMS,
-            StatisticsType.REQUESTED,
-            datetime(2025, 2, 18, 14, 15, 0),
-        )],
+        [
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 14, 15, 0),
+            )
+        ],
         datetime(2025, 2, 18, 12, 0),
         6,
         generate_expected_hourly_output(["2025-02-18T14:00:00Z"]),
     ),
     (
-        [create_mock_notification(
-            TemplateType.SMS,
-            StatisticsType.REQUESTED,
-            datetime(2025, 2, 18, 17, 59, 59),
-        )],
+        [
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 17, 59, 59),
+            )
+        ],
         datetime(2025, 2, 18, 15, 0),
         3,
         generate_expected_hourly_output(["2025-02-18T17:00:00Z"]),
@@ -66,21 +72,29 @@ test_cases = [
     ([], datetime(2025, 2, 18, 10, 0), 4, {}),
     (
         [
-            create_mock_notification(TemplateType.SMS, StatisticsType.REQUESTED, datetime(2025, 2, 18, 9, 30, 0)),
-            create_mock_notification(TemplateType.SMS, StatisticsType.REQUESTED, datetime(2025, 2, 18, 11, 45, 0)),
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 9, 30, 0),
+            ),
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 11, 45, 0),
+            ),
         ],
         datetime(2025, 2, 18, 8, 0),
         5,
-        generate_expected_hourly_output(["2025-02-18T09:00:00Z", "2025-02-18T11:00:00Z"]),
+        generate_expected_hourly_output(
+            ["2025-02-18T09:00:00Z", "2025-02-18T11:00:00Z"]
+        ),
     ),
 ]


-@pytest.mark.parametrize("mocked_notifications, start_date, hours, expected_output", test_cases)
+@pytest.mark.parametrize(
+    "mocked_notifications, start_date, hours, expected_output", test_cases
+)
 def test_get_specific_hours(mocked_notifications, start_date, hours, expected_output):
-    results = get_specific_hours_stats(
-        mocked_notifications,
-        start_date,
-        hours=hours
-    )
+    results = get_specific_hours_stats(mocked_notifications, start_date, hours=hours)

     assert results == expected_output, f"Expected {expected_output}, but got {results}"
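
For illustration, the pattern the ADR describes boils down to embedding one unique, searchable tag per issue in every related log line. Below is a minimal sketch, assuming Flask's `current_app.logger` exactly as this patch uses it; the function name, the choice of tag, and the CloudWatch Logs Insights query are illustrative assumptions, not part of the changeset.

```python
# Minimal sketch of the debug-tag pattern described in the ADR (hypothetical
# function; requires an active Flask application context to actually log).
from flask import current_app

DEBUG_TAG = "#notify-debug-admin-1200"  # tag taken from the ADR's tag list


def regenerate_job_cache(job_ids):
    # Every log line related to this issue carries the same unique, greppable tag.
    current_app.logger.info(
        f"job_cache regen started for {len(job_ids)} jobs {DEBUG_TAG}"
    )


# An assumed CloudWatch Logs Insights query to pull together every line for one
# issue and restore time ordering:
#
#   fields @timestamp, @message
#   | filter @message like /#notify-debug-admin-1200/
#   | sort @timestamp asc
```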