From 4cfb980ac66f23f49dae4694cca1be9061a98aa9 Mon Sep 17 00:00:00 2001
From: Kenneth Kehl <@kkehl@flexion.us>
Date: Thu, 27 Feb 2025 13:24:32 -0800
Subject: [PATCH] add debug tag adr

---
 app/aws/s3.py                                | 14 ++---
 app/clients/pinpoint/aws_pinpoint.py         |  2 +-
 app/clients/sms/aws_sns.py                   |  9 +--
 app/dao/jobs_dao.py                          |  6 +-
 app/dao/services_dao.py                      | 25 ++++++---
 app/delivery/send_to_providers.py            |  4 +-
 app/job/rest.py                              |  4 +-
 docs/adrs/0013-log-debug-tags.md             | 38 +++++++++++++
 tests/app/aws/test_s3.py                     |  4 +-
 .../dao/test_services_get_specific_hours.py  | 56 ++++++++++++-------
 10 files changed, 113 insertions(+), 49 deletions(-)
 create mode 100644 docs/adrs/0013-log-debug-tags.md

diff --git a/app/aws/s3.py b/app/aws/s3.py
index e8841b20c..bc1728541 100644
--- a/app/aws/s3.py
+++ b/app/aws/s3.py
@@ -118,7 +118,7 @@ def list_s3_objects():
                 break
     except Exception:
         current_app.logger.exception(
-            "An error occurred while regenerating cache #notify-admin-1200",
+            "An error occurred while regenerating cache #notify-debug-admin-1200",
         )
@@ -200,7 +200,7 @@ def read_s3_file(bucket_name, object_key, s3res):
     except LookupError:
         # perhaps our key is not formatted as we expected. If so skip it.
-        current_app.logger.exception("LookupError #notify-admin-1200")
+        current_app.logger.exception("LookupError #notify-debug-admin-1200")


 def get_s3_files():
@@ -213,7 +213,7 @@
     s3res = get_s3_resource()
     current_app.logger.info(
-        f"job_cache length before regen: {len_job_cache()} #notify-admin-1200"
+        f"job_cache length before regen: {len_job_cache()} #notify-debug-admin-1200"
     )
     try:
         with ThreadPoolExecutor() as executor:
@@ -222,7 +222,7 @@
         current_app.logger.exception("Connection pool issue")
     current_app.logger.info(
-        f"job_cache length after regen: {len_job_cache()} #notify-admin-1200"
+        f"job_cache length after regen: {len_job_cache()} #notify-debug-admin-1200"
     )
@@ -290,7 +290,7 @@ def file_exists(file_location):

 def get_job_location(service_id, job_id):
     current_app.logger.debug(
-        f"#s3-partitioning NEW JOB_LOCATION: {NEW_FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
+        f"#notify-debug-s3-partitioning NEW JOB_LOCATION: {NEW_FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
     )
     return (
         current_app.config["CSV_UPLOAD_BUCKET"]["bucket"],
@@ -308,7 +308,7 @@ def get_old_job_location(service_id, job_id):
     Remove this when everything works with the NEW_FILE_LOCATION_STRUCTURE.
""" current_app.logger.debug( - f"#s3-partitioning OLD JOB LOCATION: {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}" + f"#notify-debug-s3-partitioning OLD JOB LOCATION: {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}" ) return ( current_app.config["CSV_UPLOAD_BUCKET"]["bucket"], @@ -507,7 +507,7 @@ def get_personalisation_from_s3(service_id, job_id, job_row_number): def get_job_metadata_from_s3(service_id, job_id): current_app.logger.debug( - f"#s3-partitioning CALLING GET_JOB_METADATA with {service_id}, {job_id}" + f"#notify-debug-s3-partitioning CALLING GET_JOB_METADATA with {service_id}, {job_id}" ) obj = get_s3_object(*get_job_location(service_id, job_id)) return obj.get()["Metadata"] diff --git a/app/clients/pinpoint/aws_pinpoint.py b/app/clients/pinpoint/aws_pinpoint.py index d15d94601..3b4de7682 100644 --- a/app/clients/pinpoint/aws_pinpoint.py +++ b/app/clients/pinpoint/aws_pinpoint.py @@ -39,7 +39,7 @@ class AwsPinpointClient(Client): current_app.logger.info(hilite(response)) except ClientError: current_app.logger.exception( - "#validate-phone-number Could not validate with pinpoint" + "#notify-debug-validate-phone-number Could not validate with pinpoint" ) # TODO This is the structure of the response. When the phone validation diff --git a/app/clients/sms/aws_sns.py b/app/clients/sms/aws_sns.py index d36af600c..3c2282752 100644 --- a/app/clients/sms/aws_sns.py +++ b/app/clients/sms/aws_sns.py @@ -68,15 +68,16 @@ class AwsSnsClient(SmsClient): non_scrubbable = " ".join(sender) self.current_app.logger.info( - f"notify-api-1385 sender {non_scrubbable} is a {type(sender)} default is a {type(default_num)}" + f"notify-debug-api-1385 sender {non_scrubbable} is a {type(sender)} \ + default is a {type(default_num)}" ) else: self.current_app.logger.warning( - f"notify-api-1385 sender is type {type(sender)}!! {sender}" + f"notify-debug-api-1385 sender is type {type(sender)}!! {sender}" ) if self._valid_sender_number(sender): self.current_app.logger.info( - f"notify-api-1385 use valid sender {non_scrubbable} instead of default {default_num}" + f"notify-debug-api-1385 use valid sender {non_scrubbable} instead of default {default_num}" ) attributes["AWS.MM.SMS.OriginationNumber"] = { @@ -85,7 +86,7 @@ class AwsSnsClient(SmsClient): } else: self.current_app.logger.info( - f"notify-api-1385 use default {default_num} instead of invalid sender" + f"notify-debug-api-1385 use default {default_num} instead of invalid sender" ) attributes["AWS.MM.SMS.OriginationNumber"] = { diff --git a/app/dao/jobs_dao.py b/app/dao/jobs_dao.py index feec601a4..2e479a77d 100644 --- a/app/dao/jobs_dao.py +++ b/app/dao/jobs_dao.py @@ -158,17 +158,17 @@ def dao_create_job(job): now_time = utc_now() diff_time = now_time - orig_time current_app.logger.info( - f"#notify-admin-1859 dao_create_job orig created at {orig_time} and now {now_time}" + f"#notify-debug-admin-1859 dao_create_job orig created at {orig_time} and now {now_time}" ) if diff_time.total_seconds() > 300: # It should be only a few seconds diff at most current_app.logger.error( - "#notify-admin-1859 Something is wrong with job.created_at!" + "#notify-debug-admin-1859 Something is wrong with job.created_at!" 
            )
             if os.getenv("NOTIFY_ENVIRONMENT") not in ["test"]:
                 job.created_at = now_time
                 dao_update_job(job)
                 current_app.logger.error(
-                    f"#notify-admin-1859 Job created_at reset to {job.created_at}"
+                    f"#notify-debug-admin-1859 Job created_at reset to {job.created_at}"
                 )
diff --git a/app/dao/services_dao.py b/app/dao/services_dao.py
index 1c8a5e157..a7db2415d 100644
--- a/app/dao/services_dao.py
+++ b/app/dao/services_dao.py
@@ -534,7 +534,9 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
     # Update to group by HOUR instead of DAY
     total_substmt = (
         select(
-            func.date_trunc("hour", NotificationAllTimeView.created_at).label("hour"),  # UPDATED
+            func.date_trunc("hour", NotificationAllTimeView.created_at).label(
+                "hour"
+            ),  # UPDATED
             Job.notification_count.label("notification_count"),
         )
         .join(Job, NotificationAllTimeView.job_id == Job.id)
@@ -556,11 +558,14 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
     total_stmt = select(
         total_substmt.c.hour,  # UPDATED
         func.sum(total_substmt.c.notification_count).label("total_notifications"),
-    ).group_by(total_substmt.c.hour)  # UPDATED
+    ).group_by(
+        total_substmt.c.hour
+    )  # UPDATED

     # Ensure we're using hourly timestamps in the response
     total_notifications = {
-        row.hour: row.total_notifications for row in db.session.execute(total_stmt).all()
+        row.hour: row.total_notifications
+        for row in db.session.execute(total_stmt).all()
     }

     # Update the second query to also use "hour"
@@ -568,7 +573,9 @@ def dao_fetch_stats_for_service_from_hours(service_id, start_date, end_date):
         select(
             NotificationAllTimeView.notification_type,
             NotificationAllTimeView.status,
-            func.date_trunc("hour", NotificationAllTimeView.created_at).label("hour"),  # UPDATED
+            func.date_trunc("hour", NotificationAllTimeView.created_at).label(
+                "hour"
+            ),  # UPDATED
             func.count(NotificationAllTimeView.id).label("count"),
         )
         .where(
@@ -895,7 +902,9 @@ def get_specific_days_stats(
     return stats


-def get_specific_hours_stats(data, start_date, hours=None, end_date=None, total_notifications=None):
+def get_specific_hours_stats(
+    data, start_date, hours=None, end_date=None, total_notifications=None
+):
     if hours is not None and end_date is not None:
         raise ValueError("Only set hours OR set end_date, not both.")
     elif hours is not None:
@@ -919,10 +928,10 @@ def get_specific_hours_stats(data, start_date, hours=None, end_date=None, total_
     # Format statistics, returning only hours with results
     stats = {
         hour.strftime("%Y-%m-%dT%H:00:00Z"): statistics.format_statistics(
-            rows,
-            total_notifications.get(hour, 0) if total_notifications else None
+            rows, total_notifications.get(hour, 0) if total_notifications else None
         )
-        for hour, rows in grouped_data.items() if rows
+        for hour, rows in grouped_data.items()
+        if rows
     }

     return stats
diff --git a/app/delivery/send_to_providers.py b/app/delivery/send_to_providers.py
index 515d418e7..ef42aecc5 100644
--- a/app/delivery/send_to_providers.py
+++ b/app/delivery/send_to_providers.py
@@ -107,7 +107,7 @@ def send_sms_to_provider(notification):
         sender_numbers = get_sender_numbers(notification)
         if notification.reply_to_text not in sender_numbers:
             raise ValueError(
-                f"{notification.reply_to_text} not in {sender_numbers} #notify-admin-1701"
+                f"{notification.reply_to_text} not in {sender_numbers} #notify-debug-admin-1701"
             )

         send_sms_kwargs = {
@@ -152,7 +152,7 @@ def _experimentally_validate_phone_numbers(recipient):
     if recipient_lookup in current_app.config["SIMULATED_SMS_NUMBERS"] and os.getenv(
"NOTIFY_ENVIRONMENT" ) in ["development", "test"]: - current_app.logger.info(hilite("#validate-phone-number fired")) + current_app.logger.info(hilite("#notify-debug-validate-phone-number fired")) aws_pinpoint_client.validate_phone_number("01", recipient) diff --git a/app/job/rest.py b/app/job/rest.py index ed506a5aa..571c1e342 100644 --- a/app/job/rest.py +++ b/app/job/rest.py @@ -242,7 +242,9 @@ def create_job(service_id): original_file_name = data.get("original_file_name") data.update({"service": service_id}) try: - current_app.logger.info(f"#s3-partitioning DATA IN CREATE_JOB: {data}") + current_app.logger.info( + f"#notify-debug-s3-partitioning DATA IN CREATE_JOB: {data}" + ) data.update(**get_job_metadata_from_s3(service_id, data["id"])) except KeyError: raise InvalidRequest( diff --git a/docs/adrs/0013-log-debug-tags.md b/docs/adrs/0013-log-debug-tags.md new file mode 100644 index 000000000..83a88107b --- /dev/null +++ b/docs/adrs/0013-log-debug-tags.md @@ -0,0 +1,38 @@ +# Use of debug search tags for cloudwatch logs + +Status: Accepted +Date: 27 February 2025 + +### Context + +We encountered some recurrent problems in specific parts of the code that re-occurred regularly. + +We found it hard to debug these issues searching through cloudwatch logs as a particular issue +tends to be exploded into multiple log lines, and these lines are not even time sorted by default +so it can get quite hard on the eyes and test one's patience to debug this way. + +### Decision + +We decided to start using unique tags for individual issues so we could quickly search for related +groups of log lines. This worked fairly well and even though some of these issues at long last +have been resolved, it might be worthwhile to continue this debugging pattern. + +### Consequences + +Here are the existing tags and what they are used for: + +#notify-debug-admin-1200: job cache regeneration +#notify-debug-admin-1701: wrong sender phone number +#notify-debug-admin-1859: job creation time reset due to strange SqlAlchemy constructor issue setting wrong created_at time +#notify-debug-api-1385: a separate identify to resolve the wrong sender phone number issue +#notify-debug-s3-partitioning: modify s3 partitioning in line with best aws practices +#notify-debug-validate-phone-number: experimental code to try to validate phone numbers with aws pinpoint + + + + +### Author +@kenkehl + +### Stakeholders +@ccostino diff --git a/tests/app/aws/test_s3.py b/tests/app/aws/test_s3.py index de26f5760..ec5cea861 100644 --- a/tests/app/aws/test_s3.py +++ b/tests/app/aws/test_s3.py @@ -449,9 +449,9 @@ def test_get_s3_files_success(client, mocker): mock_read_s3_file.assert_has_calls(calls, any_order=True) - # mock_current_app.info.assert_any_call("job_cache length before regen: 0 #notify-admin-1200") + # mock_current_app.info.assert_any_call("job_cache length before regen: 0 #notify-debug-admin-1200") - # mock_current_app.info.assert_any_call("job_cache length after regen: 0 #notify-admin-1200") + # mock_current_app.info.assert_any_call("job_cache length after regen: 0 #notify-debug-admin-1200") @patch("app.aws.s3.s3_client", None) # ensure it starts as None diff --git a/tests/app/dao/test_services_get_specific_hours.py b/tests/app/dao/test_services_get_specific_hours.py index b96d442c0..a97dbd150 100644 --- a/tests/app/dao/test_services_get_specific_hours.py +++ b/tests/app/dao/test_services_get_specific_hours.py @@ -7,7 +7,9 @@ from app.dao.services_dao import get_specific_hours_stats from app.enums import StatisticsType from 

-NotificationRow = namedtuple("NotificationRow", ["notification_type", "status", "timestamp", "count"])
+NotificationRow = namedtuple(
+    "NotificationRow", ["notification_type", "status", "timestamp", "count"]
+)


 def generate_expected_hourly_output(requested_sms_hours):
@@ -38,27 +40,31 @@ def create_mock_notification(notification_type, status, timestamp, count=1):
         notification_type=notification_type,
         status=status,
         timestamp=timestamp.replace(minute=0, second=0, microsecond=0),
-        count=count
+        count=count,
     )


 test_cases = [
     (
-        [create_mock_notification(
-            TemplateType.SMS,
-            StatisticsType.REQUESTED,
-            datetime(2025, 2, 18, 14, 15, 0),
-        )],
+        [
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 14, 15, 0),
+            )
+        ],
         datetime(2025, 2, 18, 12, 0),
         6,
         generate_expected_hourly_output(["2025-02-18T14:00:00Z"]),
     ),
     (
-        [create_mock_notification(
-            TemplateType.SMS,
-            StatisticsType.REQUESTED,
-            datetime(2025, 2, 18, 17, 59, 59),
-        )],
+        [
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 17, 59, 59),
+            )
+        ],
         datetime(2025, 2, 18, 15, 0),
         3,
         generate_expected_hourly_output(["2025-02-18T17:00:00Z"]),
@@ -66,21 +72,29 @@ test_cases = [
     ([], datetime(2025, 2, 18, 10, 0), 4, {}),
     (
         [
-            create_mock_notification(TemplateType.SMS, StatisticsType.REQUESTED, datetime(2025, 2, 18, 9, 30, 0)),
-            create_mock_notification(TemplateType.SMS, StatisticsType.REQUESTED, datetime(2025, 2, 18, 11, 45, 0)),
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 9, 30, 0),
+            ),
+            create_mock_notification(
+                TemplateType.SMS,
+                StatisticsType.REQUESTED,
+                datetime(2025, 2, 18, 11, 45, 0),
+            ),
         ],
         datetime(2025, 2, 18, 8, 0),
         5,
-        generate_expected_hourly_output(["2025-02-18T09:00:00Z", "2025-02-18T11:00:00Z"]),
+        generate_expected_hourly_output(
+            ["2025-02-18T09:00:00Z", "2025-02-18T11:00:00Z"]
+        ),
     ),
 ]


-@pytest.mark.parametrize("mocked_notifications, start_date, hours, expected_output", test_cases)
+@pytest.mark.parametrize(
+    "mocked_notifications, start_date, hours, expected_output", test_cases
+)
 def test_get_specific_hours(mocked_notifications, start_date, hours, expected_output):
-    results = get_specific_hours_stats(
-        mocked_notifications,
-        start_date,
-        hours=hours
-    )
+    results = get_specific_hours_stats(mocked_notifications, start_date, hours=hours)

     assert results == expected_output, f"Expected {expected_output}, but got {results}"
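
For illustration, the pattern the ADR describes boils down to embedding one unique, searchable tag per issue in every related log line. Below is a minimal sketch, assuming Flask's `current_app.logger` exactly as this patch uses it; the function name, the choice of tag, and the CloudWatch Logs Insights query are illustrative assumptions, not part of the changeset.

```python
# Minimal sketch of the debug-tag pattern described in the ADR (hypothetical
# function; requires an active Flask application context to actually log).
from flask import current_app

DEBUG_TAG = "#notify-debug-admin-1200"  # tag taken from the ADR's tag list


def regenerate_job_cache(job_ids):
    # Every log line related to this issue carries the same unique, greppable tag.
    current_app.logger.info(
        f"job_cache regen started for {len(job_ids)} jobs {DEBUG_TAG}"
    )


# An assumed CloudWatch Logs Insights query to pull together every line for one
# issue and restore time ordering:
#
#   fields @timestamp, @message
#   | filter @message like /#notify-debug-admin-1200/
#   | sort @timestamp asc
```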