From 00259893f1ea271e542b39ebc7a9396b076f3216 Mon Sep 17 00:00:00 2001 From: Leo Hemsted Date: Mon, 7 Mar 2022 18:20:22 +0000 Subject: [PATCH 1/2] automatically retry letters stuck in pending-virus-scan Since sept 2019 we've had to log on to production around once every twenty days to restart the virus scan task for a letter. Most of the time this is just a case of making sure the file is in the scan bucket, and then triggering the task. If the file isn't in the scan bucket we'd need to do some more manual investigation to find out exactly where the file got stuck, but I can only remember times when it's been in the scan bucket. So if the file is in the scan bucket, we can just check that with code and kick the task off automatically. --- app/celery/scheduled_tasks.py | 32 ++++++++++++-- tests/app/celery/test_scheduled_tasks.py | 56 +++++++++++++++++++----- 2 files changed, 75 insertions(+), 13 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index 968a4231d..b38220611 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -8,6 +8,7 @@ from sqlalchemy import between from sqlalchemy.exc import SQLAlchemyError from app import db, notify_celery, zendesk_client +from app.aws import s3 from app.celery.broadcast_message_tasks import trigger_link_test from app.celery.letters_pdf_tasks import get_pdf_for_templated_letter from app.celery.tasks import ( @@ -45,6 +46,7 @@ from app.dao.services_dao import ( dao_find_services_with_high_failure_rates, ) from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago +from app.letters.utils import generate_letter_pdf_filename from app.models import ( EMAIL_TYPE, JOB_STATUS_ERROR, @@ -207,14 +209,38 @@ def replay_created_notifications(): @notify_celery.task(name='check-if-letters-still-pending-virus-check') def check_if_letters_still_pending_virus_check(): - letters = dao_precompiled_letters_still_pending_virus_check() + letters = [] + + for letter in dao_precompiled_letters_still_pending_virus_check(): + # find letter in the scan bucket + filename = generate_letter_pdf_filename( + letter.reference, + letter.created_at, + ignore_folder=True, + postage=letter.postage + ) + + if s3.file_exists(current_app.config['LETTERS_SCAN_BUCKET_NAME'], filename): + current_app.logger.warning( + f'Letter id {letter.id} got stuck in pending-virus-check. Sending off for scan again.' + ) + notify_celery.send_task( + name=TaskNames.SCAN_FILE, + kwargs={'filename': filename}, + queue=QueueNames.ANTIVIRUS, + ) + else: + letters.append(letter) if len(letters) > 0: letter_ids = [(str(letter.id), letter.reference) for letter in letters] - msg = """{} precompiled letters have been pending-virus-check for over 90 minutes. Follow runbook to resolve: + msg = f"""{len(letters)} precompiled letters have been pending-virus-check for over 90 minutes. + We couldn't find them in the scan bucket. + + Follow runbook to resolve: https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-letter-pending-virus-scan-for-90-minutes. - Notifications: {}""".format(len(letters), sorted(letter_ids)) + Notifications: {sorted(letter_ids)}""" if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: ticket = NotifySupportTicket( diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py index edd8cbeee..c7cedc111 100644 --- a/tests/app/celery/test_scheduled_tasks.py +++ b/tests/app/celery/test_scheduled_tasks.py @@ -366,9 +366,43 @@ def test_check_job_status_task_does_not_raise_error(sample_template): @freeze_time("2019-05-30 14:00:00") -def test_check_if_letters_still_pending_virus_check(mocker, sample_letter_template): - mock_logger = mocker.patch('app.celery.tasks.current_app.logger.error') +def test_check_if_letters_still_pending_virus_check_restarts_scan_for_stuck_letters( + mocker, + sample_letter_template +): + mock_file_exists = mocker.patch('app.aws.s3.file_exists', return_value=True) mock_create_ticket = mocker.spy(NotifySupportTicket, '__init__') + mock_celery = mocker.patch('app.celery.scheduled_tasks.notify_celery.send_task') + + create_notification( + template=sample_letter_template, + status=NOTIFICATION_PENDING_VIRUS_CHECK, + created_at=datetime.utcnow() - timedelta(seconds=5401), + reference='one' + ) + expected_filename = 'NOTIFY.ONE.D.2.C.20190530122959.PDF' + + check_if_letters_still_pending_virus_check() + + mock_file_exists.assert_called_once_with('test-letters-scan', expected_filename) + + mock_celery.assert_called_once_with( + name=TaskNames.SCAN_FILE, + kwargs={'filename': expected_filename}, + queue=QueueNames.ANTIVIRUS + ) + + assert mock_create_ticket.called is False + + +@freeze_time("2019-05-30 14:00:00") +def test_check_if_letters_still_pending_virus_check_raises_zendesk_if_files_cant_be_found( + mocker, + sample_letter_template +): + mock_file_exists = mocker.patch('app.aws.s3.file_exists', return_value=False) + mock_create_ticket = mocker.spy(NotifySupportTicket, '__init__') + mock_celery = mocker.patch('app.celery.scheduled_tasks.notify_celery.send_task') mock_send_ticket_to_zendesk = mocker.patch( 'app.celery.scheduled_tasks.zendesk_client.send_ticket_to_zendesk', autospec=True, @@ -391,22 +425,24 @@ def test_check_if_letters_still_pending_virus_check(mocker, sample_letter_templa check_if_letters_still_pending_virus_check() - id_references = sorted([(str(notification_1.id), notification_1.reference), - (str(notification_2.id), notification_2.reference)]) + assert mock_file_exists.call_count == 2 + mock_file_exists.assert_has_calls([ + call('test-letters-scan', 'NOTIFY.ONE.D.2.C.20190530122959.PDF'), + call('test-letters-scan', 'NOTIFY.TWO.D.2.C.20190529183320.PDF'), + ], any_order=True) + assert mock_celery.called is False - message = """2 precompiled letters have been pending-virus-check for over 90 minutes. Follow runbook to resolve: - https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-letter-pending-virus-scan-for-90-minutes. - Notifications: {}""".format(id_references) - - mock_logger.assert_called_once_with(message) mock_create_ticket.assert_called_once_with( ANY, subject='[test] Letters still pending virus check', - message=message, + message=ANY, ticket_type='incident', technical_ticket=True, ticket_categories=['notify_letters'] ) + assert '2 precompiled letters have been pending-virus-check' in mock_create_ticket.call_args.kwargs['message'] + assert f'{(str(notification_1.id), notification_1.reference)}' in mock_create_ticket.call_args.kwargs['message'] + assert f'{(str(notification_2.id), notification_2.reference)}' in mock_create_ticket.call_args.kwargs['message'] mock_send_ticket_to_zendesk.assert_called_once() From 9e8df8b623723cc082a3178224bcc33da57a6dfb Mon Sep 17 00:00:00 2001 From: Leo Hemsted Date: Thu, 10 Mar 2022 14:10:01 +0000 Subject: [PATCH 2/2] remove "letters stuck pending av" runbook there's not anything we know we need to do now that we resolve stuck letters automatically. Letters couuld still get into this state, so it's worth alerting us. However, we don't have anything concrete that we know how to fix these letters, so we should just remove the runbook entirely. --- app/celery/scheduled_tasks.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py index b38220611..d02faa0ab 100644 --- a/app/celery/scheduled_tasks.py +++ b/app/celery/scheduled_tasks.py @@ -236,10 +236,9 @@ def check_if_letters_still_pending_virus_check(): letter_ids = [(str(letter.id), letter.reference) for letter in letters] msg = f"""{len(letters)} precompiled letters have been pending-virus-check for over 90 minutes. - We couldn't find them in the scan bucket. + We couldn't find them in the scan bucket. We'll need to find out where the files are and kick them off + again or move them to technical failure. - Follow runbook to resolve: - https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-letter-pending-virus-scan-for-90-minutes. Notifications: {sorted(letter_ids)}""" if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: