automatically retry letters stuck in pending-virus-scan

Since sept 2019 we've had to log on to production around once every
twenty days to restart the virus scan task for a letter. Most of the
time this is just a case of making sure the file is in the scan bucket,
and then triggering the task. If the file isn't in the scan bucket we'd
need to do some more manual investigation to find out exactly where the
file got stuck, but I can only remember times when it's been in the scan
bucket.

So if the file is in the scan bucket, we can just check that with code
and kick the task off automatically.
This commit is contained in:
Leo Hemsted
2022-03-07 18:20:22 +00:00
parent 520d621893
commit 00259893f1
2 changed files with 75 additions and 13 deletions

View File

@@ -8,6 +8,7 @@ from sqlalchemy import between
from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.exc import SQLAlchemyError
from app import db, notify_celery, zendesk_client from app import db, notify_celery, zendesk_client
from app.aws import s3
from app.celery.broadcast_message_tasks import trigger_link_test from app.celery.broadcast_message_tasks import trigger_link_test
from app.celery.letters_pdf_tasks import get_pdf_for_templated_letter from app.celery.letters_pdf_tasks import get_pdf_for_templated_letter
from app.celery.tasks import ( from app.celery.tasks import (
@@ -45,6 +46,7 @@ from app.dao.services_dao import (
dao_find_services_with_high_failure_rates, dao_find_services_with_high_failure_rates,
) )
from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago from app.dao.users_dao import delete_codes_older_created_more_than_a_day_ago
from app.letters.utils import generate_letter_pdf_filename
from app.models import ( from app.models import (
EMAIL_TYPE, EMAIL_TYPE,
JOB_STATUS_ERROR, JOB_STATUS_ERROR,
@@ -207,14 +209,38 @@ def replay_created_notifications():
@notify_celery.task(name='check-if-letters-still-pending-virus-check') @notify_celery.task(name='check-if-letters-still-pending-virus-check')
def check_if_letters_still_pending_virus_check(): def check_if_letters_still_pending_virus_check():
letters = dao_precompiled_letters_still_pending_virus_check() letters = []
for letter in dao_precompiled_letters_still_pending_virus_check():
# find letter in the scan bucket
filename = generate_letter_pdf_filename(
letter.reference,
letter.created_at,
ignore_folder=True,
postage=letter.postage
)
if s3.file_exists(current_app.config['LETTERS_SCAN_BUCKET_NAME'], filename):
current_app.logger.warning(
f'Letter id {letter.id} got stuck in pending-virus-check. Sending off for scan again.'
)
notify_celery.send_task(
name=TaskNames.SCAN_FILE,
kwargs={'filename': filename},
queue=QueueNames.ANTIVIRUS,
)
else:
letters.append(letter)
if len(letters) > 0: if len(letters) > 0:
letter_ids = [(str(letter.id), letter.reference) for letter in letters] letter_ids = [(str(letter.id), letter.reference) for letter in letters]
msg = """{} precompiled letters have been pending-virus-check for over 90 minutes. Follow runbook to resolve: msg = f"""{len(letters)} precompiled letters have been pending-virus-check for over 90 minutes.
We couldn't find them in the scan bucket.
Follow runbook to resolve:
https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-letter-pending-virus-scan-for-90-minutes. https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-letter-pending-virus-scan-for-90-minutes.
Notifications: {}""".format(len(letters), sorted(letter_ids)) Notifications: {sorted(letter_ids)}"""
if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']: if current_app.config['NOTIFY_ENVIRONMENT'] in ['live', 'production', 'test']:
ticket = NotifySupportTicket( ticket = NotifySupportTicket(

View File

@@ -366,9 +366,43 @@ def test_check_job_status_task_does_not_raise_error(sample_template):
@freeze_time("2019-05-30 14:00:00") @freeze_time("2019-05-30 14:00:00")
def test_check_if_letters_still_pending_virus_check(mocker, sample_letter_template): def test_check_if_letters_still_pending_virus_check_restarts_scan_for_stuck_letters(
mock_logger = mocker.patch('app.celery.tasks.current_app.logger.error') mocker,
sample_letter_template
):
mock_file_exists = mocker.patch('app.aws.s3.file_exists', return_value=True)
mock_create_ticket = mocker.spy(NotifySupportTicket, '__init__') mock_create_ticket = mocker.spy(NotifySupportTicket, '__init__')
mock_celery = mocker.patch('app.celery.scheduled_tasks.notify_celery.send_task')
create_notification(
template=sample_letter_template,
status=NOTIFICATION_PENDING_VIRUS_CHECK,
created_at=datetime.utcnow() - timedelta(seconds=5401),
reference='one'
)
expected_filename = 'NOTIFY.ONE.D.2.C.20190530122959.PDF'
check_if_letters_still_pending_virus_check()
mock_file_exists.assert_called_once_with('test-letters-scan', expected_filename)
mock_celery.assert_called_once_with(
name=TaskNames.SCAN_FILE,
kwargs={'filename': expected_filename},
queue=QueueNames.ANTIVIRUS
)
assert mock_create_ticket.called is False
@freeze_time("2019-05-30 14:00:00")
def test_check_if_letters_still_pending_virus_check_raises_zendesk_if_files_cant_be_found(
mocker,
sample_letter_template
):
mock_file_exists = mocker.patch('app.aws.s3.file_exists', return_value=False)
mock_create_ticket = mocker.spy(NotifySupportTicket, '__init__')
mock_celery = mocker.patch('app.celery.scheduled_tasks.notify_celery.send_task')
mock_send_ticket_to_zendesk = mocker.patch( mock_send_ticket_to_zendesk = mocker.patch(
'app.celery.scheduled_tasks.zendesk_client.send_ticket_to_zendesk', 'app.celery.scheduled_tasks.zendesk_client.send_ticket_to_zendesk',
autospec=True, autospec=True,
@@ -391,22 +425,24 @@ def test_check_if_letters_still_pending_virus_check(mocker, sample_letter_templa
check_if_letters_still_pending_virus_check() check_if_letters_still_pending_virus_check()
id_references = sorted([(str(notification_1.id), notification_1.reference), assert mock_file_exists.call_count == 2
(str(notification_2.id), notification_2.reference)]) mock_file_exists.assert_has_calls([
call('test-letters-scan', 'NOTIFY.ONE.D.2.C.20190530122959.PDF'),
call('test-letters-scan', 'NOTIFY.TWO.D.2.C.20190529183320.PDF'),
], any_order=True)
assert mock_celery.called is False
message = """2 precompiled letters have been pending-virus-check for over 90 minutes. Follow runbook to resolve:
https://github.com/alphagov/notifications-manuals/wiki/Support-Runbook#Deal-with-letter-pending-virus-scan-for-90-minutes.
Notifications: {}""".format(id_references)
mock_logger.assert_called_once_with(message)
mock_create_ticket.assert_called_once_with( mock_create_ticket.assert_called_once_with(
ANY, ANY,
subject='[test] Letters still pending virus check', subject='[test] Letters still pending virus check',
message=message, message=ANY,
ticket_type='incident', ticket_type='incident',
technical_ticket=True, technical_ticket=True,
ticket_categories=['notify_letters'] ticket_categories=['notify_letters']
) )
assert '2 precompiled letters have been pending-virus-check' in mock_create_ticket.call_args.kwargs['message']
assert f'{(str(notification_1.id), notification_1.reference)}' in mock_create_ticket.call_args.kwargs['message']
assert f'{(str(notification_2.id), notification_2.reference)}' in mock_create_ticket.call_args.kwargs['message']
mock_send_ticket_to_zendesk.assert_called_once() mock_send_ticket_to_zendesk.assert_called_once()