From 001f84b0b865cafc7c9db4d1c039f8cac9aaa5ab Mon Sep 17 00:00:00 2001 From: Leo Hemsted Date: Tue, 21 Aug 2018 18:02:17 +0100 Subject: [PATCH 1/2] Sanitise PDFs after virus scan Notify antivirus, on success, calls the process_virus_scan_passed taks. Previously, this task would: * update status to created (or delivered for test keys) * copy the file from the scan bucket to either the live or test bucket based on the results * delete the old file in the scan bucket We want it to: * download file from scan bucket * sanitise PDF using new template-preview functionality * if sanitise failed, set to new status "validation-failed" and save the pdf somewhere. * send new pdf to live/test bucket * update status to created (or delivered for test keys) * delete the original file in the scan bucket This PR does some of that: * download file from scan bucket * sanitise PDF using new template-preview functionality * if sanitise failed, just log. * send OLD pdf to live/test bucket * update status to created (or delivered for test keys) * delete the original file in the scan bucket So if sanitising fails, we won't fall over and not deliver the letter, we'll just log a message for now. If sanitise throws an unexpected error (as opposed to a 400), we'll retry up to fifteen times (the same as when creating a new letter). I've added the code for using the sanitised pdf, but it's commented out for now --- app/celery/letters_pdf_tasks.py | 81 +++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 8 deletions(-) diff --git a/app/celery/letters_pdf_tasks.py b/app/celery/letters_pdf_tasks.py index d81ecb398..17a557794 100644 --- a/app/celery/letters_pdf_tasks.py +++ b/app/celery/letters_pdf_tasks.py @@ -9,6 +9,7 @@ from requests import ( ) from notifications_utils.statsd_decorators import statsd +from notifications_utils.s3 import s3upload from app import notify_celery from app.aws import s3 @@ -24,9 +25,11 @@ from app.dao.notifications_dao import ( from app.errors import VirusScanError from app.letters.utils import ( get_reference_from_filename, - move_scanned_pdf_to_test_or_live_pdf_bucket, + get_folder_name, upload_letter_pdf, - move_failed_pdf, ScanErrorType, move_error_pdf_to_scan_bucket, + move_failed_pdf, + ScanErrorType, + move_error_pdf_to_scan_bucket, get_file_names_from_error_bucket ) from app.models import ( @@ -34,7 +37,8 @@ from app.models import ( NOTIFICATION_CREATED, NOTIFICATION_DELIVERED, NOTIFICATION_VIRUS_SCAN_FAILED, - NOTIFICATION_TECHNICAL_FAILURE + NOTIFICATION_TECHNICAL_FAILURE, + # NOTIFICATION_VALIDATION_FAILED ) @@ -163,22 +167,83 @@ def letter_in_created_state(filename): return False -@notify_celery.task(name='process-virus-scan-passed') -def process_virus_scan_passed(filename): +@notify_celery.task(bind=True, name='process-virus-scan-passed', max_retries=15, default_retry_delay=300) +def process_virus_scan_passed(self, filename): reference = get_reference_from_filename(filename) notification = dao_get_notification_by_reference(reference) current_app.logger.info('notification id {} Virus scan passed: {}'.format(notification.id, filename)) is_test_key = notification.key_type == KEY_TYPE_TEST - move_scanned_pdf_to_test_or_live_pdf_bucket( + + scan_pdf_object = s3.get_s3_object(current_app.config['LETTERS_SCAN_BUCKET_NAME'], filename) + old_pdf = scan_pdf_object.get()['Body'].read() + + new_pdf = _sanitise_precomiled_pdf(self, notification, old_pdf) + + if not new_pdf: + current_app.logger.info('Invalid precompiled pdf received {} ({})'.format(notification.id, filename)) + # update_notification_status_by_id(notification.id, NOTIFICATION_VALIDATION_FAILED) + # move_scan_to_invalid_pdf_bucket() # TODO: implement this (and create bucket etc) + # scan_pdf_object.delete() + # return + + current_app.logger.info('notification id {} ({}) sanitised and ready to send'.format(notification.id, filename)) + + # temporarily upload original pdf while testing sanitise flow. + _upload_pdf_to_test_or_live_pdf_bucket( + old_pdf, # TODO: change to new_pdf filename, - is_test_letter=is_test_key - ) + is_test_letter=is_test_key) + update_letter_pdf_status( reference, NOTIFICATION_DELIVERED if is_test_key else NOTIFICATION_CREATED ) + scan_pdf_object.delete() + + +def _upload_pdf_to_test_or_live_pdf_bucket(pdf_data, filename, is_test_letter): + target_bucket_config = 'TEST_LETTERS_BUCKET_NAME' if is_test_letter else 'LETTERS_PDF_BUCKET_NAME' + target_bucket_name = current_app.config[target_bucket_config] + target_filename = get_folder_name(datetime.utcnow(), is_test_letter) + filename + + s3upload( + filedata=pdf_data, + region=current_app.config['AWS_REGION'], + bucket_name=target_bucket_name, + file_location=target_filename + ) + + +def _sanitise_precomiled_pdf(self, notification, precompiled_pdf): + try: + resp = requests_post( + '{}/precompiled/sanitise'.format( + current_app.config['TEMPLATE_PREVIEW_API_HOST'] + ), + data=precompiled_pdf, + headers={'Authorization': 'Token {}'.format(current_app.config['TEMPLATE_PREVIEW_API_KEY'])} + ) + resp.raise_for_status() + return resp.content + except RequestException as ex: + if ex.status_code == 400: + # validation error + return None + + try: + current_app.logger.exception( + "sanitise_precomiled_pdf failed for notification: {}".format(notification.id) + ) + self.retry(queue=QueueNames.RETRY) + except self.MaxRetriesExceededError: + current_app.logger.exception( + "RETRY FAILED: sanitise_precomiled_pdf failed for notification {}".format(notification.id), + ) + update_notification_status_by_id(notification.id, NOTIFICATION_TECHNICAL_FAILURE) + raise + @notify_celery.task(name='process-virus-scan-failed') def process_virus_scan_failed(filename): From 40c537a9f5dfd5a521490aaf804cf9c100dff982 Mon Sep 17 00:00:00 2001 From: Leo Hemsted Date: Fri, 24 Aug 2018 15:12:02 +0100 Subject: [PATCH 2/2] add tests --- app/celery/letters_pdf_tasks.py | 8 +- app/letters/utils.py | 10 --- tests/app/celery/test_letters_pdf_tasks.py | 90 ++++++++++++++++++---- tests/app/letters/test_letter_utils.py | 27 ------- 4 files changed, 81 insertions(+), 54 deletions(-) diff --git a/app/celery/letters_pdf_tasks.py b/app/celery/letters_pdf_tasks.py index 17a557794..8a65ba333 100644 --- a/app/celery/letters_pdf_tasks.py +++ b/app/celery/letters_pdf_tasks.py @@ -7,7 +7,7 @@ from requests import ( post as requests_post, RequestException ) - +from celery.exceptions import MaxRetriesExceededError from notifications_utils.statsd_decorators import statsd from notifications_utils.s3 import s3upload @@ -70,7 +70,7 @@ def create_letters_pdf(self, notification_id): "Letters PDF notification creation for id: {} failed".format(notification_id) ) self.retry(queue=QueueNames.RETRY) - except self.MaxRetriesExceededError: + except MaxRetriesExceededError: current_app.logger.exception( "RETRY FAILED: task create_letters_pdf failed for notification {}".format(notification_id), ) @@ -228,7 +228,7 @@ def _sanitise_precomiled_pdf(self, notification, precompiled_pdf): resp.raise_for_status() return resp.content except RequestException as ex: - if ex.status_code == 400: + if ex.response is not None and ex.response.status_code == 400: # validation error return None @@ -237,7 +237,7 @@ def _sanitise_precomiled_pdf(self, notification, precompiled_pdf): "sanitise_precomiled_pdf failed for notification: {}".format(notification.id) ) self.retry(queue=QueueNames.RETRY) - except self.MaxRetriesExceededError: + except MaxRetriesExceededError: current_app.logger.exception( "RETRY FAILED: sanitise_precomiled_pdf failed for notification {}".format(notification.id), ) diff --git a/app/letters/utils.py b/app/letters/utils.py index 8b9248790..e4f163dea 100644 --- a/app/letters/utils.py +++ b/app/letters/utils.py @@ -90,16 +90,6 @@ def upload_letter_pdf(notification, pdf_data, precompiled=False): return upload_file_name -def move_scanned_pdf_to_test_or_live_pdf_bucket(source_filename, is_test_letter=False): - source_bucket_name = current_app.config['LETTERS_SCAN_BUCKET_NAME'] - target_bucket_config = 'TEST_LETTERS_BUCKET_NAME' if is_test_letter else 'LETTERS_PDF_BUCKET_NAME' - target_bucket_name = current_app.config[target_bucket_config] - - target_filename = get_folder_name(datetime.utcnow(), is_test_letter) + source_filename - - _move_s3_object(source_bucket_name, source_filename, target_bucket_name, target_filename) - - def move_failed_pdf(source_filename, scan_error_type): scan_bucket = current_app.config['LETTERS_SCAN_BUCKET_NAME'] diff --git a/tests/app/celery/test_letters_pdf_tasks.py b/tests/app/celery/test_letters_pdf_tasks.py index ea1c73787..88262961c 100644 --- a/tests/app/celery/test_letters_pdf_tasks.py +++ b/tests/app/celery/test_letters_pdf_tasks.py @@ -1,12 +1,13 @@ +from unittest.mock import Mock, call, ANY + +import boto3 +from moto import mock_s3 from flask import current_app - -from unittest.mock import call - from freezegun import freeze_time import pytest import requests_mock from botocore.exceptions import ClientError -from celery.exceptions import MaxRetriesExceededError +from celery.exceptions import MaxRetriesExceededError, Retry from requests import RequestException from sqlalchemy.orm.exc import NoResultFound @@ -19,7 +20,9 @@ from app.celery.letters_pdf_tasks import ( letter_in_created_state, process_virus_scan_passed, process_virus_scan_failed, - process_virus_scan_error, replay_letters_in_error + process_virus_scan_error, + replay_letters_in_error, + _sanitise_precomiled_pdf ) from app.letters.utils import get_letter_pdf_filename, ScanErrorType from app.models import ( @@ -320,22 +323,45 @@ def test_letter_in_created_state_fails_if_notification_doesnt_exist(sample_notif assert letter_in_created_state(filename) is False -@pytest.mark.parametrize('key_type,is_test_letter,noti_status', [ - (KEY_TYPE_NORMAL, False, NOTIFICATION_CREATED), - (KEY_TYPE_TEST, True, NOTIFICATION_DELIVERED) +@freeze_time('2018-01-01 18:00') +@mock_s3 +@pytest.mark.parametrize('key_type,is_test_letter,noti_status,bucket_config_name,destination_folder', [ + (KEY_TYPE_NORMAL, False, NOTIFICATION_CREATED, 'LETTERS_PDF_BUCKET_NAME', '2018-01-02/'), + (KEY_TYPE_TEST, True, NOTIFICATION_DELIVERED, 'TEST_LETTERS_BUCKET_NAME', '') ]) def test_process_letter_task_check_virus_scan_passed( - sample_letter_notification, mocker, key_type, is_test_letter, noti_status + sample_letter_notification, mocker, key_type, is_test_letter, noti_status, bucket_config_name, destination_folder ): filename = 'NOTIFY.{}'.format(sample_letter_notification.reference) + source_bucket_name = current_app.config['LETTERS_SCAN_BUCKET_NAME'] + target_bucket_name = current_app.config[bucket_config_name] + + conn = boto3.resource('s3', region_name='eu-west-1') + conn.create_bucket(Bucket=source_bucket_name) + conn.create_bucket(Bucket=target_bucket_name) + + s3 = boto3.client('s3', region_name='eu-west-1') + s3.put_object(Bucket=source_bucket_name, Key=filename, Body=b'pdf_content') + sample_letter_notification.status = 'pending-virus-check' sample_letter_notification.key_type = key_type - mock_move_pdf = mocker.patch('app.celery.letters_pdf_tasks.move_scanned_pdf_to_test_or_live_pdf_bucket') + mock_s3upload = mocker.patch('app.celery.letters_pdf_tasks.s3upload') + mock_sanitise = mocker.patch('app.celery.letters_pdf_tasks._sanitise_precomiled_pdf') process_virus_scan_passed(filename) - mock_move_pdf.assert_called_once_with(filename, is_test_letter=is_test_letter) assert sample_letter_notification.status == noti_status + mock_s3upload.assert_called_once_with( + filedata=b'pdf_content', + region='eu-west-1', + bucket_name=target_bucket_name, + file_location=destination_folder + filename + ) + mock_sanitise.assert_called_once_with( + ANY, + sample_letter_notification, + b'pdf_content' + ) def test_process_letter_task_check_virus_scan_failed(sample_letter_notification, mocker): @@ -365,7 +391,6 @@ def test_process_letter_task_check_virus_scan_error(sample_letter_notification, def test_replay_letters_in_error_for_all_letters_in_error_bucket(notify_api, mocker): - import boto3 mockObject = boto3.resource('s3').Object('ERROR', 'ERROR/file_name') mocker.patch("app.celery.letters_pdf_tasks.get_file_names_from_error_bucket", return_value=[mockObject]) mock_move = mocker.patch("app.celery.letters_pdf_tasks.move_error_pdf_to_scan_bucket") @@ -376,7 +401,6 @@ def test_replay_letters_in_error_for_all_letters_in_error_bucket(notify_api, moc def test_replay_letters_in_error_for_one_file(notify_api, mocker): - import boto3 mockObject = boto3.resource('s3').Object('ERROR', 'ERROR/file_name') mocker.patch("app.celery.letters_pdf_tasks.get_file_names_from_error_bucket", return_value=[mockObject]) mock_move = mocker.patch("app.celery.letters_pdf_tasks.move_error_pdf_to_scan_bucket") @@ -384,3 +408,43 @@ def test_replay_letters_in_error_for_one_file(notify_api, mocker): replay_letters_in_error("file_name") mock_move.assert_called_once_with('file_name') mock_celery.assert_called_once_with(name='scan-file', kwargs={'filename': 'file_name'}, queue='antivirus-tasks') + + +def test_sanitise_precompiled_pdf_returns_data_from_template_preview(rmock, sample_letter_notification): + rmock.post('http://localhost:9999/precompiled/sanitise', content=b'new_pdf', status_code=200) + mock_celery = Mock(**{'retry.side_effect': Retry}) + + res = _sanitise_precomiled_pdf(mock_celery, sample_letter_notification, b'old_pdf') + + assert res == b'new_pdf' + assert rmock.last_request.text == 'old_pdf' + + +def test_sanitise_precompiled_pdf_returns_none_on_validation_error(rmock, sample_letter_notification): + rmock.post('http://localhost:9999/precompiled/sanitise', content=b'new_pdf', status_code=400) + mock_celery = Mock(**{'retry.side_effect': Retry}) + + res = _sanitise_precomiled_pdf(mock_celery, sample_letter_notification, b'old_pdf') + + assert res is None + + +def test_sanitise_precompiled_pdf_retries_on_http_error(rmock, sample_letter_notification): + rmock.post('http://localhost:9999/precompiled/sanitise', content=b'new_pdf', status_code=500) + mock_celery = Mock(**{'retry.side_effect': Retry}) + + with pytest.raises(Retry): + _sanitise_precomiled_pdf(mock_celery, sample_letter_notification, b'old_pdf') + + +def test_sanitise_precompiled_pdf_sets_notification_to_technical_failure_after_too_many_errors( + rmock, + sample_letter_notification +): + rmock.post('http://localhost:9999/precompiled/sanitise', content=b'new_pdf', status_code=500) + mock_celery = Mock(**{'retry.side_effect': MaxRetriesExceededError}) + + with pytest.raises(MaxRetriesExceededError): + _sanitise_precomiled_pdf(mock_celery, sample_letter_notification, b'old_pdf') + + assert sample_letter_notification.status == NOTIFICATION_TECHNICAL_FAILURE diff --git a/tests/app/letters/test_letter_utils.py b/tests/app/letters/test_letter_utils.py index b4c557fb5..11ce869cd 100644 --- a/tests/app/letters/test_letter_utils.py +++ b/tests/app/letters/test_letter_utils.py @@ -11,7 +11,6 @@ from app.letters.utils import ( get_letter_pdf_filename, get_letter_pdf, upload_letter_pdf, - move_scanned_pdf_to_test_or_live_pdf_bucket, ScanErrorType, move_failed_pdf, get_folder_name ) from app.models import KEY_TYPE_NORMAL, KEY_TYPE_TEST, PRECOMPILED_TEMPLATE_NAME @@ -138,32 +137,6 @@ def test_upload_letter_pdf_to_correct_bucket( ) -@mock_s3 -@pytest.mark.parametrize('is_test_letter,bucket_config_name,folder_date_name', [ - (False, 'LETTERS_PDF_BUCKET_NAME', '2018-03-14/'), - (True, 'TEST_LETTERS_BUCKET_NAME', '') -]) -@freeze_time(FROZEN_DATE_TIME) -def test_move_scanned_letter_pdf_to_processing_bucket( - notify_api, is_test_letter, bucket_config_name, folder_date_name -): - filename = 'test.pdf' - source_bucket_name = current_app.config['LETTERS_SCAN_BUCKET_NAME'] - target_bucket_name = current_app.config[bucket_config_name] - - conn = boto3.resource('s3', region_name='eu-west-1') - source_bucket = conn.create_bucket(Bucket=source_bucket_name) - target_bucket = conn.create_bucket(Bucket=target_bucket_name) - - s3 = boto3.client('s3', region_name='eu-west-1') - s3.put_object(Bucket=source_bucket_name, Key=filename, Body=b'pdf_content') - - move_scanned_pdf_to_test_or_live_pdf_bucket(filename, is_test_letter=is_test_letter) - - assert folder_date_name + filename in [o.key for o in target_bucket.objects.all()] - assert filename not in [o.key for o in source_bucket.objects.all()] - - @mock_s3 @freeze_time(FROZEN_DATE_TIME) def test_move_failed_pdf_error(notify_api):