Merge pull request #2717 from alphagov/collate-pdf

Look at all previous days when sending letters
This commit is contained in:
David McDonald
2020-02-24 10:16:18 +00:00
committed by GitHub
5 changed files with 201 additions and 101 deletions

View File

@@ -19,6 +19,12 @@ def get_s3_object(bucket_name, file_location):
return s3.Object(bucket_name, file_location)
def head_s3_object(bucket_name, file_location):
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.head_object
boto_client = client('s3', current_app.config['AWS_REGION'])
return boto_client.head_object(Bucket=bucket_name, Key=file_location)
def file_exists(bucket_name, file_location):
try:
# try and access metadata of object
@@ -56,7 +62,7 @@ def remove_job_from_s3(service_id, job_id):
return remove_s3_object(*get_job_location(service_id, job_id))
def get_s3_bucket_objects(bucket_name, subfolder='', older_than=7, limit_days=2):
def get_s3_bucket_objects(bucket_name, subfolder=''):
boto_client = client('s3', current_app.config['AWS_REGION'])
paginator = boto_client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(

View File

@@ -1,6 +1,6 @@
import math
import base64
from datetime import datetime
from datetime import datetime, timedelta
from uuid import UUID
from hashlib import sha512
from base64 import urlsafe_b64encode
@@ -14,6 +14,8 @@ from requests import (
from celery.exceptions import MaxRetriesExceededError
from notifications_utils.statsd_decorators import statsd
from notifications_utils.s3 import s3upload
from notifications_utils.letter_timings import LETTER_PROCESSING_DEADLINE
from notifications_utils.timezones import convert_utc_to_bst
from app import encryption, notify_celery
from app.aws import s3
@@ -23,9 +25,10 @@ from app.dao.notifications_dao import (
update_notification_status_by_id,
dao_update_notification,
dao_get_notification_by_reference,
dao_get_notifications_by_references,
dao_update_notifications_by_reference,
dao_get_letters_to_be_printed,
)
from app.letters.utils import get_letter_pdf_filename
from app.errors import VirusScanError
from app.exceptions import NotificationTechnicalFailureException
from app.letters.utils import (
@@ -115,28 +118,33 @@ def get_letters_pdf(template, contact_block, filename, values):
return resp.content, billable_units
@notify_celery.task(name='collate-letter-pdfs-for-day')
@cronitor("collate-letter-pdfs-for-day")
def collate_letter_pdfs_for_day(date=None):
if not date:
# Using the truncated date is ok because UTC to BST does not make a difference to the date,
# since it is triggered mid afternoon.
date = datetime.utcnow().strftime("%Y-%m-%d")
@notify_celery.task(name='collate-letter-pdfs-to-be-sent')
@cronitor("collate-letter-pdfs-to-be-sent")
def collate_letter_pdfs_to_be_sent():
"""
Finds all letters which are still waiting to be sent to DVLA for printing
letter_pdfs = sorted(
s3.get_s3_bucket_objects(
current_app.config['LETTERS_PDF_BUCKET_NAME'],
subfolder=date
),
key=lambda letter: letter['Key']
This would usually be run at 5.50pm and collect up letters created between before 5:30pm today
that have not yet been sent.
If run after midnight, it will collect up letters created before 5:30pm the day before.
"""
print_run_date = convert_utc_to_bst(datetime.utcnow())
if print_run_date.time() < LETTER_PROCESSING_DEADLINE:
print_run_date = print_run_date - timedelta(days=1)
print_run_deadline = print_run_date.replace(
hour=17, minute=30, second=0, microsecond=0
)
for i, letters in enumerate(group_letters(letter_pdfs)):
letters_to_print = get_key_and_size_of_letters_to_be_sent_to_print(print_run_deadline)
for i, letters in enumerate(group_letters(letters_to_print)):
filenames = [letter['Key'] for letter in letters]
hash = urlsafe_b64encode(sha512(''.join(filenames).encode()).digest())[:20].decode()
# eg NOTIFY.2018-12-31.001.Wjrui5nAvObjPd-3GEL-.ZIP
dvla_filename = 'NOTIFY.{date}.{num:03}.{hash}.ZIP'.format(
date=date,
date=print_run_deadline.strftime("%Y-%m-%d"),
num=i + 1,
hash=hash
)
@@ -159,6 +167,24 @@ def collate_letter_pdfs_for_day(date=None):
)
def get_key_and_size_of_letters_to_be_sent_to_print(print_run_deadline):
letters_awaiting_sending = dao_get_letters_to_be_printed(print_run_deadline)
letter_pdfs = []
for letter in letters_awaiting_sending:
letter_file_name = get_letter_pdf_filename(
reference=letter.reference,
crown=letter.service.crown,
sending_date=letter.created_at,
postage=letter.postage
)
letter_head = s3.head_s3_object(current_app.config['LETTERS_PDF_BUCKET_NAME'], letter_file_name)
letter_pdfs.append({"Key": letter_file_name, "Size": letter_head['ContentLength']})
return letter_pdfs
def group_letters(letter_pdfs):
"""
Group letters in chunks of MAX_LETTER_PDF_ZIP_FILESIZE. Will add files to lists, never going over that size.
@@ -168,7 +194,7 @@ def group_letters(letter_pdfs):
running_filesize = 0
list_of_files = []
for letter in letter_pdfs:
if letter['Key'].lower().endswith('.pdf') and letter_in_created_state(letter['Key']):
if letter['Key'].lower().endswith('.pdf'):
if (
running_filesize + letter['Size'] > current_app.config['MAX_LETTER_PDF_ZIP_FILESIZE'] or
len(list_of_files) >= current_app.config['MAX_LETTER_PDF_COUNT_PER_ZIP']
@@ -184,22 +210,6 @@ def group_letters(letter_pdfs):
yield list_of_files
def letter_in_created_state(filename):
# filename looks like '2018-01-13/NOTIFY.ABCDEF1234567890.D.2.C.C.20180113120000.PDF'
subfolder = filename.split('/')[0]
ref = get_reference_from_filename(filename)
notifications = dao_get_notifications_by_references([ref])
if notifications:
if notifications[0].status == NOTIFICATION_CREATED:
return True
current_app.logger.info('Collating letters for {} but notification with reference {} already in {}'.format(
subfolder,
ref,
notifications[0].status
))
return False
@notify_celery.task(bind=True, name='process-virus-scan-passed', max_retries=15, default_retry_delay=300)
def process_virus_scan_passed(self, filename):
reference = get_reference_from_filename(filename)

View File

@@ -296,8 +296,8 @@ class Config(object):
},
# The collate-letter-pdf does assume it is called in an hour that BST does not make a
# difference to the truncate date which translates to the filename to process
'collate-letter-pdfs-for-day': {
'task': 'collate-letter-pdfs-for-day',
'collate-letter-pdfs-to-be-sent': {
'task': 'collate-letter-pdfs-to-be-sent',
'schedule': crontab(hour=17, minute=50),
'options': {'queue': QueueNames.PERIODIC}
},

View File

@@ -731,6 +731,21 @@ def notifications_not_yet_sent(should_be_sending_after_seconds, notification_typ
return notifications
def dao_get_letters_to_be_printed(print_run_deadline):
"""
Return all letters created before the print run deadline that have not yet been sent
"""
notifications = Notification.query.filter(
Notification.created_at < convert_bst_to_utc(print_run_deadline),
Notification.notification_type == LETTER_TYPE,
Notification.status == NOTIFICATION_CREATED,
Notification.key_type == KEY_TYPE_NORMAL
).order_by(
Notification.created_at
).all()
return notifications
def dao_old_letters_with_created_status():
yesterday_bst = convert_utc_to_bst(datetime.utcnow()) - timedelta(days=1)
last_processing_deadline = yesterday_bst.replace(hour=17, minute=30, second=0, microsecond=0)

View File

@@ -2,7 +2,7 @@ from unittest.mock import Mock, call, ANY
import base64
import boto3
from datetime import datetime
from datetime import datetime, timedelta
from moto import mock_s3
from flask import current_app
from freezegun import freeze_time
@@ -19,9 +19,9 @@ from app.exceptions import NotificationTechnicalFailureException
from app.celery.letters_pdf_tasks import (
create_letters_pdf,
get_letters_pdf,
collate_letter_pdfs_for_day,
collate_letter_pdfs_to_be_sent,
get_key_and_size_of_letters_to_be_sent_to_print,
group_letters,
letter_in_created_state,
process_sanitised_letter,
process_virus_scan_passed,
process_virus_scan_failed,
@@ -40,7 +40,6 @@ from app.models import (
NOTIFICATION_CREATED,
NOTIFICATION_DELIVERED,
NOTIFICATION_PENDING_VIRUS_CHECK,
NOTIFICATION_SENDING,
NOTIFICATION_TECHNICAL_FAILURE,
NOTIFICATION_VALIDATION_FAILED,
NOTIFICATION_VIRUS_SCAN_FAILED,
@@ -53,7 +52,7 @@ from tests.conftest import set_config_values
def test_should_have_decorated_tasks_functions():
assert create_letters_pdf.__wrapped__.__name__ == 'create_letters_pdf'
assert collate_letter_pdfs_for_day.__wrapped__.__name__ == 'collate_letter_pdfs_for_day'
assert collate_letter_pdfs_to_be_sent.__wrapped__.__name__ == 'collate_letter_pdfs_to_be_sent'
assert process_virus_scan_passed.__wrapped__.__name__ == 'process_virus_scan_passed'
assert process_virus_scan_failed.__wrapped__.__name__ == 'process_virus_scan_failed'
assert process_virus_scan_error.__wrapped__.__name__ == 'process_virus_scan_error'
@@ -242,27 +241,128 @@ def test_create_letters_gets_the_right_logo_when_service_has_letter_branding_log
)
def test_collate_letter_pdfs_for_day(notify_api, mocker):
mock_s3 = mocker.patch('app.celery.tasks.s3.get_s3_bucket_objects', return_value=[
{'Key': 'B.pDf', 'Size': 2},
{'Key': 'A.PDF', 'Size': 1},
{'Key': 'C.pdf', 'Size': 3}
@freeze_time('2020-02-17 18:00:00')
def test_get_key_and_size_of_letters_to_be_sent_to_print(notify_api, mocker, sample_letter_template):
create_notification(
template=sample_letter_template,
status='created',
reference='ref0',
created_at=(datetime.now() - timedelta(hours=2))
)
create_notification(
template=sample_letter_template,
status='created',
reference='ref1',
created_at=(datetime.now() - timedelta(hours=3))
)
create_notification(
template=sample_letter_template,
status='created',
reference='ref2',
created_at=(datetime.now() - timedelta(days=2))
)
# notifications we don't expect to get sent to print as they are in the wrong status
for status in ['delivered', 'validation-failed', 'cancelled', 'sending']:
create_notification(
template=sample_letter_template,
status=status,
reference='ref3',
created_at=(datetime.now() - timedelta(days=2))
)
# notification we don't expect to get sent as instead will make into this evenings print run
create_notification(
template=sample_letter_template,
status='created',
reference='ref4',
created_at=(datetime.now() - timedelta(minutes=1))
)
# test notification we don't expect to get sent
create_notification(
template=sample_letter_template,
status='created',
reference='ref4',
created_at=(datetime.now() - timedelta(days=1)),
key_type=KEY_TYPE_TEST
)
mock_s3 = mocker.patch('app.celery.tasks.s3.head_s3_object', side_effect=[
{'ContentLength': 2},
{'ContentLength': 1},
{'ContentLength': 3},
])
mock_group_letters = mocker.patch('app.celery.letters_pdf_tasks.group_letters', return_value=[
[{'Key': 'A.PDF', 'Size': 1}, {'Key': 'B.pDf', 'Size': 2}],
[{'Key': 'C.pdf', 'Size': 3}]
results = get_key_and_size_of_letters_to_be_sent_to_print(datetime.now() - timedelta(minutes=30))
assert mock_s3.call_count == 3
mock_s3.assert_has_calls(
[
call(current_app.config['LETTERS_PDF_BUCKET_NAME'], '2020-02-16/NOTIFY.REF2.D.2.C.C.20200215180000.PDF'),
call(current_app.config['LETTERS_PDF_BUCKET_NAME'], '2020-02-17/NOTIFY.REF1.D.2.C.C.20200217150000.PDF'),
call(current_app.config['LETTERS_PDF_BUCKET_NAME'], '2020-02-17/NOTIFY.REF0.D.2.C.C.20200217160000.PDF'),
]
)
assert len(results) == 3
assert results == [
{'Key': '2020-02-16/NOTIFY.REF2.D.2.C.C.20200215180000.PDF', 'Size': 2},
{'Key': '2020-02-17/NOTIFY.REF1.D.2.C.C.20200217150000.PDF', 'Size': 1},
{'Key': '2020-02-17/NOTIFY.REF0.D.2.C.C.20200217160000.PDF', 'Size': 3},
]
@pytest.mark.parametrize('time_to_run_task', [
"2020-02-17 18:00:00", # after 5:30pm
"2020-02-18 02:00:00", # the next day after midnight, before 5:30pm we expect the same results
])
def test_collate_letter_pdfs_to_be_sent(notify_api, sample_letter_template, mocker, time_to_run_task):
with freeze_time("2020-02-17 18:00:00"):
create_notification(
template=sample_letter_template,
status='created',
reference='ref0',
created_at=(datetime.now() - timedelta(hours=2))
)
create_notification(
template=sample_letter_template,
status='created',
reference='ref1',
created_at=(datetime.now() - timedelta(hours=3))
)
create_notification(
template=sample_letter_template,
status='created',
reference='ref2',
created_at=(datetime.now() - timedelta(days=2))
)
mocker.patch('app.celery.tasks.s3.head_s3_object', side_effect=[
{'ContentLength': 2},
{'ContentLength': 1},
{'ContentLength': 3},
])
mock_celery = mocker.patch('app.celery.letters_pdf_tasks.notify_celery.send_task')
collate_letter_pdfs_for_day('2017-01-02')
with set_config_values(notify_api, {'MAX_LETTER_PDF_COUNT_PER_ZIP': 2}):
with freeze_time(time_to_run_task):
collate_letter_pdfs_to_be_sent()
mock_s3.assert_called_once_with('test-letters-pdf', subfolder='2017-01-02')
mock_group_letters.assert_called_once_with(sorted(mock_s3.return_value, key=lambda x: x['Key']))
assert len(mock_celery.call_args_list) == 2
assert mock_celery.call_args_list[0] == call(
name='zip-and-send-letter-pdfs',
kwargs={
'filenames_to_zip': ['A.PDF', 'B.pDf'],
'upload_filename': 'NOTIFY.2017-01-02.001.oqdjIM2-NAUU9Sm5Slmi.ZIP'
'filenames_to_zip': [
'2020-02-16/NOTIFY.REF2.D.2.C.C.20200215180000.PDF',
'2020-02-17/NOTIFY.REF1.D.2.C.C.20200217150000.PDF'
],
'upload_filename': 'NOTIFY.2020-02-17.001.k3x_WqC5KhB6e2DWv9Ma.ZIP'
},
queue='process-ftp-tasks',
compression='zlib'
@@ -270,24 +370,17 @@ def test_collate_letter_pdfs_for_day(notify_api, mocker):
assert mock_celery.call_args_list[1] == call(
name='zip-and-send-letter-pdfs',
kwargs={
'filenames_to_zip': ['C.pdf'],
'upload_filename': 'NOTIFY.2017-01-02.002.tdr7hcdPieiqjkVoS4kU.ZIP'
'filenames_to_zip': [
'2020-02-17/NOTIFY.REF0.D.2.C.C.20200217160000.PDF'
],
'upload_filename': 'NOTIFY.2020-02-17.002.J85cUw-FWlKuAIOcwdLS.ZIP'
},
queue='process-ftp-tasks',
compression='zlib'
)
@freeze_time('2018-09-12 17:50:00')
def test_collate_letter_pdfs_for_day_works_without_date_param(notify_api, mocker):
mock_s3 = mocker.patch('app.celery.tasks.s3.get_s3_bucket_objects')
collate_letter_pdfs_for_day()
expected_date = '2018-09-12'
mock_s3.assert_called_once_with('test-letters-pdf', subfolder=expected_date)
def test_group_letters_splits_on_file_size(notify_api, mocker):
mocker.patch('app.celery.letters_pdf_tasks.letter_in_created_state', return_value=True)
def test_group_letters_splits_on_file_size(notify_api):
letters = [
# ends under max but next one is too big
{'Key': 'A.pdf', 'Size': 1}, {'Key': 'B.pdf', 'Size': 2},
@@ -313,8 +406,7 @@ def test_group_letters_splits_on_file_size(notify_api, mocker):
assert next(x, None) is None
def test_group_letters_splits_on_file_count(notify_api, mocker):
mocker.patch('app.celery.letters_pdf_tasks.letter_in_created_state', return_value=True)
def test_group_letters_splits_on_file_count(notify_api):
letters = [
{'Key': 'A.pdf', 'Size': 1},
{'Key': 'B.pdf', 'Size': 2},
@@ -337,8 +429,7 @@ def test_group_letters_splits_on_file_count(notify_api, mocker):
assert next(x, None) is None
def test_group_letters_splits_on_file_size_and_file_count(notify_api, mocker):
mocker.patch('app.celery.letters_pdf_tasks.letter_in_created_state', return_value=True)
def test_group_letters_splits_on_file_size_and_file_count(notify_api):
letters = [
# ends under max file size but next file is too big
{'Key': 'A.pdf', 'Size': 1},
@@ -372,44 +463,22 @@ def test_group_letters_splits_on_file_size_and_file_count(notify_api, mocker):
assert next(x, None) is None
def test_group_letters_ignores_non_pdfs(notify_api, mocker):
mocker.patch('app.celery.letters_pdf_tasks.letter_in_created_state', return_value=True)
letters = [{'Key': 'A.zip'}]
@pytest.mark.parametrize('key', ["A.ZIP", "B.zip"])
def test_group_letters_ignores_non_pdfs(key):
letters = [{'Key': key, 'Size': 1}]
assert list(group_letters(letters)) == []
def test_group_letters_ignores_notifications_already_sent(notify_api, mocker):
mock = mocker.patch('app.celery.letters_pdf_tasks.letter_in_created_state', return_value=False)
letters = [{'Key': 'A.pdf'}]
assert list(group_letters(letters)) == []
mock.assert_called_once_with('A.pdf')
@pytest.mark.parametrize('key', ["A.PDF", "B.pdf", "C.PdF"])
def test_group_letters_includes_pdf_files(key):
letters = [{'Key': key, 'Size': 1}]
assert list(group_letters(letters)) == [[{'Key': key, 'Size': 1}]]
def test_group_letters_with_no_letters(notify_api, mocker):
mocker.patch('app.celery.letters_pdf_tasks.letter_in_created_state', return_value=True)
def test_group_letters_with_no_letters():
assert list(group_letters([])) == []
def test_letter_in_created_state(sample_notification):
sample_notification.reference = 'ABCDEF1234567890'
filename = '2018-01-13/NOTIFY.ABCDEF1234567890.D.2.C.C.20180113120000.PDF'
assert letter_in_created_state(filename) is True
def test_letter_in_created_state_fails_if_notification_not_in_created(sample_notification):
sample_notification.reference = 'ABCDEF1234567890'
sample_notification.status = NOTIFICATION_SENDING
filename = '2018-01-13/NOTIFY.ABCDEF1234567890.D.2.C.C.20180113120000.PDF'
assert letter_in_created_state(filename) is False
def test_letter_in_created_state_fails_if_notification_doesnt_exist(sample_notification):
sample_notification.reference = 'QWERTY1234567890'
filename = '2018-01-13/NOTIFY.ABCDEF1234567890.D.2.C.C.20180113120000.PDF'
assert letter_in_created_state(filename) is False
@freeze_time('2018-01-01 18:00')
@mock_s3
@pytest.mark.parametrize('key_type,noti_status,bucket_config_name,destination_folder', [