From b288031adb6e0207d0802c2c01a68c380eb26252 Mon Sep 17 00:00:00 2001 From: Leo Hemsted Date: Thu, 21 Mar 2019 15:40:24 +0000 Subject: [PATCH] add a hash of letter filenames to the dvla zip file name if we partially retry a day, we would create new zip files, containing different letters (if some were processed succesfully). We need these files to have different filenames to earlier zip files so that we can avoid overwriting log data in zips_sent. Hashing the filename means that we'll only overwrite if it was the same file containing the same content. --- app/celery/letters_pdf_tasks.py | 22 +++++++++++++++++----- tests/app/celery/test_letters_pdf_tasks.py | 18 ++++++++++++++---- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/app/celery/letters_pdf_tasks.py b/app/celery/letters_pdf_tasks.py index d50c5b42c..e43af4607 100644 --- a/app/celery/letters_pdf_tasks.py +++ b/app/celery/letters_pdf_tasks.py @@ -2,6 +2,8 @@ import io import math from datetime import datetime from uuid import UUID +from hashlib import sha512 +from base64 import urlsafe_b64encode from PyPDF2.utils import PdfReadError from botocore.exceptions import ClientError as BotoClientError @@ -115,14 +117,24 @@ def collate_letter_pdfs_for_day(date=None): # since it is triggered mid afternoon. date = datetime.utcnow().strftime("%Y-%m-%d") - letter_pdfs = s3.get_s3_bucket_objects( - current_app.config['LETTERS_PDF_BUCKET_NAME'], - subfolder=date + letter_pdfs = sorted( + s3.get_s3_bucket_objects( + current_app.config['LETTERS_PDF_BUCKET_NAME'], + subfolder=date + ), + key=lambda letter: letter['Key'] ) for i, letters in enumerate(group_letters(letter_pdfs)): - # eg NOTIFY.2018-12-31.001.ZIP - dvla_filename = 'NOTIFY.{date}.{num:03}.ZIP'.format(date=date, num=i + 1) filenames = [letter['Key'] for letter in letters] + + hash = urlsafe_b64encode(sha512(''.join(filenames).encode()).digest())[:20].decode() + # eg NOTIFY.2018-12-31.001.Wjrui5nAvObjPd-3GEL-.ZIP + dvla_filename = 'NOTIFY.{date}.{num:03}.{hash}.ZIP'.format( + date=date, + num=i + 1, + hash=hash + ) + current_app.logger.info( 'Calling task zip-and-send-letter-pdfs for {} pdfs to upload {} with total size {:,} bytes'.format( len(filenames), diff --git a/tests/app/celery/test_letters_pdf_tasks.py b/tests/app/celery/test_letters_pdf_tasks.py index 4cc8d0dbd..8396347e6 100644 --- a/tests/app/celery/test_letters_pdf_tasks.py +++ b/tests/app/celery/test_letters_pdf_tasks.py @@ -218,7 +218,11 @@ def test_create_letters_gets_the_right_logo_when_service_has_letter_branding_log def test_collate_letter_pdfs_for_day(notify_api, mocker): - mock_s3 = mocker.patch('app.celery.tasks.s3.get_s3_bucket_objects') + mock_s3 = mocker.patch('app.celery.tasks.s3.get_s3_bucket_objects', return_value=[ + {'Key': 'B.pDf', 'Size': 2}, + {'Key': 'A.PDF', 'Size': 1}, + {'Key': 'C.pdf', 'Size': 3} + ]) mock_group_letters = mocker.patch('app.celery.letters_pdf_tasks.group_letters', return_value=[ [{'Key': 'A.PDF', 'Size': 1}, {'Key': 'B.pDf', 'Size': 2}], [{'Key': 'C.pdf', 'Size': 3}] @@ -228,16 +232,22 @@ def test_collate_letter_pdfs_for_day(notify_api, mocker): collate_letter_pdfs_for_day('2017-01-02') mock_s3.assert_called_once_with('test-letters-pdf', subfolder='2017-01-02') - mock_group_letters.assert_called_once_with(mock_s3.return_value) + mock_group_letters.assert_called_once_with(sorted(mock_s3.return_value, key=lambda x: x['Key'])) assert mock_celery.call_args_list[0] == call( name='zip-and-send-letter-pdfs', - kwargs={'filenames_to_zip': ['A.PDF', 'B.pDf'], 'upload_filename': 'NOTIFY.2017-01-02.001.ZIP'}, + kwargs={ + 'filenames_to_zip': ['A.PDF', 'B.pDf'], + 'upload_filename': 'NOTIFY.2017-01-02.001.oqdjIM2-NAUU9Sm5Slmi.ZIP' + }, queue='process-ftp-tasks', compression='zlib' ) assert mock_celery.call_args_list[1] == call( name='zip-and-send-letter-pdfs', - kwargs={'filenames_to_zip': ['C.pdf'], 'upload_filename': 'NOTIFY.2017-01-02.002.ZIP'}, + kwargs={ + 'filenames_to_zip': ['C.pdf'], + 'upload_filename': 'NOTIFY.2017-01-02.002.tdr7hcdPieiqjkVoS4kU.ZIP' + }, queue='process-ftp-tasks', compression='zlib' )