diff --git a/app/celery/letters_pdf_tasks.py b/app/celery/letters_pdf_tasks.py index 5f715c97d..75c2e5c62 100644 --- a/app/celery/letters_pdf_tasks.py +++ b/app/celery/letters_pdf_tasks.py @@ -182,15 +182,15 @@ def get_key_and_size_of_letters_to_be_sent_to_print(print_run_deadline, postage) try: letter_file_name = get_letter_pdf_filename( reference=letter.reference, - crown=letter.service.crown, + crown=letter.crown, created_at=letter.created_at, - postage=letter.postage + postage=postage ) letter_head = s3.head_s3_object(current_app.config['LETTERS_PDF_BUCKET_NAME'], letter_file_name) yield { "Key": letter_file_name, "Size": letter_head['ContentLength'], - "ServiceId": str(letter.service.id) + "ServiceId": str(letter.service_id) } except BotoClientError as e: current_app.logger.exception( diff --git a/app/dao/notifications_dao.py b/app/dao/notifications_dao.py index 7c8564489..aa57779f7 100644 --- a/app/dao/notifications_dao.py +++ b/app/dao/notifications_dao.py @@ -717,11 +717,30 @@ def notifications_not_yet_sent(should_be_sending_after_seconds, notification_typ return notifications -def dao_get_letters_to_be_printed(print_run_deadline, postage): +def dao_get_letters_to_be_printed(print_run_deadline, postage, query_limit=10000): """ - Return all letters created before the print run deadline that have not yet been sent + Return all letters created before the print run deadline that have not yet been sent. This yields in batches of 10k + to prevent the query taking too long and eating up too much memory. As each 10k batch is yielded, the + get_key_and_size_of_letters_to_be_sent_to_print function will go and fetch the s3 data, andhese start sending off + tasks to the notify-ftp app to send them. + + CAUTION! Modify this query with caution. Modifying filters etc is fine, but if we join onto another table, then + there may be undefined behaviour. Essentially we need each ORM object returned for each row to be unique, + and we should avoid modifying state of returned objects. + + For more reading: + https://docs.sqlalchemy.org/en/13/orm/query.html?highlight=yield_per#sqlalchemy.orm.query.Query.yield_per + https://www.mail-archive.com/sqlalchemy@googlegroups.com/msg12443.html """ - notifications = Notification.query.filter( + notifications = db.session.query( + Notification.id, + Notification.created_at, + Notification.reference, + Notification.service_id, + Service.crown, + ).join( + Notification.service + ).filter( Notification.created_at < convert_bst_to_utc(print_run_deadline), Notification.notification_type == LETTER_TYPE, Notification.status == NOTIFICATION_CREATED, @@ -730,7 +749,7 @@ def dao_get_letters_to_be_printed(print_run_deadline, postage): ).order_by( Notification.service_id, Notification.created_at - ).limit(50000) + ).yield_per(query_limit) return notifications diff --git a/tests/app/dao/notification_dao/test_notification_dao.py b/tests/app/dao/notification_dao/test_notification_dao.py index 8b8717740..b0e4938b2 100644 --- a/tests/app/dao/notification_dao/test_notification_dao.py +++ b/tests/app/dao/notification_dao/test_notification_dao.py @@ -1688,10 +1688,20 @@ def test_letters_to_be_printed_sort_by_service(notify_db_session): second_service = create_service(service_name='second service', service_id='642bf33b-54b5-45f2-8c13-942a46616704') first_template = create_template(service=first_service, template_type='letter', postage='second') second_template = create_template(service=second_service, template_type='letter', postage='second') - notification_1 = create_notification(template=first_template, created_at=datetime(2020, 12, 1, 9, 30)) - notification_2 = create_notification(template=first_template, created_at=datetime(2020, 12, 1, 12, 30)) - notification_3 = create_notification(template=second_template, created_at=datetime(2020, 12, 1, 8, 30)) + letters_ordered_by_service_then_time = [ + create_notification(template=first_template, created_at=datetime(2020, 12, 1, 9, 30)), + create_notification(template=first_template, created_at=datetime(2020, 12, 1, 12, 30)), + create_notification(template=first_template, created_at=datetime(2020, 12, 1, 13, 30)), + create_notification(template=first_template, created_at=datetime(2020, 12, 1, 14, 30)), + create_notification(template=first_template, created_at=datetime(2020, 12, 1, 15, 30)), + create_notification(template=second_template, created_at=datetime(2020, 12, 1, 8, 30)), + create_notification(template=second_template, created_at=datetime(2020, 12, 1, 8, 31)), + create_notification(template=second_template, created_at=datetime(2020, 12, 1, 8, 32)), + create_notification(template=second_template, created_at=datetime(2020, 12, 1, 8, 33)), + create_notification(template=second_template, created_at=datetime(2020, 12, 1, 8, 34)) + ] - results = list(dao_get_letters_to_be_printed(print_run_deadline=datetime(2020, 12, 1, 17, 30), postage='second')) - assert len(results) == 3 - assert results == [notification_1, notification_2, notification_3] + results = list( + dao_get_letters_to_be_printed(print_run_deadline=datetime(2020, 12, 1, 17, 30), postage='second', query_limit=4) + ) + assert [x.id for x in results] == [x.id for x in letters_ordered_by_service_then_time]