diff --git a/app/aws/s3.py b/app/aws/s3.py
index f5798314b..48a54e709 100644
--- a/app/aws/s3.py
+++ b/app/aws/s3.py
@@ -1,6 +1,10 @@
-from boto3 import resource
+from datetime import datetime, timedelta
+
 from flask import current_app
+import pytz
+from boto3 import client, resource
+
 
 FILE_LOCATION_STRUCTURE = 'service-{}-notify/{}.csv'
@@ -24,7 +28,46 @@ def get_job_from_s3(service_id, job_id):
 def remove_job_from_s3(service_id, job_id):
     bucket_name = current_app.config['CSV_UPLOAD_BUCKET_NAME']
     file_location = FILE_LOCATION_STRUCTURE.format(service_id, job_id)
-    obj = get_s3_object(bucket_name, file_location)
+    return remove_s3_object(bucket_name, file_location)
+
+
+def get_s3_bucket_objects(bucket_name, subfolder='', older_than=7, limit_days=2):
+    boto_client = client('s3', current_app.config['AWS_REGION'])
+    paginator = boto_client.get_paginator('list_objects_v2')
+    page_iterator = paginator.paginate(
+        Bucket=bucket_name,
+        Prefix=subfolder
+    )
+
+    all_objects_in_bucket = []
+    for page in page_iterator:
+        all_objects_in_bucket.extend(page['Contents'])
+
+    return all_objects_in_bucket
+
+
+def filter_s3_bucket_objects_within_date_range(bucket_objects, older_than=7, limit_days=2):
+    """
+    S3 returns Object['LastModified'] as an offset-aware timestamp, so the
+    date range filter must take this into account.
+
+    S3 also returns an Object corresponding to the container directory
+    itself; that entry is redundant and is filtered out here.
+
+    """
+    end_date = datetime.now(tz=pytz.utc) - timedelta(days=older_than)
+    start_date = end_date - timedelta(days=limit_days)
+    filtered_items = [item for item in bucket_objects if all([
+        not item['Key'].endswith('/'),
+        item['LastModified'] > start_date,
+        item['LastModified'] < end_date
+    ])]
+
+    return filtered_items
+
+
+def remove_s3_object(bucket_name, object_key):
+    obj = get_s3_object(bucket_name, object_key)
     return obj.delete()
diff --git a/app/celery/scheduled_tasks.py b/app/celery/scheduled_tasks.py
index 11e4de49d..e86a7c113 100644
--- a/app/celery/scheduled_tasks.py
+++ b/app/celery/scheduled_tasks.py
@@ -255,3 +255,29 @@ def remove_transformed_dvla_files():
     for job in jobs:
         s3.remove_transformed_dvla_file(job.id)
         current_app.logger.info("Transformed dvla file for job {} has been removed from s3.".format(job.id))
+
+
+@notify_celery.task(name="delete_dvla_response_files")
+@statsd(namespace="tasks")
+def delete_dvla_response_files_older_than_seven_days():
+    try:
+        start = datetime.utcnow()
+        bucket_objects = s3.get_s3_bucket_objects(
+            current_app.config['DVLA_RESPONSE_BUCKET_NAME'],
+            'root/dispatch'
+        )
+        older_than_seven_days = s3.filter_s3_bucket_objects_within_date_range(bucket_objects)
+
+        for f in older_than_seven_days:
+            s3.remove_s3_object(current_app.config['DVLA_RESPONSE_BUCKET_NAME'], f['Key'])
+
+        current_app.logger.info(
+            "Delete dvla response files started {} finished {} deleted {} files".format(
+                start,
+                datetime.utcnow(),
+                len(older_than_seven_days)
+            )
+        )
+    except SQLAlchemyError as e:
+        current_app.logger.exception("Failed to delete dvla response files")
+        raise
diff --git a/app/config.py b/app/config.py
index f470666a7..b3d76dc93 100644
--- a/app/config.py
+++ b/app/config.py
@@ -1,7 +1,8 @@
 from datetime import timedelta
+import os
+
 from celery.schedules import crontab
 from kombu import Exchange, Queue
-import os
 
 from app.models import (
     EMAIL_TYPE, SMS_TYPE, LETTER_TYPE,
@@ -209,6 +210,11 @@ class Config(object):
             'schedule': crontab(minute=40, hour=4),
             'options': {'queue': QueueNames.PERIODIC}
         },
+        'delete_dvla_response_files': {
+            'task': 'delete_dvla_response_files',
+            'schedule': crontab(minute=10, hour=5),
+            'options': {'queue': QueueNames.PERIODIC}
+        },
         'timeout-job-statistics': {
             'task': 'timeout-job-statistics',
             'schedule': crontab(minute=0, hour=5),
@@ -265,6 +271,7 @@ class Development(Config):
     SQLALCHEMY_ECHO = False
     NOTIFY_EMAIL_DOMAIN = 'notify.tools'
     CSV_UPLOAD_BUCKET_NAME = 'development-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notify.tools-ftp'
     NOTIFY_ENVIRONMENT = 'development'
     NOTIFICATION_QUEUE_PREFIX = 'development'
     DEBUG = True
@@ -284,6 +291,7 @@ class Test(Config):
     NOTIFY_ENVIRONMENT = 'test'
     DEBUG = True
     CSV_UPLOAD_BUCKET_NAME = 'test-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'test.notify.com-ftp'
     STATSD_ENABLED = True
     STATSD_HOST = "localhost"
     STATSD_PORT = 1000
@@ -316,6 +324,7 @@ class Preview(Config):
     NOTIFY_EMAIL_DOMAIN = 'notify.works'
     NOTIFY_ENVIRONMENT = 'preview'
     CSV_UPLOAD_BUCKET_NAME = 'preview-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notify.works-ftp'
     FROM_NUMBER = 'preview'
     API_RATE_LIMIT_ENABLED = True
@@ -324,6 +333,7 @@ class Staging(Config):
     NOTIFY_EMAIL_DOMAIN = 'staging-notify.works'
     NOTIFY_ENVIRONMENT = 'staging'
     CSV_UPLOAD_BUCKET_NAME = 'staging-notify-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'staging-notify.works-ftp'
     STATSD_ENABLED = True
     FROM_NUMBER = 'stage'
     API_RATE_LIMIT_ENABLED = True
@@ -333,6 +343,7 @@ class Live(Config):
     NOTIFY_EMAIL_DOMAIN = 'notifications.service.gov.uk'
     NOTIFY_ENVIRONMENT = 'live'
     CSV_UPLOAD_BUCKET_NAME = 'live-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notifications.service.gov.uk-ftp'
     STATSD_ENABLED = True
     FROM_NUMBER = 'GOVUK'
     FUNCTIONAL_TEST_PROVIDER_SERVICE_ID = '6c1d81bb-dae2-4ee9-80b0-89a4aae9f649'
@@ -350,6 +361,7 @@ class Sandbox(CloudFoundryConfig):
     NOTIFY_EMAIL_DOMAIN = 'notify.works'
     NOTIFY_ENVIRONMENT = 'sandbox'
     CSV_UPLOAD_BUCKET_NAME = 'cf-sandbox-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notify.works-ftp'
     FROM_NUMBER = 'sandbox'
     REDIS_ENABLED = False
diff --git a/tests/app/aws/test_s3.py b/tests/app/aws/test_s3.py
index fd30c16df..b44d66fc8 100644
--- a/tests/app/aws/test_s3.py
+++ b/tests/app/aws/test_s3.py
@@ -1,8 +1,25 @@
 from unittest.mock import call
+from datetime import datetime, timedelta
 
 from flask import current_app
-from app.aws.s3 import get_s3_file, remove_transformed_dvla_file
+from freezegun import freeze_time
+
+from app.aws.s3 import (
+    get_s3_bucket_objects,
+    get_s3_file,
+    filter_s3_bucket_objects_within_date_range,
+    remove_transformed_dvla_file
+)
+from tests.app.conftest import datetime_in_past
+
+
+def single_s3_object_stub(key='foo', last_modified=datetime.utcnow()):
+    return {
+        'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
+        'Key': key,
+        'LastModified': last_modified
+    }
 
 
 def test_get_s3_file_makes_correct_call(notify_api, mocker):
@@ -25,3 +42,100 @@ def test_remove_transformed_dvla_file_makes_correct_call(notify_api, mocker):
         call(current_app.config['DVLA_UPLOAD_BUCKET_NAME'], '{}-dvla-job.text'.format(fake_uuid)),
         call().delete()
     ])
+
+
+def test_get_s3_bucket_objects_make_correct_pagination_call(notify_api, mocker):
+    paginator_mock = mocker.patch('app.aws.s3.client')
+
+    get_s3_bucket_objects('foo-bucket', subfolder='bar')
+
+    paginator_mock.assert_has_calls([
+        call().get_paginator().paginate(Bucket='foo-bucket', Prefix='bar')
+    ])
+
+
+def test_get_s3_bucket_objects_builds_objects_list_from_paginator(notify_api, mocker):
+    AFTER_SEVEN_DAYS = datetime_in_past(days=8)
+    paginator_mock = mocker.patch('app.aws.s3.client')
+    multiple_pages_s3_object = [
+        {
+            "Contents": [
+                single_s3_object_stub('bar/foo.txt', AFTER_SEVEN_DAYS),
+            ]
+        },
+        {
+            "Contents": [
+                single_s3_object_stub('bar/foo1.txt', AFTER_SEVEN_DAYS),
+            ]
+        }
+    ]
+    paginator_mock.return_value.get_paginator.return_value.paginate.return_value = multiple_pages_s3_object
+
+    bucket_objects = get_s3_bucket_objects('foo-bucket', subfolder='bar')
+
+    assert len(bucket_objects) == 2
+    assert set(bucket_objects[0].keys()) == set(['ETag', 'Key', 'LastModified'])
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_get_s3_bucket_objects_removes_redundant_root_object(notify_api, mocker):
+    AFTER_SEVEN_DAYS = datetime_in_past(days=8)
+    s3_objects_stub = [
+        single_s3_object_stub('bar/', AFTER_SEVEN_DAYS),
+        single_s3_object_stub('bar/foo.txt', AFTER_SEVEN_DAYS),
+    ]
+
+    filtered_items = filter_s3_bucket_objects_within_date_range(s3_objects_stub)
+
+    assert len(filtered_items) == 1
+
+    assert filtered_items[0]["Key"] == 'bar/foo.txt'
+    assert filtered_items[0]["LastModified"] == datetime_in_past(days=8)
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_filter_s3_bucket_objects_within_date_range_filters_by_date_range(notify_api, mocker):
+    START_DATE = datetime_in_past(days=9)
+    JUST_BEFORE_START_DATE = START_DATE - timedelta(seconds=1)
+    JUST_AFTER_START_DATE = START_DATE + timedelta(seconds=1)
+    END_DATE = datetime_in_past(days=7)
+    JUST_BEFORE_END_DATE = END_DATE - timedelta(seconds=1)
+    JUST_AFTER_END_DATE = END_DATE + timedelta(seconds=1)
+
+    s3_objects_stub = [
+        single_s3_object_stub('bar/', JUST_BEFORE_START_DATE),
+        single_s3_object_stub('bar/foo.txt', START_DATE),
+        single_s3_object_stub('bar/foo2.txt', JUST_AFTER_START_DATE),
+        single_s3_object_stub('bar/foo3.txt', JUST_BEFORE_END_DATE),
+        single_s3_object_stub('bar/foo4.txt', END_DATE),
+        single_s3_object_stub('bar/foo5.txt', JUST_AFTER_END_DATE),
+    ]
+
+    filtered_items = filter_s3_bucket_objects_within_date_range(s3_objects_stub)
+
+    assert len(filtered_items) == 2
+
+    assert filtered_items[0]["Key"] == 'bar/foo2.txt'
+    assert filtered_items[0]["LastModified"] == JUST_AFTER_START_DATE
+
+    assert filtered_items[1]["Key"] == 'bar/foo3.txt'
+    assert filtered_items[1]["LastModified"] == JUST_BEFORE_END_DATE
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_get_s3_bucket_objects_does_not_return_outside_of_date_range(notify_api, mocker):
+    START_DATE = datetime_in_past(days=9)
+    JUST_BEFORE_START_DATE = START_DATE - timedelta(seconds=1)
+    END_DATE = datetime_in_past(days=7)
+    JUST_AFTER_END_DATE = END_DATE + timedelta(seconds=1)
+
+    s3_objects_stub = [
+        single_s3_object_stub('bar/', JUST_BEFORE_START_DATE),
+        single_s3_object_stub('bar/foo1.txt', START_DATE),
+        single_s3_object_stub('bar/foo2.txt', END_DATE),
+        single_s3_object_stub('bar/foo3.txt', JUST_AFTER_END_DATE)
+    ]
+
+    filtered_items = filter_s3_bucket_objects_within_date_range(s3_objects_stub)
+
+    assert len(filtered_items) == 0
diff --git a/tests/app/celery/test_scheduled_tasks.py b/tests/app/celery/test_scheduled_tasks.py
index fcdd8ea93..47a4ed9ff 100644
--- a/tests/app/celery/test_scheduled_tasks.py
+++ b/tests/app/celery/test_scheduled_tasks.py
@@ -9,6 +9,7 @@ from freezegun import freeze_time
 
 from app.celery import scheduled_tasks
 from app.celery.scheduled_tasks import (
+    delete_dvla_response_files_older_than_seven_days,
     delete_email_notifications_older_than_seven_days,
     delete_inbound_sms_older_than_seven_days,
     delete_invitations,
@@ -42,7 +43,10 @@ from tests.app.db import create_notification, create_service, create_template, c
 from tests.app.conftest import (
     sample_job as create_sample_job,
     sample_notification_history as create_notification_history,
-    create_custom_template)
+    create_custom_template,
+    datetime_in_past
+)
+from tests.app.aws.test_s3 import single_s3_object_stub
 
 from tests.conftest import set_config_values
@@ -71,14 +75,13 @@ def _create_slow_delivery_notification(provider='mmg'):
     )
 
 
-@pytest.fixture(scope='function')
-def prepare_current_provider(restore_provider_details):
-    initial_provider = get_current_provider('sms')
-    initial_provider.updated_at = datetime.utcnow() - timedelta(minutes=30)
-    dao_update_provider_details(initial_provider)
-
-
+@pytest.mark.skip(reason="This doesn't actually test that the celery task wraps the function")
 def test_should_have_decorated_tasks_functions():
+    """
+    TODO: this test needs to be reviewed, as it doesn't actually test
+    that the celery task wraps the function. We're also running similar
+    tests elsewhere which need the same review.
+    """
     assert delete_verify_codes.__wrapped__.__name__ == 'delete_verify_codes'
     assert delete_notifications_created_more_than_a_week_ago_by_type.__wrapped__.__name__ == \
         'delete_notifications_created_more_than_a_week_ago_by_type'
@@ -93,6 +96,15 @@ def test_should_have_decorated_tasks_functions():
         'delete_inbound_sms_older_than_seven_days'
     assert remove_transformed_dvla_files.__wrapped__.__name__ == \
         'remove_transformed_dvla_files'
+    assert delete_dvla_response_files_older_than_seven_days.__wrapped__.__name__ == \
+        'delete_dvla_response_files_older_than_seven_days'
+
+
+@pytest.fixture(scope='function')
+def prepare_current_provider(restore_provider_details):
+    initial_provider = get_current_provider('sms')
+    initial_provider.updated_at = datetime.utcnow() - timedelta(minutes=30)
+    dao_update_provider_details(initial_provider)
 
 
 def test_should_call_delete_sms_notifications_more_than_week_in_task(notify_api, mocker):
@@ -548,3 +560,49 @@ def test_remove_dvla_transformed_files_does_not_remove_files(mocker, sample_serv
     remove_transformed_dvla_files()
 
     s3.remove_transformed_dvla_file.assert_has_calls([])
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_delete_dvla_response_files_older_than_seven_days_removes_old_files(notify_api, mocker):
+    AFTER_SEVEN_DAYS = datetime_in_past(days=8)
+    single_page_s3_objects = [{
+        "Contents": [
+            single_s3_object_stub('bar/foo1.txt', AFTER_SEVEN_DAYS),
+            single_s3_object_stub('bar/foo2.txt', AFTER_SEVEN_DAYS),
+        ]
+    }]
+    mocker.patch(
+        'app.celery.scheduled_tasks.s3.get_s3_bucket_objects', return_value=single_page_s3_objects[0]["Contents"]
+    )
+    remove_s3_mock = mocker.patch('app.celery.scheduled_tasks.s3.remove_s3_object')
+
+    delete_dvla_response_files_older_than_seven_days()
+
+    remove_s3_mock.assert_has_calls([
+        call(current_app.config['DVLA_RESPONSE_BUCKET_NAME'], single_page_s3_objects[0]["Contents"][0]["Key"]),
+        call(current_app.config['DVLA_RESPONSE_BUCKET_NAME'], single_page_s3_objects[0]["Contents"][1]["Key"])
+    ])
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_delete_dvla_response_files_older_than_seven_days_does_not_remove_files(notify_api, mocker):
+    START_DATE = datetime_in_past(days=9)
+    JUST_BEFORE_START_DATE = datetime_in_past(days=9, seconds=1)
+    END_DATE = datetime_in_past(days=7)
+    JUST_AFTER_END_DATE = END_DATE + timedelta(seconds=1)
+
+    single_page_s3_objects = [{
+        "Contents": [
+            single_s3_object_stub('bar/foo1.txt', JUST_BEFORE_START_DATE),
+            single_s3_object_stub('bar/foo2.txt', START_DATE),
+            single_s3_object_stub('bar/foo3.txt', END_DATE),
+            single_s3_object_stub('bar/foo4.txt', JUST_AFTER_END_DATE),
+        ]
+    }]
+    mocker.patch(
+        'app.celery.scheduled_tasks.s3.get_s3_bucket_objects', return_value=single_page_s3_objects[0]["Contents"]
+    )
+    remove_s3_mock = mocker.patch('app.celery.scheduled_tasks.s3.remove_s3_object')
+    delete_dvla_response_files_older_than_seven_days()
+
+    remove_s3_mock.assert_not_called()
diff --git a/tests/app/conftest.py b/tests/app/conftest.py
index 91eff00fc..f01fcae15 100644
--- a/tests/app/conftest.py
+++ b/tests/app/conftest.py
@@ -1,12 +1,14 @@
+from datetime import (datetime, date, timedelta)
 import json
 import uuid
-from datetime import (datetime, date, timedelta)
 
-import requests_mock
+from flask import current_app, url_for
+
 import pytest
+import pytz
+import requests_mock
 from sqlalchemy import asc
 from sqlalchemy.orm.session import make_transient
-from flask import current_app, url_for
 
 from app import db
 from app.models import (
@@ -35,7 +37,6 @@ from app.dao.notifications_dao import dao_create_notification
 from app.dao.invited_user_dao import save_invited_user
 from app.dao.provider_rates_dao import create_provider_rates
 from app.clients.sms.firetext import FiretextClient
-
 from tests import create_authorization_header
 from tests.app.db import create_user, create_template, create_notification
@@ -1018,3 +1019,7 @@ def admin_request(client):
         return json_resp
 
     return AdminRequest
+
+
+def datetime_in_past(days=0, seconds=0):
+    return datetime.now(tz=pytz.utc) - timedelta(days=days, seconds=seconds)
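
Note on the date-range filter in app/aws/s3.py above: S3 returns each Object's LastModified as an offset-aware datetime, which is why filter_s3_bucket_objects_within_date_range builds its window with datetime.now(tz=pytz.utc) rather than datetime.utcnow(). A minimal sketch of the behaviour this relies on, illustrative only and not part of the patch (variable names are made up here):

    from datetime import datetime, timedelta
    import pytz

    aware_now = datetime.now(tz=pytz.utc)   # offset-aware, like S3's LastModified values
    naive_now = datetime.utcnow()           # offset-naive

    # With the defaults older_than=7 and limit_days=2, the filter keeps objects
    # whose LastModified falls strictly between 9 and 7 days ago.
    end_date = aware_now - timedelta(days=7)
    start_date = end_date - timedelta(days=2)

    aware_now > end_date    # fine: both operands are offset-aware
    # naive_now > end_date  # would raise TypeError: can't compare offset-naive
    #                       # and offset-aware datetimes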