Merge pull request #1032 from alphagov/imdad-feat-remove-dvla-response-files

Remove DVLA response files older than 7 days
Imdad Ahad
2017-06-13 17:23:58 +01:00
committed by GitHub
6 changed files with 274 additions and 16 deletions

app/aws/s3.py

@@ -1,6 +1,10 @@
-from boto3 import resource
+from datetime import datetime, timedelta
+
 from flask import current_app
+import pytz
+from boto3 import client, resource
 
 FILE_LOCATION_STRUCTURE = 'service-{}-notify/{}.csv'
@@ -24,7 +28,46 @@ def get_job_from_s3(service_id, job_id):
 def remove_job_from_s3(service_id, job_id):
     bucket_name = current_app.config['CSV_UPLOAD_BUCKET_NAME']
     file_location = FILE_LOCATION_STRUCTURE.format(service_id, job_id)
-    obj = get_s3_object(bucket_name, file_location)
+    return remove_s3_object(bucket_name, file_location)
+
+
+def get_s3_bucket_objects(bucket_name, subfolder='', older_than=7, limit_days=2):
+    boto_client = client('s3', current_app.config['AWS_REGION'])
+    paginator = boto_client.get_paginator('list_objects_v2')
+
+    page_iterator = paginator.paginate(
+        Bucket=bucket_name,
+        Prefix=subfolder
+    )
+
+    all_objects_in_bucket = []
+    for page in page_iterator:
+        all_objects_in_bucket.extend(page['Contents'])
+
+    return all_objects_in_bucket
+
+
+def filter_s3_bucket_objects_within_date_range(bucket_objects, older_than=7, limit_days=2):
+    """
+    S3 returns Object['LastModified'] as an offset-aware timestamp, so the
+    date range filter must take this into account.
+
+    S3 also returns an extra Object for the containing directory itself;
+    this is redundant and is filtered out.
+    """
+    end_date = datetime.now(tz=pytz.utc) - timedelta(days=older_than)
+    start_date = end_date - timedelta(days=limit_days)
+
+    filtered_items = [item for item in bucket_objects if all([
+        not item['Key'].endswith('/'),
+        item['LastModified'] > start_date,
+        item['LastModified'] < end_date
+    ])]
+
+    return filtered_items
+
+
+def remove_s3_object(bucket_name, object_key):
+    obj = get_s3_object(bucket_name, object_key)
     return obj.delete()
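A minimal sketch of the window the new filter keeps (not part of the diff; the stub keys are invented for illustration). With the defaults older_than=7 and limit_days=2, only objects last modified strictly between nine and seven days ago survive, and trailing-slash "directory" keys are dropped:

    from datetime import datetime, timedelta
    import pytz

    from app.aws.s3 import filter_s3_bucket_objects_within_date_range

    now = datetime.now(tz=pytz.utc)
    stub_objects = [
        {'Key': 'root/dispatch/', 'LastModified': now - timedelta(days=8)},        # directory marker: dropped
        {'Key': 'root/dispatch/a.txt', 'LastModified': now - timedelta(days=8)},   # 8 days old: kept
        {'Key': 'root/dispatch/b.txt', 'LastModified': now - timedelta(days=6)},   # too recent: dropped
        {'Key': 'root/dispatch/c.txt', 'LastModified': now - timedelta(days=10)},  # too old: dropped
    ]

    kept = filter_s3_bucket_objects_within_date_range(stub_objects)
    assert [o['Key'] for o in kept] == ['root/dispatch/a.txt']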

app/celery/scheduled_tasks.py

@@ -255,3 +255,29 @@ def remove_transformed_dvla_files():
     for job in jobs:
         s3.remove_transformed_dvla_file(job.id)
         current_app.logger.info("Transformed dvla file for job {} has been removed from s3.".format(job.id))
+
+
+@notify_celery.task(name="delete_dvla_response_files")
+@statsd(namespace="tasks")
+def delete_dvla_response_files_older_than_seven_days():
+    try:
+        start = datetime.utcnow()
+        bucket_objects = s3.get_s3_bucket_objects(
+            current_app.config['DVLA_RESPONSE_BUCKET_NAME'],
+            'root/dispatch'
+        )
+        older_than_seven_days = s3.filter_s3_bucket_objects_within_date_range(bucket_objects)
+
+        for f in older_than_seven_days:
+            s3.remove_s3_object(current_app.config['DVLA_RESPONSE_BUCKET_NAME'], f['Key'])
+
+        current_app.logger.info(
+            "Delete dvla response files started {} finished {} deleted {} files".format(
+                start,
+                datetime.utcnow(),
+                len(older_than_seven_days)
+            )
+        )
+    except SQLAlchemyError as e:
+        current_app.logger.exception("Failed to delete dvla response files")
+        raise
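The task can also be exercised in-process, without a beat schedule or worker; a hedged sketch, assuming the usual app and Celery wiring is in place (e.g. from a Flask shell):

    from app.celery.scheduled_tasks import delete_dvla_response_files_older_than_seven_days

    # .apply() is standard Celery: it runs the task body eagerly in the current
    # process instead of enqueueing it on the PERIODIC queue.
    delete_dvla_response_files_older_than_seven_days.apply()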

app/config.py

@@ -1,7 +1,8 @@
 from datetime import timedelta
+import os
+
 from celery.schedules import crontab
 from kombu import Exchange, Queue
-import os
 
 from app.models import (
     EMAIL_TYPE, SMS_TYPE, LETTER_TYPE,
@@ -209,6 +210,11 @@ class Config(object):
             'schedule': crontab(minute=40, hour=4),
             'options': {'queue': QueueNames.PERIODIC}
         },
+        'delete_dvla_response_files': {
+            'task': 'delete_dvla_response_files',
+            'schedule': crontab(minute=10, hour=5),
+            'options': {'queue': QueueNames.PERIODIC}
+        },
         'timeout-job-statistics': {
             'task': 'timeout-job-statistics',
             'schedule': crontab(minute=0, hour=5),
@@ -265,6 +271,7 @@ class Development(Config):
     SQLALCHEMY_ECHO = False
     NOTIFY_EMAIL_DOMAIN = 'notify.tools'
     CSV_UPLOAD_BUCKET_NAME = 'development-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notify.tools-ftp'
     NOTIFY_ENVIRONMENT = 'development'
     NOTIFICATION_QUEUE_PREFIX = 'development'
     DEBUG = True
@@ -284,6 +291,7 @@ class Test(Config):
     NOTIFY_ENVIRONMENT = 'test'
     DEBUG = True
     CSV_UPLOAD_BUCKET_NAME = 'test-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'test.notify.com-ftp'
     STATSD_ENABLED = True
     STATSD_HOST = "localhost"
     STATSD_PORT = 1000
@@ -316,6 +324,7 @@ class Preview(Config):
     NOTIFY_EMAIL_DOMAIN = 'notify.works'
     NOTIFY_ENVIRONMENT = 'preview'
     CSV_UPLOAD_BUCKET_NAME = 'preview-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notify.works-ftp'
     FROM_NUMBER = 'preview'
     API_RATE_LIMIT_ENABLED = True
@@ -324,6 +333,7 @@ class Staging(Config):
     NOTIFY_EMAIL_DOMAIN = 'staging-notify.works'
     NOTIFY_ENVIRONMENT = 'staging'
     CSV_UPLOAD_BUCKET_NAME = 'staging-notify-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'staging-notify.works-ftp'
     STATSD_ENABLED = True
     FROM_NUMBER = 'stage'
     API_RATE_LIMIT_ENABLED = True
@@ -333,6 +343,7 @@ class Live(Config):
     NOTIFY_EMAIL_DOMAIN = 'notifications.service.gov.uk'
     NOTIFY_ENVIRONMENT = 'live'
     CSV_UPLOAD_BUCKET_NAME = 'live-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notifications.service.gov.uk-ftp'
     STATSD_ENABLED = True
     FROM_NUMBER = 'GOVUK'
     FUNCTIONAL_TEST_PROVIDER_SERVICE_ID = '6c1d81bb-dae2-4ee9-80b0-89a4aae9f649'
@@ -350,6 +361,7 @@ class Sandbox(CloudFoundryConfig):
     NOTIFY_EMAIL_DOMAIN = 'notify.works'
     NOTIFY_ENVIRONMENT = 'sandbox'
     CSV_UPLOAD_BUCKET_NAME = 'cf-sandbox-notifications-csv-upload'
+    DVLA_RESPONSE_BUCKET_NAME = 'notify.works-ftp'
     FROM_NUMBER = 'sandbox'
     REDIS_ENABLED = False
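Two details of the new beat entry are easy to misread: the 'task' value is matched against the name registered by @notify_celery.task(name="delete_dvla_response_files"), not against the Python function name, and the crontab expresses a time of day rather than an interval. A small illustration:

    from celery.schedules import crontab

    # "At 05:10 every day" in the app's timezone (UTC by default),
    # not an interval of 5 hours 10 minutes.
    schedule = crontab(minute=10, hour=5)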

tests/app/aws/test_s3.py

@@ -1,8 +1,25 @@
 from unittest.mock import call
+from datetime import datetime, timedelta
 
 from flask import current_app
-from app.aws.s3 import get_s3_file, remove_transformed_dvla_file
+from freezegun import freeze_time
+
+from app.aws.s3 import (
+    get_s3_bucket_objects,
+    get_s3_file,
+    filter_s3_bucket_objects_within_date_range,
+    remove_transformed_dvla_file
+)
+from tests.app.conftest import datetime_in_past
+
+
+def single_s3_object_stub(key='foo', last_modified=datetime.utcnow()):
+    return {
+        'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
+        'Key': key,
+        'LastModified': last_modified
+    }
 
 
 def test_get_s3_file_makes_correct_call(notify_api, mocker):
@@ -25,3 +42,100 @@ def test_remove_transformed_dvla_file_makes_correct_call(notify_api, mocker):
         call(current_app.config['DVLA_UPLOAD_BUCKET_NAME'], '{}-dvla-job.text'.format(fake_uuid)),
         call().delete()
     ])
+
+
+def test_get_s3_bucket_objects_make_correct_pagination_call(notify_api, mocker):
+    paginator_mock = mocker.patch('app.aws.s3.client')
+
+    get_s3_bucket_objects('foo-bucket', subfolder='bar')
+
+    paginator_mock.assert_has_calls([
+        call().get_paginator().paginate(Bucket='foo-bucket', Prefix='bar')
+    ])
+
+
+def test_get_s3_bucket_objects_builds_objects_list_from_paginator(notify_api, mocker):
+    AFTER_SEVEN_DAYS = datetime_in_past(days=8)
+    paginator_mock = mocker.patch('app.aws.s3.client')
+    multiple_pages_s3_object = [
+        {
+            "Contents": [
+                single_s3_object_stub('bar/foo.txt', AFTER_SEVEN_DAYS),
+            ]
+        },
+        {
+            "Contents": [
+                single_s3_object_stub('bar/foo1.txt', AFTER_SEVEN_DAYS),
+            ]
+        }
+    ]
+    paginator_mock.return_value.get_paginator.return_value.paginate.return_value = multiple_pages_s3_object
+
+    bucket_objects = get_s3_bucket_objects('foo-bucket', subfolder='bar')
+
+    assert len(bucket_objects) == 2
+    assert set(bucket_objects[0].keys()) == set(['ETag', 'Key', 'LastModified'])
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_get_s3_bucket_objects_removes_redundant_root_object(notify_api, mocker):
+    AFTER_SEVEN_DAYS = datetime_in_past(days=8)
+    s3_objects_stub = [
+        single_s3_object_stub('bar/', AFTER_SEVEN_DAYS),
+        single_s3_object_stub('bar/foo.txt', AFTER_SEVEN_DAYS),
+    ]
+
+    filtered_items = filter_s3_bucket_objects_within_date_range(s3_objects_stub)
+
+    assert len(filtered_items) == 1
+    assert filtered_items[0]["Key"] == 'bar/foo.txt'
+    assert filtered_items[0]["LastModified"] == datetime_in_past(days=8)
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_filter_s3_bucket_objects_within_date_range_filters_by_date_range(notify_api, mocker):
+    START_DATE = datetime_in_past(days=9)
+    JUST_BEFORE_START_DATE = START_DATE - timedelta(seconds=1)
+    JUST_AFTER_START_DATE = START_DATE + timedelta(seconds=1)
+    END_DATE = datetime_in_past(days=7)
+    JUST_BEFORE_END_DATE = END_DATE - timedelta(seconds=1)
+    JUST_AFTER_END_DATE = END_DATE + timedelta(seconds=1)
+
+    s3_objects_stub = [
+        single_s3_object_stub('bar/', JUST_BEFORE_START_DATE),
+        single_s3_object_stub('bar/foo.txt', START_DATE),
+        single_s3_object_stub('bar/foo2.txt', JUST_AFTER_START_DATE),
+        single_s3_object_stub('bar/foo3.txt', JUST_BEFORE_END_DATE),
+        single_s3_object_stub('bar/foo4.txt', END_DATE),
+        single_s3_object_stub('bar/foo5.txt', JUST_AFTER_END_DATE),
+    ]
+
+    filtered_items = filter_s3_bucket_objects_within_date_range(s3_objects_stub)
+
+    assert len(filtered_items) == 2
+    assert filtered_items[0]["Key"] == 'bar/foo2.txt'
+    assert filtered_items[0]["LastModified"] == JUST_AFTER_START_DATE
+    assert filtered_items[1]["Key"] == 'bar/foo3.txt'
+    assert filtered_items[1]["LastModified"] == JUST_BEFORE_END_DATE
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_get_s3_bucket_objects_does_not_return_outside_of_date_range(notify_api, mocker):
+    START_DATE = datetime_in_past(days=9)
+    JUST_BEFORE_START_DATE = START_DATE - timedelta(seconds=1)
+    END_DATE = datetime_in_past(days=7)
+    JUST_AFTER_END_DATE = END_DATE + timedelta(seconds=1)
+
+    s3_objects_stub = [
+        single_s3_object_stub('bar/', JUST_BEFORE_START_DATE),
+        single_s3_object_stub('bar/foo1.txt', START_DATE),
+        single_s3_object_stub('bar/foo2.txt', END_DATE),
+        single_s3_object_stub('bar/foo3.txt', JUST_AFTER_END_DATE)
+    ]
+
+    filtered_items = filter_s3_bucket_objects_within_date_range(s3_objects_stub)
+
+    assert len(filtered_items) == 0
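These boundary tests are deterministic because freeze_time pins the clock that datetime_in_past reads; a quick sketch of the interplay, assuming the datetime_in_past helper added to tests/app/conftest.py (last file in this diff):

    from freezegun import freeze_time
    from tests.app.conftest import datetime_in_past

    with freeze_time("2016-01-01 11:00:00"):
        # datetime.now(tz=pytz.utc) inside the helper returns the frozen instant,
        # so "exactly seven days ago" is stable across test runs
        assert datetime_in_past(days=7).isoformat() == '2015-12-25T11:00:00+00:00'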

tests/app/celery/test_scheduled_tasks.py

@@ -9,6 +9,7 @@ from freezegun import freeze_time
 from app.celery import scheduled_tasks
 from app.celery.scheduled_tasks import (
+    delete_dvla_response_files_older_than_seven_days,
     delete_email_notifications_older_than_seven_days,
     delete_inbound_sms_older_than_seven_days,
     delete_invitations,
@@ -42,7 +43,10 @@ from tests.app.db import create_notification, create_service, create_template, c
 from tests.app.conftest import (
     sample_job as create_sample_job,
     sample_notification_history as create_notification_history,
-    create_custom_template)
+    create_custom_template,
+    datetime_in_past
+)
+from tests.app.aws.test_s3 import single_s3_object_stub
 from tests.conftest import set_config_values
@@ -71,14 +75,13 @@ def _create_slow_delivery_notification(provider='mmg'):
     )
 
 
-@pytest.fixture(scope='function')
-def prepare_current_provider(restore_provider_details):
-    initial_provider = get_current_provider('sms')
-    initial_provider.updated_at = datetime.utcnow() - timedelta(minutes=30)
-    dao_update_provider_details(initial_provider)
-
-
+@pytest.mark.skip(reason="This doesn't actually test the celery task wraps the function")
 def test_should_have_decorated_tasks_functions():
+    """
+    TODO: This test needs to be reviewed as this doesn't actually
+    test that the celery task is wrapping the function. We're also
+    running similar tests elsewhere which also need review.
+    """
     assert delete_verify_codes.__wrapped__.__name__ == 'delete_verify_codes'
     assert delete_notifications_created_more_than_a_week_ago_by_type.__wrapped__.__name__ == \
         'delete_notifications_created_more_than_a_week_ago_by_type'
@@ -93,6 +96,15 @@ def test_should_have_decorated_tasks_functions():
         'delete_inbound_sms_older_than_seven_days'
     assert remove_transformed_dvla_files.__wrapped__.__name__ == \
         'remove_transformed_dvla_files'
+    assert delete_dvla_response_files_older_than_seven_days.__wrapped__.__name__ == \
+        'delete_dvla_response_files_older_than_seven_days'
+
+
+@pytest.fixture(scope='function')
+def prepare_current_provider(restore_provider_details):
+    initial_provider = get_current_provider('sms')
+    initial_provider.updated_at = datetime.utcnow() - timedelta(minutes=30)
+    dao_update_provider_details(initial_provider)
 
 
 def test_should_call_delete_sms_notifications_more_than_week_in_task(notify_api, mocker):
@@ -548,3 +560,49 @@ def test_remove_dvla_transformed_files_does_not_remove_files(mocker, sample_serv
     remove_transformed_dvla_files()
 
     s3.remove_transformed_dvla_file.assert_has_calls([])
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_delete_dvla_response_files_older_than_seven_days_removes_old_files(notify_api, mocker):
+    AFTER_SEVEN_DAYS = datetime_in_past(days=8)
+    single_page_s3_objects = [{
+        "Contents": [
+            single_s3_object_stub('bar/foo1.txt', AFTER_SEVEN_DAYS),
+            single_s3_object_stub('bar/foo2.txt', AFTER_SEVEN_DAYS),
+        ]
+    }]
+    mocker.patch(
+        'app.celery.scheduled_tasks.s3.get_s3_bucket_objects', return_value=single_page_s3_objects[0]["Contents"]
+    )
+    remove_s3_mock = mocker.patch('app.celery.scheduled_tasks.s3.remove_s3_object')
+
+    delete_dvla_response_files_older_than_seven_days()
+
+    remove_s3_mock.assert_has_calls([
+        call(current_app.config['DVLA_RESPONSE_BUCKET_NAME'], single_page_s3_objects[0]["Contents"][0]["Key"]),
+        call(current_app.config['DVLA_RESPONSE_BUCKET_NAME'], single_page_s3_objects[0]["Contents"][1]["Key"])
+    ])
+
+
+@freeze_time("2016-01-01 11:00:00")
+def test_delete_dvla_response_files_older_than_seven_days_does_not_remove_files(notify_api, mocker):
+    START_DATE = datetime_in_past(days=9)
+    JUST_BEFORE_START_DATE = datetime_in_past(days=9, seconds=1)
+    END_DATE = datetime_in_past(days=7)
+    JUST_AFTER_END_DATE = END_DATE + timedelta(seconds=1)
+
+    single_page_s3_objects = [{
+        "Contents": [
+            single_s3_object_stub('bar/foo1.txt', JUST_BEFORE_START_DATE),
+            single_s3_object_stub('bar/foo2.txt', START_DATE),
+            single_s3_object_stub('bar/foo3.txt', END_DATE),
+            single_s3_object_stub('bar/foo4.txt', JUST_AFTER_END_DATE),
+        ]
+    }]
+    mocker.patch(
+        'app.celery.scheduled_tasks.s3.get_s3_bucket_objects', return_value=single_page_s3_objects[0]["Contents"]
+    )
+    remove_s3_mock = mocker.patch('app.celery.scheduled_tasks.s3.remove_s3_object')
+
+    delete_dvla_response_files_older_than_seven_days()
+
+    remove_s3_mock.assert_not_called()

tests/app/conftest.py

@@ -1,12 +1,14 @@
+from datetime import (datetime, date, timedelta)
 import json
 import uuid
-from datetime import (datetime, date, timedelta)
 
-import requests_mock
+from flask import current_app, url_for
 import pytest
+import pytz
+import requests_mock
 from sqlalchemy import asc
 from sqlalchemy.orm.session import make_transient
-from flask import current_app, url_for
 
 from app import db
 from app.models import (
@@ -35,7 +37,6 @@ from app.dao.notifications_dao import dao_create_notification
 from app.dao.invited_user_dao import save_invited_user
 from app.dao.provider_rates_dao import create_provider_rates
 from app.clients.sms.firetext import FiretextClient
-
 from tests import create_authorization_header
 from tests.app.db import create_user, create_template, create_notification
@@ -1018,3 +1019,7 @@ def admin_request(client):
         return json_resp
 
     return AdminRequest
+
+
+def datetime_in_past(days=0, seconds=0):
+    return datetime.now(tz=pytz.utc) - timedelta(days=days, seconds=seconds)