from datetime import datetime, timedelta

from flask import current_app
import pytz
from boto3 import client, resource
import botocore

FILE_LOCATION_STRUCTURE = 'service-{}-notify/{}.csv'
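
# Illustrative note, not part of the original module: the structure above expands to keys
# like 'service-<service_id>-notify/<job_id>.csv', e.g. (made-up ids)
# FILE_LOCATION_STRUCTURE.format('1234', 'abcd') == 'service-1234-notify/abcd.csv'.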


def get_s3_file(bucket_name, file_location):
    s3_file = get_s3_object(bucket_name, file_location)
    return s3_file.get()['Body'].read().decode('utf-8')


def get_s3_object(bucket_name, file_location):
    s3 = resource('s3')
    return s3.Object(bucket_name, file_location)


def file_exists(bucket_name, file_location):
    try:
        # accessing the object's metadata triggers a HEAD request, which raises if the object is missing
        get_s3_object(bucket_name, file_location).metadata
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['ResponseMetadata']['HTTPStatusCode'] == 404:
            return False
        raise
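

# Minimal usage sketch, not part of the original module: read an uploaded file only when
# it is present. The helper name is made up; AWS credentials and network access are assumed.
def _example_read_if_present(bucket_name, file_location):
    if file_exists(bucket_name, file_location):
        return get_s3_file(bucket_name, file_location)
    return None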


def get_job_location(service_id, job_id):
    return (
        current_app.config['CSV_UPLOAD_BUCKET_NAME'],
        FILE_LOCATION_STRUCTURE.format(service_id, job_id),
    )


def get_job_from_s3(service_id, job_id):
    obj = get_s3_object(*get_job_location(service_id, job_id))
    return obj.get()['Body'].read().decode('utf-8')


def get_job_metadata_from_s3(service_id, job_id):
    obj = get_s3_object(*get_job_location(service_id, job_id))
    return obj.get()['Metadata']


def remove_job_from_s3(service_id, job_id):
    return remove_s3_object(*get_job_location(service_id, job_id))


def get_s3_bucket_objects(bucket_name, subfolder='', older_than=7, limit_days=2):
    # older_than and limit_days are accepted but not used here; the date filtering
    # happens in filter_s3_bucket_objects_within_date_range below
    boto_client = client('s3', current_app.config['AWS_REGION'])
    paginator = boto_client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(
        Bucket=bucket_name,
        Prefix=subfolder
    )

    all_objects_in_bucket = []
    for page in page_iterator:
        if page.get('Contents'):
            all_objects_in_bucket.extend(page['Contents'])

    return all_objects_in_bucket


def filter_s3_bucket_objects_within_date_range(bucket_objects, older_than=7, limit_days=2):
    """
    S3 returns each Object['LastModified'] as an offset-aware timestamp, so the
    date range boundaries below are built from offset-aware datetimes too.

    S3 also returns an extra Object corresponding to the container directory;
    it is redundant, so keys ending in '/' are filtered out.
    """
    end_date = datetime.now(tz=pytz.utc) - timedelta(days=older_than)
    start_date = end_date - timedelta(days=limit_days)

    filtered_items = [item for item in bucket_objects if all([
        not item['Key'].endswith('/'),
        item['LastModified'] > start_date,
        item['LastModified'] < end_date
    ])]

    return filtered_items
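

# get_s3_bucket_objects and filter_s3_bucket_objects_within_date_range share the
# older_than/limit_days parameters and appear designed to be used together: list every
# object under a prefix, then narrow to the date window. A minimal sketch follows; the
# helper name is made up and a Flask application context plus AWS credentials are assumed.
def _example_objects_to_purge(bucket_name):
    all_objects = get_s3_bucket_objects(bucket_name)
    return filter_s3_bucket_objects_within_date_range(
        all_objects, older_than=7, limit_days=2
    )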


def remove_s3_object(bucket_name, object_key):
    obj = get_s3_object(bucket_name, object_key)
    return obj.delete()


def remove_transformed_dvla_file(job_id):
    bucket_name = current_app.config['DVLA_BUCKETS']['job']
    file_location = '{}-dvla-job.text'.format(job_id)
    obj = get_s3_object(bucket_name, file_location)
    return obj.delete()


def get_list_of_files_by_suffix(bucket_name, subfolder='', suffix='', last_modified=None):
    s3_client = client('s3', current_app.config['AWS_REGION'])
    paginator = s3_client.get_paginator('list_objects_v2')

    page_iterator = paginator.paginate(
        Bucket=bucket_name,
        Prefix=subfolder
    )

    for page in page_iterator:
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.lower().endswith(suffix.lower()):
                if not last_modified or obj['LastModified'] >= last_modified:
                    yield key
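

# Minimal usage sketch for the generator above, not part of the original module: collect
# keys with a given suffix modified in roughly the last day. The '.csv' suffix and helper
# name are arbitrary examples; LastModified is offset-aware, so the cutoff must be too.
def _example_recent_files_with_suffix(bucket_name):
    yesterday = datetime.now(tz=pytz.utc) - timedelta(days=1)
    return list(get_list_of_files_by_suffix(
        bucket_name,
        suffix='.csv',
        last_modified=yesterday,
    ))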