diff --git a/app/aws/s3.py b/app/aws/s3.py
index 17baeb398..dc4abeb19 100644
--- a/app/aws/s3.py
+++ b/app/aws/s3.py
@@ -82,6 +82,35 @@ def list_s3_objects():
     )
 
 
+def cleanup_old_s3_objects():
+
+    bucket_name = current_app.config["CSV_UPLOAD_BUCKET"]["bucket"]
+    s3_client = get_s3_client()
+    # Our reports only support 7 days, but can be scheduled 3 days in advance.
+    # Use 14 days for the v1.0 version of this behavior.
+    time_limit = aware_utcnow() - datetime.timedelta(days=14)
+    try:
+        response = s3_client.list_objects_v2(Bucket=bucket_name)
+        while True:
+            for obj in response.get("Contents", []):
+                if obj["LastModified"] <= time_limit:
+                    current_app.logger.info(
+                        f"#delete-old-s3-objects Would delete: {obj['LastModified']} {obj['Key']}"
+                    )
+            if "NextContinuationToken" in response:
+                response = s3_client.list_objects_v2(
+                    Bucket=bucket_name,
+                    ContinuationToken=response["NextContinuationToken"],
+                )
+            else:
+                break
+    except Exception:
+        current_app.logger.error(
+            "#delete-old-s3-objects An error occurred while cleaning up old s3 objects",
+            exc_info=True,
+        )
+
+
 def get_s3_files():
 
     bucket_name = current_app.config["CSV_UPLOAD_BUCKET"]["bucket"]
diff --git a/app/celery/tasks.py b/app/celery/tasks.py
index e6ed717e7..b173afbb3 100644
--- a/app/celery/tasks.py
+++ b/app/celery/tasks.py
@@ -446,6 +446,11 @@ def regenerate_job_cache():
     s3.get_s3_files()
 
 
+@notify_celery.task(name="delete-old-s3-objects")
+def delete_old_s3_objects():
+    s3.cleanup_old_s3_objects()
+
+
 @notify_celery.task(name="process-incomplete-jobs")
 def process_incomplete_jobs(job_ids):
     jobs = [dao_get_job_by_id(job_id) for job_id in job_ids]
diff --git a/app/config.py b/app/config.py
index c4ab09e3c..71fa4ed23 100644
--- a/app/config.py
+++ b/app/config.py
@@ -249,6 +249,11 @@ class Config(object):
                 "schedule": crontab(hour=6, minute=0),
                 "options": {"queue": QueueNames.PERIODIC},
             },
+            "delete_old_s3_objects": {
+                "task": "delete-old-s3-objects",
+                "schedule": crontab(minute="*/5"),
+                "options": {"queue": QueueNames.PERIODIC},
+            },
             "regenerate-job-cache": {
                 "task": "regenerate-job-cache",
                 "schedule": crontab(minute="*/30"),
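
For reference (not part of the diff): the manual `while True` / `NextContinuationToken` loop above could equivalently be written with boto3's built-in paginator, which handles the continuation-token bookkeeping. This is a minimal sketch under the same assumptions as the new function, i.e. that `get_s3_client()`, `aware_utcnow()`, and the `CSV_UPLOAD_BUCKET` config key exist in `app/aws/s3.py`; the function name `cleanup_old_s3_objects_paginated` is hypothetical.

```python
import datetime

from flask import current_app


def cleanup_old_s3_objects_paginated():
    # Sketch only: get_s3_client() and aware_utcnow() are the helpers
    # already used in app/aws/s3.py, assumed to be importable here.
    bucket_name = current_app.config["CSV_UPLOAD_BUCKET"]["bucket"]
    s3_client = get_s3_client()
    time_limit = aware_utcnow() - datetime.timedelta(days=14)

    # The paginator walks NextContinuationToken internally, yielding one
    # list_objects_v2 response page per iteration.
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get("Contents", []):
            if obj["LastModified"] <= time_limit:
                current_app.logger.info(
                    f"#delete-old-s3-objects Would delete: {obj['LastModified']} {obj['Key']}"
                )
```

Like the v1.0 function in the diff, this only logs candidates; the comparison works because `LastModified` in S3 list responses is a timezone-aware UTC datetime, matching `aware_utcnow()`.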