merge from main

Kenneth Kehl
2024-08-22 08:38:01 -07:00
29 changed files with 855 additions and 1374 deletions


@@ -19,26 +19,52 @@ JOBS = ExpiringDict(max_len=20000, max_age_seconds=ttl)
 JOBS_CACHE_HITS = "JOBS_CACHE_HITS"
 JOBS_CACHE_MISSES = "JOBS_CACHE_MISSES"
 
+# Global variable
+s3_client = None
+s3_resource = None
+
+
+def get_s3_client():
+    global s3_client
+    if s3_client is None:
+        access_key = current_app.config["CSV_UPLOAD_BUCKET"]["access_key_id"]
+        secret_key = current_app.config["CSV_UPLOAD_BUCKET"]["secret_access_key"]
+        region = current_app.config["CSV_UPLOAD_BUCKET"]["region"]
+        session = Session(
+            aws_access_key_id=access_key,
+            aws_secret_access_key=secret_key,
+            region_name=region,
+        )
+        s3_client = session.client("s3")
+    return s3_client
+
+
+def get_s3_resource():
+    global s3_resource
+    if s3_resource is None:
+        access_key = current_app.config["CSV_UPLOAD_BUCKET"]["access_key_id"]
+        secret_key = current_app.config["CSV_UPLOAD_BUCKET"]["secret_access_key"]
+        region = current_app.config["CSV_UPLOAD_BUCKET"]["region"]
+        session = Session(
+            aws_access_key_id=access_key,
+            aws_secret_access_key=secret_key,
+            region_name=region,
+        )
+        s3_resource = session.resource("s3", config=AWS_CLIENT_CONFIG)
+    return s3_resource
+
+
 def list_s3_objects():
-    bucket_name = current_app.config["CSV_UPLOAD_BUCKET"]["bucket"]
-    access_key = current_app.config["CSV_UPLOAD_BUCKET"]["access_key_id"]
-    secret_key = current_app.config["CSV_UPLOAD_BUCKET"]["secret_access_key"]
-    region = current_app.config["CSV_UPLOAD_BUCKET"]["region"]
-    session = Session(
-        aws_access_key_id=access_key,
-        aws_secret_access_key=secret_key,
-        region_name=region,
-    )
-    s3 = session.client("s3")
+    bucket_name = current_app.config["CSV_UPLOAD_BUCKET"]["bucket"]
+    s3_client = get_s3_client()
+
     try:
-        response = s3.list_objects_v2(Bucket=bucket_name)
+        response = s3_client.list_objects_v2(Bucket=bucket_name)
         while True:
             for obj in response.get("Contents", []):
                 yield obj["Key"]
             if "NextContinuationToken" in response:
-                response = s3.list_objects_v2(
+                response = s3_client.list_objects_v2(
                     Bucket=bucket_name,
                     ContinuationToken=response["NextContinuationToken"],
                 )
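The new get_s3_client() and get_s3_resource() helpers above build one boto3 Session-backed client and resource per process and reuse them, rather than constructing a fresh Session on every call; list_s3_objects() still pages through results by hand via NextContinuationToken. As a sketch only (the helper name and arguments are illustrative, not part of the commit), boto3's built-in paginator does the same token bookkeeping itself:

    # Sketch: a paginator-based equivalent of the manual
    # NextContinuationToken loop in list_s3_objects().
    def iter_bucket_keys(s3_client, bucket_name):
        paginator = s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name):
            for obj in page.get("Contents", []):
                yield obj["Key"]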
@@ -52,19 +78,11 @@ def list_s3_objects():
 def get_s3_files():
     current_app.logger.info("Regenerate job cache #notify-admin-1200")
     bucket_name = current_app.config["CSV_UPLOAD_BUCKET"]["bucket"]
-    access_key = current_app.config["CSV_UPLOAD_BUCKET"]["access_key_id"]
-    secret_key = current_app.config["CSV_UPLOAD_BUCKET"]["secret_access_key"]
-    region = current_app.config["CSV_UPLOAD_BUCKET"]["region"]
-    session = Session(
-        aws_access_key_id=access_key,
-        aws_secret_access_key=secret_key,
-        region_name=region,
-    )
     objects = list_s3_objects()
-    s3res = session.resource("s3", config=AWS_CLIENT_CONFIG)
+    s3res = get_s3_resource()
     current_app.logger.info(
         f"JOBS cache length before regen: {len(JOBS)} #notify-admin-1200"
     )
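For context on the JOBS cache whose length is logged here: ExpiringDict evicts entries by both count and age, so a regeneration pass can only partially repopulate it. A small illustration, assuming the expiringdict package and a hypothetical ttl (the commit only shows max_age_seconds=ttl):

    from expiringdict import ExpiringDict

    ttl = 60 * 60 * 24  # assumed value; the real ttl is defined elsewhere in the module
    JOBS = ExpiringDict(max_len=20000, max_age_seconds=ttl)
    JOBS["service-1-notify/job-1"] = "phone number\n+15555550100"
    print(len(JOBS))  # 1, until the entry ages out or max_len forces eviction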
@@ -100,12 +118,8 @@ def get_s3_file(bucket_name, file_location, access_key, secret_key, region):
 def download_from_s3(
     bucket_name, s3_key, local_filename, access_key, secret_key, region
 ):
-    session = Session(
-        aws_access_key_id=access_key,
-        aws_secret_access_key=secret_key,
-        region_name=region,
-    )
-    s3 = session.client("s3", config=AWS_CLIENT_CONFIG)
+    s3 = get_s3_client()
     result = None
     try:
         result = s3.download_file(bucket_name, s3_key, local_filename)
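Note that download_from_s3 keeps its credential parameters for call-site compatibility even though get_s3_client() now reads everything from app config. A hypothetical call site (bucket and key names invented for illustration):

    download_from_s3(
        bucket_name="example-csv-upload-bucket",  # assumed name
        s3_key="service-1-notify/job-1",          # assumed key layout
        local_filename="/tmp/job-1.csv",
        access_key=None,  # the last three arguments are now effectively unused
        secret_key=None,
        region=None,
    )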
@@ -124,27 +138,28 @@ def download_from_s3(
 def get_s3_object(bucket_name, file_location, access_key, secret_key, region):
-    session = Session(
-        aws_access_key_id=access_key,
-        aws_secret_access_key=secret_key,
-        region_name=region,
-    )
-    s3 = session.resource("s3", config=AWS_CLIENT_CONFIG)
-    return s3.Object(bucket_name, file_location)
+    s3 = get_s3_resource()
+    try:
+        return s3.Object(bucket_name, file_location)
+    except botocore.exceptions.ClientError:
+        current_app.logger.error(
+            f"Can't retrieve S3 Object from {file_location}", exc_info=True
+        )
 
 
 def purge_bucket(bucket_name, access_key, secret_key, region):
-    session = Session(
-        aws_access_key_id=access_key,
-        aws_secret_access_key=secret_key,
-        region_name=region,
-    )
-    s3 = session.resource("s3", config=AWS_CLIENT_CONFIG)
+    s3 = get_s3_resource()
     bucket = s3.Bucket(bucket_name)
     bucket.objects.all().delete()
 
 
-def file_exists(bucket_name, file_location, access_key, secret_key, region):
+def file_exists(file_location):
+    bucket_name = current_app.config["CSV_UPLOAD_BUCKET"]["bucket"]
+    access_key = current_app.config["CSV_UPLOAD_BUCKET"]["access_key_id"]
+    secret_key = current_app.config["CSV_UPLOAD_BUCKET"]["secret_access_key"]
+    region = current_app.config["CSV_UPLOAD_BUCKET"]["region"]
     try:
         # try and access metadata of object
         get_s3_object(
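file_exists() now derives the bucket settings from config and probes the object's metadata, which issues a HEAD request under the hood. A roughly equivalent standalone sketch against the plain client API (the helper name is assumed, not from the commit):

    import botocore.exceptions

    def key_exists(s3_client, bucket_name, file_location):
        # head_object raises ClientError (e.g. with a 404 code) when the key is absent
        try:
            s3_client.head_object(Bucket=bucket_name, Key=file_location)
            return True
        except botocore.exceptions.ClientError:
            return False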
@@ -173,9 +188,25 @@ def get_job_and_metadata_from_s3(service_id, job_id):
 def get_job_from_s3(service_id, job_id):
+    """
+    If and only if we hit a throttling exception of some kind, we want to try
+    exponential backoff. However, if we are getting NoSuchKey or something
+    that indicates things are permanently broken, we want to give up right away
+    to save time.
+    """
+    # We have to make sure the retries don't take up too much time, because
+    # we might be retrieving dozens of jobs. So the maximum sleep is:
+    # 0.4 + 0.8 + 1.6 + 3.2 = 6.0 seconds
     retries = 0
-    max_retries = 5
-    backoff_factor = 1
+    max_retries = 4
+    backoff_factor = 0.2
+
+    if not file_exists(FILE_LOCATION_STRUCTURE.format(service_id, job_id)):
+        current_app.logger.error(
+            f"This file does not exist {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}"
+        )
+        return None
 
     while retries < max_retries:
         try:
@@ -187,15 +218,34 @@ def get_job_from_s3(service_id, job_id):
                 "RequestTimeout",
                 "SlowDown",
             ]:
+                current_app.logger.error(
+                    f"Retrying job fetch {FILE_LOCATION_STRUCTURE.format(service_id, job_id)} retry_count={retries}",
+                    exc_info=True,
+                )
                 retries += 1
                 sleep_time = backoff_factor * (2**retries)  # Exponential backoff
                 time.sleep(sleep_time)
                 continue
-        except Exception:
-            current_app.logger.error("Failed to get object from bucket", exc_info=True)
-            raise
+            else:
+                # Typically this is "NoSuchKey"
+                current_app.logger.error(
+                    f"Failed to get job {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}",
+                    exc_info=True,
+                )
+                return None
-    raise Exception("Failed to get object after 5 attempts")
+        except Exception:
+            current_app.logger.error(
+                f"Failed to get job {FILE_LOCATION_STRUCTURE.format(service_id, job_id)} retry_count={retries}",
+                exc_info=True,
+            )
+            return None
+
+    current_app.logger.error(
+        f"Never retrieved job {FILE_LOCATION_STRUCTURE.format(service_id, job_id)}",
+        exc_info=True,
+    )
+    return None
 
 
 def incr_jobs_cache_misses():
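The retry budget in the loop above can be verified directly: retries is incremented before the sleep, so the waits are 0.2 * 2**1 through 0.2 * 2**4. A quick check:

    backoff_factor = 0.2
    waits = [backoff_factor * (2**r) for r in range(1, 5)]
    print(waits)                 # [0.4, 0.8, 1.6, 3.2]
    print(round(sum(waits), 1))  # 6.0 seconds of sleep at most, across 4 attempts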
@@ -267,19 +317,15 @@ def get_phone_number_from_s3(service_id, job_id, job_row_number):
     if job is None:
         current_app.logger.info(f"job {job_id} was not in the cache")
         job = get_job_from_s3(service_id, job_id)
+        # Even if it is None, put it here to avoid KeyErrors
         JOBS[job_id] = job
         incr_jobs_cache_misses()
     else:
         incr_jobs_cache_hits()
 
-    # If the job is None after our attempt to retrieve it from s3, it
-    # probably means the job is old and has been deleted from s3, in
-    # which case there is nothing we can do. It's unlikely to run into
-    # this, but it could theoretically happen, especially if we ever
-    # change the task schedules
     if job is None:
-        current_app.logger.warning(
-            f"Couldnt find phone for job_id {job_id} row number {job_row_number} because job is missing"
+        current_app.logger.error(
+            f"Couldnt find phone for job {FILE_LOCATION_STRUCTURE.format(service_id, job_id)} because job is missing"
         )
         return "Unavailable"
@@ -324,7 +370,7 @@ def get_personalisation_from_s3(service_id, job_id, job_row_number):
     # change the task schedules
     if job is None:
         current_app.logger.warning(
-            "Couldnt find personalisation for job_id {job_id} row number {job_row_number} because job is missing"
+            f"Couldnt find personalisation for job_id {job_id} row number {job_row_number} because job is missing"
         )
         return {}
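The one-character change in this last hunk is a real fix: without the f prefix, the braces were logged literally instead of interpolating the ids. For example:

    job_id, job_row_number = "abc123", 7
    print("job_id {job_id} row number {job_row_number}")
    # job_id {job_id} row number {job_row_number}
    print(f"job_id {job_id} row number {job_row_number}")
    # job_id abc123 row number 7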