From 3007ff8f04d151cb83808bda83f38aa7769acf19 Mon Sep 17 00:00:00 2001 From: Ben Thorner Date: Wed, 12 Jan 2022 16:18:53 +0000 Subject: [PATCH] Investigate duplicate tasks This proves that a task can be duplicated **even** when the prefetch multiplier [1] is set to 1. This is because prefetched tasks are still present on the SQS queue and subject to the visibility timeout, whereas tasks in progress are not (deleted, unless acks_late is set [2]). I get the same results with: - A concurrency of 2 processes; or - A concurrency of 1 and 2 workers This is consistent with [3], which reinforces that there's no way to prevent this behaviour for successive, long-running tasks. Arguably the current behaviour - without acks_late - is worse because the task that gets duplicated is not the task at "fault" (the long-running one). Suggested actions ----------------- Keep the worker as-is: the settings are appropriate for long-running tasks and do help minimise any duplication. Turning on acks_late has a risk of creating new kinds of duplication - may be worse than now. Continue other work to parallelise and optimise long-running tasks, so that the visibility timeout is never an issue in the first place. Demo of task duplication ------------------------ In order to do the demo we need: - 3 tasks and 2 processes: the hypothesis is that process #1 will pick up tasks #1 and #2, with task #2 "timing out". - To poll more often than the timeout, so that process #2 will pick up (duplicate) task #2 after it finishes #3. 
Alternatively, use two workers: source environment.sh && celery -A run_celery.notify_celery worker --loglevel=INFO --concurrency=1 source environment.sh && celery -A run_celery.notify_celery worker --loglevel=INFO --concurrency=1 Command to schedule the tasks: from app.celery.reporting_tasks import long_task [long_task.apply_async(queue='reporting-tasks') for i in [1,2,3]] Logs that show the duplication: [2022-01-12 16:50:09,333: INFO/MainProcess] Task long[4c23eaf0-69fc-4951-b215-437d79e30462] received [2022-01-12 16:50:09,901: INFO/ForkPoolWorker-1] Running long_task [2022-01-12 16:50:10,208: INFO/MainProcess] Task long[7c22c30d-e61d-4d2d-834c-7dae56b20064] received [2022-01-12 16:50:10,253: INFO/ForkPoolWorker-2] Running long_task [2022-01-12 16:50:11,373: INFO/MainProcess] Task long[67e5526d-4a82-430b-8eb5-b8022a20bf70] received [2022-01-12 16:50:21,410: INFO/MainProcess] Task long[67e5526d-4a82-430b-8eb5-b8022a20bf70] received [2022-01-12 16:50:30,499: INFO/ForkPoolWorker-3] Running long_task [2022-01-12 16:50:36,555: INFO/ForkPoolWorker-4] Running long_task Note that when the first two tasks are received varies: I've seen them received separately and at the same time. 
[1]: https://docs.celeryproject.org/en/stable/userguide/configuration.html#std-setting-worker_prefetch_multiplier [2]: https://docs.celeryproject.org/en/stable/userguide/tasks.html#Task.acks_late [3]: https://github.com/celery/celery/issues/2788#issuecomment-136273421 --- Makefile | 2 +- app/celery/reporting_tasks.py | 6 ++++++ app/config.py | 5 ++++- run_celery.py | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 0204d01d3..a15e5a300 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ run-celery: ## Run celery -A run_celery.notify_celery worker \ --pidfile="/tmp/celery.pid" \ --loglevel=INFO \ - --concurrency=4 + --concurrency=2 .PHONY: run-celery-beat run-celery-beat: ## Run celery beat diff --git a/app/celery/reporting_tasks.py b/app/celery/reporting_tasks.py index fae3f148e..f72a3879e 100644 --- a/app/celery/reporting_tasks.py +++ b/app/celery/reporting_tasks.py @@ -16,6 +16,12 @@ from app.dao.fact_notification_status_dao import ( ) from app.models import EMAIL_TYPE, LETTER_TYPE, SMS_TYPE +@notify_celery.task(name='long') +def long_task(): + from time import sleep + current_app.logger.info("Running long_task") + sleep(20) + @notify_celery.task(name="create-nightly-billing") @cronitor("create-nightly-billing") diff --git a/app/config.py b/app/config.py index 46413ca70..475bd87ce 100644 --- a/app/config.py +++ b/app/config.py @@ -192,7 +192,8 @@ class Config(object): 'broker_url': 'sqs://', 'broker_transport_options': { 'region': AWS_REGION, - 'visibility_timeout': 310, + 'visibility_timeout': 10, + 'wait_time_seconds': 1, 'queue_name_prefix': NOTIFICATION_QUEUE_PREFIX, }, 'timezone': 'Europe/London', @@ -343,10 +344,12 @@ class Config(object): # we can set celeryd_prefetch_multiplier to be 1 for celery apps which handle only long running tasks if os.getenv('CELERYD_PREFETCH_MULTIPLIER'): CELERY['worker_prefetch_multiplier'] = os.getenv('CELERYD_PREFETCH_MULTIPLIER') + CELERY['worker_prefetch_multiplier'] = 1 # on 
reporting worker, restart workers after each task is executed to help prevent memory leaks if os.getenv('CELERYD_MAX_TASKS_PER_CHILD'): CELERY['worker_max_tasks_per_child'] = int(os.getenv('CELERYD_MAX_TASKS_PER_CHILD')) + CELERY['worker_max_tasks_per_child'] = 1 FROM_NUMBER = 'development' diff --git a/run_celery.py b/run_celery.py index e21533c86..d042beddc 100644 --- a/run_celery.py +++ b/run_celery.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import pycurl from flask import Flask # notify_celery is referenced from manifest_delivery_base.yml, and cannot be removed