From 7ad0c4103a5c22aa650baf31cba5ab28eade0b1e Mon Sep 17 00:00:00 2001 From: Ben Thorner Date: Mon, 24 Jan 2022 12:45:24 +0000 Subject: [PATCH] Stop killing reporting processes after each task Previously we think this setting was necessary to avoid a memory leak [1], but it's unclear if this is still an issue: - We've advanced two major versions of Celery. - Some of the tasks are now quicker and leaner. Restarting worker sub-processes after each task is a big problem for performance, as we move towards parallelising our reporting. This is something of a test to see if we can manage without this setting. Note that we need to unset the variable manually: cf unset-env notify-delivery-worker-reporting CELERYD_MAX_TASKS_PER_CHILD In the worst case we can always re-run any failed tasks. To check the worker is still behaving as expected, we can: - Monitor CPU / memory graphs for it. - Check `cf events` for unexpected restarts / crashes. - Compare numbers of task completion logs to previous days. - Check the number of new billing / status rows looks right. [1]: https://github.com/alphagov/notifications-api/commit/ad419f7592524292f8da8447814cc0bbc37f240e --- app/config.py | 4 ---- manifest.yml.j2 | 1 - 2 files changed, 5 deletions(-) diff --git a/app/config.py b/app/config.py index 46413ca70..0afc852ed 100644 --- a/app/config.py +++ b/app/config.py @@ -344,10 +344,6 @@ class Config(object): if os.getenv('CELERYD_PREFETCH_MULTIPLIER'): CELERY['worker_prefetch_multiplier'] = os.getenv('CELERYD_PREFETCH_MULTIPLIER') - # on reporting worker, restart workers after each task is executed to help prevent memory leaks - if os.getenv('CELERYD_MAX_TASKS_PER_CHILD'): - CELERY['worker_max_tasks_per_child'] = int(os.getenv('CELERYD_MAX_TASKS_PER_CHILD')) - FROM_NUMBER = 'development' STATSD_HOST = os.getenv('STATSD_HOST') diff --git a/manifest.yml.j2 b/manifest.yml.j2 index b2328641e..b8ac8a803 100644 --- a/manifest.yml.j2 +++ b/manifest.yml.j2 @@ -56,7 +56,6 @@ 'notify-delivery-worker-periodic': {}, 'notify-delivery-worker-reporting': { 'additional_env_vars': { - 'CELERYD_MAX_TASKS_PER_CHILD': 1, 'CELERYD_PREFETCH_MULTIPLIER': 1, 'SQLALCHEMY_STATEMENT_TIMEOUT': 7200 }