From afcdf1f9a1ea8a324f0ca19abb07e096c4914a06 Mon Sep 17 00:00:00 2001 From: Toby Lorne Date: Wed, 23 Jan 2019 14:26:43 +0000 Subject: [PATCH 1/3] Exit if celery processes are not running In 4427827b2ff7cc790d1e3400a9eeca7b8c22b991 and celery monitoring was changed from using PID files to actually looking at processes. If celery workers get OOM killed (for instance) the container init script would not restart them, this is because `get_celery_pids` would not contain any processes that contained the string celery. This would cause the pipe to fail (-o pipefail). APP_PIDS would not get updated but the script would continue to run. This caused the script to not restart the celery processes. We think the correct behaviour when celery processes are killed (i.e. there are no more celery processes running in a container) is to kill the container. The PaaS should then schedule new ones which may remediate the cause of the celery processes being killed. Upon detection of no celery processes running, some diagnostic information from the environment is sent to the logs, e.g.: ``` CF_INSTANCE_ADDR=10.0.32.4:61012 CF_INSTANCE_INTERNAL_IP=10.255.184.9 CF_INSTANCE_GUID=81c57dbc-e706-411e-6a5f-2013 CF_INSTANCE_PORT=61012 CF_INSTANCE_IP=10.0.32.4 ``` Then the script (which is the container entrypoint) exits 1. Co-author: @servingupaces @tlwr --- scripts/run_multi_worker_app_paas.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index f195e59dd..b44ab28d3 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -69,7 +69,10 @@ function get_celery_pids { # get the PIDs of the process whose parent is the root process # print only pid and their command, get the ones with "celery" in their name # and keep only these PIDs + + set +o pipefail # so grep returning no matches does not premature fail pipe APP_PIDS=$(pgrep -P 1 | xargs ps -o pid=,command= -p | grep celery | cut -f1 -d/) + set -o pipefail # pipefail should be set everywhere else } function send_signal_to_celery_processes { @@ -98,9 +101,28 @@ function start_logs_tail { echo "tail pid: ${LOGS_TAIL_PID}" } +function ensure_celery_is_running { + if [ "${APP_PIDS}" = "" ]; then + echo "There are no celery processes running, this container is bad" + + echo "Exporting CF information for diagnosis" + + env | grep CF + + echo "Sleeping 15 seconds for logs to get shipped" + + sleep 15 + + exit 1 + fi +} + function run { while true; do get_celery_pids + + ensure_celery_is_running + for APP_PID in ${APP_PIDS}; do kill -0 ${APP_PID} 2&>/dev/null || return 1 done From fa4cff5eb75ec47851002b2aefade133ce6b240c Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 23 Jan 2019 16:00:00 +0000 Subject: [PATCH 2/3] Bump sender memory to 3GB --- manifest-delivery-base.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifest-delivery-base.yml b/manifest-delivery-base.yml index 1751b3e66..bf136b1df 100644 --- a/manifest-delivery-base.yml +++ b/manifest-delivery-base.yml @@ -68,7 +68,7 @@ applications: - name: notify-delivery-worker-sender command: scripts/run_multi_worker_app_paas.sh celery multi start 3 -c 10 -A run_celery.notify_celery --loglevel=INFO -Q send-sms-tasks,send-email-tasks - memory: 2G + memory: 3G env: NOTIFY_APP_NAME: delivery-worker-sender From 3528aab25ba17c4fc38c7d38170c1bdcc80efc7a Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 23 Jan 2019 16:23:58 +0000 Subject: [PATCH 3/3] Kill the other processes started by the script We use exec to start awslogs_agent and then a tail to print logs to stdout. CF docs[1] recommend to use exec to start processes which seems to imply that as long as there are commands running the container will remain up and running. This commit ensures that if there are no celery tasks running we will kill any other processes that we have started, so that the container will no longer be considered healthy by cloudfoundry and will be replaced. 1: https://docs.cloudfoundry.org/devguide/deploy-apps/manifest.html#start-commands --- scripts/run_multi_worker_app_paas.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index b44ab28d3..6824923ea 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -113,6 +113,10 @@ function ensure_celery_is_running { sleep 15 + echo "Killing awslogs_agent and tail" + kill -9 ${AWSLOGS_AGENT_PID} + kill -9 ${LOGS_TAIL_PID} + exit 1 fi }