From 7c36c3894d60893374802f66ea146b2a877623eb Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Tue, 27 Feb 2018 17:37:09 +0000 Subject: [PATCH 01/11] Add a start script for running celery multi The existing script would not work with `celery multi` as it was trying to put it in the background and the get its pid.~ `celery multi` creates a number of worker processes and stores their PIDs in files named celeryN.pid, where N the index number of the worker (starting at 1). --- scripts/run_multi_worker_app_paas.sh | 118 +++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100755 scripts/run_multi_worker_app_paas.sh diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh new file mode 100755 index 000000000..82bfe6e55 --- /dev/null +++ b/scripts/run_multi_worker_app_paas.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +set -e -o pipefail + +TERMINATE_TIMEOUT=30 + +function check_params { + if [ -z "${NOTIFY_APP_NAME}" ]; then + echo "You must set NOTIFY_APP_NAME" + exit 1 + fi + + if [ -z "${CW_APP_NAME}" ]; then + CW_APP_NAME=${NOTIFY_APP_NAME} + fi +} + +function configure_aws_logs { + # create files so that aws logs agent doesn't complain + touch /home/vcap/logs/gunicorn_error.log + touch /home/vcap/logs/app.log.json + + aws configure set plugins.cwlogs cwlogs + + cat > /home/vcap/app/awslogs.conf << EOF +[general] +state_file = /home/vcap/logs/awslogs-state + +[/home/vcap/logs/app.log] +file = /home/vcap/logs/app.log.json +log_group_name = paas-${CW_APP_NAME}-application +log_stream_name = {hostname} + +[/home/vcap/logs/gunicorn_error.log] +file = /home/vcap/logs/gunicorn_error.log +log_group_name = paas-${CW_APP_NAME}-gunicorn +log_stream_name = {hostname} +EOF +} + +# For every PID, check if it's still running +# if it is, send the sigterm +function on_exit { + n=0 + while true; do + # refresh pids to account for the case that + # some workers may have terminated but others not + get_celery_pids + + # look here for explanation regarding this syntax: + # https://unix.stackexchange.com/a/298942/230401 + PROCESS_COUNT="${#APP_PIDS[@]}" + if [[ "${PROCESS_COUNT}" -eq "0" ]]; then + echo "No more .pid files found, exiting" + break + fi + + echo "Terminating celery processes with pids ${APP_PIDS}" + APP_PIDS=($APP_PIDS) + for APP_PID in ${APP_PIDS}; do + # if TERMINATE_TIMEOUT is reached, send SIGKILL + if [[ "$n" -ge "$TERMINATE_TIMEOUT" ]]; then + echo "Timeout reached, killing process with pid ${APP_PID}" + kill -9 ${APP_PID} || true + break + else + # else, if process is still running send SIGTERM + if [[ $(kill -0 ${APP_PID} 2&>/dev/null) ]]; then + echo "Terminating celery process with pid ${APP_PID}" + kill ${APP_PID} || true + fi + fi + done + let n=n+1 + sleep 1 + done +} + +function get_celery_pids { + APP_PIDS=`cat celery*.pid` +} + +function start_application { + exec "$@" + get_celery_pids + echo "Application process pids: ${APP_PIDS}" +} + +function start_aws_logs_agent { + exec aws logs push --region eu-west-1 --config-file /home/vcap/app/awslogs.conf & + AWSLOGS_AGENT_PID=$! + echo "AWS logs agent pid: ${AWSLOGS_AGENT_PID}" +} + +function run { + while true; do + for APP_PID in ${APP_PIDS}; do + kill -0 ${APP_PID} 2&>/dev/null || return 1 + done + kill -0 ${AWSLOGS_AGENT_PID} 2&>/dev/null || start_aws_logs_agent + sleep 1 + done +} + +echo "Run script pid: $$" + +check_params + +trap "on_exit" EXIT + +configure_aws_logs + +# The application has to start first! +start_application "$@" + +start_aws_logs_agent + +run From cf439575e539898d5ca7d2992952aed6d98258dc Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Tue, 27 Feb 2018 17:38:16 +0000 Subject: [PATCH 02/11] Make the sender-worker use 4 celery workers and 2G of RAM --- manifest-delivery-base.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/manifest-delivery-base.yml b/manifest-delivery-base.yml index e141c6482..bd17f9138 100644 --- a/manifest-delivery-base.yml +++ b/manifest-delivery-base.yml @@ -63,7 +63,8 @@ applications: NOTIFY_APP_NAME: delivery-worker-research - name: notify-delivery-worker-sender - command: scripts/run_app_paas.sh celery -A run_celery.notify_celery worker --loglevel=INFO --concurrency=11 -Q send-sms-tasks,send-email-tasks + command: scripts/run_multi_worker_app_paas.sh celery multi start 4 -c 10 -A run_celery.notify_celery --loglevel=INFO -Q send-sms-tasks,send-email-tasks + memory: 2G env: NOTIFY_APP_NAME: delivery-worker-sender From 9a8e771d955cd7f2202ef866ceef6a45ec08d9bf Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Tue, 27 Feb 2018 17:44:16 +0000 Subject: [PATCH 03/11] Lower the termination timeout to 10s Cloudfoundry only gives us 10s to terminate our processes: https://docs.cloudfoundry.org/devguide/deploy-apps/app-lifecycle.html#shutdown --- scripts/run_app_paas.sh | 2 +- scripts/run_multi_worker_app_paas.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_app_paas.sh b/scripts/run_app_paas.sh index cf0c7ccc7..105fa51f8 100755 --- a/scripts/run_app_paas.sh +++ b/scripts/run_app_paas.sh @@ -2,7 +2,7 @@ set -e -o pipefail -TERMINATE_TIMEOUT=30 +TERMINATE_TIMEOUT=10 function check_params { if [ -z "${NOTIFY_APP_NAME}" ]; then diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index 82bfe6e55..d54e4bd83 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -2,7 +2,7 @@ set -e -o pipefail -TERMINATE_TIMEOUT=30 +TERMINATE_TIMEOUT=10 function check_params { if [ -z "${NOTIFY_APP_NAME}" ]; then From bef80e3414092ed5ef1d5fc68c1ce32ce726d748 Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 28 Feb 2018 14:16:15 +0000 Subject: [PATCH 04/11] Use `eval` to run the command instead of exec `exec` is replacing the current shell to run the command, which means that the script execution stops at that line. Passing it to the background with `exec "$@" &` won't work either, because the script will move directly to the next command where it looks for the `.pid` files that have not yet been created because celery takes a few seconds to spin up all the processes. Using `sleep X` to remedy this seems just wrong given that 1. we can use `eval` that blocks until the command returns 2. there is no obvious benefit in sticking with `exec` --- scripts/run_multi_worker_app_paas.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index d54e4bd83..191b1f564 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -81,7 +81,7 @@ function get_celery_pids { } function start_application { - exec "$@" + eval "$@" get_celery_pids echo "Application process pids: ${APP_PIDS}" } From 138d4eee25dc7cfa98cc1408b019861855281ed1 Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 28 Feb 2018 14:34:48 +0000 Subject: [PATCH 05/11] Better logging of PIDs array --- scripts/run_multi_worker_app_paas.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index 191b1f564..26674378f 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -55,8 +55,7 @@ function on_exit { break fi - echo "Terminating celery processes with pids ${APP_PIDS}" - APP_PIDS=($APP_PIDS) + echo "Terminating celery processes with pids "${APP_PIDS} for APP_PID in ${APP_PIDS}; do # if TERMINATE_TIMEOUT is reached, send SIGKILL if [[ "$n" -ge "$TERMINATE_TIMEOUT" ]]; then @@ -64,6 +63,7 @@ function on_exit { kill -9 ${APP_PID} || true break else + echo "Timeout not reached yet, checking " ${APP_PID} # else, if process is still running send SIGTERM if [[ $(kill -0 ${APP_PID} 2&>/dev/null) ]]; then echo "Terminating celery process with pid ${APP_PID}" @@ -83,7 +83,7 @@ function get_celery_pids { function start_application { eval "$@" get_celery_pids - echo "Application process pids: ${APP_PIDS}" + echo "Application process pids: "${APP_PIDS} } function start_aws_logs_agent { From 1adee472306a5a58b6932b82ac95aa83feba3c02 Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 28 Feb 2018 14:36:06 +0000 Subject: [PATCH 06/11] Always get the latest PIDs to check --- scripts/run_multi_worker_app_paas.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index 26674378f..3b716a505 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -94,6 +94,7 @@ function start_aws_logs_agent { function run { while true; do + get_celery_pids for APP_PID in ${APP_PIDS}; do kill -0 ${APP_PID} 2&>/dev/null || return 1 done From 7d562f9e85d23f84376de2c101e2092bf15acf34 Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 28 Feb 2018 14:36:40 +0000 Subject: [PATCH 07/11] Get empty array when no .pid file exists --- scripts/run_multi_worker_app_paas.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index 3b716a505..0f9ea3459 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -77,7 +77,11 @@ function on_exit { } function get_celery_pids { - APP_PIDS=`cat celery*.pid` + if [[ $(ls /home/vcap/app/celery*.pid) ]]; then + APP_PIDS=`cat /home/vcap/app/celery*.pid` + else + APP_PIDS=() + fi } function start_application { From b158c667050023f2674b210c937f33b243d8153f Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 28 Feb 2018 14:37:02 +0000 Subject: [PATCH 08/11] Set timeout to 9 This will allow us an extra second to `kill -9` any remaining processes. --- scripts/run_multi_worker_app_paas.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index 0f9ea3459..69c504425 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -2,7 +2,7 @@ set -e -o pipefail -TERMINATE_TIMEOUT=10 +TERMINATE_TIMEOUT=9 function check_params { if [ -z "${NOTIFY_APP_NAME}" ]; then From 3045f6233b949cb61bd9f86bd53ed9285c5eb15e Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Wed, 28 Feb 2018 14:50:23 +0000 Subject: [PATCH 09/11] Use `continue` instead of `break` on SIGKILL `break`ing would keep trying to kill the same process and never move to the next --- scripts/run_multi_worker_app_paas.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index 69c504425..3a5c16766 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -61,7 +61,7 @@ function on_exit { if [[ "$n" -ge "$TERMINATE_TIMEOUT" ]]; then echo "Timeout reached, killing process with pid ${APP_PID}" kill -9 ${APP_PID} || true - break + continue else echo "Timeout not reached yet, checking " ${APP_PID} # else, if process is still running send SIGTERM From d365921093703c4e7576bfbd40e2e609654833b3 Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Tue, 6 Mar 2018 15:58:33 +0000 Subject: [PATCH 10/11] Address PR comments Make timeout 9s on both files Use more descriptive variable name Declare the logs dir as a constant --- scripts/run_app_paas.sh | 23 ++++++++++++----------- scripts/run_multi_worker_app_paas.sh | 21 +++++++++++---------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/scripts/run_app_paas.sh b/scripts/run_app_paas.sh index 105fa51f8..a4416e56f 100755 --- a/scripts/run_app_paas.sh +++ b/scripts/run_app_paas.sh @@ -2,7 +2,8 @@ set -e -o pipefail -TERMINATE_TIMEOUT=10 +TERMINATE_TIMEOUT=9 +readonly LOGS_DIR="/home/vcap/logs" function check_params { if [ -z "${NOTIFY_APP_NAME}" ]; then @@ -17,22 +18,22 @@ function check_params { function configure_aws_logs { # create files so that aws logs agent doesn't complain - touch /home/vcap/logs/gunicorn_error.log - touch /home/vcap/logs/app.log.json + touch ${LOGS_DIR}/gunicorn_error.log + touch ${LOGS_DIR}/app.log.json aws configure set plugins.cwlogs cwlogs cat > /home/vcap/app/awslogs.conf << EOF [general] -state_file = /home/vcap/logs/awslogs-state +state_file = ${LOGS_DIR}/awslogs-state -[/home/vcap/logs/app.log] -file = /home/vcap/logs/app.log.json +[${LOGS_DIR}/app.log] +file = ${LOGS_DIR}/app.log.json log_group_name = paas-${CW_APP_NAME}-application log_stream_name = {hostname} -[/home/vcap/logs/gunicorn_error.log] -file = /home/vcap/logs/gunicorn_error.log +[${LOGS_DIR}/gunicorn_error.log] +file = ${LOGS_DIR}/gunicorn_error.log log_group_name = paas-${CW_APP_NAME}-gunicorn log_stream_name = {hostname} EOF @@ -41,12 +42,12 @@ EOF function on_exit { echo "Terminating application process with pid ${APP_PID}" kill ${APP_PID} || true - n=0 + wait_time=0 while (kill -0 ${APP_PID} 2&>/dev/null); do echo "Application is still running.." sleep 1 - let n=n+1 - if [ "$n" -ge "$TERMINATE_TIMEOUT" ]; then + let wait_time=wait_time+1 + if [ "$wait_time" -ge "$TERMINATE_TIMEOUT" ]; then echo "Timeout reached, killing process with pid ${APP_PID}" kill -9 ${APP_PID} || true break diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index 3a5c16766..d50358c6b 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -3,6 +3,7 @@ set -e -o pipefail TERMINATE_TIMEOUT=9 +readonly LOGS_DIR="/home/vcap/logs" function check_params { if [ -z "${NOTIFY_APP_NAME}" ]; then @@ -17,22 +18,22 @@ function check_params { function configure_aws_logs { # create files so that aws logs agent doesn't complain - touch /home/vcap/logs/gunicorn_error.log - touch /home/vcap/logs/app.log.json + touch ${LOGS_DIR}/gunicorn_error.log + touch ${LOGS_DIR}/app.log.json aws configure set plugins.cwlogs cwlogs cat > /home/vcap/app/awslogs.conf << EOF [general] -state_file = /home/vcap/logs/awslogs-state +state_file = ${LOGS_DIR}/awslogs-state -[/home/vcap/logs/app.log] -file = /home/vcap/logs/app.log.json +[${LOGS_DIR}/app.log] +file = ${LOGS_DIR}/app.log.json log_group_name = paas-${CW_APP_NAME}-application log_stream_name = {hostname} -[/home/vcap/logs/gunicorn_error.log] -file = /home/vcap/logs/gunicorn_error.log +[${LOGS_DIR}/gunicorn_error.log] +file = ${LOGS_DIR}/gunicorn_error.log log_group_name = paas-${CW_APP_NAME}-gunicorn log_stream_name = {hostname} EOF @@ -41,7 +42,7 @@ EOF # For every PID, check if it's still running # if it is, send the sigterm function on_exit { - n=0 + wait_time=0 while true; do # refresh pids to account for the case that # some workers may have terminated but others not @@ -58,7 +59,7 @@ function on_exit { echo "Terminating celery processes with pids "${APP_PIDS} for APP_PID in ${APP_PIDS}; do # if TERMINATE_TIMEOUT is reached, send SIGKILL - if [[ "$n" -ge "$TERMINATE_TIMEOUT" ]]; then + if [[ "$wait_time" -ge "$TERMINATE_TIMEOUT" ]]; then echo "Timeout reached, killing process with pid ${APP_PID}" kill -9 ${APP_PID} || true continue @@ -71,7 +72,7 @@ function on_exit { fi fi done - let n=n+1 + let wait_time=wait_time+1 sleep 1 done } From 2b6efd09b48926d68da5a71f548adf7493586a77 Mon Sep 17 00:00:00 2001 From: Athanasios Voutsadakis Date: Fri, 9 Mar 2018 16:57:12 +0000 Subject: [PATCH 11/11] Set pool size to 1 for sender worker --- manifest-delivery-base.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/manifest-delivery-base.yml b/manifest-delivery-base.yml index bd17f9138..bc0bc5b29 100644 --- a/manifest-delivery-base.yml +++ b/manifest-delivery-base.yml @@ -66,6 +66,7 @@ applications: command: scripts/run_multi_worker_app_paas.sh celery multi start 4 -c 10 -A run_celery.notify_celery --loglevel=INFO -Q send-sms-tasks,send-email-tasks memory: 2G env: + SQLALCHEMY_POOL_SIZE: 1 NOTIFY_APP_NAME: delivery-worker-sender - name: notify-delivery-worker-periodic