From 2108498eb163adedded0a0f230b47ff631c484f1 Mon Sep 17 00:00:00 2001 From: sakisv Date: Tue, 22 Dec 2020 17:34:11 +0200 Subject: [PATCH 1/3] Send worker-sender celery logs to /dev/null We are using our custom logger to log to `NOTIFY_LOG_PATH`, so this logging from celery is neither needed nor desired. We also need to define the location of the pidfiles, because of what appears to be a bug in celery where it uses the location of logs to infer the location of the pidfiles if it is not defined, i.e. in this case it was trying to find the pidfiles in `/dev/null/%N.pid`. --- scripts/paas_app_wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/paas_app_wrapper.sh b/scripts/paas_app_wrapper.sh index 47fb14b61..cd072e3c6 100755 --- a/scripts/paas_app_wrapper.sh +++ b/scripts/paas_app_wrapper.sh @@ -22,7 +22,7 @@ case $NOTIFY_APP_NAME in ;; delivery-worker-sender) exec scripts/run_multi_worker_app_paas.sh celery multi start 3 -c 10 -A run_celery.notify_celery --loglevel=INFO \ - -Q send-sms-tasks,send-email-tasks + --logfile=/dev/null --pidfile=/tmp/celery%N.pid -Q send-sms-tasks,send-email-tasks ;; delivery-worker-periodic) exec scripts/run_app_paas.sh celery -A run_celery.notify_celery worker --loglevel=INFO --concurrency=2 \ From a6ecfd66b6dd623eb8feabc12ed04faa2249fdd6 Mon Sep 17 00:00:00 2001 From: sakisv Date: Wed, 23 Dec 2020 18:48:57 +0200 Subject: [PATCH 2/3] Terminate instance if it's running out of disk space --- scripts/run_app_paas.sh | 19 +++++++++++++++++++ scripts/run_multi_worker_app_paas.sh | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/scripts/run_app_paas.sh b/scripts/run_app_paas.sh index a4416e56f..6abdb9e13 100755 --- a/scripts/run_app_paas.sh +++ b/scripts/run_app_paas.sh @@ -3,6 +3,7 @@ set -e -o pipefail TERMINATE_TIMEOUT=9 +MAX_DISK_SPACE_USAGE=75 readonly LOGS_DIR="/home/vcap/logs" function check_params { @@ -69,10 +70,28 @@ function start_aws_logs_agent { echo "AWS logs agent pid: ${AWSLOGS_AGENT_PID}" } +function check_disk_space { + # get something like: + # + # Filesystem Use% + # overlay 56% + # tmpfs 0% + # + # and only keep '56' + SPACE_USAGE=$(df --output="source,pcent" | grep overlay | tr --squeeze-repeats " " | cut -f2 -d" "| cut -f1 -d"%") + + if [[ "${SPACE_USAGE}" -ge "${MAX_DISK_SPACE_USAGE}" ]]; then + echo "Terminating ${NOTIFY_APP_NAME}, instance ${INSTANCE_INDEX} because we're running out of disk space" + echo "Usage: ${SPACE_USAGE}% - limit ${MAX_DISK_SPACE_USAGE}%" + exit + fi +} + function run { while true; do kill -0 ${APP_PID} 2&>/dev/null || break kill -0 ${AWSLOGS_AGENT_PID} 2&>/dev/null || start_aws_logs_agent + check_disk_space sleep 1 done } diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index af02058f6..f170e8f02 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -3,6 +3,7 @@ set -e -o pipefail TERMINATE_TIMEOUT=9 +MAX_DISK_SPACE_USAGE=75 readonly LOGS_DIR="/home/vcap/logs" function check_params { @@ -124,6 +125,23 @@ function ensure_celery_is_running { fi } +function check_disk_space { + # get something like: + # + # Filesystem Use% + # overlay 56% + # tmpfs 0% + # + # and only keep '56' + SPACE_USAGE=$(df --output="source,pcent" | grep overlay | tr --squeeze-repeats " " | cut -f2 -d" "| cut -f1 -d"%") + + if [[ "${SPACE_USAGE}" -ge "${MAX_DISK_SPACE_USAGE}" ]]; then + echo "Terminating ${NOTIFY_APP_NAME}, instance ${INSTANCE_INDEX} because we're running out of disk space" + echo "Usage: ${SPACE_USAGE}% - limit ${MAX_DISK_SPACE_USAGE}%" + exit + fi +} + function run { while true; do get_celery_pids @@ -135,6 +153,7 @@ function run { done kill -0 ${AWSLOGS_AGENT_PID} 2&>/dev/null || start_aws_logs_agent kill -0 ${LOGS_TAIL_PID} 2&>/dev/null || start_logs_tail + check_disk_space sleep 1 done } From 1bfdac84170d18b731896d6df1c43f2c87554ab3 Mon Sep 17 00:00:00 2001 From: sakisv Date: Thu, 24 Dec 2020 18:44:26 +0200 Subject: [PATCH 3/3] Temporarily remove disk space check from multi_worker script There seems to be some kind of complication in this script that doesn't allow it to terminate properly. This is being removed for now to allow deploying the rest of the fixes in time for the holiday period. --- scripts/run_multi_worker_app_paas.sh | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/scripts/run_multi_worker_app_paas.sh b/scripts/run_multi_worker_app_paas.sh index f170e8f02..af02058f6 100755 --- a/scripts/run_multi_worker_app_paas.sh +++ b/scripts/run_multi_worker_app_paas.sh @@ -3,7 +3,6 @@ set -e -o pipefail TERMINATE_TIMEOUT=9 -MAX_DISK_SPACE_USAGE=75 readonly LOGS_DIR="/home/vcap/logs" function check_params { @@ -125,23 +124,6 @@ function ensure_celery_is_running { fi } -function check_disk_space { - # get something like: - # - # Filesystem Use% - # overlay 56% - # tmpfs 0% - # - # and only keep '56' - SPACE_USAGE=$(df --output="source,pcent" | grep overlay | tr --squeeze-repeats " " | cut -f2 -d" "| cut -f1 -d"%") - - if [[ "${SPACE_USAGE}" -ge "${MAX_DISK_SPACE_USAGE}" ]]; then - echo "Terminating ${NOTIFY_APP_NAME}, instance ${INSTANCE_INDEX} because we're running out of disk space" - echo "Usage: ${SPACE_USAGE}% - limit ${MAX_DISK_SPACE_USAGE}%" - exit - fi -} - function run { while true; do get_celery_pids @@ -153,7 +135,6 @@ function run { done kill -0 ${AWSLOGS_AGENT_PID} 2&>/dev/null || start_aws_logs_agent kill -0 ${LOGS_TAIL_PID} 2&>/dev/null || start_logs_tail - check_disk_space sleep 1 done }