mirror of
https://github.com/GSA/notifications-api.git
synced 2026-02-05 02:41:14 -05:00
Exit if celery processes are not running
In 4427827b2f and a related commit, celery monitoring was
changed from using PID files to actually looking at processes.
If celery workers get OOM killed (for instance), the container init
script would not restart them. This is because `get_celery_pids` would
not find any processes whose command contained the string "celery". This would
cause the pipe to fail (-o pipefail), so APP_PIDS would not get updated but
the script would continue to run. As a result, the script did not restart
the celery processes.
We think the correct behaviour when celery processes are killed (i.e.
there are no more celery processes running in a container) is to kill
the container. The PaaS should then schedule new ones which may
remediate the cause of the celery processes being killed.
Upon detection of no celery processes running, some diagnostic
information from the environment is sent to the logs, e.g.:
```
CF_INSTANCE_ADDR=10.0.32.4:61012
CF_INSTANCE_INTERNAL_IP=10.255.184.9
CF_INSTANCE_GUID=81c57dbc-e706-411e-6a5f-2013
CF_INSTANCE_PORT=61012
CF_INSTANCE_IP=10.0.32.4
```
Then the script (which is the container entrypoint) exits 1.
Co-author: @servingupaces @tlwr
This commit is contained in:
@@ -69,7 +69,10 @@ function get_celery_pids {
|
||||
# get the PIDs of the process whose parent is the root process
|
||||
# print only pid and their command, get the ones with "celery" in their name
|
||||
# and keep only these PIDs
|
||||
|
||||
set +o pipefail # so grep returning no matches does not premature fail pipe
|
||||
APP_PIDS=$(pgrep -P 1 | xargs ps -o pid=,command= -p | grep celery | cut -f1 -d/)
|
||||
set -o pipefail # pipefail should be set everywhere else
|
||||
}
|
||||
|
||||
function send_signal_to_celery_processes {
|
||||
@@ -98,9 +101,28 @@ function start_logs_tail {
|
||||
echo "tail pid: ${LOGS_TAIL_PID}"
|
||||
}
|
||||
|
||||
# Terminate the script when no celery processes are left.
#
# Globals:
#   APP_PIDS (read) - list of celery PIDs; an empty or unset value is treated
#                     as "no celery processes running".
# Outputs:
#   When APP_PIDS is empty: a notice plus every environment variable whose
#   name starts with CF_, written to stdout.
# Returns:
#   0 when APP_PIDS is non-empty. Otherwise does not return: the shell exits
#   with status 1 after a 15 second pause.
function ensure_celery_is_running {
  # ${APP_PIDS:-} keeps the check safe if the variable is unset under `set -u`.
  if [ -z "${APP_PIDS:-}" ]; then
    echo "There are no celery processes running, this container is bad"

    echo "Exporting CF information for diagnosis"

    # Anchor on the CF_ name prefix: a bare "CF" pattern also matches any
    # variable whose *value* happens to contain "CF", which could write
    # unrelated settings (including credentials) to the logs.
    # grep exits non-zero when nothing matches; swallow that so a shell running
    # with errexit still reaches the sleep below.
    env | grep '^CF_' || true

    echo "Sleeping 15 seconds for logs to get shipped"

    # Give the log shipper time to flush the diagnostics before exiting.
    sleep 15

    exit 1
  fi
}
|
||||
|
||||
function run {
|
||||
while true; do
|
||||
get_celery_pids
|
||||
|
||||
ensure_celery_is_running
|
||||
|
||||
for APP_PID in ${APP_PIDS}; do
|
||||
kill -0 ${APP_PID} 2&>/dev/null || return 1
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user