Files
notifications-api/manifest.yml.j2
David McDonald 1ac3ca250c Add more memory for the sender and letter workers
On monday, we had a build of emails in the email queue that weren't
getting picked up by the sender worker and causing delays.

After further investigation with Andy from the PaaS, we believe the
following happened.

We received a bunch of traffic at 8:30ish which consisted of some
very large emails in terms of their length and complexity. The amount
of memory used by the app instances got very high and a few apps
crashed due to OOM (recorded by 5 cf app event crashes). When new
app instances tried to spin up, they weren't able to as they
potentially also ran out of memory immediately.

This left us in the position of having fewer app instances than we
needed, on top of which they were all using a very large amount of
CPU and may have been limited how quickly an individual app
instance would process tasks. This meant that we were overall
processing fewer tasks then we needed to and our queue of emails
started to build up.

So it appears our sender workers did not have the memory available that
they needed. By looking at a graph for the past 30 days of memory usage
on the sender workers, we see that it on several days breached 90%
memory usage for long periods of time. This in combination of the
hypothesis above of what happened leads us to decide that we want to
give the app instances a bigger memory quota so it has been upped from
3GB to 4GB.

Whilst doing, I also looked at long term memory usage graphs for our
other workers and saw that the letters worker was similarly close to
around 90% of memory used so have taken the opportunity to bump that
too.
2020-12-24 15:03:39 +00:00

153 lines
5.0 KiB
Django/Jinja

{%- set app_vars = {
'notify-api': {
'NOTIFY_APP_NAME': 'api',
'disk_quota': '2G',
'sqlalchemy_pool_size': 30,
'additional_env_vars': {
'STATSD_HOST': None
},
'routes': {
'preview': ['api.notify.works'],
'staging': ['api.staging-notify.works'],
'production': ['api.notifications.service.gov.uk'],
},
'health-check-type': 'port',
'health-check-invocation-timeout': 3,
'instances': {
'preview': None,
'staging': None,
'production': 25
},
},
'notify-api-sms-callbacks': {
'NOTIFY_APP_NAME': 'api',
'disk_quota': '2G',
'additional_env_vars': {
'STATSD_HOST': None
},
'routes': {
'preview': ['api.notify.works/notifications/sms/mmg', 'api.notify.works/notifications/sms/firetext'],
'staging': ['api.staging-notify.works/notifications/sms/mmg', 'api.staging-notify.works/notifications/sms/firetext'],
'production': ['api.notifications.service.gov.uk/notifications/sms/mmg', 'api.notifications.service.gov.uk/notifications/sms/firetext'],
},
'health-check-type': 'port',
'health-check-invocation-timeout': 3,
'instances': {
'preview': 1,
'staging': 2,
'production': 10
},
},
'notify-api-db-migration': {
'NOTIFY_APP_NAME': 'api',
'instances': {
'preview': 0,
'staging': 0,
'production': 0
},
},
'notify-delivery-celery-beat': {'memory': '128M'},
'notify-delivery-worker-jobs': {},
'notify-delivery-worker-research': {},
'notify-delivery-worker-sender': {'disk_quota': '2G', 'memory': '4G'},
'notify-delivery-worker-periodic': {},
'notify-delivery-worker-reporting': {
'additional_env_vars': {
'CELERYD_MAX_TASKS_PER_CHILD': 1,
'CELERYD_PREFETCH_MULTIPLIER': 1,
}
},
'notify-delivery-worker-priority': {},
'notify-delivery-worker-letters': {'memory': '2G'},
'notify-delivery-worker-retry-tasks': {},
'notify-delivery-worker-internal': {},
'notify-delivery-worker-receipts': {},
'notify-delivery-worker-service-callbacks': {'disk_quota': '2G'},
'notify-delivery-worker-save-api-notifications': {'disk_quota': '2G'},
} -%}
{%- set app = app_vars[CF_APP] -%}
{%- set instance_count = app.get('instances', {}).get(environment) -%}
---
applications:
- name: {{ CF_APP }}
buildpack: python_buildpack
{% if instance_count is not none %}
instances: {{ instance_count }}
{%- endif %}
memory: {{ app.get('memory', '1G') }}
disk_quota: {{ app.get('disk_quota', '1G')}}
routes:
{%- for route in app.get('routes', {}).get(environment, []) %}
- route: {{ route }}
{%- endfor%}
- route: {{ CF_APP }}-{{ environment }}.cloudapps.digital
health-check-type: {{ app.get('health-check-type', 'process') }}
health-check-invocation-timeout: {{ app.get('health-check-invocation-timeout', 1) }}
services:
- notify-db
- logit-ssl-syslog-drain
{% if CF_APP == 'notify-api' %}
- notify-prometheus
{% endif %}
env:
NOTIFY_APP_NAME: {{ app.get('NOTIFY_APP_NAME', CF_APP.replace('notify-', '')) }}
SQLALCHEMY_POOL_SIZE: {{ app.get('sqlalchemy_pool_size', 1) }}
FLASK_APP: application.py
# Credentials variables
ADMIN_BASE_URL: '{{ ADMIN_BASE_URL }}'
API_INTERNAL_SECRETS: '{{ API_INTERNAL_SECRETS | tojson }}'
API_HOST_NAME: '{{ API_HOST_NAME }}'
DANGEROUS_SALT: '{{ DANGEROUS_SALT }}'
SECRET_KEY: '{{ SECRET_KEY }}'
ROUTE_SECRET_KEY_1: '{{ ROUTE_SECRET_KEY_1 }}'
ROUTE_SECRET_KEY_2: '{{ ROUTE_SECRET_KEY_2 }}'
CRONITOR_KEYS: '{{ CRONITOR_KEYS | tojson }}'
HIGH_VOLUME_SERVICE: '{{ HIGH_VOLUME_SERVICE | tojson }}'
PERFORMANCE_PLATFORM_ENDPOINTS: '{{ PERFORMANCE_PLATFORM_ENDPOINTS | tojson }}'
DOCUMENT_DOWNLOAD_API_HOST: '{{ DOCUMENT_DOWNLOAD_API_HOST }}'
DOCUMENT_DOWNLOAD_API_KEY: '{{ DOCUMENT_DOWNLOAD_API_KEY }}'
NOTIFICATION_QUEUE_PREFIX: '{{ NOTIFICATION_QUEUE_PREFIX }}'
AWS_ACCESS_KEY_ID: '{{ AWS_ACCESS_KEY_ID }}'
AWS_SECRET_ACCESS_KEY: '{{ AWS_SECRET_ACCESS_KEY }}'
{% if CBC_PROXY_AWS_ACCESS_KEY_ID is defined %}
CBC_PROXY_AWS_ACCESS_KEY_ID: '{{ CBC_PROXY_AWS_ACCESS_KEY_ID }}'
CBC_PROXY_AWS_SECRET_ACCESS_KEY: '{{ CBC_PROXY_AWS_SECRET_ACCESS_KEY }}'
{% endif %}
STATSD_HOST: "notify-statsd-exporter-{{ environment }}.apps.internal"
ZENDESK_API_KEY: '{{ ZENDESK_API_KEY }}'
MMG_API_KEY: '{{ MMG_API_KEY }}'
MMG_INBOUND_SMS_AUTH: '{{ MMG_INBOUND_SMS_AUTH | tojson }}'
MMG_INBOUND_SMS_USERNAME: '{{ MMG_INBOUND_SMS_USERNAME | tojson }}'
FIRETEXT_API_KEY: '{{ FIRETEXT_API_KEY }}'
FIRETEXT_INBOUND_SMS_AUTH: '{{ FIRETEXT_INBOUND_SMS_AUTH | tojson }}'
REDIS_ENABLED: '{{ REDIS_ENABLED }}'
REDIS_URL: '{{ REDIS_URL }}'
TEMPLATE_PREVIEW_API_HOST: '{{ TEMPLATE_PREVIEW_API_HOST }}'
TEMPLATE_PREVIEW_API_KEY: '{{ TEMPLATE_PREVIEW_API_KEY }}'
{% for key, value in app.get('additional_env_vars', {}).items() %}
{{key}}: '{{value}}'
{% endfor %}