add prometheus metrics for connection pools (both web and sql)

add the following prometheus metrics to keep track of general app
instance health.

 # sqs_apply_async_duration

how long does the actual SQS call (a standard web request to AWS) take.
a histogram with default bucket sizes, split up per task that was
created.

 # concurrent_web_request_count

how many web requests is this app currently serving. this is split up
per process, so we'd expect multiple responses per app instance

 # db_connection_total_connected

how many connections does this app (process) have open to the database.
They might be idle.

 # db_connection_total_checked_out

how many connections does this app (process) have open that are
currently in use by a web worker

 # db_connection_open_duration_seconds

a histogram per endpoint of how long the db connection was taken from
the pool for. won't have any data if a connection was never opened.
This commit is contained in:
Leo Hemsted
2020-04-24 14:36:21 +01:00
committed by David McDonald
parent 4bb37a05ec
commit 6e32ca5996
2 changed files with 85 additions and 1 deletions

View File

@@ -1,3 +1,4 @@
import time
import os
import random
import string
@@ -8,12 +9,14 @@ from flask_sqlalchemy import SQLAlchemy as _SQLAlchemy
from flask_marshmallow import Marshmallow
from flask_migrate import Migrate
from gds_metrics import GDSMetrics
from gds_metrics.metrics import Gauge, Histogram
from time import monotonic
from notifications_utils.clients.zendesk.zendesk_client import ZendeskClient
from notifications_utils.clients.statsd.statsd_client import StatsdClient
from notifications_utils.clients.redis.redis_client import RedisClient
from notifications_utils.clients.encryption.encryption_client import Encryption
from notifications_utils import logging, request_helper
from sqlalchemy import event
from werkzeug.exceptions import HTTPException as WerkzeugHTTPException
from werkzeug.local import LocalProxy
@@ -108,6 +111,9 @@ def create_app(application):
from app.commands import setup_commands
setup_commands(application)
# set up sqlalchemy events
setup_sqlalchemy_events(application)
return application
@@ -255,17 +261,27 @@ def register_v2_blueprints(application):
def init_app(app):
CONCURRENT_REQUESTS = Gauge(
'concurrent_web_request_count',
'How many concurrent requests are currently being served',
)
@app.before_request
def record_user_agent():
statsd_client.incr("user-agent.{}".format(process_user_agent(request.headers.get('User-Agent', None))))
@app.before_request
def record_request_details():
CONCURRENT_REQUESTS.inc()
g.start = monotonic()
g.endpoint = request.endpoint
@app.after_request
def after_request(response):
CONCURRENT_REQUESTS.dec()
response.headers.add('Access-Control-Allow-Origin', '*')
response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
response.headers.add('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE')
@@ -311,3 +327,63 @@ def process_user_agent(user_agent_string):
return "non-notify-user-agent"
else:
return "unknown"
def setup_sqlalchemy_events(app):
TOTAL_DB_CONNECTIONS = Gauge(
'db_connection_total_connected',
'How many db connections are currently held (potentially idel) by the server',
)
TOTAL_CHECKED_OUT_DB_CONNECTIONS = Gauge(
'db_connection_total_checked_out',
'How many db connections are currently checked out by web requests',
)
DB_CONNECTION_OPEN_DURATION_SECONDS = Histogram(
'db_connection_open_duration_seconds',
'How long db connections are held open for in seconds',
['method', 'host', 'path']
)
# need this or db.engine isn't accessible
with app.app_context():
@event.listens_for(db.engine, 'connect')
def connect(dbapi_connection, connection_record):
# connection first opened with db
TOTAL_DB_CONNECTIONS.inc()
@event.listens_for(db.engine, 'close')
def close(dbapi_connection, connection_record):
# connection closed (probably only happens with overflow connections)
TOTAL_DB_CONNECTIONS.dec()
@event.listens_for(db.engine, 'checkout')
def checkout(dbapi_connection, connection_record, connection_proxy):
# connection given to a web worker
TOTAL_CHECKED_OUT_DB_CONNECTIONS.inc()
# this will overwrite any previous checkout_at timestamp
connection_record.info['checkout_at'] = time.monotonic()
# checkin runs after the request is already torn down so we'll need to capture request info earlier
connection_record.info['request_data'] = {
'method': request.method,
'host': request.host,
'url_rule': request.url_rule.rule if request.url_rule else 'No endpoint'
}
@event.listens_for(db.engine, 'checkin')
def checkin(dbapi_connection, connection_record):
# connection returned by a web worker
TOTAL_CHECKED_OUT_DB_CONNECTIONS.dec()
# duration that connection was held by a single web request
duration = time.monotonic() - connection_record.info['checkout_at']
DB_CONNECTION_OPEN_DURATION_SECONDS.labels(
connection_record.info['request_data']['method'],
connection_record.info['request_data']['host'],
connection_record.info['request_data']['url_rule']
).observe(duration)

View File

@@ -1,5 +1,6 @@
import time
from gds_metrics.metrics import Histogram
from celery import Celery, Task
from celery.signals import worker_process_shutdown
from flask import g, request
@@ -19,6 +20,12 @@ def log_on_worker_shutdown(sender, signal, pid, exitcode, **kwargs):
def make_task(app):
SQS_APPLY_ASYNC_DURATION = Histogram(
'sqs_apply_async_duration',
'Time taken to put task on queue',
['task_name']
)
class NotifyTask(Task):
abstract = True
start = None
@@ -52,7 +59,8 @@ def make_task(app):
if has_request_context() and hasattr(request, 'request_id'):
kwargs['request_id'] = request.request_id
return super().apply_async(args, kwargs, task_id, producer, link, link_error, **options)
with SQS_APPLY_ASYNC_DURATION.labels(self.name).time():
return super().apply_async(args, kwargs, task_id, producer, link, link_error, **options)
return NotifyTask