Skip to content

Track used database conns with an histogram #3829

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ impl App {
instance_metrics
.database_time_to_obtain_connection
.with_label_values(&["primary"]),
instance_metrics
.database_used_conns_histogram
.with_label_values(&["primary"]),
)
.unwrap()
};
Expand Down Expand Up @@ -155,6 +158,9 @@ impl App {
instance_metrics
.database_time_to_obtain_connection
.with_label_values(&["follower"]),
instance_metrics
.database_used_conns_histogram
.with_label_values(&["follower"]),
)
.unwrap(),
)
Expand Down
8 changes: 8 additions & 0 deletions src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::middleware::app::RequestApp;
pub enum DieselPool {
Pool {
pool: r2d2::Pool<ConnectionManager<PgConnection>>,
used_conns_metric: Histogram,
time_to_obtain_connection_metric: Histogram,
},
Test(Arc<ReentrantMutex<PgConnection>>),
Expand All @@ -22,6 +23,7 @@ impl DieselPool {
pub(crate) fn new(
url: &str,
config: r2d2::Builder<ConnectionManager<PgConnection>>,
used_conns_metric: Histogram,
time_to_obtain_connection_metric: Histogram,
) -> Result<DieselPool, PoolError> {
let manager = ConnectionManager::new(connection_url(url));
Expand All @@ -39,6 +41,7 @@ impl DieselPool {
// automatically be marked as unhealthy and the rest of the application will adapt.
let pool = DieselPool::Pool {
pool: config.build_unchecked(manager),
used_conns_metric,
time_to_obtain_connection_metric,
};
match pool.wait_until_healthy(Duration::from_secs(5)) {
Expand All @@ -62,8 +65,13 @@ impl DieselPool {
match self {
DieselPool::Pool {
pool,
used_conns_metric,
time_to_obtain_connection_metric,
} => time_to_obtain_connection_metric.observe_closure_duration(|| {
// Record the number of used connections before obtaining the current one.
let state = pool.state();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume accessing pool.state() is an inexpensive operation?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This just requires obtaining a mutex lock, so shouldn't be expensive.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

requires obtaining a mutex lock

sounds pretty expensive to me 😅

used_conns_metric.observe((state.connections - state.idle_connections) as f64);

if let Some(conn) = pool.try_get() {
Ok(DieselPooledConn::Pool(conn))
} else if !self.is_healthy() {
Expand Down
103 changes: 103 additions & 0 deletions src/metrics/histogram.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
use crate::metrics::macros::MetricFromOpts;
use std::marker::PhantomData;
use std::ops::{Deref, DerefMut};

/// Prometheus's histograms work by dividing datapoints in buckets, with each bucket containing the
/// count of datapoints equal or greater to the bucket value.
///
/// Histogram buckets are not an exact science, so feel free to tweak the buckets or create new
/// ones if you see that the histograms are not really accurate. Just avoid adding too many buckets
/// for a single type as that increases the number of exported metric series.
pub trait HistogramBuckets {
const BUCKETS: &'static [f64];
}

/// Buckets geared towards measuring timings, such as the response time of our requests, going from
/// 0.5ms to 100ms with a higher resolution and from 100ms to 5 seconds with a slightly lower
/// resolution. This allows us to properly measure download requests (which take around 1ms) and
/// other requests (our 95h is around 10-20ms).
pub struct TimingBuckets;

impl HistogramBuckets for TimingBuckets {
const BUCKETS: &'static [f64] = &[
0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1.0, 5.0,
];
}

/// Buckets geared towards measuring how many database connections are used in the database pool.
pub struct DatabasePoolBuckets;

impl HistogramBuckets for DatabasePoolBuckets {
const BUCKETS: &'static [f64] = &[0.0, 1.0, 2.0, 5.0, 10.0, 15.0, 20.0, 30.0, 50.0, 100.0];
}

/// Wrapper type over [`prometheus::Histogram`] to support defining buckets.
pub struct Histogram<B: HistogramBuckets> {
inner: prometheus::Histogram,
_phantom: PhantomData<B>,
}

impl<B: HistogramBuckets> MetricFromOpts for Histogram<B> {
fn from_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
Ok(Histogram {
inner: prometheus::Histogram::with_opts(prometheus::HistogramOpts {
common_opts: opts,
buckets: B::BUCKETS.to_vec(),
})?,
_phantom: PhantomData,
})
}
}

impl<B: HistogramBuckets> Deref for Histogram<B> {
type Target = prometheus::Histogram;

fn deref(&self) -> &Self::Target {
&self.inner
}
}

impl<B: HistogramBuckets> DerefMut for Histogram<B> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}

/// Wrapper type over [`prometheus::HistogramVec`] to support defining buckets.
pub struct HistogramVec<B: HistogramBuckets> {
inner: prometheus::HistogramVec,
_phantom: PhantomData<B>,
}

impl<B: HistogramBuckets> MetricFromOpts for HistogramVec<B> {
fn from_opts(opts: prometheus::Opts) -> Result<Self, prometheus::Error> {
Ok(HistogramVec {
inner: prometheus::HistogramVec::new(
prometheus::HistogramOpts {
common_opts: opts.clone(),
buckets: B::BUCKETS.to_vec(),
},
opts.variable_labels
.iter()
.map(|s| s.as_str())
.collect::<Vec<_>>()
.as_slice(),
)?,
_phantom: PhantomData,
})
}
}

impl<B: HistogramBuckets> Deref for HistogramVec<B> {
type Target = prometheus::HistogramVec;

fn deref(&self) -> &Self::Target {
&self.inner
}
}

impl<B: HistogramBuckets> DerefMut for HistogramVec<B> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
}
13 changes: 7 additions & 6 deletions src/metrics/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@
//! As a rule of thumb, if the metric requires a database query to be updated it's probably a
//! service-level metric, and you should add it to `src/metrics/service.rs` instead.

use crate::metrics::histogram::{DatabasePoolBuckets, Histogram, HistogramVec, TimingBuckets};
use crate::util::errors::AppResult;
use crate::{app::App, db::DieselPool};
use prometheus::{
proto::MetricFamily, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
};
use prometheus::{proto::MetricFamily, IntCounter, IntCounterVec, IntGauge, IntGaugeVec};

metrics! {
pub struct InstanceMetrics {
Expand All @@ -30,15 +29,17 @@ metrics! {
/// Number of used database connections in the pool
database_used_conns: IntGaugeVec["pool"],
/// Amount of time required to obtain a database connection
pub database_time_to_obtain_connection: HistogramVec["pool"],
pub database_time_to_obtain_connection: HistogramVec<TimingBuckets>["pool"],
/// Number of used database connections in the pool, as histogram
pub database_used_conns_histogram: HistogramVec<DatabasePoolBuckets>["pool"],

/// Number of requests processed by this instance
pub requests_total: IntCounter,
/// Number of requests currently being processed
pub requests_in_flight: IntGauge,

/// Response times of our endpoints
pub response_times: HistogramVec["endpoint"],
pub response_times: HistogramVec<TimingBuckets>["endpoint"],
/// Nmber of responses per status code
pub responses_by_status_code_total: IntCounterVec["status"],

Expand All @@ -47,7 +48,7 @@ metrics! {
/// Number of download requests with a non-canonical crate name.
pub downloads_non_canonical_crate_name_total: IntCounter,
/// How long it takes to execute the SELECT query in the download endpoint.
pub downloads_select_query_execution_time: Histogram,
pub downloads_select_query_execution_time: Histogram<TimingBuckets>,
/// Number of download requests that are not counted yet.
downloads_not_counted_total: IntGauge,
}
Expand Down
44 changes: 2 additions & 42 deletions src/metrics/macros.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,4 @@
use prometheus::{Histogram, HistogramOpts, HistogramVec, Opts};

/// Prometheus's histograms work by dividing datapoints in buckets, with each bucket containing
/// the count of datapoints equal or greater to the bucket value.
///
/// The buckets used by crates.io are geared towards measuring the response time of our requests,
/// going from 0.5ms to 100ms with a higher resolution and from 100ms to 5 seconds with a slightly
/// lower resolution. This allows us to properly measure download requests (which take around 1ms)
/// and other requests (our 95h is around 10-20ms).
///
/// Histogram buckets are not an exact science, so feel free to tweak the buckets if you see that
/// the histograms are not really accurate. Just avoid adding too many buckets as that increases
/// the number of exported metric series.
const HISTOGRAM_BUCKETS: &[f64] = &[
0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1.0, 5.0,
];
use prometheus::Opts;

pub(super) trait MetricFromOpts: Sized {
fn from_opts(opts: Opts) -> Result<Self, prometheus::Error>;
Expand Down Expand Up @@ -105,29 +90,4 @@ load_metric_type!(GaugeVec as vec);
load_metric_type!(IntGauge as single);
load_metric_type!(IntGaugeVec as vec);

// Use a custom implementation for histograms to customize the buckets.

impl MetricFromOpts for Histogram {
fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
Histogram::with_opts(HistogramOpts {
common_opts: opts,
buckets: HISTOGRAM_BUCKETS.to_vec(),
})
}
}

impl MetricFromOpts for HistogramVec {
fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
HistogramVec::new(
HistogramOpts {
common_opts: opts.clone(),
buckets: HISTOGRAM_BUCKETS.to_vec(),
},
opts.variable_labels
.iter()
.map(|s| s.as_str())
.collect::<Vec<_>>()
.as_slice(),
)
}
}
// Histograms are defined in histogram.rs
1 change: 1 addition & 0 deletions src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ pub use self::service::ServiceMetrics;
#[macro_use]
mod macros;

mod histogram;
mod instance;
mod log_encoder;
mod service;