Auto merge of #3857 - rust-lang:revert-3829-pa-db-pool-histogram, r=pietroalbini

bors · bors · commit 1d3266a5baac · 2021-08-22T14:12:23.000Z
Revert "Track used database conns with an histogram". Reverts #3829. This has been causing issues in production: ![image](https://user-images.githubusercontent.com/141300/130358224-673c5126-622d-4a92-bbcf-5fa95404533a.png)
diff --git a/src/app.rs b/src/app.rs
@@ -128,9 +128,6 @@ impl App {
                 instance_metrics
                     .database_time_to_obtain_connection
                     .with_label_values(&["primary"]),
-                instance_metrics
-                    .database_used_conns_histogram
-                    .with_label_values(&["primary"]),
             )
             .unwrap()
         };
@@ -158,9 +155,6 @@ impl App {
                         instance_metrics
                             .database_time_to_obtain_connection
                             .with_label_values(&["follower"]),
-                        instance_metrics
-                            .database_used_conns_histogram
-                            .with_label_values(&["follower"]),
                     )
                     .unwrap(),
                 )
diff --git a/src/db.rs b/src/db.rs
@@ -13,7 +13,6 @@ use crate::middleware::app::RequestApp;
 pub enum DieselPool {
     Pool {
         pool: r2d2::Pool<ConnectionManager<PgConnection>>,
-        used_conns_metric: Histogram,
         time_to_obtain_connection_metric: Histogram,
     },
     Test(Arc<ReentrantMutex<PgConnection>>),
@@ -23,7 +22,6 @@ impl DieselPool {
     pub(crate) fn new(
         url: &str,
         config: r2d2::Builder<ConnectionManager<PgConnection>>,
-        used_conns_metric: Histogram,
         time_to_obtain_connection_metric: Histogram,
     ) -> Result<DieselPool, PoolError> {
         let manager = ConnectionManager::new(connection_url(url));
@@ -41,7 +39,6 @@ impl DieselPool {
         // automatically be marked as unhealthy and the rest of the application will adapt.
         let pool = DieselPool::Pool {
             pool: config.build_unchecked(manager),
-            used_conns_metric,
             time_to_obtain_connection_metric,
         };
         match pool.wait_until_healthy(Duration::from_secs(5)) {
@@ -65,13 +62,8 @@ impl DieselPool {
         match self {
             DieselPool::Pool {
                 pool,
-                used_conns_metric,
                 time_to_obtain_connection_metric,
             } => time_to_obtain_connection_metric.observe_closure_duration(|| {
-                // Record the number of used connections before obtaining the current one.
-                let state = pool.state();
-                used_conns_metric.observe((state.connections - state.idle_connections) as f64);
-
                 if let Some(conn) = pool.try_get() {
                     Ok(DieselPooledConn::Pool(conn))
                 } else if !self.is_healthy() {
diff --git a/src/metrics/histogram.rs b/src/metrics/histogram.rs
diff --git a/src/metrics/instance.rs b/src/metrics/instance.rs
@@ -17,10 +17,11 @@
 //! As a rule of thumb, if the metric requires a database query to be updated it's probably a
 //! service-level metric, and you should add it to `src/metrics/service.rs` instead.
 
-use crate::metrics::histogram::{DatabasePoolBuckets, Histogram, HistogramVec, TimingBuckets};
 use crate::util::errors::AppResult;
 use crate::{app::App, db::DieselPool};
-use prometheus::{proto::MetricFamily, IntCounter, IntCounterVec, IntGauge, IntGaugeVec};
+use prometheus::{
+    proto::MetricFamily, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
+};
 
 metrics! {
     pub struct InstanceMetrics {
@@ -29,17 +30,15 @@ metrics! {
         /// Number of used database connections in the pool
         database_used_conns: IntGaugeVec["pool"],
         /// Amount of time required to obtain a database connection
-        pub database_time_to_obtain_connection: HistogramVec<TimingBuckets>["pool"],
-        /// Number of used database connections in the pool, as histogram
-        pub database_used_conns_histogram: HistogramVec<DatabasePoolBuckets>["pool"],
+        pub database_time_to_obtain_connection: HistogramVec["pool"],
 
         /// Number of requests processed by this instance
         pub requests_total: IntCounter,
         /// Number of requests currently being processed
         pub requests_in_flight: IntGauge,
 
         /// Response times of our endpoints
-        pub response_times: HistogramVec<TimingBuckets>["endpoint"],
+        pub response_times: HistogramVec["endpoint"],
         /// Number of responses per status code
         pub responses_by_status_code_total: IntCounterVec["status"],
 
@@ -48,7 +47,7 @@ metrics! {
         /// Number of download requests with a non-canonical crate name.
         pub downloads_non_canonical_crate_name_total: IntCounter,
         /// How long it takes to execute the SELECT query in the download endpoint.
-        pub downloads_select_query_execution_time: Histogram<TimingBuckets>,
+        pub downloads_select_query_execution_time: Histogram,
         /// Number of download requests that are not counted yet.
         downloads_not_counted_total: IntGauge,
     }
diff --git a/src/metrics/macros.rs b/src/metrics/macros.rs
@@ -1,4 +1,19 @@
-use prometheus::Opts;
+use prometheus::{Histogram, HistogramOpts, HistogramVec, Opts};
+
+/// Prometheus's histograms work by dividing datapoints into buckets, with each bucket containing
+/// the count of datapoints less than or equal to the bucket value.
+///
+/// The buckets used by crates.io are geared towards measuring the response time of our requests,
+/// going from 0.5ms to 100ms with a higher resolution and from 100ms to 5 seconds with a slightly
+/// lower resolution. This allows us to properly measure download requests (which take around 1ms)
+/// and other requests (our 95th percentile is around 10-20ms).
+///
+/// Histogram buckets are not an exact science, so feel free to tweak the buckets if you see that
+/// the histograms are not really accurate. Just avoid adding too many buckets as that increases
+/// the number of exported metric series.
+const HISTOGRAM_BUCKETS: &[f64] = &[
+    0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5, 1.0, 5.0,
+];
 
 pub(super) trait MetricFromOpts: Sized {
     fn from_opts(opts: Opts) -> Result<Self, prometheus::Error>;
@@ -90,4 +105,29 @@ load_metric_type!(GaugeVec as vec);
 load_metric_type!(IntGauge as single);
 load_metric_type!(IntGaugeVec as vec);
 
-// Histograms are defined in histogram.rs
+// Use a custom implementation for histograms to customize the buckets.
+
+impl MetricFromOpts for Histogram {
+    fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
+        Histogram::with_opts(HistogramOpts {
+            common_opts: opts,
+            buckets: HISTOGRAM_BUCKETS.to_vec(),
+        })
+    }
+}
+
+impl MetricFromOpts for HistogramVec {
+    fn from_opts(opts: Opts) -> Result<Self, prometheus::Error> {
+        HistogramVec::new(
+            HistogramOpts {
+                common_opts: opts.clone(),
+                buckets: HISTOGRAM_BUCKETS.to_vec(),
+            },
+            opts.variable_labels
+                .iter()
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>()
+                .as_slice(),
+        )
+    }
+}
diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs
@@ -5,7 +5,6 @@ pub use self::service::ServiceMetrics;
 #[macro_use]
 mod macros;
 
-mod histogram;
 mod instance;
 mod log_encoder;
 mod service;

Original file line number	Diff line number	Diff line change
`@@ -128,9 +128,6 @@ impl App {`
`128`	`128`	`instance_metrics`
`129`	`129`	`.database_time_to_obtain_connection`
`130`	`130`	`.with_label_values(&["primary"]),`
`131`		`- instance_metrics`
`132`		`- .database_used_conns_histogram`
`133`		`- .with_label_values(&["primary"]),`
`134`	`131`	`)`
`135`	`132`	`.unwrap()`
`136`	`133`	`};`
`@@ -158,9 +155,6 @@ impl App {`
`158`	`155`	`instance_metrics`
`159`	`156`	`.database_time_to_obtain_connection`
`160`	`157`	`.with_label_values(&["follower"]),`
`161`		`- instance_metrics`
`162`		`- .database_used_conns_histogram`
`163`		`- .with_label_values(&["follower"]),`
`164`	`158`	`)`
`165`	`159`	`.unwrap(),`
`166`	`160`	`)`