From 6b2d5264435ac3f8ed57445206b82f5f5b2bec77 Mon Sep 17 00:00:00 2001 From: Sean Griffin Date: Mon, 6 Mar 2017 07:39:03 -0500 Subject: [PATCH] Simplify and optimize `Categories::toplevel` `EXPLAIN ANALYZE` output (tested assuming `ORDER BY crates_cnt DESC LIMIT 10 OFFSET 0`) Before: ``` (cost=7129.79..7129.82 rows=10 width=127) (actual time=1.184..1.187 rows=10 loops=1) ``` After: ``` (cost=5.88..5.88 rows=1 width=131) (actual time=0.175..0.177 rows=10 loops=1) ``` About half of the performance comes from removing the `COALESCE`. Since the subselect is including the `crates_cnt` from the toplevel category (which is why it doesn't have to add `c.crates_cnt`), so it can never return null. The second big win is changing from a subselect to a join. PG is usually quite good at figuring out when these cases are equivalent, but I suspect that the use of an aggregate function in the subselect means that it will actually have to subselect in a loop. Finally, we avoid using `LIKE`, since it's more expensive than we need, and can't be indexed. I've opted to use `split_part(slug)` in both the join and outer filter, so that both can be covered by a single index later. The "cheapest" way to do the outer filter is probably `strpos(slug, '::') = 0`, but the difference is so small that it doesn't matter. I explicitly did not include an index here, since the data set is small enough that it would never be used. If the number of categories grows beyond a few hundred, this query can also benefit from an index on `split_part(slug, '::', 1)`. The test coverage around this method was pretty light, so I've added some unit tests to give it a bit more coverage for correctness. --- Cargo.toml | 1 - src/bin/delete-crate.rs | 1 - src/category.rs | 124 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 114 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 50d6f6e3e3e..3edc35f3074 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,6 @@ opt-level = 2 [lib] name = "cargo_registry" -test = false doctest = false [[test]] diff --git a/src/bin/delete-crate.rs b/src/bin/delete-crate.rs index 639f0628024..1793b288c3d 100644 --- a/src/bin/delete-crate.rs +++ b/src/bin/delete-crate.rs @@ -7,7 +7,6 @@ #![deny(warnings)] -#[macro_use] extern crate cargo_registry; extern crate postgres; extern crate time; diff --git a/src/category.rs b/src/category.rs index f209acea690..704b3fadbe4 100644 --- a/src/category.rs +++ b/src/category.rs @@ -157,16 +157,13 @@ impl Category { // Collect all the top-level categories and sum up the crates_cnt of // the crates in all subcategories let stmt = conn.prepare(&format!( - "SELECT c.id, c.category, c.slug, c.description, c.created_at, \ - COALESCE (( \ - SELECT sum(c2.crates_cnt)::int \ - FROM categories as c2 \ - WHERE c2.slug = c.slug \ - OR c2.slug LIKE c.slug || '::%' \ - ), 0) as crates_cnt \ - FROM categories as c \ - WHERE c.category NOT LIKE '%::%' {} \ - LIMIT $1 OFFSET $2", + "SELECT c.id, c.category, c.slug, c.description, c.created_at, + sum(c2.crates_cnt)::int as crates_cnt + FROM categories as c + INNER JOIN categories c2 ON split_part(c2.slug, '::', 1) = c.slug + WHERE split_part(c.slug, '::', 1) = c.slug + GROUP BY c.id + {} LIMIT $1 OFFSET $2", sort_sql ))?; @@ -278,3 +275,110 @@ pub fn slugs(req: &mut Request) -> CargoResult { struct R { category_slugs: Vec } Ok(req.json(&R { category_slugs: slugs })) } + +#[cfg(test)] +mod tests { + use super::*; + use pg::{Connection, TlsMode}; + use dotenv::dotenv; + use std::env; + + fn pg_connection() -> Connection { + let _ = dotenv(); + let database_url = env::var("TEST_DATABASE_URL") + .expect("TEST_DATABASE_URL must be set to run tests"); + let conn = Connection::connect(database_url, TlsMode::None).unwrap(); + // These tests deadlock if run concurrently + conn.batch_execute("BEGIN; LOCK categories IN ACCESS EXCLUSIVE MODE").unwrap(); + conn + } + + #[test] + fn category_toplevel_excludes_subcategories() { + let conn = pg_connection(); + conn.batch_execute("INSERT INTO categories (category, slug) VALUES + ('Cat 2', 'cat2'), ('Cat 1', 'cat1'), ('Cat 1::sub', 'cat1::sub') + ").unwrap(); + + let categories = Category::toplevel(&conn, "", 10, 0).unwrap() + .into_iter().map(|c| c.category).collect::>(); + let expected = vec!["Cat 1".to_string(), "Cat 2".to_string()]; + assert_eq!(expected, categories); + } + + #[test] + fn category_toplevel_orders_by_crates_cnt_when_sort_given() { + let conn = pg_connection(); + conn.batch_execute("INSERT INTO categories (category, slug, crates_cnt) VALUES + ('Cat 1', 'cat1', 0), ('Cat 2', 'cat2', 2), ('Cat 3', 'cat3', 1) + ").unwrap(); + + let categories = Category::toplevel(&conn, "crates", 10, 0).unwrap() + .into_iter().map(|c| c.category).collect::>(); + let expected = vec!["Cat 2".to_string(), "Cat 3".to_string(), "Cat 1".to_string()]; + assert_eq!(expected, categories); + } + + #[test] + fn category_toplevel_applies_limit_and_offset() { + let conn = pg_connection(); + conn.batch_execute("INSERT INTO categories (category, slug) VALUES + ('Cat 1', 'cat1'), ('Cat 2', 'cat2') + ").unwrap(); + + let categories = Category::toplevel(&conn, "", 1, 0).unwrap() + .into_iter().map(|c| c.category).collect::>(); + let expected = vec!["Cat 1".to_string()]; + assert_eq!(expected, categories); + + let categories = Category::toplevel(&conn, "", 1, 1).unwrap() + .into_iter().map(|c| c.category).collect::>(); + let expected = vec!["Cat 2".to_string()]; + assert_eq!(expected, categories); + } + + #[test] + fn category_toplevel_includes_subcategories_in_crate_cnt() { + let conn = pg_connection(); + conn.batch_execute("INSERT INTO categories (category, slug, crates_cnt) VALUES + ('Cat 1', 'cat1', 1), ('Cat 1::sub', 'cat1::sub', 2), + ('Cat 2', 'cat2', 3), ('Cat 2::Sub 1', 'cat2::sub1', 4), + ('Cat 2::Sub 2', 'cat2::sub2', 5), ('Cat 3', 'cat3', 6) + ").unwrap(); + + let categories = Category::toplevel(&conn, "crates", 10, 0).unwrap() + .into_iter().map(|c| (c.category, c.crates_cnt)).collect::>(); + let expected = vec![ + ("Cat 2".to_string(), 12), + ("Cat 3".to_string(), 6), + ("Cat 1".to_string(), 3), + ]; + assert_eq!(expected, categories); + } + + #[test] + fn category_toplevel_applies_limit_and_offset_after_grouping() { + let conn = pg_connection(); + conn.batch_execute("INSERT INTO categories (category, slug, crates_cnt) VALUES + ('Cat 1', 'cat1', 1), ('Cat 1::sub', 'cat1::sub', 2), + ('Cat 2', 'cat2', 3), ('Cat 2::Sub 1', 'cat2::sub1', 4), + ('Cat 2::Sub 2', 'cat2::sub2', 5), ('Cat 3', 'cat3', 6) + ").unwrap(); + + let categories = Category::toplevel(&conn, "crates", 2, 0).unwrap() + .into_iter().map(|c| (c.category, c.crates_cnt)).collect::>(); + let expected = vec![ + ("Cat 2".to_string(), 12), + ("Cat 3".to_string(), 6), + ]; + assert_eq!(expected, categories); + + let categories = Category::toplevel(&conn, "crates", 2, 1).unwrap() + .into_iter().map(|c| (c.category, c.crates_cnt)).collect::>(); + let expected = vec![ + ("Cat 3".to_string(), 6), + ("Cat 1".to_string(), 3), + ]; + assert_eq!(expected, categories); + } +}