diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs
index 16addc2dc1e75..f4133dfbc95b9 100644
--- a/compiler/rustc_middle/src/mir/mono.rs
+++ b/compiler/rustc_middle/src/mir/mono.rs
@@ -56,6 +56,8 @@ impl<'tcx> MonoItem<'tcx> {
         }
     }
 
+    // Note: if you change how item size estimates work, you might need to
+    // change NON_INCR_MIN_CGU_SIZE as well.
     pub fn size_estimate(&self, tcx: TyCtxt<'tcx>) -> usize {
         match *self {
             MonoItem::Fn(instance) => {
@@ -248,8 +250,14 @@ pub struct CodegenUnit<'tcx> {
 /// Auxiliary info about a `MonoItem`.
 #[derive(Copy, Clone, PartialEq, Debug, HashStable)]
 pub struct MonoItemData {
+    /// A cached copy of the result of `MonoItem::instantiation_mode`, where
+    /// `GloballyShared` maps to `false` and `LocalCopy` maps to `true`.
+    pub inlined: bool,
+
     pub linkage: Linkage,
     pub visibility: Visibility,
+
+    /// A cached copy of the result of `MonoItem::size_estimate`.
     pub size_estimate: usize,
 }
 
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 54096abb2e0c8..71aef53192f48 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -248,7 +248,8 @@ where
         }
         let size_estimate = mono_item.size_estimate(cx.tcx);
 
-        cgu.items_mut().insert(mono_item, MonoItemData { linkage, visibility, size_estimate });
+        cgu.items_mut()
+            .insert(mono_item, MonoItemData { inlined: false, linkage, visibility, size_estimate });
 
         // Get all inlined items that are reachable from `mono_item` without
         // going via another root item. This includes drop-glue, functions from
@@ -263,6 +264,7 @@ where
         for inlined_item in reachable_inlined_items {
             // This is a CGU-private copy.
             cgu.items_mut().entry(inlined_item).or_insert_with(|| MonoItemData {
+                inlined: true,
                 linkage: Linkage::Internal,
                 visibility: Visibility::Default,
                 size_estimate: inlined_item.size_estimate(cx.tcx),
@@ -316,6 +318,60 @@ fn merge_codegen_units<'tcx>(
     let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
         codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();
 
+    // If N is the maximum number of CGUs, and the CGUs are sorted from largest
+    // to smallest, we repeatedly find which CGU in codegen_units[N..] has the
+    // greatest overlap of inlined items with codegen_units[N-1], merge that
+    // CGU into codegen_units[N-1], then re-sort by size and repeat.
+    //
+    // We use inlined item overlap to guide this merging because it minimizes
+    // duplication of inlined items, which makes LLVM be faster and generate
+    // better and smaller machine code.
+    //
+    // Why merge into codegen_units[N-1]? We want CGUs to have similar sizes,
+    // which means we don't want codegen_units[0..N] (the already big ones)
+    // getting any bigger, if we can avoid it. When we have more than N CGUs
+    // then at least one of the biggest N will have to grow. codegen_units[N-1]
+    // is the smallest of those, and so has the most room to grow.
+    let max_codegen_units = cx.tcx.sess.codegen_units().as_usize();
+    while codegen_units.len() > max_codegen_units {
+        // Sort small CGUs to the back.
+        codegen_units.sort_by_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+
+        let cgu_dst = &codegen_units[max_codegen_units - 1];
+
+        // Find the CGU that overlaps the most with `cgu_dst`. In the case of a
+        // tie, favour the earlier (bigger) CGU.
+        let mut max_overlap = 0;
+        let mut max_overlap_i = max_codegen_units;
+        for (i, cgu_src) in codegen_units.iter().enumerate().skip(max_codegen_units) {
+            if cgu_src.size_estimate() <= max_overlap {
+                // None of the remaining overlaps can exceed `max_overlap`, so
+                // stop looking.
+                break;
+            }
+
+            let overlap = compute_inlined_overlap(cgu_dst, cgu_src);
+            if overlap > max_overlap {
+                max_overlap = overlap;
+                max_overlap_i = i;
+            }
+        }
+
+        let mut cgu_src = codegen_units.swap_remove(max_overlap_i);
+        let cgu_dst = &mut codegen_units[max_codegen_units - 1];
+
+        // Move the items from `cgu_src` to `cgu_dst`. Some of them may be
+        // duplicate inlined items, in which case the destination CGU is
+        // unaffected. Recalculate size estimates afterwards.
+        cgu_dst.items_mut().extend(cgu_src.items_mut().drain());
+        cgu_dst.compute_size_estimate();
+
+        // Record that `cgu_dst` now contains all the stuff that was in
+        // `cgu_src` before.
+        let mut consumed_cgu_names = cgu_contents.remove(&cgu_src.name()).unwrap();
+        cgu_contents.get_mut(&cgu_dst.name()).unwrap().append(&mut consumed_cgu_names);
+    }
+
     // Having multiple CGUs can drastically speed up compilation. But for
     // non-incremental builds, tiny CGUs slow down compilation *and* result in
     // worse generated code. So we don't allow CGUs smaller than this (unless
@@ -323,24 +379,22 @@ fn merge_codegen_units<'tcx>(
     // common in larger programs, so this isn't all that large.
     const NON_INCR_MIN_CGU_SIZE: usize = 1800;
 
-    // Repeatedly merge the two smallest codegen units as long as:
-    // - we have more CGUs than the upper limit, or
-    // - (Non-incremental builds only) the user didn't specify a CGU count, and
-    //   there are multiple CGUs, and some are below the minimum size.
+    // Repeatedly merge the two smallest codegen units as long as: it's a
+    // non-incremental build, and the user didn't specify a CGU count, and
+    // there are multiple CGUs, and some are below the minimum size.
     //
     // The "didn't specify a CGU count" condition is because when an explicit
     // count is requested we observe it as closely as possible. For example,
     // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
     // critical they aren't merged. Also, some tests use explicit small values
     // and likewise won't work if small CGUs are merged.
-    while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
-        || (cx.tcx.sess.opts.incremental.is_none()
-            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
-            && codegen_units.len() > 1
-            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
+    while cx.tcx.sess.opts.incremental.is_none()
+        && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+        && codegen_units.len() > 1
+        && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE)
     {
         // Sort small cgus to the back.
-        codegen_units.sort_by_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+        codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
 
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
@@ -351,16 +405,7 @@ fn merge_codegen_units<'tcx>(
         second_smallest.items_mut().extend(smallest.items_mut().drain());
         second_smallest.compute_size_estimate();
 
-        // Record that `second_smallest` now contains all the stuff that was
-        // in `smallest` before.
-        let mut consumed_cgu_names = cgu_contents.remove(&smallest.name()).unwrap();
-        cgu_contents.get_mut(&second_smallest.name()).unwrap().append(&mut consumed_cgu_names);
-
-        debug!(
-            "CodegenUnit {} merged into CodegenUnit {}",
-            smallest.name(),
-            second_smallest.name()
-        );
+        // Don't update `cgu_contents`, that's only for incremental builds.
     }
 
     let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
@@ -439,6 +484,25 @@ fn merge_codegen_units<'tcx>(
     }
 }
 
+/// Compute the combined size of all inlined items that appear in both `cgu1`
+/// and `cgu2`.
+fn compute_inlined_overlap<'tcx>(cgu1: &CodegenUnit<'tcx>, cgu2: &CodegenUnit<'tcx>) -> usize {
+    // Either order works. We pick the one that involves iterating over fewer
+    // items.
+    let (src_cgu, dst_cgu) =
+        if cgu1.items().len() <= cgu2.items().len() { (cgu1, cgu2) } else { (cgu2, cgu1) };
+
+    let mut overlap = 0;
+    for (item, data) in src_cgu.items().iter() {
+        if data.inlined {
+            if dst_cgu.items().contains_key(item) {
+                overlap += data.size_estimate;
+            }
+        }
+    }
+    overlap
+}
+
 fn internalize_symbols<'tcx>(
     cx: &PartitioningCx<'_, 'tcx>,
     codegen_units: &mut [CodegenUnit<'tcx>],
@@ -870,19 +934,16 @@ fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<
             all_cgu_sizes.push(cgu.size_estimate());
 
             for (item, data) in cgu.items() {
-                match item.instantiation_mode(tcx) {
-                    InstantiationMode::GloballyShared { .. } => {
-                        root_items += 1;
-                        root_size += data.size_estimate;
-                    }
-                    InstantiationMode::LocalCopy => {
-                        if inlined_items.insert(item) {
-                            unique_inlined_items += 1;
-                            unique_inlined_size += data.size_estimate;
-                        }
-                        placed_inlined_items += 1;
-                        placed_inlined_size += data.size_estimate;
+                if !data.inlined {
+                    root_items += 1;
+                    root_size += data.size_estimate;
+                } else {
+                    if inlined_items.insert(item) {
+                        unique_inlined_items += 1;
+                        unique_inlined_size += data.size_estimate;
                     }
+                    placed_inlined_items += 1;
+                    placed_inlined_size += data.size_estimate;
                 }
             }
         }
@@ -937,10 +998,7 @@ fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<
                 let symbol_name = item.symbol_name(tcx).name;
                 let symbol_hash_start = symbol_name.rfind('h');
                 let symbol_hash = symbol_hash_start.map_or("<no hash>", |i| &symbol_name[i..]);
-                let kind = match item.instantiation_mode(tcx) {
-                    InstantiationMode::GloballyShared { .. } => "root",
-                    InstantiationMode::LocalCopy => "inlined",
-                };
+                let kind = if !data.inlined { "root" } else { "inlined" };
                 let size = data.size_estimate;
                 let _ = with_no_trimmed_paths!(writeln!(
                     s,