From 49bca2abe4512180f080f678e73c0fa40885379a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 2 Sep 2024 07:48:19 +0200 Subject: [PATCH 01/17] Make clear what `gix commit-graph` is (#1572) --- src/plumbing/options/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/plumbing/options/mod.rs b/src/plumbing/options/mod.rs index 82a74e51123..6265cf959f8 100644 --- a/src/plumbing/options/mod.rs +++ b/src/plumbing/options/mod.rs @@ -83,7 +83,7 @@ pub enum Subcommands { Archive(archive::Platform), #[cfg(feature = "gitoxide-core-tools-clean")] Clean(clean::Command), - /// Subcommands for interacting with commit-graphs + /// Subcommands for interacting with commit-graph files #[clap(subcommand)] CommitGraph(commitgraph::Subcommands), /// Interact with the object database. @@ -738,13 +738,13 @@ pub mod credential { pub mod commitgraph { #[derive(Debug, clap::Subcommand)] pub enum Subcommands { - /// Verify the integrity of a commit graph + /// Verify the integrity of a commit graph file Verify { - /// output statistical information about the pack + /// output statistical information about the graph. #[clap(long, short = 's')] statistics: bool, }, - /// List all entries in the commit-graph as reachable by starting from `HEAD`. + /// List all entries in the commit-graph file as reachable by starting from `HEAD`. List { /// The rev-spec to list reachable commits from. #[clap(default_value = "@")] From adf90cc7bde3bad4edc7604922b07cffba762145 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 2 Sep 2024 07:54:45 +0200 Subject: [PATCH 02/17] Make clear that `cmake` is required for the `max` configuration (#1572) --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4e986796616..49bbcf1150f 100644 --- a/README.md +++ b/README.md @@ -198,20 +198,20 @@ There are various build configurations, all of them are [documented here](https: for packagers who need to tune external dependencies. ``` -# A certain way to install `gitoxide` with just Rust and a C compiler installed. +# A way to install `gitoxide` with just Rust and a C compiler installed. # If there are problems with SSL certificates during clones, try to omit `--locked`. cargo install gitoxide --locked --no-default-features --features max-pure -# The default installation, 'max', is the fastest, but also needs some libraries available to build successfully. -# Installing these is platform-dependent and thus can't be explained here. +# The default installation, 'max', is the fastest, but also needs `cmake` to build successfully. +# Installing it is platform-dependent. cargo install gitoxide -# For smaller binaries and even faster build times that are traded for a less fancy CLI implementation, use `lean` -# or `lean-termion` respectively. +# For smaller binaries and even faster build times that are traded for a less fancy CLI implementation, +# use the `lean` feature. cargo install gitoxide --locked --no-default-features --features lean ``` -The following installs the latest unpublished release directly from git: +The following installs the latest unpublished `max` release directly from git: ```sh cargo install --git https://github.com/Byron/gitoxide gitoxide From a10e7c2b26d2151894a29f543e3cd9576bc92dc4 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 2 Sep 2024 11:27:31 +0200 Subject: [PATCH 03/17] fix: assure build-script is added to the package (#1572) Otherwise, the `GIX_VERSION` environment variable is not available at build time, which can lead to runtime errors. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 332d0b9b76f..a4d4a2ad2fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" license = "MIT OR Apache-2.0" version = "0.38.0" default-run = "gix" -include = ["src/**/*", "LICENSE-*", "README.md"] +include = ["src/**/*", "/build.rs", "LICENSE-*", "README.md"] resolver = "2" [[bin]] From c545d71ebfd4ac7d960fc75aae5558b341d8ecf2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 2 Sep 2024 11:12:09 +0200 Subject: [PATCH 04/17] fix!: `Tree::lookup_entry()` looses its `buf` argument. The buffer will now be previded from the free-list of the repository. --- gix/src/object/tree/mod.rs | 25 +++++++++++-------------- gix/src/repository/object.rs | 2 +- gix/tests/object/tree/mod.rs | 7 +------ 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/gix/src/object/tree/mod.rs b/gix/src/object/tree/mod.rs index 72f2a9e973c..2f7df7ec33e 100644 --- a/gix/src/object/tree/mod.rs +++ b/gix/src/object/tree/mod.rs @@ -41,7 +41,6 @@ impl<'repo> Tree<'repo> { /// Follow a sequence of `path` components starting from this instance, and look them up one by one until the last component /// is looked up and its tree entry is returned. - /// Use `buf` as temporary location for sub-trees to avoid allocating a temporary buffer for each lookup. /// /// # Performance Notes /// @@ -49,16 +48,18 @@ impl<'repo> Tree<'repo> { /// to reuse a vector and use a binary search instead, which might be able to improve performance over all. /// However, a benchmark should be created first to have some data and see which trade-off to choose here. /// - pub fn lookup_entry(&self, path: I, buf: &mut Vec) -> Result>, find::existing::Error> + pub fn lookup_entry(&self, path: I) -> Result>, find::existing::Error> where I: IntoIterator, P: PartialEq, { - let mut path = path.into_iter().peekable(); + let mut buf = self.repo.shared_empty_buf(); buf.clear(); + + let mut path = path.into_iter().peekable(); buf.extend_from_slice(&self.data); while let Some(component) = path.next() { - match TreeRefIter::from_bytes(buf) + match TreeRefIter::from_bytes(&buf) .filter_map(Result::ok) .find(|entry| component.eq(entry.filename)) { @@ -70,7 +71,7 @@ impl<'repo> Tree<'repo> { })); } else { let next_id = entry.oid.to_owned(); - let obj = self.repo.objects.find(&next_id, buf)?; + let obj = self.repo.objects.find(&next_id, &mut buf)?; if !obj.kind.is_tree() { return Ok(None); } @@ -134,17 +135,13 @@ impl<'repo> Tree<'repo> { pub fn lookup_entry_by_path( &self, relative_path: impl AsRef, - buf: &mut Vec, ) -> Result>, find::existing::Error> { use crate::bstr::ByteSlice; - self.lookup_entry( - relative_path.as_ref().components().map(|c: std::path::Component<'_>| { - gix_path::os_str_into_bstr(c.as_os_str()) - .unwrap_or_else(|_| "".into()) - .as_bytes() - }), - buf, - ) + self.lookup_entry(relative_path.as_ref().components().map(|c: std::path::Component<'_>| { + gix_path::os_str_into_bstr(c.as_os_str()) + .unwrap_or_else(|_| "".into()) + .as_bytes() + })) } /// Like [`Self::peel_to_entry()`], but takes a `Path` directly via `relative_path`, a path relative to this tree. diff --git a/gix/src/repository/object.rs b/gix/src/repository/object.rs index db2344a83c3..6230556b3c5 100644 --- a/gix/src/repository/object.rs +++ b/gix/src/repository/object.rs @@ -139,7 +139,7 @@ impl crate::Repository { } } - fn shared_empty_buf(&self) -> std::cell::RefMut<'_, Vec> { + pub(crate) fn shared_empty_buf(&self) -> std::cell::RefMut<'_, Vec> { let mut bufs = self.bufs.borrow_mut(); if bufs.last().is_none() { bufs.push(Vec::with_capacity(512)); diff --git a/gix/tests/object/tree/mod.rs b/gix/tests/object/tree/mod.rs index a331adb2fa4..4c0dc350db5 100644 --- a/gix/tests/object/tree/mod.rs +++ b/gix/tests/object/tree/mod.rs @@ -17,11 +17,6 @@ fn find_entry() -> crate::Result { fn lookup_entry_by_path() -> crate::Result { let repo = named_subrepo_opts("make_worktree_repo.sh", "repo", gix::open::Options::isolated())?; let tree = repo.head_commit()?.tree()?; - assert_eq!( - tree.lookup_entry_by_path("dir/c", &mut Vec::new())? - .expect("present") - .filename(), - "c" - ); + assert_eq!(tree.lookup_entry_by_path("dir/c")?.expect("present").filename(), "c"); Ok(()) } From cdf2a6320e208bac69b9b1152c10f0b81b238c3e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 2 Sep 2024 13:41:41 +0200 Subject: [PATCH 05/17] fix!: rename `hash::Sha1` to `hash::Hasher` and `Sha1Digest` to `Digest`. That way, one day we can turn this type into a compatible one which produce different kinds of hashes as well. --- gix-features/src/hash.rs | 26 ++++++++++++-------------- gix-features/tests/hash.rs | 6 +++--- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/gix-features/src/hash.rs b/gix-features/src/hash.rs index 4c5389fad1c..eebc40fa075 100644 --- a/gix-features/src/hash.rs +++ b/gix-features/src/hash.rs @@ -5,7 +5,7 @@ //! Otherwise, a minimal yet performant implementation is used instead for a decent trade-off between compile times and run-time performance. #[cfg(all(feature = "rustsha1", not(feature = "fast-sha1")))] mod _impl { - use super::Sha1Digest; + use super::Digest; /// A implementation of the Sha1 hash, which can be used once. #[derive(Default, Clone)] @@ -17,22 +17,20 @@ mod _impl { self.0.update(bytes); } /// Finalize the hash and produce a digest. - pub fn digest(self) -> Sha1Digest { + pub fn digest(self) -> Digest { self.0.digest().bytes() } } } -/// A 20 bytes digest produced by a [`Sha1`] hash implementation. +/// A hash-digest produced by a [`Hasher`] hash implementation. #[cfg(any(feature = "fast-sha1", feature = "rustsha1"))] -pub type Sha1Digest = [u8; 20]; +pub type Digest = [u8; 20]; #[cfg(feature = "fast-sha1")] mod _impl { use sha1::Digest; - use super::Sha1Digest; - /// A implementation of the Sha1 hash, which can be used once. #[derive(Default, Clone)] pub struct Sha1(sha1::Sha1); @@ -43,14 +41,14 @@ mod _impl { self.0.update(bytes); } /// Finalize the hash and produce a digest. - pub fn digest(self) -> Sha1Digest { + pub fn digest(self) -> super::Digest { self.0.finalize().into() } } } #[cfg(any(feature = "rustsha1", feature = "fast-sha1"))] -pub use _impl::Sha1; +pub use _impl::Sha1 as Hasher; /// Compute a CRC32 hash from the given `bytes`, returning the CRC32 hash. /// @@ -76,9 +74,9 @@ pub fn crc32(bytes: &[u8]) -> u32 { /// Produce a hasher suitable for the given kind of hash. #[cfg(any(feature = "rustsha1", feature = "fast-sha1"))] -pub fn hasher(kind: gix_hash::Kind) -> Sha1 { +pub fn hasher(kind: gix_hash::Kind) -> Hasher { match kind { - gix_hash::Kind::Sha1 => Sha1::default(), + gix_hash::Kind::Sha1 => Hasher::default(), } } @@ -127,7 +125,7 @@ pub fn bytes( pub fn bytes_with_hasher( read: &mut dyn std::io::Read, num_bytes_from_start: u64, - mut hasher: Sha1, + mut hasher: Hasher, progress: &mut dyn crate::progress::Progress, should_interrupt: &std::sync::atomic::AtomicBool, ) -> std::io::Result { @@ -160,12 +158,12 @@ pub fn bytes_with_hasher( #[cfg(any(feature = "rustsha1", feature = "fast-sha1"))] mod write { - use crate::hash::Sha1; + use crate::hash::Hasher; /// A utility to automatically generate a hash while writing into an inner writer. pub struct Write { /// The hash implementation. - pub hash: Sha1, + pub hash: Hasher, /// The inner writer. pub inner: T, } @@ -194,7 +192,7 @@ mod write { match object_hash { gix_hash::Kind::Sha1 => Write { inner, - hash: Sha1::default(), + hash: Hasher::default(), }, } } diff --git a/gix-features/tests/hash.rs b/gix-features/tests/hash.rs index 4cccdd77043..7e7f6654dbd 100644 --- a/gix-features/tests/hash.rs +++ b/gix-features/tests/hash.rs @@ -1,16 +1,16 @@ -use gix_features::hash::Sha1; +use gix_features::hash::Hasher; #[cfg(not(feature = "fast-sha1"))] #[test] fn size_of_sha1() { - assert_eq!(std::mem::size_of::(), 96); + assert_eq!(std::mem::size_of::(), 96); } #[cfg(feature = "fast-sha1")] #[test] fn size_of_sha1() { assert_eq!( - std::mem::size_of::(), + std::mem::size_of::(), if cfg!(target_arch = "x86") { 96 } else { 104 } ); } From ee80b2bd8c3be6f80f85987f8df32fa4a796247f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 2 Sep 2024 13:43:49 +0200 Subject: [PATCH 06/17] adapt to changes in `gix-features` --- gix-pack/src/data/input/bytes_to_entries.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gix-pack/src/data/input/bytes_to_entries.rs b/gix-pack/src/data/input/bytes_to_entries.rs index f42cf016444..ed5902d60b9 100644 --- a/gix-pack/src/data/input/bytes_to_entries.rs +++ b/gix-pack/src/data/input/bytes_to_entries.rs @@ -1,6 +1,6 @@ use std::{fs, io}; -use gix_features::{hash::Sha1, zlib::Decompress}; +use gix_features::{hash::Hasher, zlib::Decompress}; use gix_hash::ObjectId; use crate::data::input; @@ -15,7 +15,7 @@ pub struct BytesToEntriesIter
{ had_error: bool, version: crate::data::Version, objects_left: u32, - hash: Option, + hash: Option, mode: input::Mode, compressed: input::EntryDataMode, compressed_buf: Option>, @@ -303,7 +303,7 @@ where /// A utility to automatically generate a hash while writing into an inner writer. pub struct HashWrite<'a, T> { /// The hash implementation. - pub hash: &'a mut Sha1, + pub hash: &'a mut Hasher, /// The inner writer. pub inner: T, } From 71bf80883e010f242460e9e36d5ea27a14e6494b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 2 Sep 2024 14:01:55 +0200 Subject: [PATCH 07/17] feat: add `Default` implementation to `Tree` --- gix-object/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gix-object/src/lib.rs b/gix-object/src/lib.rs index 43cb5f5bfd0..0ef56d52954 100644 --- a/gix-object/src/lib.rs +++ b/gix-object/src/lib.rs @@ -232,7 +232,7 @@ pub struct TreeRefIter<'a> { } /// A mutable Tree, containing other trees, blobs or commits. -#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] +#[derive(Default, PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Tree { /// The directories and files contained in this tree. They must be and remain sorted by [`filename`][tree::Entry::filename]. From 14dfcf01dc4a6e48fbd363f4f1c8cf08c1e7f6e2 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 3 Sep 2024 17:25:34 +0200 Subject: [PATCH 08/17] feat: add `tree::Editor` With it it's easy to alter existing trees or build entirely new ones, efficiently. --- Cargo.lock | 9 + gix-object/Cargo.toml | 3 + gix-object/src/tree/editor.rs | 313 ++++++++++++++ gix-object/src/tree/mod.rs | 3 + gix-object/src/tree/write.rs | 2 +- gix-object/tests/tree/editor.rs | 639 ++++++++++++++++++++++++++++ gix-object/tests/tree/entries.rs | 42 ++ gix-object/tests/tree/entry_mode.rs | 71 ++++ gix-object/tests/tree/from_bytes.rs | 98 +++++ gix-object/tests/tree/iter.rs | 54 +++ gix-object/tests/tree/mod.rs | 281 +----------- 11 files changed, 1238 insertions(+), 277 deletions(-) create mode 100644 gix-object/src/tree/editor.rs create mode 100644 gix-object/tests/tree/editor.rs create mode 100644 gix-object/tests/tree/entries.rs create mode 100644 gix-object/tests/tree/entry_mode.rs create mode 100644 gix-object/tests/tree/from_bytes.rs create mode 100644 gix-object/tests/tree/iter.rs diff --git a/Cargo.lock b/Cargo.lock index e72c7e2c5ba..9ddc5a3c30c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2089,6 +2089,8 @@ dependencies = [ "gix-date 0.9.0", "gix-features 0.38.2", "gix-hash 0.14.2", + "gix-hashtable 0.5.2", + "gix-odb", "gix-testtools", "gix-utils 0.1.12", "gix-validate 0.9.0", @@ -2096,6 +2098,7 @@ dependencies = [ "pretty_assertions", "serde", "smallvec", + "termtree", "thiserror", "winnow", ] @@ -4675,6 +4678,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + [[package]] name = "thiserror" version = "1.0.63" diff --git a/gix-object/Cargo.toml b/gix-object/Cargo.toml index f3047525668..fc05fb13650 100644 --- a/gix-object/Cargo.toml +++ b/gix-object/Cargo.toml @@ -41,6 +41,7 @@ gix-features = { version = "^0.38.2", path = "../gix-features", features = [ "progress", ] } gix-hash = { version = "^0.14.2", path = "../gix-hash" } +gix-hashtable = { version = "^0.5.2", path = "../gix-hashtable" } gix-validate = { version = "^0.9.0", path = "../gix-validate" } gix-actor = { version = "^0.32.0", path = "../gix-actor" } gix-date = { version = "^0.9.0", path = "../gix-date" } @@ -64,6 +65,8 @@ document-features = { version = "0.2.0", optional = true } criterion = "0.5.1" pretty_assertions = "1.0.0" gix-testtools = { path = "../tests/tools" } +gix-odb = { path = "../gix-odb" } +termtree = "0.5.1" [package.metadata.docs.rs] all-features = true diff --git a/gix-object/src/tree/editor.rs b/gix-object/src/tree/editor.rs new file mode 100644 index 00000000000..b17b04bad14 --- /dev/null +++ b/gix-object/src/tree/editor.rs @@ -0,0 +1,313 @@ +use crate::tree::EntryKind; +use crate::{tree, Tree}; +use bstr::{BStr, BString, ByteSlice, ByteVec}; +use gix_hash::ObjectId; +use gix_hashtable::hash_map::Entry; +use std::cmp::Ordering; + +/// The state needed to apply edits instantly to in-memory trees. +/// +/// It's made so that each tree is looked at in the object database at most once, and held in memory for +/// all edits until everything is flushed to write all changed trees. +/// +/// The editor is optimized to edit existing trees, but can deal with building entirely new trees as well +/// with some penalties. +/// +/// ### Note +/// +/// For reasons of efficiency, internally a SHA1 based hashmap is used to avoid having to store full paths +/// to each edited tree. The chance of collision is low, but could be engineered to overwrite or write into +/// an unintended tree. +#[doc(alias = "TreeUpdateBuilder", alias = "git2")] +pub struct Editor<'a> { + /// A way to lookup trees. + find: &'a dyn crate::FindExt, + /// All trees we currently hold in memory. Each of these may change while adding and removing entries. + /// null-object-ids mark tree-entries whose value we don't know yet, they are placeholders that will be + /// dropped when writing at the latest. + trees: gix_hashtable::HashMap, + /// A buffer to build up paths when finding the tree to edit. + path_buf: BString, + /// Our buffer for storing tree-data in, right before decoding it. + tree_buf: Vec, +} + +/// Lifecycle +impl<'a> Editor<'a> { + /// Create a new editor that uses `root` as base for all edits. Use `find` to lookup existing + /// trees when edits are made. Each tree will only be looked-up once and then edited in place from + /// that point on. + pub fn new(root: Tree, find: &'a dyn crate::FindExt) -> Self { + Editor { + find, + trees: gix_hashtable::HashMap::from_iter(Some((empty_path_hash(), root))), + path_buf: Vec::with_capacity(256).into(), + tree_buf: Vec::with_capacity(512), + } + } +} + +/// Operations +impl<'a> Editor<'a> { + /// Write the entire in-memory state of all changed trees (and only changed trees) to `out`. + /// Note that the returned object id *can* be the empty tree if everything was removed or if nothing + /// was added to the tree. + /// + /// The last call to `out` will be the changed root tree, whose object-id will also be returned. + /// `out` is free to do any kind of additional validation, like to assure that all entries in the tree exist. + /// We don't assure that as there is no validation that inserted entries are valid object ids. + /// + /// Future calls to [`upsert`](Self::upsert) or similar will keep working on the last seen state of the + /// just-written root-tree. + /// If this is not desired, use [set_root()](Self::set_root()). + pub fn write(&mut self, mut out: impl FnMut(&Tree) -> Result) -> Result { + assert_ne!(self.trees.len(), 0, "there is at least the root tree"); + + // back is for children, front is for parents. + let mut parents = vec![( + None::, + BString::default(), + self.trees + .remove(&empty_path_hash()) + .expect("root tree is always present"), + )]; + let mut children = Vec::new(); + while let Some((parent_idx, mut rela_path, mut tree)) = children.pop().or_else(|| parents.pop()) { + let mut all_entries_unchanged_or_written = true; + for entry in &tree.entries { + if entry.mode.is_tree() { + let prev_len = push_path_component(&mut rela_path, &entry.filename); + if let Some(sub_tree) = self.trees.remove(&path_hash(&rela_path)) { + all_entries_unchanged_or_written = false; + let next_parent_idx = parents.len(); + children.push((Some(next_parent_idx), rela_path.clone(), sub_tree)); + } + rela_path.truncate(prev_len); + } + } + if all_entries_unchanged_or_written { + tree.entries.retain(|e| !e.oid.is_null()); + if let Some((_, _, parent_to_adjust)) = + parent_idx.map(|idx| parents.get_mut(idx).expect("always present, pointing towards zero")) + { + let name = filename(rela_path.as_bstr()); + let entry_idx = parent_to_adjust + .entries + .binary_search_by(|e| cmp_entry_with_name(e, name, true)) + .expect("the parent always knows us by name"); + if tree.entries.is_empty() { + parent_to_adjust.entries.remove(entry_idx); + } else { + parent_to_adjust.entries[entry_idx].oid = out(&tree)?; + } + } else if parents.is_empty() { + debug_assert!(children.is_empty(), "we consume children before parents"); + debug_assert!(rela_path.is_empty(), "this should always be the root tree"); + + // There may be left-over trees if they are replaced with blobs for example. + let root_tree_id = out(&tree)?; + self.trees.clear(); + self.trees.insert(empty_path_hash(), tree); + return Ok(root_tree_id); + } else if !tree.entries.is_empty() { + out(&tree)?; + } + } else { + parents.push((parent_idx, rela_path, tree)); + } + } + + unreachable!("we exit as soon as everything is consumed") + } + + /// Remove the entry at `rela_path`, loading all trees on the path accordingly. + /// It's no error if the entry doesn't exist, or if `rela_path` doesn't lead to an existing entry at all. + pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + self.upsert_or_remove(rela_path, None) + } + + /// Insert a new entry of `kind` with `id` at `rela_path`, an iterator over each path component in the tree, + /// like `a/b/c`. Names are matched case-sensitively. + /// + /// Existing leaf-entries will be overwritten unconditionally, and it is assumed that `id` is available in the object database + /// or will be made available at a later point to assure the integrity of the produced tree. + /// + /// Intermediate trees will be created if they don't exist in the object database, otherwise they will be loaded and entries + /// will be inserted into them instead. + /// + /// Note that `id` can be [null](ObjectId::null()) to create a placeholder. These will not be written, and paths leading + /// through them will not be considered a problem. + /// + /// `id` can also be an empty tree, along with [the respective `kind`](EntryKind::Tree), even though that's normally not allowed + /// in Git trees. + pub fn upsert( + &mut self, + rela_path: I, + kind: EntryKind, + id: ObjectId, + ) -> Result<&mut Self, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + self.upsert_or_remove(rela_path, Some((kind, id))) + } + + fn upsert_or_remove( + &mut self, + rela_path: I, + kind_and_id: Option<(EntryKind, ObjectId)>, + ) -> Result<&mut Self, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + let mut cursor = self.trees.get_mut(&empty_path_hash()).expect("root is always present"); + self.path_buf.clear(); + let mut rela_path = rela_path.into_iter().peekable(); + let new_kind_is_tree = kind_and_id.map_or(false, |(kind, _)| kind == EntryKind::Tree); + while let Some(name) = rela_path.next() { + let name = name.as_ref(); + let is_last = rela_path.peek().is_none(); + let mut needs_sorting = false; + let current_level_must_be_tree = !is_last || new_kind_is_tree; + let check_type_change = |entry: &tree::Entry| entry.mode.is_tree() != current_level_must_be_tree; + let tree_to_lookup = match cursor + .entries + .binary_search_by(|e| cmp_entry_with_name(e, name, false)) + .or_else(|file_insertion_idx| { + cursor + .entries + .binary_search_by(|e| cmp_entry_with_name(e, name, true)) + .map_err(|dir_insertion_index| { + if current_level_must_be_tree { + dir_insertion_index + } else { + file_insertion_idx + } + }) + }) { + Ok(idx) => { + match kind_and_id { + None => { + if is_last { + cursor.entries.remove(idx); + break; + } else { + let entry = &cursor.entries[idx]; + if entry.mode.is_tree() { + Some(entry.oid) + } else { + break; + } + } + } + Some((kind, id)) => { + let entry = &mut cursor.entries[idx]; + if is_last { + // unconditionally overwrite what's there. + entry.oid = id; + needs_sorting = check_type_change(entry); + entry.mode = kind.into(); + None + } else if entry.mode.is_tree() { + // Possibly lookup the existing tree on our way down the path. + Some(entry.oid) + } else { + // it is no tree, but we are traversing a path, so turn it into one. + entry.oid = id.kind().null(); + needs_sorting = check_type_change(entry); + entry.mode = EntryKind::Tree.into(); + None + } + } + } + } + Err(insertion_idx) => match kind_and_id { + None => break, + Some((kind, id)) => { + cursor.entries.insert( + insertion_idx, + tree::Entry { + filename: name.into(), + mode: if is_last { kind.into() } else { EntryKind::Tree.into() }, + oid: if is_last { id } else { id.kind().null() }, + }, + ); + if is_last { + break; + } + None + } + }, + }; + if needs_sorting { + cursor.entries.sort(); + } + if is_last { + break; + } + push_path_component(&mut self.path_buf, name); + let path_id = path_hash(&self.path_buf); + cursor = match self.trees.entry(path_id) { + Entry::Occupied(e) => e.into_mut(), + Entry::Vacant(e) => e.insert( + if let Some(tree_id) = tree_to_lookup.filter(|tree_id| !tree_id.is_empty_tree()) { + self.find.find_tree(&tree_id, &mut self.tree_buf)?.into() + } else { + Tree::default() + }, + ), + }; + } + Ok(self) + } + + /// Set the root tree of the modification to `root`, assuring it has a well-known state. + /// + /// Note that this erases all previous edits. + /// + /// This is useful if the same editor is re-used for various trees. + pub fn set_root(&mut self, root: Tree) -> &mut Self { + self.trees.clear(); + self.trees.insert(empty_path_hash(), root); + self + } +} + +fn cmp_entry_with_name(a: &tree::Entry, filename: &BStr, is_tree: bool) -> Ordering { + let common = a.filename.len().min(filename.len()); + a.filename[..common].cmp(&filename[..common]).then_with(|| { + let a = a.filename.get(common).or_else(|| a.mode.is_tree().then_some(&b'/')); + let b = filename.get(common).or_else(|| is_tree.then_some(&b'/')); + a.cmp(&b) + }) +} + +fn filename(path: &BStr) -> &BStr { + path.rfind_byte(b'/').map_or(path, |pos| &path[pos + 1..]) +} + +fn empty_path_hash() -> ObjectId { + gix_features::hash::hasher(gix_hash::Kind::Sha1).digest().into() +} + +fn path_hash(path: &[u8]) -> ObjectId { + let mut hasher = gix_features::hash::hasher(gix_hash::Kind::Sha1); + hasher.update(path); + hasher.digest().into() +} + +fn push_path_component(base: &mut BString, component: &[u8]) -> usize { + let prev_len = base.len(); + debug_assert!(base.last() != Some(&b'/')); + if !base.is_empty() { + base.push_byte(b'/'); + } + base.push_str(component); + prev_len +} diff --git a/gix-object/src/tree/mod.rs b/gix-object/src/tree/mod.rs index 1014fcfde29..5b5e7463a9f 100644 --- a/gix-object/src/tree/mod.rs +++ b/gix-object/src/tree/mod.rs @@ -5,6 +5,9 @@ use crate::{ tree, }; +mod editor; +pub use editor::Editor; + mod ref_iter; /// pub mod write; diff --git a/gix-object/src/tree/write.rs b/gix-object/src/tree/write.rs index 9c43dcc99a4..e2c98d76e13 100644 --- a/gix-object/src/tree/write.rs +++ b/gix-object/src/tree/write.rs @@ -27,12 +27,12 @@ impl crate::WriteTo for Tree { /// Serialize this tree to `out` in the git internal format. fn write_to(&self, out: &mut dyn io::Write) -> io::Result<()> { debug_assert_eq!( + &self.entries, &{ let mut entries_sorted = self.entries.clone(); entries_sorted.sort(); entries_sorted }, - &self.entries, "entries for serialization must be sorted by filename" ); let mut buf = Default::default(); diff --git a/gix-object/tests/tree/editor.rs b/gix-object/tests/tree/editor.rs new file mode 100644 index 00000000000..e752085decc --- /dev/null +++ b/gix-object/tests/tree/editor.rs @@ -0,0 +1,639 @@ +use gix_object::tree::EntryKind; +use gix_object::Tree; + +#[test] +fn from_empty_removal() -> crate::Result { + let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); + let odb = StorageOdb::new(storage.clone()); + let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb); + + let actual = edit + .remove(Some("non-existing"))? + .remove(["still", "does", "not", "exist"])? + .write(&mut write)?; + assert_eq!(actual, empty_tree(), "nothing was actually done"); + assert_eq!(num_writes_and_clear(), 1, "it has to write the empty tree though"); + assert_eq!(storage.borrow().len(), 1, "the empty tree ends up in storage, too"); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit + .upsert(Some("file"), EntryKind::Blob, any_blob())? + .upsert(Some("empty-dir"), EntryKind::Tree, empty_tree())? + .upsert(["with-subdir", "dir", "file"], EntryKind::Blob, any_blob())? + .upsert(["with-subdir2", "dir", "file"], EntryKind::Blob, any_blob())? + .remove(Some("file"))? + .remove(Some("empty-dir"))? + .remove(Some("with-subdir"))? + .remove(["with-subdir2", "dir"])? + .remove(Some("with-subdir2"))? + .write(&mut write)?; + assert_eq!(actual, empty_tree(), "nothing was actually done"); + assert_eq!(num_writes_and_clear(), 1, "still nothing to write but the empty tree"); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit + .upsert(Some("file"), EntryKind::Blob, any_blob())? + .upsert(Some("empty-dir"), EntryKind::Tree, empty_tree())? + .upsert(["with-subdir", "dir", "file"], EntryKind::Blob, any_blob())? + .upsert(["with-subdir2", "dir", "file"], EntryKind::Blob, any_blob())? + .write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "9e608223b4cbc733abd20fb6d5b8ea80b074be17 +├── empty-dir (empty) +├── file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +├── with-subdir +│ └── dir +│ └── file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +└── with-subdir2 + └── dir + └── file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +" + ); + assert_eq!(num_writes_and_clear(), 5); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit + .remove(Some("file"))? + .remove(Some("empty-dir"))? + .remove(Some("with-subdir"))? + .remove(["with-subdir2", "dir"])? + .remove(Some("with-subdir2"))? + .write(&mut write)?; + assert_eq!(actual, empty_tree(), "everything was removed, leaving nothing"); + assert_eq!(num_writes_and_clear(), 1, "only the empty tree to write"); + assert_eq!( + odb.access_count_and_clear(), + 1, + "has to get `with-subdir2` to remove child-entry" + ); + + let actual = edit + .upsert(["with-subdir", "file"], EntryKind::Blob, any_blob())? + .upsert(["with-subdir", "dir", "file"], EntryKind::Blob, any_blob())? + .remove(["with-subdir", "dir"])? + .write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "da2277079f7e9e5a012c9a03d1aac710866ee2c5 +└── with-subdir + └── file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +", + "only one file remains, empty dirs are removed automatically" + ); + assert_eq!(num_writes_and_clear(), 1 + 1, "root and one subtree"); + assert_eq!(storage.borrow().len(), 1 + 4, "empty tree and 4 unique trees"); + assert_eq!(odb.access_count_and_clear(), 0); + + Ok(()) +} + +#[test] +fn from_existing_remove() -> crate::Result { + let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); + let odb = StorageOdb::new_with_odb(storage.clone(), tree_odb()?); + let root_tree_id = hex_to_id("ff7e7d2aecae1c3fb15054b289a4c58aa65b8646"); + let root_tree = find_tree(&odb, root_tree_id)?; + odb.access_count_and_clear(); + let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb); + + let actual = edit + .remove(["file"])? + .remove(Some("does not exist"))? + .remove(["also", "does", "not", "exist"])? + .remove(Some("bin.d"))? + .remove(Some("file.toml.bin"))? + .remove(Some("file.0"))? + .write(&mut write)?; + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "dfd0d048f8e852879ad8e1a6a9b810873de16a9c +├── bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.to e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +" + ); + assert_eq!(num_writes_and_clear(), 1, "only the root tree is written"); + assert_eq!( + odb.access_count_and_clear(), + 0, + "no sub-tree has to be queried for removal" + ); + + let actual = edit + .remove(Some("bin"))? + .remove(Some("file.to"))? + .remove(Some("file.toml"))? + .remove(Some("file0"))? + .write(&mut write)?; + assert_eq!(actual, empty_tree(), "nothing is left"); + assert_eq!(num_writes_and_clear(), 1, "only the empty tree is written"); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit.set_root(root_tree).remove(["file", "a"])?.write(&mut write)?; + assert_eq!(num_writes_and_clear(), 1, "it writes the changed root-tree"); + assert_eq!( + odb.access_count_and_clear(), + 1, + "lookup `file` to delete its (only) content" + ); + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "38a66b78d1d5dd8daedf6188d2fafd98357a870c +├── bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── bin.d e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.to e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml.bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +", + "`file` is removed as it remains empty" + ); + + Ok(()) +} + +#[test] +fn from_empty_add() -> crate::Result { + let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); + let odb = StorageOdb::new(storage.clone()); + let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb); + + let actual = edit.write(&mut write).expect("no changes are fine"); + assert_eq!(actual, empty_tree(), "empty stays empty"); + assert_eq!(num_writes_and_clear(), 1, "the empty tree was written"); + assert_eq!( + display_tree(actual, &storage), + "4b825dc642cb6eb9a060e54bf8d69288fbee4904\n" + ); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit + .upsert(Some("hi"), EntryKind::Blob, gix_hash::Kind::Sha1.null())? + .write(&mut write) + .expect("effectively no changes are fine"); + assert_eq!( + actual, + empty_tree(), + "null-ids are dropped automatically, they act as placeholders" + ); + assert_eq!(num_writes_and_clear(), 1, "the empty tree was written, nothing new"); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit + .upsert(["a", "b", "c"], EntryKind::Blob, gix_hash::Kind::Sha1.null())? + .upsert(["a", "b", "d", "e"], EntryKind::Blob, gix_hash::Kind::Sha1.null())? + .write(&mut write) + .expect("effectively no changes are fine"); + assert_eq!( + actual, + empty_tree(), + "null-ids are dropped automatically, recursively, they act as placeholders" + ); + assert_eq!( + num_writes_and_clear(), + 1, + "the empty tree was written as root, nothing new" + ); + assert_eq!(storage.borrow().len(), 1, "still nothing but empty trees"); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit + .upsert(["a", "b"], EntryKind::Tree, empty_tree())? + .upsert(["a", "b", "c"], EntryKind::Tree, empty_tree())? + .upsert(["a", "b", "d", "e"], EntryKind::Tree, empty_tree())? + .write(&mut write) + .expect("it's OK to write empty trees"); + assert_eq!( + display_tree(actual, &storage), + "bf91a94ae659ac8a9da70d26acf42df1a36adb6e +└── a + └── b + ├── c (empty) + └── d + └── e (empty) +", + "one can write through trees, and empty trees are also fine" + ); + assert_eq!(num_writes_and_clear(), 4, "it wrote the trees it needed to write"); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit + .upsert(["a"], EntryKind::Blob, any_blob())? + .upsert(["a", "b"], EntryKind::Blob, any_blob())? + .upsert(["a", "b", "c"], EntryKind::BlobExecutable, any_blob())? + .upsert(["x", "z"], EntryKind::Blob, any_blob())? + .write(&mut write) + .expect("writing made-up blobs is fine"); + assert_eq!( + display_tree(actual, &storage), + "7534352b2f718388b6c6d0f70aaf12399f38258c +├── a +│ └── b +│ └── c bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +└── x + └── z bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +", + "it's possible to write through previously added blobs" + ); + assert_eq!(num_writes_and_clear(), 4); + assert_eq!(odb.access_count_and_clear(), 0); + + let actual = edit.upsert(["x"], EntryKind::Blob, any_blob())?.write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "c2bb1616d4db21d99a30a1219d7d47e969f42e26 +├── a +│ └── b +│ └── c bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +└── x bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +" + ); + assert_eq!(num_writes_and_clear(), 1, "just the root tree changed"); + assert_eq!(odb.access_count_and_clear(), 0); + + let prev_tree = actual; + let actual = edit + .upsert(["a", "b", "c"], EntryKind::BlobExecutable, any_blob())? + .write(&mut write)?; + assert_eq!(actual, prev_tree, "writing the same path again is a no-op"); + assert_eq!( + num_writes_and_clear(), + 3, + "it still rewrites all paths that (potentially) changed. \ + There is no actual change tracking as no-changes aren't the default case for an editor" + ); + assert_eq!(odb.access_count_and_clear(), 2, "`a` and `a/b`"); + + let actual = edit + .upsert(["a", "b", "c"], EntryKind::Blob, any_blob())? + .upsert(["a"], EntryKind::Blob, any_blob())? + .write(&mut write) + .expect("we can turn overwrite a newly added tree (at 'a/') with a blob"); + assert_eq!( + display_tree(actual, &storage), + "7c66d7d5cbfdbbb37085ff5c8c6e5b048727cf88 +├── a bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +└── x bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +", + "now a tree was once again changed into a blob" + ); + assert_eq!(num_writes_and_clear(), 1, "only the root-tree changes effectively"); + assert_eq!(odb.access_count_and_clear(), 2, "`a` and `a/b`"); + + let actual = edit + .set_root(Tree::default()) + .upsert(["a", "b", "c"], EntryKind::Blob, any_blob())? + .upsert(["a"], EntryKind::BlobExecutable, any_blob())? + .write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "f7b85940c3afa596829cacf98e98ff8bfd7c68ed +└── a bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +", + "now the root is back to a well-known state, so edits are more intuitive" + ); + assert_eq!( + num_writes_and_clear(), + 1, + "still, only the root-tree changes effectively" + ); + assert_eq!(odb.access_count_and_clear(), 0); + Ok(()) +} + +#[test] +fn from_existing_add() -> crate::Result { + let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); + let odb = StorageOdb::new_with_odb(storage.clone(), tree_odb()?); + let root_tree_id = hex_to_id("ff7e7d2aecae1c3fb15054b289a4c58aa65b8646"); + let root_tree = find_tree(&odb, root_tree_id)?; + odb.access_count_and_clear(); + let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb); + + let actual = edit.write(&mut write).expect("no changes are fine"); + assert_eq!(actual, root_tree_id, "it rewrites the same tree"); + assert_eq!(odb.access_count_and_clear(), 0); + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "ff7e7d2aecae1c3fb15054b289a4c58aa65b8646 +├── bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── bin.d e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.to e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml.bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file +│ └── a e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +" + ); + assert_eq!( + num_writes_and_clear(), + 1, + "only the root is written - there is no change tracking" + ); + + let actual = edit + .upsert(["file", "hi"], EntryKind::Blob, gix_hash::Kind::Sha1.null())? + .write(&mut write) + .expect("effectively no changes are fine"); + assert_eq!( + actual, root_tree_id, + "null-ids are dropped automatically, they act as placeholders, ultimately the tree is not changed" + ); + assert_eq!( + storage.borrow().len(), + 2, + "it writes two trees, even though none is new" + ); + assert_eq!(num_writes_and_clear(), 2, "the write-count reflects that"); + + odb.access_count_and_clear(); + let actual = edit + .upsert(["a", "b", "c"], EntryKind::Blob, gix_hash::Kind::Sha1.null())? + .upsert(["a", "b", "d", "e"], EntryKind::Blob, gix_hash::Kind::Sha1.null())? + .write(&mut write) + .expect("effectively no changes are fine"); + assert_eq!( + actual, root_tree_id, + "null-ids are dropped automatically, recursively, and empty intermediate trees are removed as well" + ); + assert_eq!(storage.borrow().len(), 2, "still the same amount of trees"); + assert_eq!( + num_writes_and_clear(), + 1, + "but only the root-tree is written (with nulls pruned)" + ); + assert_eq!(odb.access_count_and_clear(), 0); + + odb.access_count_and_clear(); + let actual = edit + .upsert(["bin", "b"], EntryKind::Tree, empty_tree())? + .upsert(["bin", "b", "c"], EntryKind::Tree, empty_tree())? + .upsert(["a", "b", "d", "e"], EntryKind::Tree, empty_tree())? + .write(&mut write) + .expect("it's OK to write empty leaf-trees"); + assert_eq!( + odb.access_count_and_clear(), + 0, + "we write through blobs, and thus create trees on the fly" + ); + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "a57b69b5c216b8417d332e64a4f3014eb9962099 +├── a +│ └── b +│ └── d +│ └── e (empty) +├── bin.d e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── bin +│ └── b +│ └── c (empty) +├── file.to e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml.bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file +│ └── a e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +", + "one can write through trees and blobs, and empty leaf trees are also fine" + ); + assert_eq!( + num_writes_and_clear(), + 1 + 2 + 3, + "each changed tree is written: root, the two subtrees" + ); + + odb.access_count_and_clear(); + let actual = edit + .upsert(["a", "b", "c"], EntryKind::Blob, any_blob())? + .upsert(["a", "b"], EntryKind::Blob, any_blob())? + .upsert(["file"], EntryKind::BlobExecutable, any_blob())? + .write(&mut write)?; + assert_eq!(odb.access_count_and_clear(), 2, "`a` and `a/b`"); + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "25a0bd46bc61342e17f03fafcc029e9db52d4c64 +├── a +│ └── b bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +├── bin.d e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── bin +│ └── b +│ └── c (empty) +├── file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +├── file.to e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml.bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +", + "it properly sorts entries after type-changes" + ); + assert_eq!(num_writes_and_clear(), 1 + 1, "the root and one subtree"); + + odb.access_count_and_clear(); + let actual = edit + .set_root(root_tree) + .upsert(["file", "subdir", "exe"], EntryKind::BlobExecutable, any_blob())? + .write(&mut write)?; + assert_eq!( + odb.access_count_and_clear(), + 1, + "`file` only, everything else is inserted" + ); + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "6a5115bfb88b8303b837854199b90232621f8535 +├── bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── bin.d e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.to e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file.toml.bin e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file +│ ├── a e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +│ └── subdir +│ └── exe bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +└── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +", + "now the root is back to a well-known state" + ); + assert_eq!(num_writes_and_clear(), 1 + 2, "the root and one subtree with directory"); + Ok(()) +} + +mod utils { + use crate::hex_to_id; + use bstr::{BStr, ByteSlice}; + use gix_hash::ObjectId; + use gix_object::{Tree, WriteTo}; + use std::cell::{Cell, RefCell}; + use std::rc::Rc; + + type TreeStore = Rc>>; + pub(super) struct StorageOdb(TreeStore, Option, Cell); + + pub(super) fn tree_odb() -> gix_testtools::Result { + let root = gix_testtools::scripted_fixture_read_only("make_trees.sh")?; + Ok(gix_odb::at(root.join(".git/objects"))?) + } + + pub(super) fn find_tree(odb: &impl gix_object::FindExt, id: ObjectId) -> gix_testtools::Result { + let mut buf = Vec::new(); + Ok(odb.find_tree(&id, &mut buf)?.into()) + } + + pub(super) fn new_inmemory_writes() -> ( + TreeStore, + impl FnMut(&Tree) -> Result, + impl Fn() -> usize, + ) { + let store = TreeStore::default(); + let num_writes = Rc::new(Cell::new(0_usize)); + let write_tree = { + let store = store.clone(); + let num_writes = num_writes.clone(); + let mut buf = Vec::with_capacity(512); + move |tree: &Tree| { + buf.clear(); + tree.write_to(&mut buf) + .expect("write to memory can't fail and tree is valid"); + let header = gix_object::encode::loose_header(gix_object::Kind::Tree, buf.len() as u64); + let mut hasher = gix_features::hash::hasher(gix_hash::Kind::Sha1); + hasher.update(&header); + hasher.update(&buf); + let id = hasher.digest().into(); + store.borrow_mut().insert(id, tree.clone()); + let old = num_writes.get(); + num_writes.set(old + 1); + Ok(id) + } + }; + let obtain_num_writes = { + let c = num_writes.clone(); + move || { + let res = c.get(); + c.set(0); + res + } + }; + (store, write_tree, obtain_num_writes) + } + + impl StorageOdb { + pub fn new(storage: TreeStore) -> Self { + Self(storage, None, Cell::new(0)) + } + pub fn new_with_odb(storage: TreeStore, odb: gix_odb::Handle) -> Self { + Self(storage, Some(odb), Cell::new(0)) + } + pub fn access_count_and_clear(&self) -> usize { + let res = self.2.get(); + self.2.set(0); + res + } + } + + impl gix_object::Find for StorageOdb { + fn try_find<'a>( + &self, + id: &gix_hash::oid, + buffer: &'a mut Vec, + ) -> Result>, gix_object::find::Error> { + let borrow = self.0.borrow(); + let old = self.2.get(); + self.2.set(old + 1); + match borrow.get(id) { + None => self.1.as_ref().map_or(Ok(None), |odb| odb.try_find(id, buffer)), + Some(tree) => { + buffer.clear(); + tree.write_to(buffer).expect("valid trees can always be serialized"); + Ok(Some(gix_object::Data { + kind: gix_object::Kind::Tree, + data: &*buffer, + })) + } + } + } + } + + fn display_tree_recursive( + tree_id: ObjectId, + storage: &TreeStore, + odb: Option<&dyn gix_object::FindExt>, + name: Option<&BStr>, + buf: &mut Vec, + ) -> termtree::Tree { + let mut tree_storage = None; + let borrow = storage.borrow(); + let tree = borrow + .get(&tree_id) + .or_else(|| { + if tree_id.is_empty_tree() { + tree_storage = Some(Tree::default()); + tree_storage.as_ref() + } else { + odb.and_then(|odb| { + tree_storage = odb.find_tree(&tree_id, buf).map(Into::into).ok(); + tree_storage.as_ref() + }) + } + }) + .unwrap_or_else(|| panic!("tree {tree_id} is always present")); + + let mut termtree = termtree::Tree::new(if let Some(name) = name { + if tree.entries.is_empty() { + format!("{name} (empty)") + } else { + name.to_string() + } + } else { + tree_id.to_string() + }); + + for entry in &tree.entries { + if entry.mode.is_tree() { + termtree.push(display_tree_recursive( + entry.oid, + storage, + odb, + Some(entry.filename.as_bstr()), + buf, + )); + } else { + termtree.push(format!( + "{} {}.{}", + entry.filename, + entry.oid, + entry.mode.kind().as_octal_str() + )); + } + } + termtree + } + + pub(super) fn display_tree(tree_id: ObjectId, storage: &TreeStore) -> String { + let mut buf = Vec::new(); + display_tree_recursive(tree_id, storage, None, None, &mut buf).to_string() + } + + pub(super) fn display_tree_with_odb( + tree_id: ObjectId, + storage: &TreeStore, + odb: &impl gix_object::FindExt, + ) -> String { + let mut buf = Vec::new(); + display_tree_recursive(tree_id, storage, Some(odb), None, &mut buf).to_string() + } + + pub(super) fn empty_tree() -> ObjectId { + ObjectId::empty_tree(gix_hash::Kind::Sha1) + } + + pub(super) fn any_blob() -> ObjectId { + hex_to_id("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb") + } +} +use crate::hex_to_id; +use utils::{ + any_blob, display_tree, display_tree_with_odb, empty_tree, find_tree, new_inmemory_writes, tree_odb, StorageOdb, +}; diff --git a/gix-object/tests/tree/entries.rs b/gix-object/tests/tree/entries.rs new file mode 100644 index 00000000000..f60e342da9d --- /dev/null +++ b/gix-object/tests/tree/entries.rs @@ -0,0 +1,42 @@ +use gix_object::{Tree, TreeRef}; + +#[test] +fn sort_order_is_correct() -> crate::Result { + let root = gix_testtools::scripted_fixture_read_only("make_trees.sh")?; + let input = std::fs::read(root.join("tree.baseline"))?; + + let mut tree = TreeRef::from_bytes(&input)?; + let expected = tree.entries.clone(); + + tree.entries.sort(); + assert_eq!(tree.entries, expected); + let mut failures_when_searching_by_name = 0; + for entry in expected { + assert!( + tree.entries.binary_search_by(|e| e.cmp(&entry)).is_ok(), + "ordering works with binary search" + ); + failures_when_searching_by_name += usize::from( + tree.entries + .binary_search_by(|e| e.filename.cmp(entry.filename)) + .is_err(), + ); + assert_eq!( + tree.bisect_entry(entry.filename, entry.mode.is_tree()) + .expect("entry is present"), + entry + ); + } + + assert_ne!( + failures_when_searching_by_name, 0, + "it's not possible to do a binary search by name alone" + ); + + let mut tree: Tree = tree.into(); + let expected = tree.entries.clone(); + tree.entries.sort(); + + assert_eq!(tree.entries, expected); + Ok(()) +} diff --git a/gix-object/tests/tree/entry_mode.rs b/gix-object/tests/tree/entry_mode.rs new file mode 100644 index 00000000000..0fe4c17fc59 --- /dev/null +++ b/gix-object/tests/tree/entry_mode.rs @@ -0,0 +1,71 @@ +use gix_object::tree::{EntryKind, EntryMode}; + +#[test] +fn size_in_bytes() { + assert_eq!( + std::mem::size_of::(), + 2, + "it should not change without notice" + ); +} + +#[test] +fn is_methods() { + fn mode(kind: EntryKind) -> EntryMode { + kind.into() + } + + assert!(mode(EntryKind::Blob).is_blob()); + assert!(EntryMode(0o100645).is_blob()); + assert_eq!(EntryMode(0o100645).kind(), EntryKind::Blob); + assert!(!EntryMode(0o100675).is_executable()); + assert!(EntryMode(0o100700).is_executable()); + assert_eq!(EntryMode(0o100700).kind(), EntryKind::BlobExecutable); + assert!(!mode(EntryKind::Blob).is_link()); + assert!(mode(EntryKind::BlobExecutable).is_blob()); + assert!(mode(EntryKind::BlobExecutable).is_executable()); + assert!(mode(EntryKind::Blob).is_blob_or_symlink()); + assert!(mode(EntryKind::BlobExecutable).is_blob_or_symlink()); + + assert!(!mode(EntryKind::Link).is_blob()); + assert!(mode(EntryKind::Link).is_link()); + assert!(EntryMode(0o121234).is_link()); + assert_eq!(EntryMode(0o121234).kind(), EntryKind::Link); + assert!(mode(EntryKind::Link).is_blob_or_symlink()); + assert!(mode(EntryKind::Tree).is_tree()); + assert!(EntryMode(0o040101).is_tree()); + assert_eq!(EntryMode(0o040101).kind(), EntryKind::Tree); + assert!(mode(EntryKind::Commit).is_commit()); + assert!(EntryMode(0o167124).is_commit()); + assert_eq!(EntryMode(0o167124).kind(), EntryKind::Commit); + assert_eq!( + EntryMode(0o000000).kind(), + EntryKind::Commit, + "commit is really 'anything else' as `kind()` can't fail" + ); +} + +#[test] +fn as_bytes() { + let mut buf = Default::default(); + for (mode, expected) in [ + (EntryMode::from(EntryKind::Tree), EntryKind::Tree.as_octal_str()), + (EntryKind::Blob.into(), EntryKind::Blob.as_octal_str()), + ( + EntryKind::BlobExecutable.into(), + EntryKind::BlobExecutable.as_octal_str(), + ), + (EntryKind::Link.into(), EntryKind::Link.as_octal_str()), + (EntryKind::Commit.into(), EntryKind::Commit.as_octal_str()), + ( + EntryMode::try_from(b"100744 ".as_ref()).expect("valid"), + "100744".into(), + ), + ( + EntryMode::try_from(b"100644 ".as_ref()).expect("valid"), + "100644".into(), + ), + ] { + assert_eq!(mode.as_bytes(&mut buf), expected); + } +} diff --git a/gix-object/tests/tree/from_bytes.rs b/gix-object/tests/tree/from_bytes.rs new file mode 100644 index 00000000000..b0c62c905e0 --- /dev/null +++ b/gix-object/tests/tree/from_bytes.rs @@ -0,0 +1,98 @@ +use gix_object::{bstr::ByteSlice, tree, tree::EntryRef, TreeRef, TreeRefIter}; + +use crate::{fixture_name, hex_to_id}; + +#[test] +fn empty() -> crate::Result { + assert_eq!( + TreeRef::from_bytes(&[])?, + TreeRef { entries: vec![] }, + "empty trees are valid despite usually rare in the wild" + ); + Ok(()) +} + +#[test] +fn everything() -> crate::Result { + assert_eq!( + TreeRef::from_bytes(&fixture_name("tree", "everything.tree"))?, + TreeRef { + entries: vec![ + EntryRef { + mode: tree::EntryKind::BlobExecutable.into(), + filename: b"exe".as_bstr(), + oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + }, + EntryRef { + mode: tree::EntryKind::Blob.into(), + filename: b"file".as_bstr(), + oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + }, + EntryRef { + mode: tree::EntryKind::Commit.into(), + filename: b"grit-submodule".as_bstr(), + oid: &hex_to_id("b2d1b5d684bdfda5f922b466cc13d4ce2d635cf8") + }, + EntryRef { + mode: tree::EntryKind::Tree.into(), + filename: b"subdir".as_bstr(), + oid: &hex_to_id("4d5fcadc293a348e88f777dc0920f11e7d71441c") + }, + EntryRef { + mode: tree::EntryKind::Link.into(), + filename: b"symlink".as_bstr(), + oid: &hex_to_id("1a010b1c0f081b2e8901d55307a15c29ff30af0e") + } + ] + } + ); + Ok(()) +} + +#[test] +fn invalid() { + let fixture = fixture_name("tree", "definitely-special.tree"); + let partial_tree = &fixture[..fixture.len() / 2]; + let err = TreeRef::from_bytes(partial_tree).unwrap_err().to_string(); + if cfg!(feature = "verbose-object-parsing-errors") { + assert!(err.starts_with("object parsing failed at `100644"), "{err}"); + } else { + assert_eq!(err, "object parsing failed"); + } + assert_eq!( + TreeRefIter::from_bytes(partial_tree).take_while(Result::is_ok).count(), + 9, + "we can decode about half of it before failing" + ); +} + +#[test] +fn fuzzed() { + assert!(gix_object::TreeRef::from_bytes(b"2").is_err(), "fail, but don't crash"); +} + +#[test] +fn special_trees() -> crate::Result { + for (name, expected_entry_count) in [ + ("maybe-special", 160), + ("definitely-special", 19), + ("special-1", 5), + ("special-2", 18), + ("special-3", 5), + ("special-4", 18), + ("special-5", 17), + ] { + let fixture = fixture_name("tree", &format!("{name}.tree")); + assert_eq!( + TreeRef::from_bytes(&fixture)?.entries.len(), + expected_entry_count, + "{name}" + ); + assert_eq!( + TreeRefIter::from_bytes(&fixture).map(Result::unwrap).count(), + expected_entry_count, + "{name}" + ); + } + Ok(()) +} diff --git a/gix-object/tests/tree/iter.rs b/gix-object/tests/tree/iter.rs new file mode 100644 index 00000000000..6ba637b2546 --- /dev/null +++ b/gix-object/tests/tree/iter.rs @@ -0,0 +1,54 @@ +use gix_object::{bstr::ByteSlice, tree, tree::EntryRef, TreeRefIter}; + +use crate::{fixture_name, hex_to_id}; + +#[test] +fn empty() { + assert_eq!(TreeRefIter::from_bytes(&[]).count(), 0, "empty trees are definitely ok"); +} + +#[test] +fn error_handling() { + let data = fixture_name("tree", "everything.tree"); + let iter = TreeRefIter::from_bytes(&data[..data.len() / 2]); + let entries = iter.collect::>(); + assert!( + entries.last().expect("at least one token").is_err(), + "errors are propagated and none is returned from that point on" + ); +} + +#[test] +fn everything() -> crate::Result { + assert_eq!( + TreeRefIter::from_bytes(&fixture_name("tree", "everything.tree")).collect::, _>>()?, + vec![ + EntryRef { + mode: tree::EntryKind::BlobExecutable.into(), + filename: b"exe".as_bstr(), + oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + }, + EntryRef { + mode: tree::EntryKind::Blob.into(), + filename: b"file".as_bstr(), + oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") + }, + EntryRef { + mode: tree::EntryKind::Commit.into(), + filename: b"grit-submodule".as_bstr(), + oid: &hex_to_id("b2d1b5d684bdfda5f922b466cc13d4ce2d635cf8") + }, + EntryRef { + mode: tree::EntryKind::Tree.into(), + filename: b"subdir".as_bstr(), + oid: &hex_to_id("4d5fcadc293a348e88f777dc0920f11e7d71441c") + }, + EntryRef { + mode: tree::EntryKind::Link.into(), + filename: b"symlink".as_bstr(), + oid: &hex_to_id("1a010b1c0f081b2e8901d55307a15c29ff30af0e") + } + ] + ); + Ok(()) +} diff --git a/gix-object/tests/tree/mod.rs b/gix-object/tests/tree/mod.rs index bb1055c776a..1265f21b5d0 100644 --- a/gix-object/tests/tree/mod.rs +++ b/gix-object/tests/tree/mod.rs @@ -1,276 +1,5 @@ -mod iter { - use gix_object::{bstr::ByteSlice, tree, tree::EntryRef, TreeRefIter}; - - use crate::{fixture_name, hex_to_id}; - - #[test] - fn empty() { - assert_eq!(TreeRefIter::from_bytes(&[]).count(), 0, "empty trees are definitely ok"); - } - - #[test] - fn error_handling() { - let data = fixture_name("tree", "everything.tree"); - let iter = TreeRefIter::from_bytes(&data[..data.len() / 2]); - let entries = iter.collect::>(); - assert!( - entries.last().expect("at least one token").is_err(), - "errors are propagated and none is returned from that point on" - ); - } - - #[test] - fn everything() -> crate::Result { - assert_eq!( - TreeRefIter::from_bytes(&fixture_name("tree", "everything.tree")).collect::, _>>()?, - vec![ - EntryRef { - mode: tree::EntryKind::BlobExecutable.into(), - filename: b"exe".as_bstr(), - oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") - }, - EntryRef { - mode: tree::EntryKind::Blob.into(), - filename: b"file".as_bstr(), - oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") - }, - EntryRef { - mode: tree::EntryKind::Commit.into(), - filename: b"grit-submodule".as_bstr(), - oid: &hex_to_id("b2d1b5d684bdfda5f922b466cc13d4ce2d635cf8") - }, - EntryRef { - mode: tree::EntryKind::Tree.into(), - filename: b"subdir".as_bstr(), - oid: &hex_to_id("4d5fcadc293a348e88f777dc0920f11e7d71441c") - }, - EntryRef { - mode: tree::EntryKind::Link.into(), - filename: b"symlink".as_bstr(), - oid: &hex_to_id("1a010b1c0f081b2e8901d55307a15c29ff30af0e") - } - ] - ); - Ok(()) - } -} - -mod from_bytes { - use gix_object::{bstr::ByteSlice, tree, tree::EntryRef, TreeRef, TreeRefIter}; - - use crate::{fixture_name, hex_to_id}; - - #[test] - fn empty() -> crate::Result { - assert_eq!( - TreeRef::from_bytes(&[])?, - TreeRef { entries: vec![] }, - "empty trees are valid despite usually rare in the wild" - ); - Ok(()) - } - - #[test] - fn everything() -> crate::Result { - assert_eq!( - TreeRef::from_bytes(&fixture_name("tree", "everything.tree"))?, - TreeRef { - entries: vec![ - EntryRef { - mode: tree::EntryKind::BlobExecutable.into(), - filename: b"exe".as_bstr(), - oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") - }, - EntryRef { - mode: tree::EntryKind::Blob.into(), - filename: b"file".as_bstr(), - oid: &hex_to_id("e69de29bb2d1d6434b8b29ae775ad8c2e48c5391") - }, - EntryRef { - mode: tree::EntryKind::Commit.into(), - filename: b"grit-submodule".as_bstr(), - oid: &hex_to_id("b2d1b5d684bdfda5f922b466cc13d4ce2d635cf8") - }, - EntryRef { - mode: tree::EntryKind::Tree.into(), - filename: b"subdir".as_bstr(), - oid: &hex_to_id("4d5fcadc293a348e88f777dc0920f11e7d71441c") - }, - EntryRef { - mode: tree::EntryKind::Link.into(), - filename: b"symlink".as_bstr(), - oid: &hex_to_id("1a010b1c0f081b2e8901d55307a15c29ff30af0e") - } - ] - } - ); - Ok(()) - } - - #[test] - fn invalid() { - let fixture = fixture_name("tree", "definitely-special.tree"); - let partial_tree = &fixture[..fixture.len() / 2]; - let err = TreeRef::from_bytes(partial_tree).unwrap_err().to_string(); - if cfg!(feature = "verbose-object-parsing-errors") { - assert!(err.starts_with("object parsing failed at `100644"), "{err}"); - } else { - assert_eq!(err, "object parsing failed"); - } - assert_eq!( - TreeRefIter::from_bytes(partial_tree).take_while(Result::is_ok).count(), - 9, - "we can decode about half of it before failing" - ); - } - - #[test] - fn fuzzed() { - assert!(gix_object::TreeRef::from_bytes(b"2").is_err(), "fail, but don't crash"); - } - - #[test] - fn special_trees() -> crate::Result { - for (name, expected_entry_count) in [ - ("maybe-special", 160), - ("definitely-special", 19), - ("special-1", 5), - ("special-2", 18), - ("special-3", 5), - ("special-4", 18), - ("special-5", 17), - ] { - let fixture = fixture_name("tree", &format!("{name}.tree")); - assert_eq!( - TreeRef::from_bytes(&fixture)?.entries.len(), - expected_entry_count, - "{name}" - ); - assert_eq!( - TreeRefIter::from_bytes(&fixture).map(Result::unwrap).count(), - expected_entry_count, - "{name}" - ); - } - Ok(()) - } -} - -mod entries { - use gix_object::{Tree, TreeRef}; - - #[test] - fn sort_order_is_correct() -> crate::Result { - let root = gix_testtools::scripted_fixture_read_only("make_trees.sh")?; - let input = std::fs::read(root.join("tree.baseline"))?; - - let mut tree = TreeRef::from_bytes(&input)?; - let expected = tree.entries.clone(); - - tree.entries.sort(); - assert_eq!(tree.entries, expected); - let mut failures_when_searching_by_name = 0; - for entry in expected { - assert!( - tree.entries.binary_search_by(|e| e.cmp(&entry)).is_ok(), - "ordering works with binary search" - ); - failures_when_searching_by_name += usize::from( - tree.entries - .binary_search_by(|e| e.filename.cmp(entry.filename)) - .is_err(), - ); - assert_eq!( - tree.bisect_entry(entry.filename, entry.mode.is_tree()) - .expect("entry is present"), - entry - ); - } - - assert_ne!( - failures_when_searching_by_name, 0, - "it's not possible to do a binary search by name alone" - ); - - let mut tree: Tree = tree.into(); - let expected = tree.entries.clone(); - tree.entries.sort(); - - assert_eq!(tree.entries, expected); - Ok(()) - } -} - -mod entry_mode { - use gix_object::tree::{EntryKind, EntryMode}; - - #[test] - fn size_in_bytes() { - assert_eq!( - std::mem::size_of::(), - 2, - "it should not change without notice" - ); - } - - #[test] - fn is_methods() { - fn mode(kind: EntryKind) -> EntryMode { - kind.into() - } - - assert!(mode(EntryKind::Blob).is_blob()); - assert!(EntryMode(0o100645).is_blob()); - assert_eq!(EntryMode(0o100645).kind(), EntryKind::Blob); - assert!(!EntryMode(0o100675).is_executable()); - assert!(EntryMode(0o100700).is_executable()); - assert_eq!(EntryMode(0o100700).kind(), EntryKind::BlobExecutable); - assert!(!mode(EntryKind::Blob).is_link()); - assert!(mode(EntryKind::BlobExecutable).is_blob()); - assert!(mode(EntryKind::BlobExecutable).is_executable()); - assert!(mode(EntryKind::Blob).is_blob_or_symlink()); - assert!(mode(EntryKind::BlobExecutable).is_blob_or_symlink()); - - assert!(!mode(EntryKind::Link).is_blob()); - assert!(mode(EntryKind::Link).is_link()); - assert!(EntryMode(0o121234).is_link()); - assert_eq!(EntryMode(0o121234).kind(), EntryKind::Link); - assert!(mode(EntryKind::Link).is_blob_or_symlink()); - assert!(mode(EntryKind::Tree).is_tree()); - assert!(EntryMode(0o040101).is_tree()); - assert_eq!(EntryMode(0o040101).kind(), EntryKind::Tree); - assert!(mode(EntryKind::Commit).is_commit()); - assert!(EntryMode(0o167124).is_commit()); - assert_eq!(EntryMode(0o167124).kind(), EntryKind::Commit); - assert_eq!( - EntryMode(0o000000).kind(), - EntryKind::Commit, - "commit is really 'anything else' as `kind()` can't fail" - ); - } - - #[test] - fn as_bytes() { - let mut buf = Default::default(); - for (mode, expected) in [ - (EntryMode::from(EntryKind::Tree), EntryKind::Tree.as_octal_str()), - (EntryKind::Blob.into(), EntryKind::Blob.as_octal_str()), - ( - EntryKind::BlobExecutable.into(), - EntryKind::BlobExecutable.as_octal_str(), - ), - (EntryKind::Link.into(), EntryKind::Link.as_octal_str()), - (EntryKind::Commit.into(), EntryKind::Commit.as_octal_str()), - ( - EntryMode::try_from(b"100744 ".as_ref()).expect("valid"), - "100744".into(), - ), - ( - EntryMode::try_from(b"100644 ".as_ref()).expect("valid"), - "100644".into(), - ), - ] { - assert_eq!(mode.as_bytes(&mut buf), expected); - } - } -} +mod editor; +mod entries; +mod entry_mode; +mod from_bytes; +mod iter; From 91fa4786df7fe25f2da3d78ef0fc14d53b69b846 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 3 Sep 2024 17:26:38 +0200 Subject: [PATCH 09/17] Add `tree::Editor::cursor()` to allow speedier creation of sub-trees. --- gix-object/src/tree/editor.rs | 251 +++++++++++++++++++++----------- gix-object/src/tree/mod.rs | 42 +++++- gix-object/tests/tree/editor.rs | 190 +++++++++++++++++++++++- 3 files changed, 392 insertions(+), 91 deletions(-) diff --git a/gix-object/src/tree/editor.rs b/gix-object/src/tree/editor.rs index b17b04bad14..74e889ff8ca 100644 --- a/gix-object/src/tree/editor.rs +++ b/gix-object/src/tree/editor.rs @@ -1,35 +1,17 @@ -use crate::tree::EntryKind; +use crate::tree::{Editor, EntryKind}; use crate::{tree, Tree}; use bstr::{BStr, BString, ByteSlice, ByteVec}; use gix_hash::ObjectId; use gix_hashtable::hash_map::Entry; use std::cmp::Ordering; -/// The state needed to apply edits instantly to in-memory trees. -/// -/// It's made so that each tree is looked at in the object database at most once, and held in memory for -/// all edits until everything is flushed to write all changed trees. -/// -/// The editor is optimized to edit existing trees, but can deal with building entirely new trees as well -/// with some penalties. -/// -/// ### Note -/// -/// For reasons of efficiency, internally a SHA1 based hashmap is used to avoid having to store full paths -/// to each edited tree. The chance of collision is low, but could be engineered to overwrite or write into -/// an unintended tree. -#[doc(alias = "TreeUpdateBuilder", alias = "git2")] -pub struct Editor<'a> { - /// A way to lookup trees. - find: &'a dyn crate::FindExt, - /// All trees we currently hold in memory. Each of these may change while adding and removing entries. - /// null-object-ids mark tree-entries whose value we don't know yet, they are placeholders that will be - /// dropped when writing at the latest. - trees: gix_hashtable::HashMap, - /// A buffer to build up paths when finding the tree to edit. - path_buf: BString, - /// Our buffer for storing tree-data in, right before decoding it. - tree_buf: Vec, +/// A way to constrain all [tree-edits](Editor) to a given subtree. +pub struct Cursor<'a, 'find> { + /// The underlying editor + parent: &'a mut Editor<'find>, + /// Our own location, used as prefix for all operations. + /// Note that it's assumed to always contain a tree. + prefix: BString, } /// Lifecycle @@ -37,9 +19,11 @@ impl<'a> Editor<'a> { /// Create a new editor that uses `root` as base for all edits. Use `find` to lookup existing /// trees when edits are made. Each tree will only be looked-up once and then edited in place from /// that point on. - pub fn new(root: Tree, find: &'a dyn crate::FindExt) -> Self { + /// `object_hash` denotes the kind of hash to create. + pub fn new(root: Tree, find: &'a dyn crate::FindExt, object_hash: gix_hash::Kind) -> Self { Editor { find, + object_hash, trees: gix_hashtable::HashMap::from_iter(Some((empty_path_hash(), root))), path_buf: Vec::with_capacity(256).into(), tree_buf: Vec::with_capacity(512), @@ -60,15 +44,63 @@ impl<'a> Editor<'a> { /// Future calls to [`upsert`](Self::upsert) or similar will keep working on the last seen state of the /// just-written root-tree. /// If this is not desired, use [set_root()](Self::set_root()). - pub fn write(&mut self, mut out: impl FnMut(&Tree) -> Result) -> Result { + pub fn write(&mut self, out: impl FnMut(&Tree) -> Result) -> Result { + self.path_buf.clear(); + self.write_at_pathbuf(out, WriteMode::Normal) + } + + /// Remove the entry at `rela_path`, loading all trees on the path accordingly. + /// It's no error if the entry doesn't exist, or if `rela_path` doesn't lead to an existing entry at all. + pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + self.path_buf.clear(); + self.upsert_or_remove_at_pathbuf(rela_path, None) + } + + /// Insert a new entry of `kind` with `id` at `rela_path`, an iterator over each path component in the tree, + /// like `a/b/c`. Names are matched case-sensitively. + /// + /// Existing leaf-entries will be overwritten unconditionally, and it is assumed that `id` is available in the object database + /// or will be made available at a later point to assure the integrity of the produced tree. + /// + /// Intermediate trees will be created if they don't exist in the object database, otherwise they will be loaded and entries + /// will be inserted into them instead. + /// + /// Note that `id` can be [null](ObjectId::null()) to create a placeholder. These will not be written, and paths leading + /// through them will not be considered a problem. + /// + /// `id` can also be an empty tree, along with [the respective `kind`](EntryKind::Tree), even though that's normally not allowed + /// in Git trees. + pub fn upsert( + &mut self, + rela_path: I, + kind: EntryKind, + id: ObjectId, + ) -> Result<&mut Self, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + self.path_buf.clear(); + self.upsert_or_remove_at_pathbuf(rela_path, Some((kind, id, UpsertMode::Normal))) + } + + fn write_at_pathbuf( + &mut self, + mut out: impl FnMut(&Tree) -> Result, + mode: WriteMode, + ) -> Result { assert_ne!(self.trees.len(), 0, "there is at least the root tree"); // back is for children, front is for parents. let mut parents = vec![( None::, - BString::default(), + self.path_buf.clone(), self.trees - .remove(&empty_path_hash()) + .remove(&path_hash(&self.path_buf)) .expect("root tree is always present"), )]; let mut children = Vec::new(); @@ -102,12 +134,17 @@ impl<'a> Editor<'a> { } } else if parents.is_empty() { debug_assert!(children.is_empty(), "we consume children before parents"); - debug_assert!(rela_path.is_empty(), "this should always be the root tree"); + debug_assert_eq!(rela_path, self.path_buf, "this should always be the root tree"); // There may be left-over trees if they are replaced with blobs for example. let root_tree_id = out(&tree)?; - self.trees.clear(); - self.trees.insert(empty_path_hash(), tree); + match mode { + WriteMode::Normal => { + self.trees.clear(); + } + WriteMode::FromCursor => {} + } + self.trees.insert(path_hash(&self.path_buf), tree); return Ok(root_tree_id); } else if !tree.entries.is_empty() { out(&tree)?; @@ -120,56 +157,21 @@ impl<'a> Editor<'a> { unreachable!("we exit as soon as everything is consumed") } - /// Remove the entry at `rela_path`, loading all trees on the path accordingly. - /// It's no error if the entry doesn't exist, or if `rela_path` doesn't lead to an existing entry at all. - pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, crate::find::existing_object::Error> - where - I: IntoIterator, - C: AsRef, - { - self.upsert_or_remove(rela_path, None) - } - - /// Insert a new entry of `kind` with `id` at `rela_path`, an iterator over each path component in the tree, - /// like `a/b/c`. Names are matched case-sensitively. - /// - /// Existing leaf-entries will be overwritten unconditionally, and it is assumed that `id` is available in the object database - /// or will be made available at a later point to assure the integrity of the produced tree. - /// - /// Intermediate trees will be created if they don't exist in the object database, otherwise they will be loaded and entries - /// will be inserted into them instead. - /// - /// Note that `id` can be [null](ObjectId::null()) to create a placeholder. These will not be written, and paths leading - /// through them will not be considered a problem. - /// - /// `id` can also be an empty tree, along with [the respective `kind`](EntryKind::Tree), even though that's normally not allowed - /// in Git trees. - pub fn upsert( + fn upsert_or_remove_at_pathbuf( &mut self, rela_path: I, - kind: EntryKind, - id: ObjectId, + kind_and_id: Option<(EntryKind, ObjectId, UpsertMode)>, ) -> Result<&mut Self, crate::find::existing_object::Error> where I: IntoIterator, C: AsRef, { - self.upsert_or_remove(rela_path, Some((kind, id))) - } - - fn upsert_or_remove( - &mut self, - rela_path: I, - kind_and_id: Option<(EntryKind, ObjectId)>, - ) -> Result<&mut Self, crate::find::existing_object::Error> - where - I: IntoIterator, - C: AsRef, - { - let mut cursor = self.trees.get_mut(&empty_path_hash()).expect("root is always present"); - self.path_buf.clear(); + let mut cursor = self + .trees + .get_mut(&path_hash(&self.path_buf)) + .expect("root is always present"); let mut rela_path = rela_path.into_iter().peekable(); - let new_kind_is_tree = kind_and_id.map_or(false, |(kind, _)| kind == EntryKind::Tree); + let new_kind_is_tree = kind_and_id.map_or(false, |(kind, _, _)| kind == EntryKind::Tree); while let Some(name) = rela_path.next() { let name = name.as_ref(); let is_last = rela_path.peek().is_none(); @@ -206,7 +208,7 @@ impl<'a> Editor<'a> { } } } - Some((kind, id)) => { + Some((kind, id, _mode)) => { let entry = &mut cursor.entries[idx]; if is_last { // unconditionally overwrite what's there. @@ -229,7 +231,7 @@ impl<'a> Editor<'a> { } Err(insertion_idx) => match kind_and_id { None => break, - Some((kind, id)) => { + Some((kind, id, _mode)) => { cursor.entries.insert( insertion_idx, tree::Entry { @@ -238,9 +240,6 @@ impl<'a> Editor<'a> { oid: if is_last { id } else { id.kind().null() }, }, ); - if is_last { - break; - } None } }, @@ -248,7 +247,7 @@ impl<'a> Editor<'a> { if needs_sorting { cursor.entries.sort(); } - if is_last { + if is_last && kind_and_id.map_or(false, |(_, _, mode)| mode == UpsertMode::Normal) { break; } push_path_component(&mut self.path_buf, name); @@ -279,6 +278,96 @@ impl<'a> Editor<'a> { } } +mod cursor { + use crate::tree::editor::{Cursor, UpsertMode, WriteMode}; + use crate::tree::{Editor, EntryKind}; + use crate::Tree; + use bstr::{BStr, BString}; + use gix_hash::ObjectId; + + /// Cursor handling + impl<'a> Editor<'a> { + /// Turn ourselves as a cursor, which points to the same tree as the editor. + /// + /// This is useful if a method takes a [`Cursor`], not an [`Editor`]. + pub fn to_cursor(&mut self) -> Cursor<'_, 'a> { + Cursor { + parent: self, + prefix: BString::default(), + } + } + + /// Create a cursor at the given `rela_path`, which must be a tree or is turned into a tree as its own edit. + /// + /// The returned cursor will then allow applying edits to the tree at `rela_path` as root. + /// If `rela_path` is a single empty string, it is equivalent to using the current instance itself. + pub fn cursor_at(&mut self, rela_path: I) -> Result, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + self.path_buf.clear(); + self.upsert_or_remove_at_pathbuf( + rela_path, + Some((EntryKind::Tree, self.object_hash.null(), UpsertMode::AssureTreeOnly)), + )?; + Ok(Cursor { + prefix: self.path_buf.clone(), /* set during the upsert call */ + parent: self, + }) + } + } + + impl<'a, 'find> Cursor<'a, 'find> { + /// Like [`Editor::upsert()`], but with the constraint of only editing in this cursor's tree. + pub fn upsert( + &mut self, + rela_path: I, + kind: EntryKind, + id: ObjectId, + ) -> Result<&mut Self, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + self.parent.path_buf.clone_from(&self.prefix); + self.parent + .upsert_or_remove_at_pathbuf(rela_path, Some((kind, id, UpsertMode::Normal)))?; + Ok(self) + } + + /// Like [`Editor::remove()`], but with the constraint of only editing in this cursor's tree. + pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, crate::find::existing_object::Error> + where + I: IntoIterator, + C: AsRef, + { + self.parent.path_buf.clone_from(&self.prefix); + self.parent.upsert_or_remove_at_pathbuf(rela_path, None)?; + Ok(self) + } + + /// Like [`Editor::write()`], but will write only the subtree of the cursor. + pub fn write(&mut self, out: impl FnMut(&Tree) -> Result) -> Result { + self.parent.path_buf.clone_from(&self.prefix); + self.parent.write_at_pathbuf(out, WriteMode::FromCursor) + } + } +} + +#[derive(Copy, Clone, Eq, PartialEq)] +enum UpsertMode { + Normal, + /// Only make sure there is a tree at the given location (requires kind tree and null-id) + AssureTreeOnly, +} + +enum WriteMode { + Normal, + /// Perform less cleanup to assure parent-editor still stays intact + FromCursor, +} + fn cmp_entry_with_name(a: &tree::Entry, filename: &BStr, is_tree: bool) -> Ordering { let common = a.filename.len().min(filename.len()); a.filename[..common].cmp(&filename[..common]).then_with(|| { diff --git a/gix-object/src/tree/mod.rs b/gix-object/src/tree/mod.rs index 5b5e7463a9f..d916f6393da 100644 --- a/gix-object/src/tree/mod.rs +++ b/gix-object/src/tree/mod.rs @@ -1,17 +1,47 @@ -use std::cmp::Ordering; - use crate::{ bstr::{BStr, BString}, - tree, + tree, Tree, }; +use gix_hash::ObjectId; +use std::cmp::Ordering; -mod editor; -pub use editor::Editor; +/// +pub mod editor; mod ref_iter; /// pub mod write; +/// The state needed to apply edits instantly to in-memory trees. +/// +/// It's made so that each tree is looked at in the object database at most once, and held in memory for +/// all edits until everything is flushed to write all changed trees. +/// +/// The editor is optimized to edit existing trees, but can deal with building entirely new trees as well +/// with some penalties. +/// +/// ### Note +/// +/// For reasons of efficiency, internally a SHA1 based hashmap is used to avoid having to store full paths +/// to each edited tree. The chance of collision is low, but could be engineered to overwrite or write into +/// an unintended tree. +#[doc(alias = "TreeUpdateBuilder", alias = "git2")] +#[derive(Clone)] +pub struct Editor<'a> { + /// A way to lookup trees. + find: &'a dyn crate::FindExt, + /// The kind of hashes to produce + object_hash: gix_hash::Kind, + /// All trees we currently hold in memory. Each of these may change while adding and removing entries. + /// null-object-ids mark tree-entries whose value we don't know yet, they are placeholders that will be + /// dropped when writing at the latest. + trees: gix_hashtable::HashMap, + /// A buffer to build up paths when finding the tree to edit. + path_buf: BString, + /// Our buffer for storing tree-data in, right before decoding it. + tree_buf: Vec, +} + /// The mode of items storable in a tree, similar to the file mode on a unix file system. /// /// Used in [`mutable::Entry`][crate::tree::Entry] and [`EntryRef`]. @@ -204,7 +234,7 @@ impl<'a> Ord for EntryRef<'a> { } } -/// An entry in a [`Tree`][crate::Tree], similar to an entry in a directory. +/// An entry in a [`Tree`], similar to an entry in a directory. #[derive(PartialEq, Eq, Debug, Hash, Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Entry { diff --git a/gix-object/tests/tree/editor.rs b/gix-object/tests/tree/editor.rs index e752085decc..8df5384050e 100644 --- a/gix-object/tests/tree/editor.rs +++ b/gix-object/tests/tree/editor.rs @@ -1,11 +1,193 @@ use gix_object::tree::EntryKind; use gix_object::Tree; +#[test] +fn from_empty_cursor() -> crate::Result { + let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); + let odb = StorageOdb::new(storage.clone()); + let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb, gix_hash::Kind::Sha1); + + edit.upsert(Some("root-file"), EntryKind::Blob, any_blob())?.upsert( + ["nested", "from", "root"], + EntryKind::BlobExecutable, + any_blob(), + )?; + let cursor_path = ["some", "deeply", "nested", "path"]; + let mut cursor = edit.cursor_at(cursor_path)?; + let actual = cursor + .upsert(Some("file"), EntryKind::Blob, any_blob())? + .upsert(Some("empty-dir-via-cursor"), EntryKind::Tree, empty_tree())? + .upsert(["with-subdir", "dir", "file"], EntryKind::Blob, any_blob())? + .upsert(["with-subdir2", "dir", "file"], EntryKind::Blob, any_blob())? + .remove(Some("file"))? + .remove(["with-subdir", "dir", "file"])? + .remove(Some("with-subdir2"))? + .remove(Some("with-subdir2"))? + .write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "e2339a3f62e2f3fc54a406739a62a4173ee3b5ac +└── empty-dir-via-cursor (empty) +", + "only one item is left in the tree, which also keeps it alive" + ); + assert_eq!(num_writes_and_clear(), 1, "root tree"); + + let actual = edit.write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "76e0729de84047d19711d90cfcbb4e60bb432682 +├── nested +│ └── from +│ └── root bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +├── root-file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +└── some + └── deeply + └── nested + └── path + └── empty-dir-via-cursor (empty) +" + ); + + let mut cursor = edit.cursor_at(cursor_path)?; + let actual = cursor.remove(Some("empty-dir-via-cursor"))?.write(&mut write)?; + assert_eq!(actual, empty_tree(), "it keeps the empty tree like the editor would"); + + let actual = edit.write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "6cc592046dcaac06d3c619b4892d9ac78738fb5d +├── nested +│ └── from +│ └── root bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +└── root-file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +", + "now the editor naturally prunes all empty trees thus far, removing the cursor root" + ); + + let mut cursor = edit.cursor_at(cursor_path)?; + let actual = cursor + .upsert(Some("root-file"), EntryKind::BlobExecutable, any_blob())? + .upsert(["nested", "from"], EntryKind::BlobExecutable, any_blob())? + .write(&mut write)?; + + assert_eq!( + display_tree(actual, &storage), + "4580ae6d4c22b407cee521d7575e69708ff980a1 +├── nested +│ └── from bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +└── root-file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +", + "it is able to write the sub-tree, even though names from the top-level tree are re-used" + ); + + let actual = edit.write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "8febb45a1c34e405d70a7ae059d57abdd8254063 +├── nested +│ └── from +│ └── root bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +├── root-file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +└── some + └── deeply + └── nested + └── path + ├── nested + │ └── from bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 + └── root-file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100755 +", + "it places the subtree exactly where it's expected" + ); + Ok(()) +} + +#[test] +fn from_existing_cursor() -> crate::Result { + let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); + let odb = StorageOdb::new_with_odb(storage.clone(), tree_odb()?); + let root_tree_id = hex_to_id("ff7e7d2aecae1c3fb15054b289a4c58aa65b8646"); + let root_tree = find_tree(&odb, root_tree_id)?; + odb.access_count_and_clear(); + let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb, gix_hash::Kind::Sha1); + + let mut cursor = edit.cursor_at(Some(""))?; + let actual = cursor + .remove(Some("bin"))? + .remove(Some("bin.d"))? + .remove(Some("file.to"))? + .remove(Some("file.toml"))? + .remove(Some("file.toml.bin"))? + .upsert(["some", "nested", "file"], EntryKind::Blob, any_blob())? + .write(&mut write)?; + assert_eq!( + num_writes_and_clear(), + 1 + 2, + "just the altered root tree, and two of trees towards `some/tested/file`" + ); + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "10364deb76aeee372eb486c1216dca2a98dbd379 +├── file +│ └── a e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── some + └── nested + └── file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +", + "a cursor at '' is equivalent to 'as_cursor()', or the editor itself" + ); + let mut cursor = edit.cursor_at(["some", "nested"])?; + let actual = cursor + .upsert(Some("hello-from-cursor"), EntryKind::Blob, any_blob())? + .remove(Some("file"))? + .write(&mut write)?; + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "0f090b7c09c94f7895d0d8ce63c1da7693c026b3 +└── hello-from-cursor bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +" + ); + + let mut cursor = edit.set_root(root_tree).to_cursor(); + let actual = cursor + .remove(Some("bin"))? + .remove(Some("bin.d"))? + .remove(Some("file.to"))? + .remove(Some("file.toml"))? + .remove(Some("file.toml.bin"))? + .upsert(["some", "nested", "file"], EntryKind::Blob, any_blob())? + .write(&mut write)?; + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "10364deb76aeee372eb486c1216dca2a98dbd379 +├── file +│ └── a e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +├── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── some + └── nested + └── file bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +", + "this cursor is the same as the editor" + ); + let actual = cursor.remove(["some", "nested", "file"])?.write(&mut write)?; + assert_eq!( + display_tree_with_odb(actual, &storage, &odb), + "6e2806ab1e4d4ae2c9d24ce113a9bb54f8eff97b +├── file +│ └── a e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +└── file0 e69de29bb2d1d6434b8b29ae775ad8c2e48c5391.100644 +", + "it's possible to delete a deeply nested file" + ); + Ok(()) +} + #[test] fn from_empty_removal() -> crate::Result { let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); let odb = StorageOdb::new(storage.clone()); - let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb); + let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb, gix_hash::Kind::Sha1); let actual = edit .remove(Some("non-existing"))? @@ -95,7 +277,7 @@ fn from_existing_remove() -> crate::Result { let root_tree_id = hex_to_id("ff7e7d2aecae1c3fb15054b289a4c58aa65b8646"); let root_tree = find_tree(&odb, root_tree_id)?; odb.access_count_and_clear(); - let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb); + let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb, gix_hash::Kind::Sha1); let actual = edit .remove(["file"])? @@ -158,7 +340,7 @@ fn from_existing_remove() -> crate::Result { fn from_empty_add() -> crate::Result { let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); let odb = StorageOdb::new(storage.clone()); - let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb); + let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb, gix_hash::Kind::Sha1); let actual = edit.write(&mut write).expect("no changes are fine"); assert_eq!(actual, empty_tree(), "empty stays empty"); @@ -310,7 +492,7 @@ fn from_existing_add() -> crate::Result { let root_tree_id = hex_to_id("ff7e7d2aecae1c3fb15054b289a4c58aa65b8646"); let root_tree = find_tree(&odb, root_tree_id)?; odb.access_count_and_clear(); - let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb); + let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb, gix_hash::Kind::Sha1); let actual = edit.write(&mut write).expect("no changes are fine"); assert_eq!(actual, root_tree_id, "it rewrites the same tree"); From a085b7e435317f566f6763f4155243ba3309cd82 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 3 Sep 2024 21:08:03 +0200 Subject: [PATCH 10/17] fix!: `Tree|TreeRef::write_to()` will now assure the most basic sanity of entry names. Previously, it would allow null-bytes in the name which would corrupt the written tree. Now this is forbidden. For some reason, it disallowed newlines, but that is now allowed as validation is should be handled on a higher level. --- gix-object/src/tree/editor.rs | 87 +++++++++++++++++++++-------- gix-object/src/tree/write.rs | 12 ++-- gix-object/tests/encode/mod.rs | 35 ++++++++++++ gix-object/tests/tree/editor.rs | 57 +++++++++++++++++-- gix-object/tests/tree/from_bytes.rs | 24 +++++--- 5 files changed, 173 insertions(+), 42 deletions(-) diff --git a/gix-object/src/tree/editor.rs b/gix-object/src/tree/editor.rs index 74e889ff8ca..767997433ea 100644 --- a/gix-object/src/tree/editor.rs +++ b/gix-object/src/tree/editor.rs @@ -4,6 +4,7 @@ use bstr::{BStr, BString, ByteSlice, ByteVec}; use gix_hash::ObjectId; use gix_hashtable::hash_map::Entry; use std::cmp::Ordering; +use std::fmt::Formatter; /// A way to constrain all [tree-edits](Editor) to a given subtree. pub struct Cursor<'a, 'find> { @@ -14,6 +15,26 @@ pub struct Cursor<'a, 'find> { prefix: BString, } +impl std::fmt::Debug for Editor<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Editor") + .field("object_hash", &self.object_hash) + .field("path_buf", &self.path_buf) + .field("trees", &self.trees) + .finish() + } +} + +/// The error returned by [Editor] or [Cursor] edit operation. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error("Empty path components are not allowed")] + EmptyPathComponent, + #[error(transparent)] + FindExistingObject(#[from] crate::find::existing_object::Error), +} + /// Lifecycle impl<'a> Editor<'a> { /// Create a new editor that uses `root` as base for all edits. Use `find` to lookup existing @@ -44,6 +65,12 @@ impl<'a> Editor<'a> { /// Future calls to [`upsert`](Self::upsert) or similar will keep working on the last seen state of the /// just-written root-tree. /// If this is not desired, use [set_root()](Self::set_root()). + /// + /// ### Validation + /// + /// Note that no additional validation is performed to assure correctness of entry-names. + /// It is absolutely and intentionally possible to write out invalid trees with this method. + /// Higher layers are expected to perform detailed validation. pub fn write(&mut self, out: impl FnMut(&Tree) -> Result) -> Result { self.path_buf.clear(); self.write_at_pathbuf(out, WriteMode::Normal) @@ -51,7 +78,9 @@ impl<'a> Editor<'a> { /// Remove the entry at `rela_path`, loading all trees on the path accordingly. /// It's no error if the entry doesn't exist, or if `rela_path` doesn't lead to an existing entry at all. - pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, crate::find::existing_object::Error> + /// + /// Note that trying to remove a path with an empty component is also forbidden. + pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, Error> where I: IntoIterator, C: AsRef, @@ -74,12 +103,7 @@ impl<'a> Editor<'a> { /// /// `id` can also be an empty tree, along with [the respective `kind`](EntryKind::Tree), even though that's normally not allowed /// in Git trees. - pub fn upsert( - &mut self, - rela_path: I, - kind: EntryKind, - id: ObjectId, - ) -> Result<&mut Self, crate::find::existing_object::Error> + pub fn upsert(&mut self, rela_path: I, kind: EntryKind, id: ObjectId) -> Result<&mut Self, Error> where I: IntoIterator, C: AsRef, @@ -130,22 +154,39 @@ impl<'a> Editor<'a> { if tree.entries.is_empty() { parent_to_adjust.entries.remove(entry_idx); } else { - parent_to_adjust.entries[entry_idx].oid = out(&tree)?; + match out(&tree) { + Ok(id) => { + parent_to_adjust.entries[entry_idx].oid = id; + } + Err(err) => { + let root_tree = parents.into_iter().next().expect("root wasn't consumed yet"); + self.trees.insert(path_hash(&root_tree.1), root_tree.2); + return Err(err); + } + } } } else if parents.is_empty() { debug_assert!(children.is_empty(), "we consume children before parents"); debug_assert_eq!(rela_path, self.path_buf, "this should always be the root tree"); // There may be left-over trees if they are replaced with blobs for example. - let root_tree_id = out(&tree)?; - match mode { - WriteMode::Normal => { - self.trees.clear(); + match out(&tree) { + Ok(id) => { + let root_tree_id = id; + match mode { + WriteMode::Normal => { + self.trees.clear(); + } + WriteMode::FromCursor => {} + } + self.trees.insert(path_hash(&rela_path), tree); + return Ok(root_tree_id); + } + Err(err) => { + self.trees.insert(path_hash(&rela_path), tree); + return Err(err); } - WriteMode::FromCursor => {} } - self.trees.insert(path_hash(&self.path_buf), tree); - return Ok(root_tree_id); } else if !tree.entries.is_empty() { out(&tree)?; } @@ -161,7 +202,7 @@ impl<'a> Editor<'a> { &mut self, rela_path: I, kind_and_id: Option<(EntryKind, ObjectId, UpsertMode)>, - ) -> Result<&mut Self, crate::find::existing_object::Error> + ) -> Result<&mut Self, Error> where I: IntoIterator, C: AsRef, @@ -174,6 +215,9 @@ impl<'a> Editor<'a> { let new_kind_is_tree = kind_and_id.map_or(false, |(kind, _, _)| kind == EntryKind::Tree); while let Some(name) = rela_path.next() { let name = name.as_ref(); + if name.is_empty() { + return Err(Error::EmptyPathComponent); + } let is_last = rela_path.peek().is_none(); let mut needs_sorting = false; let current_level_must_be_tree = !is_last || new_kind_is_tree; @@ -301,7 +345,7 @@ mod cursor { /// /// The returned cursor will then allow applying edits to the tree at `rela_path` as root. /// If `rela_path` is a single empty string, it is equivalent to using the current instance itself. - pub fn cursor_at(&mut self, rela_path: I) -> Result, crate::find::existing_object::Error> + pub fn cursor_at(&mut self, rela_path: I) -> Result, super::Error> where I: IntoIterator, C: AsRef, @@ -320,12 +364,7 @@ mod cursor { impl<'a, 'find> Cursor<'a, 'find> { /// Like [`Editor::upsert()`], but with the constraint of only editing in this cursor's tree. - pub fn upsert( - &mut self, - rela_path: I, - kind: EntryKind, - id: ObjectId, - ) -> Result<&mut Self, crate::find::existing_object::Error> + pub fn upsert(&mut self, rela_path: I, kind: EntryKind, id: ObjectId) -> Result<&mut Self, super::Error> where I: IntoIterator, C: AsRef, @@ -337,7 +376,7 @@ mod cursor { } /// Like [`Editor::remove()`], but with the constraint of only editing in this cursor's tree. - pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, crate::find::existing_object::Error> + pub fn remove(&mut self, rela_path: I) -> Result<&mut Self, super::Error> where I: IntoIterator, C: AsRef, diff --git a/gix-object/src/tree/write.rs b/gix-object/src/tree/write.rs index e2c98d76e13..831a65a236f 100644 --- a/gix-object/src/tree/write.rs +++ b/gix-object/src/tree/write.rs @@ -12,8 +12,8 @@ use crate::{ #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { - #[error("Newlines are invalid in file paths: {name:?}")] - NewlineInFilename { name: BString }, + #[error("Nullbytes are invalid in file paths as they are separators: {name:?}")] + NullbyteInFilename { name: BString }, } impl From for io::Error { @@ -40,8 +40,8 @@ impl crate::WriteTo for Tree { out.write_all(mode.as_bytes(&mut buf))?; out.write_all(SPACE)?; - if filename.find_byte(b'\n').is_some() { - return Err(Error::NewlineInFilename { + if filename.find_byte(0).is_some() { + return Err(Error::NullbyteInFilename { name: (*filename).to_owned(), } .into()); @@ -87,8 +87,8 @@ impl<'a> crate::WriteTo for TreeRef<'a> { out.write_all(mode.as_bytes(&mut buf))?; out.write_all(SPACE)?; - if filename.find_byte(b'\n').is_some() { - return Err(Error::NewlineInFilename { + if filename.find_byte(0).is_some() { + return Err(Error::NullbyteInFilename { name: (*filename).to_owned(), } .into()); diff --git a/gix-object/tests/encode/mod.rs b/gix-object/tests/encode/mod.rs index 503933c354d..00365cdd35e 100644 --- a/gix-object/tests/encode/mod.rs +++ b/gix-object/tests/encode/mod.rs @@ -88,6 +88,41 @@ mod commit { } mod tree { + use gix_object::tree::EntryKind; + use gix_object::{tree, WriteTo}; + + #[test] + fn write_to_does_not_validate() { + let mut tree = gix_object::Tree::empty(); + tree.entries.push(tree::Entry { + mode: EntryKind::Blob.into(), + filename: "".into(), + oid: gix_hash::Kind::Sha1.null(), + }); + tree.entries.push(tree::Entry { + mode: EntryKind::Tree.into(), + filename: "something\nwith\newlines\n".into(), + oid: gix_hash::ObjectId::empty_tree(gix_hash::Kind::Sha1), + }); + tree.write_to(&mut std::io::sink()) + .expect("write succeeds, no validation is performed"); + } + + #[test] + fn write_to_does_not_allow_separator() { + let mut tree = gix_object::Tree::empty(); + tree.entries.push(tree::Entry { + mode: EntryKind::Blob.into(), + filename: "hi\0ho".into(), + oid: gix_hash::Kind::Sha1.null(), + }); + let err = tree.write_to(&mut std::io::sink()).unwrap_err(); + assert_eq!( + err.to_string(), + "Nullbytes are invalid in file paths as they are separators: \"hi\\0ho\"" + ); + } + round_trip!(gix_object::Tree, gix_object::TreeRef, "tree/everything.tree"); } diff --git a/gix-object/tests/tree/editor.rs b/gix-object/tests/tree/editor.rs index 8df5384050e..f74b5c15602 100644 --- a/gix-object/tests/tree/editor.rs +++ b/gix-object/tests/tree/editor.rs @@ -111,7 +111,7 @@ fn from_existing_cursor() -> crate::Result { odb.access_count_and_clear(); let mut edit = gix_object::tree::Editor::new(root_tree.clone(), &odb, gix_hash::Kind::Sha1); - let mut cursor = edit.cursor_at(Some(""))?; + let mut cursor = edit.to_cursor(); let actual = cursor .remove(Some("bin"))? .remove(Some("bin.d"))? @@ -336,6 +336,56 @@ fn from_existing_remove() -> crate::Result { Ok(()) } +#[test] +fn from_empty_invalid_write() -> crate::Result { + let (storage, mut write, _num_writes_and_clear) = new_inmemory_writes(); + let odb = StorageOdb::new(storage.clone()); + let mut edit = gix_object::tree::Editor::new(Tree::default(), &odb, gix_hash::Kind::Sha1); + + let actual = edit + .upsert(["a", "\n"], EntryKind::Blob, any_blob())? + .write(&mut write) + .expect("no validation is performed"); + assert_eq!( + display_tree(actual, &storage), + "d23290ea39c284156731188dce62c17ac6b71bda +└── a + └── \n bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +" + ); + + let err = edit + .upsert(Some("with\0null"), EntryKind::Blob, any_blob())? + .write(&mut write) + .unwrap_err(); + assert_eq!( + err.to_string(), + "Nullbytes are invalid in file paths as they are separators: \"with\\0null\"" + ); + + let err = edit.upsert(Some(""), EntryKind::Blob, any_blob()).unwrap_err(); + let expected_msg = "Empty path components are not allowed"; + assert_eq!(err.to_string(), expected_msg); + let err = edit + .upsert(["fine", "", "previous is not fine"], EntryKind::Blob, any_blob()) + .unwrap_err(); + assert_eq!(err.to_string(), expected_msg); + + let actual = edit + .remove(Some("a"))? + .remove(Some("with\0null"))? + .upsert(Some("works"), EntryKind::Blob, any_blob())? + .write(&mut write)?; + assert_eq!( + display_tree(actual, &storage), + "d5b913c39b06507c7c64adb16c268ce1102ef5c1 +└── works bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb.100644 +", + "after removing invalid entries, it can write again" + ); + Ok(()) +} + #[test] fn from_empty_add() -> crate::Result { let (storage, mut write, num_writes_and_clear) = new_inmemory_writes(); @@ -666,7 +716,7 @@ mod utils { pub(super) fn new_inmemory_writes() -> ( TreeStore, - impl FnMut(&Tree) -> Result, + impl FnMut(&Tree) -> Result, impl Fn() -> usize, ) { let store = TreeStore::default(); @@ -677,8 +727,7 @@ mod utils { let mut buf = Vec::with_capacity(512); move |tree: &Tree| { buf.clear(); - tree.write_to(&mut buf) - .expect("write to memory can't fail and tree is valid"); + tree.write_to(&mut buf)?; let header = gix_object::encode::loose_header(gix_object::Kind::Tree, buf.len() as u64); let mut hasher = gix_features::hash::hasher(gix_hash::Kind::Sha1); hasher.update(&header); diff --git a/gix-object/tests/tree/from_bytes.rs b/gix-object/tests/tree/from_bytes.rs index b0c62c905e0..97b68d8daa0 100644 --- a/gix-object/tests/tree/from_bytes.rs +++ b/gix-object/tests/tree/from_bytes.rs @@ -1,21 +1,32 @@ -use gix_object::{bstr::ByteSlice, tree, tree::EntryRef, TreeRef, TreeRefIter}; +use gix_object::{bstr::ByteSlice, tree, tree::EntryRef, Tree, TreeRef, TreeRefIter, WriteTo}; use crate::{fixture_name, hex_to_id}; #[test] fn empty() -> crate::Result { + let tree_ref = TreeRef::from_bytes(&[])?; assert_eq!( - TreeRef::from_bytes(&[])?, + tree_ref, TreeRef { entries: vec![] }, "empty trees are valid despite usually rare in the wild" ); + + let mut buf = Vec::new(); + tree_ref.write_to(&mut buf)?; + assert!(buf.is_empty()); + + buf.clear(); + Tree::from(tree_ref).write_to(&mut buf)?; + assert!(buf.is_empty()); Ok(()) } #[test] fn everything() -> crate::Result { + let fixture = fixture_name("tree", "everything.tree"); + let tree_ref = TreeRef::from_bytes(&fixture)?; assert_eq!( - TreeRef::from_bytes(&fixture_name("tree", "everything.tree"))?, + tree_ref, TreeRef { entries: vec![ EntryRef { @@ -83,11 +94,8 @@ fn special_trees() -> crate::Result { ("special-5", 17), ] { let fixture = fixture_name("tree", &format!("{name}.tree")); - assert_eq!( - TreeRef::from_bytes(&fixture)?.entries.len(), - expected_entry_count, - "{name}" - ); + let actual = TreeRef::from_bytes(&fixture)?; + assert_eq!(actual.entries.len(), expected_entry_count, "{name}"); assert_eq!( TreeRefIter::from_bytes(&fixture).map(Result::unwrap).count(), expected_entry_count, From b5dc45f2854012981cb6071f511e5d987020a003 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 4 Sep 2024 10:16:04 +0200 Subject: [PATCH 11/17] Add benchmarks for the tree editor and the tree-editor cursor --- gix-object/Cargo.toml | 5 + gix-object/benches/edit_tree.rs | 190 ++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 gix-object/benches/edit_tree.rs diff --git a/gix-object/Cargo.toml b/gix-object/Cargo.toml index fc05fb13650..dbe6e1fc0d5 100644 --- a/gix-object/Cargo.toml +++ b/gix-object/Cargo.toml @@ -19,6 +19,11 @@ name = "decode-objects" harness = false path = "./benches/decode_objects.rs" +[[bench]] +name = "edit-tree" +harness = false +path = "./benches/edit_tree.rs" + [features] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. diff --git a/gix-object/benches/edit_tree.rs b/gix-object/benches/edit_tree.rs new file mode 100644 index 00000000000..ba063d89a08 --- /dev/null +++ b/gix-object/benches/edit_tree.rs @@ -0,0 +1,190 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput}; +use gix_hash::ObjectId; +use gix_hashtable::hash_map::Entry; +use gix_object::tree::EntryKind; +use gix_object::{tree, Tree, WriteTo}; +use std::cell::RefCell; +use std::rc::Rc; + +fn create_new_tree_add_and_remove(c: &mut Criterion) { + let (storage, mut write) = new_inmemory_writes(); + let mut editor = tree::Editor::new(Tree::default(), &gix_object::find::Never, gix_hash::Kind::Sha1); + let mut group = c.benchmark_group("editor"); + let small_throughput = Throughput::Elements((1 + 2 + 4) + 3); + group.throughput(small_throughput.clone()); + group.bench_function("small tree (empty -> full -> empty)", |b| { + b.iter(|| { + let tree_id = editor + .upsert(Some("file"), EntryKind::Blob, any_blob()) + .unwrap() + .upsert(["dir", "file"], EntryKind::Blob, any_blob()) + .unwrap() + .upsert(["more", "deeply", "nested", "file"], EntryKind::Blob, any_blob()) + .unwrap() + .write(&mut write) + .unwrap(); + black_box(tree_id); + let actual = editor + .remove(Some("file")) + .unwrap() + .remove(Some("dir")) + .unwrap() + .remove(Some("more")) + .unwrap() + .write(&mut write) + .unwrap(); + assert_eq!(actual, gix_hash::ObjectId::empty_tree(gix_hash::Kind::Sha1)); + }); + }); + + let odb = StorageOdb(storage); + let mut editor = tree::Editor::new(Tree::default(), &odb, gix_hash::Kind::Sha1); + let prefixed_throughput = Throughput::Elements((1 + 2 + 4) + 6 * 3 + (3 + 6 * 3)); + group.throughput(prefixed_throughput.clone()); + group.bench_function("deeply nested tree (empty -> full -> empty)", |b| { + b.iter(|| { + let tree_id = editor + .upsert(["a", "b", "c", "d", "e", "f", "file"], EntryKind::Blob, any_blob()) + .unwrap() + .upsert( + ["a", "b", "c", "d", "e", "f", "dir", "file"], + EntryKind::Blob, + any_blob(), + ) + .unwrap() + .upsert( + ["a", "b", "c", "d", "e", "f", "more", "deeply", "nested", "file"], + EntryKind::Blob, + any_blob(), + ) + .unwrap() + .write(&mut write) + .unwrap(); + black_box(tree_id); + let tree_id = editor + .remove(["a", "b", "c", "d", "e", "f", "file"]) + .unwrap() + .remove(["a", "b", "c", "d", "e", "f", "dir"]) + .unwrap() + .remove(["a", "b", "c", "d", "e", "f", "more"]) + .unwrap() + .write(&mut write) + .unwrap(); + black_box(tree_id); + }); + }); + + drop(group); + let mut group = c.benchmark_group("cursor"); + group.throughput(small_throughput); + group.bench_function("small tree (empty -> full -> empty)", |b| { + let mut editor = editor.to_cursor(); + b.iter(|| { + let tree_id = editor + .upsert(Some("file"), EntryKind::Blob, any_blob()) + .unwrap() + .upsert(["dir", "file"], EntryKind::Blob, any_blob()) + .unwrap() + .upsert(["more", "deeply", "nested", "file"], EntryKind::Blob, any_blob()) + .unwrap() + .write(&mut write) + .unwrap(); + black_box(tree_id); + let actual = editor + .remove(Some("file")) + .unwrap() + .remove(Some("dir")) + .unwrap() + .remove(Some("more")) + .unwrap() + .write(&mut write) + .unwrap(); + assert_eq!(actual, gix_hash::ObjectId::empty_tree(gix_hash::Kind::Sha1)); + }); + }); + + group.throughput(prefixed_throughput); + group.bench_function("deeply nested tree (empty -> full -> empty)", |b| { + let mut editor = editor.cursor_at(["a", "b", "c", "d", "e", "f"]).unwrap(); + b.iter(|| { + let tree_id = editor + .upsert(["file"], EntryKind::Blob, any_blob()) + .unwrap() + .upsert(["dir", "file"], EntryKind::Blob, any_blob()) + .unwrap() + .upsert(["more", "deeply", "nested", "file"], EntryKind::Blob, any_blob()) + .unwrap() + .write(&mut write) + .unwrap(); + black_box(tree_id); + let actual = editor + .remove(["file"]) + .unwrap() + .remove(["dir"]) + .unwrap() + .remove(["more"]) + .unwrap() + .write(&mut write) + .unwrap(); + assert_eq!(actual, gix_hash::ObjectId::empty_tree(gix_hash::Kind::Sha1)); + }); + }); +} + +criterion_group!(benches, create_new_tree_add_and_remove); +criterion_main!(benches); + +type TreeStore = Rc>>; + +fn new_inmemory_writes() -> (TreeStore, impl FnMut(&Tree) -> Result) { + let store = TreeStore::default(); + let write_tree = { + let store = store.clone(); + let mut buf = Vec::with_capacity(512); + move |tree: &Tree| { + buf.clear(); + tree.write_to(&mut buf)?; + let header = gix_object::encode::loose_header(gix_object::Kind::Tree, buf.len() as u64); + let mut hasher = gix_features::hash::hasher(gix_hash::Kind::Sha1); + hasher.update(&header); + hasher.update(&buf); + let id = hasher.digest().into(); + let mut borrowed = store.borrow_mut(); + match borrowed.entry(id) { + Entry::Occupied(_) => {} + Entry::Vacant(e) => { + e.insert(tree.clone()); + } + }; + Ok(id) + } + }; + (store, write_tree) +} + +struct StorageOdb(TreeStore); + +impl gix_object::Find for StorageOdb { + fn try_find<'a>( + &self, + id: &gix_hash::oid, + buffer: &'a mut Vec, + ) -> Result>, gix_object::find::Error> { + let borrow = self.0.borrow(); + match borrow.get(id) { + None => Ok(None), + Some(tree) => { + buffer.clear(); + tree.write_to(buffer).expect("valid trees can always be serialized"); + Ok(Some(gix_object::Data { + kind: gix_object::Kind::Tree, + data: &*buffer, + })) + } + } + } +} + +fn any_blob() -> ObjectId { + ObjectId::from_hex("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb".as_bytes()).unwrap() +} From 39d8e03047df09db0102246071015505652bec8b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 4 Sep 2024 10:57:03 +0200 Subject: [PATCH 12/17] Use a standard HashMap instead of one based on SHA1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's faster throughout the board. ``` ❯ cargo bench -p gix-object@0.44.0 --bench edit-tree Compiling gix-object v0.44.0 (/Users/byron/dev/github.com/Byron/gitoxide/gix-object) Compiling gix-pack v0.53.0 (/Users/byron/dev/github.com/Byron/gitoxide/gix-pack) Compiling gix-odb v0.63.0 (/Users/byron/dev/github.com/Byron/gitoxide/gix-odb) Finished `bench` profile [optimized] target(s) in 5.97s Running benches/edit_tree.rs (target/release/deps/edit_tree-6af6651a1c453a05) Gnuplot not found, using plotters backend editor/small tree (empty -> full -> empty) time: [2.5972 µs 2.6019 µs 2.6075 µs] thrpt: [3.8351 Melem/s 3.8434 Melem/s 3.8503 Melem/s] change: time: [-32.618% -32.355% -32.038%] (p = 0.00 < 0.05) thrpt: [+47.142% +47.831% +48.409%] Performance has improved. Found 14 outliers among 100 measurements (14.00%) 13 (13.00%) high mild 1 (1.00%) high severe editor/deeply nested tree (empty -> full -> empty) time: [8.2019 µs 8.2079 µs 8.2145 µs] thrpt: [5.5998 Melem/s 5.6043 Melem/s 5.6084 Melem/s] change: time: [-33.517% -33.377% -33.246%] (p = 0.00 < 0.05) thrpt: [+49.804% +50.099% +50.415%] Performance has improved. Found 13 outliers among 100 measurements (13.00%) 8 (8.00%) high mild 5 (5.00%) high severe cursor/small tree (empty -> full -> empty) time: [2.6911 µs 2.6935 µs 2.6961 µs] thrpt: [3.7090 Melem/s 3.7127 Melem/s 3.7160 Melem/s] change: time: [-33.881% -33.546% -33.225%] (p = 0.00 < 0.05) thrpt: [+49.757% +50.480% +51.242%] Performance has improved. Found 14 outliers among 100 measurements (14.00%) 4 (4.00%) high mild 10 (10.00%) high severe cursor/deeply nested tree (empty -> full -> empty) time: [1.3616 µs 1.3631 µs 1.3649 µs] thrpt: [33.703 Melem/s 33.747 Melem/s 33.783 Melem/s] change: time: [-40.063% -39.675% -39.234%] (p = 0.00 < 0.05) thrpt: [+64.566% +65.769% +66.843%] Performance has improved. Found 20 outliers among 100 measurements (20.00%) 18 (18.00%) high mild 2 (2.00%) high severe ``` --- gix-object/src/tree/editor.rs | 26 ++++++++++++-------------- gix-object/src/tree/mod.rs | 9 +-------- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/gix-object/src/tree/editor.rs b/gix-object/src/tree/editor.rs index 767997433ea..d7632b7728d 100644 --- a/gix-object/src/tree/editor.rs +++ b/gix-object/src/tree/editor.rs @@ -2,8 +2,8 @@ use crate::tree::{Editor, EntryKind}; use crate::{tree, Tree}; use bstr::{BStr, BString, ByteSlice, ByteVec}; use gix_hash::ObjectId; -use gix_hashtable::hash_map::Entry; use std::cmp::Ordering; +use std::collections::{hash_map, HashMap}; use std::fmt::Formatter; /// A way to constrain all [tree-edits](Editor) to a given subtree. @@ -45,7 +45,7 @@ impl<'a> Editor<'a> { Editor { find, object_hash, - trees: gix_hashtable::HashMap::from_iter(Some((empty_path_hash(), root))), + trees: HashMap::from_iter(Some((empty_path(), root))), path_buf: Vec::with_capacity(256).into(), tree_buf: Vec::with_capacity(512), } @@ -160,7 +160,7 @@ impl<'a> Editor<'a> { } Err(err) => { let root_tree = parents.into_iter().next().expect("root wasn't consumed yet"); - self.trees.insert(path_hash(&root_tree.1), root_tree.2); + self.trees.insert(root_tree.1, root_tree.2); return Err(err); } } @@ -179,11 +179,11 @@ impl<'a> Editor<'a> { } WriteMode::FromCursor => {} } - self.trees.insert(path_hash(&rela_path), tree); + self.trees.insert(rela_path, tree); return Ok(root_tree_id); } Err(err) => { - self.trees.insert(path_hash(&rela_path), tree); + self.trees.insert(rela_path, tree); return Err(err); } } @@ -297,8 +297,8 @@ impl<'a> Editor<'a> { push_path_component(&mut self.path_buf, name); let path_id = path_hash(&self.path_buf); cursor = match self.trees.entry(path_id) { - Entry::Occupied(e) => e.into_mut(), - Entry::Vacant(e) => e.insert( + hash_map::Entry::Occupied(e) => e.into_mut(), + hash_map::Entry::Vacant(e) => e.insert( if let Some(tree_id) = tree_to_lookup.filter(|tree_id| !tree_id.is_empty_tree()) { self.find.find_tree(&tree_id, &mut self.tree_buf)?.into() } else { @@ -317,7 +317,7 @@ impl<'a> Editor<'a> { /// This is useful if the same editor is re-used for various trees. pub fn set_root(&mut self, root: Tree) -> &mut Self { self.trees.clear(); - self.trees.insert(empty_path_hash(), root); + self.trees.insert(empty_path(), root); self } } @@ -420,14 +420,12 @@ fn filename(path: &BStr) -> &BStr { path.rfind_byte(b'/').map_or(path, |pos| &path[pos + 1..]) } -fn empty_path_hash() -> ObjectId { - gix_features::hash::hasher(gix_hash::Kind::Sha1).digest().into() +fn empty_path() -> BString { + BString::default() } -fn path_hash(path: &[u8]) -> ObjectId { - let mut hasher = gix_features::hash::hasher(gix_hash::Kind::Sha1); - hasher.update(path); - hasher.digest().into() +fn path_hash(path: &[u8]) -> BString { + path.to_vec().into() } fn push_path_component(base: &mut BString, component: &[u8]) -> usize { diff --git a/gix-object/src/tree/mod.rs b/gix-object/src/tree/mod.rs index d916f6393da..979940012e2 100644 --- a/gix-object/src/tree/mod.rs +++ b/gix-object/src/tree/mod.rs @@ -2,7 +2,6 @@ use crate::{ bstr::{BStr, BString}, tree, Tree, }; -use gix_hash::ObjectId; use std::cmp::Ordering; /// @@ -19,12 +18,6 @@ pub mod write; /// /// The editor is optimized to edit existing trees, but can deal with building entirely new trees as well /// with some penalties. -/// -/// ### Note -/// -/// For reasons of efficiency, internally a SHA1 based hashmap is used to avoid having to store full paths -/// to each edited tree. The chance of collision is low, but could be engineered to overwrite or write into -/// an unintended tree. #[doc(alias = "TreeUpdateBuilder", alias = "git2")] #[derive(Clone)] pub struct Editor<'a> { @@ -35,7 +28,7 @@ pub struct Editor<'a> { /// All trees we currently hold in memory. Each of these may change while adding and removing entries. /// null-object-ids mark tree-entries whose value we don't know yet, they are placeholders that will be /// dropped when writing at the latest. - trees: gix_hashtable::HashMap, + trees: std::collections::HashMap, /// A buffer to build up paths when finding the tree to edit. path_buf: BString, /// Our buffer for storing tree-data in, right before decoding it. From 7c485561582c583ce0d929680e7f9abf5ceeea60 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 4 Sep 2024 15:42:49 +0200 Subject: [PATCH 13/17] feat: `TreeRef::to_owned()` and `Tree::into_owned()` for a convenient way to get a fully-owned `Tree` --- gix-object/src/tree/mod.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/gix-object/src/tree/mod.rs b/gix-object/src/tree/mod.rs index 979940012e2..1d3c0838df0 100644 --- a/gix-object/src/tree/mod.rs +++ b/gix-object/src/tree/mod.rs @@ -1,6 +1,6 @@ use crate::{ bstr::{BStr, BString}, - tree, Tree, + tree, Tree, TreeRef, }; use std::cmp::Ordering; @@ -194,6 +194,21 @@ impl EntryMode { } } +impl TreeRef<'_> { + /// Convert this instance into its own version, creating a copy of all data. + /// + /// This will temporarily allocate an extra copy in memory, so at worst three copies of the tree exist + /// at some intermediate point in time. Use [`Self::into_owned()`] to avoid this. + pub fn to_owned(&self) -> Tree { + self.clone().into() + } + + /// Convert this instance into its own version, creating a copy of all data. + pub fn into_owned(self) -> Tree { + self.into() + } +} + /// An element of a [`TreeRef`][crate::TreeRef::entries]. #[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] From b279957beaf581c16293343dbdb2121bd1d4dd1c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 4 Sep 2024 12:05:19 +0200 Subject: [PATCH 14/17] feat: add tree-editing capabilities to `Tree` and `Repository`. Create a tree editor using `Tree::edit()` or `Repository::edit_tree(id)`. --- Cargo.lock | 2 + gix/Cargo.toml | 15 +- gix/src/config/cache/access.rs | 2 +- gix/src/object/tree/editor.rs | 271 +++++++++++++++++++++++++++++++++ gix/src/object/tree/mod.rs | 12 ++ gix/src/repository/mod.rs | 14 ++ gix/src/repository/object.rs | 24 ++- gix/src/types.rs | 2 +- gix/tests/repository/object.rs | 205 +++++++++++++++++++++++++ justfile | 6 +- 10 files changed, 546 insertions(+), 7 deletions(-) create mode 100644 gix/src/object/tree/editor.rs diff --git a/Cargo.lock b/Cargo.lock index 9ddc5a3c30c..c4c7d061430 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1303,6 +1303,7 @@ dependencies = [ "anyhow", "async-std", "document-features", + "gix", "gix-actor 0.32.0", "gix-archive", "gix-attributes 0.22.5", @@ -1360,6 +1361,7 @@ dependencies = [ "serial_test", "signal-hook", "smallvec", + "termtree", "thiserror", "walkdir", ] diff --git a/gix/Cargo.toml b/gix/Cargo.toml index bffb10dc27a..db6616465c2 100644 --- a/gix/Cargo.toml +++ b/gix/Cargo.toml @@ -64,9 +64,12 @@ extras = [ "credentials", "interrupt", "status", - "dirwalk", + "dirwalk" ] +## A collection of features that need a larger MSRV, and thus are disabled by default. +need-more-recent-msrv = ["tree-editor"] + ## Various progress-related features that improve the look of progress message units. comfort = [ "gix-features/progress-unit-bytes", @@ -103,6 +106,12 @@ worktree-mutation = ["attributes", "dep:gix-worktree-state"] ## Retrieve a worktree stack for querying exclude information excludes = ["dep:gix-ignore", "dep:gix-worktree", "index"] +## Provide facilities to edit trees conveniently. +## +## Not that currently, this requires [Rust 1.75](https://caniuse.rs/features/return_position_impl_trait_in_trait). +## This feature toggle is likely going away then. +tree-editor = [] + ## Query attributes and excludes. Enables access to pathspecs, worktree checkouts, filter-pipelines and submodules. attributes = [ "excludes", @@ -384,6 +393,8 @@ parking_lot = { version = "0.12.1", optional = true } document-features = { version = "0.2.0", optional = true } [dev-dependencies] +# For additional features that aren't enabled by default due to MSRV +gix = { path = ".", default-features = false, features = ["tree-editor"] } pretty_assertions = "1.4.0" gix-testtools = { path = "../tests/tools" } is_ci = "1.1.1" @@ -391,6 +402,7 @@ anyhow = "1" walkdir = "2.3.2" serial_test = { version = "3.1.0", default-features = false } async-std = { version = "1.12.0", features = ["attributes"] } +termtree = "0.5.1" [package.metadata.docs.rs] features = [ @@ -398,5 +410,6 @@ features = [ "max-performance", "blocking-network-client", "blocking-http-transport-curl", + "need-more-recent-msrv", "serde", ] diff --git a/gix/src/config/cache/access.rs b/gix/src/config/cache/access.rs index 2ebf2c5c06b..2575901df0e 100644 --- a/gix/src/config/cache/access.rs +++ b/gix/src/config/cache/access.rs @@ -251,7 +251,7 @@ impl Cache { }) } - #[cfg(feature = "index")] + #[cfg(any(feature = "index", feature = "tree-editor"))] pub(crate) fn protect_options(&self) -> Result { const IS_WINDOWS: bool = cfg!(windows); const IS_MACOS: bool = cfg!(target_os = "macos"); diff --git a/gix/src/object/tree/editor.rs b/gix/src/object/tree/editor.rs new file mode 100644 index 00000000000..fdcdcf87d40 --- /dev/null +++ b/gix/src/object/tree/editor.rs @@ -0,0 +1,271 @@ +use crate::bstr::{BStr, BString}; +use crate::prelude::ObjectIdExt; +use crate::{Id, Repository}; +use gix_hash::ObjectId; +use gix_object::tree::EntryKind; + +/// +pub mod init { + /// The error returned by [`Editor::new()](crate::object::tree::Editor::new()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + DecodeTree(#[from] gix_object::decode::Error), + #[error(transparent)] + ValidationOptions(#[from] crate::config::boolean::Error), + } +} + +/// +pub mod write { + use crate::bstr::BString; + + /// The error returned by [`Editor::write()](crate::object::tree::Editor::write()) and [`Cursor::write()](super::Cursor::write). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + WriteTree(#[from] crate::object::write::Error), + #[error("The object {} ({}) at '{}' could not be found", id, kind.as_octal_str(), filename)] + MissingObject { + filename: BString, + kind: gix_object::tree::EntryKind, + id: gix_hash::ObjectId, + }, + #[error("The object {} ({}) has an invalid filename: '{}'", id, kind.as_octal_str(), filename)] + InvalidFilename { + filename: BString, + kind: gix_object::tree::EntryKind, + id: gix_hash::ObjectId, + source: gix_validate::path::component::Error, + }, + } +} + +/// A cursor at a specific portion of a tree to [edit](super::Editor). +pub struct Cursor<'a, 'repo> { + inner: gix_object::tree::editor::Cursor<'a, 'repo>, + validate: gix_validate::path::component::Options, + repo: &'repo Repository, +} + +/// Lifecycle +impl<'repo> super::Editor<'repo> { + /// Initialize a new editor from the given `tree`. + pub fn new(tree: &crate::Tree<'repo>) -> Result { + let tree_ref = tree.decode()?; + let repo = tree.repo; + let validate = repo.config.protect_options()?; + Ok(super::Editor { + inner: gix_object::tree::Editor::new(tree_ref.into(), &repo.objects, repo.object_hash()), + validate, + repo, + }) + } +} + +/// Tree editing +#[cfg(feature = "tree-editor")] +impl<'repo> crate::Tree<'repo> { + /// Start editing a new tree based on this one. + #[doc(alias = "treebuilder", alias = "git2")] + pub fn edit(&self) -> Result, init::Error> { + super::Editor::new(self) + } +} + +/// Obtain an iterator over `BStr`-components. +/// +/// Note that the implementation is simple, and it's mainly meant for statically known strings +/// or locations obtained during a merge. +pub trait ToComponents { + /// Return an iterator over the components of a path, without the separator. + fn to_components(&self) -> impl Iterator; +} + +impl ToComponents for &str { + fn to_components(&self) -> impl Iterator { + self.split('/').map(Into::into) + } +} + +impl ToComponents for String { + fn to_components(&self) -> impl Iterator { + self.split('/').map(Into::into) + } +} + +impl ToComponents for &String { + fn to_components(&self) -> impl Iterator { + self.split('/').map(Into::into) + } +} + +impl ToComponents for BString { + fn to_components(&self) -> impl Iterator { + self.split(|b| *b == b'/').map(Into::into) + } +} + +impl ToComponents for &BString { + fn to_components(&self) -> impl Iterator { + self.split(|b| *b == b'/').map(Into::into) + } +} + +impl ToComponents for &BStr { + fn to_components(&self) -> impl Iterator { + self.split(|b| *b == b'/').map(Into::into) + } +} + +/// Cursor Handling +impl<'repo> super::Editor<'repo> { + /// Turn ourselves as a cursor, which points to the same tree as the editor. + /// + /// This is useful if a method takes a [`Cursor`], not an [`Editor`](super::Editor). + pub fn to_cursor(&mut self) -> Cursor<'_, 'repo> { + Cursor { + inner: self.inner.to_cursor(), + validate: self.validate, + repo: self.repo, + } + } + + /// Create a cursor at the given `rela_path`, which must be a tree or is turned into a tree as its own edit. + /// + /// The returned cursor will then allow applying edits to the tree at `rela_path` as root. + /// If `rela_path` is a single empty string, it is equivalent to using the current instance itself. + pub fn cursor_at( + &mut self, + rela_path: impl ToComponents, + ) -> Result, gix_object::tree::editor::Error> { + Ok(Cursor { + inner: self.inner.cursor_at(rela_path.to_components())?, + validate: self.validate, + repo: self.repo, + }) + } +} +/// Operations +impl<'repo> Cursor<'_, 'repo> { + /// Like [`Editor::upsert()`](super::Editor::upsert()), but with the constraint of only editing in this cursor's tree. + pub fn upsert( + &mut self, + rela_path: impl ToComponents, + kind: EntryKind, + id: impl Into, + ) -> Result<&mut Self, gix_object::tree::editor::Error> { + self.inner.upsert(rela_path.to_components(), kind, id.into())?; + Ok(self) + } + + /// Like [`Editor::remove()`](super::Editor::remove), but with the constraint of only editing in this cursor's tree. + pub fn remove(&mut self, rela_path: impl ToComponents) -> Result<&mut Self, gix_object::tree::editor::Error> { + self.inner.remove(rela_path.to_components())?; + Ok(self) + } + + /// Like [`Editor::write()`](super::Editor::write()), but will write only the subtree of the cursor. + pub fn write(&mut self) -> Result, write::Error> { + write_cursor(self) + } +} + +/// Operations +impl<'repo> super::Editor<'repo> { + /// Set the root tree of the modification to `root`, assuring it has a well-known state. + /// + /// Note that this erases all previous edits. + /// + /// This is useful if the same editor is re-used for various trees. + pub fn set_root(&mut self, root: &crate::Tree<'repo>) -> Result<&mut Self, init::Error> { + let new_editor = super::Editor::new(root)?; + self.inner = new_editor.inner; + self.repo = new_editor.repo; + Ok(self) + } + /// Insert a new entry of `kind` with `id` at `rela_path`, an iterator over each path component in the tree, + /// like `a/b/c`. Names are matched case-sensitively. + /// + /// Existing leaf-entries will be overwritten unconditionally, and it is assumed that `id` is available in the object database + /// or will be made available at a later point to assure the integrity of the produced tree. + /// + /// Intermediate trees will be created if they don't exist in the object database, otherwise they will be loaded and entries + /// will be inserted into them instead. + /// + /// Note that `id` can be [null](ObjectId::null()) to create a placeholder. These will not be written, and paths leading + /// through them will not be considered a problem. + /// + /// `id` can also be an empty tree, along with [the respective `kind`](EntryKind::Tree), even though that's normally not allowed + /// in Git trees. + /// + /// Validation of path-components will not be performed here, but when [writing the tree](Self::write()). + pub fn upsert( + &mut self, + rela_path: impl ToComponents, + kind: EntryKind, + id: impl Into, + ) -> Result<&mut Self, gix_object::tree::editor::Error> { + self.inner.upsert(rela_path.to_components(), kind, id.into())?; + Ok(self) + } + + /// Remove the entry at `rela_path`, loading all trees on the path accordingly. + /// It's no error if the entry doesn't exist, or if `rela_path` doesn't lead to an existing entry at all. + pub fn remove(&mut self, rela_path: impl ToComponents) -> Result<&mut Self, gix_object::tree::editor::Error> { + self.inner.remove(rela_path.to_components())?; + Ok(self) + } + + /// Write the entire in-memory state of all changed trees (and only changed trees) to the object database. + /// Note that the returned object id *can* be the empty tree if everything was removed or if nothing + /// was added to the tree. + /// + /// The last call to `out` will be the changed root tree, whose object-id will also be returned. + /// `out` is free to do any kind of additional validation, like to assure that all entries in the tree exist. + /// We don't assure that as there is no validation that inserted entries are valid object ids. + /// + /// Future calls to [`upsert`](Self::upsert) or similar will keep working on the last seen state of the + /// just-written root-tree. + /// If this is not desired, use [set_root()](Self::set_root()). + /// + /// Before writing a tree, all of its entries (not only added ones), will be validated to assure they are + /// correct. The objects pointed to by entries also have to exist already. + pub fn write(&mut self) -> Result, write::Error> { + write_cursor(&mut self.to_cursor()) + } +} + +fn write_cursor<'repo>(cursor: &mut Cursor<'_, 'repo>) -> Result, write::Error> { + cursor + .inner + .write(|tree| -> Result { + for entry in &tree.entries { + gix_validate::path::component( + entry.filename.as_ref(), + entry + .mode + .is_link() + .then_some(gix_validate::path::component::Mode::Symlink), + cursor.validate, + ) + .map_err(|err| write::Error::InvalidFilename { + filename: entry.filename.clone(), + kind: entry.mode.into(), + id: entry.oid, + source: err, + })?; + if !cursor.repo.has_object(entry.oid) { + return Err(write::Error::MissingObject { + filename: entry.filename.clone(), + kind: entry.mode.into(), + id: entry.oid, + }); + } + } + Ok(cursor.repo.write_object(tree)?.detach()) + }) + .map(|id| id.attach(cursor.repo)) +} diff --git a/gix/src/object/tree/mod.rs b/gix/src/object/tree/mod.rs index 2f7df7ec33e..65906c42fda 100644 --- a/gix/src/object/tree/mod.rs +++ b/gix/src/object/tree/mod.rs @@ -4,6 +4,14 @@ use gix_object::{bstr::BStr, FindExt, TreeRefIter}; use crate::{object::find, Id, ObjectDetached, Tree}; +/// All state needed to conveniently edit a tree, using only [update-or-insert](Editor::upsert()) and [removals](Editor::remove()). +#[cfg(feature = "tree-editor")] +pub struct Editor<'repo> { + inner: gix_object::tree::Editor<'repo>, + validate: gix_validate::path::component::Options, + repo: &'repo crate::Repository, +} + /// Initialization impl<'repo> Tree<'repo> { /// Obtain a tree instance by handing in all components that it is made up of. @@ -163,6 +171,10 @@ impl<'repo> Tree<'repo> { } } +/// +#[cfg(feature = "tree-editor")] +pub mod editor; + /// #[cfg(feature = "blob-diff")] pub mod diff; diff --git a/gix/src/repository/mod.rs b/gix/src/repository/mod.rs index 77467f77f9a..41b2aaeca52 100644 --- a/gix/src/repository/mod.rs +++ b/gix/src/repository/mod.rs @@ -72,6 +72,20 @@ mod submodule; mod thread_safe; mod worktree; +/// +#[cfg(feature = "tree-editor")] +pub mod edit_tree { + /// The error returned by [Repository::edit_tree()](crate::Repository::edit_tree). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + FindTree(#[from] crate::object::find::existing::with_conversion::Error), + #[error(transparent)] + InitEditor(#[from] crate::object::tree::editor::init::Error), + } +} + /// #[cfg(feature = "revision")] pub mod merge_base { diff --git a/gix/src/repository/object.rs b/gix/src/repository/object.rs index 6230556b3c5..29d3bdbfcd1 100644 --- a/gix/src/repository/object.rs +++ b/gix/src/repository/object.rs @@ -12,7 +12,23 @@ use smallvec::SmallVec; use crate::{commit, ext::ObjectIdExt, object, tag, Blob, Commit, Id, Object, Reference, Tag, Tree}; -/// Methods related to object creation. +/// Tree editing +#[cfg(feature = "tree-editor")] +impl crate::Repository { + /// Return an editor for adjusting the tree at `id`. + /// + /// This can be the [empty tree id](ObjectId::empty_tree) to build a tree from scratch. + #[doc(alias = "treebuilder", alias = "git2")] + pub fn edit_tree( + &self, + id: impl Into, + ) -> Result, crate::repository::edit_tree::Error> { + let tree = self.find_tree(id)?; + Ok(tree.edit()?) + } +} + +/// Find objects of various kins impl crate::Repository { /// Find the object with `id` in the object database or return an error if it could not be found. /// @@ -138,7 +154,10 @@ impl crate::Repository { None => Ok(None), } } +} +/// Write objects of any type. +impl crate::Repository { pub(crate) fn shared_empty_buf(&self) -> std::cell::RefMut<'_, Vec> { let mut bufs = self.bufs.borrow_mut(); if bufs.last().is_none() { @@ -217,7 +236,10 @@ impl crate::Repository { .map_err(Into::into) .map(|oid| oid.attach(self)) } +} +/// Create commits and tags +impl crate::Repository { /// Create a tag reference named `name` (without `refs/tags/` prefix) pointing to a newly created tag object /// which in turn points to `target` and return the newly created reference. /// diff --git a/gix/src/types.rs b/gix/src/types.rs index fc726309b17..3e5edd1abd0 100644 --- a/gix/src/types.rs +++ b/gix/src/types.rs @@ -67,7 +67,7 @@ impl<'a> Drop for Blob<'a> { /// A decoded tree object with access to its owning repository. #[derive(Clone)] pub struct Tree<'repo> { - /// The id of the tree + /// Thek[ id of the tree pub id: ObjectId, /// The fully decoded tree data pub data: Vec, diff --git a/gix/tests/repository/object.rs b/gix/tests/repository/object.rs index 2aa2607cc69..341b1f9fea1 100644 --- a/gix/tests/repository/object.rs +++ b/gix/tests/repository/object.rs @@ -1,5 +1,210 @@ use gix_testtools::tempfile; +#[cfg(feature = "tree-editor")] +mod edit_tree { + use crate::util::hex_to_id; + use gix::bstr::{BStr, BString}; + use gix_object::tree::EntryKind; + + #[test] + // Some part of the test validation the implementation for this exists, but it's needless nonetheless. + #[allow(clippy::needless_borrows_for_generic_args)] + fn from_head_tree() -> crate::Result { + let (repo, _tmp) = crate::repo_rw("make_packed_and_loose.sh")?; + let head_tree_id = repo.head_tree_id()?; + assert_eq!( + display_tree(head_tree_id, &repo), + "24374df94315568adfaee119d038f710d1f45397 +├── that ce013625030ba8dba906f756967f9e9ca394464a.100644 +└── this 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +" + ); + let this_id = hex_to_id("317e9677c3bcffd006f9fc84bbb0a54ef1676197"); + let that_id = hex_to_id("ce013625030ba8dba906f756967f9e9ca394464a"); + let mut editor = repo.edit_tree(head_tree_id)?; + let actual = editor + .upsert("a/b", EntryKind::Blob, this_id)? + .upsert(String::from("this/subdir/that"), EntryKind::Blob, this_id)? + .upsert(BString::from("that/other/that"), EntryKind::Blob, that_id)? + .remove(BStr::new("that"))? + .remove(&String::from("that"))? + .remove(&BString::from("that"))? + .write()?; + + assert_eq!( + display_tree(actual, &repo), + "fe02a8bd15e4c0476d938f772f1eece6d164b1bd +├── a +│ └── b 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +└── this + └── subdir + └── that 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +", + "all trees are actually written, or else we couldn't visualize them." + ); + + let actual = editor + .upsert("a/b", EntryKind::Blob, that_id)? + .upsert(String::from("this/subdir/that"), EntryKind::Blob, this_id)? + .remove(BStr::new("does-not-exist"))? + .write()?; + assert_eq!( + display_tree(actual, &repo), + "219596ff52fc84b6b39bc327f202d408cc02e1db +├── a +│ └── b ce013625030ba8dba906f756967f9e9ca394464a.100644 +└── this + └── subdir + └── that 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +", + "existing blobs can also be changed" + ); + + let mut cursor = editor.cursor_at("something/very/nested/to/add/entries/to")?; + let actual = cursor + .upsert("a/b", EntryKind::Blob, this_id)? + .upsert(String::from("this/subdir/that"), EntryKind::Blob, that_id)? + .upsert(BString::from("that/other/that"), EntryKind::Blob, that_id)? + .remove(BStr::new("that"))? + .write()?; + + assert_eq!( + display_tree(actual, &repo), + "35ea623106198f21b6959dd2731740e5153db2bb +├── a +│ └── b 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +└── this + └── subdir + └── that ce013625030ba8dba906f756967f9e9ca394464a.100644 +", + "all remaining subtrees are written from the cursor position" + ); + + let actual = editor.write()?; + assert_eq!( + display_tree(actual, &repo), + "9ebdc2c1d22e91636fa876a51521464f8a88dd6f +├── a +│ └── b ce013625030ba8dba906f756967f9e9ca394464a.100644 +├── something +│ └── very +│ └── nested +│ └── to +│ └── add +│ └── entries +│ └── to +│ ├── a +│ │ └── b 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +│ └── this +│ └── subdir +│ └── that ce013625030ba8dba906f756967f9e9ca394464a.100644 +└── this + └── subdir + └── that 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +", + "it looks as it should when seen from the root tree" + ); + + editor.set_root(&head_tree_id.object()?.into_tree())?; + let actual = editor.write()?; + assert_eq!( + display_tree(actual, &repo), + "24374df94315568adfaee119d038f710d1f45397 +├── that ce013625030ba8dba906f756967f9e9ca394464a.100644 +└── this 317e9677c3bcffd006f9fc84bbb0a54ef1676197.100644 +", + "it's possible to set the editor to any tree after creating it, could help with memory re-use" + ); + Ok(()) + } + + #[test] + fn missing_objects_and_illformed_path_components_trigger_error() -> crate::Result { + let (repo, _tmp) = crate::repo_rw("make_packed_and_loose.sh")?; + let tree = repo.head_tree_id()?.object()?.into_tree(); + let mut editor = tree.edit()?; + let actual = editor + .upsert("non-existing", EntryKind::Blob, repo.object_hash().null())? + .write()?; + assert_eq!( + actual, + tree.id(), + "nulls are pruned before writing the tree, so it just rewrites the same tree" + ); + + let err = editor + .upsert( + "non-existing", + EntryKind::Blob, + hex_to_id("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), + )? + .write() + .unwrap_err(); + assert_eq!( + err.to_string(), + "The object aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa (100644) at 'non-existing' could not be found", + "each entry to be written is checked for existence" + ); + + let this_id = hex_to_id("317e9677c3bcffd006f9fc84bbb0a54ef1676197"); + let err = editor + .remove("non-existing")? + .upsert(".git", EntryKind::Blob, this_id)? + .write() + .expect_err(".git is universally forbidden in trees"); + assert_eq!( + err.to_string(), + "The object 317e9677c3bcffd006f9fc84bbb0a54ef1676197 (100644) has an invalid filename: '.git'", + "each component is validated" + ); + + Ok(()) + } + + mod utils { + use gix::bstr::{BStr, ByteSlice}; + use gix::Repository; + use gix_hash::ObjectId; + + fn display_tree_recursive( + tree_id: ObjectId, + repo: &Repository, + name: Option<&BStr>, + ) -> anyhow::Result> { + let tree = repo.find_tree(tree_id)?.decode()?.to_owned(); + let mut termtree = termtree::Tree::new(if let Some(name) = name { + if tree.entries.is_empty() { + format!("{name} (empty)") + } else { + name.to_string() + } + } else { + tree_id.to_string() + }); + + for entry in &tree.entries { + if entry.mode.is_tree() { + termtree.push(display_tree_recursive(entry.oid, repo, Some(entry.filename.as_bstr()))?); + } else { + termtree.push(format!( + "{} {}.{}", + entry.filename, + entry.oid, + entry.mode.kind().as_octal_str() + )); + } + } + Ok(termtree) + } + + pub(super) fn display_tree(tree_id: impl Into, odb: &Repository) -> String { + display_tree_recursive(tree_id.into(), odb, None) + .expect("tree exists and everything was written") + .to_string() + } + } + use utils::display_tree; +} mod write_object { use crate::repository::object::empty_bare_repo; diff --git a/justfile b/justfile index 4a215d02f0a..758646d0f2c 100755 --- a/justfile +++ b/justfile @@ -146,8 +146,8 @@ check: # Run cargo doc on all crates doc $RUSTDOCFLAGS="-D warnings": - cargo doc --all --no-deps - cargo doc --features=max,lean,small --all --no-deps + cargo doc --all --no-deps --features need-more-recent-msrv + cargo doc --features=max,lean,small --all --no-deps --features need-more-recent-msrv # run all unit tests unit-tests: @@ -183,7 +183,7 @@ unit-tests: cargo nextest run -p gix-protocol --features blocking-client cargo nextest run -p gix-protocol --features async-client cargo nextest run -p gix --no-default-features - cargo nextest run -p gix --no-default-features --features basic,extras,comfort + cargo nextest run -p gix --no-default-features --features basic,extras,comfort,need-more-recent-msrv cargo nextest run -p gix --features async-network-client cargo nextest run -p gix --features blocking-network-client cargo nextest run -p gitoxide-core --lib From ca5294af0971da372c5d5a5c7a569810dd53de93 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 5 Sep 2024 08:31:41 +0200 Subject: [PATCH 15/17] feat: add `InMemoryPassThrough` implementation. An implementation of `Header`, `Write` and `Find`, that can optionally write everything to an in-memory store, and if enabled, also read objects back from there. That way it can present a consistent view to objects from two locations. --- Cargo.lock | 1 + gix-odb/Cargo.toml | 1 + gix-odb/src/lib.rs | 3 + gix-odb/src/memory.rs | 258 ++++++++++++++++++++++++++++++++++++ gix-odb/tests/odb/memory.rs | 115 ++++++++++++++++ gix-odb/tests/odb/mod.rs | 1 + 6 files changed, 379 insertions(+) create mode 100644 gix-odb/src/memory.rs create mode 100644 gix-odb/tests/odb/memory.rs diff --git a/Cargo.lock b/Cargo.lock index c4c7d061430..fadad33adf5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2115,6 +2115,7 @@ dependencies = [ "gix-features 0.38.2", "gix-fs 0.11.3", "gix-hash 0.14.2", + "gix-hashtable 0.5.2", "gix-object 0.44.0", "gix-pack", "gix-path 0.10.10", diff --git a/gix-odb/Cargo.toml b/gix-odb/Cargo.toml index ce87448814c..bad6f16033b 100644 --- a/gix-odb/Cargo.toml +++ b/gix-odb/Cargo.toml @@ -21,6 +21,7 @@ serde = ["dep:serde", "gix-hash/serde", "gix-object/serde", "gix-pack/serde"] [dependencies] gix-features = { version = "^0.38.2", path = "../gix-features", features = ["rustsha1", "walkdir", "zlib", "crc32"] } +gix-hashtable = { version = "^0.5.2", path = "../gix-hashtable" } gix-hash = { version = "^0.14.2", path = "../gix-hash" } gix-date = { version = "^0.9.0", path = "../gix-date" } gix-path = { version = "^0.10.10", path = "../gix-path" } diff --git a/gix-odb/src/lib.rs b/gix-odb/src/lib.rs index 40d7c2aea90..aaedc74a175 100644 --- a/gix-odb/src/lib.rs +++ b/gix-odb/src/lib.rs @@ -66,6 +66,9 @@ pub fn sink(object_hash: gix_hash::Kind) -> Sink { } } +/// +pub mod memory; + mod sink; /// diff --git a/gix-odb/src/memory.rs b/gix-odb/src/memory.rs new file mode 100644 index 00000000000..5fdbd2abe3e --- /dev/null +++ b/gix-odb/src/memory.rs @@ -0,0 +1,258 @@ +use crate::find::Header; +use crate::Cache; +use gix_object::Data; +use std::cell::RefCell; +use std::ops::{Deref, DerefMut}; +use std::rc::Rc; +use std::sync::Arc; + +/// An object database to read from any implementation but write to memory. +/// Previously written objects can be returned from memory upon query, which makes the view of objects consistent. +/// In-Memory objects can be disabled by [taking out its storage](Proxy::take_object_memory). From there in-memory +/// object can also be persisted one by one. +/// +/// It's possible to turn off the memory by removing it from the instance. +pub struct Proxy { + /// The actual odb implementation + inner: T, + /// The kind of hash to produce when writing new objects. + object_hash: gix_hash::Kind, + /// The storage for in-memory objects. + /// If `None`, the proxy will always read from and write-through to `inner`. + memory: Option>, +} + +/// Lifecycle +impl Proxy { + /// Create a new instance using `odb` as actual object provider, with an empty in-memory store for + /// objects that are to be written. + /// Use `object_hash` to determine the kind of hash to produce when writing new objects. + pub fn new(odb: T, object_hash: gix_hash::Kind) -> Proxy { + Proxy { + inner: odb, + object_hash, + memory: Some(Default::default()), + } + } + + /// Turn ourselves into our inner object database, while deallocating objects stored in memory. + pub fn into_inner(self) -> T { + self.inner + } + + /// Strip object memory off this instance, which means that writes will go through to the inner object database + /// right away. + /// This mode makes the proxy fully transparent. + pub fn with_write_passthrough(mut self) -> Self { + self.memory.take(); + self + } +} + +impl Proxy>>> { + /// No op, as we are containing an arc handle already. + pub fn into_arc(self) -> std::io::Result>>>> { + Ok(self) + } +} + +impl Proxy>>> { + /// Create an entirely new instance, but with the in-memory objects moving between them. + pub fn into_arc(self) -> std::io::Result>>>> { + Ok(Proxy { + inner: self.inner.into_arc()?, + object_hash: self.object_hash, + memory: self.memory, + }) + } +} + +impl From for Proxy { + fn from(odb: crate::Handle) -> Self { + let object_hash = odb.store.object_hash; + Proxy::new(odb, object_hash) + } +} + +/// Memory Access +impl Proxy { + /// Take all the objects in memory so far, with the memory storage itself and return it. + /// + /// The instance will remain in a state where it won't be able to store objects in memory at all, + /// they will now be stored in the underlying object database. + /// This mode makes the proxy fully transparent. + /// + /// To avoid that, use [`reset_object_memory()`](Self::reset_object_memory()) or return the storage + /// using [`set_object_memory()`](Self::set_object_memory()). + pub fn take_object_memory(&mut self) -> Option { + self.memory.take().map(RefCell::into_inner) + } + + /// Set the object storage to contain only `new` objects, and return whichever objects were there previously. + pub fn set_object_memory(&mut self, new: Storage) -> Option { + let previous = self.take_object_memory(); + self.memory = Some(RefCell::new(new)); + previous + } + + /// If objects aren't written to memory yet, this will happen after the call. + /// + /// Otherwise, no change will be performed. + pub fn enable_object_memory(&mut self) -> &mut Self { + if self.memory.is_none() { + self.memory = Some(Default::default()); + } + self + } + + /// Reset the internal storage to be empty, and return the previous storage, with all objects + /// it contained. + /// + /// Note that this does nothing if this instance didn't contain object memory in the first place. + /// In that case, set it explicitly. + pub fn reset_object_memory(&self) -> Option { + self.memory.as_ref().map(|m| std::mem::take(&mut *m.borrow_mut())) + } + + /// Return the amount of objects currently stored in memory. + pub fn num_objects_in_memory(&self) -> usize { + self.memory.as_ref().map_or(0, |m| m.borrow().len()) + } +} + +impl Clone for Proxy +where + T: Clone, +{ + fn clone(&self) -> Self { + Proxy { + inner: self.inner.clone(), + object_hash: self.object_hash, + memory: self.memory.clone(), + } + } +} + +impl gix_object::Find for Proxy +where + T: gix_object::Find, +{ + fn try_find<'a>( + &self, + id: &gix_hash::oid, + buffer: &'a mut Vec, + ) -> Result>, gix_object::find::Error> { + if let Some(map) = self.memory.as_ref() { + let map = map.borrow(); + if let Some((kind, data)) = map.get(id) { + buffer.clear(); + buffer.extend_from_slice(data); + return Ok(Some(Data { + kind: *kind, + data: &*buffer, + })); + } + } + self.inner.try_find(id, buffer) + } +} + +impl gix_object::Exists for Proxy +where + T: gix_object::Exists, +{ + fn exists(&self, id: &gix_hash::oid) -> bool { + self.memory.as_ref().map_or(false, |map| map.borrow().contains_key(id)) || self.inner.exists(id) + } +} + +impl crate::Header for Proxy +where + T: crate::Header, +{ + fn try_header(&self, id: &gix_hash::oid) -> Result, gix_object::find::Error> { + if let Some(map) = self.memory.as_ref() { + let map = map.borrow(); + if let Some((kind, data)) = map.get(id) { + return Ok(Some(Header::Loose { + kind: *kind, + size: data.len() as u64, + })); + } + } + self.inner.try_header(id) + } +} + +impl gix_object::FindHeader for Proxy +where + T: gix_object::FindHeader, +{ + fn try_header(&self, id: &gix_hash::oid) -> Result, gix_object::find::Error> { + if let Some(map) = self.memory.as_ref() { + let map = map.borrow(); + if let Some((kind, data)) = map.get(id) { + return Ok(Some(gix_object::Header { + kind: *kind, + size: data.len() as u64, + })); + } + } + self.inner.try_header(id) + } +} + +impl crate::Write for Proxy +where + T: crate::Write, +{ + fn write_stream( + &self, + kind: gix_object::Kind, + size: u64, + from: &mut dyn std::io::Read, + ) -> Result { + let Some(map) = self.memory.as_ref() else { + return self.inner.write_stream(kind, size, from); + }; + + let mut buf = Vec::new(); + from.read_to_end(&mut buf)?; + + let id = gix_object::compute_hash(self.object_hash, kind, &buf); + map.borrow_mut().insert(id, (kind, buf)); + Ok(id) + } +} + +impl Deref for Proxy { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for Proxy { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +/// A mapping between an object id and all data corresponding to an object, acting like a `HashMap`. +#[derive(Default, Debug, Clone, Eq, PartialEq)] +pub struct Storage(gix_hashtable::HashMap)>); + +impl Deref for Storage { + type Target = gix_hashtable::HashMap)>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Storage { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/gix-odb/tests/odb/memory.rs b/gix-odb/tests/odb/memory.rs new file mode 100644 index 00000000000..d0d8ddb2bfd --- /dev/null +++ b/gix-odb/tests/odb/memory.rs @@ -0,0 +1,115 @@ +use crate::odb::hex_to_id; +use gix_object::{tree, Exists, FindExt}; +use gix_odb::Write; +use gix_testtools::tempfile::TempDir; + +#[test] +fn without_memory() -> crate::Result { + use gix_odb::HeaderExt; + let (mut odb, _tmp) = db_rw()?; + let mut buf = Vec::new(); + let mem = odb.take_object_memory().expect("it starts out with memory set"); + assert_eq!(mem.len(), 0, "no object is stored initially"); + let existing = hex_to_id("21d3ba9a26b790a4858d67754ae05d04dfce4d0c"); + let tree = odb.find_tree(&existing, &mut buf).expect("present and valid"); + assert_eq!(tree.entries.len(), 1); + odb.header(existing).expect("header can be found just the same"); + assert!(odb.exists(&existing)); + + let mut tree = tree.to_owned(); + tree.entries.push(tree::Entry { + mode: tree::EntryKind::Blob.into(), + filename: "z-for-sorting_another-file-with-same-content".into(), + oid: existing, + }); + let new_tree_id = odb.write(&tree)?; + assert_eq!(new_tree_id, hex_to_id("249b0b4106a5e9e7875e446a26468e22ec47a05c")); + let actual = odb.header(new_tree_id).expect("header of new objects can be found"); + assert_eq!(actual.kind(), gix_object::Kind::Tree); + assert_eq!(actual.size(), 104); + + let new_tree = odb + .find_tree(&new_tree_id, &mut buf) + .expect("new tree is also available as object") + .to_owned(); + assert_eq!(new_tree, tree); + assert!(!odb.exists(&gix_hash::Kind::Sha1.null())); + + Ok(()) +} + +#[test] +fn with_memory() -> crate::Result { + use gix_object::FindHeader; + let mut odb = db()?; + assert_eq!( + (*odb).iter()?.count(), + 6, + "let's be sure we didn't accidentally write anything" + ); + let mut buf = Vec::new(); + let existing = hex_to_id("21d3ba9a26b790a4858d67754ae05d04dfce4d0c"); + let tree = odb.find_tree(&existing, &mut buf).expect("present and valid"); + assert!(odb.exists(&existing)); + assert_eq!(tree.entries.len(), 1); + odb.try_header(&existing)?.expect("header can be found just the same"); + assert_eq!( + odb.num_objects_in_memory(), + 0, + "nothing is stored when fetching objects - it's not an object cache" + ); + + let mut tree = tree.to_owned(); + tree.entries.push(tree::Entry { + mode: tree::EntryKind::Blob.into(), + filename: "z-for-sorting_another-file-with-same-content".into(), + oid: existing, + }); + let new_tree_id = odb.write(&tree)?; + assert_eq!(new_tree_id, hex_to_id("249b0b4106a5e9e7875e446a26468e22ec47a05c")); + let actual = odb + .try_header(&new_tree_id)? + .expect("header of new objects can be found"); + assert_eq!(actual.kind, gix_object::Kind::Tree); + assert_eq!(actual.size, 104); + + let new_tree = odb + .find_tree(&new_tree_id, &mut buf) + .expect("new tree is also available as object") + .to_owned(); + assert_eq!(new_tree, tree); + + let mem = odb.reset_object_memory().expect("memory is still available"); + assert_eq!(mem.len(), 1, "one new object was just written"); + + assert_eq!( + odb.try_header(&new_tree_id)?, + None, + "without memory, the object can't be found anymore" + ); + + let prev_mem = odb.set_object_memory(mem).expect("reset means it's just cleared"); + assert_eq!(prev_mem.len(), 0, "nothing was stored after the reset"); + + assert_eq!(odb.num_objects_in_memory(), 1, "we put all previous objects back"); + + let odb2 = odb.clone(); + assert_eq!(odb2.num_objects_in_memory(), 1, "it also clones the object memory"); + + assert!(!odb.exists(&gix_hash::Kind::Sha1.null())); + + Ok(()) +} + +fn db() -> crate::Result> { + let odb = gix_odb::at( + gix_testtools::scripted_fixture_read_only_standalone("repo_with_loose_objects.sh")?.join(".git/objects"), + )?; + Ok(gix_odb::memory::Proxy::new(odb, gix_hash::Kind::Sha1)) +} + +fn db_rw() -> crate::Result<(gix_odb::memory::Proxy, TempDir)> { + let tmp = gix_testtools::scripted_fixture_writable_standalone("repo_with_loose_objects.sh")?; + let odb = gix_odb::at(tmp.path().join(".git/objects"))?; + Ok((gix_odb::memory::Proxy::new(odb, gix_hash::Kind::Sha1), tmp)) +} diff --git a/gix-odb/tests/odb/mod.rs b/gix-odb/tests/odb/mod.rs index 6e4b1506bae..353bb78ee66 100644 --- a/gix-odb/tests/odb/mod.rs +++ b/gix-odb/tests/odb/mod.rs @@ -18,6 +18,7 @@ fn db_small_packs() -> gix_odb::Handle { pub mod alternate; pub mod find; pub mod header; +pub mod memory; pub mod regression; pub mod sink; pub mod store; From dfbc732effc0fd1c90aba9c736a670d244c042e0 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 5 Sep 2024 20:21:06 +0200 Subject: [PATCH 16/17] feat!: optionally store objects new objects in memory only. The default object database changed to a version that allows to keep objects in memory. This needs a mutable `Repository` instance to setup. --- gix/src/lib.rs | 5 ++++- gix/src/repository/cache.rs | 11 +++++++++++ gix/src/repository/impls.rs | 4 ++-- gix/src/repository/object.rs | 2 +- gix/src/types.rs | 2 +- gix/tests/repository/object.rs | 18 +++++++++++++++++- 6 files changed, 36 insertions(+), 6 deletions(-) diff --git a/gix/src/lib.rs b/gix/src/lib.rs index 56fd5e7e21d..6176e2461b5 100644 --- a/gix/src/lib.rs +++ b/gix/src/lib.rs @@ -158,7 +158,10 @@ pub mod path; /// The standard type for a store to handle git references. pub type RefStore = gix_ref::file::Store; /// A handle for finding objects in an object database, abstracting away caches for thread-local use. -pub type OdbHandle = gix_odb::Handle; +pub type OdbHandle = gix_odb::memory::Proxy; +/// A handle for finding objects in an object database, abstracting away caches for moving across threads. +pub type OdbHandleArc = gix_odb::memory::Proxy; + /// A way to access git configuration pub(crate) type Config = OwnShared>; diff --git a/gix/src/repository/cache.rs b/gix/src/repository/cache.rs index cd765203e90..c499d6e5d41 100644 --- a/gix/src/repository/cache.rs +++ b/gix/src/repository/cache.rs @@ -39,3 +39,14 @@ impl crate::Repository { (ten_mb_for_every_10k_files as usize).max(4 * 1024) } } + +/// Handling of InMemory object writing +impl crate::Repository { + /// When writing objects, keep them in memory instead of writing them to disk. + /// This makes any change to the object database non-persisting, while keeping the view + /// to the object database consistent for this instance. + pub fn with_object_memory(mut self) -> Self { + self.objects.enable_object_memory(); + self + } +} diff --git a/gix/src/repository/impls.rs b/gix/src/repository/impls.rs index 36fd788dc6c..4b7fbf57d5b 100644 --- a/gix/src/repository/impls.rs +++ b/gix/src/repository/impls.rs @@ -38,7 +38,7 @@ impl From<&crate::ThreadSafeRepository> for crate::Repository { fn from(repo: &crate::ThreadSafeRepository) -> Self { crate::Repository::from_refs_and_objects( repo.refs.clone(), - repo.objects.to_handle().into(), + gix_odb::memory::Proxy::from(gix_odb::Cache::from(repo.objects.to_handle())).with_write_passthrough(), repo.work_tree.clone(), repo.common_dir.clone(), repo.config.clone(), @@ -56,7 +56,7 @@ impl From for crate::Repository { fn from(repo: crate::ThreadSafeRepository) -> Self { crate::Repository::from_refs_and_objects( repo.refs, - repo.objects.to_handle().into(), + gix_odb::memory::Proxy::from(gix_odb::Cache::from(repo.objects.to_handle())).with_write_passthrough(), repo.work_tree, repo.common_dir, repo.config, diff --git a/gix/src/repository/object.rs b/gix/src/repository/object.rs index 29d3bdbfcd1..7fe3ded853f 100644 --- a/gix/src/repository/object.rs +++ b/gix/src/repository/object.rs @@ -109,7 +109,7 @@ impl crate::Repository { #[doc(alias = "exists", alias = "git2")] pub fn has_object(&self, id: impl AsRef) -> bool { let id = id.as_ref(); - if id == ObjectId::empty_tree(self.object_hash()) { + if id.to_owned().is_empty_tree() { true } else { self.objects.exists(id) diff --git a/gix/src/types.rs b/gix/src/types.rs index 3e5edd1abd0..fa03f601e95 100644 --- a/gix/src/types.rs +++ b/gix/src/types.rs @@ -247,7 +247,7 @@ pub struct PathspecDetached { /// The prepared search to use for checking matches. pub search: gix_pathspec::Search, /// A thread-safe version of an ODB. - pub odb: gix_odb::HandleArc, + pub odb: crate::OdbHandleArc, } /// A stand-in for the submodule of a particular name. diff --git a/gix/tests/repository/object.rs b/gix/tests/repository/object.rs index 341b1f9fea1..27993e23b4b 100644 --- a/gix/tests/repository/object.rs +++ b/gix/tests/repository/object.rs @@ -229,8 +229,24 @@ mod write_blob { #[test] fn from_slice() -> crate::Result { let (_tmp, repo) = empty_bare_repo()?; + let expected = hex_to_id("95d09f2b10159347eece71399a7e2e907ea3df4f"); + assert!(!repo.has_object(expected)); + let oid = repo.write_blob(b"hello world")?; - assert_eq!(oid, hex_to_id("95d09f2b10159347eece71399a7e2e907ea3df4f")); + assert_eq!(oid, expected); + + let mut other_repo = gix::open_opts(repo.path(), gix::open::Options::isolated())?; + other_repo.objects.enable_object_memory(); + assert!( + other_repo.has_object(oid), + "we definitely don't accidentally write to memory only" + ); + let in_memory_id = other_repo.write_blob("hello world - to memory")?; + assert!(!repo.has_object(in_memory_id), "the object was never written to disk…"); + assert!( + other_repo.has_object(in_memory_id), + "…and exists only in the instance that wrote it" + ); Ok(()) } From b91d90989d0b31f532242f11b9f1324e412936b4 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 5 Sep 2024 22:27:37 +0200 Subject: [PATCH 17/17] thanks clippy --- gix-packetline-blocking/src/encode/async_io.rs | 2 +- gix-packetline-blocking/src/write/async_io.rs | 2 +- gix-packetline/src/encode/async_io.rs | 2 +- gix-packetline/src/write/async_io.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gix-packetline-blocking/src/encode/async_io.rs b/gix-packetline-blocking/src/encode/async_io.rs index fbf734b685c..27566ada140 100644 --- a/gix-packetline-blocking/src/encode/async_io.rs +++ b/gix-packetline-blocking/src/encode/async_io.rs @@ -178,7 +178,7 @@ async fn prefixed_data_to_write(prefix: &[u8], data: &[u8], out: impl AsyncWrite /// Write a `text` message to `out`, which is assured to end in a newline. pub async fn text_to_write(text: &[u8], out: impl AsyncWrite + Unpin) -> io::Result { - prefixed_and_suffixed_data_to_write(&[], text, &[b'\n'], out).await + prefixed_and_suffixed_data_to_write(&[], text, b"\n", out).await } /// Write a `data` message to `out`. diff --git a/gix-packetline-blocking/src/write/async_io.rs b/gix-packetline-blocking/src/write/async_io.rs index 6684ab19fd8..a77dbe274a7 100644 --- a/gix-packetline-blocking/src/write/async_io.rs +++ b/gix-packetline-blocking/src/write/async_io.rs @@ -54,7 +54,7 @@ impl Writer { /// If called, each call to [`write()`][io::Write::write()] will write the input as text, appending a trailing newline /// if needed before writing. pub fn enable_text_mode(&mut self) { - self.inner.suffix = &[b'\n']; + self.inner.suffix = b"\n"; } } diff --git a/gix-packetline/src/encode/async_io.rs b/gix-packetline/src/encode/async_io.rs index 57b379225a8..983644c1d9f 100644 --- a/gix-packetline/src/encode/async_io.rs +++ b/gix-packetline/src/encode/async_io.rs @@ -176,7 +176,7 @@ async fn prefixed_data_to_write(prefix: &[u8], data: &[u8], out: impl AsyncWrite /// Write a `text` message to `out`, which is assured to end in a newline. pub async fn text_to_write(text: &[u8], out: impl AsyncWrite + Unpin) -> io::Result { - prefixed_and_suffixed_data_to_write(&[], text, &[b'\n'], out).await + prefixed_and_suffixed_data_to_write(&[], text, b"\n", out).await } /// Write a `data` message to `out`. diff --git a/gix-packetline/src/write/async_io.rs b/gix-packetline/src/write/async_io.rs index 35b93012675..707d2b7b576 100644 --- a/gix-packetline/src/write/async_io.rs +++ b/gix-packetline/src/write/async_io.rs @@ -52,7 +52,7 @@ impl Writer { /// If called, each call to [`write()`][io::Write::write()] will write the input as text, appending a trailing newline /// if needed before writing. pub fn enable_text_mode(&mut self) { - self.inner.suffix = &[b'\n']; + self.inner.suffix = b"\n"; } }