From a161e330fb90f23eb8760cd170316358c34f7359 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 09:49:33 +0800 Subject: [PATCH 1/9] A sketch of the parser API for ignore files (#301) Use an iterator for flexibility, while parsing what matters and leaving details to the caller, like generating the actual glob from the pre-escaped string. --- Cargo.lock | 4 +++ git-attributes/Cargo.toml | 2 ++ git-attributes/src/lib.rs | 58 ++++++++++++++++++++++++++++++ git-attributes/tests/attributes.rs | 8 +++++ 4 files changed, 72 insertions(+) create mode 100644 git-attributes/tests/attributes.rs diff --git a/Cargo.lock b/Cargo.lock index d898db32a5e..1b9539e2fd7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1073,6 +1073,10 @@ dependencies = [ [[package]] name = "git-attributes" version = "0.0.0" +dependencies = [ + "bitflags", + "bstr", +] [[package]] name = "git-bitmap" diff --git a/git-attributes/Cargo.toml b/git-attributes/Cargo.toml index 88a87c456e4..5af88cc5337 100644 --- a/git-attributes/Cargo.toml +++ b/git-attributes/Cargo.toml @@ -13,3 +13,5 @@ doctest = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +bstr = { version = "0.2.13", default-features = false, features = ["std"]} +bitflags = "1.3.2" diff --git a/git-attributes/src/lib.rs b/git-attributes/src/lib.rs index d7a83e4f525..a2041f24e2a 100644 --- a/git-attributes/src/lib.rs +++ b/git-attributes/src/lib.rs @@ -1 +1,59 @@ #![forbid(unsafe_code, rust_2018_idioms)] + +pub mod ignore { + pub mod pattern { + use bitflags::bitflags; + + bitflags! { + pub struct Mode: u32 { + const NO_DIR = 1 << 0; + // TODO: find a much better name! + const ENDS_WITH = 1 << 1; + const MUST_BE_DIR = 1 << 2; + const NEGATIVE = 1 << 3; + } + } + } +} + +pub mod parse { + pub mod ignore { + use crate::ignore; + use bstr::{BStr, BString, ByteSlice}; + + pub struct Iter<'a> { + cursor: &'a BStr, + } + + impl<'a> Iter<'a> { + pub fn new(buf: &'a [u8]) -> Self { + Iter { cursor: buf.as_bstr() } + } + } + + impl<'a> Iterator for Iter<'a> { + type Item = (BString, ignore::pattern::Mode); + + fn next(&mut self) -> Option { + if self.cursor.is_empty() { + return None; + } + let mut lines = self.cursor.lines(); + let res = None; + while let Some(line) = lines.next() { + if line.starts_with(b"#") { + continue; + } + todo!("handle escapes and trim trailing non-escaped whitespace") + } + if let Some(next_line) = lines.next() { + self.cursor = next_line.as_bstr(); + } + res + } + } + } + pub fn ignore(buf: &[u8]) -> ignore::Iter<'_> { + ignore::Iter::new(buf) + } +} diff --git a/git-attributes/tests/attributes.rs b/git-attributes/tests/attributes.rs new file mode 100644 index 00000000000..47ef9753652 --- /dev/null +++ b/git-attributes/tests/attributes.rs @@ -0,0 +1,8 @@ +mod parse { + mod ignore { + #[test] + fn comments_are_ignored() { + assert!(git_attributes::parse::ignore(b"# hello world").next().is_none()); + } + } +} From f5639b688df78648479fe1666a7aa2ed65ea6753 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 09:56:27 +0800 Subject: [PATCH 2/9] thanks clippy --- git-attributes/src/lib.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/git-attributes/src/lib.rs b/git-attributes/src/lib.rs index a2041f24e2a..2f3d9d9fffd 100644 --- a/git-attributes/src/lib.rs +++ b/git-attributes/src/lib.rs @@ -40,15 +40,13 @@ pub mod parse { } let mut lines = self.cursor.lines(); let res = None; - while let Some(line) = lines.next() { + for line in lines.by_ref() { if line.starts_with(b"#") { continue; } todo!("handle escapes and trim trailing non-escaped whitespace") } - if let Some(next_line) = lines.next() { - self.cursor = next_line.as_bstr(); - } + self.cursor = lines.next().unwrap_or(&[]).as_bstr(); res } } From 3825874ff70da2cfcc932a680e96fb67d3815e8a Mon Sep 17 00:00:00 2001 From: Svetlin Stefanov Date: Wed, 16 Mar 2022 21:46:18 +0100 Subject: [PATCH 3/9] Fix openssl-src cargo-deny. --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1b9539e2fd7..5318f52c68c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2018,9 +2018,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-src" -version = "111.17.0+1.1.1m" +version = "111.18.0+1.1.1n" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d6a336abd10814198f66e2a91ccd7336611f30334119ca8ce300536666fcf4" +checksum = "7897a926e1e8d00219127dc020130eca4292e5ca666dd592480d72c3eca2ff6c" dependencies = [ "cc", ] From 9a5d089010c5f46dc0470a8611b88c532836f841 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 11:35:45 +0800 Subject: [PATCH 4/9] Handle trailing whitespaces (#301) --- git-attributes/src/lib.rs | 51 ++++++++++++++++-- git-attributes/tests/attributes.rs | 85 ++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 3 deletions(-) diff --git a/git-attributes/src/lib.rs b/git-attributes/src/lib.rs index 2f3d9d9fffd..e5989809e8a 100644 --- a/git-attributes/src/lib.rs +++ b/git-attributes/src/lib.rs @@ -39,17 +39,62 @@ pub mod parse { return None; } let mut lines = self.cursor.lines(); - let res = None; - for line in lines.by_ref() { + let mut res = None; + for mut line in lines.by_ref() { + let mut mode = ignore::pattern::Mode::empty(); + if line.is_empty() { + continue; + }; if line.starts_with(b"#") { continue; + } else if line.starts_with(b"\\#") { + line = &line[1..]; + } else if line.starts_with(b"!") { + mode |= ignore::pattern::Mode::NEGATIVE; + line = &line[1..]; + } else if line.starts_with(b"\\!") { + line = &line[1..]; } - todo!("handle escapes and trim trailing non-escaped whitespace") + res = Some((truncate_non_escaped_trailing_spaces(line), mode)); } self.cursor = lines.next().unwrap_or(&[]).as_bstr(); res } } + + /// We always copy just because that's ultimately needed anyway, not because we always have to. + fn truncate_non_escaped_trailing_spaces(buf: &[u8]) -> BString { + match buf.rfind_not_byteset(br"\ ") { + Some(pos) if pos + 1 == buf.len() => buf.into(), // does not end in (escaped) whitespace + None => buf.into(), + Some(start_of_non_space) => { + // This seems a bit strange but attempts to recreate the git implementation while + // actually removing the escape characters before spaces. We leave other backslashes + // for escapes to be handled by `glob/globset`. + let mut res: BString = buf[..start_of_non_space + 1].into(); + + let mut trailing_bytes = buf[start_of_non_space + 1..].iter(); + let mut bare_spaces = 0; + while let Some(b) = trailing_bytes.next() { + match b { + b' ' => { + bare_spaces += 1; + } + b'\\' => { + res.extend(std::iter::repeat(b' ').take(bare_spaces)); + bare_spaces = 0; + // Skip what follows, like git does, but keep spaces if possible. + if trailing_bytes.next() == Some(&b' ') { + res.push(b' '); + } + } + _ => unreachable!("BUG: this must be either backslash or space"), + } + } + res + } + } + } } pub fn ignore(buf: &[u8]) -> ignore::Iter<'_> { ignore::Iter::new(buf) diff --git a/git-attributes/tests/attributes.rs b/git-attributes/tests/attributes.rs index 47ef9753652..3ac73492e40 100644 --- a/git-attributes/tests/attributes.rs +++ b/git-attributes/tests/attributes.rs @@ -1,8 +1,93 @@ mod parse { mod ignore { + use git_attributes::ignore; + #[test] fn comments_are_ignored() { assert!(git_attributes::parse::ignore(b"# hello world").next().is_none()); } + + #[test] + fn backslashes_before_hashes_are_no_comments() { + assert_eq!( + git_attributes::parse::ignore(br"\#hello").next(), + Some((r"#hello".into(), ignore::pattern::Mode::empty())) + ); + } + + #[test] + fn backslashes_are_part_of_the_pattern_if_not_in_specific_positions() { + assert_eq!( + git_attributes::parse::ignore(br"\hello\world").next(), + Some((r"\hello\world".into(), ignore::pattern::Mode::empty())) + ); + } + + #[test] + fn leading_exclamation_mark_negates_pattern() { + assert_eq!( + git_attributes::parse::ignore(b"!hello").next(), + Some(("hello".into(), ignore::pattern::Mode::NEGATIVE)) + ); + } + + #[test] + fn leading_exclamation_marks_can_be_escaped_with_backslash() { + assert_eq!( + git_attributes::parse::ignore(br"\!hello").next(), + Some(("!hello".into(), ignore::pattern::Mode::empty())) + ); + } + + #[test] + fn trailing_spaces_are_ignored() { + assert_eq!( + git_attributes::parse::ignore(br"a ").next(), + Some(("a".into(), ignore::pattern::Mode::empty())) + ); + assert_eq!( + git_attributes::parse::ignore(b"a\t\t ").next(), + Some(("a\t\t".into(), ignore::pattern::Mode::empty()),), + "trailing tabs are not ignored" + ); + } + #[test] + fn trailing_spaces_can_be_escaped_to_be_literal() { + assert_eq!( + git_attributes::parse::ignore(br"a \ ").next(), + Some(("a ".into(), ignore::pattern::Mode::empty())), + "a single escape in front of the last desired space is enough" + ); + assert_eq!( + git_attributes::parse::ignore(br"a b c ").next(), + Some(("a b c".into(), ignore::pattern::Mode::empty())), + "spaces in the middle are fine" + ); + assert_eq!( + git_attributes::parse::ignore(br"a\ \ \ ").next(), + Some(("a ".into(), ignore::pattern::Mode::empty())), + "one can also escape every single one" + ); + assert_eq!( + git_attributes::parse::ignore(br"a \ ").next(), + Some(("a ".into(), ignore::pattern::Mode::empty())), + "or just the one in the middle, losing the last actual space" + ); + assert_eq!( + git_attributes::parse::ignore(br"a \").next(), + Some(("a ".into(), ignore::pattern::Mode::empty())), + "escaping nothing also works as a whitespace protection" + ); + assert_eq!( + git_attributes::parse::ignore(br"a \\\ ").next(), + Some((r"a ".into(), ignore::pattern::Mode::empty())), + "strange things like these work too" + ); + assert_eq!( + git_attributes::parse::ignore(br"a \\ ").next(), + Some((r"a ".into(), ignore::pattern::Mode::empty())), + "strange things like these work as well" + ); + } } } From d95905f57e10c90d615243ec692a81404b3571da Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 12:12:00 +0800 Subject: [PATCH 5/9] implement most of the ignore flags (#301) Maybe some aren't needed as it really depends on how we do globbing, let's see. --- git-attributes/src/lib.rs | 27 ++++++++++----- git-attributes/tests/attributes.rs | 53 ++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/git-attributes/src/lib.rs b/git-attributes/src/lib.rs index e5989809e8a..402e3749b8c 100644 --- a/git-attributes/src/lib.rs +++ b/git-attributes/src/lib.rs @@ -6,9 +6,11 @@ pub mod ignore { bitflags! { pub struct Mode: u32 { - const NO_DIR = 1 << 0; + /// The pattern does not contain a sub-directory and - it doesn't contain slashes after removing the trailing one. + const NO_SUB_DIR = 1 << 0; // TODO: find a much better name! const ENDS_WITH = 1 << 1; + /// The pattern must match a directory, and not a file. const MUST_BE_DIR = 1 << 2; const NEGATIVE = 1 << 3; } @@ -45,17 +47,26 @@ pub mod parse { if line.is_empty() { continue; }; - if line.starts_with(b"#") { + if line.first() == Some(&b'#') { continue; - } else if line.starts_with(b"\\#") { - line = &line[1..]; - } else if line.starts_with(b"!") { + } else if line.first() == Some(&b'!') { mode |= ignore::pattern::Mode::NEGATIVE; line = &line[1..]; - } else if line.starts_with(b"\\!") { - line = &line[1..]; + } else if line.first() == Some(&b'\\') { + let second = line.get(1); + if second == Some(&b'!') || second == Some(&b'#') { + line = &line[1..]; + } + } + let mut line = truncate_non_escaped_trailing_spaces(line); + if line.last() == Some(&b'/') { + mode |= ignore::pattern::Mode::MUST_BE_DIR; + line.pop(); + } + if !line.contains(&b'/') { + mode |= ignore::pattern::Mode::NO_SUB_DIR; } - res = Some((truncate_non_escaped_trailing_spaces(line), mode)); + res = Some((line, mode)); } self.cursor = lines.next().unwrap_or(&[]).as_bstr(); res diff --git a/git-attributes/tests/attributes.rs b/git-attributes/tests/attributes.rs index 3ac73492e40..c75c3ea5a68 100644 --- a/git-attributes/tests/attributes.rs +++ b/git-attributes/tests/attributes.rs @@ -1,6 +1,6 @@ mod parse { mod ignore { - use git_attributes::ignore; + use git_attributes::ignore::pattern::Mode; #[test] fn comments_are_ignored() { @@ -11,7 +11,7 @@ mod parse { fn backslashes_before_hashes_are_no_comments() { assert_eq!( git_attributes::parse::ignore(br"\#hello").next(), - Some((r"#hello".into(), ignore::pattern::Mode::empty())) + Some((r"#hello".into(), Mode::NO_SUB_DIR)) ); } @@ -19,7 +19,7 @@ mod parse { fn backslashes_are_part_of_the_pattern_if_not_in_specific_positions() { assert_eq!( git_attributes::parse::ignore(br"\hello\world").next(), - Some((r"\hello\world".into(), ignore::pattern::Mode::empty())) + Some((r"\hello\world".into(), Mode::NO_SUB_DIR)) ); } @@ -27,7 +27,7 @@ mod parse { fn leading_exclamation_mark_negates_pattern() { assert_eq!( git_attributes::parse::ignore(b"!hello").next(), - Some(("hello".into(), ignore::pattern::Mode::NEGATIVE)) + Some(("hello".into(), Mode::NEGATIVE | Mode::NO_SUB_DIR)) ); } @@ -35,7 +35,32 @@ mod parse { fn leading_exclamation_marks_can_be_escaped_with_backslash() { assert_eq!( git_attributes::parse::ignore(br"\!hello").next(), - Some(("!hello".into(), ignore::pattern::Mode::empty())) + Some(("!hello".into(), Mode::NO_SUB_DIR)) + ); + } + + #[test] + fn absence_of_sub_directories_are_marked() { + assert_eq!( + git_attributes::parse::ignore(br"a/b").next(), + Some(("a/b".into(), Mode::empty())) + ); + assert_eq!( + git_attributes::parse::ignore(br"ab").next(), + Some(("ab".into(), Mode::NO_SUB_DIR)) + ); + } + + #[test] + fn trailing_slashes_are_marked_and_removed() { + assert_eq!( + git_attributes::parse::ignore(b"dir/").next(), + Some(("dir".into(), Mode::MUST_BE_DIR | Mode::NO_SUB_DIR)) + ); + assert_eq!( + git_attributes::parse::ignore(b"dir///").next(), + Some(("dir//".into(), Mode::MUST_BE_DIR)), + "but only the last slash is removed" ); } @@ -43,11 +68,11 @@ mod parse { fn trailing_spaces_are_ignored() { assert_eq!( git_attributes::parse::ignore(br"a ").next(), - Some(("a".into(), ignore::pattern::Mode::empty())) + Some(("a".into(), Mode::NO_SUB_DIR)) ); assert_eq!( git_attributes::parse::ignore(b"a\t\t ").next(), - Some(("a\t\t".into(), ignore::pattern::Mode::empty()),), + Some(("a\t\t".into(), Mode::NO_SUB_DIR)), "trailing tabs are not ignored" ); } @@ -55,37 +80,37 @@ mod parse { fn trailing_spaces_can_be_escaped_to_be_literal() { assert_eq!( git_attributes::parse::ignore(br"a \ ").next(), - Some(("a ".into(), ignore::pattern::Mode::empty())), + Some(("a ".into(), Mode::NO_SUB_DIR)), "a single escape in front of the last desired space is enough" ); assert_eq!( git_attributes::parse::ignore(br"a b c ").next(), - Some(("a b c".into(), ignore::pattern::Mode::empty())), + Some(("a b c".into(), Mode::NO_SUB_DIR)), "spaces in the middle are fine" ); assert_eq!( git_attributes::parse::ignore(br"a\ \ \ ").next(), - Some(("a ".into(), ignore::pattern::Mode::empty())), + Some(("a ".into(), Mode::NO_SUB_DIR)), "one can also escape every single one" ); assert_eq!( git_attributes::parse::ignore(br"a \ ").next(), - Some(("a ".into(), ignore::pattern::Mode::empty())), + Some(("a ".into(), Mode::NO_SUB_DIR)), "or just the one in the middle, losing the last actual space" ); assert_eq!( git_attributes::parse::ignore(br"a \").next(), - Some(("a ".into(), ignore::pattern::Mode::empty())), + Some(("a ".into(), Mode::NO_SUB_DIR)), "escaping nothing also works as a whitespace protection" ); assert_eq!( git_attributes::parse::ignore(br"a \\\ ").next(), - Some((r"a ".into(), ignore::pattern::Mode::empty())), + Some((r"a ".into(), Mode::NO_SUB_DIR)), "strange things like these work too" ); assert_eq!( git_attributes::parse::ignore(br"a \\ ").next(), - Some((r"a ".into(), ignore::pattern::Mode::empty())), + Some((r"a ".into(), Mode::NO_SUB_DIR)), "strange things like these work as well" ); } From 0906bedc7525979eb02192beb007f096cd6ac45f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 15:59:42 +0800 Subject: [PATCH 6/9] Make line number accessible (#301) --- git-attributes/src/lib.rs | 11 +++++++--- git-attributes/tests/attributes.rs | 34 +++++++++++++++--------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/git-attributes/src/lib.rs b/git-attributes/src/lib.rs index 402e3749b8c..eaa81d72bd6 100644 --- a/git-attributes/src/lib.rs +++ b/git-attributes/src/lib.rs @@ -25,16 +25,20 @@ pub mod parse { pub struct Iter<'a> { cursor: &'a BStr, + line_no: usize, } impl<'a> Iter<'a> { pub fn new(buf: &'a [u8]) -> Self { - Iter { cursor: buf.as_bstr() } + Iter { + cursor: buf.as_bstr(), + line_no: 0, + } } } impl<'a> Iterator for Iter<'a> { - type Item = (BString, ignore::pattern::Mode); + type Item = (BString, ignore::pattern::Mode, usize); fn next(&mut self) -> Option { if self.cursor.is_empty() { @@ -43,6 +47,7 @@ pub mod parse { let mut lines = self.cursor.lines(); let mut res = None; for mut line in lines.by_ref() { + self.line_no += 1; let mut mode = ignore::pattern::Mode::empty(); if line.is_empty() { continue; @@ -66,7 +71,7 @@ pub mod parse { if !line.contains(&b'/') { mode |= ignore::pattern::Mode::NO_SUB_DIR; } - res = Some((line, mode)); + res = Some((line, mode, self.line_no)); } self.cursor = lines.next().unwrap_or(&[]).as_bstr(); res diff --git a/git-attributes/tests/attributes.rs b/git-attributes/tests/attributes.rs index c75c3ea5a68..daa502831d7 100644 --- a/git-attributes/tests/attributes.rs +++ b/git-attributes/tests/attributes.rs @@ -11,7 +11,7 @@ mod parse { fn backslashes_before_hashes_are_no_comments() { assert_eq!( git_attributes::parse::ignore(br"\#hello").next(), - Some((r"#hello".into(), Mode::NO_SUB_DIR)) + Some((r"#hello".into(), Mode::NO_SUB_DIR, 1)) ); } @@ -19,7 +19,7 @@ mod parse { fn backslashes_are_part_of_the_pattern_if_not_in_specific_positions() { assert_eq!( git_attributes::parse::ignore(br"\hello\world").next(), - Some((r"\hello\world".into(), Mode::NO_SUB_DIR)) + Some((r"\hello\world".into(), Mode::NO_SUB_DIR, 1)) ); } @@ -27,7 +27,7 @@ mod parse { fn leading_exclamation_mark_negates_pattern() { assert_eq!( git_attributes::parse::ignore(b"!hello").next(), - Some(("hello".into(), Mode::NEGATIVE | Mode::NO_SUB_DIR)) + Some(("hello".into(), Mode::NEGATIVE | Mode::NO_SUB_DIR, 1)) ); } @@ -35,7 +35,7 @@ mod parse { fn leading_exclamation_marks_can_be_escaped_with_backslash() { assert_eq!( git_attributes::parse::ignore(br"\!hello").next(), - Some(("!hello".into(), Mode::NO_SUB_DIR)) + Some(("!hello".into(), Mode::NO_SUB_DIR, 1)) ); } @@ -43,11 +43,11 @@ mod parse { fn absence_of_sub_directories_are_marked() { assert_eq!( git_attributes::parse::ignore(br"a/b").next(), - Some(("a/b".into(), Mode::empty())) + Some(("a/b".into(), Mode::empty(), 1)) ); assert_eq!( git_attributes::parse::ignore(br"ab").next(), - Some(("ab".into(), Mode::NO_SUB_DIR)) + Some(("ab".into(), Mode::NO_SUB_DIR, 1)) ); } @@ -55,11 +55,11 @@ mod parse { fn trailing_slashes_are_marked_and_removed() { assert_eq!( git_attributes::parse::ignore(b"dir/").next(), - Some(("dir".into(), Mode::MUST_BE_DIR | Mode::NO_SUB_DIR)) + Some(("dir".into(), Mode::MUST_BE_DIR | Mode::NO_SUB_DIR, 1)) ); assert_eq!( git_attributes::parse::ignore(b"dir///").next(), - Some(("dir//".into(), Mode::MUST_BE_DIR)), + Some(("dir//".into(), Mode::MUST_BE_DIR, 1)), "but only the last slash is removed" ); } @@ -68,11 +68,11 @@ mod parse { fn trailing_spaces_are_ignored() { assert_eq!( git_attributes::parse::ignore(br"a ").next(), - Some(("a".into(), Mode::NO_SUB_DIR)) + Some(("a".into(), Mode::NO_SUB_DIR, 1)) ); assert_eq!( git_attributes::parse::ignore(b"a\t\t ").next(), - Some(("a\t\t".into(), Mode::NO_SUB_DIR)), + Some(("a\t\t".into(), Mode::NO_SUB_DIR, 1)), "trailing tabs are not ignored" ); } @@ -80,37 +80,37 @@ mod parse { fn trailing_spaces_can_be_escaped_to_be_literal() { assert_eq!( git_attributes::parse::ignore(br"a \ ").next(), - Some(("a ".into(), Mode::NO_SUB_DIR)), + Some(("a ".into(), Mode::NO_SUB_DIR, 1)), "a single escape in front of the last desired space is enough" ); assert_eq!( git_attributes::parse::ignore(br"a b c ").next(), - Some(("a b c".into(), Mode::NO_SUB_DIR)), + Some(("a b c".into(), Mode::NO_SUB_DIR, 1)), "spaces in the middle are fine" ); assert_eq!( git_attributes::parse::ignore(br"a\ \ \ ").next(), - Some(("a ".into(), Mode::NO_SUB_DIR)), + Some(("a ".into(), Mode::NO_SUB_DIR, 1)), "one can also escape every single one" ); assert_eq!( git_attributes::parse::ignore(br"a \ ").next(), - Some(("a ".into(), Mode::NO_SUB_DIR)), + Some(("a ".into(), Mode::NO_SUB_DIR, 1)), "or just the one in the middle, losing the last actual space" ); assert_eq!( git_attributes::parse::ignore(br"a \").next(), - Some(("a ".into(), Mode::NO_SUB_DIR)), + Some(("a ".into(), Mode::NO_SUB_DIR, 1)), "escaping nothing also works as a whitespace protection" ); assert_eq!( git_attributes::parse::ignore(br"a \\\ ").next(), - Some((r"a ".into(), Mode::NO_SUB_DIR)), + Some((r"a ".into(), Mode::NO_SUB_DIR, 1)), "strange things like these work too" ); assert_eq!( git_attributes::parse::ignore(br"a \\ ").next(), - Some((r"a ".into(), Mode::NO_SUB_DIR)), + Some((r"a ".into(), Mode::NO_SUB_DIR, 1)), "strange things like these work as well" ); } From 6a37eee5292bacdaca8c97608e900872128ae9bf Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 16:19:38 +0800 Subject: [PATCH 7/9] iterator actually iterates all lines in a buffer (#301) --- Cargo.lock | 1 + git-attributes/Cargo.toml | 3 + git-attributes/src/ignore.rs | 15 +++ git-attributes/src/lib.rs | 117 +----------------- git-attributes/src/parse/ignore.rs | 106 ++++++++++++++++ git-attributes/src/parse/mod.rs | 5 + git-attributes/tests/attributes.rs | 31 +++++ .../tests/fixtures/ignore/various.txt | 14 +++ 8 files changed, 177 insertions(+), 115 deletions(-) create mode 100644 git-attributes/src/ignore.rs create mode 100644 git-attributes/src/parse/ignore.rs create mode 100644 git-attributes/src/parse/mod.rs create mode 100644 git-attributes/tests/fixtures/ignore/various.txt diff --git a/Cargo.lock b/Cargo.lock index 5318f52c68c..a4d78cf2bdf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1076,6 +1076,7 @@ version = "0.0.0" dependencies = [ "bitflags", "bstr", + "git-testtools", ] [[package]] diff --git a/git-attributes/Cargo.toml b/git-attributes/Cargo.toml index 5af88cc5337..5e8ac460d10 100644 --- a/git-attributes/Cargo.toml +++ b/git-attributes/Cargo.toml @@ -15,3 +15,6 @@ doctest = false [dependencies] bstr = { version = "0.2.13", default-features = false, features = ["std"]} bitflags = "1.3.2" + +[dev-dependencies] +git-testtools = { path = "../tests/tools"} diff --git a/git-attributes/src/ignore.rs b/git-attributes/src/ignore.rs new file mode 100644 index 00000000000..c587e84d6ad --- /dev/null +++ b/git-attributes/src/ignore.rs @@ -0,0 +1,15 @@ +pub mod pattern { + use bitflags::bitflags; + + bitflags! { + pub struct Mode: u32 { + /// The pattern does not contain a sub-directory and - it doesn't contain slashes after removing the trailing one. + const NO_SUB_DIR = 1 << 0; + // TODO: find a much better name! + const ENDS_WITH = 1 << 1; + /// The pattern must match a directory, and not a file. + const MUST_BE_DIR = 1 << 2; + const NEGATIVE = 1 << 3; + } + } +} diff --git a/git-attributes/src/lib.rs b/git-attributes/src/lib.rs index eaa81d72bd6..ca3616728c5 100644 --- a/git-attributes/src/lib.rs +++ b/git-attributes/src/lib.rs @@ -1,118 +1,5 @@ #![forbid(unsafe_code, rust_2018_idioms)] -pub mod ignore { - pub mod pattern { - use bitflags::bitflags; +pub mod ignore; - bitflags! { - pub struct Mode: u32 { - /// The pattern does not contain a sub-directory and - it doesn't contain slashes after removing the trailing one. - const NO_SUB_DIR = 1 << 0; - // TODO: find a much better name! - const ENDS_WITH = 1 << 1; - /// The pattern must match a directory, and not a file. - const MUST_BE_DIR = 1 << 2; - const NEGATIVE = 1 << 3; - } - } - } -} - -pub mod parse { - pub mod ignore { - use crate::ignore; - use bstr::{BStr, BString, ByteSlice}; - - pub struct Iter<'a> { - cursor: &'a BStr, - line_no: usize, - } - - impl<'a> Iter<'a> { - pub fn new(buf: &'a [u8]) -> Self { - Iter { - cursor: buf.as_bstr(), - line_no: 0, - } - } - } - - impl<'a> Iterator for Iter<'a> { - type Item = (BString, ignore::pattern::Mode, usize); - - fn next(&mut self) -> Option { - if self.cursor.is_empty() { - return None; - } - let mut lines = self.cursor.lines(); - let mut res = None; - for mut line in lines.by_ref() { - self.line_no += 1; - let mut mode = ignore::pattern::Mode::empty(); - if line.is_empty() { - continue; - }; - if line.first() == Some(&b'#') { - continue; - } else if line.first() == Some(&b'!') { - mode |= ignore::pattern::Mode::NEGATIVE; - line = &line[1..]; - } else if line.first() == Some(&b'\\') { - let second = line.get(1); - if second == Some(&b'!') || second == Some(&b'#') { - line = &line[1..]; - } - } - let mut line = truncate_non_escaped_trailing_spaces(line); - if line.last() == Some(&b'/') { - mode |= ignore::pattern::Mode::MUST_BE_DIR; - line.pop(); - } - if !line.contains(&b'/') { - mode |= ignore::pattern::Mode::NO_SUB_DIR; - } - res = Some((line, mode, self.line_no)); - } - self.cursor = lines.next().unwrap_or(&[]).as_bstr(); - res - } - } - - /// We always copy just because that's ultimately needed anyway, not because we always have to. - fn truncate_non_escaped_trailing_spaces(buf: &[u8]) -> BString { - match buf.rfind_not_byteset(br"\ ") { - Some(pos) if pos + 1 == buf.len() => buf.into(), // does not end in (escaped) whitespace - None => buf.into(), - Some(start_of_non_space) => { - // This seems a bit strange but attempts to recreate the git implementation while - // actually removing the escape characters before spaces. We leave other backslashes - // for escapes to be handled by `glob/globset`. - let mut res: BString = buf[..start_of_non_space + 1].into(); - - let mut trailing_bytes = buf[start_of_non_space + 1..].iter(); - let mut bare_spaces = 0; - while let Some(b) = trailing_bytes.next() { - match b { - b' ' => { - bare_spaces += 1; - } - b'\\' => { - res.extend(std::iter::repeat(b' ').take(bare_spaces)); - bare_spaces = 0; - // Skip what follows, like git does, but keep spaces if possible. - if trailing_bytes.next() == Some(&b' ') { - res.push(b' '); - } - } - _ => unreachable!("BUG: this must be either backslash or space"), - } - } - res - } - } - } - } - pub fn ignore(buf: &[u8]) -> ignore::Iter<'_> { - ignore::Iter::new(buf) - } -} +pub mod parse; diff --git a/git-attributes/src/parse/ignore.rs b/git-attributes/src/parse/ignore.rs new file mode 100644 index 00000000000..281c685c266 --- /dev/null +++ b/git-attributes/src/parse/ignore.rs @@ -0,0 +1,106 @@ +use crate::ignore; +use bstr::{BStr, BString, ByteSlice}; + +pub struct Iter<'a> { + cursor: &'a BStr, + line_no: usize, +} + +impl<'a> Iter<'a> { + pub fn new(buf: &'a [u8]) -> Self { + Iter { + cursor: buf.as_bstr(), + line_no: 0, + } + } +} + +impl<'a> Iterator for Iter<'a> { + type Item = (BString, ignore::pattern::Mode, usize); + + fn next(&mut self) -> Option { + if self.cursor.is_empty() { + return None; + } + let mut lines = self.cursor.lines_with_terminator(); + let mut res = None; + let mut offset = 0; + for mut line in lines.by_ref() { + self.line_no += 1; + offset += line.len(); + line = trim_newline(line); + let mut mode = ignore::pattern::Mode::empty(); + if line.is_empty() { + continue; + }; + if line.first() == Some(&b'#') { + continue; + } else if line.first() == Some(&b'!') { + mode |= ignore::pattern::Mode::NEGATIVE; + line = &line[1..]; + } else if line.first() == Some(&b'\\') { + let second = line.get(1); + if second == Some(&b'!') || second == Some(&b'#') { + line = &line[1..]; + } + } + let mut line = truncate_non_escaped_trailing_spaces(line); + if line.last() == Some(&b'/') { + mode |= ignore::pattern::Mode::MUST_BE_DIR; + line.pop(); + } + if !line.contains(&b'/') { + mode |= ignore::pattern::Mode::NO_SUB_DIR; + } + res = Some((line, mode, self.line_no)); + break; + } + self.cursor = &self.cursor[offset..]; + res + } +} + +#[inline] +fn trim_newline(mut line: &[u8]) -> &[u8] { + if line.last_byte() == Some(b'\n') { + line = &line[..line.len() - 1]; + if line.last_byte() == Some(b'\r') { + line = &line[..line.len() - 1]; + } + } + line +} + +/// We always copy just because that's ultimately needed anyway, not because we always have to. +fn truncate_non_escaped_trailing_spaces(buf: &[u8]) -> BString { + match buf.rfind_not_byteset(br"\ ") { + Some(pos) if pos + 1 == buf.len() => buf.into(), // does not end in (escaped) whitespace + None => buf.into(), + Some(start_of_non_space) => { + // This seems a bit strange but attempts to recreate the git implementation while + // actually removing the escape characters before spaces. We leave other backslashes + // for escapes to be handled by `glob/globset`. + let mut res: BString = buf[..start_of_non_space + 1].into(); + + let mut trailing_bytes = buf[start_of_non_space + 1..].iter(); + let mut bare_spaces = 0; + while let Some(b) = trailing_bytes.next() { + match b { + b' ' => { + bare_spaces += 1; + } + b'\\' => { + res.extend(std::iter::repeat(b' ').take(bare_spaces)); + bare_spaces = 0; + // Skip what follows, like git does, but keep spaces if possible. + if trailing_bytes.next() == Some(&b' ') { + res.push(b' '); + } + } + _ => unreachable!("BUG: this must be either backslash or space"), + } + } + res + } + } +} diff --git a/git-attributes/src/parse/mod.rs b/git-attributes/src/parse/mod.rs new file mode 100644 index 00000000000..03b01b5790e --- /dev/null +++ b/git-attributes/src/parse/mod.rs @@ -0,0 +1,5 @@ +pub mod ignore; + +pub fn ignore(buf: &[u8]) -> ignore::Iter<'_> { + ignore::Iter::new(buf) +} diff --git a/git-attributes/tests/attributes.rs b/git-attributes/tests/attributes.rs index daa502831d7..a422bf5ec12 100644 --- a/git-attributes/tests/attributes.rs +++ b/git-attributes/tests/attributes.rs @@ -1,6 +1,37 @@ mod parse { mod ignore { use git_attributes::ignore::pattern::Mode; + use git_testtools::fixture_path; + + #[test] + fn line_numbers_are_counted_correctly() { + let ignore = std::fs::read(fixture_path("ignore/various.txt")).unwrap(); + let actual: Vec<_> = git_attributes::parse::ignore(&ignore).collect(); + assert_eq!( + actual, + vec![ + ("*.[oa]".into(), Mode::NO_SUB_DIR, 2), + ("*.html".into(), Mode::NO_SUB_DIR, 5), + ("foo.html".into(), Mode::NO_SUB_DIR | Mode::NEGATIVE, 8), + ("/*".into(), Mode::empty(), 11), + ("/foo".into(), Mode::NEGATIVE, 12), + ("/foo/*".into(), Mode::empty(), 13), + ("/foo/bar".into(), Mode::NEGATIVE, 14) + ] + ); + } + + #[test] + fn line_endings_can_be_windows_or_unix() { + assert_eq!( + git_attributes::parse::ignore(b"unix\nwindows\r\nlast").collect::>(), + vec![ + (r"unix".into(), Mode::NO_SUB_DIR, 1), + (r"windows".into(), Mode::NO_SUB_DIR, 2), + (r"last".into(), Mode::NO_SUB_DIR, 3) + ] + ); + } #[test] fn comments_are_ignored() { diff --git a/git-attributes/tests/fixtures/ignore/various.txt b/git-attributes/tests/fixtures/ignore/various.txt new file mode 100644 index 00000000000..214e46f2d54 --- /dev/null +++ b/git-attributes/tests/fixtures/ignore/various.txt @@ -0,0 +1,14 @@ +# ignore objects and archives, anywhere in the tree. +*.[oa] + +# ignore generated html files, +*.html + +# except foo.html which is maintained by hand +!foo.html + +# exclude everything except directory foo/bar +/* +!/foo +/foo/* +!/foo/bar From e9d222a19541e2c75370d3e2feeb24beec093859 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 16:36:40 +0800 Subject: [PATCH 8/9] Support for 'ends_with' matching mode (#301) --- git-attributes/src/ignore.rs | 2 +- git-attributes/src/parse/ignore.rs | 5 ++++- git-attributes/tests/attributes.rs | 28 +++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/git-attributes/src/ignore.rs b/git-attributes/src/ignore.rs index c587e84d6ad..1bde72dc586 100644 --- a/git-attributes/src/ignore.rs +++ b/git-attributes/src/ignore.rs @@ -5,7 +5,7 @@ pub mod pattern { pub struct Mode: u32 { /// The pattern does not contain a sub-directory and - it doesn't contain slashes after removing the trailing one. const NO_SUB_DIR = 1 << 0; - // TODO: find a much better name! + /// A pattern that is '*literal', meaning that it ends with what's given here const ENDS_WITH = 1 << 1; /// The pattern must match a directory, and not a file. const MUST_BE_DIR = 1 << 2; diff --git a/git-attributes/src/parse/ignore.rs b/git-attributes/src/parse/ignore.rs index 281c685c266..1fb6342d9ef 100644 --- a/git-attributes/src/parse/ignore.rs +++ b/git-attributes/src/parse/ignore.rs @@ -24,7 +24,7 @@ impl<'a> Iterator for Iter<'a> { } let mut lines = self.cursor.lines_with_terminator(); let mut res = None; - let mut offset = 0; + let mut offset = 0; // TODO: prefer `lines()` with `into_bytes()` instead. for mut line in lines.by_ref() { self.line_no += 1; offset += line.len(); @@ -52,6 +52,9 @@ impl<'a> Iterator for Iter<'a> { if !line.contains(&b'/') { mode |= ignore::pattern::Mode::NO_SUB_DIR; } + if line.first() == Some(&b'*') && line[1..].find_byteset(br"*?[\").is_none() { + mode |= ignore::pattern::Mode::ENDS_WITH; + } res = Some((line, mode, self.line_no)); break; } diff --git a/git-attributes/tests/attributes.rs b/git-attributes/tests/attributes.rs index a422bf5ec12..05840a46a13 100644 --- a/git-attributes/tests/attributes.rs +++ b/git-attributes/tests/attributes.rs @@ -11,7 +11,7 @@ mod parse { actual, vec![ ("*.[oa]".into(), Mode::NO_SUB_DIR, 2), - ("*.html".into(), Mode::NO_SUB_DIR, 5), + ("*.html".into(), Mode::NO_SUB_DIR | Mode::ENDS_WITH, 5), ("foo.html".into(), Mode::NO_SUB_DIR | Mode::NEGATIVE, 8), ("/*".into(), Mode::empty(), 11), ("/foo".into(), Mode::NEGATIVE, 12), @@ -33,6 +33,32 @@ mod parse { ); } + #[test] + fn mark_ends_with_pattern_specifically() { + assert_eq!( + git_attributes::parse::ignore(br"*literal").next(), + Some((r"*literal".into(), Mode::NO_SUB_DIR | Mode::ENDS_WITH, 1)) + ); + assert_eq!( + git_attributes::parse::ignore(br"**literal").next(), + Some((r"**literal".into(), Mode::NO_SUB_DIR, 1)), + "double-asterisk won't allow for fast comparisons" + ); + assert_eq!( + git_attributes::parse::ignore(br"*litera[l]").next(), + Some((r"*litera[l]".into(), Mode::NO_SUB_DIR, 1)) + ); + assert_eq!( + git_attributes::parse::ignore(br"*litera?").next(), + Some((r"*litera?".into(), Mode::NO_SUB_DIR, 1)) + ); + assert_eq!( + git_attributes::parse::ignore(br"*litera\?").next(), + Some((r"*litera\?".into(), Mode::NO_SUB_DIR, 1)), + "for now we don't handle escapes properly like git seems to do" + ); + } + #[test] fn comments_are_ignored() { assert!(git_attributes::parse::ignore(b"# hello world").next().is_none()); From 9a9115f8db0a84818600f125b1185d1773f10d39 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 18 Mar 2022 16:54:42 +0800 Subject: [PATCH 9/9] refactor (#301) --- git-attributes/src/parse/ignore.rs | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/git-attributes/src/parse/ignore.rs b/git-attributes/src/parse/ignore.rs index 1fb6342d9ef..b9ebd783ffd 100644 --- a/git-attributes/src/parse/ignore.rs +++ b/git-attributes/src/parse/ignore.rs @@ -1,15 +1,15 @@ use crate::ignore; -use bstr::{BStr, BString, ByteSlice}; +use bstr::{BString, ByteSlice}; pub struct Iter<'a> { - cursor: &'a BStr, + lines: bstr::Lines<'a>, line_no: usize, } impl<'a> Iter<'a> { pub fn new(buf: &'a [u8]) -> Self { Iter { - cursor: buf.as_bstr(), + lines: buf.lines(), line_no: 0, } } @@ -19,16 +19,9 @@ impl<'a> Iterator for Iter<'a> { type Item = (BString, ignore::pattern::Mode, usize); fn next(&mut self) -> Option { - if self.cursor.is_empty() { - return None; - } - let mut lines = self.cursor.lines_with_terminator(); let mut res = None; - let mut offset = 0; // TODO: prefer `lines()` with `into_bytes()` instead. - for mut line in lines.by_ref() { + for mut line in self.lines.by_ref() { self.line_no += 1; - offset += line.len(); - line = trim_newline(line); let mut mode = ignore::pattern::Mode::empty(); if line.is_empty() { continue; @@ -58,22 +51,10 @@ impl<'a> Iterator for Iter<'a> { res = Some((line, mode, self.line_no)); break; } - self.cursor = &self.cursor[offset..]; res } } -#[inline] -fn trim_newline(mut line: &[u8]) -> &[u8] { - if line.last_byte() == Some(b'\n') { - line = &line[..line.len() - 1]; - if line.last_byte() == Some(b'\r') { - line = &line[..line.len() - 1]; - } - } - line -} - /// We always copy just because that's ultimately needed anyway, not because we always have to. fn truncate_non_escaped_trailing_spaces(buf: &[u8]) -> BString { match buf.rfind_not_byteset(br"\ ") {