Skip to content

parse git-ignore files #359

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions git-attributes/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,8 @@ doctest = false
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
bstr = { version = "0.2.13", default-features = false, features = ["std"]}
bitflags = "1.3.2"

[dev-dependencies]
git-testtools = { path = "../tests/tools"}
15 changes: 15 additions & 0 deletions git-attributes/src/ignore.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/// Types describing properties of a single ignore pattern.
pub mod pattern {
use bitflags::bitflags;

bitflags! {
/// Flags describing a pattern, stored as individual bits of a `u32`.
pub struct Mode: u32 {
/// The pattern does not contain a sub-directory, i.e. it contains no slashes once a single trailing slash was removed.
const NO_SUB_DIR = 1 << 0;
/// The pattern has the form `*literal`: a leading asterisk followed by text that must form the end of the matched name.
const ENDS_WITH = 1 << 1;
/// The pattern must match a directory, and not a file.
const MUST_BE_DIR = 1 << 2;
/// The pattern is negated, as expressed by a leading `!` in an ignore file.
const NEGATIVE = 1 << 3;
}
}
}
4 changes: 4 additions & 0 deletions git-attributes/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
#![forbid(unsafe_code, rust_2018_idioms)]

pub mod ignore;

pub mod parse;
90 changes: 90 additions & 0 deletions git-attributes/src/parse/ignore.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
use crate::ignore;
use bstr::{BString, ByteSlice};

/// An iterator over the patterns found in the lines of an ignore-file buffer.
pub struct Iter<'a> {
/// The lines of the input buffer that have not been consumed yet.
lines: bstr::Lines<'a>,
/// The number of lines taken from `lines` so far; starts at 0, so the first line is number 1.
line_no: usize,
}

impl<'a> Iter<'a> {
    /// Create an iterator over the lines of `buf`, with the line counter starting at zero
    /// so that the first line read is reported as line 1.
    pub fn new(buf: &'a [u8]) -> Self {
        let lines = buf.lines();
        Self { lines, line_no: 0 }
    }
}

impl<'a> Iterator for Iter<'a> {
    /// `(pattern, mode, line_number)`, where the line number also counts skipped lines.
    type Item = (BString, ignore::pattern::Mode, usize);

    /// Consume lines until one holding a pattern is found and return it along with its flags
    /// and line number, or `None` once all lines are consumed.
    ///
    /// Empty lines and lines starting with `#` are skipped, but still counted.
    fn next(&mut self) -> Option<Self::Item> {
        for raw in self.lines.by_ref() {
            self.line_no += 1;

            // Classify the line by its first byte, stripping the prefix byte where it is consumed.
            let (unprefixed, mut flags) = match raw.first() {
                // Nothing to parse in blank lines and comments.
                None | Some(b'#') => continue,
                Some(b'!') => (&raw[1..], ignore::pattern::Mode::NEGATIVE),
                // A backslash only acts as an escape here if it protects a leading `!` or `#`.
                Some(b'\\') if matches!(raw.get(1), Some(b'!') | Some(b'#')) => {
                    (&raw[1..], ignore::pattern::Mode::empty())
                }
                Some(_) => (raw, ignore::pattern::Mode::empty()),
            };

            let mut pattern = truncate_non_escaped_trailing_spaces(unprefixed);

            // Only a single trailing slash is removed; further slashes remain part of the pattern.
            if pattern.last() == Some(&b'/') {
                flags.insert(ignore::pattern::Mode::MUST_BE_DIR);
                pattern.pop();
            }
            if !pattern.contains(&b'/') {
                flags.insert(ignore::pattern::Mode::NO_SUB_DIR);
            }
            // `*literal`: a leading asterisk not followed by any other glob-special byte.
            if pattern.first() == Some(&b'*') && pattern[1..].find_byteset(br"*?[\").is_none() {
                flags.insert(ignore::pattern::Mode::ENDS_WITH);
            }

            return Some((pattern, flags, self.line_no));
        }
        None
    }
}

/// Return an owned copy of `buf` with unescaped trailing spaces removed.
///
/// Only the run of spaces and backslashes at the very end of `buf` is inspected; everything up
/// to and including the last byte that is neither of the two is copied verbatim.
/// The copy is made unconditionally as callers need an owned value anyway, even though the
/// input often needs no change.
fn truncate_non_escaped_trailing_spaces(buf: &[u8]) -> BString {
    let last_regular = match buf.rfind_not_byteset(br"\ ") {
        // Consists solely of spaces and backslashes (or is empty): returned unchanged.
        None => return buf.into(),
        // Does not end in (escaped) whitespace.
        Some(idx) if idx + 1 == buf.len() => return buf.into(),
        Some(idx) => idx,
    };

    // Mirrors what git does, but additionally drops the escape characters in front of spaces.
    // Backslashes elsewhere in the pattern are left for the glob matcher to interpret.
    let (head, tail) = buf.split_at(last_regular + 1);
    let mut out: BString = head.into();

    let mut pending_spaces = 0;
    let mut tail = tail.iter();
    while let Some(byte) = tail.next() {
        match byte {
            // Unescaped spaces are only kept if a backslash follows them later on.
            b' ' => pending_spaces += 1,
            b'\\' => {
                out.extend(std::iter::repeat(b' ').take(pending_spaces));
                pending_spaces = 0;
                // The byte after the backslash is always consumed, and kept only if it is a space.
                if tail.next() == Some(&b' ') {
                    out.push(b' ');
                }
            }
            _ => unreachable!("BUG: this must be either backslash or space"),
        }
    }
    out
}
5 changes: 5 additions & 0 deletions git-attributes/src/parse/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pub mod ignore;

/// Return an iterator over the patterns contained in `buf`, the content of an ignore file.
///
/// Each item is a `(pattern, mode, line_number)` tuple.
pub fn ignore(buf: &[u8]) -> ignore::Iter<'_> {
ignore::Iter::new(buf)
}
175 changes: 175 additions & 0 deletions git-attributes/tests/attributes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// Tests for parsing the lines of ignore files into `(pattern, mode, line_number)` tuples.
mod parse {
mod ignore {
use git_attributes::ignore::pattern::Mode;
use git_testtools::fixture_path;

// Parses a multi-line fixture and checks that the reported line numbers count
// skipped (blank and comment) lines as well.
#[test]
fn line_numbers_are_counted_correctly() {
let ignore = std::fs::read(fixture_path("ignore/various.txt")).unwrap();
let actual: Vec<_> = git_attributes::parse::ignore(&ignore).collect();
assert_eq!(
actual,
vec![
("*.[oa]".into(), Mode::NO_SUB_DIR, 2),
("*.html".into(), Mode::NO_SUB_DIR | Mode::ENDS_WITH, 5),
("foo.html".into(), Mode::NO_SUB_DIR | Mode::NEGATIVE, 8),
("/*".into(), Mode::empty(), 11),
("/foo".into(), Mode::NEGATIVE, 12),
("/foo/*".into(), Mode::empty(), 13),
("/foo/bar".into(), Mode::NEGATIVE, 14)
]
);
}

// Both `\n` and `\r\n` terminate a line, and a final line needs no terminator.
#[test]
fn line_endings_can_be_windows_or_unix() {
assert_eq!(
git_attributes::parse::ignore(b"unix\nwindows\r\nlast").collect::<Vec<_>>(),
vec![
(r"unix".into(), Mode::NO_SUB_DIR, 1),
(r"windows".into(), Mode::NO_SUB_DIR, 2),
(r"last".into(), Mode::NO_SUB_DIR, 3)
]
);
}

// `ENDS_WITH` is only set for a leading `*` that is not followed by `*`, `?`, `[` or `\`.
#[test]
fn mark_ends_with_pattern_specifically() {
assert_eq!(
git_attributes::parse::ignore(br"*literal").next(),
Some((r"*literal".into(), Mode::NO_SUB_DIR | Mode::ENDS_WITH, 1))
);
assert_eq!(
git_attributes::parse::ignore(br"**literal").next(),
Some((r"**literal".into(), Mode::NO_SUB_DIR, 1)),
"double-asterisk won't allow for fast comparisons"
);
assert_eq!(
git_attributes::parse::ignore(br"*litera[l]").next(),
Some((r"*litera[l]".into(), Mode::NO_SUB_DIR, 1))
);
assert_eq!(
git_attributes::parse::ignore(br"*litera?").next(),
Some((r"*litera?".into(), Mode::NO_SUB_DIR, 1))
);
assert_eq!(
git_attributes::parse::ignore(br"*litera\?").next(),
Some((r"*litera\?".into(), Mode::NO_SUB_DIR, 1)),
"for now we don't handle escapes properly like git seems to do"
);
}

// A line starting with `#` yields no pattern at all.
#[test]
fn comments_are_ignored() {
assert!(git_attributes::parse::ignore(b"# hello world").next().is_none());
}

// An escaped leading `#` is a literal hash; the escaping backslash is removed.
#[test]
fn backslashes_before_hashes_are_no_comments() {
assert_eq!(
git_attributes::parse::ignore(br"\#hello").next(),
Some((r"#hello".into(), Mode::NO_SUB_DIR, 1))
);
}

// Backslashes that do not escape a leading `!` or `#` are kept verbatim.
#[test]
fn backslashes_are_part_of_the_pattern_if_not_in_specific_positions() {
assert_eq!(
git_attributes::parse::ignore(br"\hello\world").next(),
Some((r"\hello\world".into(), Mode::NO_SUB_DIR, 1))
);
}

// A leading `!` is removed from the pattern and recorded as `NEGATIVE`.
#[test]
fn leading_exclamation_mark_negates_pattern() {
assert_eq!(
git_attributes::parse::ignore(b"!hello").next(),
Some(("hello".into(), Mode::NEGATIVE | Mode::NO_SUB_DIR, 1))
);
}

// An escaped leading `!` is a literal exclamation mark and does not negate.
#[test]
fn leading_exclamation_marks_can_be_escaped_with_backslash() {
assert_eq!(
git_attributes::parse::ignore(br"\!hello").next(),
Some(("!hello".into(), Mode::NO_SUB_DIR, 1))
);
}

// `NO_SUB_DIR` is set only for patterns without any slash.
#[test]
fn absence_of_sub_directories_are_marked() {
assert_eq!(
git_attributes::parse::ignore(br"a/b").next(),
Some(("a/b".into(), Mode::empty(), 1))
);
assert_eq!(
git_attributes::parse::ignore(br"ab").next(),
Some(("ab".into(), Mode::NO_SUB_DIR, 1))
);
}

// A trailing slash sets `MUST_BE_DIR` and is stripped from the pattern.
#[test]
fn trailing_slashes_are_marked_and_removed() {
assert_eq!(
git_attributes::parse::ignore(b"dir/").next(),
Some(("dir".into(), Mode::MUST_BE_DIR | Mode::NO_SUB_DIR, 1))
);
assert_eq!(
git_attributes::parse::ignore(b"dir///").next(),
Some(("dir//".into(), Mode::MUST_BE_DIR, 1)),
"but only the last slash is removed"
);
}

// Unescaped trailing spaces are dropped; tabs are not treated as trailing whitespace.
#[test]
fn trailing_spaces_are_ignored() {
assert_eq!(
git_attributes::parse::ignore(br"a ").next(),
Some(("a".into(), Mode::NO_SUB_DIR, 1))
);
assert_eq!(
git_attributes::parse::ignore(b"a\t\t ").next(),
Some(("a\t\t".into(), Mode::NO_SUB_DIR, 1)),
"trailing tabs are not ignored"
);
}
// Trailing spaces survive if a backslash follows or precedes them as described per case.
// NOTE(review): the number of consecutive spaces inside these literals is significant, and
// several inputs below look identical despite different messages - the whitespace may have
// been collapsed when this text was extracted; verify against the repository before relying on it.
#[test]
fn trailing_spaces_can_be_escaped_to_be_literal() {
assert_eq!(
git_attributes::parse::ignore(br"a \ ").next(),
Some(("a ".into(), Mode::NO_SUB_DIR, 1)),
"a single escape in front of the last desired space is enough"
);
assert_eq!(
git_attributes::parse::ignore(br"a b c ").next(),
Some(("a b c".into(), Mode::NO_SUB_DIR, 1)),
"spaces in the middle are fine"
);
assert_eq!(
git_attributes::parse::ignore(br"a\ \ \ ").next(),
Some(("a ".into(), Mode::NO_SUB_DIR, 1)),
"one can also escape every single one"
);
assert_eq!(
git_attributes::parse::ignore(br"a \ ").next(),
Some(("a ".into(), Mode::NO_SUB_DIR, 1)),
"or just the one in the middle, losing the last actual space"
);
assert_eq!(
git_attributes::parse::ignore(br"a \").next(),
Some(("a ".into(), Mode::NO_SUB_DIR, 1)),
"escaping nothing also works as a whitespace protection"
);
assert_eq!(
git_attributes::parse::ignore(br"a \\\ ").next(),
Some((r"a ".into(), Mode::NO_SUB_DIR, 1)),
"strange things like these work too"
);
assert_eq!(
git_attributes::parse::ignore(br"a \\ ").next(),
Some((r"a ".into(), Mode::NO_SUB_DIR, 1)),
"strange things like these work as well"
);
}
}
}
14 changes: 14 additions & 0 deletions git-attributes/tests/fixtures/ignore/various.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# ignore objects and archives, anywhere in the tree.
*.[oa]

# ignore generated html files,
*.html

# except foo.html which is maintained by hand
!foo.html

# exclude everything except directory foo/bar
/*
!/foo
/foo/*
!/foo/bar