Skip to content

Commit d3f5f27

Browse files
committed
feat!: add support for not tracking single empty files.
Empty files are equivalent to having no content, which also means such files have no identity to speak of. This definitely helps to avoid false positives with `.gitignore` files, for instance, which can be empty to tell Git to track a directory. On top of that, Git has a heuristic to do rename tracking of small files by similarity, as the similarity may be off if files just have a couple of lines to speak of. Note that empty files that are renamed as part of a whole directory will still be tracked as renames.
1 parent da585db commit d3f5f27

File tree

5 files changed

+52
-8
lines changed

5 files changed

+52
-8
lines changed

gix-diff/src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ pub struct Rewrites {
3737
/// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not
3838
/// run the fuzzy version of identity tests at all. That way results are never partial.
3939
pub limit: usize,
40+
41+
/// If `true`, empty blobs will be tracked. If `false`, they do not participate in rename tracking.
42+
///
43+
/// Leaving this off usually leads to better results as empty files don't have a unique-enough identity.
44+
pub track_empty: bool,
4045
}
4146

4247
/// Contains a [Tracker](rewrites::Tracker) to detect rewrites.

gix-diff/src/rewrites/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ impl Default for Rewrites {
7070
copies: None,
7171
percentage: Some(0.5),
7272
limit: 1000,
73+
track_empty: false,
7374
}
7475
}
7576
}

gix-diff/src/rewrites/tracker.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -286,12 +286,10 @@ impl<T: Change> Tracker<T> {
286286
CopySource::FromSetOfModifiedFiles => {}
287287
CopySource::FromSetOfModifiedFilesAndAllSources => {
288288
push_source_tree(&mut |change, location| {
289-
assert!(
290-
self.try_push_change(change, location).is_none(),
291-
"we must accept every change"
292-
);
293-
// make sure these aren't viable to be emitted anymore.
294-
self.items.last_mut().expect("just pushed").emitted = true;
289+
if self.try_push_change(change, location).is_none() {
290+
// make sure these aren't viable to be emitted anymore.
291+
self.items.last_mut().expect("just pushed").emitted = true;
292+
}
295293
})
296294
.map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
297295
self.items.sort_by(by_id_and_location);
@@ -404,7 +402,19 @@ impl<T: Change> Tracker<T> {
404402
while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
405403
(!item.emitted
406404
&& matches!(item.change.kind(), ChangeKind::Addition)
407-
&& filter.map_or(true, |f| f(&item.change)))
405+
&& filter.map_or_else(
406+
|| {
407+
self.rewrites.track_empty
408+
// We always want to keep track of entries that are involved in a directory rename.
409+
// Note that this may still match them up arbitrarily if empty, but empty is empty.
410+
|| matches!(item.change.relation(), Some(Relation::ChildOfParent(_)))
411+
|| {
412+
let id = item.change.id();
413+
id != gix_hash::ObjectId::empty_blob(id.kind())
414+
}
415+
},
416+
|f| f(&item.change),
417+
))
408418
.then_some((idx, item))
409419
}) {
410420
dest_idx += dest_ofs;

gix-diff/tests/diff/rewrites/tracker.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ fn rename_by_id() -> crate::Result {
2727
copies: None,
2828
percentage: None,
2929
limit,
30+
track_empty: false,
3031
};
3132
let mut track = util::new_tracker(rewrites);
3233
assert!(
@@ -80,6 +81,7 @@ fn copy_by_similarity_reports_limit_if_encountered() -> crate::Result {
8081
}),
8182
percentage: None,
8283
limit: 1,
84+
track_empty: false,
8385
};
8486
let mut track = util::new_tracker(rewrites);
8587
let odb = util::add_retained_blobs(
@@ -132,6 +134,7 @@ fn copy_by_id() -> crate::Result {
132134
}),
133135
percentage: None,
134136
limit,
137+
track_empty: false,
135138
};
136139
let mut track = util::new_tracker(rewrites);
137140
let odb = util::add_retained_blobs(
@@ -206,6 +209,7 @@ fn copy_by_id_search_in_all_sources() -> crate::Result {
206209
}),
207210
percentage: None,
208211
limit,
212+
track_empty: false,
209213
};
210214
let mut track = util::new_tracker(rewrites);
211215
let odb = util::add_retained_blobs(
@@ -284,6 +288,7 @@ fn copy_by_50_percent_similarity() -> crate::Result {
284288
}),
285289
percentage: None,
286290
limit: 0,
291+
track_empty: false,
287292
};
288293
let mut track = util::new_tracker(rewrites);
289294
let odb = util::add_retained_blobs(
@@ -363,6 +368,7 @@ fn copy_by_id_in_additions_only() -> crate::Result {
363368
}),
364369
percentage: None,
365370
limit: 0,
371+
track_empty: false,
366372
};
367373
let mut track = util::new_tracker(rewrites);
368374
let odb = util::add_retained_blobs(
@@ -413,6 +419,7 @@ fn rename_by_similarity_reports_limit_if_encountered() -> crate::Result {
413419
copies: None,
414420
percentage: Some(0.5),
415421
limit: 1,
422+
track_empty: false,
416423
};
417424
let mut track = util::new_tracker(rewrites);
418425
let odb = util::add_retained_blobs(
@@ -458,6 +465,7 @@ fn rename_by_50_percent_similarity() -> crate::Result {
458465
copies: None,
459466
percentage: Some(0.5),
460467
limit: 0,
468+
track_empty: false,
461469
};
462470
let mut track = util::new_tracker(rewrites);
463471
let odb = util::add_retained_blobs(
@@ -547,6 +555,7 @@ fn directory_renames_by_id_can_fail_gracefully() -> crate::Result {
547555
copies: None,
548556
percentage: Some(0.5),
549557
limit: 0,
558+
track_empty: false,
550559
};
551560
let mut track = util::new_tracker(rename_by_similarity);
552561
let tree_dst_id = 1;
@@ -638,6 +647,7 @@ fn simple_directory_rename_by_id() -> crate::Result {
638647
copies: None,
639648
percentage: None,
640649
limit: 0,
650+
track_empty: false,
641651
};
642652
let mut track = util::new_tracker(renames_by_identity);
643653
let tree_dst_id = 1;

gix-diff/tests/diff/tree_with_rewrites.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,12 +192,13 @@ fn changes_against_modified_tree_with_filename_tracking() -> crate::Result {
192192

193193
#[test]
194194
fn renames_by_identity() -> crate::Result {
195-
for (from, to, expected, assert_msg) in [
195+
for (from, to, expected, assert_msg, track_empty) in [
196196
(
197197
"c3-modification",
198198
"r1-identity",
199199
vec![BStr::new("a"), "dir/a-moved".into()],
200200
"one rename and nothing else",
201+
false,
201202
),
202203
(
203204
"c4 - add identical files",
@@ -211,24 +212,35 @@ fn renames_by_identity() -> crate::Result {
211212
"z".into(),
212213
],
213214
"multiple possible sources decide by ordering everything lexicographically",
215+
true,
216+
),
217+
(
218+
"c4 - add identical files",
219+
"r2-ambiguous",
220+
vec![],
221+
"nothing is tracked with `track_empty = false`",
222+
false,
214223
),
215224
(
216225
"c5 - add links",
217226
"r4-symlinks",
218227
vec!["link-1".into(), "renamed-link-1".into()],
219228
"symlinks are only tracked by identity",
229+
false,
220230
),
221231
(
222232
"r1-identity",
223233
"c4 - add identical files",
224234
vec![],
225235
"not having any renames is OK as well",
236+
false,
226237
),
227238
(
228239
"tc1-identity",
229240
"tc1-identity",
230241
vec![],
231242
"copy tracking is off by default",
243+
false,
232244
),
233245
] {
234246
for percentage in [None, Some(0.5)] {
@@ -239,6 +251,7 @@ fn renames_by_identity() -> crate::Result {
239251
location: Some(Location::Path),
240252
rewrites: Some(Rewrites {
241253
percentage,
254+
track_empty,
242255
..Default::default()
243256
}),
244257
},
@@ -704,6 +717,7 @@ fn copies_in_entire_tree_by_similarity_with_limit() -> crate::Result {
704717
..Default::default()
705718
}),
706719
limit: 2, // similarity checks can't be made that way
720+
track_empty: false,
707721
..Default::default()
708722
}),
709723
},
@@ -833,6 +847,7 @@ fn realistic_renames_by_identity() -> crate::Result {
833847
rewrites: Some(Rewrites {
834848
copies: Some(Copies::default()),
835849
limit: 1,
850+
track_empty: true,
836851
..Default::default()
837852
}),
838853
},
@@ -1324,6 +1339,7 @@ fn realistic_renames_by_identity_3() -> crate::Result {
13241339
rewrites: Some(Rewrites {
13251340
copies: Some(Copies::default()),
13261341
limit: 1,
1342+
track_empty: true,
13271343
..Default::default()
13281344
}),
13291345
},
@@ -1402,6 +1418,7 @@ fn realistic_renames_2() -> crate::Result {
14021418
rewrites: Some(Rewrites {
14031419
copies: Some(Copies::default()),
14041420
limit: 1,
1421+
track_empty: false,
14051422
..Default::default()
14061423
}),
14071424
},
@@ -1665,6 +1682,7 @@ fn realistic_renames_3_without_identity() -> crate::Result {
16651682
copies: None,
16661683
percentage: None,
16671684
limit: 0,
1685+
track_empty: false,
16681686
}),
16691687
},
16701688
)?;

0 commit comments

Comments
 (0)