diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index c3f0cd92078..e251aa5d33e 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ -42,7 +42,7 @@ gix-trace = { version = "^0.1.12", path = "../gix-trace", optional = true } gix-traverse = { version = "^0.46.2", path = "../gix-traverse", optional = true } thiserror = "2.0.0" -imara-diff = { version = "0.1.7", optional = true } +imara-diff = { version = "0.1.8", optional = true } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } getrandom = { version = "0.2.8", optional = true, default-features = false, features = ["js"] } bstr = { version = "1.12.0", default-features = false } diff --git a/gix-diff/src/blob/git_diff.rs b/gix-diff/src/blob/git_diff.rs new file mode 100644 index 00000000000..7cd148b9735 --- /dev/null +++ b/gix-diff/src/blob/git_diff.rs @@ -0,0 +1,326 @@ +//! Facilities to produce git-formatted diffs. + +use crate::blob::GitDiff; +use bstr::ByteSlice; +use imara_diff::intern::{InternedInput, Interner, Token}; +use imara_diff::Sink; +use std::cmp::Ordering; +use std::ops::Range; + +// Explanation for the following numbers can be found here: +// https://github.com/git/git/blob/324fbaab88126196bd42e7fa383ee94e165d61b5/xdiff/xdiffi.c#L535 +const MAX_INDENT: u8 = 200; +const MAX_BLANKS: i16 = 20; +const INDENT_WEIGHT: i16 = 60; +const INDENT_HEURISTIC_MAX_SLIDING: usize = 100; + +const START_OF_FILE_PENALTY: i16 = 1; +const END_OF_FILE_PENALTY: i16 = 21; +const TOTAL_BLANK_WEIGHT: i16 = -30; +const POST_BLANK_WEIGHT: i16 = 6; +const RELATIVE_INDENT_PENALTY: i16 = -4; +const RELATIVE_INDENT_WITH_BLANK_PENALTY: i16 = 10; +const RELATIVE_OUTDENT_PENALTY: i16 = 24; +const RELATIVE_OUTDENT_WITH_BLANK_PENALTY: i16 = 17; +const RELATIVE_DEDENT_PENALTY: i16 = 23; +const RELATIVE_DEDENT_WITH_BLANK_PENALTY: i16 = 17; + +pub(super) mod types { + use crate::blob::git_diff::ChangeGroup; + + /// A [`Sink`](imara_diff::Sink) that creates a diff like git would. + /// + /// See the [diff slider repository](https://github.com/mhagger/diff-slider-tools) for more information. + pub struct GitDiff<'a, T> + where + T: AsRef<[u8]>, + { + pub(crate) after: &'a [imara_diff::intern::Token], + pub(crate) interner: &'a imara_diff::intern::Interner, + pub(crate) changes: Vec, + } +} + +/// An enum indicating the kind of change that occurred. +#[derive(PartialEq, Debug)] +pub enum ChangeKind { + /// Indicates that a change introduced new lines. + Added, + /// Indicates that a change removed lines before the starting line of the change. + RemovedAbove, + /// Indicates that a change removed lines after the ending line of the change. + RemovedBelow, + /// Indicates that the change modified lines. + Modified, +} + +#[derive(PartialEq)] +struct Score { + effective_indent: i16, + penalty: i16, +} + +impl PartialOrd for Score { + // A score is considered "Greater" if it is equal or less than 0 + fn partial_cmp(&self, other: &Self) -> Option { + let indent_penalty = match self.effective_indent.cmp(&other.effective_indent) { + Ordering::Greater => INDENT_WEIGHT, + Ordering::Less => -INDENT_WEIGHT, + Ordering::Equal => 0, + }; + + Some((indent_penalty + (self.penalty - other.penalty)).cmp(&0).reverse()) + } +} + +/// A [`ChangeGroup`] represents a block of changed lines. +#[derive(PartialEq, Debug)] +pub struct ChangeGroup { + /// Range indicating the lines of the previous block. + /// To actually see what the previous block looked like, you need to combine this range with + /// the [`InternedInput`]. + pub before: Range, + /// Range indicating the lines of the new block + /// To actually see how the current block looks like, you need to combine this range with + /// the [`InternedInput`]. + pub after: Range, + /// Further specify what kind of change is denoted by the ranges above. + pub change_kind: ChangeKind, +} + +impl ChangeGroup { + /// Return [before](Self::before) and [after](Self::after) as `u32` ranges for use in [Sink::process_change()]. + /// + /// This is useful for creating [unified diffs](crate::blob::UnifiedDiff), for example. + pub fn as_u32_ranges(&self) -> (Range, Range) { + ( + self.before.start as u32..self.before.end as u32, + self.after.start as u32..self.after.end as u32, + ) + } +} + +// Calculate the indentation of a single line as number of tabs. +fn get_indent(s: &[u8]) -> Option { + let mut indent = 0; + + for char in s.bytes() { + if !char.is_ascii_whitespace() { + return Some(indent); + } else if char == b' ' { + indent += 1; + } else if char == b'\t' { + indent += 8 - indent % 8; + } + + if indent >= MAX_INDENT { + return Some(MAX_INDENT); + } + } + + None +} + +fn measure_and_score_change>(lines: &[Token], split: usize, interner: &Interner, score: &mut Score) { + // Gather information about the surroundings of the change + let end_of_file = split >= lines.len(); + let mut indent: Option = if split >= lines.len() { + None + } else { + get_indent(interner[lines[split]].as_ref()) + }; + let mut pre_blank = 0; + let mut pre_indent: Option = None; + let mut post_blank = 0; + let mut post_indent: Option = None; + + for line in (0..=split.saturating_sub(1)).rev() { + pre_indent = get_indent(interner[lines[line]].as_ref()); + if pre_indent.is_none() { + pre_blank += 1; + if pre_blank == MAX_BLANKS { + pre_indent = Some(0); + break; + } + } + } + for line in split + 1..lines.len() { + post_indent = get_indent(interner[lines[line]].as_ref()); + if post_indent.is_none() { + post_blank += 1; + if post_blank == MAX_BLANKS { + post_indent = Some(0); + break; + } + } + } + + // Calculate score of the currently applied split + post_blank = if indent.is_none() { 1 + post_blank } else { 0 }; + let total_blank = pre_blank + post_blank; + if indent.is_none() { + indent = post_indent; + } + let any_blanks = total_blank != 0; + + if pre_indent.is_none() && pre_blank == 0 { + score.penalty += START_OF_FILE_PENALTY; + } + + if end_of_file { + score.penalty += END_OF_FILE_PENALTY; + } + + score.penalty += TOTAL_BLANK_WEIGHT * total_blank; + score.penalty += POST_BLANK_WEIGHT * post_blank; + + score.effective_indent += if let Some(indent) = indent { indent as i16 } else { -1 }; + + if indent.is_none() || pre_indent.is_none() || indent == pre_indent { + } else if indent > pre_indent { + score.penalty += if any_blanks { + RELATIVE_INDENT_WITH_BLANK_PENALTY + } else { + RELATIVE_INDENT_PENALTY + }; + } else if post_indent.is_some() && post_indent > indent { + score.penalty += if any_blanks { + RELATIVE_OUTDENT_WITH_BLANK_PENALTY + } else { + RELATIVE_OUTDENT_PENALTY + }; + } else { + score.penalty += if any_blanks { + RELATIVE_DEDENT_WITH_BLANK_PENALTY + } else { + RELATIVE_DEDENT_PENALTY + }; + } +} + +impl<'a, T> GitDiff<'a, T> +where + T: AsRef<[u8]>, +{ + /// Create a new instance of [`GitDiff`] that can then be passed to [`imara_diff::diff`] + /// and generate a more human-readable diff. + pub fn new(input: &'a InternedInput) -> Self { + Self { + after: &input.after, + interner: &input.interner, + changes: Vec::new(), + } + } +} + +impl Sink for GitDiff<'_, T> +where + T: AsRef<[u8]>, +{ + type Out = Vec; + + fn process_change(&mut self, before: Range, after: Range) { + let change_kind = match (before.is_empty(), after.is_empty()) { + (true, false) => ChangeKind::Added, + (false, true) => { + if after.start == 0 { + ChangeKind::RemovedAbove + } else { + ChangeKind::RemovedBelow + } + } + _ => ChangeKind::Modified, + }; + self.changes.push(ChangeGroup { + before: before.start as usize..before.end as usize, + after: after.start as usize..after.end as usize, + change_kind, + }); + } + + fn finish(mut self) -> Self::Out { + if self.changes.is_empty() { + return self.changes; + } + + let mut shift: usize; + for change in &mut self.changes { + // Skip one-liner changes + if change.after.is_empty() { + continue; + } + + // Move this change up by one line if the line before the change and the last line in + // the change are equal + loop { + if change.after.start > 0 && self.after[change.after.start - 1] == self.after[change.after.end - 1] { + change.after.start -= 1; + change.after.end -= 1; + } else { + break; + } + } + + shift = change.after.end; + + // Move this change down by one line if the first line in the change the line after the + // change are equal + loop { + if change.after.end < self.after.len() && self.after[change.after.start] == self.after[change.after.end] + { + change.after.start += 1; + change.after.end += 1; + } else { + break; + } + } + + let mut best_shift: Option = None; + let mut best_score = Score { + effective_indent: 0, + penalty: 0, + }; + + if change.after.end.saturating_sub(change.after.len()) > shift { + shift = change.after.end - change.after.len(); + } + + if change.after.end.saturating_sub(INDENT_HEURISTIC_MAX_SLIDING) > shift { + shift = change.after.end - INDENT_HEURISTIC_MAX_SLIDING; + } + + while shift <= change.after.end { + let mut score = Score { + effective_indent: 0, + penalty: 0, + }; + + measure_and_score_change(self.after, shift, self.interner, &mut score); + measure_and_score_change(self.after, shift - change.after.len(), self.interner, &mut score); + + if best_shift.is_none() || score > best_score { + best_score = score; + best_shift = Some(shift); + } + shift += 1; + } + + if let Some(best_shift) = best_shift { + while change.after.end > best_shift { + loop { + if change.after.start > 0 + && self.after[change.after.start - 1] == self.after[change.after.end - 1] + { + change.after.start -= 1; + change.after.end -= 1; + } else { + break; + } + } + } + } + } + + self.changes + } +} diff --git a/gix-diff/src/blob/mod.rs b/gix-diff/src/blob/mod.rs index 541e978d752..8adf1aaeffa 100644 --- a/gix-diff/src/blob/mod.rs +++ b/gix-diff/src/blob/mod.rs @@ -14,6 +14,9 @@ pub mod platform; pub mod unified_diff; pub use unified_diff::_impl::UnifiedDiff; +pub mod git_diff; +pub use git_diff::types::GitDiff; + /// Information about the diff performed to detect similarity. #[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)] pub struct DiffLineStats { diff --git a/gix-diff/tests/diff/blob/git_diff.rs b/gix-diff/tests/diff/blob/git_diff.rs new file mode 100644 index 00000000000..54199344981 --- /dev/null +++ b/gix-diff/tests/diff/blob/git_diff.rs @@ -0,0 +1,164 @@ +use gix_diff::blob::intern::InternedInput; +use gix_diff::blob::unified_diff::{ContextSize, NewlineSeparator}; +use gix_diff::blob::{ + git_diff::{ChangeGroup, ChangeKind}, + Algorithm, GitDiff, Sink, UnifiedDiff, +}; +use std::hash::Hash; + +#[test] +fn basic() -> crate::Result { + let before = r#"struct SomeStruct { + field1: f64, + field2: f64, +} + +fn main() { + // Some comment + let c = SomeStruct { field1: 10.0, field2: 10.0 }; + + println!( + "Print field1 from SomeStruct {}", + get_field1(&c) + ); +} + +fn get_field1(c: &SomeStruct) -> f64 { + c.field1 +} +"#; + + let after = r#"/// This is a struct +struct SomeStruct { + field1: f64, + field2: f64, +} + +fn main() { + let c = SomeStruct { field1: 10.0, field2: 10.0 }; + + println!( + "Print field1 and field2 from SomeStruct {} {}", + get_field1(&c), get_field2(&c) + ); + println!("Print another line"); +} + +fn get_field1(c: &SomeStruct) -> f64 { + c.field1 +} + +fn get_field2(c: &SomeStruct) -> f64 { + c.field2 +} +"#; + use crate::blob::git_diff::ChangeKind; + + let expected = &[ + ChangeGroup { + before: 0..0, + after: 0..1, + change_kind: ChangeKind::Added, + }, + ChangeGroup { + before: 6..7, + after: 7..7, + change_kind: ChangeKind::RemovedBelow, + }, + ChangeGroup { + before: 10..12, + after: 10..12, + change_kind: ChangeKind::Modified, + }, + ChangeGroup { + before: 13..13, + after: 13..14, + change_kind: ChangeKind::Added, + }, + ChangeGroup { + before: 17..17, + after: 19..23, + change_kind: ChangeKind::Added, + }, + ]; + + let input = InternedInput::new(before, after); + let actual = gix_diff::blob::diff(Algorithm::Histogram, &input, GitDiff::new(&input)); + assert_eq!(actual, expected); + insta::assert_snapshot!(uni_diff_string(&input, actual), @r#" + @@ -1,1 +1,2 @@ + +/// This is a struct + struct SomeStruct { + @@ -6,3 +7,2 @@ + fn main() { + - // Some comment + let c = SomeStruct { field1: 10.0, field2: 10.0 }; + @@ -10,5 +10,6 @@ + println!( + - "Print field1 from SomeStruct {}", + - get_field1(&c) + + "Print field1 and field2 from SomeStruct {} {}", + + get_field1(&c), get_field2(&c) + ); + + println!("Print another line"); + } + @@ -17,2 +19,6 @@ + c.field1 + + + +fn get_field2(c: &SomeStruct) -> f64 { + + c.field2 + +} + } + "#); + + let standard_slider = gix_diff::blob::diff(Algorithm::Histogram, &input, uni_diff(&input))?; + insta::assert_snapshot!(standard_slider, @r#" + @@ -1,1 +1,2 @@ + +/// This is a struct + struct SomeStruct { + @@ -6,3 +7,2 @@ + fn main() { + - // Some comment + let c = SomeStruct { field1: 10.0, field2: 10.0 }; + @@ -10,5 +10,6 @@ + println!( + - "Print field1 from SomeStruct {}", + - get_field1(&c) + + "Print field1 and field2 from SomeStruct {} {}", + + get_field1(&c), get_field2(&c) + ); + + println!("Print another line"); + } + @@ -17,2 +18,6 @@ + c.field1 + +} + + + +fn get_field2(c: &SomeStruct) -> f64 { + + c.field2 + } + "#); + + let input = InternedInput::new(before.as_bytes(), after.as_bytes()); + let actual = gix_diff::blob::diff(Algorithm::Histogram, &input, GitDiff::new(&input)); + assert_eq!(actual, expected); + + Ok(()) +} + +fn uni_diff>(input: &InternedInput) -> UnifiedDiff<'_, T, String> { + UnifiedDiff::new( + input, + String::default(), + NewlineSeparator::AfterHeaderAndLine("\n"), + ContextSize::symmetrical(1), + ) +} + +fn uni_diff_string>(input: &InternedInput, changes: Vec) -> String { + let mut uni = uni_diff(input); + for change in changes { + let (before, after) = change.as_u32_ranges(); + uni.process_change(before, after); + } + uni.finish().expect("in-memory is infallible") +} diff --git a/gix-diff/tests/diff/blob/mod.rs b/gix-diff/tests/diff/blob/mod.rs index 1959c4e6fdb..49f1345a1de 100644 --- a/gix-diff/tests/diff/blob/mod.rs +++ b/gix-diff/tests/diff/blob/mod.rs @@ -1,3 +1,4 @@ +mod git_diff; pub(crate) mod pipeline; mod platform; mod unified_diff; diff --git a/gix-merge/Cargo.toml b/gix-merge/Cargo.toml index 23976b05169..2a0faaa6431 100644 --- a/gix-merge/Cargo.toml +++ b/gix-merge/Cargo.toml @@ -35,7 +35,7 @@ gix-diff = { version = "^0.52.1", path = "../gix-diff", default-features = false gix-index = { version = "^0.40.1", path = "../gix-index" } thiserror = "2.0.0" -imara-diff = { version = "0.1.7" } +imara-diff = { version = "0.1.8" } bstr = { version = "1.12.0", default-features = false } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } diff --git a/gix-merge/fuzz/Cargo.toml b/gix-merge/fuzz/Cargo.toml index 881b91cb8fd..c39a2b91fa2 100644 --- a/gix-merge/fuzz/Cargo.toml +++ b/gix-merge/fuzz/Cargo.toml @@ -12,7 +12,7 @@ cargo-fuzz = true anyhow = "1.0.76" libfuzzer-sys = "0.4" arbitrary = { version = "1.3.2", features = ["derive"] } -imara-diff = { version = "0.1.7" } +imara-diff = { version = "0.1.8" } gix-merge = { path = ".." } # Prevent this from interfering with workspaces