diff --git a/Cargo.lock b/Cargo.lock index 47d8945ec5..5912a30127 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2053,9 +2053,11 @@ dependencies = [ "gix-object 0.44.0", "gix-path 0.10.11", "gix-tempfile 14.0.2", + "gix-testtools", "gix-trace 0.1.10", "gix-worktree 0.36.0", "imara-diff", + "pretty_assertions", "serde", "thiserror", ] diff --git a/gix-merge/Cargo.toml b/gix-merge/Cargo.toml index b75d4cb384..6d8da01014 100644 --- a/gix-merge/Cargo.toml +++ b/gix-merge/Cargo.toml @@ -39,6 +39,10 @@ serde = { version = "1.0.114", optional = true, default-features = false, featur document-features = { version = "0.2.0", optional = true } +[dev-dependencies] +gix-testtools = { path = "../tests/tools" } +pretty_assertions = "1.4.0" + [package.metadata.docs.rs] all-features = true features = ["document-features"] diff --git a/gix-merge/src/blob/builtin_driver.rs b/gix-merge/src/blob/builtin_driver.rs index cacef327ac..6c19d96521 100644 --- a/gix-merge/src/blob/builtin_driver.rs +++ b/gix-merge/src/blob/builtin_driver.rs @@ -23,8 +23,6 @@ impl BuiltinDriver { /// pub mod binary { - use crate::blob::Resolution; - /// What to do when having to pick a side to resolve a conflict. #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum ResolveWith { @@ -47,24 +45,33 @@ pub mod binary { Theirs, } - /// As this algorithm doesn't look at the actual data, it returns a choice solely based on logic. - /// - /// It always results in a conflict with `current` being picked unless `on_conflict` is not `None`. 
-    pub fn merge(on_conflict: Option<ResolveWith>) -> (Pick, Resolution) {
-        match on_conflict {
-            None => (Pick::Ours, Resolution::Conflict),
-            Some(ResolveWith::Ours) => (Pick::Ours, Resolution::Complete),
-            Some(ResolveWith::Theirs) => (Pick::Theirs, Resolution::Complete),
-            Some(ResolveWith::Ancestor) => (Pick::Ancestor, Resolution::Complete),
+    pub(super) mod function {
+        use crate::blob::builtin_driver::binary::{Pick, ResolveWith};
+        use crate::blob::Resolution;
+
+        /// As this algorithm doesn't look at the actual data, it returns a choice solely based on logic.
+        ///
+        /// It always results in a conflict with `current` being picked unless `on_conflict` is not `None`.
+        pub fn merge(on_conflict: Option<ResolveWith>) -> (Pick, Resolution) {
+            match on_conflict {
+                None => (Pick::Ours, Resolution::Conflict),
+                Some(resolve) => (
+                    match resolve {
+                        ResolveWith::Ours => Pick::Ours,
+                        ResolveWith::Theirs => Pick::Theirs,
+                        ResolveWith::Ancestor => Pick::Ancestor,
+                    },
+                    Resolution::Complete,
+                ),
+            }
         }
     }
 }
+pub use binary::function::merge as binary;
 
 ///
 pub mod text {
-    use crate::blob::Resolution;
-
     /// The way the built-in [text driver](crate::blob::BuiltinDriver::Text) will express
     /// merge conflicts in the resulting file.
     #[derive(Default, Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
     pub enum ConflictStyle {
@@ -114,9 +121,12 @@ pub mod text {
         ZealousDiff3,
     }
 
     /// Options for the builtin [text driver](crate::blob::BuiltinDriver::Text).
-    #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
+    #[derive(Copy, Clone, Debug, Eq, PartialEq)]
     pub struct Options {
+        /// Determine how the diff will be performed.
+        /// Defaults to [`imara_diff::Algorithm::Myers`].
+        pub diff_algorithm: imara_diff::Algorithm,
         /// How to visualize conflicts in merged files.
         pub conflict_style: ConflictStyle,
         /// The amount of markers to draw, defaults to 7, i.e. `<<<<<<<`
@@ -132,6 +142,7 @@ pub mod text {
                 conflict_style: Default::default(),
                 marker_size: 7,
                 on_conflict: None,
+                diff_algorithm: imara_diff::Algorithm::Myers,
             }
         }
     }
@@ -147,10 +158,373 @@ pub mod text {
         Union,
     }
 
-    /// Merge `current` and `other` with `ancestor` as base according to `opts`.
-    ///
-    /// Place the merged result in `out` and return the resolution.
-    pub fn merge(_out: &mut Vec<u8>, _current: &[u8], _ancestor: &[u8], _other: &[u8], _opts: Options) -> Resolution {
-        todo!("text merge");
+    pub(super) mod function {
+        use crate::blob::builtin_driver::text::{ConflictStyle, Options, ResolveWith};
+        use crate::blob::Resolution;
+        use bstr::ByteSlice;
+        use std::iter::Peekable;
+        use std::ops::Range;
+        use std::vec::IntoIter;
+
+        /// Merge `current` and `other` with `ancestor` as base according to `opts`.
+        ///
+        /// `input` is for reusing memory of lists of tokens, and `current_tokens` is memory
+        /// for storing tokens for `current`.
+        /// Place the merged result in `out` and return the resolution.
+        ///
+        /// # Important
+        ///
+        /// *The caller* is responsible for clearing `input`, otherwise tokens will accumulate.
+        /// The idea is to save time if the input is known to be very similar.
+        pub fn merge<'a>(
+            out: &mut Vec<u8>,
+            input: &mut imara_diff::intern::InternedInput<&'a [u8]>,
+            current_tokens: &mut Vec<imara_diff::intern::Token>,
+            current: &'a [u8],
+            ancestor: &'a [u8],
+            other: &'a [u8],
+            opts: Options,
+        ) -> Resolution {
+            input.update_before(tokens(ancestor));
+            input.update_after(tokens(current));
+
+            // First pass: collect what `current` changed in `ancestor`, tagged as `Side::Current`.
+            let current_hunks = imara_diff::diff(
+                opts.diff_algorithm,
+                input,
+                CollectHunks {
+                    side: Side::Current,
+                    hunks: Default::default(),
+                },
+            );
+
+            // Park the tokens of `current` in `current_tokens`, as `input.after` is about to be
+            // replaced with the tokens of `other` for the second pass.
+            std::mem::swap(&mut input.after, current_tokens);
+            input.update_after(tokens(other));
+
+            let mut hunks = imara_diff::diff(
+                opts.diff_algorithm,
+                input,
+                CollectHunks {
+                    side: Side::Other,
+                    hunks: current_hunks,
+                },
+            );
+
+            // Visit the hunks of both sides in the order in which they appear in the ancestor.
+            hunks.sort_by(|a, b| a.before.start.cmp(&b.before.start));
+            let mut hunks = hunks.into_iter().peekable();
+            let mut intersecting = Vec::new();
+            let mut ancestor_integrated_until = 0;
+            let mut resolution = Resolution::Complete;
+            while let Some(mut hunk) = hunks.next() {
+                if take_intersecting(&hunk, &mut hunks, &mut intersecting) {
+                    // Hunks of the opposite side start within `hunk`: both sides touched these lines.
+                    fill_ancestor(&hunk.before, &mut intersecting);
+                    match opts.on_conflict {
+                        None => {
+                            match opts.conflict_style {
+                                ConflictStyle::Merge | ConflictStyle::ZealousDiff3 => {
+                                    zealously_contract_hunks(
+                                        &mut hunk,
+                                        &mut intersecting,
+                                        &input,
+                                        &current_tokens,
+                                        ZealousMode::Partial,
+                                    );
+                                }
+                                ConflictStyle::Diff3 => {
+                                    zealously_contract_hunks(
+                                        &mut hunk,
+                                        &mut intersecting,
+                                        &input,
+                                        &current_tokens,
+                                        ZealousMode::Full,
+                                    );
+                                }
+                            }
+                            // Copy the unchanged ancestor lines between the previously written hunk and this one.
+                            write_tokens(
+                                &input.interner,
+                                &input.before[ancestor_integrated_until as usize..hunk.before.start as usize],
+                                out,
+                            );
+                            ancestor_integrated_until = hunk.before.end;
+                            if intersecting.is_empty() {
+                                write_hunk(&hunk, &input, &current_tokens, out);
+                            } else {
+                                resolution = Resolution::Conflict;
+                                todo!("write conflict")
+                            }
+                        }
+                        Some(resolve) => {
+                            // A resolution strategy is set: pick hunks according to it instead of flagging a conflict.
+                            let hunks_to_output = match resolve {
+                                ResolveWith::Ours => std::slice::from_ref(&hunk),
+                                ResolveWith::Theirs => &intersecting,
+                                ResolveWith::Union => {
+                                    todo!("figure out how union works, and in what order")
+                                }
+                            };
+
+                            write_tokens(
+                                &input.interner,
+                                &input.before[ancestor_integrated_until as usize..hunk.before.start as usize],
+                                out,
+                            );
+                            ancestor_integrated_until = hunk.before.end;
+                            for hunk in hunks_to_output {
+                                write_hunk(&hunk, &input, &current_tokens, out);
+                            }
+                        }
+                    }
+                } else {
+                    // No hunk of the opposite side starts within this one, so it is written as is.
+                    write_tokens(
+                        &input.interner,
+                        &input.before[ancestor_integrated_until as usize..hunk.before.start as usize],
+                        out,
+                    );
+                    ancestor_integrated_until = hunk.before.end;
+                    write_hunk(&hunk, &input, &current_tokens, out);
+                }
+            }
+
+            // Copy whatever remains of the ancestor after the last hunk - that's all of it if
+            // there was no hunk at all - so unchanged trailing lines aren't dropped.
+            write_tokens(&input.interner, &input.before[ancestor_integrated_until as usize..], out);
+
+            resolution
+        }
+
+        /// Look at all hunks in `in_out` and fill in the ancestor in the range of `ancestor_range`.
+        /// This is all based on knowing the ranges are sequences of tokens.
+        fn fill_ancestor(_ancestor_range: &Range<u32>, _in_out: &mut Vec<Hunk>) {
+            todo!()
+        }
+
+        enum ZealousMode {
+            /// Allow hunks to be partially contracted, going only from the start and the end.
+            Partial,
+            /// Require removal of entire hunks, do not just make them smaller - all or nothing.
+            Full,
+        }
+
+        /// Reduce the area of `hunk` and the hunks in `intersecting` so that only those lines that are
+        /// actually different remain.
+        /// Hunks from `intersecting` may be removed in the process from the front and back, in case they
+        /// are entirely equal to what's in `hunk`.
+        /// Use `mode` to determine how hunks may be handled.
+        fn zealously_contract_hunks(
+            _hunk: &mut Hunk,
+            _intersecting: &mut Vec<Hunk>,
+            _input: &imara_diff::intern::InternedInput<&[u8]>,
+            _current_tokens: &[imara_diff::intern::Token],
+            _mode: ZealousMode,
+        ) {
+            todo!("contract hunks")
+        }
+
+        fn write_hunk(
+            hunk: &Hunk,
+            input: &imara_diff::intern::InternedInput<&[u8]>,
+            current_tokens: &[imara_diff::intern::Token],
+            out: &mut Vec<u8>,
+        ) {
+            let tokens = match hunk.side {
+                Side::Current => current_tokens,
+                Side::Other => &input.after,
+                Side::Ancestor => &input.before,
+            };
+            write_tokens(&input.interner, &tokens[usize_range(&hunk.after)], out)
+        }
+
+        fn usize_range(range: &Range<u32>) -> Range<usize> {
+            range.start as usize..range.end as usize
+        }
+
+        fn write_tokens(
+            interner: &imara_diff::intern::Interner<&[u8]>,
+            tokens: &[imara_diff::intern::Token],
+            out: &mut Vec<u8>,
+        ) {
+            for token in tokens {
+                out.extend_from_slice(&interner[*token]);
+            }
+        }
+
+        /// Find all hunks in `iter` which aren't from the same side as `hunk` and intersect with it.
+        /// Return `true` if `out` is non-empty after the operation, indicating overlapping hunks were found.
+        fn take_intersecting(hunk: &Hunk, iter: &mut Peekable<IntoIter<Hunk>>, out: &mut Vec<Hunk>) -> bool {
+            out.clear();
+
+            // Only the hunks that directly follow `hunk` in `iter` are candidates. Consume them
+            // until the first one that is from the same side as `hunk`, or that does not start
+            // within the ancestor range covered by `hunk`; that one stays in `iter`.
+            let overlaps = |b_hunk: &Hunk| b_hunk.side != hunk.side && hunk.before.contains(&b_hunk.before.start);
+            while let Some(b_hunk) = iter.next_if(overlaps) {
+                out.push(b_hunk);
+            }
+            !out.is_empty()
+        }
+
+        fn tokens(input: &[u8]) -> imara_diff::sources::ByteLines<'_, true> {
+            imara_diff::sources::byte_lines_with_terminator(input)
+        }
+
+        #[derive(Debug, Copy, Clone, Eq, PartialEq)]
+        enum Side {
+            Current,
+            Other,
+            /// A special marker that is just used to be able to mix-in hunks that only point to the ancestor.
+            /// Only `before` matters then.
+            Ancestor,
+        }
+
+        #[derive(Debug)]
+        struct Hunk {
+            before: Range<u32>,
+            after: Range<u32>,
+            side: Side,
+        }
+
+        struct CollectHunks {
+            hunks: Vec<Hunk>,
+            side: Side,
+        }
+
+        impl imara_diff::Sink for CollectHunks {
+            type Out = Vec<Hunk>;
+
+            fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
+                self.hunks.push(Hunk {
+                    before,
+                    after,
+                    side: self.side,
+                })
+            }
+
+            fn finish(self) -> Self::Out {
+                self.hunks
+            }
+        }
+
+        /// An adapted version of the unified diff writer to get a first idea.
+        // TODO: remove this
+        #[allow(dead_code)]
+        mod unified_test {
+            use bstr::ByteVec;
+            use imara_diff::intern::{InternedInput, Interner, Token};
+            use imara_diff::Sink;
+            use std::ops::Range;
+
+            /// A [`Sink`] that creates a textual diff
+            /// in the format typically output by git or gnu-diff if the `-u` option is used
+            pub struct UnifiedDiffBuilder<'a, W>
+            where
+                W: std::io::Write,
+            {
+                before: &'a [Token],
+                after: &'a [Token],
+                interner: &'a Interner<&'a [u8]>,
+
+                pos: u32,
+                before_hunk_start: u32,
+                after_hunk_start: u32,
+                before_hunk_len: u32,
+                after_hunk_len: u32,
+
+                buffer: Vec<u8>,
+                dst: W,
+            }
+
+            impl<'a, W> UnifiedDiffBuilder<'a, W>
+            where
+                W: std::io::Write,
+            {
+                /// Create a new `UnifiedDiffBuilder` for the given `input`,
+                /// that writes its output to the provided implementation of [`std::io::Write`].
+                pub fn with_writer(input: &'a InternedInput<&'a [u8]>, writer: W) -> Self {
+                    Self {
+                        before_hunk_start: 0,
+                        after_hunk_start: 0,
+                        before_hunk_len: 0,
+                        after_hunk_len: 0,
+                        buffer: Vec::with_capacity(8),
+                        dst: writer,
+                        interner: &input.interner,
+                        before: &input.before,
+                        after: &input.after,
+                        pos: 0,
+                    }
+                }
+
+                fn print_tokens(&mut self, tokens: &[Token], prefix: char) {
+                    for &token in tokens {
+                        self.buffer.push_char(prefix);
+                        self.buffer.extend_from_slice(self.interner[token]);
+                    }
+                }
+
+                fn flush(&mut self) {
+                    if self.before_hunk_len == 0 && self.after_hunk_len == 0 {
+                        return;
+                    }
+
+                    let end = (self.pos + 3).min(self.before.len() as u32);
+                    self.update_pos(end, end);
+
+                    writeln!(
+                        &mut self.dst,
+                        "@@ -{},{} +{},{} @@",
+                        self.before_hunk_start + 1,
+                        self.before_hunk_len,
+                        self.after_hunk_start + 1,
+                        self.after_hunk_len,
+                    )
+                    .unwrap();
+                    self.dst.write_all(&self.buffer).unwrap();
+                    self.buffer.clear();
+                    self.before_hunk_len = 0;
+                    self.after_hunk_len = 0
+                }
+
+                fn update_pos(&mut self, print_to: u32, move_to: u32) {
+                    self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' ');
+                    let len = print_to - self.pos;
+                    self.pos = move_to;
+                    self.before_hunk_len += len;
+                    self.after_hunk_len += len;
+                }
+            }
+
+            impl<W> Sink for UnifiedDiffBuilder<'_, W>
+            where
+                W: std::io::Write,
+            {
+                type Out = W;
+
+                fn process_change(&mut self, before: Range<u32>, after: Range<u32>) {
+                    if before.start - self.pos > 6 {
+                        self.flush();
+                        self.pos = before.start - 3;
+                        self.before_hunk_start = self.pos;
+                        self.after_hunk_start = after.start - 3;
+                    }
+                    self.update_pos(before.start, before.end);
+                    self.before_hunk_len += before.end - before.start;
+                    self.after_hunk_len += after.end - after.start;
+                    self.print_tokens(&self.before[before.start as usize..before.end as usize], '-');
+                    self.print_tokens(&self.after[after.start as usize..after.end as usize], '+');
+                }
+
+                fn finish(mut self) -> Self::Out {
+                    self.flush();
+                    self.dst
+                }
+            }
+        }
     }
 }
+pub use text::function::merge as text;
diff --git a/gix-merge/src/blob/platform.rs b/gix-merge/src/blob/platform.rs
index 497b9bf887..6b6175ee40 100644
--- a/gix-merge/src/blob/platform.rs
+++ b/gix-merge/src/blob/platform.rs
@@ -137,7 +137,7 @@ pub mod merge {
         pub other: ResourceRef<'parent>,
     }
 
-    #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
+    #[derive(Copy, Clone, Debug, Eq, PartialEq)]
     pub struct Options {
         /// If `true`, the resources being merged are contained in a virtual ancestor,
         /// which is the case when merge bases are merged into one.
diff --git a/gix-merge/tests/fixtures/generated-archives/text-baseline.tar b/gix-merge/tests/fixtures/generated-archives/text-baseline.tar
new file mode 100644
index 0000000000..13e9874372
Binary files /dev/null and b/gix-merge/tests/fixtures/generated-archives/text-baseline.tar differ
diff --git a/gix-merge/tests/fixtures/text-baseline.sh b/gix-merge/tests/fixtures/text-baseline.sh
new file mode 100644
index 0000000000..ffc0edf0eb
--- /dev/null
+++ b/gix-merge/tests/fixtures/text-baseline.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+set -eu -o pipefail
+
+
+function baseline() {
+  local ours=$DIR/${1:?1: our file}.blob;
+  local base=$DIR/${2:?2: base file}.blob;
+  local theirs=$DIR/${3:?3: their file}.blob;
+  local output=$DIR/${4:?4: the name of the output file}.merged;
+
+  shift 4
+  git merge-file --stdout "$@" "$ours" "$base" "$theirs" > "$output" || true
+
+  echo "$ours" "$base" "$theirs" "$output" "$@" >> baseline.cases
+}
+
+mkdir simple
+(cd simple
+  echo -e "line1-changed-by-both\nline2-to-be-changed-in-incoming" > ours.blob
+  echo -e "line1-to-be-changed-by-both\nline2-to-be-changed-in-incoming" > base.blob
+  echo -e "line1-changed-by-both\nline2-changed" > theirs.blob
+)
+
+# one big change includes multiple smaller ones
+mkdir multi-change
+(cd multi-change
+  cat <<EOF > base.blob
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+EOF
+
+  cat <<EOF > ours.blob
+0
+1
+X
+X
+4
+5
+Y
+Y
+8
+Z
+EOF
+
+  cat <<EOF > theirs.blob
+T
+T
+T
+T
+T
+T
+T
+T
+T
+T
+EOF
+)
+
+for dir in simple multi-change; do
+  DIR=$dir
+  baseline ours base theirs merge
+  baseline ours base theirs diff3 --diff3
+  baseline ours base theirs zdiff3 --zdiff3
+  baseline ours base theirs merge-ours --ours
+  baseline ours base theirs merge-theirs --theirs
+  baseline ours base theirs merge-union --union
+done
\ No newline at end of file
diff --git a/gix-merge/tests/merge/blob/builtin_driver.rs b/gix-merge/tests/merge/blob/builtin_driver.rs
new file mode 100644
index 0000000000..e9ac1bd709
--- /dev/null
+++ b/gix-merge/tests/merge/blob/builtin_driver.rs
@@ -0,0 +1,142 @@
+use gix_merge::blob::builtin_driver::binary::{Pick, ResolveWith};
+use gix_merge::blob::{builtin_driver, Resolution};
+
+#[test]
+fn binary() {
+    assert_eq!(
+        builtin_driver::binary(None),
+        (Pick::Ours, Resolution::Conflict),
+        "by default it picks ours and marks it as conflict"
+    );
+    assert_eq!(
+        builtin_driver::binary(Some(ResolveWith::Ancestor)),
+        (Pick::Ancestor, Resolution::Complete),
+        "Otherwise we can pick anything and it will mark it as complete"
+    );
+    assert_eq!(
+        builtin_driver::binary(Some(ResolveWith::Ours)),
+        (Pick::Ours, Resolution::Complete)
+    );
+    assert_eq!(
+        builtin_driver::binary(Some(ResolveWith::Theirs)),
+        (Pick::Theirs, Resolution::Complete)
+    );
+}
+
+mod text {
+    use bstr::ByteSlice;
+    use gix_merge::blob::Resolution;
+    use pretty_assertions::assert_str_eq;
+
+    #[test]
+    fn run_baseline() -> crate::Result {
+        let root = gix_testtools::scripted_fixture_read_only("text-baseline.sh")?;
+        let cases = std::fs::read_to_string(root.join("baseline.cases"))?;
+        let mut out = Vec::new();
+        let mut tokens = Vec::new();
+        // The fixture script writes the six `simple` cases first, and these are skipped here.
+        for case in baseline::Expectations::new(&root, &cases).skip(6) {
+            // The text driver only appends to `out`, so drop the output of the previous case first.
+            out.clear();
+            let mut input = imara_diff::intern::InternedInput::default();
+            let actual = gix_merge::blob::builtin_driver::text(
+                &mut out,
+                &mut input,
+                &mut tokens,
+                &case.ours,
+                &case.base,
+                &case.theirs,
+                case.options,
+            );
+            let expected_resolution = if case.expected.contains_str("<<<<<<<") {
+                Resolution::Conflict
+            } else {
+                Resolution::Complete
+            };
+            assert_eq!(actual, expected_resolution, "{}: resolution mismatch", case.name);
+            assert_str_eq!(
+                out.as_bstr().to_str_lossy(),
+                case.expected.to_str_lossy(),
+                "{}: output mismatch",
+                case.name
+            );
+        }
+        Ok(())
+    }
+
+    mod baseline {
+        use bstr::BString;
+        use gix_merge::blob::builtin_driver::text::{ConflictStyle, ResolveWith};
+        use std::path::Path;
+
+        /// One line of `baseline.cases`, with the content of all files it names and its parsed options.
+        #[derive(Debug)]
+        pub struct Expectation {
+            pub ours: BString,
+            pub theirs: BString,
+            pub base: BString,
+            pub name: BString,
+            pub expected: BString,
+            pub options: gix_merge::blob::builtin_driver::text::Options,
+        }
+
+        /// An iterator turning each line of the cases file into an [`Expectation`].
+        pub struct Expectations<'a> {
+            root: &'a Path,
+            lines: std::str::Lines<'a>,
+        }
+
+        impl<'a> Expectations<'a> {
+            pub fn new(root: &'a Path, cases: &'a str) -> Self {
+                Expectations {
+                    root,
+                    lines: cases.lines(),
+                }
+            }
+        }
+
+        impl Iterator for Expectations<'_> {
+            type Item = Expectation;
+
+            fn next(&mut self) -> Option<Self::Item> {
+                let line = self.lines.next()?;
+                let mut words = line.split(' ');
+                let (Some(ours), Some(base), Some(theirs), Some(output)) =
+                    (words.next(), words.next(), words.next(), words.next())
+                else {
+                    panic!("need at least the input and output")
+                };
+
+                let read = |rela_path: &str| read_blob(self.root, rela_path);
+
+                let mut options = gix_merge::blob::builtin_driver::text::Options::default();
+                for arg in words {
+                    match arg {
+                        "--diff3" => options.conflict_style = ConflictStyle::Diff3,
+                        "--zdiff3" => options.conflict_style = ConflictStyle::ZealousDiff3,
+                        "--ours" => options.on_conflict = Some(ResolveWith::Ours),
+                        "--theirs" => options.on_conflict = Some(ResolveWith::Theirs),
+                        "--union" => options.on_conflict = Some(ResolveWith::Union),
+                        _ => panic!("Unknown argument to parse into options: '{arg}'"),
+                    }
+                }
+
+                Some(Expectation {
+                    ours: read(ours),
+                    theirs: read(theirs),
+                    base: read(base),
+                    expected: read(output),
+                    name: output.into(),
+                    options,
+                })
+            }
+        }
+
+        /// Read the file at `rela_path`, which is relative to `root`, and panic if that fails.
+        fn read_blob(root: &Path, rela_path: &str) -> BString {
+            std::fs::read(root.join(rela_path))
+                .unwrap_or_else(|_| panic!("Failed to read '{rela_path}' in '{}'", root.display()))
+                .into()
+        }
+    }
+}
diff --git a/gix-merge/tests/merge/blob/mod.rs b/gix-merge/tests/merge/blob/mod.rs
new file mode 100644
index 0000000000..f781f63e48
--- /dev/null
+++ b/gix-merge/tests/merge/blob/mod.rs
@@ -0,0 +1 @@
+mod builtin_driver;
diff --git a/gix-merge/tests/merge/main.rs b/gix-merge/tests/merge/main.rs
new file mode 100644
index 0000000000..05375cb227
--- /dev/null
+++ b/gix-merge/tests/merge/main.rs
@@ -0,0 +1,4 @@
+#[cfg(feature = "blob")]
+mod blob;
+
+pub use gix_testtools::Result;