diff --git a/Cargo.lock b/Cargo.lock index fbc4bc1a3a4..47d8945ec56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2043,6 +2043,22 @@ dependencies = [ [[package]] name = "gix-merge" version = "0.0.0" +dependencies = [ + "bstr", + "document-features", + "gix-command", + "gix-filter", + "gix-fs 0.11.3", + "gix-hash 0.14.2", + "gix-object 0.44.0", + "gix-path 0.10.11", + "gix-tempfile 14.0.2", + "gix-trace 0.1.10", + "gix-worktree 0.36.0", + "imara-diff", + "serde", + "thiserror", +] [[package]] name = "gix-negotiate" diff --git a/gix-diff/src/blob/platform.rs b/gix-diff/src/blob/platform.rs index 4c540cce85d..495d23bd433 100644 --- a/gix-diff/src/blob/platform.rs +++ b/gix-diff/src/blob/platform.rs @@ -383,6 +383,7 @@ impl Platform { /// /// If one of the resources is binary, the operation reports an error as such resources don't make their data available /// which is required for the external diff to run. + // TODO: fix this - the diff shouldn't fail if binary (or large) files are used, just copy them into tempfiles. pub fn prepare_diff_command( &self, diff_command: BString, diff --git a/gix-merge/Cargo.toml b/gix-merge/Cargo.toml index 2114995cf07..b75d4cb3845 100644 --- a/gix-merge/Cargo.toml +++ b/gix-merge/Cargo.toml @@ -14,5 +14,31 @@ workspace = true [lib] doctest = false +[features] +default = ["blob"] +## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation. +blob = ["dep:imara-diff", "dep:gix-filter", "dep:gix-worktree", "dep:gix-path", "dep:gix-fs", "dep:gix-command", "dep:gix-tempfile", "dep:gix-trace"] +## Data structures implement `serde::Serialize` and `serde::Deserialize`. 
+serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"] + [dependencies] +gix-hash = { version = "^0.14.2", path = "../gix-hash" } +gix-object = { version = "^0.44.0", path = "../gix-object" } +gix-filter = { version = "^0.13.0", path = "../gix-filter", optional = true } +gix-worktree = { version = "^0.36.0", path = "../gix-worktree", default-features = false, features = ["attributes"], optional = true } +gix-command = { version = "^0.3.9", path = "../gix-command", optional = true } +gix-path = { version = "^0.10.11", path = "../gix-path", optional = true } +gix-fs = { version = "^0.11.3", path = "../gix-fs", optional = true } +gix-tempfile = { version = "^14.0.0", path = "../gix-tempfile", optional = true } +gix-trace = { version = "^0.1.10", path = "../gix-trace", optional = true } + +thiserror = "1.0.63" +imara-diff = { version = "0.1.7", optional = true } +bstr = { version = "1.5.0", default-features = false } +serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } + +document-features = { version = "0.2.0", optional = true } +[package.metadata.docs.rs] +all-features = true +features = ["document-features"] diff --git a/gix-merge/src/blob/builtin_driver.rs b/gix-merge/src/blob/builtin_driver.rs new file mode 100644 index 00000000000..bf32b85e6ff --- /dev/null +++ b/gix-merge/src/blob/builtin_driver.rs @@ -0,0 +1,165 @@ +use crate::blob::BuiltinDriver; + +impl BuiltinDriver { + /// Return the name of this instance. + pub fn as_str(&self) -> &str { + match self { + BuiltinDriver::Text => "text", + BuiltinDriver::Binary => "binary", + BuiltinDriver::Union => "union", + } + } + + /// Get all available built-in drivers. + pub fn all() -> &'static [Self] { + &[BuiltinDriver::Text, BuiltinDriver::Binary, BuiltinDriver::Union] + } + + /// Try to match one of our variants to `name`, case-sensitive, and return its instance. 
+    pub fn by_name(name: &str) -> Option<Self> {
+        Self::all().iter().find(|variant| variant.as_str() == name).copied()
+    }
+}
+
+/// Identify a merge resolution.
+#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub enum Resolution {
+    /// Everything could be resolved during the merge.
+    Complete,
+    /// A conflict is still present.
+    Conflict,
+}
+
+///
+pub mod binary {
+    use crate::blob::builtin_driver::Resolution;
+
+    /// What to do when having to pick a side to resolve a conflict.
+    #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
+    pub enum ResolveWith {
+        /// Choose the ancestor to resolve a conflict.
+        Ancestor,
+        /// Choose our side to resolve a conflict.
+        Ours,
+        /// Choose their side to resolve a conflict.
+        Theirs,
+    }
+
+    /// Tell the caller of [`merge()`] which side was picked.
+    #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
+    pub enum Pick {
+        /// The ancestor was chosen.
+        Ancestor,
+        /// Our side was chosen.
+        Ours,
+        /// Their side was chosen.
+        Theirs,
+    }
+
+    /// As this algorithm doesn't look at the actual data, it returns a choice solely based on logic.
+    ///
+    /// It always results in a conflict with `current` being picked, unless `on_conflict` is not `None`.
+    pub fn merge(on_conflict: Option<ResolveWith>) -> (Pick, Resolution) {
+        match on_conflict {
+            None => (Pick::Ours, Resolution::Conflict),
+            Some(ResolveWith::Ours) => (Pick::Ours, Resolution::Complete),
+            Some(ResolveWith::Theirs) => (Pick::Theirs, Resolution::Complete),
+            Some(ResolveWith::Ancestor) => (Pick::Ancestor, Resolution::Complete),
+        }
+    }
+}
+
+///
+pub mod text {
+    use crate::blob::builtin_driver::Resolution;
+
+    /// The way the built-in [text driver](crate::blob::BuiltinDriver::Text) will express
+    /// merge conflicts in the resulting file.
+ #[derive(Default, Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] + pub enum ConflictStyle { + /// Only show the zealously minified conflicting lines of the local changes and the incoming (other) changes, + /// hiding the base version entirely. + /// + /// ``` + /// line1-changed-by-both + /// <<<<<<< local + /// line2-to-be-changed-in-incoming + /// ======= + /// line2-changed + /// >>>>>>> incoming + ///``` + #[default] + Merge, + /// Show non-minimized hunks of local changes, the base, and the incoming (other) changes. + /// + /// This mode does not hide any information. + /// ``` + /// <<<<<<< local + /// line1-changed-by-both + /// line2-to-be-changed-in-incoming + /// ||||||| 9a8d80c + /// line1-to-be-changed-by-both + /// line2-to-be-changed-in-incoming + /// ======= + /// line1-changed-by-both + /// line2-changed + /// >>>>>>> incoming + ///``` + Diff3, + /// Like [`Diff3](Self::Diff3), but will show *minimized* hunks of local change and the incoming (other) changes, + /// as well as non-minimized hunks of the base. + /// + /// ``` + /// line1-changed-by-both + /// <<<<<<< local + /// line2-to-be-changed-in-incoming + /// ||||||| 9a8d80c + /// line1-to-be-changed-by-both + /// line2-to-be-changed-in-incoming + /// ======= + /// line2-changed + /// >>>>>>> incoming + /// ``` + ZealousDiff3, + } + + /// Options for the builtin [text driver](crate::blob::BuiltinDriver::Text). + #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] + pub struct Options { + /// How to visualize conflicts in merged files. + pub conflict_style: ConflictStyle, + /// The amount of markers to draw, defaults to 7, i.e. `<<<<<<<` + pub marker_size: usize, + /// Decide what to do to automatically resolve conflicts. + /// If `None`, add conflict markers according to `conflict_style` and `marker_size`. 
+        pub on_conflict: Option<ResolveWith>,
+    }
+
+    impl Default for Options {
+        fn default() -> Self {
+            Options {
+                conflict_style: Default::default(),
+                marker_size: 7,
+                on_conflict: None,
+            }
+        }
+    }
+
+    /// What to do to resolve a conflict.
+    #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
+    pub enum ResolveWith {
+        /// Choose our side to resolve a conflict.
+        Ours,
+        /// Choose their side to resolve a conflict.
+        Theirs,
+        /// Place our and their lines one after another, in any order.
+        Union,
+    }
+
+    /// Merge `current` and `other` with `ancestor` as base according to `opts`.
+    ///
+    /// Place the merged result in `out` and return the resolution.
+    pub fn merge(_out: &mut Vec<u8>, _current: &[u8], _ancestor: &[u8], _other: &[u8], _opts: Options) -> Resolution {
+        todo!("text merge");
+    }
+}
diff --git a/gix-merge/src/blob/mod.rs b/gix-merge/src/blob/mod.rs
new file mode 100644
index 00000000000..2b778929a54
--- /dev/null
+++ b/gix-merge/src/blob/mod.rs
@@ -0,0 +1,145 @@
+// TODO: remove this - only needed while `&mut Vec<u8>` isn't used.
+#![allow(clippy::ptr_arg)]
+
+use bstr::BString;
+use std::path::PathBuf;
+
+///
+pub mod builtin_driver;
+///
+pub mod pipeline;
+///
+pub mod platform;
+
+/// A way to classify a resource suitable for merging.
+#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash)]
+pub enum ResourceKind {
+    /// Our side of the state.
+    CurrentOrOurs,
+    /// Their side of the state.
+    OtherOrTheirs,
+    /// The state of the common base of both ours and theirs.
+    CommonAncestorOrBase,
+}
+
+/// Define a driver program that merges
+///
+/// Some values are related to diffing, some are related to conversions.
+#[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub enum BuiltinDriver {
+    /// Perform a merge between text-sources such that conflicts are marked according to
+    /// `merge.conflictStyle` in the Git configuration.
+    ///
+    /// If any of the inputs, *base*, *ours* or *theirs* looks like non-text/binary,
+    /// the [`Binary`](Self::Binary) driver will be used instead.
+    ///
+    /// Also see [`builtin_driver::text::ConflictStyle`].
+    #[default]
+    Text,
+    /// Merge 'unmergeable' content by choosing *ours* or *theirs*, without performing
+    /// an actual merge.
+    ///
+    /// Note that if the merge operation is for a virtual ancestor (a merge for merge-bases),
+    /// then *ours* will always be chosen.
+    Binary,
+    /// Merge text-sources and resolve conflicts by adding conflicting lines one after another,
+    /// in random order, without adding conflict markers either.
+    ///
+    /// This can be useful for files that change a lot, but will remain usable merely by adding
+    /// all changed lines.
+    Union,
+}
+
+/// Define a driver program that merges
+///
+/// Some values are related to diffing, some are related to conversions.
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
+pub struct Driver {
+    /// The name of the driver, as referred to by `[merge "name"]` in the git configuration.
+    pub name: BString,
+    /// The human-readable version of `name`, only to be used for displaying driver-information to the user.
+    pub display_name: BString,
+    /// The command to execute to perform the merge entirely like `<command> %O %A %B %L %P %S %X %Y`.
+    ///
+    /// * **%O**
+    ///   - the common ancestor version, or *base*.
+    /// * **%A**
+    ///   - the current version, or *ours*.
+    /// * **%B**
+    ///   - the other version, or *theirs*.
+    /// * **%L**
+    ///   - The conflict-marker size as positive number.
+    /// * **%P**
+    ///   - The path in which the merged result will be stored.
+    /// * **%S**
+    ///   - The conflict-label for the common ancestor or *base*.
+    /// * **%X**
+    ///   - The conflict-label for the current version or *ours*.
+    /// * **%Y**
+    ///   - The conflict-label for the other version or *theirs*.
+    ///
+    /// Note that conflict-labels are behind the conflict markers, to annotate them.
+    ///
+    /// A typical invocation with all arguments substituted could then look like this:
+    ///
+    /// ```text
+    /// <command> .merge_file_nR2Qs1 .merge_file_WYXCJe .merge_file_UWbzrm 7 file e2a2970 HEAD feature
+    /// ```
+    pub command: BString,
+    /// If set, this is the `name` of the driver to use when a virtual-merge-base is created, as a merge of all
+    /// available merge-bases if there are more than one.
+    ///
+    /// This value can also be special built-in drivers named `text`, `binary` or `union`. Note that user-defined
+    /// drivers with the same name will be preferred over built-in ones, but only for files whose git attributes
+    /// specified the driver by *name*.
+    pub recursive: Option<BString>,
+}
+
+/// A conversion pipeline to take an object or path from what's stored in Git to what can be merged, while
+/// following the guidance of git-attributes at the respective path to learn how the merge should be performed.
+///
+/// Depending on the source, different conversions are performed:
+///
+/// * `worktree on disk` -> `object for storage in git`
+/// * `object` -> `possibly renormalized object`
+///   - Renormalization means that the `object` is converted to what would be checked out into the work-tree,
+///     just to turn it back into an object.
+#[derive(Clone)]
+pub struct Pipeline {
+    /// A way to read data directly from the worktree.
+    pub roots: pipeline::WorktreeRoots,
+    /// A pipeline to convert objects from the worktree to Git, and also from Git to the worktree, and back to Git.
+    pub filter: gix_filter::Pipeline,
+    /// Options affecting the way we read files.
+    pub options: pipeline::Options,
+    /// All available merge drivers.
+    ///
+    /// They are referenced in git-attributes by name, and we hand out indices into this array.
+    drivers: Vec<Driver>,
+    /// Pre-configured attributes to obtain additional merge-related information.
+    attrs: gix_filter::attributes::search::Outcome,
+    /// A buffer to produce disk-accessible paths from worktree roots.
+ path: PathBuf, +} + +/// A utility for gathering and processing all state necessary to perform a three-way merge. +/// +/// It can re-use buffers if all three parts of participating in the merge are +/// set repeatedly. +#[derive(Clone)] +pub struct Platform { + /// The current version (ours). + current: Option, + /// The ancestor version (base). + ancestor: Option, + /// The other version (theirs). + other: Option, + + /// A way to convert objects into a diff-able format. + pub filter: Pipeline, + /// A way to access `.gitattributes` + pub attr_stack: gix_worktree::Stack, + + /// The way we convert resources into mergeable states. + filter_mode: pipeline::Mode, +} diff --git a/gix-merge/src/blob/pipeline.rs b/gix-merge/src/blob/pipeline.rs new file mode 100644 index 00000000000..90adb615051 --- /dev/null +++ b/gix-merge/src/blob/pipeline.rs @@ -0,0 +1,436 @@ +use super::{BuiltinDriver, Pipeline, ResourceKind}; +use bstr::{BStr, ByteSlice}; +use gix_filter::attributes; +use gix_filter::driver::apply::{Delay, MaybeDelayed}; +use gix_filter::pipeline::convert::{ToGitOutcome, ToWorktreeOutcome}; +use gix_object::tree::EntryKind; +use std::io::Read; +use std::path::{Path, PathBuf}; + +/// Options for use in a [`Pipeline`]. +#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)] +pub struct Options { + /// The amount of bytes that an object has to reach before being treated as binary. + /// These objects will not be queried, nor will their data be processed in any way. + /// If `0`, no file is ever considered binary due to their size. + /// + /// Note that for files stored in `git`, what counts is their stored, decompressed size, + /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets + /// them. + /// However, if they are to be retrieved from the worktree, the worktree size is what matters, + /// even though that also might be a `git-lfs` file which is small in Git. 
+    pub large_file_threshold_bytes: u64,
+    /// Capabilities of the file system which affect how we read worktree files.
+    pub fs: gix_fs::Capabilities,
+    /// Define which driver to use if the `merge` attribute for a resource is unspecified.
+    ///
+    /// This is the value of the `merge.default` git configuration.
+    pub default_driver: Option<BuiltinDriver>,
+}
+
+/// The specific way to convert a resource.
+#[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub enum Mode {
+    /// Prepare resources as they are stored in `git`.
+    ///
+    /// This is naturally the case when object-ids are used, but a conversion is needed
+    /// when data is read from a worktree.
+    #[default]
+    ToGit,
+    /// For sources that are object-ids, convert them to what *would* be stored in the worktree,
+    /// and back to what *would* be stored in Git.
+    ///
+    /// Sources that are located in a worktree are merely converted to what *would* be stored in Git.
+    ///
+    /// This is useful to prevent merge conflicts due to inconsistent whitespace.
+    Renormalize,
+}
+
+/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree.
+#[derive(Clone, Debug, Default)]
+pub struct WorktreeRoots {
+    /// The worktree root where the current (or our) version of the resource is present.
+    pub current_root: Option<PathBuf>,
+    /// The worktree root where the other (or their) version of the resource is present.
+    pub other_root: Option<PathBuf>,
+    /// The worktree root containing the resource of the common ancestor of our and their version.
+    pub common_ancestor_root: Option<PathBuf>,
+}
+
+impl WorktreeRoots {
+    /// Return the root path for the given `kind`.
+    pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
+        match kind {
+            ResourceKind::CurrentOrOurs => self.current_root.as_deref(),
+            ResourceKind::CommonAncestorOrBase => self.common_ancestor_root.as_deref(),
+            ResourceKind::OtherOrTheirs => self.other_root.as_deref(),
+        }
+    }
+
+    /// Return `true` if all worktree roots are unset.
+    pub fn is_unset(&self) -> bool {
+        self.current_root.is_none() && self.other_root.is_none() && self.common_ancestor_root.is_none()
+    }
+}
+
+/// Lifecycle
+impl Pipeline {
+    /// Create a new instance of a pipeline which produces blobs suitable for merging.
+    ///
+    /// `roots` allows reading worktree files directly, and `worktree_filter` is used
+    /// to transform object database data directly. `drivers` further configure individual paths.
+    /// `options` are used to further configure the way we act.
+    pub fn new(
+        roots: WorktreeRoots,
+        worktree_filter: gix_filter::Pipeline,
+        mut drivers: Vec<super::Driver>,
+        options: Options,
+    ) -> Self {
+        drivers.sort_by(|a, b| a.name.cmp(&b.name));
+        Pipeline {
+            roots,
+            filter: worktree_filter,
+            drivers,
+            options,
+            attrs: {
+                let mut out = gix_filter::attributes::search::Outcome::default();
+                out.initialize_with_selection(&Default::default(), Some("merge"));
+                out
+            },
+            path: Default::default(),
+        }
+    }
+}
+
+/// Access
+impl Pipeline {
+    /// Return all drivers that this instance was initialized with.
+    ///
+    /// They are sorted by [`name`](super::Driver::name) to support binary searches.
+    pub fn drivers(&self) -> &[super::Driver] {
+        &self.drivers
+    }
+}
+
+/// Data as part of an [Outcome].
+#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
+pub enum Data {
+    /// The data to use for merging was written into the buffer that was passed during the call to [`Pipeline::convert_to_mergeable()`].
+ Buffer, + /// The size that the binary blob had at the given revision, without having applied filters, as it's either + /// considered binary or above the big-file threshold. + /// + /// In this state, the binary file cannot be merged. + Binary { + /// The size of the object prior to performing any filtering or as it was found on disk. + /// + /// Note that technically, the size isn't always representative of the same 'state' of the + /// content, as once it can be the size of the blob in git, and once it's the size of file + /// in the worktree - both can differ a lot depending on filters. + size: u64, + }, +} + +/// The selection of the driver to use by a resource obtained with [`Pipeline::convert_to_mergeable()`]. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug, Hash)] +pub enum DriverChoice { + /// Use the given built-in driver to perform the merge. + BuiltIn(BuiltinDriver), + /// Use the user-provided driver program using the index into [the pipelines driver array](Pipeline::drivers(). + Index(usize), +} + +impl Default for DriverChoice { + fn default() -> Self { + DriverChoice::BuiltIn(Default::default()) + } +} + +/// The outcome returned by [Pipeline::convert_to_mergeable()]. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub struct Outcome { + /// If available, an index into the `drivers` field to access more diff-related information of the driver for items + /// at the given path, as previously determined by git-attributes. + /// + /// * `merge` is set + /// - Use the [`BuiltinDriver::Text`] + /// * `-merge` is unset + /// - Use the [`BuiltinDriver::Binary`] + /// * `!merge` is unspecified + /// - Use [`Options::default_driver`] or [`BuiltinDriver::Text`]. + /// * `merge=name` + /// - Search for a user-configured or built-in driver called `name`. + /// - If not found, silently default to [`BuiltinDriver::Text`] + /// + /// Note that drivers are queried even if there is no object available. 
+ pub driver: DriverChoice, + /// The data itself, suitable for diffing, and if the object or worktree item is present at all. + /// Otherwise, it's `None`. + pub data: Option, +} + +/// +pub mod convert_to_mergeable { + use std::collections::TryReserveError; + + use bstr::BString; + use gix_object::tree::EntryKind; + + /// The error returned by [Pipeline::convert_to_mergeable()](super::Pipeline::convert_to_mergeable()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")] + InvalidEntryKind { rela_path: BString, actual: EntryKind }, + #[error("Entry at '{rela_path}' could not be read as symbolic link")] + ReadLink { rela_path: BString, source: std::io::Error }, + #[error("Entry at '{rela_path}' could not be opened for reading or read from")] + OpenOrRead { rela_path: BString, source: std::io::Error }, + #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")] + StreamCopy { rela_path: BString, source: std::io::Error }, + #[error(transparent)] + FindObject(#[from] gix_object::find::existing_object::Error), + #[error(transparent)] + ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), + #[error(transparent)] + ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error), + #[error("Memory allocation failed")] + OutOfMemory(#[from] TryReserveError), + } +} + +/// Conversion +impl Pipeline { + /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`. + /// The resulting merge-able data is written into `out`, if it's not too large or considered binary. + /// The returned [`Outcome`] contains information on how to use `out`, or if it's filled at all. + /// + /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is + /// a resource in the object database, i.e. 
if no worktree root is available. It's notable that if a worktree root + /// is present for `kind`, then a `rela_path` is used to access it on disk. + /// + /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case + /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`. + /// This is useful to simplify the calling code as empty buffers signal that nothing is there. + /// + /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode. + /// Only blobs are allowed. + /// + /// Use `convert` to control what kind of the resource will be produced. + #[allow(clippy::too_many_arguments)] + pub fn convert_to_mergeable( + &mut self, + id: &gix_hash::oid, + mode: EntryKind, + rela_path: &BStr, + kind: ResourceKind, + attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome), + objects: &dyn gix_object::FindObjectOrHeader, + convert: Mode, + out: &mut Vec, + ) -> Result { + if !matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable) { + return Err(convert_to_mergeable::Error::InvalidEntryKind { + rela_path: rela_path.to_owned(), + actual: mode, + }); + } + + out.clear(); + attributes(rela_path, &mut self.attrs); + let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'"); + let driver = match attr.assignment.state { + attributes::StateRef::Set => DriverChoice::BuiltIn(BuiltinDriver::Text), + attributes::StateRef::Unset => DriverChoice::BuiltIn(BuiltinDriver::Binary), + attributes::StateRef::Value(name) => { + let name = name.as_bstr(); + self.drivers + .binary_search_by(|d| d.name.as_bstr().cmp(name)) + .ok() + .map(DriverChoice::Index) + .or_else(|| { + name.to_str() + .ok() + .and_then(BuiltinDriver::by_name) + .map(DriverChoice::BuiltIn) + }) + .unwrap_or_default() + } + attributes::StateRef::Unspecified => self + .options + .default_driver + 
.map(DriverChoice::BuiltIn) + .unwrap_or_default(), + }; + match self.roots.by_kind(kind) { + Some(root) => { + self.path.clear(); + self.path.push(root); + self.path.push(gix_path::from_bstr(rela_path)); + let size_in_bytes = (self.options.large_file_threshold_bytes > 0) + .then(|| { + none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| { + convert_to_mergeable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + }) + }) + .transpose()?; + let data = match size_in_bytes { + Some(None) => None, // missing as identified by the size check + Some(Some(size)) if size > self.options.large_file_threshold_bytes => Some(Data::Binary { size }), + _ => { + let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| { + convert_to_mergeable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + + if let Some(file) = file { + match convert { + Mode::ToGit | Mode::Renormalize => { + let res = self.filter.convert_to_git( + file, + gix_path::from_bstr(rela_path).as_ref(), + attributes, + &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())), + )?; + + match res { + ToGitOutcome::Unchanged(mut file) => { + file.read_to_end(out).map_err(|err| { + convert_to_mergeable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToGitOutcome::Process(mut stream) => { + stream.read_to_end(out).map_err(|err| { + convert_to_mergeable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToGitOutcome::Buffer(buf) => { + out.clear(); + out.try_reserve(buf.len())?; + out.extend_from_slice(buf); + } + } + } + } + + Some(if is_binary_buf(out) { + let size = out.len() as u64; + out.clear(); + Data::Binary { size } + } else { + Data::Buffer + }) + } else { + None + } + } + }; + Ok(Outcome { driver, data }) + } + None => { + let data = if id.is_null() { + None + } else { + let header = objects + .try_header(id) + 
.map_err(gix_object::find::existing_object::Error::Find)? + .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; + let is_binary = self.options.large_file_threshold_bytes > 0 + && header.size > self.options.large_file_threshold_bytes; + let data = if is_binary { + Data::Binary { size: header.size } + } else { + objects + .try_find(id, out) + .map_err(gix_object::find::existing_object::Error::Find)? + .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; + + if convert == Mode::Renormalize { + let res = self + .filter + .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?; + + match res { + ToWorktreeOutcome::Unchanged(_) => {} + ToWorktreeOutcome::Buffer(src) => { + out.clear(); + out.try_reserve(src.len())?; + out.extend_from_slice(src); + } + ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { + std::io::copy(&mut stream, out).map_err(|err| { + convert_to_mergeable::Error::StreamCopy { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { + unreachable!("we prohibit this") + } + }; + } + + let res = self.filter.convert_to_git( + &**out, + &gix_path::from_bstr(rela_path), + attributes, + &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())), + )?; + + match res { + ToGitOutcome::Unchanged(_) => {} + ToGitOutcome::Process(mut stream) => { + stream + .read_to_end(out) + .map_err(|err| convert_to_mergeable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + })?; + } + ToGitOutcome::Buffer(buf) => { + out.clear(); + out.try_reserve(buf.len())?; + out.extend_from_slice(buf); + } + } + + if is_binary_buf(out) { + let size = out.len() as u64; + out.clear(); + Data::Binary { size } + } else { + Data::Buffer + } + }; + Some(data) + }; + Ok(Outcome { driver, data }) + } + } + } +} + +fn none_if_missing(res: std::io::Result) -> std::io::Result> { + match res { + Ok(data) 
=> Ok(Some(data)), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(err) => Err(err), + } +} + +fn is_binary_buf(buf: &[u8]) -> bool { + let buf = &buf[..buf.len().min(8000)]; + buf.contains(&0) +} diff --git a/gix-merge/src/blob/platform.rs b/gix-merge/src/blob/platform.rs new file mode 100644 index 00000000000..e6b4d3c8024 --- /dev/null +++ b/gix-merge/src/blob/platform.rs @@ -0,0 +1,407 @@ +use bstr::{BStr, BString}; + +use crate::blob::pipeline::DriverChoice; +use crate::blob::{pipeline, Pipeline, Platform, ResourceKind}; + +/// A stored value representing a resource that participates in a merge. +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub(super) struct Resource { + /// The `id` of the value, or `null` if it's only living in a worktree. + id: gix_hash::ObjectId, + /// The repository-relative path where the resource lives in the tree. + rela_path: BString, + /// The outcome of converting a resource into a diffable format using [Pipeline::convert_to_mergeable()]. + conversion: pipeline::Outcome, + /// The kind of the resource we are looking at. Only possible values are `Blob` and `BlobExecutable`. + mode: gix_object::tree::EntryKind, + /// A possibly empty buffer, depending on `conversion.data` which may indicate the data is considered binary + /// or the resource doesn't exist. + buffer: Vec, +} + +/// A blob or executable ready to be merged in one way or another. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct ResourceRef<'a> { + /// The data itself, suitable for merging, and if the object or worktree item is present at all. + pub data: resource::Data<'a>, + /// The location of the resource, relative to the working tree. + pub rela_path: &'a BStr, + /// Which driver to use according to the resource's configuration. 
+ pub driver_choice: DriverChoice, + /// The id of the content as it would be stored in `git`, or `null` if the content doesn't exist anymore at + /// `rela_path` or if it was never computed. This can happen with content read from the worktree, which + /// after its 'to-git' conversion never had its hash computed. + pub id: &'a gix_hash::oid, +} + +/// +pub mod resource { + use crate::blob::{ + pipeline, + platform::{Resource, ResourceRef}, + }; + + impl<'a> ResourceRef<'a> { + pub(super) fn new(cache: &'a Resource) -> Self { + ResourceRef { + data: cache.conversion.data.map_or(Data::Missing, |data| match data { + pipeline::Data::Buffer => Data::Buffer(&cache.buffer), + pipeline::Data::Binary { size } => Data::Binary { size }, + }), + driver_choice: cache.conversion.driver, + rela_path: cache.rela_path.as_ref(), + id: &cache.id, + } + } + } + + /// The data of a mergeable resource, as it could be determined and computed previously. + #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] + pub enum Data<'a> { + /// The object is missing, either because it didn't exist in the working tree or because its `id` was null. + Missing, + /// The textual data as processed and ready for merging, i.e. suitable for storage in Git. + Buffer(&'a [u8]), + /// The size that the binary blob had at the given revision, without having applied filters, as it's either + /// considered binary or above the big-file threshold. + /// + /// In this state, the binary file cannot be merged. + Binary { + /// The size of the object prior to performing any filtering or as it was found on disk. + /// + /// Note that technically, the size isn't always representative of the same 'state' of the + /// content, as once it can be the size of the blob in Git, and once it's the size of file + /// in the worktree. + size: u64, + }, + } + + impl<'a> Data<'a> { + /// Return ourselves as slice of bytes if this instance stores data. 
+ pub fn as_slice(&self) -> Option<&'a [u8]> { + match self { + Data::Buffer(d) => Some(d), + Data::Binary { .. } | Data::Missing => None, + } + } + } +} + +/// +pub mod set_resource { + use bstr::BString; + + use crate::blob::{pipeline, ResourceKind}; + + /// The error returned by [Platform::set_resource](super::Platform::set_resource). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Can only diff blobs, not {mode:?}")] + InvalidMode { mode: gix_object::tree::EntryKind }, + #[error("Failed to read {kind:?} worktree data from '{rela_path}'")] + Io { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + #[error("Failed to obtain attributes for {kind:?} resource at '{rela_path}'")] + Attributes { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + #[error(transparent)] + ConvertToMergeable(#[from] pipeline::convert_to_mergeable::Error), + } +} + +/// +pub mod merge { + use crate::blob::platform::ResourceRef; + use crate::blob::{builtin_driver, BuiltinDriver}; + use bstr::BString; + + /// The product of a [`prepare_merge()`](crate::blob::Platform::prepare_merge_state()) call to finally + /// perform the merge and retrieve the merge results. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub struct State<'a> { + /// The current or our side of the merge operation. + pub current: ResourceRef<'a>, + /// The ancestor or base of the merge operation. + pub ancestor: ResourceRef<'a>, + /// The other or their side of the merge operation. + pub other: ResourceRef<'a>, + } + + #[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] + pub struct Options { + /// If `true`, the resources being merged are contained in a virtual ancestor, + /// which is the case when merge bases are merged into one. + pub is_virtual_ancestor: bool, + /// Determine how to resolve conflicts. If `None`, no conflict resolution is possible and it picks a side. 
+ pub resolve_binary_with: Option<builtin_driver::binary::ResolveWith>, + /// Options for the builtin [text driver](BuiltinDriver::Text). + pub text: builtin_driver::text::Options, + } + + /// + pub mod prepare_external_driver { + use std::ops::{Deref, DerefMut}; + + use crate::blob::ResourceKind; + use bstr::BString; + + /// The error returned by [Merge::prepare_merge_command()](super::Merge::prepare_merge_command()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Binary resources can't be diffed with an external command (as we don't have the data anymore)")] + SourceOrDestinationAreBinary, + #[error( + "Tempfile to store content of '{rela_path}' ({kind:?}) for passing to external merge command could not be created" + )] + CreateTempfile { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + #[error( + "Could not write content of '{rela_path}' ({kind:?}) to tempfile for passing to external merge command" + )] + WriteTempfile { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + } + + /// The product of a [`prepare_merge_command`](super::Merge::prepare_merge_command()) operation. + /// + /// This type acts like [`std::process::Command`], ready to run, with `stderr` set to *inherit*, + /// but `stdin` closed and `stdout` setup to be captured. + // TODO: remove this + #[allow(dead_code)] + pub struct Command { + /// The pre-configured command + cmd: std::process::Command, + /// A tempfile holding the *current* (ours) state of the resource. + current: gix_tempfile::Handle<gix_tempfile::handle::Closed>, + /// A tempfile holding the *ancestor* (base) state of the resource. + ancestor: gix_tempfile::Handle<gix_tempfile::handle::Closed>, + /// A tempfile holding the *other* (their) state of the resource. 
+ other: gix_tempfile::Handle<gix_tempfile::handle::Closed>, + } + + impl Deref for Command { + type Target = std::process::Command; + + fn deref(&self) -> &Self::Target { + &self.cmd + } + } + + impl DerefMut for Command { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.cmd + } + } + } + + /// + pub mod builtin_merge { + /// An identifier to tell us how a merge conflict was resolved by [builtin_merge](super::State::builtin_merge). + pub enum Pick { + /// Chose the ancestor. + Ancestor, + /// Chose our side. + Ours, + /// Chose their side. + Theirs, + /// New data was produced with the result of the merge, to be found in the buffer that was passed to + /// [builtin_merge()](super::State::builtin_merge). + Buffer, + } + } + + impl State<'_> { + /// Given `merge_command` and `context`, typically obtained from git-configuration, and the currently set merge-resources, + /// prepare the invocation and temporary files needed to launch it according to protocol. + /// + /// Please note that this is an expensive operation as this will always create three temporary files to hold all sides of the merge. + /// + /// ### Deviation + /// + /// We allow passing more context than Git would by taking a whole `context`, it's up to the caller to decide how much is filled. + pub fn prepare_external_driver( + &self, + _merge_command: BString, + _context: gix_command::Context, + ) -> Result<prepare_external_driver::Command, prepare_external_driver::Error> { + todo!("prepare command") + } + + /// Perform the merge according to our resources and `opts`. + /// Note that if the *pick* wasn't [`Buffer`](builtin_merge::Pick::Buffer), then `out` will not have been cleared. + pub fn builtin_merge( + &self, + _out: &mut Vec<u8>, + _driver: BuiltinDriver, + _opts: Options, + ) -> (builtin_merge::Pick, builtin_driver::Resolution) { + todo!("do full merge") + } + } +} + +/// +pub mod prepare_merge { + /// The error returned by [Platform::prepare_merge()](super::Platform::prepare_merge_state()). 
+ #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("The 'current', 'ancestor' or 'other' resource for the merge operation were not set")] + UnsetResource, + #[error("Tried to merge 'current' and 'other' where at least one of them is removed")] + CurrentOrOtherRemoved, + } +} + +/// Lifecycle +impl Platform { + /// Create a new instance with a way to `filter` data from the object database and turn it into something that is merge-able. + /// `filter_mode` decides how to do that specifically. + /// Use `attr_stack` to access attributes pertaining worktree filters and merge settings. + pub fn new(filter: Pipeline, filter_mode: pipeline::Mode, attr_stack: gix_worktree::Stack) -> Self { + Platform { + current: None, + ancestor: None, + other: None, + filter, + filter_mode, + attr_stack, + } + } +} + +/// Preparation +impl Platform { + /// Store enough information about a resource to eventually use it in a merge, where… + /// + /// * `id` is the hash of the resource. If it [is null](gix_hash::ObjectId::is_null()), it should either + /// be a resource in the worktree, or it's considered a non-existing, deleted object. + /// If an `id` is known, as the hash of the object as (would) be stored in `git`, then it should be provided + /// for completeness. Note that it's not expected to be in `objects` if `rela_path` is set and a worktree-root + /// is available for `kind`. + /// * `mode` is the kind of object (only blobs and links are allowed) + /// * `rela_path` is the relative path as seen from the (work)tree root. + /// * `kind` identifies the side of the merge this resource will be used for. + /// * `objects` provides access to the object database in case the resource can't be read from a worktree. 
+ pub fn set_resource( + &mut self, + id: gix_hash::ObjectId, + mode: gix_object::tree::EntryKind, + rela_path: &BStr, + kind: ResourceKind, + objects: &impl gix_object::FindObjectOrHeader, + ) -> Result<(), set_resource::Error> { + self.set_resource_inner(id, mode, rela_path, kind, objects) + } + + /// Returns the resource of the given kind if it was set. + pub fn resource(&self, kind: ResourceKind) -> Option<ResourceRef<'_>> { + let cache = match kind { + ResourceKind::CurrentOrOurs => self.current.as_ref(), + ResourceKind::CommonAncestorOrBase => self.ancestor.as_ref(), + ResourceKind::OtherOrTheirs => self.other.as_ref(), + }?; + ResourceRef::new(cache).into() + } + + /// Prepare a merge operation on the [previously set](Self::set_resource()) [current](ResourceKind::CurrentOrOurs), + /// [ancestor](ResourceKind::CommonAncestorOrBase) and [other](ResourceKind::OtherOrTheirs) resources. + /// + /// The returned state allows to easily perform the actual merge, + /// which hints at what should be done. 
+ pub fn prepare_merge_state(&self) -> Result<merge::State<'_>, prepare_merge::Error> { + let current = self.current.as_ref().ok_or(prepare_merge::Error::UnsetResource)?; + let ancestor = self.ancestor.as_ref().ok_or(prepare_merge::Error::UnsetResource)?; + let other = self.other.as_ref().ok_or(prepare_merge::Error::UnsetResource)?; + + let out = merge::State { + current: ResourceRef::new(current), + ancestor: ResourceRef::new(ancestor), + other: ResourceRef::new(other), + }; + + match (current.conversion.data, other.conversion.data) { + (None, None) => Err(prepare_merge::Error::CurrentOrOtherRemoved), + (_, _) => Ok(out), + } + } +} + +impl Platform { + fn set_resource_inner( + &mut self, + id: gix_hash::ObjectId, + mode: gix_object::tree::EntryKind, + rela_path: &BStr, + kind: ResourceKind, + objects: &impl gix_object::FindObjectOrHeader, + ) -> Result<(), set_resource::Error> { + if !matches!( + mode, + gix_object::tree::EntryKind::Blob | gix_object::tree::EntryKind::BlobExecutable + ) { + return Err(set_resource::Error::InvalidMode { mode }); + } + let entry = + self.attr_stack + .at_entry(rela_path, None, objects) + .map_err(|err| set_resource::Error::Attributes { + source: err, + kind, + rela_path: rela_path.to_owned(), + })?; + + let storage = match kind { + ResourceKind::OtherOrTheirs => &mut self.other, + ResourceKind::CommonAncestorOrBase => &mut self.ancestor, + ResourceKind::CurrentOrOurs => &mut self.current, + }; + + let mut buf_storage = Vec::new(); + let out = self.filter.convert_to_mergeable( + &id, + mode, + rela_path, + kind, + &mut |_, out| { + let _ = entry.matching_attributes(out); + }, + objects, + self.filter_mode, + storage.as_mut().map_or(&mut buf_storage, |s| &mut s.buffer), + )?; + + match storage { + None => { + *storage = Some(Resource { + id, + rela_path: rela_path.to_owned(), + conversion: out, + mode, + buffer: buf_storage, + }); + } + Some(storage) => { + storage.id = id; + storage.rela_path = rela_path.to_owned(); + storage.conversion = out; + 
storage.mode = mode; + } + }; + Ok(()) + } +} diff --git a/gix-merge/src/lib.rs b/gix-merge/src/lib.rs index 3a6cd994a53..8e608c53ab4 100644 --- a/gix-merge/src/lib.rs +++ b/gix-merge/src/lib.rs @@ -1,2 +1,6 @@ #![deny(rust_2018_idioms)] #![forbid(unsafe_code)] + +/// +#[cfg(feature = "blob")] +pub mod blob;