diff --git a/PdfSharpCore.Test/Assets/frog-and-toad.jpg b/PdfSharpCore.Test/Assets/frog-and-toad.jpg
new file mode 100644
index 00000000..62543cce
Binary files /dev/null and b/PdfSharpCore.Test/Assets/frog-and-toad.jpg differ
diff --git a/PdfSharpCore.Test/Merge.cs b/PdfSharpCore.Test/Merge.cs
index ec1e6eed..e3bab0d7 100644
--- a/PdfSharpCore.Test/Merge.cs
+++ b/PdfSharpCore.Test/Merge.cs
@@ -1,25 +1,75 @@
-using System.IO;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using PdfSharpCore.Drawing;
+using PdfSharpCore.Drawing.Layout;
 using PdfSharpCore.Pdf;
 using PdfSharpCore.Pdf.IO;
 using PdfSharpCore.Test.Helpers;
 using Xunit;
+using Xunit.Abstractions;
 
 namespace PdfSharpCore.Test
 {
     public class Merge
     {
+        private readonly ITestOutputHelper _output;
+
+        public Merge(ITestOutputHelper output)
+        {
+            _output = output;
+        }
+
         [Fact]
         public void CanMerge2Documents()
         {
             var pdf1Path = PathHelper.GetInstance().GetAssetPath("FamilyTree.pdf");
             var pdf2Path = PathHelper.GetInstance().GetAssetPath("test.pdf");
 
+            var outputDocument = MergeDocuments(new[] { pdf1Path, pdf2Path });
+
+            var outFilePath = CreateOutFilePath("merge.pdf");
+            outputDocument.Save(outFilePath);
+        }
+
+        [Fact]
+        public void CanConsolidateImageDataInDocument()
+        {
+            var doc1 = CreateTestDocumentWithImage("lenna.png");
+            var doc2 = CreateTestDocumentWithImage("frog-and-toad.jpg");
+
+            var pdf1Path = CreateOutFilePath("image-doc1.pdf");
+            doc1.Save(pdf1Path);
+
+            var pdf2Path = CreateOutFilePath("image-doc2.pdf");
+            doc2.Save(pdf2Path);
+
+            var pdfPathsForMerge = Enumerable.Range(1, 50).SelectMany(_ => new[] { pdf1Path, pdf2Path });
+            var outputDocument = MergeDocuments(pdfPathsForMerge);
+
+            var mergedFilePath = CreateOutFilePath("images-merged.pdf");
+            outputDocument.Save(mergedFilePath);
+
+            outputDocument.ConsolidateImages();
+            var consolidatedFilePath = CreateOutFilePath("images-merged-consolidated.pdf");
+            outputDocument.Save(consolidatedFilePath);
+
+            long mergedLength = new FileInfo(mergedFilePath).Length;
+            long consolidatedLength = new FileInfo(consolidatedFilePath).Length;
+            _output.WriteLine($"Merged: {mergedLength} bytes, consolidated: {consolidatedLength} bytes");
+            Assert.True(consolidatedLength < mergedLength / 4, "Consolidated file should be less than a quarter of the merged file size.");
+        }
+
+        private static PdfDocument MergeDocuments(IEnumerable<string> pdfPaths)
+        {
             var outputDocument = new PdfDocument();
 
-            foreach (var pdfPath in new[] { pdf1Path, pdf2Path })
+            foreach (var pdfPath in pdfPaths)
             {
                 using var fs = File.OpenRead(pdfPath);
                 var inputDocument = Pdf.IO.PdfReader.Open(fs, PdfDocumentOpenMode.Import);
+
                 var count = inputDocument.PageCount;
                 for (var idx = 0; idx < count; idx++)
                 {
@@ -28,14 +78,34 @@ public void CanMerge2Documents()
                 }
             }
 
-            var outFilePath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", "merge.pdf");
+            return outputDocument;
+        }
+
+        private static string CreateOutFilePath(string filename)
+        {
+            var outFilePath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", filename);
             var dir = Path.GetDirectoryName(outFilePath);
             if (!Directory.Exists(dir))
             {
                 Directory.CreateDirectory(dir);
             }
 
-            outputDocument.Save(outFilePath);
+            return outFilePath;
+        }
+
+        private static PdfDocument CreateTestDocumentWithImage(string imageFilename)
+        {
+            var document = new PdfDocument();
+
+            var pageNewRenderer = document.AddPage();
+            using var renderer = XGraphics.FromPdfPage(pageNewRenderer);
+            var textFormatter = new XTextFormatter(renderer);
+
+            var layout = new XRect(12, 12, 400, 50);
+            textFormatter.DrawString(imageFilename, new XFont("Arial", 12), XBrushes.Black, layout);
+            renderer.DrawImage(XImage.FromFile(PathHelper.GetInstance().GetAssetPath(imageFilename)), new XPoint(12, 100));
+
+            return document;
         }
     }
 }
\ No newline at end of file
diff --git a/PdfSharpCore.Test/PdfSharpCore.Test.csproj b/PdfSharpCore.Test/PdfSharpCore.Test.csproj
index 844f5748..1fc9af5b 100644
--- a/PdfSharpCore.Test/PdfSharpCore.Test.csproj
+++ b/PdfSharpCore.Test/PdfSharpCore.Test.csproj
@@ -38,6 +38,9 @@ PreserveNewest + + PreserveNewest + +
diff --git a/PdfSharpCore/Pdf/PdfDocument.cs b/PdfSharpCore/Pdf/PdfDocument.cs
index 87fb4d62..674ef4b0 100644
--- a/PdfSharpCore/Pdf/PdfDocument.cs
+++ b/PdfSharpCore/Pdf/PdfDocument.cs
@@ -28,9 +28,12 @@
 #endregion
 
 using System;
+using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;
 using System.Linq;
+using System.Security.Cryptography;
+using System.Text;
 using PdfSharpCore.Pdf.Advanced;
 using PdfSharpCore.Pdf.Internal;
 using PdfSharpCore.Pdf.IO;
@@ -809,6 +812,95 @@ public void MakeAcroFormsReadOnly()
             }
         }
 
+        /// <summary>
+        /// Replaces duplicate image XObjects with references to one shared copy so that
+        /// identical image data is stored only once. Two images count as duplicates when the
+        /// MD5 hashes of their stream bytes are equal.
+        /// </summary>
+        public void ConsolidateImages()
+        {
+            // Maps the MD5 of an image stream to the item every duplicate is redirected to.
+            var mapMd5ToPdfItem = new Dictionary<string, PdfItem>();
+
+            foreach (ImageInfo img in ImageInfo.FindAll(this))
+            {
+                PdfItem sharedItem;
+                if (mapMd5ToPdfItem.TryGetValue(img.XObjectMD5, out sharedItem))
+                {
+                    img.XObjects.Elements[img.Item.Key] = sharedItem;
+                }
+                else
+                {
+                    mapMd5ToPdfItem.Add(img.XObjectMD5, img.Item.Value);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Describes one image XObject entry found in a page's resources.
+        /// </summary>
+        internal class ImageInfo
+        {
+            /// <summary>The /XObject resource dictionary that contains the entry.</summary>
+            public PdfDictionary XObjects { get; }
+
+            /// <summary>The entry (resource name and item) inside <see cref="XObjects"/>.</summary>
+            public KeyValuePair<string, PdfItem> Item { get; }
+
+            /// <summary>The image dictionary the entry refers to.</summary>
+            public PdfDictionary XObject { get; }
+
+            /// <summary>Lower-case hex MD5 of the image's stream bytes.</summary>
+            public string XObjectMD5 { get; }
+
+            public ImageInfo(PdfDictionary xObjects, KeyValuePair<string, PdfItem> item, PdfDictionary xObject)
+            {
+                XObjects = xObjects;
+                Item = item;
+                XObject = xObject;
+                XObjectMD5 = ComputeMD5(xObject.Stream.Value);
+            }
+
+            /// <summary>
+            /// Get info for each image in the document.
+            /// </summary>
+            internal static List<ImageInfo> FindAll(PdfDocument doc) =>
+                doc.Pages.Cast<PdfPage>()
+                    .Select(page => page.Elements.GetDictionary("/Resources"))
+                    .Select(resources => resources?.Elements?.GetDictionary("/XObject"))
+                    .Where(xObjects => xObjects?.Elements != null)
+                    .SelectMany(xObjects =>
+                        from item in xObjects.Elements
+                        let xObject = (item.Value as PdfReference)?.Value as PdfDictionary
+                        where xObject?.Elements?.GetString("/Subtype") == "/Image"
+                        // An image dictionary without stream data cannot be hashed; skip it.
+                        where xObject.Stream?.Value != null
+                        select new ImageInfo(xObjects, item, xObject)
+                    )
+                    .ToList();
+
+            /// <summary>
+            /// Compute and return the MD5 hash of the input data.
+            /// </summary>
+            internal static string ComputeMD5(byte[] input)
+            {
+                byte[] hashBytes;
+                // A hasher per call needs no locking and is disposed deterministically.
+                using (var hasher = MD5.Create())
+                {
+                    hashBytes = hasher.ComputeHash(input);
+                }
+
+                var sb = new StringBuilder(hashBytes.Length * 2);
+                foreach (var x in hashBytes)
+                {
+                    sb.Append(x.ToString("x2"));
+                }
+
+                return sb.ToString();
+            }
+        }
+
         /// <summary>
         /// Gets the security handler.
         /// </summary>