Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add image consolidation #447

Merged
merged 1 commit into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added PdfSharpCore.Test/Assets/frog-and-toad.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
77 changes: 73 additions & 4 deletions PdfSharpCore.Test/Merge.cs
Original file line number Diff line number Diff line change
@@ -1,25 +1,74 @@
using System.IO;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using PdfSharpCore.Drawing;
using PdfSharpCore.Drawing.Layout;
using PdfSharpCore.Pdf;
using PdfSharpCore.Pdf.IO;
using PdfSharpCore.Test.Helpers;
using Xunit;
using Xunit.Abstractions;

namespace PdfSharpCore.Test
{
public class Merge
{
private readonly ITestOutputHelper _output;

/// <summary>
/// Creates the test class and keeps the given xUnit output helper for later use.
/// </summary>
/// <param name="output">Output helper stored in <c>_output</c>.</param>
public Merge(ITestOutputHelper output) => _output = output;

/// <summary>
/// Merges the <c>FamilyTree.pdf</c> and <c>test.pdf</c> assets and saves the result as <c>merge.pdf</c>.
/// The test contains no assertions; it succeeds when nothing throws.
/// </summary>
[Fact]
public void CanMerge2Documents()
{
    var sourcePaths = new[]
    {
        PathHelper.GetInstance().GetAssetPath("FamilyTree.pdf"),
        PathHelper.GetInstance().GetAssetPath("test.pdf"),
    };

    var merged = MergeDocuments(sourcePaths);

    merged.Save(CreateOutFilePath("merge.pdf"));
}

/// <summary>
/// Verifies that <c>ConsolidateImages</c> shrinks a merged document that embeds the same
/// two images many times: the consolidated file must be smaller than a quarter of the
/// size of the plain merged file.
/// </summary>
[Fact]
public void CanConsolidateImageDataInDocument()
{
    // Two source documents, each embedding a different image.
    var lennaDocument = CreateTestDocumentWithImage("lenna.png");
    var frogDocument = CreateTestDocumentWithImage("frog-and-toad.jpg");

    var lennaPdfPath = CreateOutFilePath("image-doc1.pdf");
    lennaDocument.Save(lennaPdfPath);

    var frogPdfPath = CreateOutFilePath("image-doc2.pdf");
    frogDocument.Save(frogPdfPath);

    // Alternate the two inputs 50 times, giving 100 input documents to merge.
    var repeatedInputs = new List<string>();
    for (var round = 0; round < 50; round++)
    {
        repeatedInputs.Add(lennaPdfPath);
        repeatedInputs.Add(frogPdfPath);
    }

    var merged = MergeDocuments(repeatedInputs);

    // Save once before and once after consolidation so the file sizes can be compared.
    var mergedFilePath = CreateOutFilePath("images-merged.pdf");
    merged.Save(mergedFilePath);

    merged.ConsolidateImages();
    var consolidatedFilePath = CreateOutFilePath("images-merged-consolidated.pdf");
    merged.Save(consolidatedFilePath);

    var sizeBefore = new FileInfo(mergedFilePath).Length;
    var sizeAfter = new FileInfo(consolidatedFilePath).Length;
    Assert.True(sizeAfter < sizeBefore / 4);
}

private static PdfDocument MergeDocuments(IEnumerable<string> pdfPaths)
{
var outputDocument = new PdfDocument();

foreach (var pdfPath in new[] { pdf1Path, pdf2Path })
foreach (var pdfPath in pdfPaths)
{
using var fs = File.OpenRead(pdfPath);
var inputDocument = Pdf.IO.PdfReader.Open(fs, PdfDocumentOpenMode.Import);

var count = inputDocument.PageCount;
for (var idx = 0; idx < count; idx++)
{
Expand All @@ -28,14 +77,34 @@ public void CanMerge2Documents()
}
}

var outFilePath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", "merge.pdf");
return outputDocument;
}

/// <summary>
/// Builds the path of <paramref name="filename"/> below the test "Out" folder and makes
/// sure the containing directory exists.
/// </summary>
/// <param name="filename">Name of the output file, relative to the "Out" folder.</param>
/// <returns>The combined output path; the file itself is not created here.</returns>
private static string CreateOutFilePath(string filename)
{
    var outFilePath = Path.Combine(PathHelper.GetInstance().RootDir, "Out", filename);

    // Directory.CreateDirectory does nothing when the directory already exists,
    // so no separate Directory.Exists check is required.
    var dir = Path.GetDirectoryName(outFilePath);
    if (!string.IsNullOrEmpty(dir))
    {
        Directory.CreateDirectory(dir);
    }

    // Saving is the caller's job: this helper only resolves the path. The former
    // "outputDocument.Save(outFilePath)" line referred to a variable that does not
    // exist in this static method.
    return outFilePath;
}

/// <summary>
/// Creates a document with one page that shows the image file name as text in the
/// rectangle (12, 12, 400, 50) and the image itself drawn at point (12, 100).
/// </summary>
/// <param name="imageFilename">Name of an image asset; also used as the text drawn on the page.</param>
/// <returns>The new document, not yet saved.</returns>
private static PdfDocument CreateTestDocumentWithImage(string imageFilename)
{
    var result = new PdfDocument();
    var gfx = XGraphics.FromPdfPage(result.AddPage());

    // Text: the file name, so each generated page is identifiable.
    var captionFont = new XFont("Arial", 12);
    var captionArea = new XRect(12, 12, 400, 50);
    var formatter = new XTextFormatter(gfx);
    formatter.DrawString(imageFilename, captionFont, XBrushes.Black, captionArea);

    // Image: loaded from the test assets and drawn after the text.
    var imagePath = PathHelper.GetInstance().GetAssetPath(imageFilename);
    var image = XImage.FromFile(imagePath);
    gfx.DrawImage(image, new XPoint(12, 100));

    return result;
}
}
}
3 changes: 3 additions & 0 deletions PdfSharpCore.Test/PdfSharpCore.Test.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
<None Update="Assets\**\*.png">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="Assets\frog-and-toad.jpg">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
</ItemGroup>

</Project>
80 changes: 80 additions & 0 deletions PdfSharpCore/Pdf/PdfDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@
#endregion

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Security.Cryptography;
using System.Text;
using PdfSharpCore.Pdf.Advanced;
using PdfSharpCore.Pdf.Internal;
using PdfSharpCore.Pdf.IO;
Expand Down Expand Up @@ -809,6 +812,83 @@ public void MakeAcroFormsReadOnly()
}
}

/// <summary>
/// Makes image entries with identical stream data share one item. Every image found in the
/// pages' /XObject resource dictionaries is grouped by the MD5 hash of its stream bytes, and
/// each resource entry is then pointed at the single item chosen for its hash.
/// </summary>
/// <remarks>
/// NOTE(review): images are compared by stream bytes only; two image dictionaries with equal
/// stream data but different dictionary entries would also be treated as the same image.
/// Confirm this is acceptable for the documents being processed.
/// </remarks>
public void ConsolidateImages()
{
    var images = ImageInfo.FindAll(this);

    // Choose one item per distinct MD5. When several images share a hash, the last one
    // encountered is kept. The MD5 is read straight from each ImageInfo rather than being
    // looked up through GetHashCode(), because hash codes are not unique and a collision
    // would associate an image with the wrong MD5.
    var mapMd5ToPdfItem = new Dictionary<string, PdfItem>();
    foreach (ImageInfo img in images)
    {
        mapMd5ToPdfItem[img.XObjectMD5] = img.Item.Value;
    }

    // Set the PdfItem for each image to the one chosen for its MD5.
    foreach (ImageInfo img in images)
    {
        img.XObjects.Elements[img.Item.Key] = mapMd5ToPdfItem[img.XObjectMD5];
    }
}

/// <summary>
/// Describes one image entry of a page's /XObject resource dictionary together with the
/// MD5 hash of the image's stream bytes.
/// </summary>
internal class ImageInfo
{
    /// <summary>The /XObject resource dictionary that contains the entry.</summary>
    public PdfDictionary XObjects { get; }

    /// <summary>The entry (key and item) inside <see cref="XObjects"/>.</summary>
    public KeyValuePair<string, PdfItem> Item { get; }

    /// <summary>The image dictionary the entry's reference resolves to.</summary>
    public PdfDictionary XObject { get; }

    /// <summary>Lower-case hexadecimal MD5 hash of the image's stream bytes.</summary>
    public string XObjectMD5 { get; }

    // Shared hasher instance; all access goes through the lock in ComputeMD5.
    private static readonly MD5 Hasher = MD5.Create();

    /// <summary>
    /// Captures the entry and hashes the stream bytes of <paramref name="xObject"/>.
    /// </summary>
    /// <param name="xObjects">The /XObject resource dictionary holding the entry.</param>
    /// <param name="item">The entry inside <paramref name="xObjects"/>.</param>
    /// <param name="xObject">The image dictionary; its stream must hold data.</param>
    public ImageInfo(PdfDictionary xObjects, KeyValuePair<string, PdfItem> item, PdfDictionary xObject)
    {
        XObjects = xObjects;
        Item = item;
        XObject = xObject;
        XObjectMD5 = ComputeMD5(xObject.Stream.Value);
    }

    /// <summary>
    /// Get info for each image in the document. Only entries that are references to a
    /// dictionary with /Subtype /Image and with stream data are returned; image dictionaries
    /// without stream data are skipped because they cannot be hashed.
    /// </summary>
    /// <param name="doc">The document whose pages are searched.</param>
    /// <returns>One <see cref="ImageInfo"/> per matching resource entry, in page order.</returns>
    internal static List<ImageInfo> FindAll(PdfDocument doc) =>
        doc.Pages.Cast<PdfPage>()
            .Select(page => page.Elements.GetDictionary("/Resources"))
            .Select(resources => resources?.Elements?.GetDictionary("/XObject"))
            .Where(xObjects => xObjects?.Elements != null)
            .SelectMany(xObjects =>
                from item in xObjects.Elements
                let xObject = (item.Value as PdfReference)?.Value as PdfDictionary
                where xObject?.Elements?.GetString("/Subtype") == "/Image"
                      && xObject.Stream?.Value != null
                select new ImageInfo(xObjects, item, xObject)
            )
            .ToList();

    /// <summary>
    /// Compute and return the MD5 hash of the input data.
    /// </summary>
    /// <param name="input">The bytes to hash.</param>
    /// <returns>The hash as a lower-case hexadecimal string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    internal static string ComputeMD5(byte[] input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        byte[] hashBytes;
        lock (Hasher)
        {
            hashBytes = Hasher.ComputeHash(input);
            Hasher.Initialize();
        }

        // Two hexadecimal characters per hash byte.
        var sb = new StringBuilder(hashBytes.Length * 2);
        foreach (var x in hashBytes)
        {
            sb.Append(x.ToString("x2"));
        }

        return sb.ToString();
    }
}

/// <summary>
/// Gets the security handler.
/// </summary>
Expand Down