Skip to content

Commit

Permalink
Added a WriteOption to insert/update document level xmp metadata stre…
Browse files Browse the repository at this point in the history
…am based on the info dictionary.
  • Loading branch information
torakiki committed Jul 1, 2024
1 parent 5775ecd commit d7f1119
Show file tree
Hide file tree
Showing 9 changed files with 326 additions and 61 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ An [Apache PDFBox](https://github.com/apache/pdfbox) fork intended to be used as

What's different from PDFBox?
---------
+ Requires JDK 17
+ Requires JDK 21
+ Lazy loading/parsing of PDF objects. Only the document xref table(s)/stream(s) is(are) initially parsed and the information needed to look up objects is retrieved; when a PDF object is later requested, the object is retrieved/parsed using the lookup information. This allows minimal memory footprint when you only need part of the document (e.g. you only need the information dictionary or the number of pages of the document).
+ Multiple I/O implementations to read from. SAMBox uses [Sejda-io](https://github.com/torakiki/sejda-io) allowing to use one of the provided implementation based on `java.nio.channels.FileChannel`, `java.io.InputStream` and `java.nio.MappedByteBuffer` (buffered or not).
+ Minimized GC through the use of a pool of `java.lang.StringBuilder`.
Expand Down
24 changes: 21 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.sejda</groupId>
<artifactId>sambox</artifactId>
Expand Down Expand Up @@ -56,6 +58,7 @@
<maven.javadoc.failOnError>false</maven.javadoc.failOnError>
<bouncycastle.version>1.78.1</bouncycastle.version>
<fontbox.version>2.0.28</fontbox.version>
<xmpbox.version>3.0.2</xmpbox.version>
<sejda.commons.version>2.0.0</sejda.commons.version>
<sejda.io.version>3.0.1</sejda.io.version>
<slf4j.version>2.0.13</slf4j.version>
Expand Down Expand Up @@ -1529,10 +1532,14 @@
<goal>wget</goal>
</goals>
<configuration>
<url>https://issues.apache.org/jira/secure/attachment/13047577/PDFBOX-5484.ttf</url>
<url>
https://issues.apache.org/jira/secure/attachment/13047577/PDFBOX-5484.ttf
</url>
<outputDirectory>${project.build.directory}/fonts</outputDirectory>
<outputFileName>PDFBOX-5484.ttf</outputFileName>
<sha512>7c3d8bbc18654315d6341a277dcd5c66218b95c43baf190b6e32f77817d17bab421ef76f2c904b46c97f84c49b00d58525449cff970897010534d6aa2812a4e2</sha512>
<sha512>
7c3d8bbc18654315d6341a277dcd5c66218b95c43baf190b6e32f77817d17bab421ef76f2c904b46c97f84c49b00d58525449cff970897010534d6aa2812a4e2
</sha512>
</configuration>
</execution>
</executions>
Expand Down Expand Up @@ -1572,6 +1579,17 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>xmpbox</artifactId>
<version>${xmpbox.version}</version>
<exclusions>
<exclusion>
<artifactId>commons-logging</artifactId>
<groupId>commons-logging</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcmail-jdk18on</artifactId>
Expand Down
1 change: 1 addition & 0 deletions src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
requires transitive java.xml;
requires transitive org.apache.fontbox;
requires transitive org.sejda.io;
requires transitive org.apache.xmpbox;

exports org.sejda.sambox;
exports org.sejda.sambox.contentstream;
Expand Down
13 changes: 11 additions & 2 deletions src/main/java/org/sejda/sambox/output/WriteOption.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

/**
* Options that can be selected when writing a PDF document.
*
*
* @author Andrea Vacondio
*/
public enum WriteOption
Expand All @@ -42,5 +42,14 @@ public enum WriteOption
/**
* Does not automatically update metadata modified date and producer when saving
*/
NO_METADATA_PRODUCER_MODIFIED_DATE_UPDATE
NO_METADATA_PRODUCER_MODIFIED_DATE_UPDATE,
/**
* It creates or updates the document XMP metadata before the document is written.
* <ul>
* <li>Creates: if the document XMP metadata does not exist, it creates a new one based on the info dictionary</li>
* <li>Updates: if the document XMP metadata exists, it updates all the values corresponding to the info dictionary (see ISO 32000-2:2020 Chap 14.3.3 Table 349)</li>
* </ul>
* Note: an existing metadata stream that is malformed is currently left untouched.
*/
UPSERT_DOCUMENT_METADATA_STREAM
}
88 changes: 73 additions & 15 deletions src/main/java/org/sejda/sambox/pdmodel/PDDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.awt.Point;
import java.awt.image.DataBuffer;
import java.awt.image.Raster;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
Expand All @@ -41,6 +43,10 @@
import java.util.Set;

import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.xmpbox.xml.XmpParsingException;
import org.apache.xmpbox.xml.XmpSerializer;
import org.sejda.commons.util.IOUtils;
import org.sejda.io.CountingWritableByteChannel;
import org.sejda.io.SeekableSources;
Expand All @@ -61,13 +67,15 @@
import org.sejda.sambox.output.PDDocumentWriter;
import org.sejda.sambox.output.PreSaveCOSTransformer;
import org.sejda.sambox.output.WriteOption;
import org.sejda.sambox.pdmodel.common.PDMetadata;
import org.sejda.sambox.pdmodel.encryption.AccessPermission;
import org.sejda.sambox.pdmodel.encryption.PDEncryption;
import org.sejda.sambox.pdmodel.encryption.SecurityHandler;
import org.sejda.sambox.pdmodel.font.Subsettable;
import org.sejda.sambox.pdmodel.graphics.color.PDDeviceRGB;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.transform.TransformerException;

/**
* This is the in-memory representation of the PDF document.
Expand Down Expand Up @@ -535,14 +543,27 @@ private void writeTo(CountingWritableByteChannel output, StandardSecurity securi
{
requireOpen();

if (Arrays.stream(options)
.noneMatch(i -> i == WriteOption.NO_METADATA_PRODUCER_MODIFIED_DATE_UPDATE))
updateMetadata(options);
subsetFonts();

EncryptionContext encryptionContext = ofNullable(security).map(EncryptionContext::new)
.orElse(null);
generateFileIdentifier(output.toString().getBytes(StandardCharsets.ISO_8859_1),
encryptionContext);
try (PDDocumentWriter writer = new PDDocumentWriter(output, encryptionContext,
preSaveCOSTransformer, options))
{
// update producer and last modification date only if the write option doesn't state otherwise
getDocumentInformation().setProducer(SAMBox.PRODUCER);
getDocumentInformation().setModificationDate(Calendar.getInstance());
onBeforeWrite.onBeforeWrite();
writer.write(this);
}
finally
{
IOUtils.close(this);
}
}

private void subsetFonts()
{
for (Subsettable font : fontsToSubset)
{
try
Expand All @@ -555,20 +576,57 @@ private void writeTo(CountingWritableByteChannel output, StandardSecurity securi
}
}
fontsToSubset.clear();
EncryptionContext encryptionContext = ofNullable(security).map(EncryptionContext::new)
.orElse(null);
generateFileIdentifier(output.toString().getBytes(StandardCharsets.ISO_8859_1),
encryptionContext);
try (PDDocumentWriter writer = new PDDocumentWriter(output, encryptionContext,
preSaveCOSTransformer, options))
}

private void updateMetadata(WriteOption[] options)
{
if (Arrays.stream(options)
.noneMatch(i -> i == WriteOption.NO_METADATA_PRODUCER_MODIFIED_DATE_UPDATE))
{
onBeforeWrite.onBeforeWrite();
writer.write(this);
// update producer and last modification date only if the write option doesn't state otherwise
getDocumentInformation().setProducer(SAMBox.PRODUCER);
getDocumentInformation().setModificationDate(Calendar.getInstance());
}
finally
if (Arrays.stream(options).anyMatch(o -> o == WriteOption.UPSERT_DOCUMENT_METADATA_STREAM))
{
IOUtils.close(this);
requireMinVersion(V1_4);
var metadataStream = new PDMetadata();
try (var metadataOutputStream = new BufferedOutputStream(
metadataStream.getCOSObject().createUnfilteredStream()))
{
new XmpSerializer().serialize(
getDocumentInformation().toXMPMetadata(getOrCreateXmpMetadata(),
getVersion()), metadataOutputStream, true);
getDocumentCatalog().setMetadata(metadataStream);
}
catch (IOException | TransformerException e)
{
LOG.warn("Unable to set xmp document metadata", e);
}
catch (XmpParsingException e)
{
LOG.warn("Unable to parse existing document level xmp metadata", e);
}
}
}

/**
 * Returns the XMP metadata to use as the starting point for the document level metadata
 * stream.
 * <p>
 * If the document catalog already has a metadata stream, it is parsed with a
 * {@link DomXmpParser} configured for non-strict parsing; otherwise a new
 * {@link XMPMetadata} is created.
 *
 * @return the parsed existing metadata, or a newly created one if the catalog has none
 * @throws XmpParsingException if the existing metadata stream cannot be parsed
 * @throws IOException         if the existing metadata stream cannot be read or closed
 */
private XMPMetadata getOrCreateXmpMetadata() throws XmpParsingException, IOException
{
    var metadata = getDocumentCatalog().getMetadata();
    if (nonNull(metadata))
    {
        // try-with-resources: the input stream is closed on success and on parsing failure,
        // it was previously left open
        try (var xmpStream = new BufferedInputStream(metadata.createInputStream()))
        {
            var parser = new DomXmpParser();
            parser.setStrictParsing(false);
            return parser.parse(xmpStream);
        }
        finally
        {
            // executed after the stream has been closed, whether or not parsing succeeded
            metadata.getCOSObject().unDecode();
        }
    }
    return XMPMetadata.createXMPMetadata();
}

/**
Expand Down
Loading

0 comments on commit d7f1119

Please sign in to comment.