From 7a543c8076a3ceb2dfa92fce06fc558cfe84cde7 Mon Sep 17 00:00:00 2001
From: tballison
Date: Tue, 26 Apr 2016 10:29:41 -0400
Subject: [PATCH] TIKA-1924 - needed to add sanity check in map()

---
 .../parser/mp4/DirectFileReadDataSource.java  |  27 +-
 .../org/apache/tika/parser/mp4/MP4Parser.java | 372 +++++++++---------
 2 files changed, 212 insertions(+), 187 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java b/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
index 4e2c1ba239..f25b41307f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
@@ -21,6 +21,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.nio.channels.WritableByteChannel;
 
@@ -66,6 +67,9 @@ public int read(ByteBuffer byteBuffer) throws IOException {
     }
 
     public int readAllInOnce(ByteBuffer byteBuffer) throws IOException {
+        if (byteBuffer.remaining() > raf.length()) {
+            throw new IOException("trying to readAllInOnce past end of stream");
+        }
         byte[] buf = new byte[byteBuffer.remaining()];
         int read = raf.read(buf);
         byteBuffer.put(buf, 0, read);
@@ -81,6 +85,9 @@ public long position() throws IOException {
     }
 
     public void position(long nuPos) throws IOException {
+        if (nuPos > raf.length()) {
+            throw new IOException("requesting seek past end of stream");
+        }
         raf.seek(nuPos);
     }
 
@@ -89,12 +96,30 @@ public long transferTo(long position, long count, WritableByteChannel target) th
     }
 
     public ByteBuffer map(long startPosition, long size) throws IOException {
+        if (startPosition < 0 || size < 0) {
+            throw new IOException("startPosition and size must both be >= 0");
+        }
+        //compute start+size with BigInteger so the sum cannot overflow a long;
+        //BigInteger is immutable, so the result of add() must be kept.
+        BigInteger end = BigInteger.valueOf(startPosition);
+        end = end.add(BigInteger.valueOf(size));
+        if (end.compareTo(BigInteger.valueOf(raf.length())) > 0) {
+            throw new IOException("requesting read past end of stream");
+        }
+
         raf.seek(startPosition);
-        byte[] payload = new byte[l2i(size)];
+        int payLoadSize = l2i(size);
+        //defensive re-check in long arithmetic; first clause guards the sum against overflow
+        if (Long.MAX_VALUE-payLoadSize < startPosition ||
+                startPosition + payLoadSize > raf.length()) {
+            throw new IOException("requesting read past end of stream");
+        }
+        byte[] payload = new byte[payLoadSize];
         raf.readFully(payload);
         return ByteBuffer.wrap(payload);
     }
 
+    @Override
     public void close() throws IOException {
         raf.close();
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
index 20c82466a3..db50e378f5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
@@ -28,21 +28,6 @@
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMP;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
 import com.coremedia.iso.IsoFile;
 import com.coremedia.iso.boxes.Box;
 import com.coremedia.iso.boxes.Container;
@@ -57,9 +42,10 @@
 import com.coremedia.iso.boxes.UserDataBox;
 import com.coremedia.iso.boxes.apple.AppleItemListBox;
 import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
+import com.googlecode.mp4parser.DataSource; import com.googlecode.mp4parser.boxes.apple.AppleAlbumBox; -import com.googlecode.mp4parser.boxes.apple.AppleArtistBox; import com.googlecode.mp4parser.boxes.apple.AppleArtist2Box; +import com.googlecode.mp4parser.boxes.apple.AppleArtistBox; import com.googlecode.mp4parser.boxes.apple.AppleCommentBox; import com.googlecode.mp4parser.boxes.apple.AppleCompilationBox; import com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox; @@ -70,6 +56,20 @@ import com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox; import com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox; import com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.XMP; +import org.apache.tika.metadata.XMPDM; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; /** * Parser for the MP4 media container format, as well as the older @@ -119,183 +119,183 @@ public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - IsoFile isoFile; - + // The MP4Parser library accepts either a File, or a byte array // As MP4 video files are typically large, always use a file to // avoid OOMs that may occur with in-memory buffering TemporaryResources tmp = new TemporaryResources(); TikaInputStream tstream = TikaInputStream.get(stream, tmp); - try { - isoFile = new IsoFile(new DirectFileReadDataSource(tstream.getFile())); - tmp.addResource(isoFile); - - // Grab the file type box 
- FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class); - if (fileType != null) { - // Identify the type - MediaType type = MediaType.application("mp4"); - for (MediaType t : typesMap.keySet()) { - if (typesMap.get(t).contains(fileType.getMajorBrand())) { - type = t; - break; - } - } - metadata.set(Metadata.CONTENT_TYPE, type.toString()); - - if (type.getType().equals("audio")) { - metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim()); - } - } else { - // Some older QuickTime files lack the FileType - metadata.set(Metadata.CONTENT_TYPE, "video/quicktime"); - } - - - // Get the main MOOV box - MovieBox moov = getOrNull(isoFile, MovieBox.class); - if (moov == null) { - // Bail out - return; - } - - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - - - // Pull out some information from the header box - MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class); - if (mHeader != null) { - // Get the creation and modification dates - metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime()); - metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime()); - - // Get the duration - double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale(); - metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds)); - - // The timescale is normally the sampling rate - metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale()); - } - - - // Get some more information from the track header - // TODO Decide how to handle multiple tracks - List tb = moov.getBoxes(TrackBox.class); - if (tb.size() > 0) { - TrackBox track = tb.get(0); - - TrackHeaderBox header = track.getTrackHeaderBox(); - // Get the creation and modification dates - metadata.set(TikaCoreProperties.CREATED, header.getCreationTime()); - metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime()); - - // Get the video with and height - metadata.set(Metadata.IMAGE_WIDTH, 
(int)header.getWidth()); - metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight()); - - // Get the sample information - SampleTableBox samples = track.getSampleTableBox(); - SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox(); - if (sampleDesc != null) { - // Look for the first Audio Sample, if present - AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class); - if (sample != null) { - XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount()); - //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping - metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate()); - //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket()); - //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample()); - } - } - } - - // Get metadata from the User Data Box - UserDataBox userData = getOrNull(moov, UserDataBox.class); - if (userData != null) { - MetaBox meta = getOrNull(userData, MetaBox.class); - - // Check for iTunes Metadata - // See http://atomicparsley.sourceforge.net/mpeg-4files.html and - // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these - AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class); - if (apple != null) { - // Title - AppleNameBox title = getOrNull(apple, AppleNameBox.class); - addMetadata(TikaCoreProperties.TITLE, metadata, title); - - // Artist - AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class); - addMetadata(TikaCoreProperties.CREATOR, metadata, artist); - addMetadata(XMPDM.ARTIST, metadata, artist); - - // Album Artist - AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class); - addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2); - - // Album - AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class); - addMetadata(XMPDM.ALBUM, metadata, album); - - // Composer - AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class); - addMetadata(XMPDM.COMPOSER, metadata, composer); - - // Genre 
- AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class); - addMetadata(XMPDM.GENRE, metadata, genre); - - // Year - AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class); - if (year != null) { - metadata.set(XMPDM.RELEASE_DATE, year.getValue()); - } - - // Track number - AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class); - if (trackNum != null) { - metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA()); - //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO - } - - // Disc number - AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class); - if (discNum != null) { - metadata.set(XMPDM.DISC_NUMBER, discNum.getA()); - } - - // Compilation - AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class); - if (compilation != null) { - metadata.set(XMPDM.COMPILATION, (int)compilation.getValue()); - } - - // Comment - AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class); - addMetadata(XMPDM.LOG_COMMENT, metadata, comment); - - // Encoder - AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class); - if (encoder != null) { - metadata.set(XMP.CREATOR_TOOL, encoder.getValue()); - } - - - // As text - for (Box box : apple.getBoxes()) { - if (box instanceof Utf8AppleDataBox) { - xhtml.element("p", ((Utf8AppleDataBox)box).getValue()); - } - } - } - - // TODO Check for other kinds too + try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile())) { + try (IsoFile isoFile = new IsoFile(dataSource)) { + tmp.addResource(isoFile); + + // Grab the file type box + FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class); + if (fileType != null) { + // Identify the type + MediaType type = MediaType.application("mp4"); + for (MediaType t : typesMap.keySet()) { + if (typesMap.get(t).contains(fileType.getMajorBrand())) { + type = t; + break; + } + } + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + + if (type.getType().equals("audio")) { + 
metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim()); + } + } else { + // Some older QuickTime files lack the FileType + metadata.set(Metadata.CONTENT_TYPE, "video/quicktime"); + } + + + // Get the main MOOV box + MovieBox moov = getOrNull(isoFile, MovieBox.class); + if (moov == null) { + // Bail out + return; + } + + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + + // Pull out some information from the header box + MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class); + if (mHeader != null) { + // Get the creation and modification dates + metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime()); + metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime()); + + // Get the duration + double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale(); + metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds)); + + // The timescale is normally the sampling rate + metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale()); + } + + + // Get some more information from the track header + // TODO Decide how to handle multiple tracks + List tb = moov.getBoxes(TrackBox.class); + if (tb.size() > 0) { + TrackBox track = tb.get(0); + + TrackHeaderBox header = track.getTrackHeaderBox(); + // Get the creation and modification dates + metadata.set(TikaCoreProperties.CREATED, header.getCreationTime()); + metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime()); + + // Get the video with and height + metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth()); + metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight()); + + // Get the sample information + SampleTableBox samples = track.getSampleTableBox(); + SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox(); + if (sampleDesc != null) { + // Look for the first Audio Sample, if present + AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class); 
+ if (sample != null) { + XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount()); + //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping + metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate()); + //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket()); + //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample()); + } + } + } + + // Get metadata from the User Data Box + UserDataBox userData = getOrNull(moov, UserDataBox.class); + if (userData != null) { + MetaBox meta = getOrNull(userData, MetaBox.class); + + // Check for iTunes Metadata + // See http://atomicparsley.sourceforge.net/mpeg-4files.html and + // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these + AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class); + if (apple != null) { + // Title + AppleNameBox title = getOrNull(apple, AppleNameBox.class); + addMetadata(TikaCoreProperties.TITLE, metadata, title); + + // Artist + AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class); + addMetadata(TikaCoreProperties.CREATOR, metadata, artist); + addMetadata(XMPDM.ARTIST, metadata, artist); + + // Album Artist + AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class); + addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2); + + // Album + AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class); + addMetadata(XMPDM.ALBUM, metadata, album); + + // Composer + AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class); + addMetadata(XMPDM.COMPOSER, metadata, composer); + + // Genre + AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class); + addMetadata(XMPDM.GENRE, metadata, genre); + + // Year + AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class); + if (year != null) { + metadata.set(XMPDM.RELEASE_DATE, year.getValue()); + } + + // Track number + AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class); + if (trackNum != 
null) { + metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA()); + //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO + } + + // Disc number + AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class); + if (discNum != null) { + metadata.set(XMPDM.DISC_NUMBER, discNum.getA()); + } + + // Compilation + AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class); + if (compilation != null) { + metadata.set(XMPDM.COMPILATION, (int) compilation.getValue()); + } + + // Comment + AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class); + addMetadata(XMPDM.LOG_COMMENT, metadata, comment); + + // Encoder + AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class); + if (encoder != null) { + metadata.set(XMP.CREATOR_TOOL, encoder.getValue()); + } + + + // As text + for (Box box : apple.getBoxes()) { + if (box instanceof Utf8AppleDataBox) { + xhtml.element("p", ((Utf8AppleDataBox) box).getValue()); + } + } + } + + // TODO Check for other kinds too + } + + // All done + xhtml.endDocument(); } - - // All done - xhtml.endDocument(); - } finally { tmp.dispose(); }