Skip to content
This repository has been archived by the owner on Dec 8, 2023. It is now read-only.

Commit

Permalink
EASY-2331: handle tar like zip (#187)
Browse files Browse the repository at this point in the history
* handle tar like zip
* documented tar file treatment
  • Loading branch information
jo-pol authored and rvanheest committed Dec 3, 2019
1 parent 6699602 commit 296c615
Show file tree
Hide file tree
Showing 10 changed files with 124 additions and 94 deletions.
28 changes: 12 additions & 16 deletions docs/api/api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -418,10 +418,11 @@ paths:
[multipart media type](https://tools.ietf.org/html/rfc2046.html#section-5.1); each part contains one file.
The header area of each part specifies the file name using a `Content-Disposition` header.
If the body contains **only one part** and that part has `Content-Type: application/zip`, then the body
is a file in ZIP format. This ZIP file can contain a directory hierarchy. The ZIP file will be
unzipped in `dir_path`. For each file in the ZIP file, the path in the deposit will be `dir_path`/`path_in_zip`.
The ZIP file may contain `nested ZIP files`, which will **not** be extracted, but stored as ZIP file.
If the body contains **only one part** and that part has `Content-Type: application/zip`
or `Content-Type: application/tar-compressed`, then the body
is a file in archive format. This archive file can contain a directory hierarchy. The archive file will be
unpacked in `dir_path`. For each file in the archive file, the path in the deposit will be `dir_path`/`path_in_archive`.
The archive file may contain `nested archive files`, which will **not** be extracted, but stored as is.
Existing files at the target location will be overwritten, existing folders at the target location will be merged with the folders in the archive.
Concurrent POST or PUT calls to the same deposit are not allowed.
Expand All @@ -445,11 +446,11 @@ paths:
201:
description: file or subdirectory added
400.1:
$ref: "#/components/responses/MalformedZip"
$ref: "#/components/responses/MalformedArchive"
400.2:
$ref: "#/components/responses/MustBeMultipartFormdata"
400.3:
$ref: "#/components/responses/ZipMustBeOnlyFile"
$ref: "#/components/responses/ArchiveMustBeOnlyFile"
401:
$ref: "#/components/responses/Unauthorized"
404:
Expand Down Expand Up @@ -514,7 +515,7 @@ paths:
- $ref: "#/components/parameters/FilePath"
description: |
If any of `file_path`'s parent do not yet exist, they are first created. The parameter `file_path` must not refer
to an existing directory. The Content-Type may be not be `application/zip` nor `multipart`.
to an existing directory. The Content-Type must not be `application/zip`, `application/tar-compressed` or `multipart`.
Concurrent POST or PUT calls to the same deposit are not allowed.
operationId: writeFile
requestBody:
Expand All @@ -541,8 +542,6 @@ paths:
$ref: "#/components/schemas/FileInfo"
204:
description: Updated existing file.
400:
$ref: "#/components/responses/ZipFileNotAllowed"
401:
$ref: "#/components/responses/Unauthorized"
403:
Expand Down Expand Up @@ -663,8 +662,8 @@ components:
NotFound:
description: Not found. The body specifies if the deposit or something in the deposit is not found.

ZipMustBeOnlyFile:
description: Bad request. A multipart/form-data message contained a ZIP part but also other parts.
ArchiveMustBeOnlyFile:
description: Bad request. A multipart/form-data message contained an archive part but also other parts.

ConflictConcurrentUploadsNotAllowed:
description: Conflict. Concurrent file uploads to the same deposit are not allowed.
Expand All @@ -685,15 +684,12 @@ components:
MalformedState:
description: Bad request. State document is malformed.

MalformedZip:
description: Bad request. ZIP file is malformed.
MalformedArchive:
description: Bad request. Archive file is malformed.

MustBeMultipartFormdata:
description: Bad Request. The request body must be multipart/form-data.

ZipFileNotAllowed:
description: Bad request. Content-Type must not be application/zip nor start with multipart.

schemas:

UserInfo:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ import nl.knaw.dans.easy.deposit.authentication.{ AuthenticationProvider, LdapAu
import nl.knaw.dans.easy.deposit.docs.StateInfo.State
import nl.knaw.dans.easy.deposit.docs.StateInfo.State.State
import nl.knaw.dans.easy.deposit.docs.{ DatasetMetadata, DepositInfo, StateInfo, UserInfo }
import nl.knaw.dans.easy.deposit.servlets.contentTypeZipPattern
import nl.knaw.dans.easy.deposit.servlets.archiveContentTypeRegexp
import nl.knaw.dans.lib.error._
import nl.knaw.dans.lib.logging.DebugEnhancedLogging
import org.apache.commons.configuration.PropertiesConfiguration
Expand Down Expand Up @@ -339,7 +339,7 @@ class EasyDepositApiApp(configuration: Configuration) extends DebugEnhancedLoggi
contentType.map(_.trim.toLowerCase) match {
case Some(str) if str.nonEmpty
&& !str.startsWith("multipart")
&& !str.matches(contentTypeZipPattern) => Success(())
&& !str.matches(archiveContentTypeRegexp) => Success(())
case _ => Failure(InvalidContentTypeException(contentType, "must not be application/zip nor start with multipart."))
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/main/scala/nl.knaw.dans.easy.deposit/Errors.scala
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ object Errors extends DebugEnhancedLogging {
s"Deposit has state $actual, can only $action deposits with one of the states: ${ allowed.mkString(", ") }"
)

case class ZipMustBeOnlyFileException(item: FileItem)
case class ArchiveMustBeOnlyFileException(item: FileItem)
extends ServletResponseException(
BAD_REQUEST_400,
s"A multipart/form-data message contained a ZIP part [${ item.name }] but also other parts."
s"A multipart/form-data message contained an archive part [${ item.name }] but also other parts."
)

case class InvalidDocumentException(document: String, t: Throwable = null)
Expand All @@ -109,8 +109,8 @@ object Errors extends DebugEnhancedLogging {
case class InvalidDoiException(uuid: UUID)
extends ServletResponseException(BAD_REQUEST_400, s"InvalidDoi: DOI must be obtained by calling GET /deposit/$uuid")

case class MalformedZipException(msgAboutEntry: String)
extends ServletResponseException(BAD_REQUEST_400, s"ZIP file is malformed. $msgAboutEntry")
case class MalformedArchiveException(msgAboutEntry: String)
extends ServletResponseException(BAD_REQUEST_400, s"Archive file is malformed. $msgAboutEntry")

case class PendingUploadException()
extends ServletResponseException(CONFLICT_409, "Another upload or submit is pending.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,12 @@ class DepositServlet(app: EasyDepositApiApp)
path <- getPath
_ <- isMultipart
fileItems = fileMultiParams.valuesIterator.flatten.buffered
maybeManagedZipInputStream <- fileItems.nextAsZipIfOnlyOne
maybeManagedArchiveInputStream <- fileItems.nextAsArchiveIfOnlyOne
(managedStagingDir, stagedFilesTarget) <- app.stageFiles(user.id, uuid, path)
_ <- managedStagingDir.apply(stagingDir =>
maybeManagedZipInputStream
.map(_.unzipPlainEntriesTo(stagingDir))
.getOrElse(app.multipartConfig.moveNonZips(fileItems, stagingDir))
maybeManagedArchiveInputStream
.map(_.unpackPlainEntriesTo(stagingDir))
.getOrElse(app.multipartConfig.moveNonArchive(fileItems, stagingDir))
.flatMap(_ => stagedFilesTarget.moveAllFrom(stagingDir))
)
} yield Created()
Expand Down
77 changes: 39 additions & 38 deletions src/main/scala/nl.knaw.dans.easy.deposit/servlets/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@ import java.util.zip.ZipException

import better.files.File
import better.files.File.CopyOptions
import nl.knaw.dans.easy.deposit.Errors.{ ConfigurationException, MalformedZipException, ZipMustBeOnlyFileException }
import nl.knaw.dans.easy.deposit.Errors.{ ArchiveMustBeOnlyFileException, ConfigurationException, MalformedArchiveException }
import nl.knaw.dans.lib.logging.DebugEnhancedLogging
import org.apache.commons.compress.archivers.ArchiveEntry
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream
import org.apache.commons.compress.archivers.{ ArchiveEntry, ArchiveInputStream }
import org.scalatra.servlet.{ FileItem, MultipartConfig }
import org.scalatra.util.RicherString._
import resource.{ ManagedResource, managed }
Expand All @@ -48,37 +49,41 @@ import scala.util.{ Failure, Success, Try }
// @formatter:on
package object servlets extends DebugEnhancedLogging {

val extensionZipPattern = ".+[.]g?z(ip)?"
private val zipExtRegexp = "g?z(ip)?"
private val tarExtRegexp = "tar(.gz)?"
val archiveExtRegexp = s".+[.]($zipExtRegexp|$tarExtRegexp)"
private val pre = "(x-)?g?zip"
private val post = "-compress(ed)?"
val contentTypeZipPattern = s"application/(($pre($post)?)|(x$post))"
  // Content-type subtypes handled as ZIP: [x-][g]zip optionally followed by -compress[ed], or x-compress[ed].
  private val zipContentTypeRegexp = s"(($pre($post)?)|(x$post))"
  // Content-type subtypes handled as tar: x-tar-compress[ed] or x-gtar-compress[ed].
  // NOTE(review): the "-compress(ed)" suffix and the "x-" prefix are mandatory here, so neither a plain
  // "application/x-tar" nor the "application/tar-compressed" mentioned in the API docs matches — confirm intent.
  private val tarContentTypeRegexp = s"x-g?tar$post"
  // Full content-type pattern (used with String.matches) for any upload that is treated as an archive.
  val archiveContentTypeRegexp = s"application/($zipContentTypeRegexp|$tarContentTypeRegexp)"

val contentTypeJson: (String, String) = "content-type" -> "application/json;charset=UTF-8"
val contentTypePlainText: (String, String) = "content-type" -> "text/plain;charset=UTF-8"

implicit class RichManagedZipInputStream(val zipInputStream: ManagedResource[ZipArchiveInputStream]) extends AnyVal {
def unzipPlainEntriesTo(dir: File): Try[Unit] = {
zipInputStream.apply(_.unzipPlainEntriesTo(dir))
implicit class RichManagedArchiveInputStream(val archiveInputStream: ManagedResource[ArchiveInputStream]) extends AnyVal {
def unpackPlainEntriesTo(dir: File): Try[Unit] = {
archiveInputStream.apply(_.unpackPlainEntriesTo(dir))
}
}

implicit class RichZipInputStream(val zipInputStream: ZipArchiveInputStream) extends AnyVal {
implicit class RichArchiveInputStream(val archiveInputStream: ArchiveInputStream) extends AnyVal {

def unzipPlainEntriesTo(targetDir: File): Try[Unit] = {
def unpackPlainEntriesTo(targetDir: File): Try[Unit] = {
def extract(entry: ArchiveEntry): Try[Unit] = {
if (!(targetDir / entry.getName).isChildOf(targetDir))
Failure(MalformedZipException(s"Can't extract ${ entry.getName }"))
Failure(MalformedArchiveException(s"Can't extract ${ entry.getName }"))
else if (entry.isDirectory)
Try((targetDir / entry.getName).createDirectories())
else {
logger.info(s"Extracting ${ entry.getName } size=${ entry.getSize } getLastModifiedDate=${ entry.getLastModifiedDate } }")
Try {
(targetDir / entry.getName).parent.createDirectories() // in case a directory was not specified separately
Files.copy(zipInputStream, (targetDir / entry.getName).path)
Files.copy(archiveInputStream, (targetDir / entry.getName).path)
()
}.recoverWith { case e: ZipException =>
logger.error(e.getMessage, e)
Failure(MalformedZipException(s"Can't extract ${ entry.getName }"))
Failure(MalformedArchiveException(s"Can't extract ${ entry.getName }"))
}
}
}
Expand All @@ -96,15 +101,15 @@ package object servlets extends DebugEnhancedLogging {
}
}

Try(Option(zipInputStream.getNextEntry)) match {
Try(Option(archiveInputStream.getNextEntry)) match {
case Success(None) |
Failure(_: EOFException) => Failure(MalformedZipException(s"No entries found."))
case Failure(e: ZipException) => Failure(MalformedZipException(e.getMessage))
Failure(_: EOFException) => Failure(MalformedArchiveException(s"No entries found."))
case Failure(e: ZipException) => Failure(MalformedArchiveException(e.getMessage))
case Failure(e) => Failure(e)
case Success(Some(firstEntry: ArchiveEntry)) => for {
_ <- extract(firstEntry)
_ <- Stream
.continually(zipInputStream.getNextEntry)
.continually(archiveInputStream.getNextEntry)
.takeWhile(Option(_).nonEmpty)
.map(extract)
.failFastOr(Success(()))
Expand All @@ -116,37 +121,33 @@ package object servlets extends DebugEnhancedLogging {

implicit class RichFileItem(val fileItem: FileItem) extends AnyVal {

def isZip: Boolean = {
val extensionIsZip = fileItem.name.matches(extensionZipPattern)
lazy val contentTypeIsZip = fileItem.contentType.exists(_.matches(contentTypeZipPattern))
logger.debug(s"ZIP check: ${ fileItem.name } : $extensionIsZip; ${ fileItem.contentType } : $contentTypeIsZip ")
extensionIsZip || contentTypeIsZip
private def matchesEitherOf(extensionRegexp: String, contentTypeRegexp: String) = {
fileItem.name.matches(extensionRegexp) || fileItem.contentType.exists(_.matches(contentTypeRegexp))
}

def getZipInputStream: Try[resource.ManagedResource[ZipArchiveInputStream]] = Try {
fileItem.charset
.map(toZipInputStream)
.getOrElse(toZipInputStream("UTF8"))
def isArchive: Boolean = {
matchesEitherOf(archiveExtRegexp, archiveContentTypeRegexp)
}

private def toZipInputStream(charSet: String): ManagedResource[ZipArchiveInputStream] = {
val useUnicodeExtraFields = true
val allowStoredEntriesWithDataDescriptor = true
managed(new ZipArchiveInputStream(fileItem.getInputStream, charSet, useUnicodeExtraFields, allowStoredEntriesWithDataDescriptor))
def getArchiveInputStream: Try[resource.ManagedResource[ArchiveInputStream]] = Try {
val charSet = fileItem.charset.getOrElse("UTF8")
if (matchesEitherOf(s".+[.]$tarExtRegexp", tarContentTypeRegexp))
managed(new TarArchiveInputStream(fileItem.getInputStream, charSet))
else managed(new ZipArchiveInputStream(fileItem.getInputStream, charSet, true, true))
}
}

implicit class RichMultipartConfig(config: MultipartConfig) {
def moveNonZips(srcItems: Iterator[FileItem], targetDir: File): Try[Unit] = {
def moveNonArchive(srcItems: Iterator[FileItem], targetDir: File): Try[Unit] = {
srcItems
.map(moveIfNonZip(_, targetDir))
.map(moveIfNotAnArchive(_, targetDir))
.failFastOr(Success(()))
}

private def moveIfNonZip(srcItem: FileItem, targetDir: File): Try[Unit] = {
private def moveIfNotAnArchive(srcItem: FileItem, targetDir: File): Try[Unit] = {
logger.info(s"staging upload: size=${ srcItem.size } contentType=${ srcItem.contentType } $targetDir/${ srcItem.name }")
if (srcItem.name.isBlank) Success(()) // skip form field without selected files
else if (srcItem.isZip) Failure(ZipMustBeOnlyFileException(srcItem))
else if (srcItem.isArchive) Failure(ArchiveMustBeOnlyFileException(srcItem))
else Try {
val f = UUID.randomUUID().toString
val location = File(config.location.getOrElse(throw ConfigurationException("multipart.location is missing")))
Expand All @@ -163,15 +164,15 @@ package object servlets extends DebugEnhancedLogging {

implicit class RichFileItems(val fileItems: BufferedIterator[FileItem]) extends AnyVal {

def nextAsZipIfOnlyOne: Try[Option[ManagedResource[ZipArchiveInputStream]]] = {
def nextAsArchiveIfOnlyOne: Try[Option[ManagedResource[ArchiveInputStream]]] = {
skipLeadingEmptyFormFields()
if (!fileItems.headOption.exists(_.isZip)) Success(None)
if (!fileItems.headOption.exists(_.isArchive)) Success(None)
else {
val leadingZipItem = fileItems.next()
val leadingArchiveItem = fileItems.next()
skipLeadingEmptyFormFields()
if (fileItems.hasNext)
Failure(ZipMustBeOnlyFileException(leadingZipItem))
else leadingZipItem.getZipInputStream.map(Some(_))
Failure(ArchiveMustBeOnlyFileException(leadingArchiveItem))
else leadingArchiveItem.getArchiveInputStream.map(Some(_))
}
}

Expand Down
Binary file not shown.
Loading

0 comments on commit 296c615

Please sign in to comment.